def get_tsn_schedule(BASE_URL, COL_HEADS):
    date_recorded = datetime.now()
    logging.info("{}: Collecting TSN UFC schedule".format(
        date_recorded.strftime("%Y-%m-%d %H:%M")))

    # Scrape the event names, dates and TSN feeds
    schedule_url = get_schedule_url(BASE_URL)
    schedule_page_soup = get_soup(schedule_url)

    # Extract rows from schedule table
    schedule_data = []
    rows = schedule_page_soup.find("tbody").findAll("tr")
    for r in rows:
        schedule_data.append([td.text.strip() for td in r.findAll("td")] +
                             [date_recorded])

    # Put into dataframe to be inserted into SQL
    DF = pd.DataFrame(schedule_data, columns=COL_HEADS)

    columnDict = {
        "Broadcast_Date": 'str',
        "Event": 'str',
        "Broadcast_Time": 'str',
        "Network": 'str',
        "Recorded": 'DateTime'
    }
    server = "nonDashboard"
    database = "UFC"
    sqlTableName = "TSNScheduleRaw"
    primaryKeyColName = "ID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
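## `get_schedule_url` and `get_soup` are helpers defined elsewhere in this repo.
## For reference, a minimal sketch of `get_soup` (fetch a page and parse it with
## BeautifulSoup) might look like the following; the parser choice and the lack
## of error handling are assumptions, not the actual helper.
def get_soup(url):
    ## Fetch the page and parse the returned HTML
    req = requests.get(url)
    return BS(req.text, "html.parser")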
def get_all_rankings(BASE_URL, BASE_URL_UK, COL_HEADS):
    date_recorded = datetime.now()
    logging.info("{}: Collecting UFC rankings".format(
        date_recorded.strftime("%Y-%m-%d %H:%M")))

    page_soup = get_soup(BASE_URL)
    rankings = []

    # Get soups for ranking table for each weight division
    weight_categories = page_soup.findAll("div", "view-grouping")
    if len(weight_categories) == 0:
        logging.info("Looks like you've been redirected to the UK website!")
        # Go to the UK site for the rankings
        page_soup = get_soup(BASE_URL_UK)
        weight_categories = page_soup.findAll("div", "ranking-list tall")
        for wc in weight_categories:
            rankings += get_rankings_for_category_uk(wc)
    else:
        # Get the rankings for each weight category
        for wc in weight_categories:
            rankings += get_rankings_for_category(wc)

    # Insert to SQL
    logging.info("Inserting rankings to SQL...")
    DF = pd.DataFrame(rankings, columns=COL_HEADS)
    DF["Recorded"] = date_recorded

    columnDict = {
        "Weight_Class": 'str',
        "Ranking": 'str',
        "Athlete": 'str',
        "Recorded": 'DateTime'
    }
    server = "nonDashboard"
    database = "UFC"
    sqlTableName = "Rankings"
    primaryKeyColName = "ID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
def scrape_southafrica():
    todayDate = datetime.now().date()
    utc_tz = pytz.timezone("utc")
    sa_tz = pytz.timezone("Africa/Johannesburg")

    chans = [('SABC Sport', 'SABC%20Sport')]
    for channel_name, channel_code in chans:
        r = requests.get(f'https://tvguide.etv.co.za/guide/{channel_code}')
        soup = BS(r.text, 'html.parser')

        row_list = []
        # progBoxes = todaySection.find('tbody').find_all('tr')
        # print(len(progBoxes))
        for progBoxTable in soup.find_all('table')[:3]:
            for pb in progBoxTable.find_all('tr'):
                tba = {'Channel': channel_name}
                tds = pb.find_all('td')

                ## Time
                tba['StartLocal'] = datetime.combine(
                    date=todayDate,
                    time=datetime.strptime(tds[0].text, "%I:%M %p").time())
                tba['StartUTC'] = sa_tz.localize(
                    tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)

                ## Programme
                if pb.b.text != 'Currently playing':
                    tba['ProgrammeName'] = pb.b.text
                else:
                    tba['ProgrammeName'] = pb.h3.text

                row_list.append(tba)

        DF = pd.DataFrame(row_list).sort_values('StartLocal').reset_index(
            drop=True)

        last_dt_local = datetime.combine(date=todayDate + timedelta(days=1),
                                         time=time(hour=0))
        last_dt_utc = sa_tz.localize(last_dt_local).astimezone(utc_tz).replace(
            tzinfo=None)
        ## Each programme ends when the next one starts; the last is capped at
        ## local midnight
        DF['EndLocal'] = DF.StartLocal.to_list()[1:] + [last_dt_local]
        DF['EndUTC'] = DF.StartUTC.to_list()[1:] + [last_dt_utc]

        columnDict = {
            "StartLocal": 'DateTime',
            "EndLocal": 'DateTime',
            "StartUTC": 'DateTime',
            "EndUTC": 'DateTime',
            "Channel": 'str',
            "ProgrammeName": 'str'
        }
        server = "nonDashboard"
        database = "WebScraping"
        sqlTableName = "SouthAfricaTVGuide"
        primaryKeyColName = "RowID"

        insertQ = create_insert_query(DF, columnDict, sqlTableName)
        logging.info(f"insertQ: {insertQ}")
        run_sql_commmand(insertQ, server, database)
        removeDuplicatesQ = create_removeDuplicates_query(
            columnDict, sqlTableName, primaryKeyColName)
        run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_netherlands():
    utc_tz = pytz.timezone("utc")
    nl_tz = pytz.timezone("Europe/Amsterdam")

    ## Get tomorrow's date in the right format (for some reason today's doesn't work)
    dateToUse = datetime.now() + timedelta(days=1)
    dateString = dateToUse.strftime("%d-%m-%Y")

    ## Dict of TV channels
    tvChannels = {"ziggosportracing": "Ziggo Sport Racing"}

    UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"

    ## Empty dict for dfs
    dfs = {}
    for code, chan in tvChannels.items():
        ## Make request and get soup
        req = requests.get(f"https://www.tvgids.nl/gids/{dateString}/{code}",
                           headers={'user-agent': UA})
        soup = BS(req.text, "html.parser")

        ## Get block containing relevant information
        infoBlock = soup.find("div", attrs={'class': "guide__guide-container"})

        ## Get all the programme tags
        progTags = [
            x for x in infoBlock.find_all('a')
            if x.attrs['class'] == ['no-link', 'program', 'program--guide']
        ]

        progs = []
        for pt in progTags:
            try:
                tba = {}
                ## Channel
                tba['Channel'] = chan
                ## Start time
                tba['StartTime'] = datetime.strptime(
                    pt.find('div', attrs={
                        'class': 'program__starttime'
                    }).text.strip(), "%H:%M").time()
                ## Programme Name
                tba['ProgrammeName'] = pt.find('h3', attrs={
                    'class': 'program__title'
                }).text.strip()
                ## Description
                tba['Description'] = pt.find('p', attrs={
                    'class': 'program__text'
                }).text.strip()
                progs.append(tba)
            except AttributeError:
                pass

        ## Some progs are from the day before/after `dateString`, so
        ## work out which ones those are
        startTimes = [x['StartTime'] for x in progs]
        AMsPMs = [ampm(x) for x in startTimes]
        daysDiff = []
        for i, ap in enumerate(AMsPMs):
            if i == 0:
                if ap == "am":
                    daysDiff.append(0)
                elif ap == "pm":
                    daysDiff.append(-1)
            else:
                if ap == "am":
                    if AMsPMs[i - 1] == "am":
                        ## AM->AM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])
                    elif AMsPMs[i - 1] == "pm":
                        ## PM->AM, next day, so plus one from last prog
                        daysDiff.append(daysDiff[i - 1] + 1)
                elif ap == "pm":
                    if AMsPMs[i - 1] == "am":
                        ## AM->PM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])
                    elif AMsPMs[i - 1] == "pm":
                        ## PM->PM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])

        for i, dicto in enumerate(progs):
            ## Set local time
            dicto['StartLocal'] = datetime.combine(
                date=dateToUse + timedelta(days=daysDiff[i]),
                time=dicto['StartTime'])
            ## Set UTC time
            dicto['StartUTC'] = nl_tz.localize(
                dicto['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)

        ## Create df from list
        df = pd.DataFrame(progs)
        del df['StartTime']

        ## Add EndLocal and EndUTC columns
        endLocalEnd = datetime.combine(date=dateToUse + timedelta(days=1),
                                       time=time(hour=6))
        df['EndLocal'] = df['StartLocal'].to_list()[1:] + [endLocalEnd]
        endUTCEnd = nl_tz.localize(endLocalEnd).astimezone(utc_tz).replace(
            tzinfo=None)
        df['EndUTC'] = df['StartUTC'].to_list()[1:] + [endUTCEnd]

        ## Only the rows of progs starting on `dateToUse` will be uploaded,
        ## so find those rows
        todayDD = [i for i, x in enumerate(daysDiff) if x == 0]
        minIndex = min(todayDD)
        maxIndex = max(todayDD)
        toSQLdf = df[(df.index >= minIndex)
                     & (df.index <= maxIndex)].reset_index(drop=True)

        ## Add to dict
        dfs[code] = toSQLdf

    DF = pd.concat(dfs.values(), ignore_index=True)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        "Description": 'str',
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "NetherlandsTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
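## `ampm` is used above but defined elsewhere in this repo. Based on how it is
## used (classifying a datetime.time as "am"/"pm" to detect day rollovers), a
## minimal sketch might look like this; the exact implementation is an assumption.
def ampm(t):
    ## Times before midday are "am", everything else is "pm"
    return "am" if t.hour < 12 else "pm"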
def scrape_portugal():
    utc_tz = pytz.timezone("utc")
    por_tz = pytz.timezone("Europe/Lisbon")

    ## List of channels to loop through
    ch = [
        (518, 'Eleven Sports 1'),
        (519, 'Eleven Sports 2'),
        (532, "Eleven Sports 3"),
        (514, "Eleven Sports 4"),
        (515, "Eleven Sports 5"),
        (516, "Eleven Sports 6"),
        (10, "Eurosport 1"),
        (128, "Eurosport 2")
    ]

    progs = []
    for channelID, channelName in ch:
        url = f"https://www.nos.pt/particulares/televisao/guia-tv/Pages/channel.aspx?channel={channelID}"
        todayDate = datetime.now().date()
        req = requests.get(url)
        soup = BS(req.text, 'html.parser')

        ## Get first column (today's column)
        firstCol = soup.find(
            "div", attrs={'class': ["programs-day-list", "active-day"]})
        boxes = firstCol.find_all('li')
        for i, li in enumerate(boxes):
            tba = {}
            ## Channel
            tba['Channel'] = channelName
            ## ProgrammeName
            tba['ProgrammeName'] = li.a['title']
            ## Start & End
            seText = li.find('span', attrs={'class': 'duration'}).text.strip()
            for punc in ["\r", "\n", " "]:
                seText = seText.replace(punc, "")
            start, end = seText.split("-")
            startT = datetime.strptime(start, "%H:%M").time()
            endT = datetime.strptime(end, "%H:%M").time()
            ## If the first start time is from yesterday, adjust for that
            if (i == 0) & (startT.hour > 12):
                startDT = datetime.combine(date=todayDate - timedelta(days=1),
                                           time=startT)
            else:
                startDT = datetime.combine(date=todayDate, time=startT)
            endDT = datetime.combine(date=todayDate, time=endT)
            tba['StartLocal'] = startDT
            tba['EndLocal'] = endDT
            tba['StartUTC'] = por_tz.localize(
                tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
            tba['EndUTC'] = por_tz.localize(
                tba['EndLocal']).astimezone(utc_tz).replace(tzinfo=None)
            progs.append(tba)

    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "PortugalTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_setantaeurasia():
    utc_tz = pytz.timezone("utc")
    ## TV Guide is in UTC+2 (Sofia is used to represent that)
    eet_tz = pytz.timezone("Europe/Sofia")

    tomorrowDT = datetime.now() + timedelta(days=1)
    tomorrowString = datetime.strftime(tomorrowDT, "%a-%b-%d").lower()

    url = "https://www.setantaeurasia.com/en/tv-listings/"
    req = requests.get(url)
    soup = BS(req.text, 'html.parser')

    ccs = [("setantasports1", "Setanta Sports 1"),
           ("setantasports2", "Setanta Sports 2")]
    dfs = {}
    for code, clean in ccs:
        ## Get channel's panel
        panel = soup.find('div', id=code)
        ## Get tomorrow's tab
        tt = panel.find('div', id=f"tab-{tomorrowString}")
        ## Get all the progs
        progList = tt.find_all('li', attrs={'class': 'event-detail-list__item'})

        progs = []
        for p in progList:
            tba = {}
            ## Channel
            tba['Channel'] = clean
            ## Start
            startStr = p.find('div', attrs={'class': 'event-time'})['datetime']
            startDT = datetime.strptime(startStr, "%Y-%m-%dT%H:%M")
            tba['StartUTC'] = eet_tz.localize(startDT).astimezone(
                utc_tz).replace(tzinfo=None)
            ## ProgrammeName
            pnList = []
            leagueNameElement = p.find('h3',
                                       attrs={'class': 'event-league-name'})
            if leagueNameElement is not None:
                leagueName = leagueNameElement.text.strip()
                if len(leagueName) > 0:
                    pnList.append(leagueName)
            eventNameElement = p.find('h4', attrs={'class': 'event-name'})
            if eventNameElement is not None:
                eventName = eventNameElement.text.strip()
                if len(eventName) > 0:
                    pnList.append(eventName)
            tba['ProgrammeName'] = " - ".join(pnList)
            ## Description
            tba['Description'] = p.find('p', attrs={
                'class': 'event-description'
            }).text.strip()
            progs.append(tba)

        df = pd.DataFrame(progs)
        endDate = datetime.combine(date=tomorrowDT + timedelta(days=1),
                                   time=time(hour=0))
        endUTCEnd = eet_tz.localize(endDate).astimezone(utc_tz).replace(
            tzinfo=None)
        df['EndUTC'] = df['StartUTC'].to_list()[1:] + [endUTCEnd]
        dfs[code] = df

    DF = pd.concat(dfs.values(), ignore_index=True)

    columnDict = {
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        'Description': 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "SetantaEurasiaTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_switzerland():
    swTZ = pytz.timezone("Europe/Zurich")

    ## Get channel IDs
    r0 = requests.get(
        url=
        "https://obo-prod.oesp.upctv.ch/oesp/v4/CH/eng/web/channels?byLocationId=100&includeInvisible=true&personalised=false&sort=channelNumber&withStationResolutions=SD%2CHD"
    )
    channelIDLookup = {
        ch['title']: ch['id'].replace(":100-", ":")
        for ch in r0.json()['channels']
    }
    channelIDLookupREV = {x: y for y, x in channelIDLookup.items()}
    channelsOfInterest = ["MySports One", "MySports One F"]
    channelIDs = [channelIDLookup[x] for x in channelsOfInterest]

    dateDT = datetime.now()
    dateStr = dateDT.strftime("%Y%m%d")

    dfList = []
    for i in range(1, 5):
        r = requests.get(
            url=
            f"https://obo-prod.oesp.upctv.ch/oesp/v4/CH/eng/web/programschedules/{dateStr}/{i}"
        )
        js = r.json()
        for entrySubsection in js['entries']:
            if entrySubsection['o'] in channelIDs:
                df0 = pd.DataFrame(entrySubsection['l'])
                df0['o'] = entrySubsection['o']
                dfList.append(df0)

    ## Concat all DFs
    DF_ = pd.concat(dfList, sort=False, ignore_index=True)
    # DF0 = DF_.drop_duplicates()
    DF0 = DF_.copy()

    ## Create df to upload to SQL
    DF = pd.DataFrame()
    DF['Channel'] = DF0.o.map(channelIDLookupREV)
    DF['ProgrammeName'] = DF0.t
    DF['StartLocal'] = DF0.s.apply(
        lambda x: datetime.fromtimestamp(x / 1000, swTZ))
    DF['StartUTC'] = DF0.s.apply(lambda x: datetime.utcfromtimestamp(x / 1000))
    DF['EndLocal'] = DF0.e.apply(
        lambda x: datetime.fromtimestamp(x / 1000, swTZ))
    DF['EndUTC'] = DF0.e.apply(lambda x: datetime.utcfromtimestamp(x / 1000))

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "SwitzerlandTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_jsports():
    ## Create dict of dicts to record all the minutes accounted for
    accountedMinutes = {}
    for i in range(1, 5):
        dicto = {(datetime.combine(date=datetime.now(), time=time(hour=0)) +
                  timedelta(minutes=j)).time(): False
                 for j in range(24 * 60)}
        accountedMinutes[i] = dicto

    utc_tz = pytz.timezone("utc")
    jp_tz = pytz.timezone("Asia/Tokyo")

    ## Get today's date in the right format
    dateToUse = datetime.now()
    dateString = dateToUse.strftime("%y%m%d")

    ## Send request
    req = requests.get(
        f"https://www.jsports.co.jp/program_guide/month/english/{dateString}")
    ## Get soup
    soup = BS(req.text, 'html.parser')
    tbody = soup.find('tbody')
    trs = tbody.find_all('tr')

    progs = []
    for I, tr in enumerate(trs):
        tds = [
            x for x in tr.find_all('td')
            if x.attrs['class'][0] in ["w-channel__item", "w-channel__item--now"]
        ]
        ## If no tds, skip to next tr
        if len(tds) == 0:
            continue
        ## If there are 4 <td> elements, there's one for each channel
        all4 = len(tds) == 4
        if all4:
            channelList = [f"J Sports {i}" for i in range(1, 5)]
        ## If there aren't, work out which the channels are
        else:
            geumc, earliestMin = getEarliestUnaccountedMinuteChannels(
                accountedMinutes)
            assert len(geumc) == len(tds)
            channelList = [f"J Sports {i}" for i in geumc]

        ## Get progs
        for i, td in enumerate(tds):
            tba = {}
            ## Starts and Ends
            try:
                ## 'pm0:00' is used rather than 'pm12:00', correct their mistake
                txt = td.p.text.replace("pm0:", "pm12:").replace("am0:", "am12:")
                T = datetime.strptime(txt, "%p%I:%M").time()
            except AttributeError:
                if I == 0:
                    T = time(hour=4)
                else:
                    raise ValueError("no time provided")
            dtu = dateToUse + timedelta(days=1) \
                if (T >= time(hour=0)) & (T <= time(hour=4)) \
                else dateToUse
            tba['StartLocal'] = datetime.combine(date=dtu, time=T)
            tba['StartUTC'] = jp_tz.localize(
                tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
            durationMins = int(td.attrs['rowspan'])
            tba['EndLocal'] = tba['StartLocal'] + timedelta(
                minutes=durationMins)
            tba['EndUTC'] = tba['StartUTC'] + timedelta(minutes=durationMins)
            ## Channel
            tba['Channel'] = channelList[i]
            channelNumber = int(channelList[i][-1])
            ## ProgrammeName
            tba['ProgrammeName'] = td.dd.text.strip()
            progs.append(tba)

            ## Mark this programme's minutes as accounted for
            T2 = (datetime.combine(date=datetime.now(), time=T) +
                  timedelta(minutes=durationMins)).time()
            for m in range(durationMins):
                accountedMin = (datetime.combine(date=datetime.now(), time=T) +
                                timedelta(minutes=m)).time()
                accountedMinutes[channelNumber][accountedMin] = True

    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "JSportsTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
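## `getEarliestUnaccountedMinuteChannels` is defined elsewhere in this repo.
## From its usage above (return the channels whose schedules still have gaps,
## plus the earliest such minute), a hedged sketch might look like this; the
## exact return order and tie-breaking are assumptions.
def getEarliestUnaccountedMinuteChannels(accountedMinutes):
    ## Earliest minute not yet covered by a programme, per channel
    earliestByChannel = {
        ch: min(t for t, covered in mins.items() if not covered)
        for ch, mins in accountedMinutes.items()
        if not all(mins.values())
    }
    ## Overall earliest uncovered minute and the channels that share it
    earliestMin = min(earliestByChannel.values())
    channels = sorted(ch for ch, t in earliestByChannel.items()
                      if t == earliestMin)
    return channels, earliestMin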
def scrape_ukraine():
    ### GETS YESTERDAY'S PROGRAMMING, NOT TODAY'S
    yesterdaysDate = (datetime.now() - timedelta(days=1)).date()
    utc_tz = pytz.timezone("utc")
    ukr_tz = pytz.timezone('Europe/Kiev')
    logging.info(f"yesterdaysDate: {yesterdaysDate}")

    days = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]
    translator = Translator()

    chans = [('Setanta Sports Ukraine', 1451)]
    for channel_name, channelID in chans:
        prog_url = f'https://tv.meta.ua/{days[yesterdaysDate.weekday()]}/'
        logging.info(f"prog_url: `{prog_url}`")
        r = requests.get(url=prog_url,
                         headers={'cookie': f'_chnls={channelID}'})
        logging.info(f"channel_name: {channel_name}")

        row_list = []
        soup = BS(r.text, 'html.parser')
        tableSoup = soup.find('table', attrs={'class': 'channel-inner-table'})
        for tr in tableSoup.find_all('tr'):
            ## Get all the divs
            divs = tr.find_all('div', attrs={'style': 'clear:both'})
            for div in divs:
                tba = {'Channel': channel_name}
                start_time_str = div.find(
                    'div', attrs={'class': ['ptime_a', 'ptime']}).text
                start_time_dt = datetime.strptime(start_time_str, "%H:%M")
                ## If before 6am, it's the next day's programming
                if start_time_dt.hour < 6:
                    xt = 1
                else:
                    xt = 0
                tba['StartLocal'] = datetime.combine(
                    date=yesterdaysDate + timedelta(days=xt),
                    time=start_time_dt.time())
                tba['StartUTC'] = ukr_tz.localize(
                    tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
                russian_progname = div.find(
                    'div', attrs={'style': 'display:table; _height:0; '}).text
                tba['ProgrammeName'] = translator.translate(
                    text=russian_progname, src='ru', dest='en').text
                row_list.append(tba)

        DF = pd.DataFrame(row_list).sort_values('StartLocal').reset_index(
            drop=True)

        last_dt_local = datetime.combine(
            date=yesterdaysDate + timedelta(days=1), time=time(hour=6))
        last_dt_utc = ukr_tz.localize(last_dt_local).astimezone(
            utc_tz).replace(tzinfo=None)
        ## Each programme ends when the next one starts; the last is capped at
        ## 6am the following day
        DF['EndLocal'] = DF.StartLocal.to_list()[1:] + [last_dt_local]
        DF['EndUTC'] = DF.StartUTC.to_list()[1:] + [last_dt_utc]

        columnDict = {
            "StartLocal": 'DateTime',
            "EndLocal": 'DateTime',
            "StartUTC": 'DateTime',
            "EndUTC": 'DateTime',
            "Channel": 'str',
            "ProgrammeName": 'str'
        }
        server = "nonDashboard"
        database = "WebScraping"
        sqlTableName = "UkraineTVGuide"
        primaryKeyColName = "RowID"

        insertQ = create_insert_query(DF, columnDict, sqlTableName)
        logging.info(f"insertQ: {insertQ}")
        run_sql_commmand(insertQ, server, database)
        removeDuplicatesQ = create_removeDuplicates_query(
            columnDict, sqlTableName, primaryKeyColName)
        run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_FoxSports():
    ## Get today's date in the right format
    todaysDate = datetime.strftime(datetime.now(), "%Y%m%d")

    channels = {
        ("Fox Sports", "Philippines"): "EPH1",
        ("Fox Sports 2", "Philippines"): "F2E1",
        ("Fox Sports 3", "Philippines"): "FM31",
        ("Fox Sports", "Malaysia"): "EML1",
        ("Fox Sports 2", "Malaysia"): "F2M1",
        ("Fox Sports 3", "Malaysia"): "FM31",
        ("Fox Sports", "Singapore"): "ESG1",
        ("Fox Sports 2", "Singapore"): "F2S1",
        ("Fox Sports 3", "Singapore"): "FM31",
        ("Star Sports", "China"): "SCN1",
        ("Star Sports2", "China"): "ECN1",
    }
    timezones = {
        "Philippines": "Asia/Manila",
        "Malaysia": "Asia/Kuala_Lumpur",
        "Singapore": "Asia/Singapore",
        "China": "Asia/Shanghai",
    }
    utc_tz = pytz.timezone("utc")

    dfs = {}
    for (channelName, country), channelCode in channels.items():
        loc_tz = pytz.timezone(timezones[country])
        reqURL = "https://tv.foxsportsasia.com/getEPG.php"
        reqParams = {
            "lang": "en",
            "channelCode": channelCode,
            "date": todaysDate
        }
        ## Make request and get response
        r = requests.get(reqURL, params=reqParams)
        js = r.json()[channelCode]
        ## Create pandas df from JSON
        channelDF = pd.DataFrame(js)
        ## Add channel name and country as columns
        channelDF['ChannelName'] = channelName
        channelDF['Country'] = country
        ## Compare `date` and `start_time` to make LocalStart
        channelDF['LocalStart'] = [
            datetime.combine(date=datetime.strptime(d, "%m-%d-%y").date(),
                             time=datetime.strptime(s, "%H:%M:%S").time())
            for d, s in zip(channelDF.date, channelDF.start_time)
        ]
        ## Use `duration` to make LocalEnd
        channelDF['LocalEnd'] = [
            ls + timedelta(
                seconds=time2secs(datetime.strptime(d, "%H:%M:%S").time()))
            for ls, d in zip(channelDF.LocalStart, channelDF.duration)
        ]
        ## Use `LocalStart` and `LocalEnd` to make UTCStart and UTCEnd
        channelDF['UTCStart'] = [
            loc_tz.localize(ls).astimezone(utc_tz).replace(tzinfo=None)
            for ls in channelDF.LocalStart
        ]
        channelDF['UTCEnd'] = [
            loc_tz.localize(le).astimezone(utc_tz).replace(tzinfo=None)
            for le in channelDF.LocalEnd
        ]
        ## Add to dict
        dfs[channelCode] = channelDF
        logging.info(f"channelName: {channelName}")
        logging.info(f"country: {country}")
        logging.info(f"rows: {len(channelDF)}")

    ## Concat dfs
    df = pd.concat(dfs.values(), ignore_index=True)
    logging.info(f"Total rows: {len(df)}")

    ## Remove the unused columns
    removeMes = ['date', 'start_time', 'duration', 'dow']
    for rem in removeMes:
        del df[rem]

    columnDict = {
        'channel_code': 'str',
        'sub_genre': 'str',
        'genre': 'str',
        'live': 'str',
        'programme': 'str',
        'matchup': 'str',
        'ChannelName': 'str',
        'Country': 'str',
        'LocalStart': 'DateTime',
        'LocalEnd': 'DateTime',
        'UTCStart': 'DateTime',
        'UTCEnd': 'DateTime'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "FoxSports"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(df, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
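## `time2secs` is defined elsewhere in this repo; above it converts the parsed
## `duration` field (a datetime.time such as 01:30:00) into seconds. A minimal
## sketch, assuming that behaviour:
def time2secs(t):
    ## Total number of seconds represented by a datetime.time object
    return t.hour * 3600 + t.minute * 60 + t.second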
def scrape_france():
    dt = datetime.now().date().strftime("%Y-%m-%d")
    chans = [
        'eurosport-1-5',
        'eurosport-2-63',
        'canalplus-decale-36',
        'c8-4',
        'cstar-28',
        'canalplus-2',
        'lequipe-204'
    ]

    dfs = {}
    for chan in chans:
        url = f"https://www.programme-tv.net/programme/chaine/{dt}/programme-{chan}.html"
        pyDt = datetime.strptime(dt, "%Y-%m-%d").date()
        req = requests.get(url)
        soup = BS(req.text, 'html.parser')

        channelName = soup.find('span',
                                attrs={'class': 'gridChannel-title'}).text
        progs = soup.find_all('div', attrs={'class': 'singleBroadcastCard'})

        starts_ = [
            None if x.find('div', attrs={'class': 'singleBroadcastCard-hour'}) is None
            else x.find('div', attrs={'class': 'singleBroadcastCard-hour'}).text.replace("\n", "").strip()
            for x in progs
        ]
        starts = [
            datetime.combine(pyDt,
                             time(hour=int(x.split("h")[0]),
                                  minute=int(x.split("h")[1])))
            for x in starts_
        ]

        titles = [
            None if x.find('a', attrs={'class': 'singleBroadcastCard-title'}) is None
            else x.find('a', attrs={'class': 'singleBroadcastCard-title'}).text.replace("\n", "").strip()
            for x in progs
        ]

        subtitles_ = [
            None if x.find('div', attrs={'class': 'singleBroadcastCard-subtitle'}) is None
            else x.find('div', attrs={'class': 'singleBroadcastCard-subtitle'}).text.replace("\n", "").strip()
            for x in progs
        ]
        ## Treat missing and empty subtitles the same way
        subtitles = [None if not x else x for x in subtitles_]

        genres_ = [
            None if x.find('div', attrs={'class': 'singleBroadcastCard-genre'}) is None
            else x.find('div', attrs={'class': 'singleBroadcastCard-genre'}).text.replace("\n", "").strip()
            for x in progs
        ]
        genres = [None if not x else x for x in genres_]

        durations_ = [
            None if x.find('span', attrs={'class': 'singleBroadcastCard-durationContent'}) is None
            else x.find('span', attrs={'class': 'singleBroadcastCard-durationContent'}).text.replace("\n", "").replace("min", "").strip()
            for x in progs
        ]
        ## Durations appear as "90", "2h" or "1h30"
        durations = [
            timedelta(minutes=int(x)) if x.isdigit()
            else timedelta(hours=int(x.split("h")[0])) if x[-1] == "h"
            else timedelta(hours=int(x.split("h")[0]),
                           minutes=int(x.split("h")[1]))
            for x in durations_
        ]

        ends = [x + y for x, y in zip(starts, durations)]

        df = pd.DataFrame({
            'Start': starts,
            'End': ends,
            'Title': titles,
            'Subtitle': subtitles,
            'Genre': genres
        })
        df['Channel'] = channelName
        dfs[f"{channelName}-{dt}"] = df

    DF = pd.concat(dfs.values(), ignore_index=True, sort=False)

    columnDict = {
        "Start": 'DateTime',
        "End": 'DateTime',
        "Title": 'str',
        "Subtitle": 'str',
        "Genre": 'str',
        "Channel": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "FranceTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_nz():
    utc_tz = pytz.timezone("utc")
    nz_tz = pytz.timezone("Pacific/Auckland")

    ## Scrape window: 9am today to 8:59am tomorrow, as epoch milliseconds
    today9am = int(
        datetime.combine(date=datetime.now(),
                         time=time(hour=9)).timestamp() * 1e3)
    tomorrow859am = int(
        datetime.combine(date=datetime.now() + timedelta(days=1),
                         time=time(hour=8, minute=59)).timestamp() * 1e3)

    ## List of channels to scrape, case sensitive
    channels = [
        'SKY Sport Select',
        'SKY Sport 1',
        'SKY Sport 2',
        'SKY Sport 3',
        'SKY Sport 4',
        'SKY Sport 5',
        'SKY Sport 6',
        'SKY Sport 7',
        'SKY Sport 8',
        'SKY Sport 9'
    ]

    ## Get channel IDs
    channelURL = "https://static.sky.co.nz/sky/json/channels.prod.json"
    channelJS = requests.get(channelURL).json()
    channelIDdict = {
        int(x['number']): x['name']
        for x in channelJS if x['name'] in channels
    }
    channelIDs = list(channelIDdict.keys())

    ## Get programming
    url = f"https://web-epg.sky.co.nz/prod/epgs/v1?start={today9am}&end={tomorrow859am}&limit=20000"
    req = requests.get(url)
    relevantProgs = [
        x for x in req.json()['events'] if x['channelNumber'] in channelIDs
    ]

    progs = []
    for rp in relevantProgs:
        tba = {}
        ## Start & End
        startUTC = datetime.utcfromtimestamp(int(rp['start']) / 1000)
        endUTC = datetime.utcfromtimestamp(int(rp['end']) / 1000)
        tba['StartLocal'] = utc_tz.localize(startUTC).astimezone(
            nz_tz).replace(tzinfo=None)
        tba['StartUTC'] = startUTC
        tba['EndLocal'] = utc_tz.localize(endUTC).astimezone(
            nz_tz).replace(tzinfo=None)
        tba['EndUTC'] = endUTC
        ## ProgrammeName
        tba['ProgrammeName'] = rp['title']
        ## Description
        tba['Description'] = rp['synopsis']
        ## Channel
        tba['Channel'] = channelIDdict[rp['channelNumber']]
        progs.append(tba)

    DF = pd.DataFrame(progs).sort_values('StartUTC').reset_index(drop=True)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        'Description': 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "NewZealandTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_mexico(dateStr=None):
    ## Get the system's timezone
    system_tz = get_localzone()
    utc_tz = pytz.timezone("utc")
    mx_tz = pytz.timezone("America/Mexico_City")

    if dateStr is None:
        dateDT = datetime.now()
        dateStr = dateDT.strftime("%Y-%m-%d")
    else:
        dateDT = datetime.strptime(dateStr, "%Y-%m-%d")
    logging.info(f"dateStr: {dateStr}")

    ccs = [("claro_sports", "Claro Sports")]
    C = ["tbl_EPG_row", "tbl_EPG_rowAlternate"]

    progs = []
    for code, clean in ccs:
        url = f"https://www.gatotv.com/canal/{code}/{dateStr}"
        logging.info(f"url: {url}")
        req = requests.get(url)
        soup = BS(req.text, 'html.parser')

        ## Get EPG table
        epgTable = soup.find('table', attrs={'class': 'tbl_EPG'})
        ## Get programme rows
        progRows0 = [x for x in epgTable.find_all('tr') if 'class' in x.attrs]
        progRows = [x for x in progRows0 if x.attrs['class'][0] in C]

        for pr in progRows:
            tba = {}
            ## Start & End
            startDT, endDT = [
                datetime.combine(date=dateDT,
                                 time=datetime.strptime(x.text.strip(),
                                                        "%H:%M").time())
                for x in pr.find_all('div')[:2]
            ]
            tba['StartLocal'] = system_tz.localize(startDT).astimezone(
                mx_tz).replace(tzinfo=None)
            tba['StartUTC'] = system_tz.localize(startDT).astimezone(
                utc_tz).replace(tzinfo=None)
            tba['EndLocal'] = system_tz.localize(endDT).astimezone(
                mx_tz).replace(tzinfo=None)
            tba['EndUTC'] = system_tz.localize(endDT).astimezone(
                utc_tz).replace(tzinfo=None)
            ## ProgrammeName
            tba['ProgrammeName'] = pr.find('div', attrs={
                'class': "div_program_title_on_channel"
            }).text.strip()
            ## Channel
            tba['Channel'] = clean
            progs.append(tba)

    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "MexicoTVGuideFromAzure"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)
    run_sql_commmand(insertQ, server, database)
    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)
    run_sql_commmand(removeDuplicatesQ, server, database)
def scrape_topX(X):
    if X is None:
        X = 100
    elif isinstance(X, str):
        ## Coerce string input to an integer so it is safe to interpolate below
        X = int(X)

    ## Get top X rows
    Q = f"""
    SELECT TOP {X}
        O.RowID, O.Master_PostID, O.PlatformName, O.VideoViews,
        O.DateScraped, O.ScrapeCount, M.Article_Timestamp, T.MediaType, T.Link
    FROM ObservedSocialVideoViews O
    LEFT JOIN MASTER_ClientArticles M
        ON M.Master_PostID = O.Master_PostID
    LEFT JOIN [Toolkit_SourceTable_CrowdTangle_Octagon] T
        ON M.SourceTable_LocalID = T.ImportId
    WHERE O.RowID NOT IN (SELECT RowID FROM VideoViewsFailures)
        AND O.VideoViews IS NULL
    ORDER BY M.Article_Timestamp ASC
    """
    initialTopXdf = get_df_from_sqlQuery(sqlQuery=Q,
                                         database="GlobalMultimedia")

    ## Start session
    userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    with requests.Session() as s:
        ## Login to Instagram
        insta_login(session=s, userAgent=userAgent)

        ## Loop through the rows, scrape and update the table
        for i in initialTopXdf.index:
            logging.info(f"RowID: {initialTopXdf.loc[i,'RowID']}")
            scrapeSuccess = False
            if "www.facebook.com" in initialTopXdf.loc[i, "Link"]:
                videoViews = fb_get_video_views(
                    session=s,
                    URL=initialTopXdf.loc[i, "Link"],
                    rowID=initialTopXdf.loc[i, 'RowID'],
                    userAgent=userAgent)
                scrapeSuccess = isinstance(videoViews, int)
                ## If first attempt didn't work, try again with a different URL
                if not scrapeSuccess:
                    vID = get_fb_vID(initialTopXdf.loc[i, "Link"])
                    urlAttempt2 = f"https://www.facebook.com/watch/?v={vID}"
                    videoViews = fb_get_video_views(
                        session=s,
                        URL=urlAttempt2,
                        rowID=initialTopXdf.loc[i, 'RowID'],
                        userAgent=userAgent)
                    scrapeSuccess = isinstance(videoViews, int)
                    if not scrapeSuccess:
                        add_rowID_to_vvf(initialTopXdf.loc[i, 'RowID'])
            elif "www.instagram.com" in initialTopXdf.loc[i, "Link"]:
                videoViews = insta_get_video_views(
                    session=s,
                    URL=initialTopXdf.loc[i, "Link"],
                    mediaType=initialTopXdf.loc[i, "MediaType"],
                    rowID=initialTopXdf.loc[i, 'RowID'],
                    UA=userAgent)
                scrapeSuccess = isinstance(videoViews, int)
            else:
                logging.info(
                    f"Neither FB nor Instagram: {initialTopXdf.loc[i,'Link']}")

            if scrapeSuccess:
                dateScraped = datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S.%f")[:-3]
                ## Update the SQL table
                uC = f"""
                UPDATE ObservedSocialVideoViews
                SET VideoViews = {videoViews},
                    DateScraped = '{dateScraped}',
                    ScrapeCount = 1
                WHERE Master_PostID = '{initialTopXdf.loc[i,"Master_PostID"]}'
                """
                run_sql_commmand(query=uC,
                                 database="GlobalMultimedia",
                                 server="nonDashboard")
                logging.info("SUCCESS")
            else:
                logging.info("FAILURE")
def add_rowID_to_vvf(rowID):
    Q = f"INSERT INTO VideoViewsFailures ([RowID]) VALUES ({rowID})"
    run_sql_commmand(query=Q,
                     server="nonDashboard",
                     database="GlobalMultimedia")
    logging.info(f"{rowID} added to VideoViewsFailures")