Example 1
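All of these snippets come from the same scraping module and lean on a shared set of imports plus project-specific helpers (create_insert_query, run_sql_commmand, create_removeDuplicates_query, and a few small utilities sketched alongside the examples that use them) whose definitions are not shown here. A minimal import preamble that would let the functions run, assuming BS is BeautifulSoup 4, Translator comes from googletrans and get_localzone from tzlocal:

import logging

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime, time, timedelta
from googletrans import Translator  ## used by scrape_ukraine
from tzlocal import get_localzone   ## used by scrape_mexico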
def scrape_netherlands():
    utc_tz = pytz.timezone("utc")
    nl_tz = pytz.timezone("Europe/Amsterdam")

    ## Get tomorrow's date in the right format (for some reason today's doesn't work)
    dateToUse = datetime.now() + timedelta(days=1)
    dateString = dateToUse.strftime("%d-%m-%Y")

    ## Dict of TV channels
    tvChannels = {"ziggosportracing": "Ziggo Sport Racing"}

    UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"

    ## Empty dict for dfs
    dfs = {}

    for code, chan in tvChannels.items():
        ## Make request and get soup
        req = requests.get(f"https://www.tvgids.nl/gids/{dateString}/{code}",
                           headers={'user-agent': UA})
        soup = BS(req.text, "html.parser")
        ## Get block containing relevant information
        infoBlock = soup.find("div", attrs={'class': "guide__guide-container"})
        ## Get all the programme tags
        progTags = [
            x for x in infoBlock.find_all('a')
            if x.attrs['class'] == ['no-link', 'program', 'program--guide']
        ]
        progs = []
        for pt in progTags:
            try:
                tba = {}
                ## Channel
                tba['Channel'] = chan
                ## Start time
                tba['StartTime'] = datetime.strptime(
                    pt.find('div', attrs={
                        'class': 'program__starttime'
                    }).text.strip(), "%H:%M").time()
                ## Programme Name
                tba['ProgrammeName'] = pt.find('h3',
                                               attrs={
                                                   'class': 'program__title'
                                               }).text.strip()
                ## Description
                tba['Description'] = pt.find('p',
                                             attrs={
                                                 'class': 'program__text'
                                             }).text.strip()
                progs.append(tba)
            except AttributeError:
                pass

        ## Some progs are from the day before/after `dateString` so
        ##    we need to work out which ones those are
        startTimes = [x['StartTime'] for x in progs]
        AMsPMs = [ampm(x) for x in startTimes]
        daysDiff = []
        for i, ap in enumerate(AMsPMs):
            if i == 0:
                if ap == "am":
                    daysDiff.append(0)
                elif ap == "pm":
                    daysDiff.append(-1)
            else:
                if ap == "am":
                    if AMsPMs[i - 1] == "am":
                        ## AM->AM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])
                    elif AMsPMs[i - 1] == "pm":
                        ## PM->AM, next day, so plus one from last prog
                        daysDiff.append(daysDiff[i - 1] + 1)
                elif ap == "pm":
                    if AMsPMs[i - 1] == "am":
                        ## AM->PM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])
                    elif AMsPMs[i - 1] == "pm":
                        ## PM->PM, no day change, so same as last prog
                        daysDiff.append(daysDiff[i - 1])
        for i, dicto in enumerate(progs):
            ## Set local time
            dicto['StartLocal'] = datetime.combine(date=dateToUse +
                                                   timedelta(days=daysDiff[i]),
                                                   time=dicto['StartTime'])
            ## Set UTC time
            dicto['StartUTC'] = nl_tz.localize(
                dicto['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
        ## Create df from list
        df = pd.DataFrame(progs)
        del df['StartTime']
        ## Add EndLocal and EndUTC columns
        endLocalEnd = datetime.combine(date=dateToUse + timedelta(days=1),
                                       time=time(hour=6))
        df['EndLocal'] = df['StartLocal'].to_list()[1:] + [endLocalEnd]
        endUTCEnd = nl_tz.localize(endLocalEnd).astimezone(utc_tz).replace(
            tzinfo=None)
        df['EndUTC'] = df['StartUTC'].to_list()[1:] + [endUTCEnd]
        ## Only the rows of progs starting on `dateToUse` will be uploaded,
        ##    so find those rows
        todayDD = [i for i, x in enumerate(daysDiff) if x == 0]
        minIndex = min(todayDD)
        maxIndex = max(todayDD)
        toSQLdf = df[(df.index >= minIndex)
                     & (df.index <= maxIndex)].reset_index(drop=True)
        ## Add to dict
        dfs[code] = toSQLdf

    DF = pd.concat(dfs.values(), ignore_index=True)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        "Description": 'str',
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "NetherlandsTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
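The ampm helper called above is not defined in the snippet. Since its output is compared against "am" and "pm", a minimal sketch (an assumption, not the original implementation) would be:

def ampm(t):
    ## Hypothetical reconstruction: classify a datetime.time as
    ## "am" (before noon) or "pm" (noon onwards)
    return "am" if t.hour < 12 else "pm"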
Example 2
def scrape_southafrica():
    todayDate = datetime.now().date()
    utc_tz = pytz.timezone("utc")
    sa_tz = pytz.timezone("Africa/Johannesburg")

    chans = [('SABC Sport', 'SABC%20Sport')]

    for channel_name, channel_code in chans:

        r = requests.get(f'https://tvguide.etv.co.za/guide/{channel_code}')

        soup = BS(r.text, 'html.parser')

        row_list = []

        for progBoxTable in soup.find_all('table')[:3]:
            for pb in progBoxTable.find_all('tr'):
                tba = {'Channel': channel_name}
                tds = pb.find_all('td')
                ## Time
                tba['StartLocal'] = datetime.combine(date=todayDate,
                                                     time=datetime.strptime(
                                                         tds[0].text,
                                                         "%I:%M %p").time())
                tba['StartUTC'] = sa_tz.localize(
                    tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
                ## Programme
                if pb.b.text != 'Currently playing':
                    tba['ProgrammeName'] = pb.b.text
                else:
                    tba['ProgrammeName'] = pb.h3.text

                row_list.append(tba)

        DF = pd.DataFrame(row_list).sort_values('StartLocal').reset_index(
            drop=True)

        last_dt_local = datetime.combine(date=todayDate + timedelta(days=1),
                                         time=time(hour=0))
        last_dt_utc = sa_tz.localize(last_dt_local).astimezone(utc_tz).replace(
            tzinfo=None)

        ## Each programme ends when the next one starts; the last ends at midnight
        DF['EndLocal'] = DF.StartLocal.to_list()[1:] + [last_dt_local]
        DF['EndUTC'] = DF.StartUTC.to_list()[1:] + [last_dt_utc]

        columnDict = {
            "StartLocal": 'DateTime',
            "EndLocal": 'DateTime',
            "StartUTC": 'DateTime',
            "EndUTC": 'DateTime',
            "Channel": 'str',
            "ProgrammeName": 'str'
        }
        server = "nonDashboard"
        database = "WebScraping"
        sqlTableName = "SouthAfricaTVGuide"
        primaryKeyColName = "RowID"

        insertQ = create_insert_query(DF, columnDict, sqlTableName)
        logging.info(f"insertQ: {insertQ}")

        run_sql_commmand(insertQ, server, database)

        removeDuplicatesQ = create_removeDuplicates_query(
            columnDict, sqlTableName, primaryKeyColName)

        run_sql_commmand(removeDuplicatesQ, server, database)
Example 3
def scrape_setantaeurasia():
    utc_tz = pytz.timezone("utc")
    ## TV Guide is in UTC+2 (Sofia is used to represent that)
    eet_tz = pytz.timezone("Europe/Sofia")

    tomorrowDT = datetime.now() + timedelta(days=1)
    tomorrowString = datetime.strftime(tomorrowDT, "%a-%b-%d").lower()

    url = "https://www.setantaeurasia.com/en/tv-listings/"
    req = requests.get(url)
    soup = BS(req.text, 'html.parser')

    ccs = [("setantasports1", "Setanta Sports 1"),
           ("setantasports2", "Setanta Sports 2")]
    dfs = {}

    for code, clean in ccs:
        ## Get channel's panel
        panel = soup.find('div', id=code)
        ## Get tomorrow's tab
        tt = panel.find('div', id=f"tab-{tomorrowString}")
        ## Get all the progs
        progList = tt.find_all('li',
                               attrs={'class': 'event-detail-list__item'})
        progs = []
        for p in progList:
            tba = {}
            ## Channel
            tba['Channel'] = clean
            ## Start
            startStr = p.find('div', attrs={'class': 'event-time'})['datetime']
            startDT = datetime.strptime(startStr, "%Y-%m-%dT%H:%M")
            tba['StartUTC'] = eet_tz.localize(startDT).astimezone(
                utc_tz).replace(tzinfo=None)
            ## ProgrammeName
            pnList = []
            leagueNameElement = p.find('h3',
                                       attrs={'class': 'event-league-name'})
            if leagueNameElement is not None:
                leagueName = leagueNameElement.text.strip()
                if len(leagueName) > 0:
                    pnList.append(leagueName)
            eventNameElement = p.find('h4', attrs={'class': 'event-name'})
            if eventNameElement is not None:
                eventName = eventNameElement.text.strip()
                if len(eventName) > 0:
                    pnList.append(eventName)
            tba['ProgrammeName'] = " - ".join(pnList)
            ## Description
            tba['Description'] = p.find('p',
                                        attrs={
                                            'class': 'event-description'
                                        }).text.strip()

            progs.append(tba)

        df = pd.DataFrame(progs)

        endDate = datetime.combine(date=tomorrowDT + timedelta(days=1),
                                   time=time(hour=0))
        endUTCEnd = eet_tz.localize(endDate).astimezone(utc_tz).replace(
            tzinfo=None)
        df['EndUTC'] = df['StartUTC'].to_list()[1:] + [endUTCEnd]

        dfs[code] = df

    DF = pd.concat(dfs.values(), ignore_index=True)

    columnDict = {
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        'Description': 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "SetantaEurasiaTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
Example 4
def scrape_portugal():    
    utc_tz = pytz.timezone("utc")
    por_tz = pytz.timezone("Europe/Lisbon")
    ## List of channels to loop through
    ch = [
        (518,'Eleven Sports 1'),
        (519,'Eleven Sports 2'),
        (532,"Eleven Sports 3"),
        (514,"Eleven Sports 4"),
        (515,"Eleven Sports 5"),
        (516,"Eleven Sports 6"),
        (10,"Eurosport 1"),
        (128,"Eurosport 2")
    ]
    progs = []
    for channelID,channelName in ch:
        url = f"https://www.nos.pt/particulares/televisao/guia-tv/Pages/channel.aspx?channel={channelID}"
        todayDate = datetime.now().date()
        req = requests.get(url)
        soup = BS(req.text,'html.parser')

        ## Get first column (today's column)
        firstCol = soup.find("div",attrs={'class':["programs-day-list","active-day"]})
        boxes = firstCol.find_all('li')
        for i,li in enumerate(boxes):
            tba = {}
            ## Channel
            tba['Channel'] = channelName
            ## ProgrammeName
            tba['ProgrammeName'] = li.a['title']
            ## Start & End
            seText = li.find('span', attrs={'class': 'duration'}).text.strip()
            for punc in ["\r", "\n", " "]:
                seText = seText.replace(punc, "")
            start, end = seText.split("-")
            startT = datetime.strptime(start, "%H:%M").time()
            endT = datetime.strptime(end, "%H:%M").time()
            ## If the first start time is after noon, it belongs to yesterday
            if (i == 0) and (startT.hour > 12):
                startDT = datetime.combine(date=todayDate - timedelta(days=1),
                                           time=startT)
            else:
                startDT = datetime.combine(date=todayDate, time=startT)
            endDT = datetime.combine(date=todayDate, time=endT)
            tba['StartLocal'] = startDT
            tba['EndLocal'] = endDT
            tba['StartUTC'] = por_tz.localize(startDT).astimezone(utc_tz).replace(
                tzinfo=None)
            tba['EndUTC'] = por_tz.localize(endDT).astimezone(utc_tz).replace(
                tzinfo=None)
            progs.append(tba)
        
    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "PortugalTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
Example 5
def scrape_switzerland():
    swTZ = pytz.timezone("Europe/Zurich")

    ## Get channel IDs
    r0 = requests.get(
        url=
        "https://obo-prod.oesp.upctv.ch/oesp/v4/CH/eng/web/channels?byLocationId=100&includeInvisible=true&personalised=false&sort=channelNumber&withStationResolutions=SD%2CHD"
    )
    channelIDLookup = {
        ch['title']: ch['id'].replace(":100-", ":")
        for ch in r0.json()['channels']
    }
    channelIDLookupREV = {x: y for y, x in channelIDLookup.items()}

    channelsOfInterest = ["MySports One", "MySports One F"]

    channelIDs = [channelIDLookup[x] for x in channelsOfInterest]

    dateDT = datetime.now()

    dateStr = dateDT.strftime("%Y%m%d")

    dfList = []
    for i in range(1, 5):
        r = requests.get(
            url=
            f"https://obo-prod.oesp.upctv.ch/oesp/v4/CH/eng/web/programschedules/{dateStr}/{i}"
        )

        js = r.json()
        for entrySubsection in js['entries']:
            if entrySubsection['o'] in channelIDs:
                df0 = pd.DataFrame(entrySubsection['l'])
                df0['o'] = entrySubsection['o']
                dfList.append(df0)

    ## Concat all DFs
    DF_ = pd.concat(dfList, sort=False, ignore_index=True)
    # DF0 = DF_.drop_duplicates()
    DF0 = DF_.copy()
    ## Create df to upload to SQL
    DF = pd.DataFrame()
    DF['Channel'] = DF0.o.map(channelIDLookupREV)
    DF['ProgrammeName'] = DF0.t
    DF['StartLocal'] = DF0.s.apply(
        lambda x: datetime.fromtimestamp(x / 1000, swTZ))
    DF['StartUTC'] = DF0.s.apply(lambda x: datetime.utcfromtimestamp(x / 1000))
    DF['EndLocal'] = DF0.e.apply(
        lambda x: datetime.fromtimestamp(x / 1000, swTZ))
    DF['EndUTC'] = DF0.e.apply(lambda x: datetime.utcfromtimestamp(x / 1000))

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "SwitzerlandTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
Example 6
def scrape_jsports():
    ## Create dict of dicts to record all the minutes accounted for
    accountedMinutes = {}
    for i in range(1, 5):
        dicto = {(datetime.combine(date=datetime.now(), time=time(hour=0)) +
                  timedelta(minutes=j)).time(): False
                 for j in range(24 * 60)}
        accountedMinutes[i] = dicto

    utc_tz = pytz.timezone("utc")
    jp_tz = pytz.timezone("Asia/Tokyo")

    ## Get today's date in the right format
    dateToUse = datetime.now()
    dateString = dateToUse.strftime("%y%m%d")

    ## Send request
    req = requests.get(
        f"https://www.jsports.co.jp/program_guide/month/english/{dateString}")
    ## Get soup
    soup = BS(req.text, 'html.parser')
    tbody = soup.find('tbody')
    trs = tbody.find_all('tr')
    progs = []

    for I, tr in enumerate(trs):
        tds = [
            x for x in tr.find_all('td')
            if x.attrs['class'][0] in ["w-channel__item", "w-channel__item--now"]
        ]

        ## If no tds, skip to next tr
        if len(tds) == 0:
            continue

        ## If there are 4 <td> elements, there's one for each channel
        all4 = len(tds) == 4
        if all4:
            channelList = [f"J Sports {i}" for i in range(1, 5)]
        ## If there aren't, work out which the channels are
        else:
            geumc, earliestMin = getEarliestUnaccountedMinuteChannels(
                accountedMinutes)
            assert len(geumc) == len(tds)
            channelList = [f"J Sports {i}" for i in geumc]
        ## Get progs
        for i, td in enumerate(tds):
            tba = {}
            ## Starts and Ends
            try:
                ## 'pm0:00' is used rather than 'pm12:00', correct their mistake
                txt = td.p.text.replace("pm0:",
                                        "pm12:").replace("am0:", "am12:")
                T = datetime.strptime(txt, "%p%I:%M").time()
            except AttributeError:
                if I == 0:
                    T = time(hour=4)
                else:
                    raise ValueError("no time provided")
            ## Programmes between midnight and 4am belong to the next day
            dtu = (dateToUse + timedelta(days=1)
                   if time(hour=0) <= T <= time(hour=4) else dateToUse)
            tba['StartLocal'] = datetime.combine(date=dtu, time=T)
            tba['StartUTC'] = jp_tz.localize(
                tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)
            durationMins = int(td.attrs['rowspan'])
            tba['EndLocal'] = tba['StartLocal'] + timedelta(
                minutes=durationMins)
            tba['EndUTC'] = tba['StartUTC'] + timedelta(minutes=durationMins)
            ## Channel
            tba['Channel'] = channelList[i]
            channelNumber = int(channelList[i][-1])
            ## ProgrammeName
            tba['ProgrammeName'] = td.dd.text.strip()

            progs.append(tba)

            ## Mark this programme's minutes as accounted for
            for m in range(durationMins):
                accountedMin = (datetime.combine(date=datetime.now(), time=T) +
                                timedelta(minutes=m)).time()
                accountedMinutes[channelNumber][accountedMin] = True

    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "JSportsTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
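getEarliestUnaccountedMinuteChannels is another helper whose definition is not shown. From the call site it must return the channel numbers whose earliest uncovered minute is the earliest overall, together with that minute; one plausible reconstruction, assuming the broadcast day runs from 04:00 to 03:59:

def getEarliestUnaccountedMinuteChannels(accountedMinutes):
    ## Hypothetical reconstruction: find each channel's first minute not yet
    ## covered by a programme, then return the channels sharing the overall
    ## earliest such minute, plus that minute
    def broadcast_order(t):
        ## Minutes before 04:00 belong to the tail of the broadcast day,
        ## so they sort after everything else
        return (t < time(hour=4), t)

    firsts = {}
    for chan, minutes in accountedMinutes.items():
        unaccounted = [m for m, covered in minutes.items() if not covered]
        if unaccounted:
            firsts[chan] = min(unaccounted, key=broadcast_order)
    earliestMin = min(firsts.values(), key=broadcast_order)
    return [c for c, m in firsts.items() if m == earliestMin], earliestMin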
Example 7
def scrape_FoxSports():
    ## Get today's date in the right format
    todaysDate = datetime.strftime(datetime.now(), "%Y%m%d")

    channels = {
        ("Fox Sports", "Philippines"): "EPH1",
        ("Fox Sports 2", "Philippines"): "F2E1",
        ("Fox Sports 3", "Philippines"): "FM31",
        ("Fox Sports", "Malaysia"): "EML1",
        ("Fox Sports 2", "Malaysia"): "F2M1",
        ("Fox Sports 3", "Malaysia"): "FM31",
        ("Fox Sports", "Singapore"): "ESG1",
        ("Fox Sports 2", "Singapore"): "F2S1",
        ("Fox Sports 3", "Singapore"): "FM31",
        ("Star Sports", "China"): "SCN1",
        ("Star Sports2", "China"): "ECN1",
    }

    timezones = {
        "Philippines": "Asia/Manila",
        "Malaysia": "Asia/Kuala_Lumpur",
        "Singapore": "Asia/Singapore",
        "China": "Asia/Shanghai",
    }
    utc_tz = pytz.timezone("utc")

    dfs = {}

    for (channelName, country), channelCode in channels.items():
        loc_tz = pytz.timezone(timezones[country])
        reqURL = "https://tv.foxsportsasia.com/getEPG.php"
        reqParams = {
            "lang": "en",
            "channelCode": channelCode,
            "date": todaysDate
        }
        ## Make request and get response
        r = requests.get(reqURL, params=reqParams)
        js = r.json()[channelCode]
        ## Create pandas df from JSON
        channelDF = pd.DataFrame(js)
        ## Add channel name and country as columns
        channelDF['ChannelName'] = channelName
        channelDF['Country'] = country

        ## Compare `date` and `start_time` to make LocalStart
        channelDF['LocalStart'] = [
            datetime.combine(date=datetime.strptime(d, "%m-%d-%y").date(),
                             time=datetime.strptime(s, "%H:%M:%S").time())
            for d, s in zip(channelDF.date, channelDF.start_time)
        ]
        ## Use `duration` to make LocalEnd
        channelDF['LocalEnd'] = [
            ls + timedelta(
                seconds=time2secs(datetime.strptime(d, "%H:%M:%S").time()))
            for ls, d in zip(channelDF.LocalStart, channelDF.duration)
        ]
        ## Use `LocalStart` and `LocalEnd` to make UTCStart and UTCEnd
        channelDF['UTCStart'] = [
            loc_tz.localize(ls).astimezone(utc_tz).replace(tzinfo=None)
            for ls in channelDF.LocalStart
        ]
        channelDF['UTCEnd'] = [
            loc_tz.localize(le).astimezone(utc_tz).replace(tzinfo=None)
            for le in channelDF.LocalEnd
        ]
        ## Add to dict
        dfs[channelCode] = channelDF
        logging.info(f"channelName: {channelName}")
        logging.info(f"country: {country}")
        logging.info(f"rows: {len(channelDF)}")

    ## Concat dfs
    df = pd.concat(dfs.values(), ignore_index=True)
    logging.info(f"Total rows: {len(df)}")
    ## Remove the unused columns
    removeMes = ['date', 'start_time', 'duration', 'dow']
    for rem in removeMes:
        del df[rem]

    columnDict = {
        'channel_code': 'str',
        'sub_genre': 'str',
        'genre': 'str',
        'live': 'str',
        'programme': 'str',
        'matchup': 'str',
        'ChannelName': 'str',
        'Country': 'str',
        'LocalStart': 'DateTime',
        'LocalEnd': 'DateTime',
        'UTCStart': 'DateTime',
        'UTCEnd': 'DateTime'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "FoxSports"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(df, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
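time2secs above turns a duration that was parsed as a datetime.time into a number of seconds. It is not defined in the snippet, but the obvious implementation is:

def time2secs(t):
    ## Convert a datetime.time used as a duration into total seconds,
    ## e.g. 01:30:00 -> 5400
    return t.hour * 3600 + t.minute * 60 + t.second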
Example 8
def scrape_ukraine():

    ### GETS YESTERDAY'S PROGRAMMING, NOT TODAY'S

    yesterdaysDate = (datetime.now() - timedelta(days=1)).date()
    utc_tz = pytz.timezone("utc")
    ukr_tz = pytz.timezone('Europe/Kiev')

    logging.info(f"yesterdaysDate: {yesterdaysDate}")

    days = [
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    ]

    translator = Translator()

    chans = [
        ('Setanta Sports Ukraine',1451)
    ]

    for channel_name, channelID in chans:
        prog_url = f'https://tv.meta.ua/{days[yesterdaysDate.weekday()]}/'
        logging.info(f"prog_url:`{prog_url}`")
        r = requests.get(url=prog_url,
                         headers={'cookie': f'_chnls={channelID}'})
        logging.info(f"channel_name: {channel_name}")
        row_list = []
        
        soup = BS(r.text, 'html.parser')
        tableSoup = soup.find('table', attrs={'class': 'channel-inner-table'})
        for tr in tableSoup.find_all('tr'):
            ## Get all the programme divs
            divs = tr.find_all('div', attrs={'style': 'clear:both'})
            for div in divs:
                tba = {'Channel': channel_name}
                start_time_str = div.find(
                    'div', attrs={'class': ['ptime_a', 'ptime']}).text
                start_time_dt = datetime.strptime(start_time_str, "%H:%M")
                ## If before 6am, it's the next day's programming
                xt = 1 if start_time_dt.hour < 6 else 0
                tba['StartLocal'] = datetime.combine(
                    date=yesterdaysDate + timedelta(days=xt),
                    time=start_time_dt.time())
                tba['StartUTC'] = ukr_tz.localize(
                    tba['StartLocal']).astimezone(utc_tz).replace(tzinfo=None)

                russian_progname = div.find(
                    'div', attrs={'style': 'display:table; _height:0; '}).text
                ## Programme names are in Russian; translate them to English
                tba['ProgrammeName'] = translator.translate(
                    text=russian_progname, src='ru', dest='en').text

                row_list.append(tba)
                
        DF = pd.DataFrame(row_list).sort_values('StartLocal').reset_index(
            drop=True)

        last_dt_local = datetime.combine(date=yesterdaysDate + timedelta(days=1),
                                         time=time(hour=6))
        last_dt_utc = ukr_tz.localize(last_dt_local).astimezone(utc_tz).replace(
            tzinfo=None)

        ## Each programme ends when the next one starts; the last ends at 6am
        DF['EndLocal'] = DF.StartLocal.to_list()[1:] + [last_dt_local]
        DF['EndUTC'] = DF.StartUTC.to_list()[1:] + [last_dt_utc]

        columnDict = {
            "StartLocal": 'DateTime',
            "EndLocal": 'DateTime',
            "StartUTC": 'DateTime',
            "EndUTC": 'DateTime',
            "Channel": 'str',
            "ProgrammeName": 'str'
        }
        server = "nonDashboard"
        database = "WebScraping"
        sqlTableName = "UkraineTVGuide"
        primaryKeyColName = "RowID"

        insertQ = create_insert_query(DF, columnDict, sqlTableName)
        logging.info(f"insertQ: {insertQ}")

        run_sql_commmand(insertQ, server, database)

        removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                          primaryKeyColName)

        run_sql_commmand(removeDuplicatesQ, server, database)
Example 9
def scrape_france():

    dt = datetime.now().date().strftime("%Y-%m-%d")

    chans = ['eurosport-1-5',
             'eurosport-2-63',
             'canalplus-decale-36',
             'c8-4',
             'cstar-28',
             'canalplus-2',
             'lequipe-204']

    dfs = {}

    for chan in chans:
            
        url = f"https://www.programme-tv.net/programme/chaine/{dt}/programme-{chan}.html"
        
        pyDt = datetime.strptime(dt, "%Y-%m-%d").date()
        
        req = requests.get(url)
        
        soup = BS(req.text, 'html.parser')
        
        channelName = soup.find('span', attrs={'class': 'gridChannel-title'}).text

        progs = soup.find_all('div', attrs={'class': 'singleBroadcastCard'})

        def card_text(card, tag, className):
            ## Return the stripped text of the first matching element,
            ## or None if the element is missing or empty
            el = card.find(tag, attrs={'class': className})
            if el is None:
                return None
            txt = el.text.replace("\n", "").strip()
            return txt if txt else None

        ## Start times look like "20h45"
        starts_ = [card_text(x, 'div', 'singleBroadcastCard-hour') for x in progs]
        starts = [datetime.combine(pyDt, time(hour=int(x.split("h")[0]),
                                              minute=int(x.split("h")[1])))
                  for x in starts_]

        titles = [card_text(x, 'a', 'singleBroadcastCard-title') for x in progs]
        subtitles = [card_text(x, 'div', 'singleBroadcastCard-subtitle') for x in progs]
        genres = [card_text(x, 'div', 'singleBroadcastCard-genre') for x in progs]

        ## Durations look like "90" (minutes), "2h" or "1h30" once "min" is stripped
        durations_ = [None if d is None else d.replace("min", "").strip()
                      for d in (card_text(x, 'span',
                                          'singleBroadcastCard-durationContent')
                                for x in progs)]
        durations = [timedelta(minutes=int(x)) if x.isdigit()
                     else timedelta(hours=int(x.split("h")[0])) if x.endswith("h")
                     else timedelta(hours=int(x.split("h")[0]),
                                    minutes=int(x.split("h")[1]))
                     for x in durations_]

        ends = [x + y for x, y in zip(starts, durations)]
        
        
        df = pd.DataFrame({'Start': starts,
                           'End': ends,
                           'Title': titles,
                           'Subtitle': subtitles,
                           'Genre': genres})
        df['Channel'] = channelName

        dfs[f"{channelName}-{dt}"] = df
        
        
    DF = pd.concat(dfs.values(), ignore_index=True, sort=False)

    columnDict = {
        "Start": 'DateTime',
        "End": 'DateTime',
        "Title": 'str',
        "Subtitle": 'str',
        "Genre": 'str',
        "Channel": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "FranceTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
Example 10
def scrape_nz():    
    utc_tz = pytz.timezone("utc")
    nz_tz = pytz.timezone("Pacific/Auckland")

    today9am = int(datetime.combine(date=datetime.now(),
                                    time=time(hour=9)).timestamp() * 1e3)
    tomorrow859am = int(datetime.combine(date=datetime.now() + timedelta(days=1),
                                         time=time(hour=8, minute=59)).timestamp() * 1e3)

    ## List of channels to scrape, case sensitive
    channels = [
        'SKY Sport Select',
        'SKY Sport 1',
        'SKY Sport 2',
        'SKY Sport 3',
        'SKY Sport 4',
        'SKY Sport 5',
        'SKY Sport 6',
        'SKY Sport 7',
        'SKY Sport 8',
        'SKY Sport 9',
    ]
    ## Get channel IDs
    channelURL = "https://static.sky.co.nz/sky/json/channels.prod.json"
    channelJS = requests.get(channelURL).json()
    channelIDdict = {int(x['number']): x['name']
                     for x in channelJS
                     if x['name'] in channels}
    channelIDs = list(channelIDdict.keys())
    ## Get programming
    url = f"https://web-epg.sky.co.nz/prod/epgs/v1?start={today9am}&end={tomorrow859am}&limit=20000"
    req = requests.get(url)
    relevantProgs = [x for x in req.json()['events']
                     if x['channelNumber'] in channelIDs]
    progs = []
    for rp in relevantProgs:
        tba = {}
        ## Start & End
        startUTC = datetime.utcfromtimestamp(int(rp['start']) / 1000)
        endUTC = datetime.utcfromtimestamp(int(rp['end']) / 1000)
        tba['StartLocal'] = utc_tz.localize(startUTC).astimezone(nz_tz).replace(
            tzinfo=None)
        tba['StartUTC'] = startUTC
        tba['EndLocal'] = utc_tz.localize(endUTC).astimezone(nz_tz).replace(
            tzinfo=None)
        tba['EndUTC'] = endUTC
        ## ProgrammeName
        tba['ProgrammeName'] = rp['title']
        ## Description
        tba['Description'] = rp['synopsis']
        ## Channel
        tba['Channel'] = channelIDdict[rp['channelNumber']]

        progs.append(tba)
        
    DF = pd.DataFrame(progs).sort_values('StartUTC').reset_index(drop=True)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str',
        'Description': 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "NewZealandTVGuide"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
Example 11
def scrape_mexico(dateStr=None):
    ## Get the system's timezone
    system_tz = get_localzone()
    utc_tz = pytz.timezone("utc")
    mx_tz = pytz.timezone("America/Mexico_City")
    if dateStr is None:
        dateDT = datetime.now()
        dateStr = dateDT.strftime("%Y-%m-%d")
    else:
        dateDT = datetime.strptime(dateStr, "%Y-%m-%d")
    logging.info(f"dateStr: {dateStr}")
    ccs = [("claro_sports", "Claro Sports")]

    C = ["tbl_EPG_row", "tbl_EPG_rowAlternate"]
    progs = []
    for code, clean in ccs:

        url = f"https://www.gatotv.com/canal/{code}/{dateStr}"
        logging.info(f"url: {url}")
        req = requests.get(url)
        soup = BS(req.text, 'html.parser')

        ## Get EPG table
        epgTable = soup.find('table', attrs={'class': 'tbl_EPG'})
        ## Get programme rows
        progRows0 = [x for x in epgTable.find_all('tr') if 'class' in x.attrs]
        progRows = [x for x in progRows0 if x.attrs['class'][0] in C]
        for pr in progRows:
            tba = {}
            ## Start & End
            startDT, endDT = [
                datetime.combine(date=dateDT,
                                 time=datetime.strptime(
                                     x.text.strip(), "%H:%M").time())
                for x in pr.find_all('div')[:2]
            ]

            tba['StartLocal'] = system_tz.localize(startDT).astimezone(
                mx_tz).replace(tzinfo=None)
            tba['StartUTC'] = system_tz.localize(startDT).astimezone(
                utc_tz).replace(tzinfo=None)
            tba['EndLocal'] = system_tz.localize(endDT).astimezone(
                mx_tz).replace(tzinfo=None)
            tba['EndUTC'] = system_tz.localize(endDT).astimezone(
                utc_tz).replace(tzinfo=None)
            ## ProgrammeName
            tba['ProgrammeName'] = pr.find('div',
                                           attrs={
                                               'class':
                                               "div_program_title_on_channel"
                                           }).text.strip()
            ## Channel
            tba['Channel'] = clean

            progs.append(tba)

    DF = pd.DataFrame(progs)

    columnDict = {
        "StartLocal": 'DateTime',
        "EndLocal": 'DateTime',
        "StartUTC": 'DateTime',
        "EndUTC": 'DateTime',
        "Channel": 'str',
        "ProgrammeName": 'str'
    }
    server = "nonDashboard"
    database = "WebScraping"
    sqlTableName = "MexicoTVGuideFromAzure"
    primaryKeyColName = "RowID"

    insertQ = create_insert_query(DF, columnDict, sqlTableName)

    run_sql_commmand(insertQ, server, database)

    removeDuplicatesQ = create_removeDuplicates_query(columnDict, sqlTableName,
                                                      primaryKeyColName)

    run_sql_commmand(removeDuplicatesQ, server, database)
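Every example funnels into the same three SQL helpers, whose definitions are not included. Their contracts are clear from the call sites: create_insert_query builds an INSERT covering the columns listed in columnDict, run_sql_commmand (the triple-m spelling is the project's own) executes a statement against the named server and database, and create_removeDuplicates_query drops exact duplicates while keeping the lowest primary key. A rough sketch, assuming a SQL Server backend reached via pyodbc (hypothetical, not the project's actual code):

import pyodbc

def run_sql_commmand(query, server, database):
    ## Execute one statement; the driver name and auth mode are assumptions
    conn_str = ("DRIVER={ODBC Driver 17 for SQL Server};"
                f"SERVER={server};DATABASE={database};Trusted_Connection=yes")
    with pyodbc.connect(conn_str) as conn:  ## commits on clean exit
        conn.cursor().execute(query)

def create_insert_query(df, columnDict, sqlTableName):
    ## Build one multi-row INSERT from the DataFrame columns named in
    ## columnDict (values are naively quoted; acceptable for a sketch only)
    cols = ", ".join(columnDict)
    rows = ", ".join(
        "(" + ", ".join(f"'{df.loc[i, c]}'" for c in columnDict) + ")"
        for i in df.index)
    return f"INSERT INTO {sqlTableName} ({cols}) VALUES {rows}"

def create_removeDuplicates_query(columnDict, sqlTableName, primaryKeyColName):
    ## Delete rows that duplicate every tracked column, keeping the row
    ## with the lowest primary key in each group
    cols = ", ".join(columnDict)
    return (f"WITH dupes AS (SELECT {primaryKeyColName}, ROW_NUMBER() OVER ("
            f"PARTITION BY {cols} ORDER BY {primaryKeyColName}) AS rn "
            f"FROM {sqlTableName}) "
            "DELETE FROM dupes WHERE rn > 1")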