import datetime as d
import json
import re
import sys

from bs4 import BeautifulSoup as bs

# `g` (a scraping helper exposing `g.soup`), `networkDictionary`, `root`,
# `rootKakaku`, `rootZoo`, and `japaneseDays` are defined elsewhere in this module.


def getStartLink(nativeTitle, startDate, channel):
    channelNumber = networkDictionary[channel]
    attempt = 1
    date = d.datetime.strftime(startDate, '%Y%m%d')
    while True:
        try:
            soup = g.soup(f"{rootKakaku}/tv/channel={channelNumber}/date={date}")
            suffixLink = soup.find('a', string=nativeTitle).find_next('a')['href']
            print(f'Found the show on {date}')
            break
        except AttributeError:
            # Accounts for possible date mis-entries on MDL caused by the 28-hour
            # clock used by Japanese TV: probe the neighbouring dates in the
            # order +1, -1, +2, -2 days.
            print(f'Cannot find the show on {date}')
            date = d.datetime.strftime(
                startDate + d.timedelta(days=-(-1)**attempt * (1 + (attempt > 2))),
                '%Y%m%d')
            attempt += 1
            if attempt > 4:
                print('Cannot find the show between dates {} - {}'.format(
                    d.datetime.strftime(startDate - d.timedelta(days=2), '%Y%m%d'),
                    d.datetime.strftime(startDate + d.timedelta(days=2), '%Y%m%d')))
                if attempt == 6:
                    # The user-supplied date has already failed once; give up
                    # rather than prompting for a date that would never be tried.
                    sys.exit('No valid dates. Either that or the title or the '
                             'channel is incorrect/invalid')
                date = input(
                    'Provide a valid date in the exact format (YYYYMMDD) : '
                ).replace(' ', '')
    return suffixLink
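# Quick sanity check of the probe-offset expression above (a minimal sketch,
# not part of the scraper):
#   >>> [-(-1)**attempt * (1 + (attempt > 2)) for attempt in range(1, 5)]
#   [1, -1, 2, -2]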
def getImages(title, forced=False):
    episode, archives = 0, {}
    # Each anchor in get_story.js points at one episode's photo-gallery page.
    for episode, link in enumerate(
            g.soup(f"{root}/{title}/js/get_story.js").find_all('a'), start=1):
        archives[episode] = f"{root}{link['href']}"
    if forced:
        # Guess the next (not yet listed) episode's URL by bumping the
        # zero-padded episode number in the last known URL.
        archives[episode + 1] = archives[episode].replace(
            str(episode).zfill(2), str(episode + 1).zfill(2))
    return {
        epNum: [{
            'url': link,
            'keyNotes': f"TV Osaka Gallery Episode {epNum} ",
            # 場面写真 = "scene stills"
            'description': "Official scene shots"
            if block.h1.text == '場面写真' else "Official offshoots",
            'images': [image['src'] for image in block.find_all('img')]
        } for block in g.soup(link).find_all(class_='photos')]
        for epNum, link in archives.items()
    } if archives else None
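# Example usage (a minimal sketch; the show slug 'example_show' is hypothetical):
#
#   galleries = getImages('example_show')
#   if galleries:
#       for epNum, blocks in galleries.items():
#           print(epNum, sum(len(b['images']) for b in blocks), 'images')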
def dataZoo(link):
    soup = g.soup(link)
    information = {
        'cast': [
            name.split('(')[0]
            for name in soup.find(class_='icn_cast').next_sibling.text.replace(
                '\n', '').split('\xa0')
        ],
        'url': link
    }
    try:
        # The 'oldprogram' block links to the previous broadcast of the same
        # show; its text reads like "2020年4月15日(21:00)".
        history = soup.find(class_='oldprogram').find_all('a', href=True)[0]
        prevDates = re.split('[(:)]', history.text)
        prevDate = d.datetime.strptime(
            f"{prevDates[0]} {prevDates[1].zfill(2)}:{prevDates[2].zfill(2)}",
            '%Y年%m月%d日 %H:%M')
        link = f"{rootZoo}{history['href']}"
    except (AttributeError, IndexError, ValueError):
        # No rebroadcast history on this page.
        link = prevDate = None
    return information, prevDate, link
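# Example: walk the rebroadcast chain backwards through TV Data Zoo (a minimal
# sketch; the starting URL is hypothetical):
#
#   link = f"{rootZoo}/tv/123456"
#   while link:
#       info, prevDate, link = dataZoo(link)
#       print(prevDate, info['cast'])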
def searchDataZoo(nativeTitle, airDate=None, exclusions='', totalResults=1):
    if not isinstance(totalResults, int):
        raise TypeError('totalResults only accepts integer values')
    if isinstance(airDate, d.datetime):
        if airDate >= d.datetime(2010, 4, 15):
            # The archive stores each day in four 6-hour blocks (1-4).
            queryURL = (f"{rootZoo}/d/{airDate.year}/{airDate.month}/"
                        f"{airDate.day}/{int(airDate.hour / 6) + 1}")
            soup = g.soup(queryURL)
            link = f"{rootZoo}{soup.find(string=nativeTitle).parent['href']}"
            return link, airDate
        else:
            print('Too far back in time, not within database')
    elif isinstance(airDate, str):
        # Fall back to the site's Google Custom Search endpoint.
        queryURL = 'https://cse.google.com/cse/element/v1'
        params = {
            'rsz': 'filtered_cse',
            'num': totalResults,
            'hl': 'ja',
            'source': 'gcsc',
            'gss': '.com',
            'cselibv': '26b8d00a7c7a0812',
            'cx': 'partner-pub-1955300006877540:6422076301',
            'q': f"{nativeTitle} {airDate} -{exclusions}",
            'safe': 'active',
            # Session token captured from a browser; it expires and may need
            # to be refreshed for the request to succeed.
            'cse_tok': 'AJvRUv29OfLq77RISJdH3hnYlRBU:1599371773557',
            'exp': 'csqr.cc',
            'oq': f"{nativeTitle} {airDate}",
            'gs_I': 'partner-generic.3...10019.11097.2.11305.0.0.0.0.0.0.0.0..0.0.csems,nrl=13...0.11073j100193469j5j2...'
                    '1.34.partner-generic..0.0.0.',
            'callback': 'google.search.cse.api16222'
        }
        headers = {'user-agent': 'Mozilla/5.0'}
        try:
            response = g.soup(queryURL, params=params, headers=headers, response=True)
            if response.status_code == 200:
                # Strip the JSONP callback wrapper before decoding, and keep
                # only programme pages.
                result = [
                    i for i in json.loads(response.content[35:-2])['results']
                    if '/tv/' in i['url']
                ]
                if len(result) == 0:
                    raise FileNotFoundError
                elif len(result) == 1:
                    result = result[0]
                else:
                    print('Which of the following is the right result?')
                    for i, r in enumerate(result, start=1):
                        print(f"{i} : {r['richSnippet']['metatags']['title']}")
                    answer = attempt = 0
                    while attempt < 3:
                        try:
                            answer = int(input('Result No.')) if attempt == 0 \
                                else int(input('Invalid answer. Result No.'))
                            # Validate against the results actually shown,
                            # not the requested totalResults.
                            if answer not in range(1, len(result) + 1):
                                raise ValueError
                            break
                        except ValueError:
                            attempt += 1
                            if attempt == 3:
                                print('Too many invalid attempts. Picking first choice')
                                answer = 1
                    result = result[answer - 1]
                title = result['richSnippet']['metatags']['title']
                # The page title ends in e.g. "2020/04/15(水)21:00 の放送内容";
                # japaneseDays swaps the weekday inside the delimiters for its
                # English abbreviation so that %a can parse it.
                title = japaneseDays(title[:title.find(' の放送内容')][-18:], ['(', ')'])
                return result['url'], d.datetime.strptime(title, '%Y/%m/%d%a%H:%M')
            else:
                raise FileNotFoundError
        except (FileNotFoundError, KeyError):
            print('No search results')
            return False, False
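# Example usage (a minimal sketch; title and dates are hypothetical, and the
# CSE branch depends on the hard-coded session token still being valid):
#
#   # Exact lookup when the full air datetime is known:
#   link, aired = searchDataZoo('番組タイトル', d.datetime(2020, 4, 15, 21))
#   # Fuzzy lookup via Google CSE when only a date string is known:
#   link, aired = searchDataZoo('番組タイトル', '2020/04/15', totalResults=3)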
def getEpisodes(nativeTitle, startDate, channel, omit=None, startEpisode=1):
    episodeSuffix = getStartLink(nativeTitle, startDate, channel)
    episodeList = {}
    seasonsList = {}
    episodeNumber = startEpisode
    episodeLinks = [episodeSuffix]
    omit = omit or []  # air dates to skip, as YYMMDD integers
    while True:
        soup = g.soup(f"{rootKakaku}{episodeSuffix}")
        # Gets airdate information: the time sits between the date and the
        # '〜' end-time separator, before the first ideographic space.
        date = soup.find(attrs={'name': 'keywords'})['content'].split(',')[-2]
        time = soup.find(
            id='epiinfo').text.split('\u3000')[0].split(date)[1].split('〜')[0]
        start = d.datetime.strptime(
            japaneseDays(date, delimiter=['(', ')']) + time, '%Y年%m月%d日%a %H:%M')
        if int(d.datetime.strftime(start, '%y%m%d')) not in omit:
            episodeList[episodeNumber] = {
                'start': start,
                'cast': [],
                'guests': [],
                'url': f"{rootKakaku}{episodeSuffix}"
            }
            # Gets cast and guests information
            castInfo = [
                info for info in str(soup.find(id='epiinfo')).split('<br/>')[1:]
                if '\xa0' in info
            ]
            if castInfo:
                for castDetails in castInfo:
                    castType, cast = castDetails.split('\xa0')
                    # 【出演】 = cast, 【声の出演】 = voice cast; any other
                    # tag is treated as guests.
                    if castType in ['【出演】', '【声の出演】']:
                        episodeList[episodeNumber]['cast'] += [
                            name.text.split('(')[0]
                            for name in bs(cast, 'lxml').find_all('a')
                        ]
                    else:
                        episodeList[episodeNumber]['guests'] += [
                            name.text.split('(')[0]
                            for name in bs(cast, 'lxml').find_all('a')
                        ]
            else:
                # No cast block on kakaku.com; fall back to TV Data Zoo.
                zooLink = searchDataZoo(nativeTitle, start)[0]
                episodeList[episodeNumber].update(dataZoo(zooLink)[0])
            # Track season boundaries by calendar year.
            if episodeList[episodeNumber]['start'].year not in seasonsList:
                seasonsList[episodeList[episodeNumber]['start'].year] = {
                    'start': episodeNumber,
                    'end': 0,
                    'startDate': episodeList[episodeNumber]['start']
                }
                try:
                    seasonsList[episodeList[episodeNumber]['start'].year -
                                1]['end'] = episodeNumber - 1
                except KeyError:
                    pass
            print(f'Episode {episodeNumber} @ {start}')
        else:
            episodeNumber -= 1
        nextButton = soup.find(
            src='https://img1.kakaku.k-img.com/images/tv/2008/epi_next.gif')
        nextEpisode = nextButton.parent['href'] if nextButton else ''
        if nextEpisode:
            episodeNumber += 1
            episodeSuffix = nextEpisode
            episodeLinks.append(episodeSuffix)
        else:
            print(f"Scraped episodes {len(episodeList)}\nNo more episodes")
            break
    seasonsList[list(seasonsList)[-1]]['end'] = episodeNumber
    return episodeList, seasonsList
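# Minimal end-to-end sketch (the title, channel key, and omit date are
# hypothetical; the channel must exist in `networkDictionary`):
if __name__ == '__main__':
    episodes, seasons = getEpisodes(
        '番組タイトル', d.datetime(2020, 4, 15), 'テレビ大阪', omit=[200513])
    print(f"{len(episodes)} episodes across {len(seasons)} seasons")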