Ejemplo n.º 1
0
def _parse_publication_detail(detail):
    """Parse a 'Published <date> by <publisher>' string into its fields.

    Args:
        detail: Text expected to look like ``Published May 1st 2005 by Acme``;
            any amount of whitespace (including newlines) may separate words.

    Returns:
        A dict holding only the fields that could be extracted. It always
        contains ``publisher`` (``'Unknown'`` when there is no ``by`` word)
        and may contain ``publication_year``, ``publication_month`` (int)
        and ``publication_day`` (digits as a string).

    Raises:
        ValueError: If the month word is not a full month name.
    """
    parsed = {}
    _, marker, remainder = detail.partition('Published')
    # Tokenise on arbitrary whitespace so newlines and repeated spaces in the
    # scraped text do not change the number of date parts.
    tokens = (remainder if marker else detail).split()

    # Only a standalone "by" word separates date from publisher; matching the
    # bare substring would cut names such as "Ruby Books" in the middle.
    if 'by' in tokens:
        by_at = tokens.index('by')
        date_tokens = tokens[:by_at]
        parsed['publisher'] = ' '.join(tokens[by_at + 1:])
    else:
        date_tokens = tokens
        parsed['publisher'] = 'Unknown'

    if not marker:
        # Without the "Published" marker there is no date to read.
        return parsed

    if len(date_tokens) == 3:
        month_name, day_token, year = date_tokens
        parsed['publication_year'] = year
        # Keep only the digits so "1st" and a bare "1" both give "1".
        parsed['publication_day'] = ''.join(
            ch for ch in day_token if ch.isdigit())
        parsed['publication_month'] = datetime.strptime(month_name, '%B').month
    elif len(date_tokens) == 2:
        month_name, year = date_tokens
        parsed['publication_year'] = year
        parsed['publication_month'] = datetime.strptime(month_name, '%B').month
    elif len(date_tokens) == 1:
        parsed['publication_year'] = date_tokens[0]
    return parsed


def parse_html_page(link, publication_year, publication_month, publication_day,
                    publisher, num_pages, image_url):
    """Fetch a page and collect book details, falling back to the arguments.

    Every argument after ``link`` is the value to report when the page does
    not provide that piece of information.

    Args:
        link: Passed to ``goodreads.get_html_page`` to obtain the parsed page.
        publication_year: Fallback publication year.
        publication_month: Fallback publication month.
        publication_day: Fallback publication day.
        publisher: Fallback publisher name.
        num_pages: Fallback page count.
        image_url: Fallback cover image URL.

    Returns:
        A dict with the keys ``publication_year``, ``publication_month``,
        ``publication_day``, ``publisher``, ``num_pages`` and ``image_url``.

    Raises:
        ValueError: If the page's publication month is not a full month name.
    """
    soup = goodreads.get_html_page(link)
    book_info = dict(publication_year=publication_year,
                     publication_month=publication_month,
                     publication_day=publication_day,
                     publisher=publisher,
                     num_pages=num_pages,
                     image_url=image_url)

    info_tag = soup.find(itemprop='numberOfPages')
    if info_tag is not None:
        # The first child is expected to be text such as "320 pages".
        first_child = info_tag.contents[0] if info_tag.contents else None
        if isinstance(first_child, str):
            words = first_child.split()
            if words:
                book_info['num_pages'] = words[0]

        # NOTE(review): assumes the publication row is the second sibling
        # after the page-count row's parent -- confirm against the live markup.
        try:
            detail = info_tag.parent.next_sibling.next_sibling.contents[0]
        except (AttributeError, IndexError):
            # Layout differs from what is expected; keep the fallbacks.
            detail = None
        if isinstance(detail, str):
            book_info.update(_parse_publication_detail(detail))

    image_tag = soup.find(id='coverImage')
    if image_tag is not None:
        book_info['image_url'] = image_tag.get('src')
    return book_info
Ejemplo n.º 2
0
def parse_html_page(link, born_at):
    """Fetch a page and return the birth date found on it as ``YYYY/MM/DD``.

    Args:
        link: Passed to ``goodreads.get_html_page`` to obtain the parsed page.
        born_at: Value returned unchanged when the page has no usable
            ``birthDate`` element.

    Returns:
        The birth date formatted as ``'%Y/%m/%d'``, or ``born_at`` when the
        page does not provide one.

    Raises:
        ValueError: If the birth date text is not in "<Month> <day> <year>"
            form.
    """
    soup = goodreads.get_html_page(link)
    birth_date = soup.find(itemprop='birthDate')
    # ``.string`` is None when the tag does not hold exactly one text node.
    if birth_date is not None and birth_date.string:
        # Collapse all whitespace and drop commas so text such as
        # "\n  July 31, 1965\n" matches the '%B %d %Y' format below.
        # (Previously ``.strip()`` was called on the list returned by
        # ``split``, which always raised AttributeError.)
        bd = ' '.join(birth_date.string.replace(',', ' ').split())
        born_at = datetime.strptime(bd, '%B %d %Y').strftime('%Y/%m/%d')
        print ('===========from Goodreads page', born_at)
    return born_at