Code Example #1
File: acm.py  Project: mayuanucas/paper_spider
def handle_third_page(url, attrs):
    soup = get_html_text(url)
    if soup is None:
        logger.info('soup is None:' + str(url))
        return None
    data_dict = copy.deepcopy(attrs)  # deep-copy the attrs dict
    # Extract the paper's metadata: title, authors, publication date, keywords, etc.
    paper_id = re.split(r'\.', url)[-1].strip()
    bib_url = 'http://dl.acm.org/exportformats.cfm?id=' + paper_id + '&expformat=bibtex'
    page = get_html_text(bib_url)
    if page:
        temp = page.find('pre')
        if temp:
            content = temp.get_text()
            filepath = root_dir + paper_id
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
                f.flush()
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    if 'keywords' in line:
                        temp = re.split(r'[{}]', line)[-2]
                        data_dict['keywords'] = re.split(r',', temp)
    # Extract the download URL of the paper's PDF
    pdf_url = soup.find('a', attrs={'name': 'FullTextPDF'})
    if pdf_url is not None:
        pdf_url = 'http://dl.acm.org/' + pdf_url.get('href').strip()
        data_dict['pdf_url'] = pdf_url
    authors = soup.find_all('a', attrs={'title': 'Author Profile Page'})
    if authors:  # find_all() returns a list; it is empty when no author links are found
        authors_dict = {}
        for tmp in authors:
            temp = tmp.find_next('a',
                                 attrs={'title': 'Institutional Profile Page'})
            if temp is not None:
                institute = temp.find_next('small')
                if institute is not None:
                    affiliation_dict = dict()
                    # Keys prefixed with '.', '_' or '$' are reserved in MongoDB, so strip those characters
                    author_name = re.sub(r'[\._$]', ' ',
                                         tmp.get_text().strip())
                    institute = institute.get_text().strip()
                    data_list = re.split(r',', institute)
                    affiliation_dict['affiliation'] = institute
                    affiliation_dict['affiliation_name'] = data_list[0]
                    if len(data_list) != 1:
                        affiliation_dict['affiliation_country'] = data_list[-1]
                    authors_dict[author_name] = affiliation_dict
        data_dict['author'] = authors_dict
        return data_dict  # return the data dict (category, rank, author info)
    else:
        logger.info('No paper description found on the third-level page: ' + str(url))
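
The paper_spider snippets on this page all call a shared get_html_text helper that is not shown here. Judging from how its return value is used (.find, .find_all, .select), it presumably fetches a URL and returns a parsed BeautifulSoup object, or None on failure. The following is only a minimal sketch under that assumption; the headers, timeout and error handling are guesses, not the project's actual code.

import logging

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def get_html_text(url):
    # Hypothetical helper: fetch the page and return it as a BeautifulSoup object, or None on failure.
    try:
        response = requests.get(url, timeout=30,
                                headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as exc:
        logger.error('failed to fetch %s: %s', url, exc)
        return None
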
Code Example #2
def handle_second_page(url, attrs):
    # Fetch the second-level page
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the DOI link
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # If no DOI link is found, fall back to the "electronic edition @" link
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    # map() never returns None, so check for an empty result instead
    links = [tmp.find_parent('a') for tmp in raw_links]
    if not links:
        logger.info('No electronic edition link found on the second-level page: ' + str(url))
    for raw_url in links:
        paper_dict = handle_third_page(raw_url.get('href'), attrs)
        tmp = raw_url.find_parent('li', class_='drop-down')
        if tmp is not None:
            temp = tmp.find_next_sibling('li', class_='drop-down')
            if temp is not None:
                raw_ris = temp.select_one(
                    'div[class="body"] > ul:nth-of-type(1) > li:nth-of-type(2) > a'
                )
                if raw_ris is not None:
                    download_paper_info(raw_ris.get('href'), root_dir,
                                        paper_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
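
The crawl delay in the loop above comes from a get_random_uniform helper that is also not shown. Given its begin/end keyword arguments and its use with time.sleep, it is presumably a thin wrapper around random.uniform; a sketch under that assumption:

import random


def get_random_uniform(begin=2.0, end=60.0):
    # Hypothetical helper: random delay (in seconds) used to space out requests.
    return random.uniform(begin, end)
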
Code Example #3
def download_paper_info(url, root_dir, attrs):
    filename = re.split(r'/', url)[-1]
    page_content = get_html_text(url)
    if page_content is None:
        logger.error('download_paper_info failed: ' + str(url))
        return None
    data = page_content.get_text()
    if data:
        # Save the data to a local file so it can be re-read and written to the database
        filepath = root_dir + filename
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(data)
            f.flush()
        write_to_database(filepath, attrs)
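
write_to_database is referenced here but not included in these snippets. Given the MongoDB comment in Code Example #1 about reserved key prefixes, it presumably inserts the downloaded record together with attrs into a MongoDB collection. The sketch below is an assumption for illustration only; the database name paper_spider, the collection name papers, and the choice to store the raw file contents are all invented.

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['paper_spider']['papers']  # hypothetical database/collection names


def write_to_database(filepath, attrs):
    # Hypothetical helper: merge the downloaded record with attrs and store it in MongoDB.
    record = dict(attrs)
    with open(filepath, 'r', encoding='utf-8') as f:
        record['raw_record'] = f.read()  # keep the raw RIS/BibTeX text alongside the metadata
    collection.insert_one(record)
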
Code Example #4
def handle_second_page(url, attrs):
    # Fetch the second-level page
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the DOI link
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # If no DOI link is found, fall back to the "electronic edition @" link
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    # map() never returns None, so check for an empty result instead
    links = [tmp.find_parent('a').get('href') for tmp in raw_links]
    if not links:
        logger.info('No electronic edition link found on the second-level page: ' + str(url))
    handle_third_page(links, attrs)
Code Example #5
def handle_first_page(url, attrs):
    # Fetch the first-level page
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Failed to fetch the first-level page: ' + str(url))
        return None
    raw_links = page_content.find_all('a', text='[contents]')
    if raw_links:
        links = map(lambda raw_link: raw_link.get('href'), raw_links)   # conference proceedings
    else:
        raw_links = page_content.find_all('a', text=re.compile(r'Volume'))   # journal volumes
        links = map(lambda raw_link: raw_link.get('href'), raw_links)
    for url in links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
Code Example #6
def handle_first_page(url, attrs):
    # Fetch the first-level page
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Failed to fetch the first-level page: ' + str(url))
        return None
    raw_links = list()
    li_list = page_content.select(
        'a[href^="http://dblp.uni-trier.de/db/journals/"]')
    for li in li_list:
        temp = li.get('href')
        if 'http://dblp.uni-trier.de/db/journals/' != temp:
            raw_links.append(temp)
    for url in raw_links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
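
Code Examples #5 and #6 are two variants of the top-level entry point (conference listings vs. journal listings on DBLP), and neither shows how it is invoked. Judging by the returned dict described as "(category, rank, author info)", the attrs argument presumably carries venue-level metadata that is copied into every paper record. A hypothetical driver call, with made-up attribute values and an example DBLP listing URL:

# Hypothetical usage: crawl one DBLP venue listing and tag every paper with venue metadata.
attrs = {'category': 'Security', 'rank': 'A'}        # assumed contents of attrs
venue_url = 'http://dblp.uni-trier.de/db/conf/uss/'  # example DBLP conference listing
handle_first_page(venue_url, attrs)
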
Code Example #7
File: usenix.py  Project: mayuanucas/paper_spider
def handle_third_page(url, attrs):
    soup = get_html_text(url)
    if soup is None:
        logger.info('soup is None:' + str(url))
        return None
    # Extract the paper's metadata: title, authors, publication date, etc.
    data_dict = copy.deepcopy(attrs)  # deep-copy the attrs dict
    tmp = soup.find('div', class_='field-label')
    if tmp is not None:
        tmp = soup.find('div', class_='field-item odd').find_next('p')
        tmp_list = list()
        if tmp is not None:
            i = 0
            for child in tmp.children:
                i += 1
                if (i % 2) != 0:
                    tmp_list.append(child)  # author names
                else:
                    child = child.get_text().strip()
                    tmp_list.append(child.strip(';'))  # institution name
            authors_dict = dict()
            for n in range(0, len(tmp_list), 2):
                affiliation_dict = dict()
                affiliation_dict['affiliation_name'] = tmp_list[n + 1]
                author_list = re.split(r'(?:and|,)\s*', tmp_list[n])[:-1]
                for author in author_list:
                    if (author != '') and (author != ','):
                        author = re.sub(r'[\.$_]', ' ', author.strip())
                        authors_dict[author] = affiliation_dict
            data_dict['author'] = authors_dict
    div = soup.find('div', text=re.compile(r'Abstract'))
    if div:
        div = div.find_next_sibling('div', class_='field-items')
        if div:
            data_dict['abstract'] = div.get_text()
    return data_dict  # return the data dict (category, rank, author info)
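
The author/affiliation parsing above hinges on re.split(r'(?:and|,)\s*', ...)[:-1], which splits the author text on commas and the word "and" and then drops the trailing fragment. A quick illustration with a hypothetical author string of the form these pages use:

import re

authors_text = 'Alice Liu, Bob Chen, and Carol Wang, '  # hypothetical author node text
parts = re.split(r'(?:and|,)\s*', authors_text)[:-1]
# parts == ['Alice Liu', 'Bob Chen', '', 'Carol Wang']
# The empty string produced by ", and" is filtered out by the author != '' check above.
print([p for p in parts if p not in ('', ',')])
# ['Alice Liu', 'Bob Chen', 'Carol Wang']
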
Code Example #8
import os

import requests
from bs4 import BeautifulSoup

import common  # project helper module providing get_html_text


def get_image_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup('img', class_='card-img-top round-0')
    image_links = []
    for img in imgs:
        image_link = img.get('source')
        image_links.append(image_link)
    return image_links


def save_image(image_link):
    content = requests.get(image_link, timeout=10).content
    root = 'g://img//timeroute//'
    if not os.path.exists(root):
        os.makedirs(root)
    path = root + image_link.split('/')[-1]
    if not os.path.exists(path):
        with open(path, 'wb') as f:
            f.write(content)  # the with block closes the file automatically
        print(path + ' saved successfully.')
    else:
        print(path + ' already exists!')

for i in range(0, 805):
    url = 'http://timeroute.cn/desktop/page/%d' % i
    print(url)
    html = common.get_html_text(url)
    image_links = get_image_links(html)
    for image_link in image_links:
        save_image(image_link)
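
Note that common.get_html_text in this script returns the raw HTML string (it is passed straight to BeautifulSoup inside get_image_links), unlike the paper_spider helper sketched earlier, which returns an already-parsed soup. A minimal sketch under that assumption:

import requests


def get_html_text(url):
    # Hypothetical helper from the 'common' module: return the page's HTML text, or None on failure.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return None
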
Code Example #9
File: wx.py  Project: czhwu/PythonSpider
            with open(path, 'wb') as f:
                # write all chunks; the with block closes the file afterwards
                for content in contents:
                    f.write(bytearray(content, 'utf8'))
            print(path + ' saved successfully!')
        else:
            print(title + '.txt already exists.')
    else:
        path = root + 'black.txt'
        print(path)
        if not os.path.exists(root):
            os.makedirs(root)
        if not os.path.exists(path):
            with open(path, 'wb') as f:
                f.write(bytearray('', 'utf8'))


html = common.get_html_text('http://cl.b8y.xyz/thread0806.php?fid=21')
article_links = get_article_links(html)
for article_link in article_links:
    print(article_link)
    html_sub = common.get_html_text(article_link)
    # (title, contents) = get_title_content(html_sub)
    # print(title)
    # for content in contents:
    #     print(content)
    #     save_article(content)
    save_article(html_sub)