Example #1
def handle_second_page(url, attrs):
    # Fetch the second-level page
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the "electronic edition via DOI" link
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # No DOI link found, so fall back to the "electronic edition @" link
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    # Resolve each matched text node to its enclosing <a> tag
    links = [tmp.find_parent('a') for tmp in raw_links]
    if not links:
        logger.info('No electronic edition link found on second-level page: ' + str(url))
    for raw_url in links:
        paper_dict = handle_third_page(raw_url.get('href'), attrs)
        tmp = raw_url.find_parent('li', class_='drop-down')
        if tmp is not None:
            temp = tmp.find_next_sibling('li', class_='drop-down')
            if temp is not None:
                raw_ris = temp.select_one(
                    'div[class="body"] > ul:nth-of-type(1) > li:nth-of-type(2) > a'
                )
                if raw_ris is not None:
                    download_paper_info(raw_ris.get('href'), root_dir,
                                        paper_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
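
handle_second_page hands the RIS export link to a download_paper_info helper that is not shown in these examples. The sketch below is only an assumption of what it might do (save the RIS file under root_dir, reusing the module-level logger the examples already rely on); the real project may handle paper_dict differently.

# Assumed helper -- illustrative only; the actual handling of the RIS export is project-specific.
import os
import re
import requests

def download_paper_info(ris_url, root_dir, paper_dict):
    # Download the RIS citation export and store it under root_dir,
    # using the paper title (when available) as the file name
    try:
        resp = requests.get(ris_url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        logger.info('Failed to download RIS file: ' + str(ris_url))
        return
    title = (paper_dict or {}).get('title', 'paper')
    file_name = re.sub(r'[^\w\- ]', '_', title)[:100] + '.ris'
    with open(os.path.join(root_dir, file_name), 'wb') as f:
        f.write(resp.content)
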
def handle_third_page(urls, attrs):
    for url in urls:
        soup = get_html_str(get_phantomjs_page(url))
        if soup is None:
            logger.info('Failed to fetch third-level page: ' + str(url))
            return None
        else:
            link = soup.find('link', attrs={'rel': 'canonical'})
            if link:
                link = link.get('href')
            else:
                logger.info('handle_third_page did not find the canonical redirect link: ' + str(url))
                return None
        soup = get_html_str(get_phantomjs_page(link))
        if soup is None:
            logger.info('Failed to fetch the canonical page: ' + str(link))
            return None
        # Collect the paper's descriptive information: title, authors, publication date, etc.
        data_dict = copy.deepcopy(attrs)  # deep-copy the attribute dict
        data_dict['url'] = link  # store the paper's actual URL
        h1 = soup.find('h1', class_='svTitle')
        if h1:
            data_dict['title'] = h1.get_text().strip()
        ul = soup.find('ul', class_='authorGroup noCollab svAuthor')
        if ul:
            a_list = ul.find_all_next('a', class_='authorName svAuthor')
            authors_dict = dict()
            for a in a_list:
                affiliation_dict = dict()
                affiliation_dict['affiliation'] = ''
                affiliation_dict['affiliation_name'] = ''
                affiliation_dict['affiliation_country'] = ''
                author_name = a.get_text().strip()
                author_name = re.sub(r'[\._$]', ' ', author_name)
                authors_dict[author_name] = affiliation_dict
            data_dict['author'] = authors_dict
        h2 = soup.find('h2', text=re.compile(r'Abstract'))
        if h2:
            p = h2.find_next_sibling('p')
            if p:
                data_dict['abstract'] = p.get_text()
        h2 = soup.find('h2', text=re.compile(r'Keywords'))
        if h2:
            ul = h2.find_next_sibling('ul')
            keywords_list = ul.find_all_next('li', class_='svKeywords') if ul else []
            keywords = list()
            for keyword in keywords_list:
                keywords.append(keyword.get_text().strip())
            data_dict['keywords'] = keywords
        h2 = soup.find('h2', text=re.compile(r'References'))
        if h2:
            li_list = h2.find_all_next('li', class_='title')
            references = list()
            for li in li_list:
                references.append(li.get_text().strip())
            data_dict['reference'] = references
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
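
The snippets above rely on several fetch helpers that are defined elsewhere in the project: get_html_text, get_html_str, get_phantomjs_page and get_random_uniform. A minimal sketch of what they are assumed to do, using requests, BeautifulSoup and an older Selenium release that still ships the PhantomJS driver, could look like this (the real implementations may differ):

# Assumed helpers -- a sketch only, not the project's actual code.
import random
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

def get_html_text(url):
    # Fetch a URL and return a parsed BeautifulSoup tree, or None on failure
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, 'html.parser')
    except requests.RequestException:
        return None

def get_phantomjs_page(url):
    # Render a JavaScript-heavy page with PhantomJS and return its HTML source
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

def get_html_str(html):
    # Parse an HTML string into a BeautifulSoup tree, or None if the string is empty
    return BeautifulSoup(html, 'html.parser') if html else None

def get_random_uniform(begin, end):
    # Random delay in seconds, used to throttle the crawler between requests
    return random.uniform(begin, end)
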
Example #3
def handle_first_page(url, attrs):
    # Fetch the first-level page
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Failed to fetch first-level page: ' + str(url))
        return None
    raw_links = page_content.find_all('a', text='[contents]')
    if raw_links:
        links = [raw_link.get('href') for raw_link in raw_links]   # conference proceedings
    else:
        raw_links = page_content.find_all('a', text=re.compile(r'Volume'))   # journal volumes
        links = [raw_link.get('href') for raw_link in raw_links]
    for url in links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
def handle_first_page(url, attrs):
    # Fetch the first-level page
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Failed to fetch first-level page: ' + str(url))
        return None
    raw_links = list()
    a_list = page_content.select(
        'a[href^="http://dblp.uni-trier.de/db/journals/"]')
    for a in a_list:
        temp = a.get('href')
        # Skip the bare journals index URL; keep only links to individual journals
        if temp != 'http://dblp.uni-trier.de/db/journals/':
            raw_links.append(temp)
    for url in raw_links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
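
None of the examples show how handle_first_page is invoked. A hypothetical call, assuming attrs carries the venue-level metadata that handle_third_page later deep-copies into each paper record (the field names here are illustrative, not taken from the project), might look like:

# Hypothetical entry point -- attrs contents are an assumption.
if __name__ == '__main__':
    attrs = {
        'category': 'journal',
        'spider_time': time.strftime('%Y.%m.%d %H:%M:%S', time.localtime()),
    }
    handle_first_page('http://dblp.uni-trier.de/db/journals/', attrs)
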
Example #5
def run_ieee_update():
    while True:
        hour = int(time.strftime('%H'))
        if START_HOUR <= hour <= END_HOUR:
            init_dir(log_dir)
            init_dir(root_dir)
            try:
                logger.warning('update_ieee started normally!')
                update_ieee(ieee_updates_url)
            except Exception:
                logger.exception('update_ieee stopped on an exception!')
            else:
                sleep_time = get_random_uniform(begin=30 * 60, end=1 * 60 * 60)
                logger.warning('update_ieee finished normally! Sleeping for about {:.2f} minutes...'.format(
                    sleep_time / 60))
                time.sleep(sleep_time)  # this update round is done; sleep for 0.5h to 1.0h
        else:  # outside the working hours, sleep for one hour
            time.sleep(1 * 60 * 60)
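
run_ieee_update depends on module-level configuration (START_HOUR, END_HOUR, log_dir, root_dir, ieee_updates_url) and an init_dir helper that are defined elsewhere. The values below are assumptions used only to make the example self-contained:

# Assumed module-level configuration -- illustrative values, not the project's real settings.
import os

START_HOUR = 9          # first hour of the daily crawl window
END_HOUR = 18           # last hour of the daily crawl window
log_dir = os.path.join(os.getcwd(), 'logs')
root_dir = os.path.join(os.getcwd(), 'data')
ieee_updates_url = 'http://ieeexplore.ieee.org/'  # placeholder; the real update URL is project-specific

def init_dir(path):
    # Create the directory if it does not exist yet
    os.makedirs(path, exist_ok=True)
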
Example #6
def handle_second_page(urls):
    links = list()
    for url in urls:
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('Failed to fetch second-level page: ' + str(url))
            return None
        ul = page_content.find('ul', class_='results')
        if ul is not None:
            divs = ul.find_all_next('div', class_='txt')
            for div in divs:
                temp = div.find_next('a', class_='art-abs-url')
                if temp is not None:
                    links.append('http://ieeexplore.ieee.org' +
                                 temp.get('href'))
        # Find the pagination block, get the total page count, and request each paginated page
        pagination = page_content.find('div', class_='pagination')
        if pagination is not None:
            a_list = pagination.select('a[aria-label^="Pagination Page"]')
            if a_list:
                pageNumber = a_list[-1].get_text().strip()
                if pageNumber:
                    pageNumber = int(pageNumber)
                    url_list = list()
                    for number in range(2, pageNumber + 1):
                        url_list.append(url + '&pageNumber=' + str(number))
                    for tmp_url in url_list:
                        page_content = get_html_str(
                            get_phantomjs_page(tmp_url))
                        if page_content is None:
                            logger.info('Failed to fetch second-level page: ' + str(tmp_url))
                            return None
                        ul = page_content.find('ul', class_='results')
                        if ul is not None:
                            divs = ul.find_all_next('div', class_='txt')
                            for div in divs:
                                temp = div.find_next('a', class_='art-abs-url')
                                if temp is not None:
                                    links.append('http://ieeexplore.ieee.org' +
                                                 temp.get('href'))
        else:
            logger.info('No pagination block found while handling second-level page: ' + str(url))
        time.sleep(get_random_uniform(begin=5.0, end=10.0))
    handle_third_page(links)  # further process all the third-level page links collected above
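
The result-list scraping in the function above appears twice: once for the first result page and once for every paginated page. A small helper, sketched here as a possible refactor rather than the original code, would remove that duplication:

# Possible refactor -- not part of the original example.
def collect_result_links(page_content, links):
    # Append every article-abstract link found in the search-result list to links
    ul = page_content.find('ul', class_='results')
    if ul is None:
        return
    for div in ul.find_all_next('div', class_='txt'):
        temp = div.find_next('a', class_='art-abs-url')
        if temp is not None:
            links.append('http://ieeexplore.ieee.org' + temp.get('href'))
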
Example #7
def handle_third_page(urls):
    for url in urls:
        data_dict = dict()
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('Failed to fetch the paper content page: ' + url)
            continue
        # Paper URL
        data_dict['url'] = url
        # Paper category
        data_dict['category'] = 'conference'
        # Date on which IEEE updated the paper listing
        data_dict['update_time'] = time.strftime('%Y%m%d', time.localtime())
        # Time at which the paper was crawled
        data_dict['spider_time'] = time.strftime('%Y.%m.%d %H:%M:%S',
                                                 time.localtime())
        # Collect the paper title
        if page_content.title is not None:
            # Drop the fixed-length site suffix from the <title> text
            data_dict['title'] = page_content.title.string[:-22].strip()
        # Collect the paper abstract
        abstract = page_content.find('div', class_='abstract-text ng-binding')
        if abstract is not None:
            data_dict['abstract'] = abstract.get_text()
        # Collect the publication date
        date = page_content.find('strong', text='Date of Publication:')
        if date is not None:
            div = date.find_parent('div')
            if div is not None:
                date = re.split(r':', div.get_text())[-1].strip()
                data_dict['publication_date'] = date
        # Collect the paper keywords
        ul = page_content.find('ul', class_='doc-all-keywords-list')
        if ul is not None:
            spans = ul.find_all_next('span')
            keywords = list()
            for span in spans:
                temp = span.find_next('a', class_='ng-binding')
                if temp is not None:
                    keywords.append(temp.get_text().strip())
            data_dict['keywords'] = keywords
        # Collect the author information
        h2 = page_content.find('h2', text='Authors')
        if h2 is not None:
            div = h2.find_next_sibling('div', class_='ng-scope')
            if div is not None:
                temp = div.select(
                    'a[href^="/search/searchresult.jsp?searchWithin="]')
                if temp:
                    authors_dict = dict()  # map each author name to an affiliation dict
                    for a in temp:
                        affiliation_dict = dict()
                        span = a.find_next('span', class_='ng-binding')
                        if span is not None:
                            author_name = span.get_text().strip()
                            author_name = re.sub(r'[._$]', ' ', author_name)
                            tmp = a.parent.find_next_sibling(
                                'div', class_='ng-binding')
                            if tmp is not None:
                                affiliation = tmp.get_text().strip()
                                data_list = re.split(r',', affiliation)
                                affiliation_dict['affiliation'] = affiliation
                                if data_list:
                                    affiliation_dict[
                                        'affiliation_country'] = data_list[-1].strip()
                            authors_dict[author_name] = affiliation_dict
                    data_dict['author'] = authors_dict
        # Collect the reference list
        page_content = get_html_str(
            get_phantomjs_page(url + 'references?ctx=references'))
        if page_content is not None:
            h2 = page_content.find('h2', text='References')
            if h2 is not None:
                divs = h2.find_next_siblings(
                    'div', class_='reference-container ng-scope')
                references = list()
                for div in divs:
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        references.append(div_temp.get_text().strip())
                data_dict['references'] = references
        # Collect the citation information
        page_content = get_html_str(
            get_phantomjs_page(
                url +
                'citations?anchor=anchor-paper-citations-ieee&ctx=citations'))
        if page_content is not None:
            # Cited in Papers - IEEE
            h2 = page_content.find('h2',
                                   text=re.compile(r'Cited in Papers - IEEE'))
            citations = list()
            if h2 is not None:
                divs = h2.find_next_siblings('div', class_='ng-scope')
                for div in divs:
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            # Cited in Papers - Other Publishers
            h2 = page_content.find(
                'h2', text=re.compile(r'Cited in Papers - Other Publishers'))
            if h2 is not None:
                divs = h2.find_next_siblings('div', class_='ng-scope')
                for div in divs:
                    div_temp = div.find_next('div',
                                             class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            data_dict['citations'] = citations
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=1.0, end=20.0))
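
write_to_database is not shown in any of the examples. A minimal sketch, assuming the crawled records go into a MongoDB collection via pymongo (database and collection names are hypothetical), could look like this:

# Assumed persistence helper -- connection settings and names are illustrative only.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['paper_db']['papers']

def write_to_database(data_dict):
    # Insert one crawled paper record; failures are logged but do not stop the crawl
    try:
        collection.insert_one(data_dict)
    except Exception:
        logger.exception('Failed to write record to database: ' + str(data_dict.get('url')))
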