Example #1
0
def start_get_paper_thread(id, author_id):
    """Queue a background fetch of paper *id* unless it is already stored.

    Checks the database inside a transaction first; when the paper row
    exists the call is a no-op, otherwise `get_paper` is handed to the
    shared executor to run asynchronously.
    """
    with db_session:
        if Paper.get(paper_id=id):
            logger.debug('paper has existed, paper_id=%s', id)
            return
    executor.submit(get_paper, id, author_id)
Example #2
0
def upsert_paper(info, author_id):
    """Insert the paper described by *info* and link it to *author_id*.

    Runs two transactions: a best-effort pass that makes sure the
    publisher row exists, then the main pass that creates the Paper,
    links the publisher, and attaches the author (fetching author
    metadata from the API when it is not in the database yet).
    Returns None; does nothing when the paper already exists.
    """
    # Best-effort publisher upsert in its own transaction so a publisher
    # failure cannot abort the paper insert below.
    try:
        with db_session:
            publisher_id = info['publisher_id']
            if publisher_id:
                publisher = Publisher.get(publisher_id=publisher_id)
                if publisher is None:
                    publisher = Publisher(publisher_id=publisher_id)
                    publisher.name = info['publishername']
    except Exception:
        # Was a silent bare `except: pass`; keep the best-effort
        # semantics but record the failure for debugging.
        logger.exception('failed to upsert publisher for paper_id=%s',
                         info.get('paper_id'))
    with db_session:
        p = Paper.get(paper_id=info['paper_id'])
        if p:
            logger.debug('paper has existed, paper_id=%s', info['paper_id'])
            return
        paper = Paper(paper_id=info['paper_id'])
        paper.title = info['title']
        paper.abstract = info['abstract']
        paper.cite_num = info['cite_num']
        paper.cited_num = info['cited_num']

        # Link the publisher created (or found) in the first pass, if any.
        publisher_id = info['publisher_id']
        if publisher_id:
            publisher = Publisher.get(publisher_id=publisher_id)
            if publisher:
                paper.publisher = publisher

        if author_id is None:
            return
        a = Author.get(author_id=author_id)
        if a:
            paper.authors.add(a)
        else:
            # Author not cached locally yet: fetch metadata from the API.
            a_info = api.get_author(author_id)
            author = Author(author_id=a_info['author_id'])
            author.name = a_info['name']
            author.image_url = a_info['image_url']
            author.organization = a_info['organization']
            author.home_page = a_info['home_page']
            author.paper_count = a_info['paper_count']
            # NOTE(review): 'citied_count' looks like a typo for
            # 'cited_count' in the model -- confirm before renaming.
            author.citied_count = a_info['cited_count']
            paper.authors.add(author)
Example #3
0
def add(search_query, author, title):
    """Search ADS for one paper, save it to the DB and download its PDF.

    Appends optional ``author:``/``title:`` clauses to *search_query*,
    lets the user pick among multiple hits, refuses duplicates (by DOI),
    stores metadata plus bibtex via peewee, then streams the arXiv PDF
    into ``library/<paper.id>.pdf``.

    Exits the process (non-zero) when nothing is found or the paper is
    already in the database.
    """
    fl = [
        'id', 'author', 'first_author', 'bibcode', 'id', 'year', 'title',
        'abstract', 'doi', 'pubdate', "pub", "keyword", "doctype",
        "identifier", "links_data"
    ]
    if author:
        search_query += "author:" + author
    if title:
        search_query += "title:" + title
    papers = list(ads.SearchQuery(q=search_query, fl=fl))
    if len(papers) == 0:
        # No hits: fail fast. (Removed a dead `selection = ads.search.Article`
        # class assignment; exit with 1 for consistency with the
        # duplicate-paper path below.)
        exit(1)
    elif len(papers) == 1:
        selection = papers[0]  # type:ads.search.Article
    else:
        # Show at most ten candidates and let the user choose one.
        first_ten = papers[:10]
        single_paper: ads.search.Article
        for index, single_paper in enumerate(first_ten):
            print(index, single_paper.title[0], single_paper.first_author)
        selected_index = click.prompt('select paper', type=int)
        selection = papers[selected_index]  # type:ads.search.Article

    assert len(selection.doi) == 1
    doi = selection.doi[0]

    # EAFP duplicate check: Paper.get raises DoesNotExist when absent.
    try:
        paper = Paper.get(Paper.doi == doi)
        print("this paper has already been added")
        exit(1)
    except peewee.DoesNotExist:
        pass

    print("fetching bibcode")
    q = ads.ExportQuery([selection.bibcode])
    bibtex = q.execute()

    print("saving in db")

    paper = Paper()
    assert len(selection.title) == 1
    paper.doi = doi
    paper.title = selection.title[0]
    paper.abstract = selection.abstract
    paper.bibcode = selection.bibcode
    paper.year = selection.year
    paper.pubdate = selection.pubdate
    paper.pdf_downloaded = False
    paper.first_author = Author.get_or_create(name=selection.first_author)[0]
    paper.publication = Publication.get_or_create(name=selection.pub)[0]
    paper.doctype = Doctype.get_or_create(name=selection.doctype)[0]
    # The arXiv id is embedded in identifiers like 'arXiv:1234.5678'.
    paper.arxiv_identifier = [
        ident for ident in selection.identifier if "arXiv:" in ident
    ][0].split("arXiv:")[-1]
    paper.bibtex = bibtex
    links = [json.loads(string) for string in selection.links_data]
    print(links)
    paper.save()
    authors = [Author.get_or_create(name=name)[0] for name in selection.author]
    for author in db.batch_commit(authors, 100):
        PaperAuthors.create(author=author, paper=paper)
    keywords = [
        Keyword.get_or_create(keyword=keyword)[0]
        for keyword in selection.keyword
    ]
    for keyword in db.batch_commit(keywords, 100):
        PaperKeywords.create(keyword=keyword, paper=paper)
    print("fetching PDF")
    arxiv_url = "https://arxiv.org/pdf/{id}".format(id=paper.arxiv_identifier)
    r = requests.get(arxiv_url, stream=True)
    print(arxiv_url)
    # Name the file after the DB primary key (the original format string
    # had lost its placeholder and always wrote the same path).
    with open('library/{filename}.pdf'.format(filename=paper.id), 'wb') as f:
        chunk_size = 1024  # bytes
        file_size = int(r.headers.get('content-length', 0))
        # True ceiling division (was math.ceil(a // b), which floors first),
        # and stream in the same chunk size the progress bar was sized for
        # (was chunk_size=20, making the bar length wrong by ~50x).
        progress_length = math.ceil(file_size / chunk_size)
        with click.progressbar(r.iter_content(chunk_size=chunk_size),
                               length=progress_length) as progress_chunks:
            for chunk in progress_chunks:
                f.write(chunk)
    paper.pdf_downloaded = True
    paper.save()
Example #4
0
def crawl_category(term='cs.LG'):
    """Crawl arXiv listings for *term* and store paper metadata in the DB.

    Walks the [start_index, end_index) range in batches of 500, re-tags
    already-stored papers with *term*, and enriches new papers with
    Semantic Scholar metadata before inserting them.
    """
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    # NOTE(review): start_index / end_index come from enclosing scope
    # (module globals, presumably) -- confirm they are set before calling.
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            # arXiv ids look like '1234.5678v2'; drop the version suffix.
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Already stored: just make sure this category is recorded.
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper not exists in semantic scholar, arvixID : %s" %
                    arvixID)
                continue
            # -1 marks authors Semantic Scholar has no id for.
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Best-effort parse of "N pages, M figures, K tables"
                    # style comment strings.
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    # Narrowed from a bare `except:` so Ctrl-C still works.
                    logging.debug("Error in parsing meta data")
                paper.save()
            except Exception as e:
                # Was `except BaseException`, which also swallowed
                # KeyboardInterrupt/SystemExit and made the crawl unstoppable.
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)