def download_extract(paper, extract_figure=False, extract_table=False):
    """Download a paper's PDF and back-fill its page/table counts in the DB.

    Fetches the PDF via the arXiv API, optionally counts tables (tabula
    ``wrapper.read_pdf``) and pages/figures (``get_figure_count``), writes any
    newly-discovered counts back to the ``Paper`` row, and always deletes the
    downloaded file afterwards.

    Args:
        paper: a ``Paper`` model row; ``pages``/``table`` use -1 as "unknown".
        extract_figure: when True, run figure/page extraction on the PDF.
        extract_table: when True, run table extraction on the PDF.

    Returns:
        bool: True if the DB row was updated, False otherwise (including when
        both counts were already known or the arXiv API returned no pdf_url).
    """
    # Nothing to do when both counts are already known.
    if paper.pages >= 0 and paper.table >= 0:
        return False

    api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    if 'pdf_url' not in api_paper:
        return False
    pdf_url = api_paper['pdf_url']

    file_path = os.path.join(store_path, paper.paperId + '.pdf')
    urllib.request.urlretrieve(pdf_url, file_path)

    # Default to the currently stored values so the code below never hits a
    # NameError when one of the extract_* flags is off (the original only
    # assigned these inside the conditional branches).
    table_count = paper.table
    page_count = paper.pages

    if extract_table:
        df = wrapper.read_pdf(file_path, multiple_tables=True, pages='all')
        table_count = len(df)
        del df  # free the parsed tables eagerly; only the count is needed
    if extract_figure:
        figure_count, page_count = get_figure_count(file_path)

    modified = False
    if paper.pages == -1:
        modified = True
        paper.pages = page_count
    if paper.table == -1:
        modified = True
        paper.table = table_count

    # The PDF is only needed transiently for extraction.
    if os.path.exists(file_path):
        os.remove(file_path)

    if modified:
        Paper.update(table=table_count, pages=page_count).where(
            Paper.arvixID == paper.arvixID).execute()
    # NOTE(review): the original had an affiliation-extraction section
    # (extract_text + email-domain regex + Paper.update(affiliation=...))
    # after this return; it was unreachable dead code and has been removed.
    return modified
def crawl_category(term='cs.LG'):
    """Crawl arXiv listings for *term* and upsert papers into the database.

    Walks the module-level ``[start_index, end_index)`` window in batches of
    500 via ``arxivpy.query``. For papers already stored, the category list is
    merged; new papers are enriched with Semantic Scholar metadata and created
    as ``Paper`` rows. Page/figure/table counts are best-effort parsed from
    the arXiv comment field (e.g. "10 pages, 3 figures").

    Args:
        term: arXiv category identifier to search, e.g. ``'cs.LG'``.
    """
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            # arXiv ids look like "1234.5678v2"; drop the version suffix.
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Known paper: just merge this category into its list.
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                    Paper.update(category=categories).where(
                        Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper not exists in semantic scholar, arvixID : %s"
                    % arvixID)
                continue
            # Semantic Scholar may return null author ids; use -1 sentinel.
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip()
                    for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Best-effort parse of "N pages, M figures, K tables"
                    # style comments; a failure here must not lose the row.
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                # Was a bare `except:`; narrowed so Ctrl-C still aborts.
                except (TypeError, ValueError, KeyError):
                    logging.debug("Error in parsing meta data")
                paper.save()
            # Was `except BaseException`; Exception keeps SystemExit and
            # KeyboardInterrupt propagating as they should.
            except Exception as e:
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)  # rate-limit between per-article DB/API work