def download_info(self):
    # Fetch the 30 most recent cond-mat.quant-gas submissions
    self.articles = arxivpy.query(search_query=['cond-mat.quant-gas'],
                                  start_index=0, max_index=30,
                                  sort_by='submittedDate')

    # Download the watch list of authors (Dropbox-hosted CSV) and parse it
    p_ = 'interesting_authors.csv'
    url = "https://www.dropbox.com/s/yismcsi2ti35qse/interesting_authors.csv?dl=1"
    u = urllib.request.urlopen(url)
    data = u.read()
    u.close()
    with open(p_, "wb") as f:
        f.write(data)
    with open(p_, 'r') as my_file:
        reader = csv.reader(my_file, delimiter=',')
        self.interesting_authors = list(reader)[0]

    # Same for the watch list of title keywords
    p_ = 'interesting_keywords.csv'
    url = "https://www.dropbox.com/s/u9pqzmomoa0jgmm/interesting_keywords.csv?dl=1"
    u = urllib.request.urlopen(url)
    data = u.read()
    u.close()
    with open(p_, "wb") as f:
        f.write(data)
    with open(p_, 'r') as my_file:
        reader = csv.reader(my_file, delimiter=',')
        self.interesting_title_keywords = list(reader)[0]
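
# Hedged sketch (not part of the original source): once download_info() has
# populated self.articles, self.interesting_authors and
# self.interesting_title_keywords, a filter along these lines could flag the
# relevant articles. The dict keys 'title' and 'main_author' are the ones used
# elsewhere in these examples; treat them as assumptions about arxivpy's output.
def filter_interesting(articles, interesting_authors, interesting_keywords):
    """Return articles whose main author or title matches the CSV watch lists."""
    hits = []
    for article in articles:
        title = article.get('title', '').lower()
        author = article.get('main_author', '')
        by_author = any(a.strip() and a.strip() in author
                        for a in interesting_authors)
        by_keyword = any(k.strip() and k.strip().lower() in title
                         for k in interesting_keywords)
        if by_author or by_keyword:
            hits.append(article)
    return hits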
def scrape_arxiv():
    articles = arxivpy.query(search_query=['cs.CV', 'cs.LG', 'cs.CL', 'cs.NE', 'stat.ML'],
                             start_index=0, max_index=200,
                             results_per_iteration=100,
                             wait_time=5.0,
                             sort_by='lastUpdatedDate')  # grab 200 articles
    arxivpy.download(articles, path='arxiv_pdf')
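
# Hedged usage sketch (not from the original source): scrape_arxiv() downloads
# the PDFs into ./arxiv_pdf, so a caller might simply invoke it and list the
# result. The path 'arxiv_pdf' comes from the snippet above; everything else
# here is an assumption.
import os

if __name__ == '__main__':
    scrape_arxiv()
    pdfs = [f for f in os.listdir('arxiv_pdf') if f.endswith('.pdf')]
    print('downloaded {} PDFs into arxiv_pdf/'.format(len(pdfs)))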
def crawl_machine_learning(start_index: int, sort_order: str):
    conn = connect_database()
    machine_learning_categories = [
        'cs.CV', 'cs.CL', 'cs.LG', 'cs.AI', 'cs.NE', 'stat.ML'
    ]

    STEP = 100
    articles_per_minute = STEP * 2
    article_len = articles_per_minute

    if start_index == -1:
        start_index = Article.get_n_articles(conn) - STEP

    logging.info('crawling start')
    logging.info('start index : ' + str(start_index))
    logging.info('sort_order : ' + sort_order)

    while article_len == articles_per_minute:
        # query 100 results per iteration
        # wait 30 seconds per query
        try:
            articles = arxivpy.query(search_query=machine_learning_categories,
                                     start_index=start_index,
                                     max_index=start_index + articles_per_minute,
                                     results_per_iteration=STEP,
                                     wait_time=30,
                                     sort_by='lastUpdatedDate',
                                     sort_order=sort_order)

            # crawling log
            logging.info('last: ' + articles[-1]['published'])
            logging.info(str(start_index + STEP * 2) + ' articles crawled')

            # save articles
            for article in articles:
                Article(article, conn).save()

            # advance start_index
            start_index += STEP * 2

            # recompute article_len; the loop stops once a partial batch arrives
            article_len = len(articles)

            # sleep 5 minutes between queries
            time.sleep(MINUTE * 5)
        except Exception as e:
            logging.error(e)
            time.sleep(60 * 30)

    conn.close()
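
# Hedged sketch of the module-level setup crawl_machine_learning() relies on
# (not taken from the original source): a MINUTE constant, basic logging, and
# an entry point. connect_database() and Article are assumed to come from the
# surrounding project, and the 'descending' sort_order string is an assumption.
import logging
import time

MINUTE = 60  # seconds; crawl_machine_learning() sleeps in multiples of this

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    # Start from the beginning of the result set, newest-updated first.
    crawl_machine_learning(start_index=0, sort_order='descending')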
def fetch_recent_cv_papers(filename, num=65536):
    papers = arxivpy.query(search_query=['cs.CV'],
                           start_index=0, max_index=num - 1,
                           results_per_iteration=128,
                           wait_time=2.0,
                           sort_by='submittedDate')

    # Normalise articles
    for paper in papers:
        # Dates as strings
        paper['publish_date'] = paper['publish_date'].isoformat()
        paper['update_date'] = paper['update_date'].isoformat()

    with open(filename, 'w') as f:
        f.write(json.dumps(papers))
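
# Hedged companion sketch (not from the original source): reading the JSON file
# written by fetch_recent_cv_papers() back into memory. The date strings were
# produced by isoformat() above, so datetime.fromisoformat() (Python 3.7+)
# should round-trip them; treat that as an assumption.
import json
from datetime import datetime

def load_cv_papers(filename):
    with open(filename) as f:
        papers = json.load(f)
    for paper in papers:
        paper['publish_date'] = datetime.fromisoformat(paper['publish_date'])
        paper['update_date'] = datetime.fromisoformat(paper['update_date'])
    return papers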
import urllib.request, json
from phraseg import *
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib as mpl
import csv
import arxivpy

mpl.rcParams['figure.dpi'] = 300

articles = arxivpy.query(search_query=['cs.CL'],
                         start_index=0, max_index=500,
                         results_per_iteration=100,
                         wait_time=1.0,
                         sort_by='lastUpdatedDate')

datas = ""
for a in articles:
    datas += a['title'] + "\n"
    datas += a['abstract'] + "\n"
print("Finish fetching")

phraseg = Phraseg(datas, idf_chunk=300)
result = phraseg.extract(result_word_minlen=1, merge_overlap=True)

wordcloud = WordCloud(font_path='wordcloud/NotoSansCJKtc-Medium.otf',
                      width=1800, height=1000, margin=1,
                      background_color="white").fit_words(result)
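
# Hedged continuation (not in the original snippet): rendering and saving the
# word cloud built above with the already-imported matplotlib. The output
# filename is an assumption.
plt.figure(figsize=(18, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('cs_cl_wordcloud.png', bbox_inches='tight')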
def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    # start_index and end_index are expected to be defined in the enclosing scope
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('No articles found in batch %d - %d' %
                            (index, index + index_iteration))

        for idx, article in tqdm(enumerate(articles), total=article_batch_count):
            arvixID = article['id'].split('v')[0]

            # Already stored: just make sure this category is attached, then skip
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                    Paper.update(category=categories).where(
                        Paper.arvixID == arvixID).execute()
                continue

            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arvixID : %s" % arvixID)
                continue

            authorIDList = [
                int(author['authorId']) if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author

            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)

            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta['influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )

                # Best-effort parse of page/figure/table counts from the comment field
                try:
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    logging.debug("Error in parsing meta data")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s" % (arvixID, str(e)))
            time.sleep(0.3)
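
# Hedged, standalone restatement of the comment-parsing idea above (not part of
# the original source): pull "N pages, M figures, K tables" style counts out of
# an arXiv comment string. Comment formats vary widely, so this is best-effort.
def parse_comment_counts(comment):
    counts = {}
    if not comment:
        return counts
    comment = comment.replace(';', ',')
    for meta in ('page', 'figure', 'table'):
        for segment in comment.split(','):
            if meta in segment:
                prefix = segment.split(meta)[0].strip()
                if prefix.isdigit():
                    counts[meta] = int(prefix)
                break
    return counts

# Example: parse_comment_counts('10 pages, 5 figures, 2 tables')
# -> {'page': 10, 'figure': 5, 'table': 2}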
if args.random:
    args.start = int(np.random.uniform(low=0, high=10000))
    args.number = int(np.random.uniform(low=1, high=3))
    print("random mode, start: {}, number: {}".format(args.start, args.number))

if args.field == 'cv':
    search_query = ['cs.CV']
else:
    search_query = args.field
print('Searching for {}'.format(search_query))

articles = arxivpy.query(search_query=search_query,
                         start_index=args.start,
                         max_index=args.start + args.number,
                         results_per_iteration=100,
                         wait_time=5.0,
                         sort_by='lastUpdatedDate')  # grab args.number articles
print("Available Keys: ", articles[0].keys())
# print(articles[1])

paperlist_file = open("paperlist.txt", "w")
items = []
for idx, article in enumerate(articles):
    items.append(
        "============================================ Paper {} ===========================================\n"
        .format(idx + 1))
    items.append("Title: \n {}\n".format(article['title']))
    items.append("Author: \n {}\n".format(article['authors']))
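
# Hedged sketch of the argument parsing this snippet appears to assume (not from
# the original source); the flag names simply mirror the args.* attributes used
# above and are otherwise an assumption.
import argparse
import numpy as np
import arxivpy

parser = argparse.ArgumentParser(description='Dump a list of recent arXiv papers')
parser.add_argument('--field', default='cv', help="'cv' or an arXiv category string")
parser.add_argument('--start', type=int, default=0)
parser.add_argument('--number', type=int, default=100)
parser.add_argument('--random', action='store_true')
args = parser.parse_args()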
def crawl_machine_learning(start_index: int, sort_order: str):
    # conn = connect_database()
    DBU = RDS_utils()
    check_axv = arxiv_id_check(DBU)
    machine_learning_categories = [
        'cs.CV', 'cs.CL', 'cs.LG', 'cs.AI', 'cs.NE', 'stat.ML', 'cs.MA'
    ]

    STEP = 100
    articles_per_minute = STEP * 2
    article_len = articles_per_minute

    # if start_index == -1:
    #     start_index = Article.get_n_articles(conn) - STEP

    logging.info('crawling start')
    logging.info('start index : ' + str(start_index))
    logging.info('sort_order : ' + sort_order)

    update_paper, insert_paper, insertfail, updatefail = 0, 0, 0, 0

    while article_len == articles_per_minute:
        # query 100 results per iteration
        # wait 5 seconds per query
        try:
            start = time.time()
            articles = arxivpy.query(search_query=machine_learning_categories,
                                     start_index=start_index,
                                     max_index=start_index + articles_per_minute,
                                     results_per_iteration=STEP,
                                     wait_time=5,
                                     sort_by='lastUpdatedDate',
                                     sort_order=sort_order)

            # crawling log
            # logging.info('last: ' + articles[-1].get('published', ''))
            logging.info(str(start_index + STEP * 2) + ' articles crawled')

            # save articles
            for article in articles:
                data = Article(article, None).tolist()
                print("'{}' crawled / arxiv_id : {}".format(data[1], data[0]))
                axvid, pubyear = data[0], data[6]
                data[1], qt = get_qtitle(data[1])  # title
                if pubyear:
                    qt = qt.strip() + str(pubyear)

                # Look up an existing paper id: in-memory cache, DB, then title match
                pid = None
                if type(check_axv) == dict:
                    if check_axv.get(axvid):
                        pid = check_axv[axvid]
                else:
                    pid = DBU.get_pid_from_arXiv_id(axvid)
                if not pid:
                    pid = Duplication_check.check(qt)

                if pid:
                    ori = DBU.get_paper_by_p_id(pid)
                    data = Update_aXv_paper(ori, data)
                    if DBU.update_axv(pid, data):
                        update_paper += 1
                    else:
                        updatefail += 1
                else:
                    if DBU.insert_axv(data):
                        pid = DBU.get_pid_from_arXiv_id(axvid)
                        Duplication_check.insert_title_year(qt, pid)
                        insert_paper += 1
                    else:
                        insertfail += 1

            # advance start_index
            start_index += STEP * 2

            # recompute article_len; the loop stops once a partial batch arrives
            article_len = len(articles)

            e = int(time.time() - start)
            print('took {:02d}:{:02d}:{:02d} to crawl {} papers'.format(
                e // 3600, (e % 3600 // 60), e % 60, article_len))

            # sleep 30 seconds instead of a full minute
            time.sleep(MINUTE / 2)
        except Exception as e:
            logging.error(e)
            print("insert fail : {}, update fail : {}".format(insertfail, updatefail))
            DBU.DB.conn.close()
            return start_index, insert_paper, update_paper

    print("insert fail : {}, update fail : {}".format(insertfail, updatefail))
    DBU.DB.conn.close()
    return start_index, insert_paper, update_paper
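
# Hedged driver sketch (not from the original source): this version of
# crawl_machine_learning() returns the index it stopped at, so a long-running
# job could resume from it. MINUTE, RDS_utils and the other helpers are assumed
# to come from the surrounding project, and 'descending' is an assumed
# sort_order value.
import logging
import time

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    next_index = 0
    while True:
        next_index, inserted, updated = crawl_machine_learning(
            start_index=next_index, sort_order='descending')
        logging.info('pass finished: %d inserted, %d updated', inserted, updated)
        time.sleep(60 * 30)  # back off before the next pass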