import glob
import hashlib
import logging
import sys
import time
from datetime import datetime, timedelta

import arxivpy
from flask import jsonify
from playhouse.shortcuts import model_to_dict
from tqdm import tqdm

# Peewee models (Paper, Author, Review, Congress, Congress_Paper, Autor,
# AutorDW, YearDW, AdmissionsDW) and project helpers (download_extract,
# get_author_data, get_arvixpaper_semantic_scholar, parse_xml_file, GSData,
# GSException, db, start_index, end_index) are assumed to be defined in the
# project's own modules; the imports above cover the external dependencies.


def main():
    papers = Paper.select().where(Paper.year == 2015)
    for paper in tqdm(papers):
        try:
            download_extract(paper)
        except KeyboardInterrupt:
            # Allow a clean manual abort mid-download.
            sys.exit()

def query_by_time():
    now = datetime.now()
    n_days = datetime(now.year, now.month, now.day) - timedelta(days=3)
    papers = Paper.select().where(Paper.post_time > n_days).order_by(
        Paper.post_time.desc()).execute()
    data = list(map(model_to_dict, papers))
    result = {'code': 0, 'data': data}
    return jsonify(result)

def crawl_author():
    papers = Paper.select().execute()
    for paper in tqdm(papers):
        for author_id in paper.authorID:
            if author_id == -1:
                continue
            query = Author.select().where(Author.authorID == author_id)
            if query.exists():
                continue
            success, author_profile = get_author_data(author_id)
            if success is False:
                logging.info("Author %d does not exist!" % author_id)
                continue
            statistics = author_profile['statistics']
            estCitation = statistics['estCitationAcceleration']['estimate']
            influencedIDList = statistics['influence']['influenced']
            influencedIDName = [
                inf['author']['ids'][0] for inf in influencedIDList
            ]
            influenceCount = len(influencedIDName)
            influencedByIDList = statistics['influence']['influencedBy']
            influencedByIDName = [
                inf['author']['ids'][0] for inf in influencedByIDList
            ]
            # Bug fix: count the "influenced by" list, not the
            # "influenced" list.
            influenceByCount = len(influencedByIDName)
            # Paper.create() persists the row, so no extra save() is needed.
            Author.create(
                authorID=author_id,
                name=author_profile['name'],
                semanticScholarUrl=author_profile['url'],
                hIndex=statistics['hIndex'],
                influentialCitationCount=author_profile[
                    'influentialCitationCount'],
                citationVelocity=author_profile['citationVelocity'],
                totalInfluentialCitationCount=statistics[
                    'totalInfluentialCitationCount'],
                maxEstCitationAcceleration=estCitation['max'],
                minEstCitationAcceleration=estCitation['min'],
                estCitationAcceleration=estCitation['value'],
                estCitationAccelerationConfidence=estCitation['confidence'],
                influencedIDList=influencedIDName,
                influencedPaper=influencedIDList,
                influenceCount=influenceCount,
                influencedByIDList=influencedByIDName,
                influencedByPaper=influencedByIDList,
                influenceByCount=influenceByCount,
                citationHistory=statistics['citedByYearHistogram'],
                totalPaper=len(author_profile['papers']),
            )
            time.sleep(0.3)  # throttle requests to the Semantic Scholar API

def query_by_hot():
    now = datetime.now()
    n_days = datetime(now.year, now.month, now.day) - timedelta(days=3)
    papers = Paper.select().where(Paper.post_time > n_days).order_by(
        Paper.read_num.desc()).execute()
    data = list(map(model_to_dict, papers))
    # Strip fields the hot-list view does not need.
    remove_keys = ['author', 'content', 'wx_name', 'add_time']
    for item in data:
        for key in remove_keys:
            item.pop(key)
    result = {'code': 0, 'data': data}
    return jsonify(result)

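# The two query_* functions above are Flask view functions (they return
# jsonify(...) responses). A minimal sketch of how they might be wired into an
# app; the app object and route paths here are assumptions for illustration,
# not taken from the source:
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/papers/latest', view_func=query_by_time)  # hypothetical path
app.add_url_rule('/papers/hot', view_func=query_by_hot)      # hypothetical path
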
def admissions():
    # Aggregate, per congress and per author, how many of that author's
    # papers were accepted and refused, then store the totals in the
    # data warehouse.
    all_years = YearDW.select()
    for y in all_years:
        all_congresses = Congress.select(Congress.idCongress) \
            .where(Congress.submissionDeadline.year == y.congressyear)
        for c in all_congresses:
            rel_congress_paper = Congress_Paper.select(Congress_Paper.idPaper) \
                .where(Congress_Paper.idCongress == c.idCongress)
            rel_congress = [r.idPaper for r in rel_congress_paper]
            autors_papers = [
                a.idParticipant
                for a in Autor.select(Autor.idParticipant)
                .where(Autor.idPaper << rel_congress)
            ]
            all_autors = AutorDW.select(AutorDW.idautor) \
                .where(AutorDW.idautor << autors_papers)
            for l_autor in all_autors:
                rel_autor = [
                    a.idPaper for a in Autor.select().where(
                        Autor.idParticipant == l_autor.idautor,
                        Autor.idPaper << rel_congress)
                ]
                n_refused = Paper.select() \
                    .where(Paper.accepted == False,
                           Paper.paperId << rel_autor,
                           Paper.paperId << rel_congress) \
                    .count()
                n_accepted = Paper.select() \
                    .where(Paper.accepted,
                           Paper.paperId << rel_autor,
                           Paper.paperId << rel_congress) \
                    .count()
                if (n_refused + n_accepted) > 0:
                    AdmissionsDW.get_or_create(idadmcongress=c.idCongress,
                                               idadmautor=l_autor.idautor,
                                               accepted=n_accepted,
                                               refused=n_refused,
                                               idadmyear=y.idyear)

def update_paper():
    all_papers = Paper.select()
    counter = 0
    for p in all_papers:
        counter += 1
        reviews = Review.select().where(Review.idPaper == p.paperId)
        num_reviews = len(reviews)
        if num_reviews == 0:
            continue  # no reviews yet; skip to avoid division by zero
        sum_score = sum(r.score for r in reviews)
        avg_score = int(sum_score / num_reviews)
        p.finalScore = avg_score
        # A paper needs at least three reviews and an average score of
        # at least 7 to remain accepted.
        if num_reviews < 3 or avg_score < 7:
            p.accepted = False
        p.save()
        if counter % 10000 == 0:
            print(f"{counter} papers updated")

def run():
    gs = GSData()
    db.connect()
    with open('data.txt', encoding='utf-8') as f:
        for line in f:
            wx_name = line.split(' ')[0]
            try:
                data = gs.query(wx_name)
            except GSException as ex:
                print(str(ex))
                continue
            for item in data:
                # Normalize the scheme to https and deduplicate by URL hash.
                url = 'https://{0}'.format(item['url'].split('://', 1)[1])
                md5s = hashlib.md5(url.encode('utf-8')).hexdigest()
                if Paper.select().where(Paper.url_hash == md5s).count():
                    continue
                print(item)
                p = Paper.create(
                    wx_name=item['wx_name'],
                    name=item['name'],
                    title=item['title'],
                    author=item['author'],
                    content=item['content'],
                    url=url,
                    url_hash=md5s,
                    post_time=datetime.strptime(item['posttime'],
                                                '%Y-%m-%d %H:%M:%S'),
                    add_time=datetime.strptime(item['add_time'],
                                               '%Y-%m-%d %H:%M:%S'))
                if isinstance(item['readnum_newest'], int):
                    p.read_num = item['readnum_newest']
                if isinstance(item['likenum_newest'], int):
                    p.like_num = item['likenum_newest']
                if item['picurl']:
                    p.pic_url = item['picurl']
                p.save()
            time.sleep(3)  # be polite between account queries
    db.close()

def update_paper():
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
        # Skip unparsable files and everything before the resume checkpoint.
        if article is None or idx < 346728:
            idx += 1
            continue
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if success is False:
            logging.debug(
                "Paper does not exist in Semantic Scholar, arvixID: %s"
                % arvixID)
            continue
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta[
                    'influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            try:
                # Parse page/figure/table counts out of free-text comments
                # such as "10 pages, 3 figures, 2 tables".
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                count = int(segment.split(meta)[0].strip())
                                if meta == 'page':
                                    paper.pages = count
                                elif meta == 'figure':
                                    paper.figures = count
                                elif meta == 'table':
                                    paper.table = count
                                break
            except Exception:
                logging.debug("Error in parsing meta data")
            paper.save()
        except BaseException as e:
            logging.warning("Error in arvix id %s, error: %s" %
                            (arvixID, str(e)))
        time.sleep(0.2)
        idx += 1

def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category: %s", term)
    # start_index and end_index are module-level crawl bounds.
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch: %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d-%d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Paper already stored: just record this category if new.
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                    Paper.update(category=categories).where(
                        Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arvixID: %s"
                    % arvixID)
                continue
            authorIDList = [
                int(author['authorId']) if author['authorId'] is not None
                else -1 for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Parse page/figure/table counts out of free-text comments
                    # such as "10 pages, 3 figures, 2 tables".
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    count = int(
                                        segment.split(meta)[0].strip())
                                    if meta == 'page':
                                        paper.pages = count
                                    elif meta == 'figure':
                                        paper.figures = count
                                    elif meta == 'table':
                                        paper.table = count
                                    break
                except Exception:
                    logging.debug("Error in parsing meta data")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)

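# A hypothetical entry point tying the crawlers together: harvest a few arXiv
# categories, then backfill author profiles. The crawl bounds and category
# list below are assumptions for illustration, not taken from the source.
if __name__ == '__main__':
    start_index, end_index = 0, 10000              # assumed crawl bounds
    for category in ['cs.LG', 'cs.CV', 'cs.CL']:   # assumed category list
        crawl_category(term=category)
    crawl_author()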