def insert_paper():
    if request.method == 'POST':
        paper = db.session.query(Paper).filter_by(doi=request.json['doi']).first()
        if not paper:
            paper = Paper(year=request.json['year'],
                          title=request.json['title'],
                          abstract=request.json['abstract'],
                          user_id=g.user.id,
                          doi=request.json['doi'])
            db.session.add(paper)
            db.session.flush()
        for author in request.json['authors']:
            paper_author = db.session.query(Author).filter_by(name=author).first()
            if paper_author:
                paper_author.start_owning(paper)
            else:
                paper_author = Author(name=author)
                db.session.add(paper_author)
                db.session.flush()
                paper_author.start_owning(paper)
        db.session.commit()
        for doi in request.json['doi_refs']:
            ref_paper = db.session.query(Paper).filter_by(doi=doi).first()
            if ref_paper:
                paper.start_referencing(ref_paper)
            else:
                ref_paper = Paper(doi=doi)
                db.session.add(ref_paper)
                db.session.flush()
                paper.start_referencing(ref_paper)
        db.session.commit()
        return json.dumps(dict(data=request.json))

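# A minimal sketch of the start_owning / start_referencing helpers the view
# above assumes. The association tables, column set, and plain-SQLAlchemy
# declarative Base are assumptions for illustration (the view itself uses
# Flask-SQLAlchemy's db.session), not taken from the source.
from sqlalchemy import Column, ForeignKey, Integer, String, Table
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()

author_paper = Table(
    'author_paper', Base.metadata,
    Column('author_id', ForeignKey('author.id'), primary_key=True),
    Column('paper_id', ForeignKey('paper.id'), primary_key=True))

paper_reference = Table(
    'paper_reference', Base.metadata,
    Column('citing_id', ForeignKey('paper.id'), primary_key=True),
    Column('cited_id', ForeignKey('paper.id'), primary_key=True))

class Paper(Base):
    __tablename__ = 'paper'
    id = Column(Integer, primary_key=True)
    doi = Column(String, unique=True)
    # Self-referential many-to-many: papers this paper cites.
    references = relationship(
        'Paper', secondary=paper_reference,
        primaryjoin=id == paper_reference.c.citing_id,
        secondaryjoin=id == paper_reference.c.cited_id,
        backref='cited_by')

    def start_referencing(self, ref_paper):
        # Append only once; the association table has a composite primary key.
        if ref_paper not in self.references:
            self.references.append(ref_paper)

class Author(Base):
    __tablename__ = 'author'
    id = Column(Integer, primary_key=True)
    name = Column(String, unique=True)
    papers = relationship('Paper', secondary=author_paper, backref='authors')

    def start_owning(self, paper):
        if paper not in self.papers:
            self.papers.append(paper)
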
def add(request):
    """Add action"""
    form = None
    if request.method == "POST":
        form = NewPaperForm(request.POST)
        if form.is_valid():
            if hasattr(form.cleaned_data['tags'], 'split'):
                form.cleaned_data['tags'] = form.cleaned_data['tags'].split(",")
            p = Paper(**form.cleaned_data)
            p.put()
            return HttpResponseRedirect(p.permalink())
    elif request.method == "GET":
        form = NewPaperForm()
    return render_to_response('add_paper.html',
                              {"form": form,
                               "dest_url": reverse(__name__ + '.add')})

def dbSavePapersAndAuthors(papers, latestMailing=True):
    """Saves an array of paper information into the database.

    Returns numbers of new papers and authors added.

    If the latestMailing argument is true, then sets the paper dates to
    either today or tomorrow, regardless of the date from the arXiv. It sets
    to today if the function is run before 8pm ET, and to tomorrow otherwise.
    The idea is that this function should be run regularly every day, the
    night that the mailing goes out. If run late in the day before midnight,
    then the mailing has tomorrow's date. If run early in the day, e.g., if
    for some reason it didn't run when it should have, then the mailing was
    sent out yesterday and is for today.
    """
    if latestMailing:
        latestMailingDate = datetime.date.today()
        now = datetime.datetime.now(pytz.timezone('US/Eastern'))
        cutoff = now.replace(hour=20, minute=0, second=0, microsecond=0)
        if now > cutoff:
            # note: The official mailing date is the day the email goes out,
            # a few hours after the paper was made available.
            latestMailingDate += datetime.timedelta(days=1)
    numNewPapersAdded = numNewAuthorsAdded = 0
    for paper in papers:
        authors = []
        for author in paper['authors']:
            authorsWithSameName = Author.objects.filter(name=author)
            if authorsWithSameName:
                # An author with the same name already exists in the database;
                # don't add a duplicate. There might be duplicates already ---
                # take the first (maybe fix later).
                a = authorsWithSameName[0]
            else:
                a = Author(name=author)
                a.save()
                numNewAuthorsAdded += 1
            authors.append(a)
        if Paper.objects.filter(arxivId=paper['arxivId']):
            # NOTE: If we make a mistake adding the paper the first time, this
            # line will keep the code below from ever running to fix it.
            continue
        if latestMailing:
            mailing_date = latestMailingDate
        else:
            mailing_date = mailingDate(paper['datePublished'])
        p = Paper(
            arxivId=paper['arxivId'],
            title=paper['title'],
            abstract=paper['abstract'],
            date_published=paper['datePublished'],
            date_mailed=mailing_date,
            # authors=authors,  # ManyToManyField is set up later
            category=paper['category'],
            categories=paper['categories'],
            version=paper['version'],
            linkAbsPage=paper['linkAbsPage'],
            linkPdf=paper['linkPdf'],
        )
        p.save()  # need to save before setting up the ManyToMany field of authors
        for author in authors:
            # alternatively, to clear a ManyToMany field, use p.authors.clear()
            p.authors.add(author)
        p.save()
        numNewPapersAdded += 1
    print("%d new papers, %d new authors added" % (numNewPapersAdded, numNewAuthorsAdded))
    return numNewPapersAdded, numNewAuthorsAdded

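# Worked illustration of the 8pm ET cutoff in dbSavePapersAndAuthors, with
# fixed example timestamps (both in June, so EDT applies throughout):
import datetime
import pytz

eastern = pytz.timezone('US/Eastern')
for hour in (19, 21):
    now = eastern.localize(datetime.datetime(2020, 6, 1, hour, 0))
    cutoff = now.replace(hour=20, minute=0, second=0, microsecond=0)
    mailing = now.date() + datetime.timedelta(days=1 if now > cutoff else 0)
    print(now.strftime('%H:%M'), '->', mailing)
# 19:00 -> 2020-06-01 (before the cutoff: mailing dated today)
# 21:00 -> 2020-06-02 (after the cutoff: mailing dated tomorrow)
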
def papers_view(request):
    user = UserProfile.objects.get(username=request.user.username)
    papers = Paper.objects.all()
    paper_data = []
    for paper in papers:
        # Apply policy to paper author.
        if not paper.policy_paperlabel(user):
            paper.author = Paper.jeeves_get_private_author(paper)
        paper_versions = PaperVersion.objects.filter(paper=paper).order_by('-time').all()
        latest_version_title = paper_versions[0].title if len(paper_versions) > 0 else None
        # Make sure we're actually allowed to see the paper. Guard the index:
        # the original checked paper_versions[0] even when the queryset was empty.
        if len(paper_versions) > 0 and \
                not paper_versions[0].jeeves_restrict_paperversionlabel(user):
            latest_version_title = PaperVersion.jeeves_get_private_title(paper_versions[0])
        paper_data.append({'paper': paper, 'latest': latest_version_title})
    return ("papers.html", {
        'papers': papers,
        'which_page': "home",
        'paper_data': paper_data,
        'name': user.name,
    })

def index(request):
    papers = Paper.all()
    papers.order('-date')
    papers = papers.fetch(1000)

    def paper_compare(a, b):
        return b.score() - a.score()

    papers.sort(paper_compare)
    return render_to_response('papers.html', {"papers": papers[:30]})

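# The cmp-style comparator above is Python 2 only. Under Python 3 the same
# descending-score order would be expressed with a key function, or by
# wrapping the comparator with functools.cmp_to_key (a sketch, assuming the
# same `papers` list):
import functools

papers.sort(key=lambda p: p.score(), reverse=True)
# equivalently, keeping the original comparator:
papers.sort(key=functools.cmp_to_key(lambda a, b: b.score() - a.score()))
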
def AbstractPool(request, review_name_slug):
    review = Review.objects.get(user=request.user, slug=review_name_slug)
    if request.method == "POST":
        if request.POST.get('results') is None:
            q = request.POST.get('queryField')
            s = request.POST.get('sortType')
            n = request.POST.get('noResults')
            abstractList = search.main(q, s, n)
            # Iterate over a copy: removing from a list while iterating over
            # it directly skips elements.
            for document in abstractList[:]:
                documentURL = document.get("url")
                if Paper.objects.filter(paper_url=documentURL, review=review).exists():
                    abstractList.remove(document)
        else:
            abstractList = eval(request.POST.get('results'))
            q = request.POST.get('queryField')
        relevant = "Unchecked"
        if request.POST.get("relevanceField") == "relevant":
            relevant = "Relevant"
        elif request.POST.get("relevanceField") == "irrelevant":
            relevant = "Not Relevant"
        if relevant != "Unchecked":
            print("traceA")
            compareCount_value = int(request.POST.get("hiddenCompareCount"))
            for s in abstractList:
                if s.get('compareCount') == compareCount_value:
                    currentDoc = s
                    paper = Paper(review=review,
                                  title=currentDoc["title"],
                                  paper_url=currentDoc["url"],
                                  full_text=currentDoc['fullText'],
                                  abstract=currentDoc["abstract"],
                                  authors=currentDoc["author"],
                                  abstract_relevance=relevant)
                    paper.save()
            if len(abstractList) > 1:
                for abstract in abstractList:
                    if int(abstract.get('compareCount')) > compareCount_value - 1:
                        abstract['compareCount'] -= 1
                del abstractList[compareCount_value - 1]
            else:
                abstractList = []
            # for abstract in abstractList:
            #     if int(abstract.get('compareCount')) > compareCount_value:
            #         abstract['compareCount'] -= 1
            # del abstractList[compareCount_value]
    return render(request, 'ultimatereview/AbstractPool.html',
                  {"Abstracts": abstractList, 'query': q,
                   'review': review.title, 'slug': review_name_slug})

def view(request, id):
    """Handler for individual papers"""
    # Paper.sample_init()
    paper = Paper.get_by_id(int(id))
    values = {
        "paper": paper,
        "paper_date": util.as_time_ago(paper.date),
        "paper_html_description": util.plaintext2html(paper.description),
    }
    return render_to_response('paper.html', values)

def _paperAdd(requestData, user):
    '''Concrete handling for creating a new questionnaire (Paper).'''
    # Keys present in the request, used below to tell "absent" from
    # "explicitly submitted as empty"
    keys = requestData.keys()
    data = {}
    # fields = zip(*Paper._meta.get_fields_with_model())[0]
    for field in getModelFields(Paper):
        # Skip fields automatically created by the framework
        if field.auto_created:
            continue
        # Read the value from the request data
        value = requestData.get(field.name, None)
        # Special handling for JSON boolean values
        if type(field) == BooleanField:
            value = jsonBoolean2Python(value)
        # Special handling for the creator / last-modifier fields
        if field.name in [USER_CREATE_BY_FIELD_NAME, USER_MODIFY_BY_FIELD_NAME]:
            value = user
        # If the caller did not explicitly submit an empty value, leave the
        # field out of data so the model's default value applies. This check
        # must not run before the createBy/modifyBy handling above.
        if value is None and field.name not in keys:
            continue
        # Collect the validated value, ready for the database write
        data[field.name] = value
    paper = Paper(**data)
    # Validate the data
    try:
        paper.full_clean()
    except ValidationError as exception:
        return packageResult(
            RESULT_CODE.ERROR, RESULT_MESSAGE.VALIDATION_ERROR,
            {'validationMessage': exception.message_dict})
    # Save to the database
    paper.save()
    return packageResult(RESULT_CODE.SUCCESS, RESULT_MESSAGE.SUCCESS,
                         {'paperId': paper.id})

def upload(request):
    if request.method == 'POST':
        # form = UploaderForm(request.POST, request.FILES)
        # if form.is_valid():
        paper_title = request.FILES['ups'].name
        uploaded = request.FILES['ups']
        print(uploaded)
        print(uploaded.size)
        username = get_user_fromcookie(request)
        data = {'title': paper_title, 'paper_file': uploaded}
        form = UploadForm(data)
        obj = Paper(title=paper_title, paper_file=uploaded, user=username)
        obj.save()
        # form.save()
        # if form.is_valid():
        #     form.save()
        # else:
        #     print form.errors
    # return HttpResponse("errros")
    return HttpResponse('Research Paper uploaded')

def upload(request):
    if request.method == 'POST':
        t = []
        # form = UploaderForm(request.POST, request.FILES)
        # if form.is_valid():
        paper_title = request.FILES['ups'].name
        uploaded = request.FILES['ups']
        tags = request.POST['tags']
        t = tags.split(',')
        print(t)
        p = []
        for i in range(len(t) - 1):
            l = Topic.objects.get(subject=t[i].encode('ascii', 'replace'))
            p.append(l.id)
        # print uploaded
        print(p)
        # print uploaded.size
        username = get_user_fromcookie(request)
        data = {'title': paper_title, 'paper_file': uploaded}
        form = UploadForm(data)
        obj = Paper(title=paper_title, paper_file=uploaded, user=username)
        obj.save()
        pap = Paper.objects.get(title=paper_title)
        for i in p:
            pap.tags.add(int(i))
        convert(paper_title)
        # form.save()
    return render(request, 'thanks.html', {'message': 'Research Paper Uploaded'})

def createTestPaper():
    for i in range(3):
        paper = Paper(name="paper" + str(i),
                      description="paper" + str(i) + "...",
                      state='A')
        paper.save()
        userpaper = UserPaper(user=testUser, paper=paper,
                              finish_state="finished")
        userpaper.save()
    for i in range(4, 7):
        paper = Paper(name="paper" + str(i),
                      description="paper" + str(i) + "...",
                      state='A')
        paper.save()
        userpaper = UserPaper(user=testUser, paper=paper,
                              finish_state="unfinished")
        userpaper.save()

def save(self, user):
    d = self.cleaned_data
    authors = [user]
    if 'coauthor1' in d:
        authors.append(d['coauthor1'])
    if 'coauthor2' in d:
        authors.append(d['coauthor2'])
    if 'coauthor3' in d:
        authors.append(d['coauthor3'])
    paper = Paper()
    paper.save()
    paper.authors.add(user)
    for coauthor in d['coauthors']:
        paper.authors.add(coauthor)
    paper.save()
    d['contents'].name = '%030x' % random.randrange(16**30) + ".pdf"
    paper_version = PaperVersion(
        paper=paper,
        title=d['title'],
        abstract=d['abstract'],
        contents=d['contents'],
    )
    paper_version.save()
    # need to save paper twice since paper and paper_version point to each other...
    paper.latest_version = paper_version
    paper.save()
    for conflict_username in d['conflicts']:
        ra = ReviewAssignment()
        ra.user = User.objects.get(username=conflict_username)
        ra.paper = paper
        ra.type = 'conflict'
        ra.save()
    return paper

def __get_expanded_tuples(inner_table):
    sql = """
        with ids as ({0})
        select a.id, a.name, a.affiliation,
               pa.authorid, pa.paperid, pa.name, pa.affiliation,
               p.id, p.title, p.year, p.keyword,
               c.id, c.shortname, c.fullname,
               j.id, j.shortname, j.fullname
        from ids
        inner join paperauthor pa on ids.authorid = pa.authorid and ids.paperid = pa.paperid
        inner join author a on pa.authorid = a.id
        inner join paper p on pa.paperid = p.id
        left join conference c on p.conferenceid = c.id
        left join journal j on p.journalid = j.id
    """.format(inner_table)
    build_from_row = lambda r: Expanded(
        author=Author._make(r[0:3]) if r[0] else None,
        paperauthor=PaperAuthor._make(r[3:7]) if r[3] and r[4] else None,
        paper=Paper._make(r[7:11]) if r[7] else None,
        conference=Conference._make(r[11:14]) if r[11] else None,
        journal=Journal._make(r[14:17]) if r[14] else None)
    return [build_from_row(r) for r in __execute_sql(sql)]

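# Hypothetical usage of __get_expanded_tuples: the inner table must expose
# authorid and paperid columns, since the CTE is joined on both. The literal
# query below is an illustration, not from the source.
inner = "select authorid, paperid from paperauthor where paperid = 42"
for row in __get_expanded_tuples(inner):
    print(row.author.name if row.author else None,
          row.paper.title if row.paper else None)
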
def get_portfolio_papers(portfolio_id: int):
    return [Paper(row['id'])
            for row in query.fetch('shares', ['id'], portfolio_id=portfolio_id)]

def new(request):
    papers = Paper.all()
    papers.order('-date')
    papers = papers.fetch(30)
    return render_to_response('papers.html', {"papers": papers})

def update_paper():
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
        # Skip unparsable records and everything before the hard-coded resume index
        if article is None or idx < 346728:
            idx += 1
            continue
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if success is False:
            logging.debug("Paper not exists in semantic scholar, arvixID : %s" % arvixID)
            continue
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta['influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            try:
                # Parse page/figure/table counts out of the free-text comment
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                page_prefix = segment.split(meta)[0]
                                if meta == 'page':
                                    paper.pages = int(page_prefix.strip())
                                elif meta == 'figure':
                                    paper.figures = int(page_prefix.strip())
                                elif meta == 'table':
                                    paper.table = int(page_prefix.strip())
                                break
            except Exception:
                logging.debug("Error in parsing meta data")
            paper.save()
        except BaseException as e:
            logging.warning("Error in arvix id %s, error: %s" % (arvixID, str(e)))
        time.sleep(0.2)
        idx += 1

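# Standalone illustration of the comment-parsing heuristic in update_paper:
# arXiv comments like "14 pages, 5 figures, 2 tables" are split on commas and
# the number preceding each keyword is taken as the count. The helper name is
# ours, for illustration only.
def parse_comment_counts(comment):
    counts = {}
    for meta in ['page', 'figure', 'table']:
        if meta in comment:
            for segment in comment.replace(';', ',').split(','):
                if meta in segment:
                    try:
                        counts[meta] = int(segment.split(meta)[0].strip())
                    except ValueError:
                        pass
                    break
    return counts

assert parse_comment_counts("14 pages, 5 figures, 2 tables") == \
    {'page': 14, 'figure': 5, 'table': 2}
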
def test_paper(self):
    with db_session:
        p = Paper(paper_id='666aaaaa66')
        p.title = 'hehehehhehe'

def write_db(self):
    print("len of entry list " + str(len(self.entry_list)))
    for entry in self.entry_list:
        paper = Paper()
        if "id" in entry:
            paper.id = entry["id"]
        if "type" in entry:
            paper.type = entry["type"]
        if "title" in entry:
            paper.title = entry["title"]
        if "author" in entry:
            paper.authors = entry["author"]
        if "year" in entry:
            paper.year = int(entry["year"])
        if "journal" in entry:
            paper.journal = entry["journal"]
        if "booktitle" in entry:
            paper.book_title = entry["booktitle"]
        if "publisher" in entry:
            paper.publisher = entry["publisher"]
        if "institution" in entry:
            paper.institution = entry["institution"]
        if "volume" in entry:
            paper.volume = int(entry["volume"])
        if "number" in entry:
            paper.number = int(entry["number"])
        if "pages" in entry:
            paper.pages = entry["pages"]
        if "url" in entry:
            paper.url = entry["url"]
        if "doi" in entry:
            paper.doi = entry["doi"]
        if "isbn" in entry:
            paper.isbn = entry["isbn"]
        paper.save()

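# Table-driven equivalent of the ladder above: (entry key, Paper attribute,
# converter), with None meaning "assign the raw value". A sketch of a drop-in
# replacement for the same method; behavior is unchanged.
FIELD_MAP = [
    ("id", "id", None), ("type", "type", None), ("title", "title", None),
    ("author", "authors", None), ("year", "year", int),
    ("journal", "journal", None), ("booktitle", "book_title", None),
    ("publisher", "publisher", None), ("institution", "institution", None),
    ("volume", "volume", int), ("number", "number", int),
    ("pages", "pages", None), ("url", "url", None),
    ("doi", "doi", None), ("isbn", "isbn", None),
]

def write_db(self):
    print("len of entry list " + str(len(self.entry_list)))
    for entry in self.entry_list:
        paper = Paper()
        for key, attr, conv in FIELD_MAP:
            if key in entry:
                value = entry[key]
                setattr(paper, attr, conv(value) if conv else value)
        paper.save()
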
# echo=True goes on the engine
engine = create_engine('sqlite:///my_papers.db')
Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)

jchem_phys = Journal(name='J. Chem. Phys', publisher='AIP')
jcim = Journal(name='J. Chem. Inf. Model', publisher='ACS')

# Add a paper to the DB
molssi_paper = Paper(
    DOI='10.1063/1.5052551',
    paper_title='Perspective: Computational chemistry software and its advancement '
                'as illustrated through three grand challenge cases for molecular science',
    journal=jchem_phys,
    publication_year=2018,
    authors='Anna Krylov, Theresa L. Windus, Taylor Barnes, Eliseo Marin-Rimoldi, '
            'Jessica A. Nash, Benjamin Pritchard, Daniel G.A. Smith, Doaa Altarawy, '
            'Paul Saxe, Cecilia Clementi, T. Daniel Crawford, Robert J. Harrison, '
            'Shantenu Jha, Vijay S. Pande, Teresa Head-Gordon')

# Add another paper
bse_paper = Paper(
    DOI='10.1021/acs.jcim.9b00725',
    paper_title='New Basis Set Exchange: An Open, Up-to-Date Resource for the '
                'Molecular Sciences Community',
    journal=jcim,
    publication_year=2019,
    authors='Benjamin P. Pritchard, Doaa Altarawy, Brett Didier, Tara D. Gibson, '
            'Theresa L. Windus')

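# As written, nothing above is attached to the session; a minimal completion
# (assuming the Journal/Paper models already shown) adds and commits the rows:
session.add_all([jchem_phys, jcim, molssi_paper, bse_paper])
session.commit()

# Sanity check: read the papers back, newest first.
for paper in session.query(Paper).order_by(Paper.publication_year.desc()):
    print(paper.publication_year, paper.paper_title)
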
def get_paper(ticker: str, portfolio_id: int, holder_id: int) -> Paper:
    try:
        return Paper(query.fetch('shares', ['id'],
                                 ticker=ticker,
                                 holder_id=holder_id,
                                 portfolio_id=portfolio_id)[0]['id'])
    except IndexError:
        # No matching share row; returns None despite the Paper annotation.
        return None

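# Hypothetical callers of the two share-lookup helpers above; the ticker and
# ids are examples. Both helpers lean on the same query.fetch(table, columns,
# **filters) call assumed by the snippets.
holding = get_paper('AAPL', portfolio_id=1, holder_id=7)
if holding is None:
    print('no such holding')
print(len(get_portfolio_papers(portfolio_id=1)), 'holdings in portfolio')
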
def vote(request, id):
    paper = Paper.get_by_id(int(id))
    if request.method == 'GET':
        paper.points += 1
        paper.put()
    return HttpResponseRedirect(paper.permalink())

def paper_view(request):
    user = UserProfile.objects.get(username=request.user.username)
    # note: objects.get() raises DoesNotExist rather than returning None
    paper = Paper.objects.get(id=request.GET.get('id', ''))
    if paper is not None:
        if request.method == 'POST':
            if request.POST.get('add_comment', 'false') == 'true':
                Comment.objects.create(paper=paper, user=user,
                                       contents=request.POST.get('comment', ''))
            elif request.POST.get('add_review', 'false') == 'true':
                Review.objects.create(
                    paper=paper,
                    reviewer=user,
                    contents=request.POST.get('review', ''),
                    score_novelty=int(request.POST.get('score_novelty', '1')),
                    score_presentation=int(request.POST.get('score_presentation', '1')),
                    score_technical=int(request.POST.get('score_technical', '1')),
                    score_confidence=int(request.POST.get('score_confidence', '1')),
                )
            elif request.POST.get('new_version', 'false') == 'true' and user == paper.author:
                contents = request.FILES.get('contents', None)
                if contents is not None and paper.author is not None:
                    set_random_name(contents)
                    PaperVersion.objects.create(
                        paper=paper,
                        title=request.POST.get('title', ''),
                        contents=contents,
                        abstract=request.POST.get('abstract', ''),
                    )
        all_paper_versions = PaperVersion.objects.filter(paper=paper).order_by('time').all()
        paper_versions = []
        for paper_version in all_paper_versions:
            print(paper_version)
            if paper_version.jeeves_restrict_paperversionlabel(user):
                paper_versions.append(paper_version)
        all_coauthors = PaperCoauthor.objects.filter(paper=paper).all()
        coauthors = []
        for coauthor in all_coauthors:
            if coauthor.jeeves_restrict_papercoauthorlabel(user):
                coauthors.append(coauthor)
        # Check the length before indexing; the original indexed
        # paper_versions[0] before testing whether the list was empty.
        if len(paper_versions) == 0:
            latest_abstract = None
            latest_title = None
        elif paper_versions[0].jeeves_restrict_paperversionlabel(user):
            latest_abstract = paper_versions[0].abstract
            latest_title = paper_versions[0].title
        else:
            latest_abstract = PaperVersion.jeeves_get_private_abstract(paper_versions[0])
            latest_title = PaperVersion.jeeves_get_private_title(paper_versions[0])
        reviews = Review.objects.filter(paper=paper).order_by('-time').all()
        for review in reviews:
            if not review.jeeves_restrict_reviewlabel(user):
                review.paper = Review.jeeves_get_private_paper(review)
                # typo fixed: was Review.jeeves.get_private_reviewer
                review.reviewer = Review.jeeves_get_private_reviewer(review)
                review.contents = Review.jeeves_get_private_contents(review)
                review.score_novelty = Review.jeeves_get_private_score_novelty(review)
                review.score_presentation = Review.jeeves_get_private_score_presentation(review)
                review.score_technical = Review.jeeves_get_private_score_technical(review)
                review.score_confidence = Review.jeeves_get_private_score_confidence(review)
        all_comments = Comment.objects.filter(paper=paper).order_by('time').all()
        comments = []
        for comment in all_comments:
            if comment.jeeves_restrict_reviewlabel(user):
                comment.paper = Comment.jeeves_get_private_paper(comment)
                comment.user = Comment.jeeves_get_private_user(comment)
                comment.contents = Comment.jeeves_get_private_contents(comment)
                comments.append(comment)  # typo fixed: was "commends.append"
        if paper.policy_paperlabel(user):
            author = paper.author
        else:
            author = Paper.jeeves_get_private_author(paper)
    else:
        paper = None
        paper_versions = []
        coauthors = []
        latest_abstract = None
        latest_title = None
        reviews = []
        comments = []
        author = None
    return ("paper.html", {
        'paper': paper,
        'paper_versions': paper_versions,
        'author': author,
        'coauthors': coauthors,
        'latest_abstract': latest_abstract,
        'latest_title': latest_title,
        'reviews': reviews,
        'comments': comments,
        'which_page': "paper",
        'review_score_fields': [
            ("Novelty", "score_novelty", 10),
            ("Presentation", "score_presentation", 10),
            ("Technical", "score_technical", 10),
            ("Confidence", "score_confidence", 10),
        ],
    })

def post(self, request):
    param = QueryDict(request.body)
    uuid = param.get('uuid')
    title = param.get('title')
    time = param.get('time')
    origin = param.get('origin')
    _authors = param.getlist('authors')
    link = param.get('link')
    _tags = param.getlist('tags')
    content = param.get('content')
    refer_to = param.getlist('reference')
    score = param.get('score')
    try:
        year, month = time.split('-')
        year, month = int(year), int(month)
        publish_time = datetime.date(year, month, 1)
    except Exception:
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': 'Invalid date: {}'.format(time)}, status=500)
    for _tag in _tags:
        try:
            _tag = int(_tag)
            _ = ResearchTag.objects.get(research_tag_id=_tag)
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': 'Invalid tag: {}'.format(_tag)}, status=500)
    tags = ResearchTag.objects.filter(research_tag_id__in=[int(_t) for _t in _tags])
    author_ids = []
    for _author in _authors:
        if _author.isdigit():
            author_ids.append(int(_author))
        elif Author.objects.filter(name=_author).exists():
            a = Author.objects.get(name=_author).author_id
            author_ids.append(a)
        else:
            a = Author(name=_author)
            a.save()
            author_ids.append(a.author_id)
    authors = Author.objects.filter(author_id__in=author_ids)
    try:
        score = int(score)
    except Exception:
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': 'Invalid score format'}, status=500)
    if not Paper.objects.filter(paper_uuid=uuid).exists():
        # Creation case
        try:
            comment = PaperComment(content=content)
            comment.save()
            paper = Paper(paper_uuid=uuid, title=title, publish_origin=origin,
                          publish_time=publish_time, author=authors, link=link,
                          tag=tags, comment=comment, self_score=score)
            paper.save()
            redis.set(self.LATEST_KEY, str(uuid_gen.uuid4()))
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': 'Save failed'}, status=500)
        else:
            return JsonResponse({
                'next': reverse('paperdb.detail', kwargs={'paper_uuid': paper.paper_uuid})
            })
    try:
        # Edit case
        paper = Paper.objects.get(paper_uuid=uuid)
    except Exception:
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': 'Bad uuid / no matching paper record'}, status=404)
    else:
        paper.title = title
        paper.publish_time = publish_time
        paper.publish_origin = origin
        paper.author = authors
        paper.link = link  # bug fix: was the no-op "paper.link = paper.link"
        paper.tag = tags
        paper.self_score = score
        try:
            paper.save()
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': 'Save failed'}, status=500)
    if paper.comment is None:
        if content != '':
            comment = PaperComment(content=content)
            comment.save()
            paper.comment = comment
            paper.save()
    elif content != paper.comment.content.replace('\r\n', '\n'):
        # normalize CRLF/LF differences from the "traditional" editor
        paper.comment.content = content
        paper.comment.save()
    for refer_to_paper in Paper.objects.filter(paper_uuid__in=refer_to):
        if not Reference.objects.filter(reference_src=paper,
                                        reference_trg=refer_to_paper).exists():
            reference = Reference(reference_src=paper, reference_trg=refer_to_paper)
            reference.save()
    return JsonResponse({
        'next': reverse('paperdb.detail', kwargs={'paper_uuid': paper.paper_uuid})
    })

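# Hypothetical client request for the endpoint above; the URL and field
# values are illustrative. requests encodes list values as repeated keys,
# which is what QueryDict.getlist('authors') / getlist('tags') expects.
import requests

resp = requests.post('http://localhost:8000/paper/', data={
    'uuid': '3f2b2c4e-1111-2222-3333-444455556666',
    'title': 'An Example Paper',
    'time': '2020-06',           # parsed as "year-month"
    'origin': 'NeurIPS',
    'authors': ['1', 'Alice'],   # existing author ids or new names
    'link': 'https://example.org/paper.pdf',
    'tags': ['1', '2'],          # must be existing ResearchTag ids
    'content': 'short review of the paper',
    'score': '4',
})
print(resp.status_code, resp.json())
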
def delete_paper_index():
    Paper.delete_papers()

def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles), total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Paper already stored: just make sure this category is on it
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                    Paper.update(category=categories).where(
                        Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug("Paper not exists in semantic scholar, arvixID : %s" % arvixID)
                continue
            authorIDList = [
                int(author['authorId']) if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta['influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Parse page/figure/table counts out of the free-text comment
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except Exception:
                    logging.debug("Error in parsing meta data")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s" % (arvixID, str(e)))
            time.sleep(0.3)

def get_references_citations_by_id(profile_id):
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
    if MONGO:
        if data_collection.find({"id": profile_id}).count() > 0:
            # This record has already been crawled
            return []
    print('func2')
    if not profile_id:
        return -1
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    while True:
        try:
            response = session.get(
                'https://cn.bing.com/academic/profile?id={}&encoded=0'
                '&v=paper_preview&mkt=zh-cn'.format(profile_id),
                headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            break
        except KeyboardInterrupt:
            raise
        except Exception as e:
            time.sleep(3.0)
            print(e)
    result = re.search(r'IG:"(.*?)"', response.text)
    if result:
        ig = result.group(1)
    # "被 引 量" is the page's citation-count label; the regexes must match the
    # scraped Chinese markup verbatim.
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        response.text)
    if result:
        citation_num = result.group(1)
    html = etree.HTML(response.text)
    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span>'
            r'<span class="aca_content"><div>(.*?)</div>', response.text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath('//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath('//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        # "发表日期" is the publication-date label
        result = re.search(
            r'<span class="aca_label">发表日期</span></span>'
            r'<span class="aca_content"><div>(\d*)</div>', response.text)
        if result:
            paper.publish_year = result.group(1)
        base_url = ('https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1'
                    '&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}'
                    '&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}')
        count = 9
        citation_links = list()
        for i in range(1, int(citation_num) // count):
            ajax_url = base_url.format(id=profile_id, first=i * (count + 1),
                                       count=count + 1, ig=ig, num=i, rt='2')
            while True:
                try:
                    response = session.get(ajax_url, headers=headers)
                    response.raise_for_status()
                    response.encoding = 'utf-8'
                    break
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    time.sleep(3.0)
                    print(e)
            html = etree.HTML(response.text)
            citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))
        print('number of citation_links', len(citation_links),
              'citation_num', citation_num)
        if len(citation_links) >= 0:
            for i, citation_link in enumerate(citation_links):
                profile_id = get_profile_id(citation_link)
                if profile_id.get('title', False):
                    paper.citations.append(profile_id)
                print('get_profile_id: {}/{}\r'.format(i + 1, len(citation_links)), end='')
            print('\nnumber of ids:', len(paper.citations))
    except KeyboardInterrupt:
        raise
    except Exception as e:
        print(e)
    paper.save()
    # for profile_id in paper.citations:
    #     get_references_citations_by_id(profile_id)
    return paper.citations

class TEIExtractor:
    def __init__(self, file, test_tsv=None):
        self.file = file
        self.uni_rank = ReadPickle('uni_rank.pickle')
        self.sjr = ReadPickle('journal_dictionary.pkl')
        self.document = test_tsv
        self.paper = Paper()
        with open(file, 'rb') as tei:
            self.soup = BeautifulSoup(tei, features="lxml")

    # TODO: return paper | redesign extractor to make it more modular to test
    # individual components
    def get_self_citations(self):
        # DOI
        doi = self.soup.teiheader.find("idno", type="DOI")
        if doi:
            self.paper.doi = elem_to_text(doi)
        elif self.document:
            self.paper.doi = self.document['doi']
        # Title
        title = self.soup.teiheader.find("title")
        if title:
            self.paper.title = elem_to_text(title)
        # Authors
        authors = self.get_authors(self.soup.analytic.find_all('author'))
        if authors:
            self.paper.authors = authors
        # Citations
        bibliography = self.soup.listbibl.find_all('biblstruct')
        for bibl in bibliography:
            citation = Citation()
            cited_paper = bibl.analytic
            if cited_paper:
                citation.title = elem_to_text(cited_paper.find("title", type="main"))
                citation_authors = self.get_authors(cited_paper.find_all("author"))
                citation.doi = elem_to_text(cited_paper.find("idno", type="DOI"))
                if citation_authors:
                    citation.authors = citation_authors
            cited_journal = bibl.monogr
            if cited_journal:
                citation.source = elem_to_text(cited_journal.find("title"))
                try:
                    citation.publish_year = cited_journal.imprint.date['when']
                except TypeError:
                    pass
            self.paper.citations.append(citation)
        self.paper.set_self_citations()
        return {'doi': self.paper.doi,
                'title': self.paper.title,
                'total_citations': len(self.paper.citations),
                'self_citations': self.paper.self_citations}

    def extract_paper_info(self):
        # DOI
        doi = self.soup.teiheader.find("idno", type="DOI")
        if doi:
            self.paper.doi = elem_to_text(doi)
        elif self.document:
            self.paper.doi = self.document['doi']
        # Title
        title = self.soup.teiheader.find("title")
        if title:
            self.paper.title = elem_to_text(title)
        # Authors
        authors = self.get_authors(self.soup.analytic.find_all('author'))
        if authors:
            self.paper.authors = authors
        # Year
        published = self.soup.analytic.find("publicationstmt")
        if published:
            self.paper.year = elem_to_text(published.find("date", type="when"))
        # Organization / Affiliations
        affiliations = self.soup.analytic.find_all('affiliation')
        for affiliation in affiliations:
            org = Organization()
            org.type = "institution"
            org.name = elem_to_text(affiliation.find("orgname", type="institution"))
            address = Address()
            addr = affiliation.find("address")
            if addr:
                address.place = elem_to_text(addr.find("settlement"))
                address.region = elem_to_text(addr.find("region"))
                address.country = elem_to_text(addr.find("country"))
            org.address = address
            self.paper.affiliations.append(org)
        # University ranking
        if self.paper.affiliations:
            if self.paper.affiliations[0] != '':
                self.paper.uni_rank = self.uni_rank.get_rank(self.paper.affiliations[0].name)
            elif len(self.paper.affiliations) > 1:
                self.paper.uni_rank = self.uni_rank.get_rank(self.paper.affiliations[1].name)
            else:
                self.paper.uni_rank = self.uni_rank.get_rank('Random')
        # Citations
        bibliography = self.soup.listbibl.find_all('biblstruct')
        for bibl in bibliography:
            citation = Citation()
            cited_paper = bibl.analytic
            if cited_paper:
                citation.title = elem_to_text(cited_paper.find("title", type="main"))
                citation_authors = self.get_authors(cited_paper.find_all("author"))
                citation.doi = elem_to_text(cited_paper.find("idno", type="DOI"))
                if citation_authors:
                    citation.authors = citation_authors
            cited_journal = bibl.monogr
            if cited_journal:
                citation.source = elem_to_text(cited_journal.find("title"))
                try:
                    citation.publish_year = cited_journal.imprint.date['when']
                except TypeError:
                    pass
            self.paper.citations.append(citation)
        # NER - Ack pairs - funding status
        self.paper.ack_pairs = self.get_funding_status()
        er_list = [org for (entity, org) in self.paper.ack_pairs]
        if 'ORG' in er_list:
            self.paper.funded = 1
        else:
            self.paper.funded = 0
        # SJR
        api_resp = self.get_sjr(self.paper.doi, self.paper.title)
        if api_resp:
            self.paper.cited_by_count = api_resp["num_citations"]
            self.paper.sjr = api_resp["sjr"]
            self.paper.subject = api_resp["subject"]
            self.paper.subject_code = api_resp["subject_code"]
            self.paper.normalized = api_resp["normalized_citations"]
            self.paper.velocity = api_resp["citationVelocity"]
            self.paper.influentialcitations = api_resp["influentialCitationCount"]
            self.paper.references = api_resp["references_count"]
            self.paper.flag = api_resp["openaccessflag"]
            self.paper.influentialref = api_resp["influentialReferencesCount"]
            self.paper.ref_background = api_resp["reference_background"]
            self.paper.ref_result = api_resp["reference_result"]
            self.paper.ref_method = api_resp["reference_methodology"]
            self.paper.cite_background = api_resp["citations_background"]
            self.paper.cite_result = api_resp["citations_result"]
            self.paper.cite_method = api_resp["citations_methodology"]
            self.paper.cite_next = api_resp["citations_next"]
        # Set self-citations
        self.paper.self_citations = self.paper.set_self_citations()
        # Set influential methodology references
        self.paper.influential_references_methodology = \
            self.set_influential_references_methodology()
        # return paper
        t2, t3 = coCite(self.paper.doi)
        return {
            "doi": self.paper.doi,
            "title": self.paper.title,
            "num_citations": self.paper.cited_by_count,
            "author_count": len(self.paper.authors),
            "sjr": self.paper.sjr,
            "u_rank": self.paper.uni_rank,
            "funded": self.paper.funded,
            "self_citations": self.paper.self_citations,
            "subject": self.paper.subject,
            "subject_code": self.paper.subject_code,
            "citationVelocity": self.paper.velocity,
            "influentialCitationCount": self.paper.influentialcitations,
            "references_count": self.paper.references,
            "openaccessflag": self.paper.flag,
            "influentialReferencesCount": self.paper.influentialref,
            "normalized_citations": self.paper.normalized,
            "reference_background": self.paper.ref_background,
            "reference_result": self.paper.ref_result,
            "reference_methodology": self.paper.ref_method,
            "citations_background": self.paper.cite_background,
            "citations_result": self.paper.cite_result,
            "citations_methodology": self.paper.cite_method,
            "citations_next": self.paper.cite_next,
            "upstream_influential_methodology_count":
                self.paper.influential_references_methodology,
            "coCite2": t2,
            "coCite3": t3,
        }

    @staticmethod
    def get_authors(authors):
        authors_list = []
        for author in authors:
            person = Author()
            pers_name = author.persname
            if not pers_name:
                continue
            person.first_name = elem_to_text(pers_name.find("forename", type="first"))
            person.middle_name = elem_to_text(pers_name.find("forename", type="middle"))
            person.surname = elem_to_text(pers_name.surname)
            person.set_name()
            if not any(auth.name == person.name for auth in authors_list):
                authors_list.append(person)
        return authors_list

    def get_funding_status(self):
        pairs = NER(XML2ack(self.file))
        return pairs

    @staticmethod
    def get_sjr(doi, title):
        api = getapi(doi, title)
        if api.empty:
            return None
        # Each field falls back to a default when the API response lacks it
        # (bare excepts narrowed to Exception; KeyError kept where it was).
        try:
            cited_by = api['num_citations'][0]
        except KeyError:
            cited_by = 0
        try:
            normalized = api['normalized_citations'][0]
        except Exception:
            normalized = 0.0
        try:
            velocity = api['citationVelocity'][0]
        except Exception:
            velocity = 0
        try:
            influentialcitations = api['influentialCitationCount'][0]
        except Exception:
            influentialcitations = 0
        try:
            references = api['references_count'][0]
        except Exception:
            references = 0
        try:
            sjr_score = api['SJR'][0]
        except KeyError:
            sjr_score = 0
        try:
            subject = api['subject'][0]
        except Exception:
            subject = 0
        try:
            subject_code = api['subject_code'][0]
        except Exception:
            subject_code = 900
        try:
            flag = api['openaccessflag'][0]
        except Exception:
            flag = 0
        try:
            influentialref = api['influentialReferencesCount'][0]
        except Exception:
            influentialref = 0
        try:
            ref_background = api['reference_background'][0]
        except Exception:
            ref_background = 0
        try:
            ref_result = api['reference_result'][0]
        except Exception:
            ref_result = 0
        try:
            ref_method = api['reference_methodology'][0]
        except Exception:
            ref_method = 0
        try:
            cite_background = api['citations_background'][0]
        except Exception:
            cite_background = 0
        try:
            cite_result = api['citations_result'][0]
        except Exception:
            cite_result = 0
        try:
            cite_method = api['citations_methodology'][0]
        except Exception:
            cite_method = 0
        try:
            cite_next = api['citation_next'][0]
        except Exception:
            cite_next = 0
        return {
            "sjr": sjr_score,
            "num_citations": cited_by,
            "subject": subject,
            "subject_code": subject_code,
            "normalized_citations": normalized,
            "citationVelocity": velocity,
            "influentialCitationCount": influentialcitations,
            "references_count": references,
            "openaccessflag": flag,
            "influentialReferencesCount": influentialref,
            "reference_background": ref_background,
            "reference_result": ref_result,
            "reference_methodology": ref_method,
            "citations_background": cite_background,
            "citations_result": cite_result,
            "citations_methodology": cite_method,
            "citations_next": cite_next,
        }

    def set_influential_references_methodology(self):
        # Counts the number of influential references in the paper in the
        # context of methodology
        count = 0
        if self.paper.doi:
            url = 'https://partner.semanticscholar.org/v1/paper/{0}'.format(self.paper.doi)
            headers = {'x-api-key': 'I6SO5Ckndk67RitJNJOFR4d7jDiVpWOgaMFUhgkM'}
            response_payload = requests.get(url, headers=headers).json()
            try:
                references = response_payload['references']
                for reference in references:
                    try:
                        if 'methodology' in reference['intent'] and reference['isInfluential']:
                            count += 1
                    except KeyError:
                        continue
            except KeyError:
                pass
        return count

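# Hypothetical driver for TEIExtractor: the path is illustrative and is
# expected to point at a GROBID-produced TEI XML file.
extractor = TEIExtractor('papers/example.tei.xml')
features = extractor.extract_paper_info()
print(features['doi'], features['num_citations'], features['self_citations'])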