Ejemplo n.º 1
0
def insert_paper():
    """Create a Paper from the posted JSON payload unless one with the
    same DOI already exists, wiring up its authors and cited papers.

    Always echoes the request payload back as a JSON string.
    """
    if request.method == 'POST':
        payload = request.json
        existing = db.session.query(Paper).filter_by(doi=payload['doi']).first()
        if not existing:
            new_paper = Paper(year=payload['year'],
                              title=payload['title'],
                              abstract=payload['abstract'],
                              user_id=g.user.id,
                              doi=payload['doi'])
            db.session.add(new_paper)
            db.session.flush()  # assign a primary key before linking rows
            # Link every author, creating Author rows as needed.
            for author_name in payload['authors']:
                person = db.session.query(Author).filter_by(name=author_name).first()
                if not person:
                    person = Author(name=author_name)
                    db.session.add(person)
                    db.session.flush()
                person.start_owning(new_paper)
            db.session.commit()
            # Link every cited DOI, creating stub Paper rows as needed.
            for ref_doi in payload['doi_refs']:
                cited = db.session.query(Paper).filter_by(doi=ref_doi).first()
                if not cited:
                    cited = Paper(doi=ref_doi)
                    db.session.add(cited)
                    db.session.flush()
                new_paper.start_referencing(cited)
            db.session.commit()
    return json.dumps(dict(data=request.json))
Ejemplo n.º 2
0
def add(request):
    """Add action: show the new-paper form on GET, create the paper on POST."""

    form = None

    if request.method == "POST":
        form = NewPaperForm(request.POST)
        if form.is_valid():
            tags = form.cleaned_data['tags']
            # A raw comma-separated string becomes a list of tag names.
            if hasattr(tags, 'split'):
                form.cleaned_data['tags'] = tags.split(",")
            paper = Paper(**form.cleaned_data)
            paper.put()
            return HttpResponseRedirect(paper.permalink())
    elif request.method == "GET":
        form = NewPaperForm()

    return render_to_response('add_paper.html',
                              {"form": form,
                               "dest_url": reverse(__name__ + '.add')})
Ejemplo n.º 3
0
def dbSavePapersAndAuthors(papers, latestMailing=True):
	"""Saves an array of paper information into the database.  Returns numbers of new papers and authors added.  
	
	If the latestMailing argument is true, then sets the paper dates to either today or tomorrow, 
	regardless of the date from the arXiv.  It sets to today if the function is run before 8pm ET, and to 
	tomorrow otherwise.  The idea is that this function should be run regularly every day, the night that the 
	mailing goes out.  If run late in the day before midnight, then the mailing has tomorrow's date.  If run 
	early in the day, e.g., if for some reason it didn't run when it should have, then the mailing was sent out 
	yesterday and is for today.  
	"""
	# Compute the official mailing date: today before 8pm US/Eastern, tomorrow after.
	if latestMailing: 
		latestMailingDate = datetime.date.today()
		now = datetime.datetime.now(pytz.timezone('US/Eastern'))
		cutoff = now.replace(hour=20,minute=0,second=0,microsecond=0)
		if now > cutoff: 
			latestMailingDate += datetime.timedelta(days=+1)	# note: The official mailing date is the day the email goes out, a few hours after the paper was made available
	numNewPapersAdded = numNewAuthorsAdded = 0
	for paper in papers: 
		# Resolve (or create) an Author row for each author name on the paper.
		authors = []
		for author in paper['authors']: 
			authorsWithSameName = Author.objects.filter(name=author)
			if authorsWithSameName: 		# author with same name already exists in database---don't add a duplicate
				a = authorsWithSameName[0]	# there might be duplicates --- take the first (maybe fix later)
			else: 
				a = Author(name=author)
				a.save()
				numNewAuthorsAdded += 1
			authors.append(a)
		# NOTE(review): authors are created above even when the paper is skipped
		# below, so re-running on a duplicate paper can still add author rows.
		if Paper.objects.filter(arxivId=paper['arxivId']): continue		# NOTE: If we make a mistake adding the paper the first time, this line will keep the code below from ever running to fix it
		if latestMailing: 
			mailing_date = latestMailingDate
		else: 
			mailing_date = mailingDate(paper['datePublished'])
		p = Paper(
			arxivId = paper['arxivId'],
			title = paper['title'],
			abstract = paper['abstract'],
			date_published = paper['datePublished'],
			date_mailed = mailing_date,
			#authors = authors, 	# ManyToManyField is set up later
			category = paper['category'],
			categories = paper['categories'],
			version = paper['version'],
			linkAbsPage = paper['linkAbsPage'],
			linkPdf = paper['linkPdf']
		)
		p.save()	# need to save before setting up the ManyToMany field of authors
		for author in authors: 	# alternatively, to clear a ManyToMany field, use p.authors.clear()
			p.authors.add(author)
		p.save()
		numNewPapersAdded += 1
	print "%d new papers, %d new authors added" % (numNewPapersAdded, numNewAuthorsAdded)
	return numNewPapersAdded, numNewAuthorsAdded
Ejemplo n.º 4
0
def papers_view(request):
    """Render the list of all papers, masking authors and titles the
    current user is not allowed to see.

    Returns a (template_name, context) pair consumed by the framework.
    """
    user = UserProfile.objects.get(username=request.user.username)

    papers = Paper.objects.all()
    paper_data = []

    for paper in papers:
        # Apply policy to paper author.
        if not paper.policy_paperlabel(user):
            paper.author = Paper.jeeves_get_private_author(paper)

        paper_versions = PaperVersion.objects.filter(paper=paper).order_by('-time').all()
        # BUG FIX: the original read paper_versions[0] unconditionally in the
        # label check, which raised IndexError for papers with no versions.
        # Guard the whole lookup on the list being non-empty.
        if len(paper_versions) > 0:
            latest_version_title = paper_versions[0].title
            # Make sure we're actually allowed to see the paper.
            if not paper_versions[0].jeeves_restrict_paperversionlabel(user):
                latest_version_title = PaperVersion.jeeves_get_private_title(
                    paper_versions[0])
        else:
            latest_version_title = None

        paper_data.append({
            'paper' : paper,
            'latest' : latest_version_title
        })

    return ("papers.html", {
        'papers' : papers
      , 'which_page' : "home"
      , 'paper_data' : paper_data
      , 'name' : user.name
    })
Ejemplo n.º 5
0
def index(request):
    """Front page: the 30 highest-scoring of the latest 1000 papers."""
    query = Paper.all()
    query.order('-date')
    fetched = query.fetch(1000)
    # Sort by descending score (Python 2 cmp-style comparator).
    fetched.sort(lambda a, b: b.score() - a.score())
    return render_to_response('papers.html', {"papers": fetched[:30]})
Ejemplo n.º 6
0
def AbstractPool(request, review_name_slug):
    """Search/triage view for a review's abstract pool.

    POST without 'results': run a fresh search and drop papers already in
    the review.  POST with 'results': continue triaging a previous result
    set, persisting the relevance judgement the user just made.
    """
    review = Review.objects.get(user=request.user, slug=review_name_slug)
    if request.method == "POST":
        if request.POST.get('results') == None:
            q = request.POST.get('queryField')
            s = request.POST.get('sortType')
            n = request.POST.get('noResults')
            abstractList = search.main(q,s, n)
            # BUG FIX: the original called abstractList.remove() while
            # iterating abstractList, which skips the element immediately
            # after every removal.  Build a filtered list instead.
            abstractList = [
                document for document in abstractList
                if not Paper.objects.filter(paper_url=document.get("url"),
                                            review=review).exists()
            ]
        else:
            # SECURITY NOTE(review): eval() of client-posted data executes
            # arbitrary code -- should be json.loads/ast.literal_eval upstream.
            abstractList = eval(request.POST.get('results'))
            q = request.POST.get('queryField')
        relevant="Unchecked"
        if request.POST.get("relevanceField") == "relevant":
            relevant="Relevant"
        else:
            if request.POST.get("relevanceField") == "irrelevant":
                relevant="Not Relevant"
        if relevant!="Unchecked":
            print("traceA")
            compareCount_value = int(request.POST.get("hiddenCompareCount"))
            # Persist the judged abstract as a Paper of this review.
            for s in abstractList:
                if s.get('compareCount') == compareCount_value:
                        currentDoc = s
                        paper = Paper(review=review, title=currentDoc["title"], paper_url=currentDoc["url"], full_text=currentDoc['fullText'], abstract=currentDoc["abstract"], authors=currentDoc["author"], abstract_relevance=relevant)
                        paper.save()
            if len(abstractList)>1:
                # Renumber the remaining abstracts, then drop the judged one.
                for abstract in abstractList:
                    if int(abstract.get('compareCount')) > compareCount_value-1:
                        abstract['compareCount'] -= 1
                del abstractList[compareCount_value-1]
            else:
                abstractList = []
        return render(request, 'ultimatereview/AbstractPool.html', {"Abstracts": abstractList, 'query': q, 'review':review.title,'slug': review_name_slug})
Ejemplo n.º 7
0
def view(request, id):
    """Handler for an individual paper page."""
    paper = Paper.get_by_id(int(id))
    context = {
        "paper": paper,
        "paper_date": util.as_time_ago(paper.date),
        "paper_html_description": util.plaintext2html(paper.description),
    }
    return render_to_response('paper.html', context)
Ejemplo n.º 8
0
def _paperAdd(requestData, user):
    '''
        Concrete handling for creating a new questionnaire (Paper) from
        request data.  Returns a packaged success/error result dict.
    '''
    # Collect a value for every editable field of the Paper model.
    keys = requestData.keys()
    data = {}
    for field in getModelFields(Paper):
        # Skip fields the framework creates automatically.
        if field.auto_created:
            continue
        # Read the raw value from the request data.
        value = requestData.get(field.name, None)

        # Special-case JSON-style boolean values.
        if type(field) == BooleanField:
            value = jsonBoolean2Python(value)

        # Creator/modifier fields are always forced to the current user.
        if field.name in [USER_CREATE_BY_FIELD_NAME, USER_MODIFY_BY_FIELD_NAME]:
            value = user
        # If the caller did not explicitly send the field, skip it so the
        # model's default value applies.  This check must not run before
        # the createBy/modifyBy handling above.
        if value is None and field.name not in keys:
            continue
        # Record the value for the model constructor.
        data[field.name] = value
    paper = Paper(**data)

    # Validate the assembled model instance.
    try:
        paper.full_clean()
    except ValidationError as exception:
        return packageResult(
            RESULT_CODE.ERROR, RESULT_MESSAGE.VALIDATION_ERROR, {'validationMessage': exception.message_dict})
    # Persist to the database.
    paper.save()
    return packageResult(RESULT_CODE.SUCCESS, RESULT_MESSAGE.SUCCESS, {'paperId': paper.id})
Ejemplo n.º 9
0
def upload(request):
    """Accept a research-paper upload.

    Expects a POST with the file in request.FILES['ups']; stores it as a
    Paper owned by the cookie-identified user.
    """
    if request.method == 'POST':
        uploaded = request.FILES['ups']
        paper_title = uploaded.name
        print(uploaded)
        print(uploaded.size)
        username = get_user_fromcookie(request)
        # The original also built an UploadForm that was never validated or
        # saved; the model instance is persisted directly instead.
        obj = Paper(title=paper_title, paper_file=uploaded, user=username)
        obj.save()
        return HttpResponse('Research Paper uploaded')
    # BUG FIX: the original fell off the end for non-POST requests and
    # returned None, which Django rejects at runtime.
    return HttpResponse('Only POST is supported', status=405)
Ejemplo n.º 10
0
def upload(request):
    """Handle a research-paper upload: save the file as a Paper, attach
    topic tags, and run the converter on it.

    Expects a POST with the file in request.FILES['ups'] and a
    comma-separated topic list in request.POST['tags'].
    """
    if request.method =='POST':
        t = []
        #form = UploaderForm(request.POST,request.FILES)
        #if form.is_valid():
        paper_title=request.FILES['ups'].name
        uploaded=request.FILES['ups']
        tags = request.POST['tags']
        t = tags.split(',')
        print t
        p = []
        
        # Resolve each tag name to a Topic id.
        # NOTE(review): range(len(t)-1) skips the last split element --
        # presumably the tags string ends with a trailing comma; confirm,
        # otherwise the final tag is silently dropped.
        for i in range(len(t)-1):
            l = Topic.objects.get(subject=t[i].encode('ascii','replace'))
            p.append(l.id)
        #print uploaded
        print p
        #print uploaded.size
        username = get_user_fromcookie(request)        
        data = {'title': paper_title,'paper_file':uploaded}
        form = UploadForm(data)
        obj = Paper(title=paper_title,paper_file=uploaded,user=username)
        obj.save()
        # Re-fetch by title to attach the collected tag ids.
        pap = Paper.objects.get(title=paper_title)
        for i in p:
            pap.tags.add(int(i))
        convert(paper_title)
        #form.save()
        '''
        if form.is_valid():
            form.save()
        else:
            print form.errors
        '''
        #return HttpResponse("errros")
        return render(request,'thanks.html',{'message': 'Research Paper Uploaded'})
Ejemplo n.º 11
0
Archivo: tests.py Proyecto: xmduhan/qi
def createTestPaper():
    """Create six fixture papers for testUser: three marked finished
    (paper0..paper2) and three marked unfinished (paper4..paper6)."""
    def _make(idx, state):
        # One Paper plus the UserPaper row linking it to testUser.
        label = "paper" + str(idx)
        paper = Paper(name=label, description=label + "...", state='A')
        paper.save()
        userpaper = UserPaper(user=testUser, paper=paper, finish_state=state)
        userpaper.save()

    for idx in range(3):
        _make(idx, "finished")
    for idx in range(4, 7):
        _make(idx, "unfinished")
Ejemplo n.º 12
0
    def save(self, user):
        """Create a Paper from this form's cleaned data: authors, the
        first PaperVersion, and conflict ReviewAssignments.

        Returns the saved Paper.
        """
        d = self.cleaned_data

        # BUG FIX (dead code removed): the original built an `authors` list
        # from d['coauthor1'..'coauthor3'] but never used it -- the actual
        # coauthor list comes from d['coauthors'] below.
        paper = Paper()
        paper.save()

        paper.authors.add(user)
        for coauthor in d['coauthors']:
            paper.authors.add(coauthor)
        paper.save()

        # Give the uploaded file an unguessable random name.
        d['contents'].name = '%030x' % random.randrange(16**30) + ".pdf"

        paper_version = PaperVersion(
            paper = paper,
            title = d['title'],
            abstract = d['abstract'],
            contents = d['contents'],
        )
        paper_version.save()

        # need to save paper twice since paper and paper_version point to each other...
        paper.latest_version = paper_version
        paper.save()

        # One 'conflict' assignment per declared conflicting user.
        for conflict_username in d['conflicts']:
            ra = ReviewAssignment()
            ra.user = User.objects.get(username=conflict_username)
            ra.paper = paper
            ra.type = 'conflict'
            ra.save()

        return paper
Ejemplo n.º 13
0
def __get_expanded_tuples(inner_table):
    """Expand the (authorid, paperid) pairs produced by *inner_table* into
    Expanded tuples with author, paperauthor, paper, conference and journal
    joined in; absent joins become None."""
    sql = """
with ids as ({0})
select a.id, a.name, a.affiliation,
  pa.authorid, pa.paperid, pa.name, pa.affiliation,
  p.id, p.title, p.year, p.keyword,
  c.id, c.shortname, c.fullname,
  j.id, j.shortname, j.fullname
from ids
  inner join paperauthor pa on ids.authorid = pa.authorid and ids.paperid = pa.paperid
  inner join author a on pa.authorid = a.id
  inner join paper p on pa.paperid = p.id
  left join conference c on p.conferenceid = c.id
  left join journal j on p.journalid = j.id
""".format(inner_table)

    def _to_expanded(row):
        # Left-joined columns are NULL when missing, so test the id column.
        return Expanded(
            author=Author._make(row[0:3]) if row[0] else None,
            paperauthor=PaperAuthor._make(row[3:7]) if row[3] and row[4] else None,
            paper=Paper._make(row[7:11]) if row[7] else None,
            conference=Conference._make(row[11:14]) if row[11] else None,
            journal=Journal._make(row[14:17]) if row[14] else None)

    return [_to_expanded(row) for row in __execute_sql(sql)]
Ejemplo n.º 14
0
def get_portfolio_papers(portfolio_id: int):
    """Return a Paper for every share row belonging to *portfolio_id*."""
    rows = query.fetch('shares', ['id'], portfolio_id=portfolio_id)
    return [Paper(row['id']) for row in rows]
Ejemplo n.º 15
0
def new(request):
    """Render the 30 most recently dated papers."""
    recent = Paper.all()
    recent.order('-date')
    return render_to_response('papers.html', {"papers": recent.fetch(30)})
Ejemplo n.º 16
0
def update_paper():
    """Backfill Paper rows from the local oai/*.xml dumps, enriched with
    Semantic Scholar metadata.

    Skips files that fail to parse, papers already stored (by arXiv id),
    and papers unknown to Semantic Scholar.  Best-effort parses
    page/figure/table counts out of the free-text comment field.
    """
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
        # NOTE(review): 346728 looks like a resume offset from a previous
        # run -- confirm, and consider making it a parameter.
        if article is None or idx < 346728:
            idx += 1
            continue
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        # NOTE(review): the skip paths below do not increment idx, so idx
        # counts processed files only -- confirm that is intended.
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if success is False:
            logging.debug(
                "Paper not exists in semantic scholar, arvixID : %s" % arvixID)
            continue
        # -1 stands in for authors Semantic Scholar has no id for.
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta[
                    'influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            # Best-effort: pull "N pages, M figures, K tables" out of the
            # free-text comment (';' is normalized to ',' first).
            try:
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                page_prefix = segment.split(meta)[0]
                                if meta == 'page':
                                    paper.pages = int(page_prefix.strip())
                                elif meta == 'figure':
                                    paper.figures = int(page_prefix.strip())
                                elif meta == 'table':
                                    # NOTE(review): attribute is `table` while
                                    # the others are plural -- confirm field name.
                                    paper.table = int(page_prefix.strip())
                                break
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt.
                logging.debug("Error in parsing meta data")
            paper.save()
        except BaseException as e:
            logging.warning("Error in arvix id %s, error: %s" %
                            (arvixID, str(e)))
        time.sleep(0.2)
        idx += 1
Ejemplo n.º 17
0
 def test_paper(self):
     """Smoke check: a Paper can be created and its title set inside a db_session."""
     # Pony ORM: the transaction is committed when the db_session block exits.
     with db_session:
         p = Paper(paper_id='666aaaaa66')
         p.title = 'hehehehhehe'
Ejemplo n.º 18
0
    def write_db(self):
        """Persist every parsed BibTeX entry in self.entry_list as a Paper.

        Only the fields present in an entry are set; year/volume/number are
        coerced to int.
        """
        print("len of entry list " + str(len(self.entry_list)))

        # (entry key, Paper attribute, optional converter) triples.
        # BUG FIX / modernization: dict.has_key() is Python-2-only and
        # deprecated; the `in` operator works on both 2 and 3.
        field_map = [
            ("id", "id", None),
            ("type", "type", None),
            ("title", "title", None),
            ("author", "authors", None),
            ("year", "year", int),
            ("journal", "journal", None),
            ("booktitle", "book_title", None),
            ("publisher", "publisher", None),
            ("institution", "institution", None),
            ("volume", "volume", int),
            ("number", "number", int),
            ("pages", "pages", None),
            ("url", "url", None),
            ("doi", "doi", None),
            ("isbn", "isbn", None),
        ]

        for entry in self.entry_list:
            paper = Paper()
            for key, attr, convert in field_map:
                if key in entry:
                    value = entry[key]
                    setattr(paper, attr, convert(value) if convert else value)
            paper.save()
Ejemplo n.º 19
0
# Set up the SQLite-backed session for the papers database.
# echo true goes on engine
engine = create_engine('sqlite:///my_papers.db')
Session = sessionmaker(bind=engine)
session = Session()

# Create all tables declared on Base (no-op for tables that already exist).
Base.metadata.create_all(engine)

# Journals referenced by the papers below.
jchem_phys = Journal(name='J. Chem. Phys', publisher='AIP')
jcim = Journal(name='J. Chem. Inf. Model', publisher='ACS')

# Add a paper to the DB
molssi_paper = Paper(
    DOI='10.1063/1.5052551',
    paper_title=
    'Perspective: Computational chemistry software and its advancement as illustrated through three grand challenge cases for molecular science',
    journal=jchem_phys,
    publication_year=2018,
    authors=
    'Anna Krylov, Theresa L. Windus, Taylor Barnes, Eliseo Marin-Rimoldi, Jessica A. Nash, Benjamin Pritchard, Daniel G.A. Smith, Doaa Altarawy, Paul Saxe, Cecilia Clementi, T. Daniel Crawford, Robert J. Harrison, Shantenu Jha, Vijay S. Pande, Teresa Head-Gordon'
)

# Add another paper
bse_paper = Paper(
    DOI='10.1021/acs.jcim.9b00725',
    paper_title=
    'New Basis Set Exchange: An Open, Up-to-Date Resource for the Molecular Sciences Community',
    journal=jcim,
    publication_year=2019,
    authors=
    'Benjamin P. Pritchard, Doaa Altarawy, Brett Didier, Tara D. Gibson, Theresa L. Windus'
)
Ejemplo n.º 20
0
def get_paper(ticker: str, portfolio_id: int, holder_id: int) -> Paper:
    """Look up the share row matching (ticker, holder, portfolio) and
    wrap its id in a Paper.

    Returns None when no matching row exists -- callers must handle it.
    """
    # Keep the try body minimal: only the indexing can raise IndexError.
    rows = query.fetch('shares', ['id'], ticker=ticker,
                       holder_id=holder_id, portfolio_id=portfolio_id)
    try:
        return Paper(rows[0]['id'])
    except IndexError:
        # BUG FIX: was a bare `return`, hiding the implicit None result;
        # make the miss case explicit.
        return None
Ejemplo n.º 21
0
def vote(request, id):
    """Upvote the paper and redirect back to its permalink.

    NOTE(review): the vote mutates state on a GET request, which is unsafe
    under HTTP semantics (crawlers/prefetchers can vote) -- confirm whether
    this should require POST.
    """
    target = Paper.get_by_id(int(id))
    if request.method == 'GET':
        target.points = target.points + 1
        target.put()
    return HttpResponseRedirect(target.permalink())
Ejemplo n.º 22
0
def paper_view(request):
    """Render a single paper page: versions, coauthors, reviews and
    comments, with Jeeves privacy policies applied for the current user.

    Also handles the POST actions add_comment / add_review / new_version.
    Returns a (template_name, context) pair.
    """
    user = UserProfile.objects.get(username=request.user.username)

    paper = Paper.objects.get(id=request.GET.get('id', ''))
    if paper != None:
        if request.method == 'POST':
            if request.POST.get('add_comment', 'false') == 'true':
                Comment.objects.create(paper=paper, user=user,
                            contents=request.POST.get('comment', ''))

            elif request.POST.get('add_review', 'false') == 'true':
                Review.objects.create(paper=paper, reviewer=user,
                            contents=request.POST.get('review', ''),
                            score_novelty=int(request.POST.get('score_novelty', '1')),
                            score_presentation=int(request.POST.get('score_presentation', '1')),
                            score_technical=int(request.POST.get('score_technical', '1')),
                            score_confidence=int(request.POST.get('score_confidence', '1')),
                          )
            elif request.POST.get('new_version', 'false') == 'true' and user == paper.author:
                contents = request.FILES.get('contents', None)
                if contents != None and paper.author != None:
                    set_random_name(contents)
                    PaperVersion.objects.create(paper=paper,
                        title=request.POST.get('title', ''),
                        contents=contents,
                        abstract=request.POST.get('abstract', ''),
                    )

        # Keep only the versions this user is allowed to see.
        all_paper_versions = PaperVersion.objects.filter(paper=paper).order_by('time').all()
        paper_versions = []
        for paper_version in all_paper_versions:
            print(paper_version)
            if paper_version.jeeves_restrict_paperversionlabel(user):
                paper_versions.append(paper_version)

        all_coauthors = PaperCoauthor.objects.filter(paper=paper).all()
        coauthors = []
        for coauthor in all_coauthors:
            if coauthor.jeeves_restrict_papercoauthorlabel(user):
                coauthors.append(coauthor)

        # BUG FIX: the original indexed paper_versions[0] before checking the
        # list was non-empty, raising IndexError when no version is visible.
        if paper_versions:
            latest = paper_versions[0]
            if latest.jeeves_restrict_paperversionlabel(user):
                latest_abstract = latest.abstract
                latest_title = latest.title
            else:
                latest_abstract = PaperVersion.jeeves_get_private_abstract(latest)
                latest_title = PaperVersion.jeeves_get_private_title(latest)
        else:
            latest_abstract = None
            latest_title = None

        reviews = Review.objects.filter(paper=paper).order_by('-time').all()
        for review in reviews:
            if not review.jeeves_restrict_reviewlabel(user):
                review.paper = Review.jeeves_get_private_paper(review)
                # BUG FIX: was `Review.jeeves.get_private_reviewer(review)`
                # (AttributeError); every sibling accessor is named
                # jeeves_get_private_*.
                review.reviewer = Review.jeeves_get_private_reviewer(review)
                review.contents = Review.jeeves_get_private_contents(review)
                review.score_novelty = \
                    Review.jeeves_get_private_score_novelty(review)
                review.score_presentation = \
                    Review.jeeves_get_private_score_presentation(review)
                review.score_technical = \
                    Review.jeeves_get_private_score_technical(review)
                review.score_confidence = \
                    Review.jeeves_get_private_score_confidence(review)

        all_comments = Comment.objects.filter(paper=paper).order_by(
            'time').all()
        comments = []
        for comment in all_comments:
            if comment.jeeves_restrict_reviewlabel(user):
                comment.paper = Comment.jeeves_get_private_paper(comment)
                comment.user = Comment.jeeves_get_private_user(comment)
                comment.contents = Comment.jeeves_get_private_contents(comment)
                # BUG FIX: was `commends.append(comment)` -- a NameError that
                # prevented any comment from being collected.
                comments.append(comment)

        if paper.policy_paperlabel(user):
            author = paper.author
        else:
            author = Paper.jeeves_get_private_author(paper)
    else:
        paper = None
        paper_versions = []
        coauthors = []
        latest_abstract = None
        latest_title = None
        reviews = []
        comments = []
        author = None

    return ("paper.html", {
        'paper' : paper,
        'paper_versions' : paper_versions,
        'author' : author,
        'coauthors' : coauthors,
        'latest_abstract' : latest_abstract,
        'latest_title' : latest_title,
        'reviews' : reviews,
        'comments' : comments,
        'which_page' : "paper",
        'review_score_fields': [ ("Novelty", "score_novelty", 10)
                               , ("Presentation", "score_presentation", 10)
                               , ("Technical", "score_technical", 10)
                               , ("Confidence", "score_confidence", 10) ]
  })
Ejemplo n.º 23
0
    def post(self, request):
        """Create or update a Paper from the form-encoded request body.

        Creation: builds a PaperComment and a new Paper, then refreshes
        the "latest" cache key.  Update: rewrites the mutable fields,
        syncs the comment text, and records any new references.
        Responds with the detail-page URL on success, a JSON error
        message (HTTP 4xx/5xx) otherwise.
        """
        param = QueryDict(request.body)

        uuid = param.get('uuid')
        title = param.get('title')
        time = param.get('time')
        origin = param.get('origin')
        _authors = param.getlist('authors')
        link = param.get('link')
        _tags = param.getlist('tags')
        content = param.get('content')
        refer_to = param.getlist('reference')
        score = param.get('score')

        # Parse the "YYYY-MM" publication time into a date on the 1st.
        try:
            year, month = time.split('-')
            year, month = int(year), int(month)
            publish_time = datetime.date(year, month, 1)
        except Exception as e:
            # BUG FIX (applies to every handler below): traceback.format_exc()
            # takes no exception argument -- format_exc(e) passed `e` as the
            # `limit` parameter.
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '提供的日期{}有误'.format(time)}, status=500)

        # Every tag id must refer to an existing ResearchTag.
        for _tag in _tags:
            try:
                _tag = int(_tag)
                _ = ResearchTag.objects.get(research_tag_id=_tag)
            except Exception as e:
                logger.error(traceback.format_exc())
                return JsonResponse({'msg': '错误的标签{}'.format(_tag)},
                                    status=500)
        tags = ResearchTag.objects.filter(
            research_tag_id__in=[int(_t) for _t in _tags])

        # Authors may arrive as numeric ids or as names; unknown names are
        # created on the fly.
        author_ids = []
        for _author in _authors:
            if _author.isdigit():
                author_ids.append(int(_author))
            elif Author.objects.filter(name=_author).exists():
                a = Author.objects.get(name=_author).author_id
                author_ids.append(a)
            else:
                a = Author(name=_author)
                a.save()
                author_ids.append(a.author_id)

        authors = Author.objects.filter(author_id__in=author_ids)

        try:
            score = int(score)
        except Exception as e:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '错误的评分分数格式'}, status=500)

        if not Paper.objects.filter(paper_uuid=uuid).exists():
            # Creation case.
            try:
                comment = PaperComment(content=content)
                comment.save()
                paper = Paper(paper_uuid=uuid,
                              title=title,
                              publish_origin=origin,
                              publish_time=publish_time,
                              author=authors,
                              link=link,
                              tag=tags,
                              comment=comment,
                              self_score=score)
                paper.save()
                # Bump the cache token so "latest" listings refresh.
                redis.set(self.LATEST_KEY, str(uuid_gen.uuid4()))
            except Exception as e:
                logger.error(traceback.format_exc())
                return JsonResponse({'msg': '保存失败'}, status=500)
            else:
                return JsonResponse({
                    'next':
                    reverse('paperdb.detail',
                            kwargs={'paper_uuid': paper.paper_uuid})
                })

        try:
            # Edit case.
            paper = Paper.objects.get(paper_uuid=uuid)
        except Exception as e:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '错误的uuid/未找到相关论文记录'}, status=404)
        else:
            paper.title = title
            paper.publish_time = publish_time
            paper.publish_origin = origin
            paper.author = authors
            # BUG FIX: was `paper.link = paper.link`, a self-assignment that
            # silently discarded the submitted link on every edit.
            paper.link = link
            paper.tag = tags
            paper.self_score = score

            try:
                paper.save()
            except Exception as e:
                logger.error(traceback.format_exc())
                return JsonResponse({'msg': '保存失败'}, status=500)

            # Sync the comment: create it if absent, update when the text
            # differs (normalizing CRLF from the editor first).
            if paper.comment is None:
                if content != '':
                    comment = PaperComment(content=content)
                    comment.save()
                    paper.comment = comment
                    paper.save()
            elif content != paper.comment.content.replace(
                    '\r\n', '\n'):
                paper.comment.content = content
                paper.comment.save()

        # Record any not-yet-stored references to other papers.
        for refer_to_paper in Paper.objects.filter(paper_uuid__in=refer_to):
            if not Reference.objects.filter(
                    reference_src=paper,
                    reference_trg=refer_to_paper).exists():
                reference = Reference(reference_src=paper,
                                      reference_trg=refer_to_paper)
                reference.save()

        return JsonResponse({
            'next':
            reverse('paperdb.detail', kwargs={'paper_uuid': paper.paper_uuid})
        })
Ejemplo n.º 24
0
def delete_paper_index():
    """Delete all indexed papers (delegates to Paper.delete_papers)."""
    Paper.delete_papers()
Ejemplo n.º 25
0
def crawl_category(term='cs.LG'):
    """Crawl arXiv for papers in category *term* and persist them.

    Walks the global ``[start_index, end_index)`` range in batches of 500,
    skips papers already stored (only appending the new category), and for
    new papers merges arXiv data with Semantic Scholar metadata before
    saving a ``Paper`` row.  Page/figure/table counts are parsed
    best-effort from the free-text arXiv comment field.
    """
    index_iteration = 500
    logging.info("Crawling category : %s", term)
    # NOTE(review): start_index and end_index are module-level globals --
    # confirm they are initialised before this function runs.
    for index in range(start_index, end_index, index_iteration):
        logging.info("\nBatch : %d-%d" % (index, index + index_iteration))
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('Article not found in batch %d - %d' %
                            (index, index + index_iteration))
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            # Drop the version suffix: "1234.5678v2" -> "1234.5678".
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Paper already stored: just record the extra category.
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                Paper.update(category=categories).where(
                    Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper not exists in semantic scholar, arvixID : %s" %
                    arvixID)
                continue
            # Semantic Scholar sometimes omits an author id; -1 is the
            # sentinel for "unknown author".
            authorIDList = [
                int(author['authorId'])
                if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta[
                        'influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                # Best-effort parse of "N pages, M figures, K tables" from
                # the free-text comment; a None comment is treated as empty
                # (BUGFIX: previously raised TypeError into a bare except).
                comment_field = article['comment'] or ''
                try:
                    for meta in ['page', 'figure', 'table']:
                        if meta in comment_field:
                            comment = comment_field.replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    page_prefix = segment.split(meta)[0]
                                    if meta == 'page':
                                        paper.pages = int(page_prefix.strip())
                                    elif meta == 'figure':
                                        paper.figures = int(
                                            page_prefix.strip())
                                    elif meta == 'table':
                                        paper.table = int(page_prefix.strip())
                                    break
                except (TypeError, ValueError):
                    # BUGFIX: was a bare ``except`` that also swallowed
                    # KeyboardInterrupt; only parsing errors are expected.
                    logging.debug("Error in parsing meta data")
                paper.save()
            except Exception as e:
                # BUGFIX: was ``except BaseException`` which also caught
                # KeyboardInterrupt/SystemExit and made Ctrl-C ineffective.
                logging.warning("Error in arvix id %s, error: %s" %
                                (arvixID, str(e)))
            time.sleep(0.3)
Ejemplo n.º 26
0
def get_references_citations_by_id(profile_id):
    """Scrape Bing Academic for a paper profile and its citing papers.

    Accepts either a profile id string or a dict holding one under the
    ``profile_id`` key.  Returns the list of citing-profile descriptors
    (possibly empty), ``[]`` when the id was already crawled into MongoDB,
    or ``-1`` when no id could be determined.
    """
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
        if MONGO:
            if data_collection.find({"id": profile_id}).count() > 0:
                # Already crawled into MongoDB -- nothing to do.
                return []
    print('func2')
    if not profile_id:
        return -1
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    # Retry the profile page until it loads (Ctrl-C still aborts).
    while True:
        try:
            response = session.get(
                'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
                .format(profile_id),
                headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            break
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except Exception as e:
            time.sleep(3.0)
            print(e)
    # BUGFIX: ``ig`` and ``citation_num`` were referenced later without a
    # fallback, raising NameError whenever the page layout changed; default
    # to harmless empty values instead.
    ig = ''
    citation_num = '0'
    result = re.search(r'IG:"(.*?)"', response.text)
    if result:
        ig = result.group(1)
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        response.text)
    if result:
        citation_num = result.group(1)

    html = etree.HTML(response.text)

    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
            response.text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        result = re.search(
            r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
            response.text)
        if result:
            paper.publish_year = result.group(1)

        base_url = 'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'

        # Page through the "cited by" list, 10 entries per AJAX request.
        count = 9
        citation_links = list()
        for i in range(1, int(citation_num) // count):
            ajax_url = base_url.format(id=profile_id,
                                       first=i * (count + 1),
                                       count=count + 1,
                                       ig=ig,
                                       num=i,
                                       rt='2')
            while True:
                try:
                    response = session.get(ajax_url, headers=headers)
                    response.raise_for_status()
                    response.encoding = 'utf-8'
                    break
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except Exception as e:
                    time.sleep(3.0)
                    print(e)
            html = etree.HTML(response.text)
            citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))
        print('number of citation_links', len(citation_links), 'citation_num',
              citation_num)
        # BUGFIX: original guarded with ``len(citation_links) >= 0`` which
        # is always true; just iterate (a no-op when the list is empty).
        for i, citation_link in enumerate(citation_links):
            profile_id = get_profile_id(citation_link)
            if profile_id.get('title', False):
                paper.citations.append(profile_id)
            print('get_profile_id: {}/{}\r'.format(i + 1,
                                                   len(citation_links)),
                  end='')
        print('\nnumber of ids:', len(paper.citations))
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except Exception as e:
        print(e)
    paper.save()
    return paper.citations
Ejemplo n.º 27
0
class TEIExtractor:
    def __init__(self, file, test_tsv=None):
        """Load a TEI XML file and prepare an empty Paper to populate.

        ``test_tsv`` is an optional external record used as a fallback
        source for fields (e.g. DOI) missing from the TEI header.
        """
        self.file = file
        self.document = test_tsv
        # Pre-pickled lookup tables: university rankings and the SJR
        # journal dictionary.
        self.uni_rank = ReadPickle('uni_rank.pickle')
        self.sjr = ReadPickle('journal_dictionary.pkl')
        self.paper = Paper()
        with open(file, 'rb') as tei:
            self.soup = BeautifulSoup(tei, features="lxml")

    # TODO: return paper | redesign extractor to make it more modular to test individual components

    def get_self_citations(self):
        """Extract header metadata and the bibliography, then report how
        many of the paper's citations are self-citations.

        Returns a dict with the paper's doi, title, total citation count
        and self-citation count.
        """
        header = self.soup.teiheader
        # DOI: prefer the TEI header, fall back to the external record.
        doi_elem = header.find("idno", type="DOI")
        if doi_elem:
            self.paper.doi = elem_to_text(doi_elem)
        elif self.document:
            self.paper.doi = self.document['doi']
        # Title
        title_elem = header.find("title")
        if title_elem:
            self.paper.title = elem_to_text(title_elem)
        # Authors of the paper itself.
        paper_authors = self.get_authors(self.soup.analytic.find_all('author'))
        if paper_authors:
            self.paper.authors = paper_authors
        # Walk the bibliography, building one Citation per entry.
        for entry in self.soup.listbibl.find_all('biblstruct'):
            citation = Citation()
            analytic = entry.analytic
            if analytic:
                citation.title = elem_to_text(
                    analytic.find("title", type="main"))
                entry_authors = self.get_authors(analytic.find_all("author"))
                citation.doi = elem_to_text(analytic.find("idno", type="DOI"))
                if entry_authors:
                    citation.authors = entry_authors
            journal = entry.monogr
            if journal:
                citation.source = elem_to_text(journal.find("title"))
                try:
                    citation.publish_year = journal.imprint.date['when']
                except TypeError:
                    # Imprint date element missing; leave year unset.
                    pass
            self.paper.citations.append(citation)
        self.paper.set_self_citations()
        return {
            'doi': self.paper.doi,
            'title': self.paper.title,
            'total_citations': len(self.paper.citations),
            'self_citations': self.paper.self_citations
        }

    def extract_paper_info(self):
        """Build ``self.paper`` from the TEI document and external APIs.

        Extracts header metadata (DOI, title, authors, year, affiliations),
        the bibliography, funding entities, and Semantic Scholar / SJR
        metrics, then returns a flat feature dictionary.
        """
        # DOI: prefer the TEI header, fall back to the external record.
        doi = self.soup.teiheader.find("idno", type="DOI")
        if doi:
            self.paper.doi = elem_to_text(doi)
        elif self.document:
            self.paper.doi = self.document['doi']
        # Title
        title = self.soup.teiheader.find("title")
        if title:
            self.paper.title = elem_to_text(title)
        # Authors
        authors = self.get_authors(self.soup.analytic.find_all('author'))
        if authors:
            self.paper.authors = authors
        # Publication year
        published = self.soup.analytic.find("publicationstmt")
        if published:
            self.paper.year = elem_to_text(published.find("date", type="when"))
        # Organization / affiliations
        affiliations = self.soup.analytic.find_all('affiliation')
        for affiliation in affiliations:
            org = Organization()
            org.type = "institution"
            org.name = elem_to_text(
                affiliation.find("orgname", type="institution"))
            address = Address()
            addr = affiliation.find("address")
            if addr:
                address.place = elem_to_text(addr.find("settlement"))
                address.region = elem_to_text(addr.find("region"))
                address.country = elem_to_text(addr.find("country"))
            org.address = address
            self.paper.affiliations.append(org)
        # University ranking: rank the first affiliation with a usable name.
        if self.paper.affiliations:
            # BUGFIX: the original compared the Organization object itself
            # against '' (always unequal, making the elif unreachable); the
            # intent -- mirrored by the elif branch -- is to test the name.
            if self.paper.affiliations[0].name != '':
                self.paper.uni_rank = self.uni_rank.get_rank(
                    self.paper.affiliations[0].name)
            elif len(self.paper.affiliations) > 1:
                self.paper.uni_rank = self.uni_rank.get_rank(
                    self.paper.affiliations[1].name)
            else:
                # No usable affiliation name: fall back like the
                # no-affiliation case so uni_rank is always set.
                self.paper.uni_rank = self.uni_rank.get_rank('Random')
        else:
            self.paper.uni_rank = self.uni_rank.get_rank('Random')
        # Citations from the bibliography.
        bibliography = self.soup.listbibl.find_all('biblstruct')
        for bibl in bibliography:
            citation = Citation()
            cited_paper = bibl.analytic
            if cited_paper:
                citation.title = elem_to_text(
                    cited_paper.find("title", type="main"))
                citation_authors = self.get_authors(
                    cited_paper.find_all("author"))
                citation.doi = elem_to_text(
                    cited_paper.find("idno", type="DOI"))
                if citation_authors:
                    citation.authors = citation_authors
            cited_journal = bibl.monogr
            if cited_journal:
                citation.source = elem_to_text(cited_journal.find("title"))
                try:
                    citation.publish_year = cited_journal.imprint.date['when']
                except TypeError:
                    # Imprint date element missing; leave year unset.
                    pass
            self.paper.citations.append(citation)
        # NER over the acknowledgements -> (entity, label) pairs; an ORG
        # entity is taken as evidence the work was funded.
        self.paper.ack_pairs = self.get_funding_status()
        er_list = [org for (entity, org) in self.paper.ack_pairs]
        if 'ORG' in er_list:
            self.paper.funded = 1
        else:
            self.paper.funded = 0
        # SJR / Semantic Scholar metrics (None when the paper is unknown).
        api_resp = self.get_sjr(self.paper.doi, self.paper.title)
        if api_resp:
            self.paper.cited_by_count = api_resp["num_citations"]
            self.paper.sjr = api_resp["sjr"]
            self.paper.subject = api_resp["subject"]
            self.paper.subject_code = api_resp["subject_code"]
            self.paper.normalized = api_resp["normalized_citations"]
            self.paper.velocity = api_resp["citationVelocity"]
            self.paper.influentialcitations = api_resp[
                "influentialCitationCount"]
            self.paper.references = api_resp["references_count"]
            self.paper.flag = api_resp["openaccessflag"]
            self.paper.influentialref = api_resp["influentialReferencesCount"]
            self.paper.ref_background = api_resp["reference_background"]
            self.paper.ref_result = api_resp["reference_result"]
            self.paper.ref_method = api_resp["reference_methodology"]
            self.paper.cite_background = api_resp["citations_background"]
            self.paper.cite_result = api_resp["citations_result"]
            self.paper.cite_method = api_resp["citations_methodology"]
            self.paper.cite_next = api_resp["citations_next"]
        # Self-citations and influential methodology references.
        self.paper.self_citations = self.paper.set_self_citations()
        self.paper.influential_references_methodology = self.set_influential_references_methodology(
        )

        # Co-citation counts of order 2 and 3 from the coCite helper.
        t2, t3 = coCite(self.paper.doi)
        return {
            "doi": self.paper.doi,
            "title": self.paper.title,
            "num_citations": self.paper.cited_by_count,
            "author_count": len(self.paper.authors),
            "sjr": self.paper.sjr,
            "u_rank": self.paper.uni_rank,
            "funded": self.paper.funded,
            "self_citations": self.paper.self_citations,
            "subject": self.paper.subject,
            "subject_code": self.paper.subject_code,
            "citationVelocity": self.paper.velocity,
            "influentialCitationCount": self.paper.influentialcitations,
            "references_count": self.paper.references,
            "openaccessflag": self.paper.flag,
            "influentialReferencesCount": self.paper.influentialref,
            "normalized_citations": self.paper.normalized,
            "reference_background": self.paper.ref_background,
            "reference_result": self.paper.ref_result,
            "reference_methodology": self.paper.ref_method,
            "citations_background": self.paper.cite_background,
            "citations_result": self.paper.cite_result,
            "citations_methodology": self.paper.cite_method,
            "citations_next": self.paper.cite_next,
            "upstream_influential_methodology_count":
            self.paper.influential_references_methodology,
            "coCite2": t2,
            "coCite3": t3
        }

    @staticmethod
    def get_authors(authors):
        """Convert TEI ``<author>`` elements into a de-duplicated list of
        Author objects (uniqueness by full name)."""
        collected = []
        for author_elem in authors:
            person = Author()
            pers_name = author_elem.persname
            if not pers_name:
                # An entry without <persName> carries no usable name.
                continue
            person.first_name = elem_to_text(
                pers_name.find("forename", type="first"))
            person.middle_name = elem_to_text(
                pers_name.find("forename", type="middle"))
            person.surname = elem_to_text(pers_name.surname)
            person.set_name()
            # Only keep the first occurrence of each full name.
            if all(existing.name != person.name for existing in collected):
                collected.append(person)
        return collected

    def get_funding_status(self):
        """Run NER over the acknowledgements extracted from the TEI file
        and return the resulting (entity, label) pairs."""
        return NER(XML2ack(self.file))

    @staticmethod
    def get_sjr(doi, title):
        """Look up citation/SJR metrics for a paper via ``getapi``.

        Returns ``None`` when the API has no record for the paper,
        otherwise a dict of metrics with a safe default for every field
        the response is missing.
        """
        api = getapi(doi, title)
        if api.empty:
            return None

        def first_or(column, default):
            # The response behaves like a DataFrame: a missing column
            # raises KeyError and an empty column raises IndexError -- in
            # both cases fall back to the caller's default.
            # BUGFIX: replaces seventeen near-identical clauses, most of
            # them bare ``except:`` that also swallowed KeyboardInterrupt.
            try:
                return api[column][0]
            except (KeyError, IndexError):
                return default

        return {
            "sjr": first_or('SJR', 0),
            "num_citations": first_or('num_citations', 0),
            "subject": first_or('subject', 0),
            "subject_code": first_or('subject_code', 900),
            "normalized_citations": first_or('normalized_citations', 0.0),
            "citationVelocity": first_or('citationVelocity', 0),
            "influentialCitationCount": first_or('influentialCitationCount',
                                                 0),
            "references_count": first_or('references_count', 0),
            "openaccessflag": first_or('openaccessflag', 0),
            "influentialReferencesCount": first_or(
                'influentialReferencesCount', 0),
            "reference_background": first_or('reference_background', 0),
            "reference_result": first_or('reference_result', 0),
            "reference_methodology": first_or('reference_methodology', 0),
            "citations_background": first_or('citations_background', 0),
            "citations_result": first_or('citations_result', 0),
            "citations_methodology": first_or('citations_methodology', 0),
            # NOTE: the response column is singular 'citation_next' while
            # the output key is plural -- preserved from the original.
            "citations_next": first_or('citation_next', 0),
        }

    def set_influential_references_methodology(self):
        """Count references Semantic Scholar marks both as influential and
        as cited in a methodology context.

        Returns 0 when the paper has no DOI, the API call fails, or the
        response lacks the expected fields.
        """
        count = 0
        if self.paper.doi:
            url = 'https://partner.semanticscholar.org/v1/paper/{0}'.format(
                self.paper.doi)
            # SECURITY NOTE(review): the API key is hard-coded in source;
            # move it to configuration or an environment variable.
            headers = {'x-api-key': 'I6SO5Ckndk67RitJNJOFR4d7jDiVpWOgaMFUhgkM'}
            try:
                # BUGFIX: timeout added so a stalled API call cannot hang
                # extraction forever; non-JSON bodies no longer crash.
                response_payload = requests.get(
                    url, headers=headers, timeout=30).json()
            except (requests.RequestException, ValueError):
                return count
            try:
                references = response_payload['references']
                for reference in references:
                    try:
                        if 'methodology' in reference['intent'] and reference[
                                'isInfluential']:
                            count += 1
                    except KeyError:
                        # Reference entry missing 'intent'/'isInfluential'.
                        continue
            except KeyError:
                # Response had no 'references' field at all.
                pass
        return count