def test_author(self):
    """Create an Author with one linked Paper inside a single db_session."""
    with db_session:
        author = Author(author_id='3333')
        author.name = 'given'
        paper = Paper(paper_id='qqqq')
        paper.title = 'titttttttle'
        # link the paper to the author through the papers collection
        author.papers.add(paper)
def upsert_paper(info, author_id):
    """Insert a paper (plus its publisher and author links) if it is not stored yet.

    info: dict with at least 'paper_id', 'title', 'abstract', 'cite_num',
        'cited_num', 'publisher_id' (and 'publishername' when publisher_id is set).
    author_id: id of the author to link, or None to skip author linking.
    """
    # Best-effort publisher upsert: a failure here must not block the paper
    # insert, but it should be logged instead of silently swallowed
    # (the original used a bare ``except: pass``).
    try:
        with db_session:
            publisher_id = info['publisher_id']
            if publisher_id:
                publisher = Publisher.get(publisher_id=publisher_id)
                if publisher is None:
                    publisher = Publisher(publisher_id=publisher_id)
                    publisher.name = info['publishername']
    except Exception:
        logger.exception('failed to upsert publisher, publisher_id=%s',
                         info.get('publisher_id'))
    with db_session:
        p = Paper.get(paper_id=info['paper_id'])
        if p:
            logger.debug('paper has existed, paper_id=%s', info['paper_id'])
            return
        paper = Paper(paper_id=info['paper_id'])
        paper.title = info['title']
        paper.abstract = info['abstract']
        paper.cite_num = info['cite_num']
        paper.cited_num = info['cited_num']
        publisher_id = info['publisher_id']
        if publisher_id:
            publisher = Publisher.get(publisher_id=publisher_id)
            if publisher:
                paper.publisher = publisher
        if author_id is None:
            return
        a = Author.get(author_id=author_id)
        if a:
            paper.authors.add(a)
        else:
            # Author unknown locally: fetch the full profile from the API.
            a_info = api.get_author(author_id)
            author = Author(author_id=a_info['author_id'])
            author.name = a_info['name']
            author.image_url = a_info['image_url']
            author.organization = a_info['organization']
            author.home_page = a_info['home_page']
            author.paper_count = a_info['paper_count']
            # NOTE(review): ``citied_count`` looks like a typo but matches the
            # model attribute name used here — do not rename without the model.
            author.citied_count = a_info['cited_count']
            paper.authors.add(author)
def write_db(self):
    """Persist every parsed entry in ``self.entry_list`` as a ``Paper`` record.

    Copies known fields from each entry dict onto a fresh Paper, converting
    year/volume/number to int, then saves it.
    """
    # ``dict.has_key`` was removed in Python 3; ``k in d`` works in 2 and 3.
    # ``print(...)`` with a single argument is also valid in both.
    print("len of entry list " + str(len(self.entry_list)))
    # (entry key, Paper attribute, optional converter) for every known field.
    field_map = [
        ("id", "id", None),
        ("type", "type", None),
        ("title", "title", None),
        ("author", "authors", None),
        ("year", "year", int),
        ("journal", "journal", None),
        ("booktitle", "book_title", None),
        ("publisher", "publisher", None),
        ("institution", "institution", None),
        ("volume", "volume", int),
        ("number", "number", int),
        ("pages", "pages", None),
        ("url", "url", None),
        ("doi", "doi", None),
        ("isbn", "isbn", None),
    ]
    for entry in self.entry_list:
        paper = Paper()
        for key, attr, convert in field_map:
            if key in entry:
                value = entry[key]
                setattr(paper, attr, convert(value) if convert else value)
        paper.save()
def add(search_query, author, title):
    """Search ADS for a paper, let the user pick one, store it, download its PDF.

    search_query: base ADS query string; ``author:``/``title:`` terms are
        appended when the corresponding arguments are non-empty.
    Exits the process when nothing matches or the paper is already in the db.
    """
    fl = [
        'id', 'author', 'first_author', 'bibcode', 'id', 'year', 'title',
        'abstract', 'doi', 'pubdate', "pub", "keyword", "doctype",
        "identifier", "links_data"
    ]
    if author:
        search_query += "author:" + author
    if title:
        search_query += "title:" + title
    papers = list(ads.SearchQuery(q=search_query, fl=fl))
    if len(papers) == 0:
        # BUG FIX: the original assigned the ``ads.search.Article`` class to
        # ``selection`` (a dead statement) before exiting.
        exit()
    elif len(papers) == 1:
        selection = papers[0]  # type: ads.search.Article
    else:
        # Show the first ten hits and let the user choose one by index.
        first_ten = papers[:10]
        single_paper: ads.search.Article
        for index, single_paper in enumerate(first_ten):
            print(index, single_paper.title[0], single_paper.first_author)
        selected_index = click.prompt('select paper', type=int)
        selection = papers[selected_index]  # type: ads.search.Article

    assert len(selection.doi) == 1
    doi = selection.doi[0]
    # EAFP duplicate check: peewee raises DoesNotExist when the paper is new.
    try:
        paper = Paper.get(Paper.doi == doi)
        print("this paper has already been added")
        exit(1)
    except peewee.DoesNotExist:
        pass

    print("fetching bibcode")
    q = ads.ExportQuery([selection.bibcode])
    bibtex = q.execute()

    print("saving in db")
    paper = Paper()
    assert len(selection.title) == 1
    paper.doi = doi
    paper.title = selection.title[0]
    paper.abstract = selection.abstract
    paper.bibcode = selection.bibcode
    paper.year = selection.year
    paper.pubdate = selection.pubdate
    paper.pdf_downloaded = False
    paper.first_author = Author.get_or_create(name=selection.first_author)[0]
    paper.publication = Publication.get_or_create(name=selection.pub)[0]
    paper.doctype = Doctype.get_or_create(name=selection.doctype)[0]
    # Take the arXiv id out of identifiers like "arXiv:1234.5678".
    paper.arxiv_identifier = [
        ident for ident in selection.identifier if "arXiv:" in ident
    ][0].split("arXiv:")[-1]
    paper.bibtex = bibtex
    links = [json.loads(string) for string in selection.links_data]
    print(links)
    paper.save()

    authors = [Author.get_or_create(name=name)[0] for name in selection.author]
    for author in db.batch_commit(authors, 100):
        PaperAuthors.create(author=author, paper=paper)
    keywords = [
        Keyword.get_or_create(keyword=keyword)[0]
        for keyword in selection.keyword
    ]
    for keyword in db.batch_commit(keywords, 100):
        PaperKeywords.create(keyword=keyword, paper=paper)

    print("fetching PDF")
    arxiv_url = "https://arxiv.org/pdf/{id}".format(id=paper.arxiv_identifier)
    r = requests.get(arxiv_url, stream=True)
    print(arxiv_url)
    # BUG FIX: the path template had no placeholder, so every download
    # overwrote the same file regardless of ``paper.id``.
    with open('library/{filename}.pdf'.format(filename=paper.id), 'wb') as f:
        chunk_size = 1024  # bytes
        file_size = int(r.headers.get('content-length', 0))
        # BUG FIX: use true division (``//`` already floors, making the
        # ceil a no-op) and stream with the same chunk size used for the
        # progress length (the original iterated 20-byte chunks against a
        # 1024-byte estimate, so the bar ran ~50x too fast).
        progress_length = math.ceil(file_size / chunk_size)
        with click.progressbar(r.iter_content(chunk_size=chunk_size),
                               length=progress_length) as progress_chunks:
            for chunk in progress_chunks:
                f.write(chunk)
    paper.pdf_downloaded = True
    paper.save()
def post(self, request):
    """Create a new paper record, or update an existing one, from form data.

    Returns a JsonResponse with the detail-page URL on success, or an
    error message with status 404/500 on validation or save failure.
    """
    param = QueryDict(request.body)
    uuid = param.get('uuid')
    title = param.get('title')
    time = param.get('time')
    origin = param.get('origin')
    _authors = param.getlist('authors')
    link = param.get('link')
    _tags = param.getlist('tags')
    content = param.get('content')
    refer_to = param.getlist('reference')
    score = param.get('score')

    # Parse "YYYY-MM" into a date pinned to the first of the month.
    try:
        year, month = time.split('-')
        year, month = int(year), int(month)
        publish_time = datetime.date(year, month, 1)
    except Exception:
        # BUG FIX (here and below): ``format_exc()`` takes no exception
        # argument — the original passed ``e`` where the ``limit``
        # parameter goes.
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': '提供的日期{}有误'.format(time)}, status=500)

    # Every tag id must reference an existing ResearchTag.
    for _tag in _tags:
        try:
            _tag = int(_tag)
            _ = ResearchTag.objects.get(research_tag_id=_tag)
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '错误的标签{}'.format(_tag)}, status=500)
    tags = ResearchTag.objects.filter(
        research_tag_id__in=[int(_t) for _t in _tags])

    # Authors arrive either as numeric ids or as names; unknown names are
    # created on the fly.
    author_ids = []
    for _author in _authors:
        if _author.isdigit():
            author_ids.append(int(_author))
        elif Author.objects.filter(name=_author).exists():
            author_ids.append(Author.objects.get(name=_author).author_id)
        else:
            a = Author(name=_author)
            a.save()
            author_ids.append(a.author_id)
    authors = Author.objects.filter(author_id__in=author_ids)

    try:
        score = int(score)
    except Exception:
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': '错误的评分分数格式'}, status=500)

    if not Paper.objects.filter(paper_uuid=uuid).exists():
        # 新建的场合 (creating a new paper)
        try:
            comment = PaperComment(content=content)
            comment.save()
            # NOTE(review): passing the ``author``/``tag`` querysets straight
            # to the constructor assumes the model accepts direct M2M-style
            # assignment — verify against the Paper model definition.
            paper = Paper(paper_uuid=uuid, title=title, publish_origin=origin,
                          publish_time=publish_time, author=authors, link=link,
                          tag=tags, comment=comment, self_score=score)
            paper.save()
            redis.set(self.LATEST_KEY, str(uuid_gen.uuid4()))
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '保存失败'}, status=500)
        else:
            return JsonResponse({
                'next': reverse('paperdb.detail',
                                kwargs={'paper_uuid': paper.paper_uuid})
            })

    try:
        # 编辑的场合 (editing an existing paper)
        paper = Paper.objects.get(paper_uuid=uuid)
    except Exception:
        logger.error(traceback.format_exc())
        return JsonResponse({'msg': '错误的uuid/未找到相关论文记录'}, status=404)
    else:
        paper.title = title
        paper.publish_time = publish_time
        paper.publish_origin = origin
        paper.author = authors
        # BUG FIX: was ``paper.link = paper.link`` — the submitted link was
        # silently discarded on every edit.
        paper.link = link
        paper.tag = tags
        paper.self_score = score
        try:
            paper.save()
        except Exception:
            logger.error(traceback.format_exc())
            return JsonResponse({'msg': '保存失败'}, status=500)
        if paper.comment is None:
            if content != '':
                comment = PaperComment(content=content)
                comment.save()
                paper.comment = comment
                paper.save()
        elif content != paper.comment.content.replace(
                '\r\n', '\n'):  # traditional下的换行符出入
            paper.comment.content = content
            paper.comment.save()
        # Record any new references to already-known papers.
        for refer_to_paper in Paper.objects.filter(paper_uuid__in=refer_to):
            if not Reference.objects.filter(
                    reference_src=paper,
                    reference_trg=refer_to_paper).exists():
                reference = Reference(reference_src=paper,
                                      reference_trg=refer_to_paper)
                reference.save()
        return JsonResponse({
            'next': reverse('paperdb.detail',
                            kwargs={'paper_uuid': paper.paper_uuid})
        })
def get_references_citations_by_id(profile_id):
    """Scrape a Bing Academic profile page and collect ids of citing papers.

    profile_id: profile id string, or a dict carrying one under 'profile_id'.
    Returns [] when the profile was already crawled (MONGO mode), -1 when no
    id was supplied, otherwise the list accumulated in ``paper.citations``.
    """
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
    if MONGO:
        if data_collection.find({"id": profile_id}).count() > 0:
            # 说明这个数据已经被爬取过了 (this profile was already crawled)
            return []
    print('func2')
    if not profile_id:
        return -1
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    # Retry the profile page until it loads; only Ctrl-C breaks out.
    while True:
        try:
            response = session.get(
                'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
                .format(profile_id),
                headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            break
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except Exception as e:
            time.sleep(3.0)
            print(e)
    # BUG FIX: ``ig`` and ``citation_num`` were left unbound when their
    # regexes did not match, raising NameError later (masked by the broad
    # except). Default them so a failed match degrades gracefully.
    ig = ''
    result = re.search(r'IG:"(.*?)"', response.text)
    if result:
        ig = result.group(1)
    citation_num = '0'
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        response.text)
    if result:
        citation_num = result.group(1)
    html = etree.HTML(response.text)
    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
            response.text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        result = re.search(
            r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
            response.text)
        if result:
            paper.publish_year = result.group(1)
        base_url = 'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'
        count = 9
        citation_links = list()
        # Page through the "more results" AJAX endpoint.
        for i in range(1, int(citation_num) // count):
            ajax_url = base_url.format(id=profile_id, first=i * (count + 1),
                                       count=count + 1, ig=ig, num=i, rt='2')
            while True:
                try:
                    response = session.get(ajax_url, headers=headers)
                    response.raise_for_status()
                    response.encoding = 'utf-8'
                    break
                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except Exception as e:
                    time.sleep(3.0)
                    print(e)
            html = etree.HTML(response.text)
            citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))
        print('number of citation_links', len(citation_links),
              'citation_num', citation_num)
        # BUG FIX: the original guarded this loop with
        # ``if len(citation_links) >= 0`` which is always true; iterating an
        # empty list is already a no-op, so the guard is dropped.
        # Also: the loop variable no longer shadows the ``profile_id``
        # parameter.
        for i, citation_link in enumerate(citation_links):
            cited = get_profile_id(citation_link)
            if cited.get('title', False):
                paper.citations.append(cited)
            print('get_profile_id: {}/{}\r'.format(i + 1,
                                                   len(citation_links)),
                  end='')
        print('\nnumber of ids:', len(paper.citations))
    except KeyboardInterrupt:
        raise KeyboardInterrupt
    except Exception as e:
        print(e)
    paper.save()
    return paper.citations
def test_paper(self):
    """Create a bare Paper record inside a single db_session."""
    with db_session:
        paper = Paper(paper_id='666aaaaa66')
        paper.title = 'hehehehhehe'