def write_db(self):
    """Persist every parsed BibTeX entry in ``self.entry_list`` as a Paper row.

    Keys absent from an entry are skipped; year/volume/number are coerced
    to int before assignment.  One ``Paper`` is saved per entry.
    """
    # NOTE(fix): the original used the Python-2-only `print "..."` statement
    # and `dict.has_key()`, both of which fail under Python 3 (which the rest
    # of this file requires).
    print("len of entry list " + str(len(self.entry_list)))

    # BibTeX key -> Paper attribute, for fields copied verbatim.
    str_fields = {
        "id": "id",
        "type": "type",
        "title": "title",
        "author": "authors",
        "journal": "journal",
        "booktitle": "book_title",
        "publisher": "publisher",
        "institution": "institution",
        "pages": "pages",
        "url": "url",
        "doi": "doi",
        "isbn": "isbn",
    }
    # BibTeX key -> Paper attribute, for fields converted with int().
    int_fields = {"year": "year", "volume": "volume", "number": "number"}

    for entry in self.entry_list:
        paper = Paper()
        for key, attr in str_fields.items():
            if key in entry:
                setattr(paper, attr, entry[key])
        for key, attr in int_fields.items():
            if key in entry:
                setattr(paper, attr, int(entry[key]))
        paper.save()
def add(search_query, author, title):
    """Search ADS, let the user pick one result, store it, and download its PDF.

    ``author`` and ``title``, when given, are appended to ``search_query`` as
    ADS field queries.  Exits the process when nothing is found or when the
    paper (matched by DOI) is already in the database.
    """
    # Fields requested from ADS for every hit.
    # (fix: the original listed 'id' twice.)
    fl = [
        'id', 'author', 'first_author', 'bibcode', 'year', 'title',
        'abstract', 'doi', 'pubdate', 'pub', 'keyword', 'doctype',
        'identifier', 'links_data',
    ]
    if author:
        search_query += "author:" + author
    if title:
        search_query += "title:" + title

    papers = list(ads.SearchQuery(q=search_query, fl=fl))
    if len(papers) == 0:
        # fix: the original assigned a dead class reference
        # (`selection = ads.search.Article`) before exiting.
        print("no results found")
        exit()
    elif len(papers) == 1:
        selection = papers[0]  # type: ads.search.Article
    else:
        # Show at most the first ten hits and let the user choose one.
        first_ten = papers[:10]
        single_paper: ads.search.Article
        for index, single_paper in enumerate(first_ten):
            print(index, single_paper.title[0], single_paper.first_author)
        selected_index = click.prompt('select paper', type=int)
        selection = papers[selected_index]  # type: ads.search.Article

    assert len(selection.doi) == 1
    doi = selection.doi[0]
    # Refuse to add the same DOI twice (EAFP: absence raises DoesNotExist).
    try:
        paper = Paper.get(Paper.doi == doi)
        print("this paper has already been added")
        exit(1)
    except peewee.DoesNotExist:
        pass

    print("fetching bibcode")
    q = ads.ExportQuery([selection.bibcode])
    bibtex = q.execute()

    print("saving in db")
    paper = Paper()
    assert len(selection.title) == 1
    paper.doi = doi
    paper.title = selection.title[0]
    paper.abstract = selection.abstract
    paper.bibcode = selection.bibcode
    paper.year = selection.year
    paper.pubdate = selection.pubdate
    paper.pdf_downloaded = False
    paper.first_author = Author.get_or_create(name=selection.first_author)[0]
    paper.publication = Publication.get_or_create(name=selection.pub)[0]
    paper.doctype = Doctype.get_or_create(name=selection.doctype)[0]
    # Pull the bare arXiv id out of identifiers like "arXiv:1234.56789".
    paper.arxiv_identifier = [
        ident for ident in selection.identifier if "arXiv:" in ident
    ][0].split("arXiv:")[-1]
    paper.bibtex = bibtex
    links = [json.loads(string) for string in selection.links_data]
    print(links)
    paper.save()

    authors = [Author.get_or_create(name=name)[0] for name in selection.author]
    for author in db.batch_commit(authors, 100):
        PaperAuthors.create(author=author, paper=paper)
    keywords = [
        Keyword.get_or_create(keyword=keyword)[0]
        for keyword in selection.keyword
    ]
    for keyword in db.batch_commit(keywords, 100):
        PaperKeywords.create(keyword=keyword, paper=paper)

    print("fetching PDF")
    arxiv_url = "https://arxiv.org/pdf/{id}".format(id=paper.arxiv_identifier)
    r = requests.get(arxiv_url, stream=True)
    print(arxiv_url)
    # fix: the original open() had a format string with no placeholder, so
    # every download clobbered the same file; it also sized the progress bar
    # for 1024-byte chunks while iterating 20-byte chunks, and used floor
    # division inside ceil().
    with open('library/{filename}.pdf'.format(filename=paper.id), 'wb') as f:
        chunk_size = 1024  # bytes
        file_size = int(r.headers.get('content-length', 0))
        progress_length = math.ceil(file_size / chunk_size)
        with click.progressbar(r.iter_content(chunk_size=chunk_size),
                               length=progress_length) as progress_chunks:
            for chunk in progress_chunks:
                f.write(chunk)
    paper.pdf_downloaded = True
    paper.save()
def get_references_citations_by_id(profile_id):
    """Scrape a Bing Academic profile page and collect the ids of citing papers.

    Returns ``[]`` when the paper was already crawled (Mongo hit), ``-1`` when
    no usable profile id was supplied, otherwise ``paper.citations`` — the
    list of citing profile-id dicts gathered from the paged ajax results.
    Network errors are retried forever with a 3 s back-off; only
    KeyboardInterrupt aborts.
    """
    if isinstance(profile_id, dict):
        profile_id = profile_id.get('profile_id')
    if MONGO:
        # This paper has already been crawled — nothing to do.
        if data_collection.find({"id": profile_id}).count() > 0:
            return []
    print('func2')
    if not profile_id:
        return -1

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    session = requests.Session()
    while True:
        try:
            response = session.get(
                'https://cn.bing.com/academic/profile?id={}&encoded=0&v=paper_preview&mkt=zh-cn'
                .format(profile_id),
                headers=headers)
            response.raise_for_status()
            response.encoding = 'utf-8'
            break
        except KeyboardInterrupt:
            raise
        except Exception as e:
            time.sleep(3.0)
            print(e)

    # fix: `ig` and `citation_num` were referenced unconditionally but only
    # bound when their regexes matched, causing a NameError on a miss;
    # default them so a failed match degrades gracefully.
    ig = ''
    result = re.search(r'IG:"(.*?)"', response.text)
    if result:
        ig = result.group(1)
    citation_num = '0'
    result = re.search(
        r'被 引 量</span></span><span class="aca_content"><div>(\d*)</div>',
        response.text)
    if result:
        citation_num = result.group(1)

    html = etree.HTML(response.text)
    paper = Paper(save2mongo=MONGO)
    try:
        paper.title = html.xpath('//li[@class="aca_title"]/text()')[0]
        paper.id = profile_id
        paper.citation_num = citation_num
        result = re.search(
            r'<span class="aca_label">DOI</span></span><span class="aca_content"><div>(.*?)</div>',
            response.text)
        if result:
            paper.doi = result.group(1)
        paper.authors = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span//a/text()')
        paper.abstract = html.xpath(
            '//div[@class="aca_desc b_snippet"]/span[1]//text()')[-1]
        result = re.search(
            r'<span class="aca_label">发表日期</span></span><span class="aca_content"><div>(\d*)</div>',
            response.text)
        if result:
            paper.publish_year = result.group(1)

        base_url = 'https://cn.bing.com/academic/papers?ajax=scroll&infscroll=1&id={id}&encoded=0&v=paper_preview&mkt=zh-cn&first={first}&count={count}&IG={ig}&IID=morepage.{num}&SFX={num}&rt={rt}'
        count = 9
        citation_links = list()
        # Page through the ajax citation listing; each page carries
        # `count + 1` result links.
        for i in range(1, int(citation_num) // count):
            ajax_url = base_url.format(id=profile_id,
                                       first=i * (count + 1),
                                       count=count + 1,
                                       ig=ig,
                                       num=i,
                                       rt='2')
            while True:
                try:
                    response = session.get(ajax_url, headers=headers)
                    response.raise_for_status()
                    response.encoding = 'utf-8'
                    break
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    time.sleep(3.0)
                    print(e)
            html = etree.HTML(response.text)
            citation_links.extend(html.xpath('//a[@target="_blank"]/@href'))

        print('number of citation_links', len(citation_links),
              'citation_num', citation_num)
        # fix: the original tested `len(citation_links) >= 0`, which is
        # always true; also, the loop variable shadowed the `profile_id`
        # parameter — renamed to `cited`.
        if citation_links:
            for i, citation_link in enumerate(citation_links):
                cited = get_profile_id(citation_link)
                if cited.get('title', False):
                    paper.citations.append(cited)
                print('get_profile_id: {}/{}\r'.format(i + 1,
                                                       len(citation_links)),
                      end='')
            print('\nnumber of ids:', len(paper.citations))
    except KeyboardInterrupt:
        raise
    except Exception as e:
        print(e)
    paper.save()
    return paper.citations