import hashlib
from datetime import datetime
from time import sleep

# GSData, GSException, Paper and db come from the project's own modules;
# a sketch of the assumed Paper model follows below.


def run():
    gs = GSData()
    db.connect()
    with open('data.txt', encoding='utf-8') as f:
        for line in f:
            wx_name = line.split(' ')[0]
            try:
                data = gs.query(wx_name)
            except GSException as ex:
                print(str(ex))
                continue
            else:
                for item in data:
                    # Normalize the scheme to https so the hash is stable
                    # across http/https duplicates.
                    url = 'https://{0}'.format(item['url'].split('://', 1)[1])
                    md5s = hashlib.md5(url.encode('utf-8')).hexdigest()
                    if Paper.select().where(Paper.url_hash == md5s).count():
                        continue
                    print(item)
                    p = Paper.create(
                        wx_name=item['wx_name'],
                        name=item['name'],
                        title=item['title'],
                        author=item['author'],
                        content=item['content'],
                        url=url,
                        url_hash=md5s,
                        post_time=datetime.strptime(item['posttime'],
                                                    '%Y-%m-%d %H:%M:%S'),
                        add_time=datetime.strptime(item['add_time'],
                                                   '%Y-%m-%d %H:%M:%S'))
                    # The counters can be missing or non-numeric, so only
                    # store them when they are actual ints.
                    if isinstance(item['readnum_newest'], int):
                        p.read_num = item['readnum_newest']
                    if isinstance(item['likenum_newest'], int):
                        p.like_num = item['likenum_newest']
                    if item['picurl']:
                        p.pic_url = item['picurl']
                    p.save()
            sleep(3)  # throttle between queries
    db.close()
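# A minimal sketch of the peewee model and database handle that run() relies
# on. Field names are inferred from the Paper.create(...) call above; the
# SQLite backend and the field types are assumptions, not the project's
# actual schema.
from peewee import (CharField, DateTimeField, IntegerField, Model,
                    SqliteDatabase, TextField)

db = SqliteDatabase('papers.db')  # assumed backend


class Paper(Model):
    wx_name = CharField()
    name = CharField()
    title = CharField()
    author = CharField()
    content = TextField()
    url = CharField()
    url_hash = CharField(unique=True, index=True)  # MD5 of the normalized URL
    post_time = DateTimeField()
    add_time = DateTimeField()
    read_num = IntegerField(null=True)
    like_num = IntegerField(null=True)
    pic_url = CharField(null=True)

    class Meta:
        database = db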
import glob
import logging
import time

from tqdm import tqdm

# parse_xml_file, get_arvixpaper_semantic_scholar (sketched below) and this
# script's Paper model come from the project's own modules.


def update_paper():
    idx = 0
    for filename in tqdm(glob.glob("oai/*.xml")):
        article = parse_xml_file(filename)
        # Skip records that failed to parse, and fast-forward to the
        # resume point of a previous run.
        if article is None or idx < 346728:
            idx += 1
            continue
        arvixID = article['id'].split('v')[0]
        query = Paper.select().where(Paper.arvixID == arvixID)
        if query.exists():
            continue
        success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
        if success is False:
            logging.debug(
                "Paper does not exist in Semantic Scholar, arvixID: %s",
                arvixID)
            continue
        # Unknown author IDs are stored as -1.
        authorIDList = [
            int(author['authorId']) if author['authorId'] is not None else -1
            for author in article_meta['authors']
        ]
        authorNames = [article['main_author']]
        authorCount = len(article_meta['authors'])
        if authorCount > 1:
            other_author = [
                name.strip() for name in article['authors'].split(',')
                if len(name) > 1 and name != article['main_author']
            ]
            authorNames += other_author
        paper_category = [article['term']]
        try:
            paper = Paper.create(
                indexID=idx,
                arvixID=arvixID,
                paperId=article_meta['paperId'],
                doiID=str(article_meta['doi']),
                title=article['title'],
                summary=article['abstract'],
                category=paper_category,
                comments=article['comment'],
                journal_ref=article['journal_ref'],
                url=article['url'],
                authorID=authorIDList,
                authorName=authorNames,
                authorCount=authorCount,
                publishedDate=article['publish_date'],
                citationVelocity=article_meta['citationVelocity'],
                referencesCount=len(article_meta['references']),
                topics=article_meta['topics'],
                venue=str(article_meta['venue']),
                year=article_meta['year'],
                influentialCitationCount=article_meta['influentialCitationCount'],
                citationCount=len(article_meta['citations']),
                citations=article_meta['citations'],
            )
            try:
                # Pull page/figure/table counts out of free-text comments
                # such as "10 pages, 5 figures, 2 tables".
                for meta in ['page', 'figure', 'table']:
                    if meta in article['comment']:
                        comment = article['comment'].replace(';', ',')
                        for segment in comment.split(','):
                            if meta in segment:
                                count = int(segment.split(meta)[0].strip())
                                if meta == 'page':
                                    paper.pages = count
                                elif meta == 'figure':
                                    paper.figures = count
                                elif meta == 'table':
                                    paper.table = count
                                break
            except (ValueError, TypeError):
                logging.debug("Error in parsing comment metadata")
            paper.save()
        except BaseException as e:
            logging.warning("Error in arvix id %s, error: %s", arvixID, str(e))
        time.sleep(0.2)
        idx += 1
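# get_arvixpaper_semantic_scholar is not shown in this file. A minimal
# sketch, assuming the legacy Semantic Scholar v1 paper endpoint; the
# project's real helper may differ.
import requests


def get_arvixpaper_semantic_scholar(arvixID, timeout=15):
    """Return (success, metadata dict) for an arXiv ID via Semantic Scholar."""
    url = 'https://api.semanticscholar.org/v1/paper/arXiv:%s' % arvixID
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return False, None
    if response.status_code != 200:
        return False, None
    return True, response.json()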
import datetime
import json
import time

import requests

# Paper (peewee model, sketched below) and the db handle come from the
# project's own modules.


def paper():
    url = 'http://ieeexplore.ieee.org/rest/search'
    page = 31501
    n_paper = 0
    print(f'Started populating papers at: {datetime.datetime.now()}')
    print('Requesting the search page to get cookies.')
    response = requests.get(
        'http://ieeexplore.ieee.org/search/searchresult.jsp')
    cookies = response.cookies
    while True:
        try:
            page += 1
            print(f'Requesting page {page}')
            payload = {
                'pageNumber': str(page),
            }
            # Content-Length is omitted on purpose: requests computes it
            # from the JSON body.
            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Content-Type': 'application/json;charset=UTF-8',
                'Accept-Language': 'en-US,en;q=0.8,pt;q=0.6',
                'Referer': 'http://ieeexplore.ieee.org/search/searchresult.jsp',
                'Origin': 'http://ieeexplore.ieee.org',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/60.0.3112.113 Safari/537.36',
            }
            response = requests.post(url, json=payload, headers=headers,
                                     cookies=cookies, timeout=15)
            papers = json.loads(response.text)['records']
            # Commit periodically and refresh the session cookies.
            if (page % 150) == 0:
                db.commit()
                print(f'Committed at page {page}')
                print(f'Inserted {n_paper} rows')
                print('Requesting the search page to refresh cookies.')
                response = requests.get(
                    'http://ieeexplore.ieee.org/search/searchresult.jsp')
                cookies = response.cookies
            if len(papers) <= 0:
                db.commit()
                print(f'{datetime.datetime.now()} - Finished, populated '
                      f'with {n_paper} papers')
                break
            for p in papers:
                if p.get('title'):
                    Paper.create(title=p['title'],
                                 abstract=p.get('abstract', ''),
                                 finalScore=0.,
                                 accepted=True)
                    n_paper += 1
        except Exception as e:
            print(f'{datetime.datetime.now()} - Error: {e}\n'
                  f'retrying in 30 seconds')
            time.sleep(30)
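# This script's Paper model differs from the WeChat one above. A hypothetical
# sketch inferred from the Paper.create(...) call, assuming the same peewee
# `db` handle that paper() commits through:
from peewee import BooleanField, CharField, FloatField, Model, TextField


class Paper(Model):
    title = CharField()
    abstract = TextField(default='')
    finalScore = FloatField(default=0.)
    accepted = BooleanField(default=True)

    class Meta:
        database = db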
import logging
import time

import arxivpy
from tqdm import tqdm

# Paper, start_index, end_index and get_arvixpaper_semantic_scholar are
# provided elsewhere in the project.


def crawl_category(term='cs.LG'):
    index_iteration = 500
    logging.info("Crawling category: %s", term)
    for index in range(start_index, end_index, index_iteration):
        logging.info("Batch: %d-%d", index, index + index_iteration)
        articles = arxivpy.query(search_query=[term],
                                 start_index=index,
                                 max_index=index + index_iteration,
                                 results_per_iteration=index_iteration,
                                 wait_time=0.2,
                                 sort_by='lastUpdatedDate')
        article_batch_count = len(articles)
        if article_batch_count == 0:
            logging.warning('No articles found in batch %d-%d',
                            index, index + index_iteration)
        for idx, article in tqdm(enumerate(articles),
                                 total=article_batch_count):
            arvixID = article['id'].split('v')[0]
            query = Paper.select().where(Paper.arvixID == arvixID)
            if query.exists():
                # Already stored: just make sure this category is recorded.
                paper = Paper.get(Paper.arvixID == arvixID)
                categories = paper.category
                if term not in categories:
                    categories.append(term)
                    Paper.update(category=categories).where(
                        Paper.arvixID == arvixID).execute()
                continue
            success, article_meta = get_arvixpaper_semantic_scholar(arvixID)
            if success is False:
                logging.debug(
                    "Paper does not exist in Semantic Scholar, arvixID: %s",
                    arvixID)
                continue
            # Unknown author IDs are stored as -1.
            authorIDList = [
                int(author['authorId']) if author['authorId'] is not None else -1
                for author in article_meta['authors']
            ]
            authorNames = [article['main_author']]
            authorCount = len(article_meta['authors'])
            if authorCount > 1:
                other_author = [
                    name.strip() for name in article['authors'].split(',')
                    if len(name) > 1 and name != article['main_author']
                ]
                authorNames += other_author
            paper_category = [article['term']]
            if article['term'] != term:
                paper_category.append(term)
            try:
                paper = Paper.create(
                    indexID=idx + index,
                    arvixID=arvixID,
                    paperId=article_meta['paperId'],
                    doiID=str(article_meta['doi']),
                    title=article['title'],
                    summary=article['abstract'],
                    category=paper_category,
                    comments=article['comment'],
                    journal_ref=article['journal_ref'],
                    url=article['url'],
                    authorID=authorIDList,
                    authorName=authorNames,
                    authorCount=authorCount,
                    publishedDate=article['publish_date'],
                    citationVelocity=article_meta['citationVelocity'],
                    referencesCount=len(article_meta['references']),
                    topics=article_meta['topics'],
                    venue=str(article_meta['venue']),
                    year=article_meta['year'],
                    influentialCitationCount=article_meta['influentialCitationCount'],
                    citationCount=len(article_meta['citations']),
                    citations=article_meta['citations'],
                )
                try:
                    # Pull page/figure/table counts out of free-text comments
                    # such as "10 pages, 5 figures, 2 tables".
                    for meta in ['page', 'figure', 'table']:
                        if meta in article['comment']:
                            comment = article['comment'].replace(';', ',')
                            for segment in comment.split(','):
                                if meta in segment:
                                    count = int(segment.split(meta)[0].strip())
                                    if meta == 'page':
                                        paper.pages = count
                                    elif meta == 'figure':
                                        paper.figures = count
                                    elif meta == 'table':
                                        paper.table = count
                                    break
                except (ValueError, TypeError):
                    logging.debug("Error in parsing comment metadata")
                paper.save()
            except BaseException as e:
                logging.warning("Error in arvix id %s, error: %s",
                                arvixID, str(e))
            time.sleep(0.3)
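# The comment-parsing block above is duplicated verbatim in update_paper();
# a possible shared helper, sketched here as a suggestion rather than code
# that exists in the project:
def parse_comment_counts(comment):
    """Extract page/figure/table counts from a free-text arXiv comment
    such as '10 pages, 5 figures, 2 tables'. Returns a dict keyed by
    'page', 'figure' and 'table'."""
    counts = {}
    if not comment:
        return counts
    normalized = comment.replace(';', ',')
    for meta in ('page', 'figure', 'table'):
        if meta not in normalized:
            continue
        for segment in normalized.split(','):
            if meta in segment:
                try:
                    counts[meta] = int(segment.split(meta)[0].strip())
                except ValueError:
                    pass  # segment did not start with a bare number
                break
    return counts

# Usage sketch: counts = parse_comment_counts(article['comment']), then
# paper.pages = counts.get('page', paper.pages), and so on.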