def find_press_of_journal():
    """Link each Journal row to its Press by scraping DBLP press pages.

    For every Press in the DB, fetch its page, skip two sibling elements
    after the "clear-both" div, then walk the successive sibling lists of
    <li> journal links. Any journal whose dblp_address matches a scraped
    link (minus its trailing "index.html") gets its ``press`` foreign key
    set to the current press id.
    """
    presses = session.query(Press)
    for press in presses:
        press_id = press.id
        req = requests.get(press.url, headers=HEADER)
        soup = BeautifulSoup(req.text, features="lxml")
        press_soup = soup.find("div", "clear-both")
        press_soup = press_soup.find_next_sibling()
        press_soup = press_soup.find_next_sibling()
        p_infos = press_soup.find_all("li")
        while p_infos:
            for p_info in p_infos:
                journal_url = p_info.a["href"]
                if const.DBLP_JOURNAL_PREVIX in journal_url:
                    # Drop the trailing "index.html" (10 chars) so the URL
                    # matches the directory form stored in Journal.dblp_address.
                    journal_url = journal_url[:-10]
                j_q = session.query(Journal)\
                    .filter(Journal.dblp_address == journal_url).first()
                if j_q:
                    j_q.press = press_id
                    session.commit()
            press_soup = press_soup.find_next_sibling()
            # BUG FIX: find_next_sibling() returns None once the last list on
            # the page is consumed; the original then crashed with
            # AttributeError on the following find_all call.
            if press_soup is None:
                break
            p_infos = press_soup.find_all("li")
    session.close()
def update_is_updated_of_volumes():
    """Flag volumes with no papers as not updated.

    A volume is stale (``is_updated = False``) when no Paper row references
    its id. PERF FIX: the original issued one UPDATE query per stale volume
    (N+1 pattern); this runs a single bulk UPDATE instead.
    """
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    # synchronize_session=False is required for a bulk UPDATE whose filter
    # uses a NOT IN subquery; we commit and close immediately, so no stale
    # in-session objects survive.
    session.query(Volume)\
        .filter(Volume.id.notin_(paper_volume_ids))\
        .update({Volume.is_updated: False}, synchronize_session=False)
    session.commit()
    session.close()
def insert_volumes_into_db(volumes, issn):
    """Persist the scraped volume dicts for the journal identified by *issn*."""
    session.add_all(
        Volume(
            issn=issn,
            volume=entry[const.VOLUME_NUMBER],
            year=entry[const.VOLUME_YEAR],
            url=entry[const.VOLUME_URL],
            is_updated=entry[const.VOLUME_UPDATED],
        )
        for entry in volumes
    )
    session.commit()
def update_dblp_id_of_papers():
    """Backfill ``Paper.dblp_id`` by matching DOIs scraped from DBLP volume pages.

    For each volume still lacking data, fetch its DBLP page, walk the sibling
    "publ-list" <ul> elements, and for every article entry look up the paper
    by DOI; papers found without a dblp_id get the entry's element id.
    """
    volume_ids = get_volumes_of_None()
    for volume_id in volume_ids:
        volume = session.query(Volume).filter(Volume.id == volume_id).first()
        if not volume:
            continue
        req = requests.get(volume.url, headers=HEADER)
        soup = BeautifulSoup(req.text, features="lxml")
        sibling = soup.body.find("ul", class_="publ-list")
        if sibling is None:
            # BUG FIX: the original did `return info_of_papers`, but that name
            # is never defined in this function, so this branch raised
            # NameError instead of returning cleanly.
            return
        while sibling:
            article_entries = sibling.find_all(
                "li", {"class": ["entry data", "entry article"]})
            if not article_entries:
                sibling = sibling.find_next_sibling()
                continue
            for article_entry in article_entries:
                paper_id = article_entry["id"]
                paper_doi = get_paper_doi(article_entry)
                paper_info = session.query(Paper)\
                    .filter(Paper.doi == paper_doi).first()
                if not paper_info:
                    continue
                # Only fill in a dblp_id that is still empty; never overwrite.
                if not paper_info.dblp_id:
                    paper_info.dblp_id = paper_id
                    session.commit()
            sibling = sibling.find_next_sibling()
def get_presses():
    """Scrape the DBLP press index page and insert one Press row per press.

    Returns:
        dict: mapping of press name -> press URL. (The original built this
        dict but silently discarded it; returning it is backward compatible
        since no caller could have used the previous ``None`` result.)
    """
    index_url = "https://dblp.uni-trier.de/db/journals/publ/index.html"
    req = requests.get(index_url, headers=HEADER)
    soup = BeautifulSoup(req.text, features="lxml")
    press_soup = soup.find("div", "clear-both").find_next_sibling()
    presses = dict()
    for press in press_soup.find_all("li"):
        press_name = press.a.string
        press_url = press.a["href"]
        presses[press_name] = press_url
        session.add(Press(name=press_name, url=press_url))
    session.commit()
    session.close()
    return presses
def update_year_of_volumes(min_volume_id=34745):
    """Fill ``Volume.year`` / ``Volume.volume`` from a paper's BibTeX entry.

    For each volume with id >= *min_volume_id* that has at least one paper,
    fetch the BibTeX of the first paper and copy its "year" and "volume"
    fields onto the Volume row.

    Args:
        min_volume_id: lowest volume id to process. Defaults to 34745, the
            value previously hard-coded (kept for backward compatibility).
    """
    volumes = session.query(Volume).filter(Volume.id >= min_volume_id)
    for volume in volumes:
        paper_info = session.query(Paper)\
            .filter(Paper.volume_id == volume.id).first()
        if not paper_info:
            continue
        paper_bibtex_url = (const.DBLP_JOURNAL_BIBTEX_PREVIX
                            + paper_info.dblp_id
                            + const.DBLP_JOURNAL_BIBTEX_SUFFIX)
        per_bibtex = get_paper_bibtex(paper_bibtex_url)
        # .get() yields None when the BibTeX lacks the field, which simply
        # leaves the column NULL.
        volume.year = per_bibtex.get("year")
        volume.volume = per_bibtex.get("volume")
        session.commit()
def insert_journal_into_db(journal_title, journal_addr, issn):
    """Create and persist a single Journal row."""
    new_journal = Journal(
        name=journal_title,
        dblp_address=journal_addr,
        issn=issn,
    )
    session.add(new_journal)
    session.commit()
def set_updated_status_of_volumes(volume_id):
    """Mark the volume with *volume_id* as updated.

    BUG FIX: the original dereferenced the query result unconditionally and
    raised AttributeError when no volume with that id exists; an unknown id
    is now a no-op.
    """
    volume = session.query(Volume).filter(Volume.id == volume_id).first()
    if volume is None:
        return
    volume.is_updated = True
    session.commit()
def collect_journal_papers(start_volume_id=46770, end_volume_id=46770):
    """Scrape and store every paper of the volumes in the given id range.

    For each volume id in [start_volume_id, end_volume_id]: fetch the DBLP
    volume page, insert papers not yet in the DB (deduplicated by dblp_id),
    insert/link their authors preserving author order, then mark the volume
    as updated. Volumes belonging to the arXiv pseudo-journal are skipped.

    Args:
        start_volume_id: first volume id to process (default keeps the
            previously hard-coded 46770 for backward compatibility).
        end_volume_id: last volume id to process, inclusive.
    """
    arx_journal_issn = "2331-8422"  # arXiv CoRR: not a real journal, skip it
    volume_id = start_volume_id
    while volume_id <= end_volume_id:
        conditions = (Volume.id == volume_id)
        new_volumes = analyze_volumes.query_volumes_by_filter(conditions)
        new_volume_infos = [
            {
                const.VOLUME_ID: new_volume.id,
                const.JOURNAL_ISSN: new_volume.issn,
                const.VOLUME_URL: new_volume.url,
            }
            for new_volume in new_volumes
        ]
        for new_volume_info in new_volume_infos:
            if new_volume_info[const.JOURNAL_ISSN] == arx_journal_issn:
                continue
            print("%s" % new_volume_info[const.VOLUME_URL])
            print("VOLUME ID: %s" % new_volume_info[const.VOLUME_ID])
            new_paper_infos = analyze_papers.analyze_papers_of_volume(
                new_volume_info[const.VOLUME_URL])
            for new_paper_info in new_paper_infos:
                # Deduplicate on dblp_id: skip papers already stored.
                paper_query = \
                    session.query(Paper).filter(
                        Paper.dblp_id == new_paper_info[const.PAPER_DBLP_ID])\
                    .first()
                if paper_query is not None:
                    print("Database has the record [dblp_id: %s]"
                          % paper_query.dblp_id)
                    continue
                print("Database has no record [dblp_id: %s]"
                      % new_paper_info[const.PAPER_DBLP_ID])
                new_paper = Paper(
                    title=new_paper_info[const.PAPER_TITLE],
                    journal_issn=new_volume_info[const.JOURNAL_ISSN],
                    volume_id=new_volume_info[const.VOLUME_ID],
                    volume=new_paper_info[const.PAPER_VOLUME],
                    number=new_paper_info[const.PAPER_NUMBER],
                    start_page=new_paper_info[const.PAPER_START_PAGE],
                    end_page=new_paper_info[const.PAPER_END_PAGE],
                    year=new_paper_info[const.PAPER_DATE],
                    url=new_paper_info[const.PAPER_URL],
                    doi=new_paper_info[const.PAPER_DOI],
                    dblp_id=new_paper_info[const.PAPER_DBLP_ID])
                session.add(new_paper)
                # flush + refresh so the autogenerated primary key is
                # available before linking authors.
                session.flush()
                session.refresh(new_paper)
                new_paper_id = new_paper.id
                # Insert authors, reusing existing Author rows when the title
                # already exists; `order` records the author position.
                for order, author_info in \
                        enumerate(new_paper_info[const.PAPER_AUTHOR], start=1):
                    author_query = \
                        session.query(Author).filter(
                            Author.title ==
                            author_info["author_title"]).first()
                    if author_query:
                        author_id = author_query.id
                    else:
                        new_author = Author(
                            title=author_info["author_title"],
                            name=author_info["author_name"],
                            dblp_url=author_info["author_dblp_url"])
                        session.add(new_author)
                        session.flush()
                        session.refresh(new_author)
                        author_id = new_author.id
                    session.add(
                        PaperAuthor(paper_id=new_paper_id,
                                    author_id=author_id,
                                    order=order))
                session.commit()
            volume = \
                session.query(Volume).filter(
                    Volume.id == new_volume_info[const.VOLUME_ID]).first()
            volume.is_updated = True
            session.commit()
            print("Volume - [ID: %s] has been processed!"
                  % new_volume_info[const.VOLUME_ID])
        volume_id = volume_id + 1