# Imports reconstructed so the module is runnable. The model classes
# (Press, Journal, Volume, Paper, Author, PaperAuthor), the `const` module,
# the shared HEADER request-headers dict, and the helpers (analyze_volumes,
# analyze_papers, get_paper_doi, get_paper_bibtex) live elsewhere in this
# project; their exact import paths are assumed where the source does not
# show them.
import requests
from bs4 import BeautifulSoup
from sqlalchemy import distinct

from databases.db_engine import session


def find_press_of_journal():
    """Attach each press to the journals listed on its DBLP press page."""
    presses = session.query(Press)
    for press in presses:
        press_id = press.id
        req = requests.get(press.url, headers=HEADER)
        soup = BeautifulSoup(req.text, features="lxml")
        # The journal lists sit two siblings after the "clear-both" div.
        press_soup = soup.find("div", "clear-both")
        press_soup = press_soup.find_next_sibling()
        press_soup = press_soup.find_next_sibling()
        p_infos = press_soup.find_all("li")
        while p_infos:
            for p_info in p_infos:
                journal_url = p_info.a["href"]
                if const.DBLP_JOURNAL_PREVIX in journal_url:
                    # Strip the trailing "index.html" (10 characters) to get
                    # the journal's base address.
                    journal_url = journal_url[:-10]
                    j_q = session.query(Journal)\
                        .filter(Journal.dblp_address == journal_url).first()
                    if j_q:
                        j_q.press = press_id
                        session.commit()
            press_soup = press_soup.find_next_sibling()
            if press_soup is None:  # no further sibling: stop scanning
                break
            p_infos = press_soup.find_all("li")
    session.close()
def update_is_updated_of_volumes():
    # Mark volumes that no paper references as not yet updated.
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    volume_ids = session.query(Volume.id).filter(
        Volume.id.notin_(paper_volume_ids)).all()
    for volume_id in volume_ids:
        session.query(Volume).filter(Volume.id == volume_id[0]).update(
            {Volume.is_updated: False})
    session.commit()
    session.close()
def update_dblp_id_of_papers():
    """Backfill Paper.dblp_id by matching DOIs on each volume's DBLP page."""
    volume_ids = get_volumes_of_None()
    for volume_id in volume_ids:
        volume = session.query(Volume).filter(Volume.id == volume_id).first()
        if not volume:
            continue
        req = requests.get(volume.url, headers=HEADER)
        soup = BeautifulSoup(req.text, features="lxml")
        sibling = soup.body.find("ul", class_="publ-list")
        if sibling is None:
            continue  # this volume page has no publication list
        while sibling:
            article_entries = sibling.find_all(
                "li", {"class": ["entry data", "entry article"]})
            if not article_entries:
                sibling = sibling.find_next_sibling()
                continue
            for article_entry in article_entries:
                paper_id = article_entry["id"]
                paper_doi = get_paper_doi(article_entry)
                paper_info = session.query(Paper)\
                    .filter(Paper.doi == paper_doi).first()
                if not paper_info:
                    continue
                if not paper_info.dblp_id:
                    paper_info.dblp_id = paper_id
                    session.commit()
            sibling = sibling.find_next_sibling()
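# get_paper_doi() is called above but not defined in this file. A minimal
# sketch of one plausible implementation, assuming the DOI appears inside the
# entry as a link whose href starts with "https://doi.org/"; the name
# get_paper_doi_sketch and this approach are illustrative guesses, not the
# project's actual helper.
def get_paper_doi_sketch(article_entry):
    """Return the bare DOI from a DBLP entry <li>, or None if absent."""
    for link in article_entry.find_all("a", href=True):
        if link["href"].startswith("https://doi.org/"):
            return link["href"][len("https://doi.org/"):]
    return None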
def check_journal_press(journal_url):
    journal_query = session.query(Journal)\
        .filter(Journal.dblp_address == journal_url).first()
    return journal_query is not None
def get_volumes_of_None():
    # Ids of volumes that contain at least one paper without a dblp_id.
    volume_ids = session.query(distinct(Paper.volume_id))\
        .filter(Paper.dblp_id.is_(None))
    return [volume_id[0] for volume_id in volume_ids]
def check_dblp_address():
    # Count journals whose dblp_address contains the DBLP journal prefix.
    dblp_addresses = session.query(Journal.dblp_address)
    num = 0
    for dblp_address in dblp_addresses:
        if const.DBLP_JOURNAL_PREVIX in dblp_address[0]:
            print(dblp_address[0])
            num += 1
    print(num)
def update_year_of_volumes():
    # Fill in year and volume number from the BibTeX record of each
    # volume's first paper.
    volumes = session.query(Volume).filter(Volume.id >= 34745)
    for volume in volumes:
        paper_info = session.query(Paper)\
            .filter(Paper.volume_id == volume.id).first()
        if not paper_info:
            continue
        paper_bibtex_url = \
            const.DBLP_JOURNAL_BIBTEX_PREVIX + paper_info.dblp_id + \
            const.DBLP_JOURNAL_BIBTEX_SUFFIX
        paper_bibtex = get_paper_bibtex(paper_bibtex_url)
        volume.year = paper_bibtex.get("year")
        volume.volume = paper_bibtex.get("volume")
        session.commit()
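# get_paper_bibtex() is used above but defined elsewhere. A minimal sketch of
# what it might do, assuming the URL returns a plain BibTeX record with simple
# `key = {value}` fields; the regex parsing and the name
# get_paper_bibtex_sketch are illustrative assumptions, not the real helper.
import re

def get_paper_bibtex_sketch(bibtex_url):
    """Fetch a BibTeX record and return its fields as a lowercase-key dict."""
    req = requests.get(bibtex_url, headers=HEADER)
    return {key.lower(): value
            for key, value in re.findall(r"(\w+)\s*=\s*\{([^{}]*)\}",
                                         req.text)}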
def find_volume_by_url(volume_url):
    return session.query(Volume).filter(Volume.url == volume_url).first()
def get_all_journals_from_db():
    return session.query(Journal).all()
def set_updated_status_of_volumes(volume_id):
    volume = session.query(Volume).filter(Volume.id == volume_id).first()
    if volume:  # guard against an unknown id
        volume.is_updated = True
        session.commit()
def query_journal_is_in_volumes(journal_issn):
    result = session.query(Volume).filter(Volume.issn == journal_issn).first()
    return result is not None
def query_volumes():
    # Print how many distinct ISSNs appear among the volumes.
    volumes = session.query(Volume.issn).distinct().all()
    print(len(volumes))
def query_volumes_by_filter(conditions, limit_num=1000):
    return session.query(Volume).filter(conditions).limit(limit_num)
def collect_journal_papers():
    """Scrape papers and authors for each volume and store them in the DB."""
    arx_journal_issn = "2331-8422"  # arXiv; skipped below
    # An alternative full sweep would loop `while True` over volumes with
    # Volume.is_updated == False and break when none remain; kept here as a
    # fixed id range instead.
    volume_id = 46770
    while volume_id <= 46770:
        conditions = (Volume.id == volume_id)
        new_volumes = analyze_volumes.query_volumes_by_filter(conditions)
        new_volume_infos = list()
        for new_volume in new_volumes:
            new_volume_infos.append({
                const.VOLUME_ID: new_volume.id,
                const.JOURNAL_ISSN: new_volume.issn,
                const.VOLUME_URL: new_volume.url
            })
        for new_volume_info in new_volume_infos:
            if new_volume_info[const.JOURNAL_ISSN] == arx_journal_issn:
                continue
            print("%s" % new_volume_info[const.VOLUME_URL])
            print("VOLUME ID: %s" % new_volume_info[const.VOLUME_ID])
            new_paper_infos = analyze_papers.analyze_papers_of_volume(
                new_volume_info[const.VOLUME_URL])
            for new_paper_info in new_paper_infos:
                paper_query = \
                    session.query(Paper).filter(
                        Paper.dblp_id == new_paper_info[const.PAPER_DBLP_ID])\
                    .first()
                if paper_query is not None:
                    print("Database has the record [dblp_id: %s]"
                          % paper_query.dblp_id)
                    continue
                print("Database has no record [dblp_id: %s]"
                      % new_paper_info[const.PAPER_DBLP_ID])
                new_paper = Paper(
                    title=new_paper_info[const.PAPER_TITLE],
                    journal_issn=new_volume_info[const.JOURNAL_ISSN],
                    volume_id=new_volume_info[const.VOLUME_ID],
                    volume=new_paper_info[const.PAPER_VOLUME],
                    number=new_paper_info[const.PAPER_NUMBER],
                    start_page=new_paper_info[const.PAPER_START_PAGE],
                    end_page=new_paper_info[const.PAPER_END_PAGE],
                    year=new_paper_info[const.PAPER_DATE],
                    url=new_paper_info[const.PAPER_URL],
                    doi=new_paper_info[const.PAPER_DOI],
                    dblp_id=new_paper_info[const.PAPER_DBLP_ID])
                session.add(new_paper)
                session.flush()
                session.refresh(new_paper)
                new_paper_id = new_paper.id
                # Add the paper's authors, preserving author order.
                author_infos = new_paper_info[const.PAPER_AUTHOR]
                order = 1
                for author_info in author_infos:
                    author_query = \
                        session.query(Author).filter(
                            Author.title ==
                            author_info["author_title"]).first()
                    if author_query:
                        author_id = author_query.id
                    else:
                        new_author = Author(
                            title=author_info["author_title"],
                            name=author_info["author_name"],
                            dblp_url=author_info["author_dblp_url"])
                        session.add(new_author)
                        session.flush()
                        session.refresh(new_author)
                        author_id = new_author.id
                    session.add(
                        PaperAuthor(paper_id=new_paper_id,
                                    author_id=author_id,
                                    order=order))
                    order += 1
                session.commit()
            volume = \
                session.query(Volume).filter(
                    Volume.id == new_volume_info[const.VOLUME_ID]).first()
            volume.is_updated = True
            session.commit()
            print("Volume - [ID: %s] has been processed!"
                  % new_volume_info[const.VOLUME_ID])
        volume_id += 1
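# A possible way to drive the collection pass above; the __main__ guard is an
# illustrative addition, since the source does not show how this module is
# invoked.
if __name__ == "__main__":
    collect_journal_papers()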
def find_journal_by_issn(journal_issn):
    return session.query(Journal).filter(Journal.issn == journal_issn).first()
def find_journal_by_title(journal_title):
    return session.query(Journal).filter(Journal.name == journal_title).first()
from databases.db_engine import engine, session
from information_collection import analyze_journal
from information_collection import analyze_journal_urls
from information_collection.journal_info import Journal
from publish_lib import generate_random

# Quick sanity check: print the total number of journals in the database.
print(session.query(Journal).count())
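# databases/db_engine.py is not shown in this file. A minimal sketch of the
# kind of SQLAlchemy setup that could provide the `engine` and `session`
# imported above; the connection URL and the function name are placeholders,
# not the project's actual configuration.
def make_db_session_sketch(db_url="sqlite:///dblp.sqlite"):
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    engine = create_engine(db_url)
    return sessionmaker(bind=engine)()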