Example #1
def find_press_of_journal():
    presses = session.query(Press)
    for press in presses:
        press_id = press.id
        press_url = press.url

        req = requests.get(press_url, headers=HEADER)
        txt = req.text
        soup = BeautifulSoup(txt, features="lxml")
        press_soup = soup.find("div", "clear-both")
        press_soup = press_soup.find_next_sibling()
        press_soup = press_soup.find_next_sibling()
        p_infos = press_soup.find_all("li")

        while p_infos:
            for p_info in p_infos:
                journal_url = p_info.a["href"]
                if const.DBLP_JOURNAL_PREVIX in journal_url:
                    journal_url = journal_url[:-10]
                    j_q = session.query(Journal)\
                        .filter(Journal.dblp_address == journal_url).first()
                    if j_q:
                        j_q.press = press_id
                        session.commit()

            press_soup = press_soup.find_next_sibling()
            if press_soup is None:
                # No further sibling lists for this press.
                break
            p_infos = press_soup.find_all("li")
    session.close()
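
These examples all appear to come from the same module and rely on shared module-level state that is not shown: a SQLAlchemy session, the ORM models (Press, Journal, Volume, Paper, Author, PaperAuthor), a const module holding the string constants used as dictionary keys, and an HTTP header dict. A minimal sketch of that assumed setup, with placeholder module names and connection string:

# Assumed shared setup for the examples on this page (names and URL are placeholders).
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, distinct
from sqlalchemy.orm import sessionmaker

import const                                   # project-specific string constants
from models import (Press, Journal, Volume,    # project-specific ORM models
                    Paper, Author, PaperAuthor)

HEADER = {"User-Agent": "Mozilla/5.0"}         # header sent with each dblp request

engine = create_engine("sqlite:///papers.db")  # placeholder database URL
session = sessionmaker(bind=engine)()
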
Example #2
def update_is_updated_of_volumes():
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    volume_ids = session.query(Volume.id).filter(
        Volume.id.notin_(paper_volume_ids)).all()

    for volume_id in volume_ids:
        session.query(Volume).filter(Volume.id == volume_id[0]).update(
            {Volume.is_updated: False})

    session.commit()
    session.close()
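
The per-row loop above issues one UPDATE per volume id. The same effect can usually be achieved with a single bulk UPDATE; a sketch under the same assumed models and session:

def update_is_updated_of_volumes_bulk():
    # Mark every volume that has no papers as not updated, in one statement.
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    session.query(Volume)\
        .filter(Volume.id.notin_(paper_volume_ids))\
        .update({Volume.is_updated: False}, synchronize_session=False)
    session.commit()
    session.close()
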
Example #3
def insert_volumes_into_db(volumes, issn):
    new_volumes = list()
    for volume in volumes:
        new_volumes.append(Volume(
            issn=issn, 
            volume=volume[const.VOLUME_NUMBER], 
            year=volume[const.VOLUME_YEAR],
            url=volume[const.VOLUME_URL],
            is_updated=volume[const.VOLUME_UPDATED]
            ))
    session.add_all(new_volumes)
    session.commit()
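
The volumes argument is expected to be a list of dicts keyed by the const.VOLUME_* constants used above. A hypothetical call with placeholder values (the real key strings and data are not shown here):

insert_volumes_into_db(
    volumes=[{
        const.VOLUME_NUMBER: "42",
        const.VOLUME_YEAR: "2020",
        const.VOLUME_URL: "https://dblp.uni-trier.de/db/journals/example/example42.html",
        const.VOLUME_UPDATED: False,
    }],
    issn="0000-0000")
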
Example #4
def update_dblp_id_of_papers():
    volume_ids = get_volumes_of_None()
    for volume_id in volume_ids:
        volume = session.query(Volume).filter(Volume.id == volume_id).first()
        if not volume:
            continue

        req = requests.get(volume.url, headers=HEADER)
        txt = req.text
        soup = BeautifulSoup(txt, features="lxml")
        sibling = soup.body.find("ul", class_="publ-list")

        if sibling is None:
            # No publication list on this page; skip to the next volume.
            continue

        # sibling = body_main.find_previous_sibling()
        while sibling:
            # item_name = "entry informal"
            # item_name = "entry book"
            article_entries = sibling.find_all(
                "li", {"class": ["entry data", "entry article"]})
            if not len(article_entries):
                sibling = sibling.find_next_sibling()
                continue

            for article_entry in article_entries:
                paper_id = article_entry["id"]

                paper_doi = get_paper_doi(article_entry)

                #paper_title = get_paper_title(article_entry)
                #if paper_title is None:
                #    paper_title = ""

                #if len(paper_title) > 255:
                #    paper_title = paper_title[:255]

                #if not paper_title:
                #    continue

                paper_info = session.query(Paper)\
                    .filter(Paper.doi == paper_doi).first()

                if not paper_info:
                    continue

                paper_dblp_id = paper_info.dblp_id

                if not paper_dblp_id:
                    paper_info.dblp_id = paper_id
                    session.commit()

            sibling = sibling.find_next_sibling()
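
The helpers get_volumes_of_None and get_paper_doi are not shown. A plausible sketch of get_paper_doi, assuming each dblp entry links its electronic edition through a doi.org URL and that Paper.doi stores the bare DOI:

def get_paper_doi(article_entry):
    # dblp entries usually expose the electronic edition as a doi.org link.
    for link in article_entry.find_all("a", href=True):
        href = link["href"]
        if "doi.org/" in href:
            return href.split("doi.org/", 1)[1]
    return None
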
Example #5
def get_presses():
    press_url = "https://dblp.uni-trier.de/db/journals/publ/index.html"
    req = requests.get(press_url, headers=HEADER)
    txt = req.text
    soup = BeautifulSoup(txt, features="lxml")
    press_soup = soup.find("div", "clear-both")
    press_soup = press_soup.find_next_sibling()
    press_list = press_soup.find_all("li")

    presses = dict()
    for press in press_list:
        press_name = press.a.string
        press_url = press.a["href"]
        presses[press_name] = press_url
        session.add(Press(name=press_name, url=press_url))

    session.commit()
    session.close()
    return presses
Example #6
def update_year_of_volumes():
    volumes = session.query(Volume).filter(Volume.id >= 34745)
    for volume in volumes:
        volume_id = volume.id

        paper_info = session.query(Paper)\
            .filter(Paper.volume_id == volume_id).first()
        if not paper_info:
            continue

        paper_dblp_id = paper_info.dblp_id

        paper_bibtex_url = \
            const.DBLP_JOURNAL_BIBTEX_PREVIX + paper_dblp_id + \
            const.DBLP_JOURNAL_BIBTEX_SUFFIX

        paper_bibtex = get_paper_bibtex(paper_bibtex_url)
        volume_year = paper_bibtex.get("year")
        volume_info = paper_bibtex.get("volume")

        volume.year = volume_year
        volume.volume = volume_info
        session.commit()
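
The get_paper_bibtex helper is not shown. This sketch assumes it downloads the BibTeX record from the constructed URL and returns its fields as a dict; simple "key = {value}" fields are extracted with a regular expression instead of a BibTeX parser:

import re

def get_paper_bibtex(bibtex_url):
    # Fetch the BibTeX record and collect its un-nested "key = {value}" fields.
    req = requests.get(bibtex_url, headers=HEADER)
    fields = dict()
    for key, value in re.findall(r"(\w+)\s*=\s*\{([^{}]*)\}", req.text):
        fields[key.lower()] = value.strip()
    return fields
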
Example #7
def insert_journal_into_db(journal_title, journal_addr, issn):
    session.add(
            Journal(name=journal_title, dblp_address=journal_addr, issn=issn))
    session.commit()
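
A hypothetical call with placeholder values; journal_addr is stored as Journal.dblp_address, which Example #1 compares against a dblp journal URL with what appears to be the trailing index.html stripped:

insert_journal_into_db(
    journal_title="Example Journal of Placeholder Studies",
    journal_addr="https://dblp.uni-trier.de/db/journals/example/",
    issn="0000-0000")
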
Example #8
def set_updated_status_of_volumes(volume_id):
    volume = session.query(Volume).filter(Volume.id == volume_id).first()
    volume.is_updated = True
    session.commit()
Example #9
def collect_journal_papers():

    arx_journal_issn = "2331-8422"

    #    while True:
    # conditions = (Volume.is_updated == False)

    volume_id = 46770

    while volume_id <= 46770:

        conditions = (Volume.id == volume_id)
        new_volumes = analyze_volumes.query_volumes_by_filter(
            conditions)  #, limit_num=5)

        new_volume_infos = list()
        for new_volume in new_volumes:
            new_volume_infos.append({
                const.VOLUME_ID: new_volume.id,
                const.JOURNAL_ISSN: new_volume.issn,
                const.VOLUME_URL: new_volume.url
            })

    #   if not new_volume_infos:
    #       print("Finish the update!")
    #       break

        for new_volume_info in new_volume_infos:

            if new_volume_info[const.JOURNAL_ISSN] == arx_journal_issn:
                continue

            print("%s" % new_volume_info[const.VOLUME_URL])
            print("VOLUME ID: %s" % new_volume_info[const.VOLUME_ID])

            # if new_volume_info[const.VOLUME_ID] in [35956, 35957, 44428,
            #                                        44429, 44430]:
            #    continue

            new_paper_infos = analyze_papers.analyze_papers_of_volume(
                new_volume_info[const.VOLUME_URL])

            for new_paper_info in new_paper_infos:
                paper_query = \
                    session.query(Paper).filter(
                        Paper.dblp_id == new_paper_info[const.PAPER_DBLP_ID])\
                        .first()

                if paper_query is not None:
                    print("Database has the record [dblp_id: %s]" %
                          paper_query.dblp_id)
                    continue
                else:
                    print("Database has no record [dblp_id: %s]" %
                          new_paper_info[const.PAPER_DBLP_ID])

                new_paper = Paper(
                    title=new_paper_info[const.PAPER_TITLE],
                    journal_issn=new_volume_info[const.JOURNAL_ISSN],
                    volume_id=new_volume_info[const.VOLUME_ID],
                    volume=new_paper_info[const.PAPER_VOLUME],
                    number=new_paper_info[const.PAPER_NUMBER],
                    start_page=new_paper_info[const.PAPER_START_PAGE],
                    end_page=new_paper_info[const.PAPER_END_PAGE],
                    year=new_paper_info[const.PAPER_DATE],
                    url=new_paper_info[const.PAPER_URL],
                    doi=new_paper_info[const.PAPER_DOI],
                    dblp_id=new_paper_info[const.PAPER_DBLP_ID])
                session.add(new_paper)
                session.flush()
                session.refresh(new_paper)
                new_paper_id = new_paper.id

                # add authors
                author_infos = new_paper_info[const.PAPER_AUTHOR]
                order = 1
                for author_info in author_infos:
                    author_query = \
                        session.query(Author).filter(
                            Author.title == author_info["author_title"]).first()
                    if author_query:
                        author_id = author_query.id
                    else:
                        new_author = Author(
                            title=author_info["author_title"],
                            name=author_info["author_name"],
                            dblp_url=author_info["author_dblp_url"])

                        session.add(new_author)
                        session.flush()
                        session.refresh(new_author)
                        author_id = new_author.id

                    session.add(
                        PaperAuthor(paper_id=new_paper_id,
                                    author_id=author_id,
                                    order=order))
                    order += 1
                session.commit()

            volume = \
                session.query(Volume).filter(
                    Volume.id == new_volume_info[const.VOLUME_ID]).first()
            volume.is_updated = True
            session.commit()
            print("Volume - [ID: %s] has been processed!" %
                  new_volume_info[const.VOLUME_ID])

        volume_id = volume_id + 1
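
analyze_volumes.query_volumes_by_filter and analyze_papers.analyze_papers_of_volume are external helpers that are not shown. A hypothetical sketch of the first, consistent with the way it is called above (a filter expression plus an optional row limit):

def query_volumes_by_filter(conditions, limit_num=None):
    # Return the Volume rows matching an arbitrary SQLAlchemy filter expression.
    query = session.query(Volume).filter(conditions)
    if limit_num is not None:
        query = query.limit(limit_num)
    return query.all()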