Example #1
def find_press_of_journal():
    presses = session.query(Press)
    for press in presses:
        press_id = press.id
        press_url = press.url

        req = requests.get(press_url, headers=HEADER)
        txt = req.text
        soup = BeautifulSoup(txt, features="lxml")
        press_soup = soup.find("div", "clear-both")
        press_soup = press_soup.find_next_sibling()
        press_soup = press_soup.find_next_sibling()
        p_infos = press_soup.find_all("li")

        while p_infos:
            for p_info in p_infos:
                journal_url = p_info.a["href"]
                if const.DBLP_JOURNAL_PREVIX in journal_url:
                    journal_url = journal_url[:-10]
                    j_q = session.query(Journal)\
                        .filter(Journal.dblp_address == journal_url).first()
                    if j_q:
                        j_q.press = press_id
                        session.commit()

            press_soup = press_soup.find_next_sibling()
            # Stop once there is no further sibling list on the page;
            # otherwise find_all() would be called on None.
            if press_soup is None:
                break
            p_infos = press_soup.find_all("li")
    session.close()
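Every snippet in this listing relies on module-level objects that are not shown here: the shared SQLAlchemy session (imported from databases.db_engine, as Example #17 shows), the ORM models (Press, Journal, Volume, Paper, Author, PaperAuthor), a HEADER dict for requests, and a const module holding dblp URL fragments and dictionary keys. A minimal sketch of that assumed scaffolding; only the names come from the snippets, the values are placeholders:

# Hypothetical scaffolding assumed by the snippets in this listing; not part
# of the original source. The ORM models are imported from the project itself.
import requests
from bs4 import BeautifulSoup
from sqlalchemy import distinct

from databases.db_engine import session      # shared module-level session (see Example #17)

HEADER = {"User-Agent": "Mozilla/5.0"}       # placeholder request headers


class const:                                 # stand-in for the project's const module
    # Guessed value; the constant name is kept exactly as in the source.
    # The BIBTEX prefix/suffix and the PAPER_*/VOLUME_* keys are defined the same way.
    DBLP_JOURNAL_PREVIX = "https://dblp.org/db/journals/"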
Example #2
def update_is_updated_of_volumes():
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    volume_ids = session.query(Volume.id).filter(
        Volume.id.notin_(paper_volume_ids)).all()

    for volume_id in volume_ids:
        session.query(Volume).filter(Volume.id == volume_id[0]).update(
            {Volume.is_updated: False})

    session.commit()
    session.close()
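update_is_updated_of_volumes issues one UPDATE per volume. The same effect can be achieved with a single bulk UPDATE against the filtered query; a sketch, untested and assuming the same SQLAlchemy query API used throughout this listing:

def update_is_updated_of_volumes_bulk():
    # Mark every volume without attached papers as not updated in one
    # statement instead of one UPDATE per row.
    paper_volume_ids = session.query(distinct(Paper.volume_id))
    session.query(Volume)\
        .filter(Volume.id.notin_(paper_volume_ids))\
        .update({Volume.is_updated: False}, synchronize_session=False)
    session.commit()
    session.close()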
Example #3
def update_dblp_id_of_papers():
    volume_ids = get_volumes_of_None()
    for volume_id in volume_ids:
        volume = session.query(Volume).filter(Volume.id == volume_id).first()
        if not volume:
            continue

        req = requests.get(volume.url, headers=HEADER)
        txt = req.text
        soup = BeautifulSoup(txt, features="lxml")
        sibling = soup.body.find("ul", class_="publ-list")

        if sibling is None:
            # No publication list on this page; skip this volume.
            continue

        # sibling = body_main.find_previous_sibling()
        while sibling:
            # item_name = "entry informal"
            # item_name = "entry book"
            article_entries = sibling.find_all(
                "li", {"class": ["entry data", "entry article"]})
            if not article_entries:
                sibling = sibling.find_next_sibling()
                continue

            for article_entry in article_entries:
                paper_id = article_entry["id"]

                paper_doi = get_paper_doi(article_entry)

                #paper_title = get_paper_title(article_entry)
                #if paper_title is None:
                #    paper_title = ""

                #if len(paper_title) > 255:
                #    paper_title = paper_title[:255]

                #if not paper_title:
                #    continue

                paper_info = session.query(Paper)\
                    .filter(Paper.doi == paper_doi).first()

                if not paper_info:
                    continue

                paper_dblp_id = paper_info.dblp_id

                if not paper_dblp_id:
                    paper_info.dblp_id = paper_id
                    session.commit()

            sibling = sibling.find_next_sibling()
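get_paper_doi is called above but not included in the listing. One way it could be implemented, assuming the DOI is taken from the first link in a dblp entry that points at doi.org (a guess at the helper, not its actual code):

def get_paper_doi(article_entry):
    # Take the first outgoing link of the dblp entry that points at a DOI
    # resolver and strip the resolver prefix.
    doi_link = article_entry.find(
        "a", href=lambda href: href and "doi.org" in href)
    if doi_link is None:
        return None
    return doi_link["href"].split("doi.org/")[-1]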
Example #4
def check_journal_press(journal_url):
    journal_query = session.query(Journal)\
        .filter(Journal.dblp_address == journal_url).first()

    return journal_query is not None
Example #5
def get_volumes_of_None():
    volume_ids = session.query(distinct(Paper.volume_id))\
        .filter(Paper.dblp_id == None)

    return [volume_id[0] for volume_id in volume_ids]
Example #6
def check_dblp_address(self):
    dblp_addresses = session.query(Journal.dblp_address)
    num = 0
    for dblp_address in dblp_addresses:
        if const.DBLP_JOURNAL_PREVIX in dblp_address[0]:
            print(dblp_address[0])
            num += 1
    print(num)
Example #7
def update_year_of_volumes():
    volumes = session.query(Volume).filter(Volume.id >= 34745)
    for volume in volumes:
        volume_id = volume.id

        paper_info = session.query(Paper)\
            .filter(Paper.volume_id == volume_id).first()
        if not paper_info:
            continue

        paper_dblp_id = paper_info.dblp_id

        paper_bibtex_url = \
            const.DBLP_JOURNAL_BIBTEX_PREVIX + paper_dblp_id + \
            const.DBLP_JOURNAL_BIBTEX_SUFFIX

        per_bibtex = get_paper_bibtex(paper_bibtex_url)
        volume_year = per_bibtex.get("year")
        volume_info = per_bibtex.get("volume")

        volume.year = volume_year
        volume.volume = volume_info
        session.commit()
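get_paper_bibtex is not part of the listing either; judging from its use above, it downloads the BibTeX record behind a dblp URL and returns it as a dict with keys such as "year" and "volume". A sketch of one possible implementation, assuming the third-party bibtexparser package (the original code may parse the record differently):

import bibtexparser


def get_paper_bibtex(paper_bibtex_url):
    # Download the BibTeX export of a dblp record and return the first
    # entry as a plain dict (keys like "year", "volume", "title", ...).
    req = requests.get(paper_bibtex_url, headers=HEADER)
    bib_database = bibtexparser.loads(req.text)
    if not bib_database.entries:
        return {}
    return bib_database.entries[0]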
Example #8
def find_volume_by_url(volume_url):
    return session.query(Volume).filter(Volume.url == volume_url).first()
Example #9
def get_all_journals_from_db():
    return session.query(Journal).all()
Example #10
def set_updated_status_of_volumes(volume_id):
    volume = session.query(Volume).filter(Volume.id == volume_id).first()
    volume.is_updated = True
    session.commit()
Example #11
def query_journal_is_in_volumes(journal_issn):
    result = session.query(Volume).filter(Volume.issn == journal_issn).first()
    return result is not None
Example #12
def query_volumes():
    volumes = session.query(Volume.issn).distinct().all()

    print(len(volumes))
Example #13
def query_volumes_by_filter(conditions, limit_num=1000):
    query = session.query(Volume).filter(conditions).limit(limit_num)
    return query
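The conditions argument is an arbitrary SQLAlchemy filter criterion, so several conditions can be combined with and_ before being passed in. A small usage sketch; the ISSN value is made up:

from sqlalchemy import and_

# Fetch up to 50 volumes of one journal that have not been processed yet.
conditions = and_(Volume.issn == "1234-5678", Volume.is_updated == False)
pending_volumes = query_volumes_by_filter(conditions, limit_num=50)
for volume in pending_volumes:
    print(volume.id, volume.url)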
Example #14
def collect_journal_papers():

    arx_journal_issn = "2331-8422"

    #    while True:
    # conditions = (Volume.is_updated == False)

    volume_id = 46770

    while volume_id <= 46770:

        conditions = (Volume.id == volume_id)
        new_volumes = analyze_volumes.query_volumes_by_filter(
            conditions)  #, limit_num=5)

        new_volume_infos = list()
        for new_volume in new_volumes:
            new_volume_infos.append({
                const.VOLUME_ID: new_volume.id,
                const.JOURNAL_ISSN: new_volume.issn,
                const.VOLUME_URL: new_volume.url
            })

    #   if not new_volume_infos:
    #       print("Finish the update!")
    #       break

        for new_volume_info in new_volume_infos:

            if new_volume_info[const.JOURNAL_ISSN] == arx_journal_issn:
                continue

            print("%s" % new_volume_info[const.VOLUME_URL])
            print("VOLUME ID: %s" % new_volume_info[const.VOLUME_ID])

            # if new_volume_info[const.VOLUME_ID] in [35956, 35957, 44428,
            #                                        44429, 44430]:
            #    continue

            new_paper_infos = analyze_papers.analyze_papers_of_volume(
                new_volume_info[const.VOLUME_URL])

            for new_paper_info in new_paper_infos:
                paper_query = \
                    session.query(Paper).filter(
                        Paper.dblp_id == new_paper_info[const.PAPER_DBLP_ID])\
                        .first()

                if paper_query is not None:
                    print("Database has the record [dblp_id: %s]" %
                          paper_query.dblp_id)
                    continue
                else:
                    print("Database has no record [dblp_id: %s]" %
                          new_paper_info[const.PAPER_DBLP_ID])

                new_paper = Paper(
                    title=new_paper_info[const.PAPER_TITLE],
                    journal_issn=new_volume_info[const.JOURNAL_ISSN],
                    volume_id=new_volume_info[const.VOLUME_ID],
                    volume=new_paper_info[const.PAPER_VOLUME],
                    number=new_paper_info[const.PAPER_NUMBER],
                    start_page=new_paper_info[const.PAPER_START_PAGE],
                    end_page=new_paper_info[const.PAPER_END_PAGE],
                    year=new_paper_info[const.PAPER_DATE],
                    url=new_paper_info[const.PAPER_URL],
                    doi=new_paper_info[const.PAPER_DOI],
                    dblp_id=new_paper_info[const.PAPER_DBLP_ID])
                session.add(new_paper)
                session.flush()
                session.refresh(new_paper)
                new_paper_id = new_paper.id

                # add authors
                author_infos = new_paper_info[const.PAPER_AUTHOR]
                order = 1
                for author_info in author_infos:
                    author_query = \
                        session.query(Author).filter(
                            Author.title == author_info["author_title"]).first()
                    if author_query:
                        author_id = author_query.id
                    else:
                        new_author = Author(
                            title=author_info["author_title"],
                            name=author_info["author_name"],
                            dblp_url=author_info["author_dblp_url"])

                        session.add(new_author)
                        session.flush()
                        session.refresh(new_author)
                        author_id = new_author.id

                    session.add(
                        PaperAuthor(paper_id=new_paper_id,
                                    author_id=author_id,
                                    order=order))
                    order += 1
                session.commit()

            volume = \
                session.query(Volume).filter(
                    Volume.id == new_volume_info[const.VOLUME_ID]).first()
            volume.is_updated = True
            session.commit()
            print("Volume - [ID: %s] has been processed!" %
                  new_volume_info[const.VOLUME_ID])

        volume_id = volume_id + 1
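analyze_papers.analyze_papers_of_volume is not shown in this listing. From the keys consumed above, each element of its result is expected to look roughly like the dict below; the const key names are taken from the snippet, the values are invented placeholders:

# Hypothetical shape of one entry returned by analyze_papers_of_volume().
example_paper_info = {
    const.PAPER_TITLE: "A Sample Paper Title",
    const.PAPER_VOLUME: "42",
    const.PAPER_NUMBER: "3",
    const.PAPER_START_PAGE: "100",
    const.PAPER_END_PAGE: "115",
    const.PAPER_DATE: "2020",
    const.PAPER_URL: "https://example.org/sample-paper",
    const.PAPER_DOI: "10.1000/xyz123",
    const.PAPER_DBLP_ID: "journals/example/Doe20",
    const.PAPER_AUTHOR: [
        {
            "author_title": "Jane Doe 0001",
            "author_name": "Jane Doe",
            "author_dblp_url": "https://dblp.org/pid/00/0000",
        },
    ],
}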
Example #15
def find_journal_by_issn(journal_issn):
    return session.query(Journal).filter(Journal.issn == journal_issn).first()
Example #16
def find_journal_by_title(journal_title):
    return session.query(Journal).filter(Journal.name == journal_title).first()
Example #17
from databases.db_engine import engine, session

from information_collection import analyze_journal
from information_collection import analyze_journal_urls
from information_collection.journal_info import Journal

from publish_lib import generate_random

print(session.query(Journal).count())
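databases.db_engine itself does not appear in the listing; it evidently exposes the engine and the shared session used by all snippets above. A minimal sketch of what such a module typically contains; the connection string is a placeholder:

# databases/db_engine.py (hypothetical reconstruction, not the original file)
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Placeholder connection string; the real project points this at its own database.
engine = create_engine("sqlite:///dblp.db", echo=False)

Session = sessionmaker(bind=engine)
session = Session()     # module-level session shared by the snippets above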