Example #1
0
def parse_semantic_scholar_corpus_file(path, database_path="aip.db"):
    """Load one Semantic Scholar corpus file into the paper database.

    The file is read lazily, one JSON object per line, using
    ``iterload_file_lines_gzip`` when ``path`` ends with ``"gz"`` and
    ``iterload_file_lines`` otherwise. Records that are ``None``, have no
    ``"venue"`` key, or whose venue stringifies to an empty string are
    skipped; every other record is passed to
    ``DatabaseManager.update_or_insert_paper``.

    Args:
        path: Location of the corpus file.
        database_path: Location handed to ``DatabaseManager``.

    Returns:
        ``True``, both when the database reports the file as already parsed
        and when parsing completes.

    Raises:
        KeyError: If a record with a venue lacks the ``"id"`` or ``"title"``
            key; these two fields are treated as mandatory.
    """
    database = DatabaseManager(location=database_path)
    # try/finally so the database is closed on the early return and on
    # errors as well, not only after a complete parse.
    try:
        file_hash, parsed = database.did_parse_file(path)
        if parsed:
            return True

        # The corpus files hold stacked JSON objects instead of one JSON
        # array, which JSON libraries refuse to load as a whole, so each
        # object is loaded lazily line by line.
        load_lines = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
        for publication in load_lines(path):
            if publication is None:  # Possibly a corrupt JSON line; skip it.
                continue

            if "venue" not in publication:  # Some records carry no venue at all.
                continue

            # The venue is sometimes not a string (e.g. an int), hence str().
            venue_string = str(publication['venue'])
            if not venue_string:
                continue

            # Empty for conferences; tolerate an absent or null value too.
            volume = str(publication.get('journalVolume') or "").replace(" ", "_")

            # A missing or null citation list counts as zero citations.
            num_citations = len(publication.get("inCitations") or [])

            doi = publication.get('doi')
            if not doi:
                # Fall back to whatever follows "doi.org/" in the DOI URL.
                doi_url = publication.get('doiUrl') or ""
                _, marker, remainder = doi_url.partition("doi.org/")
                if marker:
                    doi = remainder

            database.update_or_insert_paper(
                id=publication['id'],
                doi=doi,
                title=publication['title'],
                abstract=publication.get('paperAbstract', ""),
                raw_venue_string=venue_string,
                year=publication.get('year', -1),
                volume=volume,
                num_citations=num_citations,
            )

        # Only register the file once every record has been processed.
        database.add_parsed_file(file_hash)
    finally:
        database.close()
    return True
Example #2
0
def parse_aminer_corpus_file(path, database_path="aip", logger_disabled=False):
    """Load one AMiner corpus file into the paper database.

    The file is read lazily, one JSON object per line, using
    ``iterload_file_lines_gzip`` when ``path`` ends with ``"gz"`` and
    ``iterload_file_lines`` otherwise. Records that are ``None`` are skipped
    silently; records without a ``"venue"`` or ``"title"`` key are skipped
    with a warning. Every other record is passed to
    ``DatabaseManager.update_or_insert_paper``.

    Args:
        path: Location of the corpus file.
        database_path: Location handed to ``DatabaseManager``.
        logger_disabled: Value assigned to ``logger.disabled`` before parsing.

    Returns:
        ``True``, both when the database reports the file as already parsed
        and when parsing completes.

    Raises:
        KeyError: If a record with venue and title lacks the ``"id"`` key.
    """
    logger.disabled = logger_disabled
    database = DatabaseManager(location=database_path)
    # try/finally so the database is closed on the early return and on
    # errors as well, not only after a complete parse.
    try:
        file_hash, parsed = database.did_parse_file(path)
        if parsed:
            return True

        # The corpus files hold stacked JSON objects instead of one JSON
        # array, which JSON libraries refuse to load as a whole, so each
        # object is loaded lazily.
        load_lines = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines
        for publication in tqdm(load_lines(path)):
            if publication is None:  # Possibly a corrupt JSON line; skip it.
                continue

            # Warning: contrary to the documentation, the key is "venue",
            # NOT "venue.raw"!
            if 'venue' not in publication:
                logger.warning("Skipping line missing venue: %s in %s.",
                               publication, path)
                continue

            if 'title' not in publication:
                logger.warning("Skipping line missing title: %s in %s.",
                               publication, path)
                continue

            venue_string = publication['venue']
            # Sometimes the venue is itself a dict holding the raw string.
            if isinstance(venue_string, dict) and "raw" in venue_string:
                venue_string = venue_string["raw"]

            doi = publication.get('doi')
            if not doi:
                # The URL list sometimes contains a doi.org link; take the
                # DOI from the first such link.
                urls = publication.get('url') or []
                if isinstance(urls, str):
                    # A lone string would otherwise be scanned per character.
                    urls = [urls]
                for url in urls:
                    if not isinstance(url, str):
                        continue
                    _, marker, remainder = url.partition("doi.org/")
                    if marker:
                        doi = remainder
                        break

            database.update_or_insert_paper(
                id=publication['id'],
                doi=doi,
                title=str(publication['title']).rstrip("."),
                abstract=publication.get('abstract', ""),
                raw_venue_string=venue_string,
                year=publication.get('year'),
                volume=publication.get('volume'),
            )

        # Only register the file once every record has been processed.
        database.add_parsed_file(file_hash)
    finally:
        database.close()
    return True
Example #3
0
def parse(dblp_file, database_path="aip.db"):
    """Load the articles and proceedings of a DBLP XML dump into the database.

    The file is streamed with ``etree.iterparse`` (DTD loading and validation
    enabled). Only ``article``, ``inproceedings`` and ``proceedings`` elements
    are considered, and a record is stored only when it has a title, a
    parsable year and a venue (``booktitle``, else ``journal``). For stored
    records the authors, with their ``orcid`` attribute when present, are
    added as well.

    Args:
        dblp_file: Location of the DBLP XML file.
        database_path: Location handed to ``DatabaseManager``.

    Returns:
        ``True``, both when the database reports the file as already parsed
        and when parsing completes.
    """
    database = DatabaseManager(location=database_path)
    # try/finally so the database is closed on the early return and on
    # errors as well, not only after a complete parse.
    try:
        file_hash, parsed = database.did_parse_file(dblp_file)
        if parsed:
            return True

        counter = 0  # Numbers the synthetic ids of records without a 'key'.

        for _event, element in etree.iterparse(dblp_file,
                                               load_dtd=True,
                                               dtd_validation=True):
            if element.tag not in ('article', 'inproceedings', 'proceedings'):
                continue

            if 'key' in element.attrib:
                paper_id = str(element.attrib['key'])
            else:
                paper_id = "id" + str(counter)
                counter += 1

            title = None
            title_element = element.find('title')
            if title_element is not None:
                # Titles may contain inline markup; join all text fragments so
                # a title starting with a child tag does not become "None".
                title = "".join(title_element.itertext()).rstrip(".")

            year = None
            year_element = element.find('year')
            if year_element is not None:
                year_match = re.search(r'\d+', str(year_element.text))
                if year_match is not None:
                    year = int(year_match.group())
                    if 20 < year < 100:  # Weird cases like 92-93
                        year += 1900
                    elif year <= 20:  # Weird cases like '12 (20 -> 2020)
                        year += 2000

            volume = None
            volume_element = element.find('volume')
            if volume_element is not None:
                try:
                    volume = int(volume_element.text)
                except (TypeError, ValueError):
                    # Missing or non-numeric volume text.
                    volume = None

            # Prefer the booktitle; fall back to the journal only when no
            # booktitle element exists at all.
            venue_element = element.find('booktitle')
            if venue_element is None:
                venue_element = element.find('journal')
            venue = None
            if venue_element is not None and venue_element.text is not None:
                venue = str(venue_element.text)

            # Take the DOI from the first <ee> link that contains "doi.org/".
            doi = None
            for ee in element.findall('ee'):
                _, marker, remainder = (ee.text or "").partition("doi.org/")
                if marker:
                    doi = remainder
                    break

            if title is not None and year is not None and venue is not None:
                database.update_or_insert_paper(id=paper_id,
                                                doi=doi,
                                                title=title,
                                                abstract="",
                                                raw_venue_string=venue,
                                                year=year,
                                                volume=volume,
                                                num_citations=-1)

                # Collect (name, orcid) tuples for this paper's authors.
                authors = [
                    (author_element.text,
                     str(author_element.attrib['orcid'])
                     if "orcid" in author_element.attrib else None)
                    for author_element in element.findall('author')
                ]
                database.add_authors_for_article(authors=authors,
                                                 article_id=paper_id)

            # Drop the processed record's content to limit memory use.
            element.clear()

        # Only register the file once every record has been processed.
        database.add_parsed_file(file_hash)
    finally:
        database.close()
    return True