def main(self, args):
    logging.info("Reading Open Corpus file from: {}".format(self.input_path))
    logging.info("Writing json file to: {}".format(self.output_path))

    dp = DatasetPaths()

    assert os.path.exists(self.input_path)
    assert not os.path.exists(self.output_path)
    assert not os.path.exists(dp.get_pkl_path('oc'))

    with open(self.output_path, 'w') as f:
        for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)):
            # Skip records that have no publication year.
            if 'year' not in obj:
                continue
            translated_obj = {
                FieldNames.PAPER_ID: obj['id'],
                FieldNames.TITLE_RAW: obj['title'],
                FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
                FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
                FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
                FieldNames.KEY_PHRASES: obj['keyPhrases'],
                FieldNames.OUT_CITATIONS: obj['outCitations'],
                FieldNames.URLS: obj['pdfUrls'],
                FieldNames.S2_URL: obj['s2Url'],
                FieldNames.VENUE: obj['venue'],
                FieldNames.YEAR: obj['year'],
                FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
                FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract'])),
            }
            f.write(json.dumps(translated_obj))
            f.write("\n")
    # The `with` block closes the file, so no explicit f.close() is needed.
    oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc'))
    # Open the pickle target in binary write mode; omitting the mode defaults
    # to read-only and would fail at runtime.
    with open(dp.get_pkl_path('oc'), 'wb') as pkl_file:
        pickle.dump(oc_corpus, pkl_file)
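
These snippets lean on citeomatic helpers (FieldNames, DatasetPaths, global_tokenizer, Corpus) that are not shown. For orientation, FieldNames is a constants holder mapping logical field names to JSON keys; below is a minimal sketch whose string values are assumptions for readability, not the project's actual definitions. The first three values are at least consistent with Example #2, which reads doc['id'], doc['title'], and doc['abstract'] back out of the translated corpus.

class FieldNames:
    # Hypothetical reconstruction; the real citeomatic class may differ.
    PAPER_ID = 'id'
    TITLE = 'title'
    TITLE_RAW = 'title_raw'
    ABSTRACT = 'abstract'
    ABSTRACT_RAW = 'abstract_raw'
    AUTHORS = 'authors'
    IN_CITATION_COUNT = 'in_citation_count'
    OUT_CITATIONS = 'out_citations'
    KEY_PHRASES = 'key_phrases'
    URLS = 'urls'
    S2_URL = 's2_url'
    VENUE = 'venue'
    YEAR = 'year'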
Example #2

def main(self, args):
    dp = DatasetPaths()

    corpus_json = dp.get_json_path(self.dataset_name)
    index_location = dp.get_bm25_index_path(self.dataset_name)

    # Refuse to overwrite an existing index; fail with a message rather
    # than a bare `assert False`.
    assert not os.path.exists(index_location), \
        "BM25 index already exists at {}".format(index_location)
    os.mkdir(index_location)

    bm25_index = create_in(index_location, schema)
    writer = bm25_index.writer()

    for doc in tqdm.tqdm(file_util.read_json_lines(corpus_json)):
        writer.add_document(id=doc['id'],
                            title=doc['title'],
                            abstract=doc['abstract'])

    writer.commit()
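
The module-level schema is not defined in the snippet, but the call signature of create_in matches whoosh.index.create_in. A minimal sketch of a compatible Whoosh schema, plus a query against the finished index, assuming Whoosh (whose default scorer is BM25F); the field options and the query text are illustrative:

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir
from whoosh.qparser import MultifieldParser

# Assumed field options: a stored, unique paper id plus searchable text fields.
schema = Schema(id=ID(stored=True, unique=True),
                title=TEXT(stored=True),
                abstract=TEXT)

index_location = '/path/to/bm25_index'  # directory created by main() above

# Query the index; Whoosh ranks hits with BM25F by default.
ix = open_dir(index_location)
with ix.searcher() as searcher:
    parser = MultifieldParser(['title', 'abstract'], schema=ix.schema)
    results = searcher.search(parser.parse('citation recommendation'), limit=10)
    for hit in results:
        print(hit['id'], hit.score)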
Example #3
def stream_papers(data_path):
    for line_json in tqdm.tqdm(file_util.read_json_lines(data_path)):
        citations = set(line_json[FieldNames.OUT_CITATIONS])
        citations.discard(line_json[FieldNames.PAPER_ID])  # remove self-citations
        citations = list(citations)

        # In-citation counts are not computed in this pass; default to zero.
        in_citation_count = 0

        # De-duplicate key phrases.
        key_phrases = list(set(line_json[FieldNames.KEY_PHRASES]))

        #auths = line_json[FieldNames.AUTHORS]

        yield ProtoDoc(
            id=line_json[FieldNames.PAPER_ID],
            title=line_json[FieldNames.TITLE],
            abstract=line_json[FieldNames.ABSTRACT],
            out_citations=citations,
            in_citation_count=in_citation_count,
            year=line_json.get(FieldNames.YEAR, 2017),
            key_phrases=key_phrases,
            venue=line_json.get(FieldNames.VENUE, ''),
        )
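
stream_papers yields protobuf ProtoDoc messages one at a time. A hedged sketch of consuming the generator, writing each message with a 4-byte length prefix, which is one common framing for streams of protobuf messages; the original pipeline's actual sink is not shown here, and write_proto_stream is a hypothetical helper:

import struct

def write_proto_stream(data_path, out_path):
    # Serialize each ProtoDoc with a little-endian 4-byte length prefix
    # so a reader can re-frame the stream message by message.
    with open(out_path, 'wb') as out:
        for doc in stream_papers(data_path):
            payload = doc.SerializeToString()
            out.write(struct.pack('<I', len(payload)))
            out.write(payload)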
Example #4
input_path = '/data/split_opencorpus/1.json'
output_path = '/data/temp/1.json'
output_pkl_path = '/data/temp/1.pkl'

logging.info("Reading Open Corpus file from: {}".format(input_path))
logging.info("Writing json file to: {}".format(output_path))

dp = DatasetPaths()

assert os.path.exists(input_path)
assert not os.path.exists(output_path)
assert not os.path.exists(output_pkl_path)

s = 0
with open(output_path, 'w') as f:
    for obj in tqdm.tqdm(file_util.read_json_lines(input_path)):
        if 'year' not in obj:
            continue
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            FieldNames.IN_CITATION_COUNT: 0,
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract'])),
        }
        f.write(json.dumps(translated_obj))
        f.write("\n")
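
Example #4 is the script-level variant of Example #1: the translation dictionary is identical except that IN_CITATION_COUNT is hard-coded to 0 here rather than computed from len(obj['inCitations']).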