def main(self, args):
    """Translate an Open Corpus JSON-lines dump into the project's canonical
    json schema, then build a Corpus over it and pickle the result.

    Reads from ``self.input_path``, writes json lines to ``self.output_path``,
    and pickles the built corpus to ``DatasetPaths().get_pkl_path('oc')``.
    Records without a ``year`` field are skipped entirely.

    :param args: unused; kept for the command-line entry-point signature.
    """
    logging.info("Reading Open Corpus file from: {}".format(self.input_path))
    logging.info("Writing json file to: {}".format(self.output_path))
    dp = DatasetPaths()

    # Refuse to clobber existing outputs; input must already exist.
    assert os.path.exists(self.input_path)
    assert not os.path.exists(self.output_path)
    assert not os.path.exists(dp.get_pkl_path('oc'))

    with open(self.output_path, 'w') as f:
        for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)):
            # Papers without a publication year are dropped from the corpus.
            if 'year' not in obj:
                continue
            translated_obj = {
                FieldNames.PAPER_ID: obj['id'],
                FieldNames.TITLE_RAW: obj['title'],
                FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
                FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
                FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
                FieldNames.KEY_PHRASES: obj['keyPhrases'],
                FieldNames.OUT_CITATIONS: obj['outCitations'],
                FieldNames.URLS: obj['pdfUrls'],
                FieldNames.S2_URL: obj['s2Url'],
                FieldNames.VENUE: obj['venue'],
                FieldNames.YEAR: obj['year'],
                FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
                FieldNames.ABSTRACT: ' '.join(
                    global_tokenizer(obj['paperAbstract'])
                )
            }
            f.write(json.dumps(translated_obj))
            f.write("\n")
        # NOTE: the redundant f.close() inside the with-block was removed;
        # the context manager closes the file.

    oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc'))
    # BUG FIX: the original called open() with the default text-read mode
    # ('r') and never closed the handle. pickle.dump requires a binary
    # write handle ('wb'), so the original would fail at runtime.
    with open(dp.get_pkl_path('oc'), 'wb') as pkl_file:
        pickle.dump(oc_corpus, pkl_file)
def main(self, args):
    """Build a BM25 full-text index over a dataset's json corpus.

    Creates the index directory at
    ``DatasetPaths().get_bm25_index_path(self.dataset_name)`` and adds one
    document (id, title, abstract) per json line of the corpus file.

    :param args: unused; kept for the command-line entry-point signature.
    :raises FileExistsError: if the index directory already exists.
    """
    dp = DatasetPaths()
    corpus_json = dp.get_json_path(self.dataset_name)
    index_location = dp.get_bm25_index_path(self.dataset_name)

    # BUG FIX: the original guarded with `assert False`, which is stripped
    # under `python -O`; an explicit raise keeps the check unconditional
    # and reports the offending path.
    if os.path.exists(index_location):
        raise FileExistsError(
            "BM25 index already exists at: {}".format(index_location)
        )
    os.mkdir(index_location)

    bm25_index = create_in(index_location, schema)
    writer = bm25_index.writer()
    for doc in tqdm.tqdm(file_util.read_json_lines(corpus_json)):
        writer.add_document(
            id=doc['id'],
            title=doc['title'],
            abstract=doc['abstract']
        )
    # Commit once at the end; whoosh flushes all pending documents here.
    writer.commit()
def stream_papers(data_path, default_year=2017):
    """Yield one ProtoDoc per JSON line of *data_path*.

    Out-citations are deduplicated and self-citations removed; key phrases
    are deduplicated. The in-citation count is emitted as 0 here (it is not
    computable from a single record in a streaming pass).

    :param data_path: path to a json-lines corpus file.
    :param default_year: year to use when a record has no YEAR field
        (generalized from the previously hard-coded 2017; default preserves
        the original behavior).
    """
    for line_json in tqdm.tqdm(file_util.read_json_lines(data_path)):
        # Deduplicate citations and drop any self-citation.
        out_citations = set(line_json[FieldNames.OUT_CITATIONS])
        out_citations.discard(line_json[FieldNames.PAPER_ID])

        yield ProtoDoc(
            id=line_json[FieldNames.PAPER_ID],
            title=line_json[FieldNames.TITLE],
            abstract=line_json[FieldNames.ABSTRACT],
            out_citations=list(out_citations),
            # Always 0 in this pass; presumably filled in downstream —
            # TODO confirm against the consumer of these protos.
            in_citation_count=0,
            year=line_json.get(FieldNames.YEAR, default_year),
            key_phrases=list(set(line_json[FieldNames.KEY_PHRASES])),
            venue=line_json.get(FieldNames.VENUE, ''),
        )
input_path = '/data/split_opencorpus/1.json' output_path = '/data/temp/1.json' output_pkl_path = '/data/temp/1.pkl' logging.info("Reading Open Corpus file from: {}".format(input_path)) logging.info("Writing json file to: {}".format(output_path)) dp = DatasetPaths() assert os.path.exists(input_path) assert not os.path.exists(output_path) assert not os.path.exists(output_pkl_path) s = 0 with open(output_path, 'w') as f: for obj in tqdm.tqdm(file_util.read_json_lines(input_path)): if 'year' not in obj: continue translated_obj = { FieldNames.PAPER_ID: obj['id'], FieldNames.TITLE_RAW: obj['title'], FieldNames.ABSTRACT_RAW: obj['paperAbstract'], FieldNames.AUTHORS: [a['name'] for a in obj['authors']], FieldNames.IN_CITATION_COUNT: 0, FieldNames.KEY_PHRASES: obj['keyPhrases'], FieldNames.OUT_CITATIONS: obj['outCitations'], FieldNames.URLS: obj['pdfUrls'], FieldNames.S2_URL: obj['s2Url'], FieldNames.VENUE: obj['venue'], FieldNames.YEAR: obj['year'], FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),