Example #1
def build_test_corpus(source_file, target_file):
    try:
        # Remove any stale target database left over from a previous run.
        os.unlink(target_file)
    except OSError:
        pass

    with open(source_file, 'w') as tf:
        for i in range(100):
            json.dump(
                {
                    FieldNames.TITLE: ' '.join(random.sample(WORDS, 10)),
                    FieldNames.ABSTRACT: ' '.join(random.sample(WORDS, 1000)),
                    FieldNames.AUTHORS: [],
                    FieldNames.OUT_CITATIONS: [str(x) for x in random.sample(range(100), 2)],
                    FieldNames.IN_CITATION_COUNT: len([str(x) for x in random.sample(range(100), 2)]),
                    FieldNames.KEY_PHRASES: random.sample(WORDS, 3),
                    FieldNames.YEAR: 2011,
                    FieldNames.PAPER_ID: str(i),
                    FieldNames.VENUE: 'v-{}'.format(random.randint(1, 5))
                }, tf)
            tf.write('\n')

    Corpus.build(target_file, source_file)
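
build_test_corpus writes one JSON object per line, i.e. the JSON Lines layout that the other examples read back. A minimal, stdlib-only sketch of that round trip (the field names here are illustrative stand-ins for FieldNames):

import json

records = [{'paper_id': str(i), 'year': 2011} for i in range(3)]

# Write: one json.dump(...) per record, each terminated by a newline.
with open('corpus_sketch.json', 'w') as tf:
    for record in records:
        json.dump(record, tf)
        tf.write('\n')

# Read: each line is an independent JSON document.
with open('corpus_sketch.json') as fh:
    restored = [json.loads(line) for line in fh]
assert restored == records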
Example #2
def _verify(db_filename, corpus_json):
    try:
        Corpus.build(db_filename=db_filename, source_json=corpus_json)
    except Exception as e:
        logging.critical(
            "Failed to build corpus {} for file {}".format(
                db_filename, corpus_json))
        print(e)
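
Because _verify logs and swallows the exception instead of re-raising, it can be called over many corpus files without aborting on the first failure. A hypothetical driver (the file names are illustrative only):

# Hypothetical usage; these file names are not from the original project.
corpora = [('dblp.sqlite', 'dblp.json'), ('pubmed.sqlite', 'pubmed.json')]
for db_filename, corpus_json in corpora:
    _verify(db_filename, corpus_json)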
Example #3
    def main(self, args):
        logging.info("Reading Open Corpus file from: {}".format(
            self.input_path))
        logging.info("Writing json file to: {}".format(self.output_path))

        dp = DatasetPaths()

        assert os.path.exists(self.input_path)
        assert not os.path.exists(self.output_path)
        assert not os.path.exists(dp.get_pkl_path('oc'))

        with open(self.output_path, 'w') as f:
            for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)):
                if 'year' not in obj:
                    continue
                translated_obj = {
                    FieldNames.PAPER_ID: obj['id'],
                    FieldNames.TITLE_RAW: obj['title'],
                    FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
                    FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
                    FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
                    FieldNames.KEY_PHRASES: obj['keyPhrases'],
                    FieldNames.OUT_CITATIONS: obj['outCitations'],
                    FieldNames.URLS: obj['pdfUrls'],
                    FieldNames.S2_URL: obj['s2Url'],
                    FieldNames.VENUE: obj['venue'],
                    FieldNames.YEAR: obj['year'],
                    FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
                    FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract']))
                }
                f.write(json.dumps(translated_obj))
                f.write("\n")
        oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc'))
        with open(dp.get_pkl_path('oc'), 'wb') as pkl_file:
            # pickle requires a binary-mode file handle.
            pickle.dump(oc_corpus, pkl_file)
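
The final pickle.dump only works against a binary-mode handle. A self-contained sketch of that cache round trip (corpus_stub stands in for the real Corpus object):

import pickle

corpus_stub = {'n_docs': 100}

# pickle needs binary modes: 'wb' to write, 'rb' to read back.
with open('oc_corpus_sketch.pkl', 'wb') as fh:
    pickle.dump(corpus_stub, fh, pickle.HIGHEST_PROTOCOL)
with open('oc_corpus_sketch.pkl', 'rb') as fh:
    assert pickle.load(fh) == corpus_stub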
Example #4
s = 0  # record counter; the loop below stops after the first few records
with open(output_path, 'w') as f:
    for obj in tqdm.tqdm(file_util.read_json_lines(input_path)):
        if 'year' not in obj:
            continue
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            FieldNames.IN_CITATION_COUNT: 0,
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        s += 1
        if s == 10:
            # Debugging truncation: keep only the first few records.
            break
        f.write(json.dumps(translated_obj))
        f.write("\n")

dp = DatasetPaths()
oc_corpus = Corpus.build(dp.get_db_path('oc'), output_path)
with open(output_pkl_path, 'wb') as f:
    pickle.dump(oc_corpus, f, -1)
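
The manual s counter is fragile: it must be initialised before the loop, and because the break fires before the write, only the first nine records are kept. itertools.islice expresses the same truncation declaratively; a sketch, with a generator standing in for file_util.read_json_lines:

import itertools

def take(iterable, n):
    # Yield at most n items; replaces the counter-and-break idiom.
    return itertools.islice(iterable, n)

# Illustrative stream of parsed objects.
stream = ({'id': i} for i in itertools.count())
for obj in take(stream, 10):
    print(obj['id'])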
Example #5
    def main(self, args):

        if self.dataset_name == 'dblp':
            input_path = DatasetPaths.DBLP_GOLD_DIR
            output_path = DatasetPaths.DBLP_CORPUS_JSON
        elif self.dataset_name == 'pubmed':
            input_path = DatasetPaths.PUBMED_GOLD_DIR
            output_path = DatasetPaths.PUBMED_CORPUS_JSON
        else:
            assert False, "Unknown dataset name: {}".format(self.dataset_name)

        logging.info("Reading Gold data from {}".format(input_path))
        logging.info("Writing corpus to {}".format(output_path))
        assert os.path.exists(input_path)
        assert not os.path.exists(output_path)

        papers_file = os.path.join(input_path, "papers.txt")
        abstracts_file = os.path.join(input_path, "abstracts.txt")
        keyphrases_file = os.path.join(input_path, "paper_keyphrases.txt")
        citations_file = os.path.join(input_path, "paper_paper.txt")
        authors_file = os.path.join(input_path, "paper_author.txt")

        venues_file = os.path.join(input_path, "paper_venue.txt")

        paper_titles = {}
        paper_years = {}
        paper_abstracts = {}
        paper_keyphrases = {}
        paper_citations = {}
        paper_in_citations = {}
        paper_authors = {}
        paper_venues = {}

        # Papers with an empty abstract are recorded here and skipped later.
        bad_ids = set()
        for line in file_util.read_lines(abstracts_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            if len(parts) == 2:
                paper_abstracts[paper_id] = parts[1]
            else:
                paper_abstracts[paper_id] = ""

            if paper_abstracts[paper_id] == "":
                bad_ids.add(paper_id)

        for line in file_util.read_lines(papers_file):
            parts = line.split('\t')
            paper_id = int(parts[0])
            paper_years[paper_id] = int(parts[2])
            paper_titles[paper_id] = parts[3]

        for line in file_util.read_lines(keyphrases_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            if paper_id not in paper_keyphrases:
                paper_keyphrases[paper_id] = []

            for kp in parts[1:]:
                kp = kp.strip()
                if len(kp) > 0:
                    # Strip the last four characters of each keyphrase entry.
                    paper_keyphrases[paper_id].append(kp[:-4])

        for line in file_util.read_lines(citations_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            if paper_id not in paper_citations:
                paper_citations[paper_id] = []
            c = int(parts[1])
            if c in bad_ids:
                continue
            paper_citations[paper_id].append(str(c))

            if c not in paper_in_citations:
                paper_in_citations[c] = []
            if paper_id not in paper_in_citations:
                paper_in_citations[paper_id] = []

            paper_in_citations[c].append(paper_id)

        for line in file_util.read_lines(authors_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            if paper_id not in paper_authors:
                paper_authors[paper_id] = []

            paper_authors[paper_id].append(parts[1])

        for line in file_util.read_lines(venues_file):
            parts = line.split("\t")
            paper_id = int(parts[0])
            paper_venues[paper_id] = parts[1]

        test_paper_id = 13
        print("==== Test Paper Details ====")
        print(paper_titles[test_paper_id])
        print(paper_years[test_paper_id])
        print(paper_abstracts[test_paper_id])
        print(paper_keyphrases[test_paper_id])
        print(paper_citations[test_paper_id])
        print(paper_in_citations[test_paper_id])
        print(paper_authors[test_paper_id])
        print(paper_venues[test_paper_id])
        print("==== Test Paper Details ====")

        def _print_len(x, name=''):
            print("No. of {} = {}".format(name, len(x)))

        _print_len(paper_titles, 'Titles')
        _print_len(paper_years, 'Years')
        _print_len(paper_abstracts, 'Abstracts')
        _print_len(paper_keyphrases, 'KeyPhrases')
        _print_len(paper_citations, 'Paper Citations')
        _print_len(paper_in_citations, 'Paper In citations')
        _print_len(paper_authors, 'Authors')
        _print_len(paper_venues, 'Venues')

        logging.info("Skipped {} papers due to insufficient data".format(
            len(bad_ids)))

        corpus = {}
        for paper_id, title in tqdm.tqdm(paper_titles.items()):
            if paper_id in bad_ids:
                continue
            doc = document_from_dict({
                FieldNames.PAPER_ID: str(paper_id),
                FieldNames.TITLE: ' '.join(global_tokenizer(title)),
                FieldNames.ABSTRACT: ' '.join(global_tokenizer(paper_abstracts[paper_id])),
                FieldNames.OUT_CITATIONS: paper_citations.get(paper_id, []),
                FieldNames.YEAR: paper_years[paper_id],
                FieldNames.AUTHORS: paper_authors.get(paper_id, []),
                FieldNames.KEY_PHRASES: paper_keyphrases[paper_id],
                FieldNames.OUT_CITATION_COUNT: len(paper_citations.get(paper_id, [])),
                FieldNames.IN_CITATION_COUNT: len(paper_in_citations.get(paper_id, [])),
                FieldNames.VENUE: paper_venues.get(paper_id, ''),
                FieldNames.TITLE_RAW: title,
                FieldNames.ABSTRACT_RAW: paper_abstracts[paper_id]
            })
            corpus[paper_id] = doc

        with open(output_path, 'w') as f:
            for _, doc in corpus.items():
                doc_json = dict_from_document(doc)
                f.write(json.dumps(doc_json))
                f.write("\n")

        dp = DatasetPaths()
        Corpus.build(dp.get_db_path(self.dataset_name),
                     dp.get_json_path(self.dataset_name))
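
Each parsing loop above repeats the "if key not in dict: dict[key] = []" dance before appending. collections.defaultdict removes that boilerplate without changing behavior; a sketch over the same tab-separated citation format (the sample rows are illustrative):

from collections import defaultdict

paper_citations = defaultdict(list)
paper_in_citations = defaultdict(list)

rows = ["1\t2", "1\t3", "2\t3"]  # illustrative paper_paper.txt rows
for line in rows:
    src, dst = (int(part) for part in line.split("\t"))
    paper_citations[src].append(str(dst))
    paper_in_citations[dst].append(src)

assert paper_citations[1] == ['2', '3']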
Example #6
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print("Have to build the database! This may take a while, "
              "but should only happen once.")
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus,
                       is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME),
        overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
            overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
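
Step 3 above is a load-or-fit-then-cache pattern that recurs throughout these examples. A generic, stdlib-only sketch of the same idea (load_or_fit is a hypothetical helper, not part of the original code):

import os
import pickle

def load_or_fit(cache_path, fit_fn):
    # Return the cached object if it exists; otherwise compute and cache it.
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as fh:
            return pickle.load(fh)
    obj = fit_fn()
    with open(cache_path, 'wb') as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)
    return obj

# Example with an illustrative path and a cheap stand-in for Featurizer.fit.
featurizer_stub = load_or_fit('featurizer_sketch.pkl', lambda: {'n_features': 512})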