def wiki_dump_from_gs():
    """Build a Wikipedia relation dump for the mentions vocabulary.

    Loads mention phrases from ``args.mentions``, looks each phrase up in an
    ElasticSearch-backed Wikipedia index when ``args.host``/``args.port``/
    ``args.index`` are all set (falling back to the online Wikipedia service
    otherwise, or when elastic returns no pages), registers every matching
    page via ``add_page`` and finally writes the accumulated ``result_dump``
    as JSON to ``args.output``.
    """
    logger.info('Starting, process will connect with ElasticSearch and online wikipedia site...')
    mentions_files = [args.mentions]
    dump_file = args.output
    vocab = load_mentions_vocab_from_files(mentions_files)
    if args.host and args.port and args.index:
        wiki_elastic = WikipediaRelationExtraction(
            WikipediaSearchMethod.ELASTIC, host=args.host, port=args.port, index=args.index)
    else:
        logger.info(
            'Running without Wikipedia elastic search, Note that this will '
            'take much longer to process only using online service')
        wiki_elastic = None
    wiki_online = WikipediaRelationExtraction(WikipediaSearchMethod.ONLINE)
    for phrase in vocab:
        # Strip quote/escape characters that would otherwise corrupt the query.
        phrase = phrase.replace("'", "").replace('"', "").replace('\\', "").strip()
        logger.info('Try to retrieve \'%s\' from elastic search', phrase)
        pages = None
        if wiki_elastic:
            pages = wiki_elastic.get_phrase_related_pages(phrase)
        # Fall back to the online service when elastic is disabled or found
        # nothing.  (The original also checked len(...) == 0, which is
        # redundant with the truthiness test and called get_pages() twice.)
        if not pages or not pages.get_pages():
            logger.info('Not on elastic, retrieve \'%s\' from wiki online site', phrase)
            pages = wiki_online.get_phrase_related_pages(phrase)
        for search_page in pages.get_pages():
            add_page(search_page, phrase)
    with open(dump_file, 'w') as myfile:
        json.dump(result_dump, myfile, default=json_dumper)
    logger.info('Saving dump to file-%s', dump_file)
def glove_dump():
    """Dump GloVe embeddings restricted to the mentions vocabulary.

    Loads the vocabulary from ``args.mentions`` (stop words kept), extracts
    the matching vectors from the GloVe file given by ``args.glove`` and
    pickles the resulting ``[word_to_ix, embeddings]`` pair to
    ``args.output``.
    """
    # Stop words are deliberately NOT filtered for the embedding dump.
    vocab = load_mentions_vocab_from_files([args.mentions], False)
    word_to_ix, embeddings = load_glove_for_vocab(args.glove, vocab)
    logger.info('Words in vocabulary %d', len(vocab))
    logger.info('Found %d words from vocabulary', len(word_to_ix.keys()))
    with open(args.output, 'wb') as dump_fh:
        pickle.dump([word_to_ix, embeddings], dump_fh)
    logger.info('Saving dump to file-%s', args.output)
def vo_dump():
    """Dump VerbOcean relations restricted to the mentions vocabulary.

    Loads the vocabulary from ``args.mentions`` (stop words filtered), reads
    the VerbOcean relation file from ``args.vo``, keeps only the entries
    whose key appears in the vocabulary and writes them as JSON to
    ``args.output``.
    """
    vocab = load_mentions_vocab_from_files([args.mentions], True)
    vo = VerboceanRelationExtraction.load_verbocean_file(args.vo)
    # Dict comprehension replaces the original manual filter loop.
    vo_for_vocab = {word: vo[word] for word in vocab if word in vo}
    logger.info('Found %d words from vocabulary', len(vo_for_vocab.keys()))
    logger.info('Preparing to save refDict output file')
    with open(args.output, 'w') as f:
        json.dump(vo_for_vocab, f)
    logger.info('Done saved to-%s', args.output)
def ref_dict_dump():
    """Dump referent-dict relations restricted to the mentions vocabulary.

    Loads the vocabulary from ``args.mentions`` (stop words filtered), reads
    the referent dictionary from ``args.ref_dict``, keeps only the entries
    whose key appears in the vocabulary and writes them as JSON to
    ``args.output``.
    """
    logger.info('Extracting referent dict dump, this may take a while...')
    vocab = load_mentions_vocab_from_files([args.mentions], True)
    ref_dict = ReferentDictRelationExtraction.load_reference_dict(args.ref_dict)
    # Dict comprehension replaces the original manual filter loop.
    ref_dict_for_vocab = {word: ref_dict[word] for word in vocab if word in ref_dict}
    logger.info('Found %d words from vocabulary', len(ref_dict_for_vocab.keys()))
    logger.info('Preparing to save refDict output file')
    with open(args.output, 'w') as f:
        json.dump(ref_dict_for_vocab, f)
    logger.info('Done saved to-%s', args.output)