import argparse
from os import path


def main(unused):
    parser = argparse.ArgumentParser()
    parser.add_argument("embedding_path")
    parser.add_argument("vocabulary_dir")
    parser.add_argument("--training_dir")
    parser.add_argument("--validation_dir")
    parser.add_argument("--decode_dir")
    parser.add_argument("--decode_out_dir")
    parser.add_argument("--mode", choices=["train", "validate", "decode"],
                        default="train")
    parser.add_argument("--logdir")
    parser.add_argument("--batch_size", type=int, default=30)
    parser.add_argument("--validation_interval", type=int, default=20000)
    parser.add_argument("--beam_width", type=int, default=5)
    parser.add_argument("--max_output_length", type=int, default=32)
    parser.add_argument("--target_vocabulary_size", type=int, default=20000)
    parser.add_argument("--synthetic", action="store_true")
    parser.add_argument("--allow_gpu_growth", action="store_true")
    parser.add_argument("--collect_run_metadata", action="store_true")
    parser.add_argument("--log_weight_images", action="store_true")
    options = parser.parse_args()

    if options.mode == "decode":
        # Batching is not supported in decoding.
        options.batch_size = 1

    embedding_words, word_dict, word_embedding_dim = load_word_embeddings(
        options.embedding_path)

    vocabulary = Vocabulary()
    summary_vocabulary_path = path.join(options.vocabulary_dir,
                                        "summary_vocabulary.txt")
    vocabulary.add_from_file(
        summary_vocabulary_path,
        options.target_vocabulary_size - len(vocabulary.words))
    document_vocabulary_path = path.join(options.vocabulary_dir,
                                         "document_vocabulary.txt")
    # Add the most common words from the document vocabulary.
    vocabulary.add_from_file(document_vocabulary_path, 150000)
    # Add additional common words from the loaded embeddings.
    vocabulary.add_words(embedding_words[:100000])

    run(options, word_dict, word_embedding_dim, vocabulary)
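# ---------------------------------------------------------------------------
# NOTE: The snippets in this section come from different projects, each with
# its own Vocabulary class. The following is only a minimal sketch of the
# interface the call sites above and below appear to assume (add_word,
# add_words, add_from_file, compile, save); the names and semantics are
# inferred from the call sites, not taken from any one source project.
# ---------------------------------------------------------------------------
from collections import Counter


class Vocabulary:
    def __init__(self, name=None):
        self.name = name
        self.words = []           # unique words, in insertion order
        self._counts = Counter()  # raw occurrence counts, used by compile()

    def add_word(self, word):
        self._counts[word] += 1
        if word not in self.words:
            self.words.append(word)

    def add_words(self, words):
        for word in words:
            self.add_word(word)

    def add_from_file(self, filepath, limit):
        # Assumes one word per line, most frequent first.
        with open(filepath, encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= limit:
                    break
                self.add_word(line.strip())

    def compile(self, threshold):
        # Keep only words seen at least `threshold` times.
        self.words = [w for w in self.words if self._counts[w] >= threshold]

    def save(self, filepath):
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(str(w) for w in self.words))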
def build_vocabs():
    train, dev, test = load_boknilev()
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]
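# Hedged usage sketch for build_vocabs() above: the return value mixes two
# Vocabulary objects with a plain dict, so callers unpack positionally. The
# save() calls and paths assume the sketch interface near the top of this
# section, not this project's code.
gold_pos_vocab, words_vocab, words_to_lemmas = build_vocabs()
gold_pos_vocab.save("vocabs/gold_pos.txt")   # illustrative path
words_vocab.save("vocabs/words.txt")         # illustrative path
print(len(words_to_lemmas), "word -> lemma entries")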
def _create_and_save_vocab(self, word_sequences, vocab_threshold, model_dir,
                           vocab_filename):
    """Create a Vocabulary instance from a list of word sequences and save it
    to disk.

    Args:
        word_sequences: List of word sequences (sentences) to use as the basis
            for the vocabulary.
        vocab_threshold: Minimum number of times any word must appear within
            word_sequences in order to be included in the vocabulary.
        model_dir: Directory to save the vocabulary file to.
        vocab_filename: File name of the vocabulary file.
    """
    vocabulary = Vocabulary()
    for word_seq in word_sequences:
        vocabulary.add_words(word_seq.split())
    vocabulary.compile(vocab_threshold)

    vocab_filepath = path.join(model_dir, vocab_filename)
    vocabulary.save(vocab_filepath)
    return vocabulary
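# Worked trace of the thresholding step, using the sketch Vocabulary above;
# the sentences and threshold are made up. "the" and "cat" occur twice,
# everything else once, so compile(2) keeps exactly those two words.
sentences = ["the cat sat", "the cat ran quickly"]
vocabulary = Vocabulary()
for sentence in sentences:
    vocabulary.add_words(sentence.split())
vocabulary.compile(2)
print(vocabulary.words)  # -> ['the', 'cat']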
def _create_and_save_vocab(self, word_sequences, vocab_threshold, model_dir,
                           vocab_filename, embeddings_dir,
                           normalize_imported_vocab, vocab_import_mode):
    """Create a Vocabulary instance from a list of word sequences and save it
    to disk.

    Args:
        word_sequences: List of word sequences (sentences) to use as the basis
            for the vocabulary.
        vocab_threshold: Minimum number of times any word must appear within
            word_sequences in order to be included in the vocabulary.
        model_dir: Directory to save the vocabulary file to.
        vocab_filename: File name of the vocabulary file.
        embeddings_dir: Optional directory from which to import an external
            vocabulary & embeddings. If provided, the external vocabulary is
            imported and processed according to vocab_import_mode. If None,
            only the generated vocabulary is used.
        normalize_imported_vocab: See VocabularyImporter.import_vocabulary.
        vocab_import_mode: If embeddings_dir is specified, indicates whether
            the dataset vocabulary should be generated and combined with the
            external vocabulary according to the rules of VocabularyImportMode.
    """
    vocabulary = None
    if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
        # Build the vocabulary from the dataset unless an external vocabulary
        # is being imported exclusively.
        vocabulary = Vocabulary()
        for word_seq in word_sequences:
            vocabulary.add_words(word_seq.split())
        vocabulary.compile(vocab_threshold)

    vocabulary_import_stats = None
    if embeddings_dir is not None:
        vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(
            embeddings_dir)
        vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(
            embeddings_dir, normalize_imported_vocab, vocab_import_mode,
            vocabulary)

    vocab_filepath = path.join(model_dir, vocab_filename)
    vocabulary.save(vocab_filepath)
    return vocabulary, vocabulary_import_stats
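# Hedged call-site sketch for the extended signature above: in pure-External
# mode the dataset pass is skipped and the vocabulary comes entirely from the
# importer. `trainer`, the paths, and the flag values are illustrative
# stand-ins, not taken from the source project.
vocabulary, import_stats = trainer._create_and_save_vocab(
    word_sequences=[],                  # unused in pure-External mode
    vocab_threshold=1,                  # likewise unused here
    model_dir="models/demo",
    vocab_filename="shared_vocab.txt",
    embeddings_dir="embeddings/glove",  # triggers the importer path
    normalize_imported_vocab=True,
    vocab_import_mode=VocabularyImportMode.External,
)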
import os


def build_vocabs():
    tasks = [
        '.'.join([id, syn])
        for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']
    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE') or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task + '.json'
        for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files], [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab, lexcat_vocab
    ]
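# Hedged usage sketch for build_vocabs() above: every element of the returned
# list is a Vocabulary, so they can all be persisted in one pass. The save()
# call and path assume the sketch interface near the top of this section.
for vocab in build_vocabs():
    vocab.save('vocabs/%s.txt' % vocab.name)  # illustrative path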
def main():
    """Drives the program."""
    vocabulary = Vocabulary()
    running = True
    while running:
        menu_list = [
            "Check a word", "Add a word", "Add words", "Remove a word",
            "Remove words", "Print the word list",
            "Print the word list with definition", "Test your vocabulary",
            "Test your grammar"
        ]
        choice = user_input(
            menu_list, "Welcome to Learn English App!",
            "What would you like to do (Type 'quit' to exit the application): ")
        if choice == "quit":
            running = False
        elif choice == "1":
            the_word = input("What word do you want to check? : ")
            check_dictionary(vocabulary, the_word)
        elif choice == "2":
            new_word = input("Add a new word: ")
            # Validate before adding: reject words that are only whitespace.
            if new_word.strip() == "":
                print("Error. Your word only contains whitespace.")
            else:
                vocabulary.add_a_word(new_word)
        elif choice == "3":
            new_words = input("Add new words (separated by commas): ")
            vocabulary.add_words(new_words)
        elif choice == "4":
            try:  # Exception handling
                delete_word = input("Remove a word: ")
                vocabulary.remove_a_word(delete_word)
            except KeyError:
                print("Error: That word is not in the vocabulary.")
        elif choice == "5":
            try:  # Exception handling
                words = input(
                    "Enter the words you would like to remove (separated by commas): ")
                vocabulary.remove_words(words)
            except KeyError:
                print("Error: One or more of your words are not in the vocabulary.")
        elif choice == "6":
            if len(vocabulary.sort_words()) == 0:
                print("\nYour list is empty! Add some words to your list first!")
            else:
                print("\nHere is your word list")
                print_lists(vocabulary.sort_words())
        elif choice == "7":
            if len(vocabulary.sort_words()) == 0:
                print("\nYour list is empty! Add some words to your list first!")
            else:
                print("\nHere is your word list with definition")
                print_list_with_definition(vocabulary.sort_words())
        elif choice == "8":
            try:  # Exception handling
                test_yourself(vocabulary)
            except IndexError:
                print("The question list is empty.")
        elif choice == "9":
            grammar_test()