class Corpus(object):
    def __init__(self):
        self.documents = []
        self.vocab = Vocabulary()
        self.frozen = False

    def add(self, name, tokens):
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        # Pickles must be read in binary mode, matching the 'wb' used in save().
        return pickle.load(file(filename, 'rb'))

    def save(self, filename):
        pickle.dump(self, file(filename, 'wb'))
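# A minimal usage sketch for the Corpus class above, assuming Vocabulary maps
# tokens to integer ids via __getitem__ and Document takes (corpus, name, ids).
# The file name is hypothetical.
corpus = Corpus()
corpus.add('doc1', ['the', 'quick', 'brown', 'fox'])
corpus.add('doc2', ['the', 'lazy', 'dog'])
corpus.freeze()            # no further vocabulary growth; documents freeze too
corpus.save('corpus.pkl')
assert len(Corpus.load('corpus.pkl')) == 2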
def cleanUpText(self, text):
    cleanedWords = []
    # lowercase the text and split on spaces
    words = text.lower().split(' ')
    # get vocabulary
    vocab = Vocabulary()
    for word in words:
        # skip Portuguese stopwords
        # TODO: implement tokenizers for other languages
        if word not in vocab.getPTStopWords():
            cleanedWords.append(word)
    return cleanedWords
def __extract_vocabularies_from_data(self, classes):
    vocabularies = set()
    for c in classes:
        strings = self.__access_strings(c, '/train')
        vocabulary = Vocabulary(strings)
        curr_vocabulary = vocabulary.get_vocabulary()
        self.__write_vocabulary(c, curr_vocabulary)
        vocabularies |= curr_vocabulary  # set union: merge this class's vocabulary
    return sorted(vocabularies)
class Corpus(object):
    def __init__(self, documents=None, vocab=None, frozen=None):
        # `is not None` checks keep explicitly passed empty/falsy arguments intact.
        self.documents = documents if documents is not None else []
        self.vocab = vocab if vocab is not None else Vocabulary()
        self.frozen = frozen if frozen is not None else False

    def add(self, name, tokens):
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __getitem__(self, i):
        return self.documents[i]

    def __getslice__(self, i, j):
        # Python 2 only; Python 3 routes slices through __getitem__.
        return Corpus(self.documents[i:j], self.vocab, self.frozen)

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        # Binary mode is required for pickles, matching the "wb" in save().
        return pickle.load(file(filename, "rb"))

    def save(self, filename):
        pickle.dump(self, file(filename, "wb"))
class VocabularyTest(unittest.TestCase):
    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')
        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        self.assertIn(u'英雄三国', self.vocabulary.words.keys())
        self.assertIn(u'魔鬼代言人', self.vocabulary.words.keys())
        self.assertIn(u'黄河水利委员会', self.vocabulary.words.keys())
        self.assertNotIn(u'十大伪歌手', self.vocabulary.words.keys())
        self.assertNotIn(u'走路太牛', self.vocabulary.words.keys())

        self.assertEqual('n', self.vocabulary.get_pos(u'英雄三国'))
        self.assertEqual('n', self.vocabulary.get_pos(u'魔鬼代言人'))
        self.assertEqual('nt', self.vocabulary.get_pos(u'黄河水利委员会'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'十大伪歌手'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'走路太牛'))

    def test_gen_DAG(self):
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
def test_pronunciation_valid_phrase(self):
    current_result = vb.pronunciation("hippopotamus")
    result = '[{"rawType": "ahd-legacy", "raw": "(hĭpˌə-pŏtˈə-məs)", "seq": 0}, {"rawType": "arpabet", "raw": "HH IH2 P AH0 P AA1 T AH0 M AH0 S", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
def test_antonym_valid_phrase_2(self):
    current_result = vb.antonym("respect")
    result = '{"text": ["disesteem", "disrespect"]}'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
def test_translate_valid_phrase(self):
    current_result = vb.translate("hummus", "en", "es")
    result = '[{"text": "hummus", "seq": 0}]'
    # Compare against the parsed structure, as the other tests do; re-dumping
    # with json.dumps would compare the result against its JSON text.
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_partOfSpeech_valid_phrase_2(self):
    current_result = vb.part_of_speech("rapidly")
    result = '[{"text": "adverb", "example:": "With speed; in a rapid manner.", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_partOfSpeech_valid_phrase_1(self):
    current_result = vb.part_of_speech("hello")
    result = '[{"text": "interjection", "example:": "Used to greet someone, answer the telephone, or express surprise.", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_hyphenation_valid_phrase(self):
    current_result = vb.hyphenation("hippopotamus")
    result = '[{"seq": 0, "text": "hip", "type": "secondary stress"}, {"seq": 1, "text": "po"}, {"seq": 2, "text": "pot", "type": "stress"}, {"seq": 3, "text": "a"}, {"seq": 4, "text": "mus"}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_usageExamples_valid_phrase(self):
    current_result = vb.usage_example("hillock")
    result = '[{"seq": 0, "text": "I went to the to of the hillock to look around."}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_synonym_valid_phrase(self):
    current_result = vb.synonym("repudiate")
    result = '[{"seq": 0, "text": "deny"}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
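# Hedged sketch: the version branch above recurs in every test. A small helper
# like this (hypothetical, not part of the original suite) would keep the
# assertions DRY, e.g. assert_unordered_equal(self, current_result, expected_result).
def assert_unordered_equal(test_case, actual, expected):
    # assertItemsEqual was renamed to assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        test_case.assertItemsEqual(actual, expected)
    else:
        test_case.assertCountEqual(actual, expected)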
def synonyms(word):
    try:
        synonyms = ''
        result = json.loads(vb.synonym(word))
        for res in result:
            synonyms += res['text'] + ','
        return synonyms[:-1] + '\n'
    except Exception:
        return "N/A"
def meaning(word):
    try:
        parts = ''
        result = json.loads(vb.part_of_speech(word))
        for res in result:
            parts += res['text'] + ':' + res[u'example:'] + '\n\n'
        return parts
    except Exception:
        return "N/A"
def translate(text):
    try:
        translation = ''
        result = json.loads(vb.translate(text, "en", "hi"))
        for res in result:
            translation += res['text'] + ','
        return translation[:-1] + '\n'
    except Exception:
        return "N/A"
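# Hedged usage sketch for the three wrappers above, assuming `vb` is the
# imported `vocabulary` module and `json` is imported; the word is arbitrary.
word = 'benevolent'
print('Synonyms: ' + synonyms(word))
print('Meanings: ' + meaning(word))
print('Hindi: ' + translate(word))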
def main(): """.""" from vocabulary import Vocabulary from attribute import Attribute from attribute_structure import AttributeStructure from attribute_system import AttributeSystem vocabulary = Vocabulary(['C'], [], ['V']) a = Attribute("a", []) b = Attribute("b", []) astr = AttributeStructure(a, b) objs = ['a', 'b', 'c'] attribute_system = AttributeSystem(astr, objs) C = ConstantAssignment(vocabulary, attribute_system, {'C': 'a'}) print C._vocabulary vocabulary.add_constant("C2") print C._vocabulary
def get_example(word):
    try:
        examples = json.loads(vb.usage_example(word))
        example = ''
        limit = min(3, len(examples))
        for i in range(limit):
            example += examples[i]['text'] + '...'
        return example
    except Exception, e:
        print e, '\nFlag example'
        return ""
def setUp(self):
    self.document = Document(20)
    self.vocabulary = Vocabulary()
    self.vocabulary.load("../testdata/vocabulary.dat")
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    self.doc_tokens = ['macbook', 'ipad',     # exist in vocabulary and model
                       'mac os x', 'chrome',  # only exist in vocabulary
                       'nokia', 'null']       # nonexistent
class MaxProbSegmenterTest(unittest.TestCase):
    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../data/vocabulary.dat')
        self.hmm_segmenter = HMMSegmenter()
        self.hmm_segmenter.load('../data/hmm_segment_model')
        self.max_prob_segmenter = MaxProbSegmenter(
            self.vocabulary, self.hmm_segmenter)

    def call_segment(self, text):
        for word in self.max_prob_segmenter.segment(text):
            print word + '/\t',
        print ''

    def test_segment(self):
        fp = open('testdata/document.dat', 'rb')
        for text in fp.readlines():
            self.call_segment(text.strip())
        fp.close()
def usage_example(word):
    try:
        examples = ''
        result = json.loads(vb.usage_example(word))
        for res in result:
            examples += res['text'] + '\n\n'
        if len(examples) < 300:
            return examples
        else:
            return examples[:300]
    except Exception:
        return "N/A"
def get_meaning(word):
    try:
        meaning = json.loads(vb.meaning(word))
        means = ''
        limit = min(3, len(meaning))
        for i in range(limit):
            means += meaning[i]['text'] + ';'
        return means
    except Exception, e:
        print e
        return ""
def __init__(self, args, src_file, trg_file):
    self.src_vocabulary = Vocabulary()
    self.src_vocabulary.make_dictionary(src_file)
    self.trg_vocabulary = Vocabulary()
    self.trg_vocabulary.make_dictionary(trg_file)
    self.src_size = len(self.src_vocabulary.wtoi)
    self.embed_size = args.embed_size
    self.hidden_size = args.hidden_size
    self.trg_size = len(self.trg_vocabulary.wtoi)
    super(EncoderDecoder, self).__init__(
        # encoder
        w_xe=F.EmbedID(self.src_size, self.embed_size),
        w_ep=F.Linear(self.embed_size, self.hidden_size * 4),
        w_pp=F.Linear(self.hidden_size, self.hidden_size * 4),
        # decoder
        w_ey=F.EmbedID(self.trg_size, self.embed_size),
        w_qe=F.Linear(self.embed_size, self.hidden_size * 4),
        w_qq=F.Linear(self.hidden_size, self.hidden_size * 4),
        w_yq=F.Linear(self.hidden_size, self.trg_size),
    )
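# Hedged construction sketch for the EncoderDecoder above: the constructor
# only reads args.embed_size and args.hidden_size, so a plain Namespace
# suffices. The corpus paths are hypothetical.
from argparse import Namespace

args = Namespace(embed_size=256, hidden_size=512)
model = EncoderDecoder(args, 'data/train.src', 'data/train.trg')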
def get_context(text):
    """
    Try to get a usage context for a card.
    :param text:
    """
    try:
        m = json.loads(vb.usage_example(text))
        if len(m) > 0:
            return m[0]['text']
        return u''
    except Exception as ex:
        error(u'', ex)
        return u''
def get_meaning(text, lang):
    """
    Try to get a meaning for a card.
    :param text:
    :param lang:
    :return:
    """
    try:
        m = json.loads(vb.meaning(text, lang, lang))
        if len(m) > 0:
            return m[0]['text']
        return u''
    except Exception as ex:
        error(u'', ex)
        return u''
def generate_dataset(items, slots, voca: Vocabulary):
    dataset = Dataset()
    for item in items:
        vectors = []
        for word in item[0].split():
            vectors.append(voca.get(word))
        labels = []
        for tag in item[1].split():
            # One-hot label over the slot inventory.
            value = np.zeros([len(slots)], dtype=np.float32)
            value[slots.index(tag)] = 1
            labels.append(value)
        dataset.add(item[0], item[1], vectors, labels)
    return dataset
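# Hedged usage sketch: each item pairs a sentence with space-separated slot
# tags, one tag per token, and `slots` lists every tag name. `voca` is assumed
# to be a loaded Vocabulary whose get() returns a word vector.
items = [('book a flight', 'O O B-object')]
slots = ['O', 'B-object']
dataset = generate_dataset(items, slots, voca)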
def open(self, corpus_dir):
    self.root_dir = corpus_dir
    if not path.isdir(corpus_dir):
        os.mkdir(corpus_dir)

    self.meta_dir = self.root_dir + "/meta"
    self.samples_dir = self.root_dir + "/samples"
    if not path.isdir(self.samples_dir):
        os.mkdir(self.samples_dir)

    self.vocabulary_dir = self.root_dir + "/vocabulary"
    self.vocabulary = Vocabulary(self.vocabulary_dir)

    self.categories_dir = self.root_dir + "/categories"
    self.categories = Categories(self.categories_dir)
    self.categories.load_categories()
    self.categories.print_categories()
def test_meaning_valid_phrase(self):
    current_result = vb.meaning("humming")
    result = '[{"seq": 0, "text": "Present participle of hum."}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):  # Python 2
        self.assertItemsEqual(current_result, expected_result)
    else:  # Python 3
        # assertItemsEqual() was renamed to assertCountEqual().
        # Why am I not using assertEqual() here? References:
        # - http://stackoverflow.com/a/7473137/3834059
        # - https://docs.python.org/2/library/unittest.html#unittest.TestCase.assertItemsEqual
        # - https://docs.python.org/3/library/unittest.html?highlight=assertcountequal#unittest.TestCase.assertCountEqual
        self.assertCountEqual(current_result, expected_result)
def main():
    os.makedirs(os.path.join(args.logdir, 'models'))
    vocab = Vocabulary(os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
    print(f"# entity in dataset: {len(vocab)}")

    if not os.path.exists(args.cache):
        STOPWORD_PATH = os.path.join(args.dataroot, "previous/stopwords.txt")
        SYMBOL_PATH = os.path.join(args.dataroot, "previous/symbols.txt")
        with open(STOPWORD_PATH, 'r') as f:
            stop_words = set([line.strip() for line in f])
        with open(SYMBOL_PATH, 'r') as f:
            symbols = set([line.strip() for line in f])
        stop_words = stop_words.union(symbols)

        # Pre-trained word embedding
        wiki2vec = Wikipedia2Vec.load(args.wiki2vec)

        context_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'context_entity_word_co_occur.txt')
        context_positive_words = filter_positive_words(
            context_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )
        page_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'page_entity_word_co_occur.txt')
        page_positive_words = filter_positive_words(
            page_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )

        word_count_path = os.path.join(args.wiki_preprocess, 'word_count.json')
        negative_words, negative_freqs = \
            filter_negative_words(
                word_count_path,
                stop_words,
                wiki2vec,
                freq_power=0.6
            )

        (page_positive_words, context_positive_words,
         negative_words, vecs) = get_reduced_embedding(page_positive_words,
                                                       context_positive_words,
                                                       negative_words,
                                                       wiki2vec)
        del wiki2vec
        os.makedirs(os.path.dirname(args.cache), exist_ok=True)
        pickle.dump((page_positive_words, context_positive_words,
                     negative_words, negative_freqs, vecs),
                    open(args.cache, 'wb'))
    else:
        print(f"Load cache {args.cache}")
        (page_positive_words, context_positive_words, negative_words,
         negative_freqs, vecs) = pickle.load(open(args.cache, 'rb'))

    page_non_empty_index = [
        i for i, positive_words in enumerate(page_positive_words)
        if len(positive_words) != 0
    ]
    context_non_empty_index = [
        i for i, positive_words in enumerate(context_positive_words)
        if len(positive_words) != 0
    ]
    non_empty_index = set(page_non_empty_index + context_non_empty_index)
    print(f'# entity in vocab  : {len(vocab) - 1:d}')
    print(f'# non empty page   : {len(page_non_empty_index):d}')
    print(f'# non empty context: {len(context_non_empty_index):d}')
    print(f'# non empty        : {len(non_empty_index):d}')

    word_embedding = nn.Embedding.from_pretrained(torch.tensor(vecs))
    word_embedding = word_embedding.to(device)

    entity_embedding = nn.Embedding(len(vocab) - 1, vecs.shape[1])
    nn.init.normal_(entity_embedding.weight, mean=0, std=1.)
    with torch.no_grad():
        # Zero out entities that have no positive words at all.
        for idx in range(len(vocab) - 1):
            if idx not in non_empty_index:
                entity_embedding.weight[idx] = 0.
    entity_embedding = entity_embedding.to(device)

    optimizer = torch.optim.Adagrad(entity_embedding.parameters(), lr=args.lr)

    dataset = ContrastiveDataset(page_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, page_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase1'))
    print('Phase 1')
    train(word_embedding, entity_embedding, optimizer, dataset, writer,
          start_epochs=1, end_epochs=args.phase1_epochs)

    dataset = ContrastiveDataset(context_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, context_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase2'))
    print('Phase 2')
    train(word_embedding, entity_embedding, optimizer, dataset, writer,
          start_epochs=args.phase1_epochs + 1, end_epochs=args.phase2_epochs)
class Main:
    def main(self):
        clearCli()
        self.vocabulary = Vocabulary()
        vocabulary = self.vocabulary
        vocabulary.buildVocabulary()
        isValidCommand = True
        while True:
            quiz = Quiz(vocabulary)
            clearCli()
            print(CLI.main_menu)
            if not isValidCommand:
                print(CLI.invalid_command)
            command = input()
            isValidCommand = command in ['sa', 's', 'sl', 'q', 'j1', 'j2',
                                         'j3', 'la', 't', 'o']
            if isValidCommand:
                if command == 'sa':
                    print('Starting quiz!\n\n')
                    language = self.selectLanguage()
                    quiz.startall(language)
                elif command == 's':
                    numQuestions = self.selectNumQuestions()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, numQuestions)
                elif command == 'sl':
                    startLesson, endLesson = self.selectLessons()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, startLesson=startLesson, endLesson=endLesson)
                elif command == 'j1':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 1 vocabulary!\n\n')
                    quiz.start(language, startLesson=1, endLesson=10)
                elif command == 'j2':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 2 vocabulary!\n\n')
                    quiz.start(language, startLesson=11, endLesson=20)
                elif command == 'j3':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 3 vocabulary!\n\n')
                    quiz.start(language, startLesson=21, endLesson=32)
                elif command == 'o':
                    print('Starting open ended quiz')
                    startLesson, endLesson = self.selectLessons()
                    quiz.start_open_ended(startLesson=startLesson, endLesson=endLesson)
                elif command == 'q':
                    print('Quitting program')
                    break
                elif command == 'la':
                    print('Listing all vocabulary')
                    vocabulary.printWholeVocabulary()
                elif command == 't':
                    print('Test')
                    self.testFunction()

    def testFunction(self):
        kksi = kakasi()
        kksi.setMode("J", "H")
        conv = kksi.getConverter()
        all_hiragana = 'がくせい'
        partial_hiragana1 = '学せい'
        partial_hiragana2 = 'がく生'
        all_kanji = '学生'
        print(conv.do(all_hiragana))
        print(conv.do(partial_hiragana1))
        print(conv.do(partial_hiragana2))
        print(conv.do(all_kanji))
        print(conv.do(all_hiragana) == conv.do(partial_hiragana1)
              == conv.do(partial_hiragana2) == conv.do(all_kanji))
        input()

    def selectNumQuestions(self):
        clearCli()
        print('How many questions?')
        while True:
            value = input()
            isANumber = value.isnumeric()
            isNumberWithinVocabSize = isANumber and \
                1 <= int(value) <= self.vocabulary.getVocabularySize()
            if isNumberWithinVocabSize:
                break
            clearCli()
            print('How many questions?')
            if not isANumber:
                print('Not a number')
            elif isANumber and not isNumberWithinVocabSize:
                print('Invalid value. Vocabulary size is',
                      self.vocabulary.getVocabularySize())
        numQuestions = int(value)
        return numQuestions

    def selectLanguage(self):
        clearCli()
        print('Language of questions? (jp/en)')
        while True:
            value = input()
            isValidInput = value == 'jp' or value == 'en'
            if isValidInput:
                break
            clearCli()
            print('Language of questions? (jp/en)')
            print(CLI.invalid_command)
        return value

    def selectLessons(self):
        clearCli()
        print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
        while True:
            value = input()
            isValidInput = value == 's' or value == 'r'
            if isValidInput:
                break
            clearCli()
            print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
            print(CLI.invalid_command)
        if value == 's':
            startLesson, endLesson = self.selectSingleLesson()
        elif value == 'r':
            startLesson, endLesson = self.selectRangeOfLessons()
        return startLesson, endLesson

    def selectSingleLesson(self):
        clearCli()
        print('Type lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type lesson number?')
            print('Lesson does not exist')
        selectedLesson = value
        return selectedLesson, selectedLesson

    def selectRangeOfLessons(self):
        clearCli()
        print('Type start lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type start lesson number?')
            print('Lesson does not exist')
        startLesson = value
        clearCli()
        print('Start lesson: ', startLesson)
        print('Type end lesson number?')
        while True:
            value = input()
            isLargerThanStart = int(value) > int(startLesson)
            doesLessonExist = self.vocabulary.hasLesson(int(value))
            isValidInput = isLargerThanStart and doesLessonExist
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Start lesson: ', startLesson)
            print('Type end lesson number?')
            if not doesLessonExist:
                print('Lesson does not exist')
            elif not isLargerThanStart:
                print('End lesson should be greater than start lesson')
        endLesson = value
        assert int(endLesson) > int(startLesson)
        return startLesson, endLesson
def main(args):
    assert FLAGS.training_data_loader, "--training_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.train_dir, "--train_dir is required"

    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()

    # Assign parameters to model configuration.
    model_config.vocab_size = vocab_size
    training_config.train_dir = FLAGS.train_dir
    training_config.num_iterations = FLAGS.number_of_steps
    training_config.log_every_n_steps = FLAGS.log_every_n_steps
    training_config.validation_loss_every_n_steps = FLAGS.validation_loss_every_n_steps

    # Create training directory.
    if not tf.gfile.IsDirectory(training_config.train_dir):
        tf.logging.info("Creating training directory: %s",
                        training_config.train_dir)
        tf.gfile.MakeDirs(training_config.train_dir)

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model...')
        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="train")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config, mode="train")
        model.build()

        # Setup learning rate decay.
        num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                 model_config.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          training_config.num_epochs_per_decay)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(
            training_config.initial_learning_rate,
            global_step,
            decay_steps=decay_steps,
            decay_rate=training_config.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Setup optimizer.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train = optimizer.minimize(model.total_loss, global_step=global_step)

        # Setup summary.
        all_summary = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train')
        val_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/val')

        # Create saver.
        saver = tf.train.Saver(
            max_to_keep=training_config.max_checkpoints_to_keep)

        # Initialize variables.
        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Initializing data loader for training set...')
        start = time.time()
        data_loader_train = DataLoader()
        data_loader_train.load(FLAGS.training_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' % time_elapsed)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' % time_elapsed)

        print('Start training...')
        # Stochastic gradient descent over sampled mini-batches.
        for i in range(training_config.num_iterations):
            print('Sampling mini-batch...')
            image_features, input_sequence, input_mask, target_sequence = \
                data_loader_train.segmental_sampling(
                    batch_size=training_config.batch_size,
                    num_segments=model_config.num_segments)
            _, total_loss, summary = sess.run(
                (train, model.total_loss, all_summary),
                feed_dict={
                    "input_features:0": image_features,
                    "input_feed:0": input_sequence,
                    "input_mask:0": input_mask,
                    "target_sequences:0": target_sequence
                })
            train_writer.add_summary(summary, i)

            # Logging.
            if i % training_config.log_every_n_steps == 0:
                print('[%d/%d] loss: %f' % (i, training_config.num_iterations,
                                            total_loss))

            # Save model.
            if i % training_config.save_every_n_steps == 0:
                print('Saving model at step %d...' % i)
                saver.save(sess, FLAGS.train_dir + '/model', global_step=i)

            # Evaluate the loss on the validation set at every epoch.
            if i % training_config.validation_loss_every_n_steps == 0:
                image_features, input_sequence, input_mask, target_sequence = \
                    data_loader_val.segmental_sampling(
                        batch_size=training_config.batch_size,
                        num_segments=model_config.num_segments)
                total_loss, summary = sess.run(
                    (model.total_loss, all_summary),
                    feed_dict={
                        "input_features:0": image_features,
                        "input_feed:0": input_sequence,
                        "input_mask:0": input_mask,
                        "target_sequences:0": target_sequence
                    })
                val_writer.add_summary(summary, i)
def load_view(view_name: str):
    """Return a given view from a UI file."""
    return ui.load_view(os.path.join(UI_DIR, view_name))


if __name__ == '__main__':
    # This `builtins` trick fixes a problem where launching the script from
    # the home screen can cause multiple instances to run at once.
    # https://forum.omz-software.com/topic/4097/home-screen-alias-is-script-already-running/
    try:
        (vocab, jinja2env, lookup_view, word_view, compact_word_view,
         about_view, container) = builtins.wordroom
    except (AttributeError, ValueError):
        container = None
    if isinstance(container, ui.View) and container.on_screen:
        pass  # reuse the original globals
    else:
        # initialize new globals
        vocab = Vocabulary(data_file=VOCABULARY_FILE)
        jinja2env = Environment(loader=FileSystemLoader(HTML_DIR))
        lookup_view = load_view('lookup')
        word_view = load_view('word')
        compact_word_view = load_view('word')
        about_view = load_view('about')
        container = AdaptiveView(lookup_view, word_view)
        container.name = 'WordRoom'
        container.present('fullscreen', hide_title_bar=True)
        builtins.wordroom = (vocab, jinja2env, lookup_view, word_view,
                             compact_word_view, about_view, container)
    # if appex.is_running_extension():
    #     load_word_view(appex.get_text())
def chat(self, question, chat_settings):
    """Chat with the chatbot model by predicting an answer to a question.

    'question' and 'answer' in this context are generic terms for the
    interactions in a dialog exchange and can be statements, remarks, queries,
    requests, or any other type of dialog speech. For example:
        Question: "How are you?"    Answer: "Fine."
        Question: "That's great."   Answer: "Yeah."

    Args:
        question: The input question for which the model should predict an answer.
        chat_settings: The ChatSettings instance containing the chat settings
            and inference hyperparameters.

    Returns:
        q_with_hist: question with history if
            chat_settings.show_question_context = True, otherwise None.
        answers: array of answer beams if chat_settings.show_all_beams = True,
            otherwise the single selected answer.
    """
    # Process the question by cleaning it and converting it to an integer encoded vector.
    question = Vocabulary.clean_text(question)
    question = self.input_vocabulary.words2ints(question)

    # Prepend the currently tracked steps of the conversation history separated
    # by EOS tokens. This allows for deeper dialog context to influence the
    # answer prediction.
    question_with_history = []
    for i in range(len(self.conversation_history)):
        question_with_history += self.conversation_history[i] + \
            [self.input_vocabulary.eos_int()]
    question_with_history += question

    # Get the answer prediction.
    batch = np.zeros((1, len(question_with_history)))
    batch[0] = question_with_history
    max_output_sequence_length = \
        chat_settings.inference_hparams.max_answer_words + 1  # + 1 since the EOS token is counted as a timestep
    predicted_answer_info = self.predict_batch(
        inputs=batch,
        input_sequence_length=np.array([len(question_with_history)]),
        max_output_sequence_length=max_output_sequence_length,
        beam_length_penalty_weight=chat_settings.inference_hparams.beam_length_penalty_weight,
        sampling_temperature=chat_settings.inference_hparams.sampling_temperature,
        log_summary=chat_settings.inference_hparams.log_summary)

    # Read the answer prediction.
    answer_beams = []
    if self.beam_width > 0:
        # For beam search decoding: if show_all_beams is enabled, output all
        # beams (sequences); otherwise take the first beam. The beams (in the
        # "predictions" matrix) are ordered with the highest ranked beams first.
        beam_count = 1 if not chat_settings.show_all_beams else \
            len(predicted_answer_info["predictions_seq_lengths"][0])
        for i in range(beam_count):
            predicted_answer_seq_length = \
                predicted_answer_info["predictions_seq_lengths"][0][i] - 1  # -1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][0][
                :predicted_answer_seq_length, i].tolist()
            answer_beams.append(predicted_answer)
    else:
        # For greedy / sampling decoding: only one beam (sequence) is returned,
        # based on the argmax for greedy decoding or the sampling distribution
        # for sampling decoding. Return this beam.
        beam_count = 1
        predicted_answer_seq_length = \
            predicted_answer_info["predictions_seq_lengths"][0] - 1  # -1 to exclude the EOS token
        predicted_answer = predicted_answer_info["predictions"][0][
            :predicted_answer_seq_length].tolist()
        answer_beams.append(predicted_answer)

    # Add new conversation steps to the end of the history and trim from the
    # beginning if it is longer than conv_history_length.
    self.conversation_history.append(question)
    self.conversation_history.append(answer_beams[0])
    self.trim_conversation_history(
        chat_settings.inference_hparams.conv_history_length)

    # Convert the answer(s) to text and return.
    answers = []
    for i in range(beam_count):
        answer = self.output_vocabulary.ints2words(answer_beams[i])
        answers.append(answer)

    q_with_hist = None if not chat_settings.show_question_context else \
        self.output_vocabulary.ints2words(question_with_history)
    if chat_settings.show_all_beams:
        return q_with_hist, answers
    else:
        return q_with_hist, answers[0]
def train(trainFile, devFile, gramsNumber, smoothStrategy, BLaplace):
    # process data
    with open(trainFile, "r") as f:
        corpusTrain = f.readlines()
    with open(devFile, "r") as f:
        corpusDev = f.readlines()
    corpusTrainDev = corpusTrain + corpusDev

    if smoothStrategy == "laplace":
        vocab = Vocabulary(gramsNumber, corpusTrainDev)
        vocab.tune_with_Laplace_smoothing(BLaplace)
    elif smoothStrategy == "held_out":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_held_out_smoothing(corpusDev)
    elif smoothStrategy == "cross_valid":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_cross_val_smoothing(corpusDev)
    elif smoothStrategy == "good_turing":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_good_turing_smoothing()
    else:
        raise KeyError(smoothStrategy)
    return vocab
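# Hedged usage sketch for train() above; the corpus paths are hypothetical
# and the smoothing strategy is one of the four branches handled by train().
vocab = train('data/train.txt', 'data/dev.txt',
              gramsNumber=3, smoothStrategy='laplace', BLaplace=1.0)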
    return avi_data, targets, lengths


if __name__ == '__main__':
    import time
    from torch.autograd import Variable
    from torch.nn.utils.rnn import pack_padded_sequence
    from checkpoint import *

    json_file = 'data/testing_label.json'
    numpy_file = 'data/testing_data/feat'
    helper = Vocabulary(json_file, min_word_count=5)
    dataset = TrainingDataset(label_json_file=json_file,
                              training_data_path=numpy_file,
                              helper=helper,
                              load_into_ram=True)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True,
                            num_workers=8, collate_fn=collate_fn)

    ss = time.time()
    for epoch in range(1):
        s = time.time()
        print('epoch: {}'.format(epoch + 1))
        for batch_n, batch in enumerate(dataloader):
            pass  # timing body elided in the source
            # e = time.time()
class ChatBot:
    def __init__(self, layers=5, maxlen=10, embedding_size=128,
                 batch_size=32, is_train=True, lr=0.0001):
        self.layers = layers
        self.maxlen = maxlen
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.learning_rate = lr
        # .npz is the NumPy archive format; the seq2seq weights are saved here.
        self.model_path = "model/chatbot/model.npz"

        ## Vocabulary
        self.vocab = Vocabulary(corpus=None, maxlen=maxlen)
        self.vocab_size = self.vocab.vocab_size

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        tf.reset_default_graph()
        self.sess = tf.Session(config=sess_config)

        ## Placeholders
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_outputs = tf.placeholder(tf.int32, shape=[None, None])
        self.mask = tf.placeholder(tf.int32, shape=[None, None])

        ## Model
        self.net_out, _ = self.create_model(
            self.encoder_inputs, self.decoder_inputs, self.vocab_size,
            self.embedding_size, reuse=False)
        self.net_out.print_params(False)
        self.loss = tl.cost.cross_entropy_seq_with_mask(
            logits=self.net_out.outputs,
            target_seqs=self.decoder_outputs,
            input_mask=self.mask,
            return_details=False,
            name='cost')

        ## Optimizer
        self.train_op = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate).minimize(self.loss)

    def train(self, X, Y, num_epochs=1):
        ## Init Vars
        self.sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=self.sess, name=self.model_path,
                                     network=self.net_out)

        n_step = len(X) // self.batch_size
        for epoch in range(num_epochs):
            X, Y = shuffle(X, Y, random_state=0)
            total_loss, n_iter = 0, 0
            for x, y in tqdm(tl.iterate.minibatches(inputs=X, targets=Y,
                                                    batch_size=self.batch_size,
                                                    shuffle=False),
                             total=n_step,
                             desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs),
                             leave=False):
                x1, x2, y1, W = self.vocab.dataset(x, y)
                feed_data = {}
                feed_data[self.encoder_inputs] = x1
                feed_data[self.decoder_inputs] = x2
                feed_data[self.decoder_outputs] = y1
                feed_data[self.mask] = W
                _, loss_iter = self.sess.run([self.train_op, self.loss],
                                             feed_dict=feed_data)
                total_loss += loss_iter
                n_iter += 1

            ## print average loss after every epoch
            print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs,
                                                      total_loss / n_iter))

            ## save the model
            tl.files.save_npz(self.net_out.all_params, name=self.model_path,
                              sess=self.sess)

        ## session cleanup
        self.sess.close()

    def create_model(self, encoder_inputs, decoder_inputs, vocab_size,
                     emb_dim, is_train=True, reuse=False):
        """Creates the LSTM model."""
        with tf.variable_scope("model", reuse=reuse):
            # For a chatbot, encoder and decoder can share one embedding layer;
            # for translation, you may want two separate embedding layers.
            with tf.variable_scope("embedding") as vs:
                net_encode = EmbeddingInputlayer(
                    inputs=encoder_inputs,
                    vocabulary_size=vocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')
                vs.reuse_variables()
                net_decode = EmbeddingInputlayer(
                    inputs=decoder_inputs,
                    vocabulary_size=vocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')

            net_rnn = Seq2Seq(
                net_encode, net_decode,
                cell_fn=tf.nn.rnn_cell.LSTMCell,
                n_hidden=emb_dim,
                initializer=tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length=retrieve_seq_length_op2(encoder_inputs),
                decode_sequence_length=retrieve_seq_length_op2(decoder_inputs),
                initial_state_encode=None,
                dropout=(0.5 if is_train else None),
                n_layer=self.layers,
                return_seq_2d=True,
                name='seq2seq')
            net_out = DenseLayer(net_rnn, n_units=vocab_size,
                                 act=tf.identity, name='output')
        return net_out, net_rnn

    def infer(self, query):
        unk_id = self.vocab.word_index["<unk>"]
        pad_id = self.vocab.word_index["<pad>"]
        start_id = self.vocab.word_index["<start>"]
        end_id = self.vocab.word_index["<end>"]

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        tf.reset_default_graph()
        sess = tf.Session(config=sess_config)

        ## Inference Data Placeholders
        encode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None],
                                       name="encode_inputs")
        decode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None],
                                       name="decode_inputs")

        net, net_rnn = self.create_model(
            encode_inputs, decode_inputs, self.vocab_size,
            self.embedding_size, is_train=False, reuse=False)
        y = tf.nn.softmax(net.outputs)

        ## Init Vars
        sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=sess, name=self.model_path,
                                     network=net)

        def inference(seed):
            """Inference using the pre-trained model."""
            seed_id = self.vocab.text_to_sequence(seed)

            ## Encode and get state
            state = sess.run(net_rnn.final_state_encode,
                             {encode_inputs: [seed_id]})
            ## Decode: feed start_id and get the first word
            ## [https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_ptb_lstm_state_is_tuple.py]
            o, state = sess.run([y, net_rnn.final_state_decode],
                                {net_rnn.initial_state_decode: state,
                                 decode_inputs: [[start_id]]})
            w_id = tl.nlp.sample_top(o[0], top_k=3)
            # w = self.vocab.index_word[w_id]

            ## Decode, feeding the state back iteratively
            sentence = [w_id]
            for _ in range(self.maxlen):  # max sentence length
                o, state = sess.run([y, net_rnn.final_state_decode],
                                    {net_rnn.initial_state_decode: state,
                                     decode_inputs: [[w_id]]})
                w_id = tl.nlp.sample_top(o[0], top_k=2)
                # w = self.vocab.index_word[w_id]
                if w_id == end_id:
                    break
                sentence = sentence + [w_id]
            return sentence

        ## infer
        sentence = inference(query)
        response = self.vocab.seqs_to_text(sentence)
        response = " ".join(response.split(" "))
        return response
def prepare_data(config):
    print('Loading data for ' + config.phase)
    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.temp_train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.temp_eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.temp_test_file)
    data = np.load(filetemp).item()
    src = data['src']
    dst = data['dst']

    print("Building the vocabulary...")
    vocabulary1 = Vocabulary(config.vocab1_size, save_file=config.vocab1_file)
    # vocabulary1.save(config.vocab1_file)
    print("Vocabulary built.")

    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.test_file)

    if True:  # not os.path.exists(filetemp):
        word_idxs1, word_idxs2 = [], []
        masks1, masks2 = [], []
        len1, len2 = [], []
        for sent in src:  # tqdm(src):
            current_word_idxs_ = vocabulary1.process_sentence(sent)
            current_num_words = len(current_word_idxs_)
            len1.append(len(current_word_idxs_))
            # print('len(current_word_idxs_)', len(current_word_idxs_))
            current_word_idxs = np.zeros(config.max_input_length, dtype=np.int32)
            current_masks = np.zeros(config.max_input_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs1.append(current_word_idxs)
            masks1.append(current_masks)
        print('src max length', max(len1))

        # import pdb; pdb.set_trace()
        for sent in dst:  # tqdm(dst):
            current_word_idxs_ = vocabulary1.process_sentence(sent + ' stop')
            current_num_words = len(current_word_idxs_)
            len2.append(len(current_word_idxs_))
            # print('len(current_word_idxs_)', len(current_word_idxs_))
            current_word_idxs = np.zeros(config.max_output_length, dtype=np.int32)
            current_masks = np.zeros(config.max_output_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs2.append(current_word_idxs)
            masks2.append(current_masks)
        print('dst max length', max(len2))

        word_idxs1 = np.array(word_idxs1)
        masks1 = np.array(masks1)
        word_idxs2 = np.array(word_idxs2)
        masks2 = np.array(masks2)
        len1 = np.array(len1)
        len2 = np.array(len2)
        data = {'word_idxs1': word_idxs1, 'masks1': masks1,
                'word_idxs2': word_idxs2, 'masks2': masks2,
                'len1': len1, 'len2': len2}
        np.save(filetemp, data)
    else:
        data = np.load(filetemp).item()
        word_idxs1 = data['word_idxs1']
        masks1 = data['masks1']
        len1 = data['len1']
        word_idxs2 = data['word_idxs2']
        masks2 = data['masks2']
        len2 = data['len2']

    print("Building the dataset...")
    is_train = config.phase == 'train'
    dataset = DataSet(word_idxs1, masks1, len1, config.batch_size,
                      word_idxs2, masks2, len2,
                      is_train=is_train, shuffle=is_train)
    print("Dataset built.")
    print("prepare data for " + config.phase + " done!")
    return dataset, vocabulary1  # , vocabulary2
class ConstrainedContextSeq2SeqEmbeddings(SconeModel): """Model that predicts a sequence of actions (action and arguments). Attributes: Todo: * Consider refactoring. E.g., have a class for an encoder and a decoder. * Fewer parameters in the constructor. """ def __init__(self, in_vocab, output_vocabularies, state_encoder_builder, valid_action_fn, args): SconeModel.__init__(self, state_encoder_builder, in_vocab, args.embeddings_size, args.num_enc_layers, args.encoder_size, args.decoder_size, RNNBuilder) self.args = args self._dropout = 0. # Output vocabs and embeddings. self.output_action_vocabulary = Vocabulary(output_vocabularies[0], [EOS, BEG]) self.output_location_vocabulary.g = Vocabulary(output_vocabularies[1], [NO_ARG, BEG]) self.output_argument_vocabulary = Vocabulary(output_vocabularies[2], [NO_ARG, BEG]) # All outputs vocabulary. all_vocabulary_list = [] self._valid_action_indices = [] index = 0 for action in self.output_action_vocabulary: for location in self.output_location_vocabulary.g: for argument in self.output_argument_vocabulary: if action != BEG and location != BEG and argument != BEG: if valid_action_fn(action, location, argument): self._valid_action_indices.append(index) all_vocabulary_list.append((action, location, argument)) index += 1 self._all_output_vocabulary = Vocabulary(all_vocabulary_list, []) self._output_action_embeddings = self._pc.add_lookup_parameters( (len(self.output_action_vocabulary), args.embeddings_size), name="output-action-embeddings") self._output_location_embeddings = self._pc.add_lookup_parameters( (len(self.output_location_vocabulary.g), args.embeddings_size), name="output-location-embeddings") self._output_argument_embeddings = self._pc.add_lookup_parameters( (len(self.output_argument_vocabulary), args.embeddings_size), name="output-argument-embeddings") # Action decoder RNN. self._dec_input_size = args.encoder_size * 2 \ + args.encoder_size * 2 \ + self._state_encoder.item_size() * 2 \ + args.embeddings_size * 3 self._decoder = RNNBuilder(args.num_dec_layers, self._dec_input_size, args.decoder_size, self._pc) situated_in_size = self._dec_input_size if self.args.always_initial_state: self._state_attention_winitial = self._pc.add_parameters( (self.args.encoder_size * 2 + self.args.decoder_size, self._state_encoder.item_size()), name="state-attention-winitial") self._state_attention_winitial2 = self._pc.add_parameters( (self.args.encoder_size * 2 + self.args.decoder_size, self._state_encoder.item_size()), name="state-attention-winitial2") situated_in_size += 2 * self._state_encoder.item_size() # MLP parameters to mix the situated embedding. self._situated_w = self._pc.add_parameters( (self._dec_input_size, situated_in_size), name="situated-w") self._situated_b = self._pc.add_parameters((self._dec_input_size), name="situated-b") # Project the RNN output to a vector that is the length of the output # vocabulary. 
self._final_w = self._pc.add_parameters( (args.decoder_size, args.decoder_size), name="final-w") self._output_w_action = self._pc.add_parameters( (len(self.output_action_vocabulary) - 1, args.decoder_size), name="output-w-action") self._output_w_location = self._pc.add_parameters( (len(self.output_location_vocabulary.g) - 1, args.decoder_size), name="output-w-location") self._output_w_argument = self._pc.add_parameters( (len(self.output_argument_vocabulary) - 1, args.decoder_size), name="output-w-argument") def probability_of_token(self, token, probability_dist): return probability_dist[self._all_output_vocabulary.lookup_index(tuple(token))] def set_dropout(self, amount): """ Sets the dropout amount for the model, changes during various learning stages. Inputs: amount (float): Amount of dropout to apply. """ self._dropout = amount def compute_entropy(self, distribution): """ Gets the entropy of a probability distribution that may contain zeroes. Inputs: probability_distribution (dy.Expression): The probability distribution. Returns: dy.Expression representing the entropy. """ num_actions = len(self.output_action_vocabulary) - 1 num_locations = len(self.output_location_vocabulary.g) - 1 num_arguments = len(self.output_argument_vocabulary) - 1 valid_mask = numpy.zeros(num_actions * num_locations * num_arguments) for index in self._valid_action_indices: valid_mask[index] = 1. # This mask is one for all valid indices, and zero for all others. valid_mask = dy.inputTensor(valid_mask) # This basically replaces everything in the probability distribution # with the original value (if valid), or zero (if not valid). valid_probs = dy.cmult(valid_mask, distribution) # The inverse of valid mask, this gives a value of 1. if something is invalid. invalid_probs = 1.-valid_mask # The result of this operation is that everything that's valid gets its # original probability, and everything that's not gets a probability of 1. probs = valid_probs + invalid_probs # dy.log(probs) will give log(p(action)) if action is valid, and # log(1)=0 for invalid actions. # then entropies will be zero for everything that isn't valid, and the # actual p log(p) otherwise. entropies = dy.cmult(probs, dy.log(probs + 0.00000000001)) return -dy.sum_elems(entropies) def action_probabilities(self, distribution): num_actions = len(self.output_action_vocabulary) - 1 num_locations = len(self.output_location_vocabulary.g) - 1 num_arguments = len(self.output_argument_vocabulary) - 1 zeroes = numpy.zeros(num_locations * num_arguments) ones = numpy.ones(num_locations * num_arguments) actions_masks = [] probs = { } action_idx = 0 for action in self.output_action_vocabulary: if action != BEG: masks = numpy.concatenate( (numpy.repeat(zeroes, action_idx), ones, numpy.repeat(zeroes, num_actions - action_idx - 1))) actions_masks = dy.reshape(dy.inputTensor(masks), (num_actions * num_locations * num_arguments, 1)) action_prob = dy.sum_elems(dy.cmult(actions_masks, distribution)) probs[action] = action_prob action_idx += 1 return probs def group_tokens(self, string): """ Groups tokens from a flat list of strings into action sequence. Inputs: string (list of str): Flat action sequence. Returns: list of tuple, representing parameterized actions. 
""" seq = [] current_triple = [] for token in string: if token in self.output_action_vocabulary: if len(current_triple) == 3: # Push the current triple and add this one seq.append(current_triple) elif len(current_triple) < 3 and current_triple: # Means that there were no arguments current_triple.extend( [NO_ARG for _ in range(3 - len(current_triple))]) assert len(current_triple) == 3 seq.append(current_triple) current_triple = [token] elif token in self.output_location_vocabulary.g: assert len(current_triple) == 1, \ "Location " + str(token) + " must follow an action," \ + " but current triple was " + str(current_triple) current_triple.append(token) elif token in self.output_argument_vocabulary: assert len(current_triple) == 2, \ "Argument " + str(token) + " must follow an action and location," \ + " but current triple was " + str(current_triple) current_triple.append(token) if len(current_triple) < 3 and current_triple: current_triple.extend( [NO_ARG for _ in range(3 - len(current_triple))]) assert len(current_triple) == 3 or not current_triple if len(current_triple) == 3: seq.append(current_triple) return seq def _out_to_int(self, string, add_eos=False): if add_eos: string = list(string) + [EOS] else: string = list(string) return [(self.output_action_vocabulary.lookup_index(tok[0]), self.output_location_vocabulary.g.lookup_index(tok[1]), self.output_argument_vocabulary.lookup_index(tok[2])) \ for tok in self.group_tokens(string)] def _get_probs(self, rnn_output, restrict=None): final_w = dy.parameter(self._final_w) output_w_action = dy.parameter(self._output_w_action) output_w_location = dy.parameter(self._output_w_location) output_w_argument = dy.parameter(self._output_w_argument) intermediate_state = final_w * rnn_output if self.args.final_nonlinearity: intermediate_state = dy.tanh(intermediate_state) action_scores = output_w_action * intermediate_state location_scores = output_w_location * intermediate_state argument_scores = output_w_argument * intermediate_state flattened_scores = flatten_triple(action_scores, location_scores, argument_scores) if restrict or self.args.syntax_restricted: restrict_tokens = self._valid_action_indices if restrict: restrict_tokens = restrict return dy.exp(dy.log_softmax(flattened_scores, restrict=restrict_tokens)) else: probs = dy.softmax(flattened_scores) return probs def _predict(self, rnn_output, fsa_restricted=False, fsa=None): # Forces a forward pass to get value. probs = self._get_probs( rnn_output, restrict=fsa.valid_actions(self._all_output_vocabulary) if fsa_restricted else None).value() max_tuple = numpy.argmax(probs) predicted_token = self._all_output_vocabulary.lookup_token(max_tuple) return (predicted_token, probs[max_tuple]) def _init_decoder(self): return self._decoder.initial_state().add_input(dy.vecInput(self._dec_input_size)) def _embed_predicted_triple(self, triple): return dy.concatenate([self._output_action_embeddings[triple[0]], self._output_location_embeddings[triple[1]], self._output_argument_embeddings[triple[2]]]) def _decoder_input_embedding(self, rnn_state, previous_triple, encoded_string, enc_state, encoded_history, training=False, initial_state=None): attention_vecs = {} # Compute attention over encodded string. utterance_attn, utterance_dist = attend(encoded_string, rnn_state.h()[-1], dy.parameter(self._utterance_attention_w), self._dropout if training else 0.) attention_vecs['utterance'] = utterance_dist # Key for state and history attention. 
        attn_key = dy.concatenate([utterance_attn, rnn_state.h()[-1]])
        if training:
            attn_key = dy.dropout(attn_key, self._dropout)

        # Attend on history using the current state and utterance attention.
        history_attn, history_dist = attend(
            encoded_history, attn_key,
            dy.parameter(self._history_attention_w),
            self._dropout if training else 0.)
        attention_vecs['history'] = history_dist

        # Attend on state.
        state_attn, state_dist = attend(
            enc_state, attn_key,
            dy.parameter(self._state_attention_w),
            self._dropout if training else 0.)
        state_attn2, state_dist2 = attend(
            enc_state, attn_key,
            dy.parameter(self._state_attention_w2),
            self._dropout if training else 0.)
        attention_vecs['state_1'] = state_dist
        attention_vecs['state_2'] = state_dist2

        # Embed the previously predicted triple.
        prev_emb = self._embed_predicted_triple(previous_triple)

        # Concatenate with history and state, and mix with a feed-forward
        # layer.
        situated_embedding = dy.concatenate(
            [utterance_attn, history_attn, state_attn, state_attn2, prev_emb])

        # Attend on the initial state (if provided).
        if self.args.feed_updated_state and self.args.always_initial_state:
            if not initial_state:
                raise ValueError(
                    "Encoding the initial state, but it was not provided.")
            initial_attn, initial_dist = attend(
                initial_state, attn_key,
                dy.parameter(self._state_attention_winitial),
                self._dropout if training else 0.)
            initial_attn2, initial_dist2 = attend(
                initial_state, attn_key,
                dy.parameter(self._state_attention_winitial2),
                self._dropout if training else 0.)
            attention_vecs['initial_1'] = initial_dist
            attention_vecs['initial_2'] = initial_dist2
            situated_embedding = dy.concatenate(
                [situated_embedding, initial_attn, initial_attn2])

        # Situated embedding mixing parameters.
        weights = dy.parameter(self._situated_w)
        biases = dy.parameter(self._situated_b)
        situated_embedding = dy.tanh(weights * situated_embedding + biases)

        return situated_embedding, attention_vecs

    def get_losses(self, utterance, output_seq, state, history, fsa=None,
                   training=False):
        """Gets the losses of a gold sequence.

        Args:
            utterance (list of str): Represents the current utterance.
            output_seq (list of triple of str): Represents the gold output
                sequence.
            state (WorldState): Represents the state of the environment.
            history (list of list of str): Represents the previous utterances.
            fsa (ExecutableFSA, optional): An FSA builder object.
            training (bool, optional): Whether or not you are training right
                now.

        Returns:
            list of dy.Expression, where each corresponds to the loss at each
            gold output prediction.
        """
        enc_utterance, enc_history, enc_state = self._encode_inputs(
            utterance, state, history)
        initial_encoded_state = enc_state

        output_seq = self.group_tokens(output_seq + [EOS])

        # Run the decoder (forced decoding).
        rnn_state = self._init_decoder()
        losses = []
        prev_token_ints = (
            self.output_action_vocabulary.lookup_index(BEG),
            self.output_location_vocabulary.lookup_index(BEG),
            self.output_argument_vocabulary.lookup_index(BEG))
        for i, output_token in enumerate(output_seq):
            if self.args.feed_updated_state:
                if not fsa:
                    raise ValueError("Attempting to feed the updated state, "
                                     "but no FSA was provided.")
                enc_state = self._state_encoder.encode(fsa.state())

            # Compute the decoder input.
            situated_embedding, _ = self._decoder_input_embedding(
                rnn_state,
                prev_token_ints,
                enc_utterance,
                enc_state,
                enc_history,
                training,
                initial_state=initial_encoded_state
                if self.args.always_initial_state else None)
            if training:
                situated_embedding = dy.dropout(situated_embedding,
                                                self._dropout)

            # Weird choice -- the embedding of the previously generated token
            # is not added here. TODO: fix.
            rnn_state = rnn_state.add_input(situated_embedding)

            gold_index = self._all_output_vocabulary.lookup_index(
                tuple(output_token))
            log_prob_token = dy.log(
                self._get_probs(rnn_state.output())[gold_index])

            if self.args.feed_updated_state \
                    and output_token != (EOS, NO_ARG, NO_ARG) \
                    and output_token != [EOS, NO_ARG, NO_ARG]:
                fsa.feed_complete_action(*output_token)

            # Loss of the labeled token.
            losses.append(-log_prob_token)

            prev_token_ints = (
                self.output_action_vocabulary.lookup_index(output_token[0]),
                self.output_location_vocabulary.lookup_index(output_token[1]),
                self.output_argument_vocabulary.lookup_index(output_token[2]))

        return losses

    def _update_rnn_state(self, encoded_states, fsa, rnn_state,
                          previous_token, initial_state=None, training=False):
        """Generates a single token given a state."""
        # Generate only if at the beginning of the sequence or the previously
        # generated token was EOS.
        utterance = encoded_states[0]
        history = encoded_states[1]
        world_state = encoded_states[2]
        if self.args.feed_updated_state:
            if not fsa:
                raise ValueError("Attempting to feed the updated state, "
                                 "but no FSA was provided.")
            if not fsa.state():
                raise ValueError("Attempting to feed the updated state, "
                                 "but the FSA state was None.")
            world_state = self._state_encoder.encode(fsa.state())
        situated_embedding, attentions = self._decoder_input_embedding(
            rnn_state,
            previous_token,
            utterance,
            world_state,
            history,
            initial_state=initial_state,
            training=training)
        if training:
            situated_embedding = dy.dropout(situated_embedding, self._dropout)
        return rnn_state.add_input(situated_embedding), attentions

    def _policy_shape_probs(self, prob_dist):
        # TODO: this is specific to Alchemy.
        num_actions = len(self.output_action_vocabulary) - 1
        num_locations = len(self.output_location_vocabulary) - 1
        num_arguments = len(self.output_argument_vocabulary) - 1
        new_probdist = dy.zeros(prob_dist.dim()[0])
        zeroes = numpy.zeros(num_locations * num_arguments)
        ones = numpy.ones(num_locations * num_arguments)
        eos_prob = prob_dist[self._all_output_vocabulary.lookup_index(
            (EOS, NO_ARG, NO_ARG))]
        action_idx = 0
        for action in self.output_action_vocabulary:
            masks = numpy.concatenate(
                (numpy.repeat(zeroes, action_idx),
                 ones,
                 numpy.repeat(zeroes, num_actions - action_idx - 1)))
            actions_masks = dy.reshape(
                dy.inputTensor(masks),
                (num_actions * num_locations * num_arguments, 1))
            if action == EOS:
                new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
            elif action == "push":
                # NOTE: 56. is a magic, domain-specific constant (see the
                # Alchemy TODO above).
                new_probdist += dy.cmult(actions_masks, prob_dist) \
                    + eos_prob / (2. * 56.)
            elif action == "pop":
                new_probdist += dy.cmult(actions_masks, prob_dist)
            action_idx += 1

        if self.args.syntax_restricted:
            return dy.exp(
                dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                               restrict=self._valid_action_indices))
        else:
            return dy.softmax(dy.cmult(new_probdist, prob_dist))

    def sample_sequences(self, batch, length=LEN_LIMIT, training=False,
                         fsa_builder=None):
        """Rolls out using a policy (the probability distribution).

        Args:
            batch (list of examples): The batch that is being used to roll
                out.
            length (int, optional): The maximum length of the roll out.
            training (bool, optional): Whether or not training.
            fsa_builder (ExecutableFSA): An FSA that can be used to constrain.

        Returns:
            Todo:
                * Docstring.
                * No use of 'filter'.
                * Make returned value more clear.
                * Fewer branches.
                * Shorter (i.e., refactor).
        """
        sample_start = time.time()
        batch_states = []
        batch_initial_states = []
        batch_prob_sequences = [[] for example in batch]
        batch_sequences = [[] for example in batch]
        finished_seqs = [False for example in batch]
        batch_encoded_states = []
        for example in batch:
            encoded_inputs = self._encode_inputs(
                example.utterance, example.initial_state, example.history)
            batch_encoded_states.append(encoded_inputs)
            batch_initial_states.append(encoded_inputs[2])

            initial_state = None
            if self.args.feed_updated_state:
                if not fsa_builder:
                    raise ValueError("Need an FSA builder when feeding the "
                                     "updated state during sampling.")
                initial_state = fsa_builder(example.initial_state)
            batch_states.append(
                (initial_state,
                 self._init_decoder(),
                 (self.output_action_vocabulary.lookup_index(BEG),
                  self.output_location_vocabulary.lookup_index(BEG),
                  self.output_argument_vocabulary.lookup_index(BEG))))

        for _ in range(length):
            # Generate probabilities for this step.
            batch_probs = []
            batch_rnn_states = []
            assert len(batch) == len(batch_encoded_states)
            assert len(batch) == len(batch_states)
            for j, (example, encoded_states, state, initial_state) in \
                    enumerate(zip(batch, batch_encoded_states, batch_states,
                                  batch_initial_states)):
                if not finished_seqs[j]:
                    rnn_state, _ = self._update_rnn_state(
                        encoded_states, state[0], state[1], state[2],
                        initial_state, training=training)
                    probs = self._get_probs(rnn_state.output())
                else:
                    probs = None
                    rnn_state = None
                batch_probs.append(probs)
                batch_rnn_states.append(rnn_state)

            # Do a forward pass on the entire batch.
            if [prob for prob in batch_probs if prob]:
                dy.esum([dy.concatenate(list(prob))
                         for prob in batch_probs if prob]).value()

                # Update the batch states and keep track of the probability
                # distributions and generated sequences.
                new_states = []
                assert len(batch) == len(batch_states)
                assert len(batch) == len(batch_probs)
                assert len(batch) == len(batch_rnn_states)
                for j, (example, old_state, prob_dist, rnn_state) in enumerate(
                        zip(batch, batch_states, batch_probs,
                            batch_rnn_states)):
                    if not finished_seqs[j]:
                        # Get the predicted token by sampling.
                        sampling_policy = prob_dist
                        if self.args.policy_shaping:
                            sampling_policy = self._policy_shape_probs(
                                prob_dist)
                        predicted_token, token_prob = sample_any_tok(
                            sampling_policy, self._all_output_vocabulary)

                        # Update the FSA.
                        fsa = None
                        if self.args.feed_updated_state \
                                and predicted_token != (EOS, NO_ARG, NO_ARG):
                            fsa = old_state[0]
                            peek_state = fsa.peek_complete_action(
                                *predicted_token)
                            if peek_state \
                                    and predicted_token != (EOS, NO_ARG,
                                                            NO_ARG):
                                fsa.feed_complete_action(*predicted_token)

                        # Only update batch states if you don't predict EOS.
                        # Otherwise, there is no point in continuing to
                        # generate for this example.
                        if predicted_token == (EOS, NO_ARG, NO_ARG):
                            finished_seqs[j] = True
                            new_states.append((None, None, None))
                        else:
                            predicted_token_idxs = (
                                self.output_action_vocabulary.lookup_index(
                                    predicted_token[0]),
                                self.output_location_vocabulary.lookup_index(
                                    predicted_token[1]),
                                self.output_argument_vocabulary.lookup_index(
                                    predicted_token[2]))
                            new_states.append(
                                (fsa, rnn_state, predicted_token_idxs))

                        # Update probability expressions and samples.
                        batch_sequences[j].append(
                            (predicted_token, token_prob))
                        batch_prob_sequences[j].append(prob_dist)
                    else:
                        new_states.append((None, None, None))
                batch_states = new_states
            else:
                break
        return batch_prob_sequences, batch_sequences

    def generate_probs(self, utterance, state, history, fsa=None,
                       fsa_restricted=False):
        """Gets predictions (by argmax) and their probabilities.

        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA, optional): The FSA builder object, if using
                constrained decoding.

        Returns:
            list of (str, float), representing the predicted sequence, where
            each string is the predicted token and the float is the
            probability of the token.
        """
        dy.renew_cg()

        encoded_states = self._encode_inputs(utterance, state, history)
        initial_state = encoded_states[2]

        # Run the decoder.
        rnn_state = self._init_decoder()
        output_seq_probs = []
        attentions = []
        predicted_token_ints = [
            self.output_action_vocabulary.lookup_index(BEG),
            self.output_location_vocabulary.lookup_index(BEG),
            self.output_argument_vocabulary.lookup_index(BEG)]

        while len(output_seq_probs) <= LEN_LIMIT:
            # Compute the decoder input.
            rnn_state, attention = self._update_rnn_state(
                encoded_states,
                fsa,
                rnn_state,
                predicted_token_ints,
                initial_state if self.args.always_initial_state else None)
            attentions.append(attention)

            if self.args.fsa_restricted:
                raise ValueError("FSA-restricted generation is not "
                                 "implemented when jointly predicting all "
                                 "three outputs.")
            else:
                predicted_token, prob = self._predict(
                    rnn_state.output(), fsa_restricted, fsa)

            output_seq_probs.append((predicted_token, prob))
            predicted_token_ints = [
                self.output_action_vocabulary.lookup_index(
                    predicted_token[0]),
                self.output_location_vocabulary.lookup_index(
                    predicted_token[1]),
                self.output_argument_vocabulary.lookup_index(
                    predicted_token[2])]

            if predicted_token == (EOS, NO_ARG, NO_ARG):
                return output_seq_probs, attentions

            if self.args.feed_updated_state:
                peek_state = fsa.peek_complete_action(*predicted_token)
                if peek_state:
                    fsa.feed_complete_action(*predicted_token)
        return output_seq_probs, attentions

    def generate(self, utterance, state, history, fsa, fsa_restricted=False):
        """Generates a sequence of predicted tokens for an input.

        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA): The FSA, for constrained decoding.

        Returns:
            list of str, representing the predicted sequence.
        """
        preds_and_probs, attentions = self.generate_probs(
            utterance, state, history, fsa, fsa_restricted)

        # Get only the tokens and remove the EOS token at the end.
        preds = [p[0] for p in preds_and_probs]
        if list(preds[-1]) == [EOS, NO_ARG, NO_ARG]:
            preds = preds[:-1]
        return preds, attentions
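# The decoder above relies on an `attend` helper that is not defined in this
# snippet. A minimal sketch, assuming DyNet and bilinear scoring -- the
# signature matches the call sites above, but the original implementation may
# differ:
import dynet as dy

def attend(encoded, key, weights, dropout_amount=0.):
    """Attend over `encoded` (a list of column vectors) with query `key`."""
    context = dy.concatenate_cols(encoded)            # (enc_dim, num_items)
    scores = dy.transpose(context) * (weights * key)  # (num_items, 1)
    distribution = dy.softmax(scores)
    if dropout_amount > 0.:
        distribution = dy.dropout(distribution, dropout_amount)
    attended = context * distribution                 # weighted sum: (enc_dim, 1)
    return attended, distribution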
def main(args):
    assert FLAGS.validation_data_loader, "--validation_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.model_path, "--model_path is required"
    assert FLAGS.selection_method in ['sampling', 'argmax', 'beam_search'], \
        "--selection_method can only be one of 'sampling', 'argmax' and 'beam_search'."

    model_config = configuration.ModelConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()
    # Assign parameters to the model configuration.
    model_config.vocab_size = vocab_size

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model for inference...')
        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="inference")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config,
                                                 mode="inference")
        model.build()

        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Loading saved model...')
        saver = tf.train.Saver()
        saver.restore(sess, FLAGS.model_path)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' %
              time_elapsed)

        print('Start inference...')
        initial_input_sequence = np.zeros(model_config.batch_size,
                                          dtype=np.int32)
        initial_input_sequence.fill(vocab.start_id)
        max_sentence_length = const_config.lstm_truncated_length + 1
        json_results = []
        for image_features, _, _, _, video_indices, video_segment_indices, valid_count in \
                data_loader_val.segmental_sampling_iter(
                    batch_size=model_config.batch_size,
                    num_segments=model_config.num_segments):
            current_input = initial_input_sequence.copy()
            if not FLAGS.repeated_feed_images:
                current_state = sess.run(
                    fetches="lstm/initial_state:0",
                    feed_dict={"input_features:0": image_features})
            else:
                current_state = sess.run(fetches="lstm/initial_state:0",
                                         feed_dict={})
            generated_sentences = np.zeros(
                (model_config.batch_size, max_sentence_length),
                dtype=np.int32)
            generated_sentences[:, 0] = current_input
            completed_masks = np.zeros(model_config.batch_size, dtype=np.bool)
            for i in range(const_config.lstm_truncated_length):
                if not FLAGS.repeated_feed_images:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state
                        })
                else:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state,
                            "input_features:0": image_features
                        })
                if FLAGS.selection_method == 'sampling':
                    # Sample the next word according to the probability.
                    next_input = []
                    for probs in softmax_output:
                        next_input.append(
                            np.random.choice(vocab_size, p=probs))
                    next_input = np.array(next_input)
                elif FLAGS.selection_method == 'argmax':
                    next_input = np.argmax(softmax_output, axis=1)
                else:
                    # TODO: implement beam search
                    next_input = None
                generated_sentences[:, i + 1] = next_input
                # Update input and state.
                current_input = next_input
                current_state = next_state
                # Early stop if we have generated the <END> token for all
                # sentences.
                for j, word_id in enumerate(next_input):
                    if word_id == vocab.end_id:
                        completed_masks[j] = True
                if sum(completed_masks) == model_config.batch_size:
                    break

            # Extract text sentences.
            sentences = []
            for word_id_array in generated_sentences:
                word_id_array = remove_start_end_word_ids(word_id_array,
                                                          vocab)
                text = vocab.id_array_to_sentence(word_id_array)
                sentences.append(text)
            sentences = sentences[:valid_count]
            for sentence in sentences:
                print(sentence)

            for i in range(valid_count):
                video_idx = video_indices[i]
                segment_idx = video_segment_indices[i]
                video = data_loader_val.videos[video_idx]
                video_segment = video.video_segments[segment_idx]
                caption_trimmed = remove_start_end_word_ids(
                    video_segment.caption, vocab)
                gt_caption = vocab.id_array_to_sentence(caption_trimmed)
                video_segment_name = video.name + str(segment_idx)
                json_results.append({
                    'name': video_segment_name,
                    'video_caption': sentences[i],
                    'gt_caption': gt_caption
                })

        print('Finished inference.')
        print('Dumping results...')
        with open(FLAGS.output_file, 'w') as fo:
            json.dump(json_results, fo, indent=4)
        print('Done.')
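# `remove_start_end_word_ids` is used above but not shown. A plausible sketch
# (hypothetical, not the original): strip the <START> id and truncate at the
# first <END> id.
import numpy as np

def remove_start_end_word_ids(word_id_array, vocab):
    ids = [int(i) for i in word_id_array]
    if ids and ids[0] == vocab.start_id:
        ids = ids[1:]
    if vocab.end_id in ids:
        ids = ids[:ids.index(vocab.end_id)]
    return np.array(ids, dtype=np.int32)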
                    type=int,
                    default=VAL_FREQ_DEFAULT,
                    help='Frequency of evaluation on the validation set')
parser.add_argument('--vocab_file',
                    type=str,
                    default=DEFAULT_VOCAB_FILE,
                    help='Path to the vocabulary file')
parser.add_argument('--one_hot',
                    type=str,
                    default=ONE_HOT_DEFAULT,
                    help='Apply one-hot encoding')
parser.add_argument('--check_freq',
                    type=int,
                    default=CHECKPOINT_FREQ_DEFAULT,
                    help='Frequency of checkpointing (test and save results)')
parser.add_argument('--name',
                    type=str,
                    default=MODEL_NAME_DEFAULT,
                    help='Model name')
# Note: argparse's type=bool treats any non-empty string as True;
# consider action='store_true' instead.
parser.add_argument('--append',
                    type=bool,
                    default=APPEND_DEFAULT,
                    help='Append start/end tokens')
FLAGS, unparsed = parser.parse_known_args()

vocabulary = Vocabulary(FLAGS.vocab_file, None, None, flag='load')
VOCAB_SIZE = len(vocabulary._vocab)
start_v = vocabulary.word_to_id("#START#")
end_v = vocabulary.word_to_id("#END#")
print(vocabulary.word_to_id('dressing'))

main(None)
import atislexicon from augmentation import Augmenter import domains from encoderdecoder import EncoderDecoderModel from attention import AttentionModel from example import Example import spec as specutil from vocabulary import Vocabulary MODELS = collections.OrderedDict([ ('encoderdecoder', EncoderDecoderModel), ('attention', AttentionModel), ]) VOCAB_TYPES = collections.OrderedDict([ ('raw', lambda s, e, **kwargs: Vocabulary.from_sentences(s, e, **kwargs)), ('glove', lambda s, e, **kwargs: Vocabulary.from_sentences( s, e, use_glove=True, **kwargs)) ]) # Global options OPTIONS = None # Global statistics STATS = {} def _parse_args(): global OPTIONS parser = argparse.ArgumentParser( description='A neural semantic parser.',
import pipeline_lstm
import pipeline_cnn

# pipeline_lstm.train()
# pipeline_lstm.test()
# pipeline_cnn.test()

import data_video
from vocabulary import Vocabulary

vocab_path = data_video.msvd_bilingual_vocab_char_path
vocab = Vocabulary.load(vocab_path)
dataset = data_video.MSVDDatasetBilingual(vocab=vocab,
                                          segment_method='char',
                                          caption_mode='text',
                                          split='train')
dataset.data.sort(key=lambda x: x.video_id)
with open('all_captions.txt', 'w') as f:
    for d in dataset.data:
        f.write('{:>12} {}\n'.format(d.video_id, d.caption))
import general_utils import chat_command_handler from chat_settings import ChatSettings from chatbot_model import ChatbotModel from vocabulary import Vocabulary #Read the hyperparameters and configure paths _, model_dir, hparams, checkpoint = general_utils.initialize_session("chat") #Load the vocabulary print() print("Loading vocabulary...") if hparams.model_hparams.share_embedding: shared_vocab_filepath = path.join(model_dir, Vocabulary.SHARED_VOCAB_FILENAME) input_vocabulary = Vocabulary.load(shared_vocab_filepath) output_vocabulary = input_vocabulary else: input_vocab_filepath = path.join(model_dir, Vocabulary.INPUT_VOCAB_FILENAME) input_vocabulary = Vocabulary.load(input_vocab_filepath) output_vocab_filepath = path.join(model_dir, Vocabulary.OUTPUT_VOCAB_FILENAME) output_vocabulary = Vocabulary.load(output_vocab_filepath) #Create the model print("Initializing model...") print() with ChatbotModel(mode="infer", model_hparams=hparams.model_hparams, input_vocabulary=input_vocabulary,
def read_instances_from_file(files, max_len=400, keep_case=False):
    '''Collect instances and construct vocab.'''
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []
    for path in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(path) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)
        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})
        logger.info('Got {} instances from file {}'.format(len(sents), path))
        if trimmed_sent:
            logger.info(
                '{} sentences were trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    # Add '<cls>' multiple times, presumably so it survives a min-frequency
    # cutoff.
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Classes: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))
    return sets, vocab, lb_vocab
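# Example call for read_instances_from_file (file names are placeholders):
# one word vocabulary is shared across splits, and labels get their own
# vocabulary.
sets, vocab, lb_vocab = read_instances_from_file(
    ['train.tsv', 'dev.tsv', 'test.tsv'], max_len=400, keep_case=False)
train_sents = sets[0]['sents']
train_labels = sets[0]['labels']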
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']
    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task +
        '.json' for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files],
                  [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab, lexcat_vocab
    ]
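# Hypothetical driver for build_vocabs, assuming the Vocabulary constructor
# stores its name argument on a `name` attribute:
vocabs_by_name = {v.name: v for v in build_vocabs()}
pp_vocab = vocabs_by_name['PREPS']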
def main(): vocabulary = Vocabulary() hangman = Hangman(vocabulary) hangman.startGame()
dev_e_path = '../data/validation/dev.e.gz' dev_f_path = '../data/validation/dev.f.gz' dev_wa = '../data/validation/dev.wa.nonullalign' test_e_path = '../data/test/test.e.gz' test_f_path = '../data/test/test.f.gz' test_wa = '../data/test/test.wa.nonullalign' # Using only 1000 words will result in many UNKs, but # it will make training a lot faster. # If you have a fast computer, a GPU, or a lot of time, # try with 10000 instead. max_tokens = 1000 corpus_e = smart_reader(train_e_path) vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens) pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb")) print("English vocabulary size: {}".format(len(vocabulary_e))) corpus_f = smart_reader(train_f_path) vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens) pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb")) print("French vocabulary size: {}".format(len(vocabulary_f))) # load test corpus test_corpus = list( bitext_reader(smart_reader(test_e_path), smart_reader(test_f_path))) # run tf.reset_default_graph()
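# `smart_reader` is used above but not shown. A plausible sketch (an
# assumption, not the original): yield tokenized lines, transparently
# handling .gz files.
import gzip

def smart_reader(path, encoding='utf-8'):
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rt', encoding=encoding) as f:
        for line in f:
            yield line.strip().split()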
from vocabulary import Vocabulary NERS = Vocabulary('NERS', [ 'DATE', 'ORGANIZATION', 'O', 'ORDINAL', 'TIME', 'NUMBER', 'MONEY', 'PERCENT', 'MISC', 'PERSON', 'LOCATION', 'DURATION', 'SET', None ])
return options, pattern if __name__ == '__main__': import os import glob import pprint from vocabulary import Vocabulary import parallelize options, pattern = parse_args() olddir = os.getcwd() os.chdir(options.datadir) fnames = glob.glob(pattern) nprocesses = len(fnames) if options.parallel else None results = parallelize.run(process_file, fnames, nprocesses, options) full_counter = Counter() for counter in results: full_counter.update(counter) vocabulary = Vocabulary(full_counter, n_most_common=options.nwords) vocabulary.save('index') pprint.pprint(full_counter.most_common(200)) print(len(full_counter)) print(vocabulary) os.chdir(olddir)
def import_vocabulary(self, vocabulary_dir, normalize=True, import_mode=VocabularyImportMode.External, dataset_vocab=None): if dataset_vocab is None and import_mode != VocabularyImportMode.External: raise ValueError( "dataset_vocab must be provided if import_mode is not 'External'." ) import_stats = VocabularyImportStats() #Read the external vocabulary tokens and embeddings tokens_with_embeddings = self._read_vocabulary_and_embeddings( vocabulary_dir) #If normalize flag is true, normalize casing of the external vocabulary and average embeddings for any resulting duplicate tokens if normalize: tokens_with_embeddings = self._normalize_tokens_with_embeddings( tokens_with_embeddings) import_stats.external_vocabulary_size = len(tokens_with_embeddings) #Apply dataset filters if applicable if dataset_vocab is not None: import_stats.dataset_vocabulary_size = dataset_vocab.size() if import_mode == VocabularyImportMode.ExternalIntersectDataset or import_mode == VocabularyImportMode.Dataset: #Get rid of all tokens that exist in the external vocabulary but don't exist in the dataset for token in list(tokens_with_embeddings.keys()): if not dataset_vocab.word_exists(token): del tokens_with_embeddings[token] import_stats.intersection_size = len(tokens_with_embeddings) if import_mode == VocabularyImportMode.ExternalUnionDataset or import_mode == VocabularyImportMode.Dataset: #Add any tokens that exist in the dataset but don't exist in the external vocabulary. #These added tokens will get word vectors sampled from the gaussian distributions of their components: # where the mean of each component is the mean of that component in the external embedding matrix # and the standard deviation of each component is the standard deviation of that component in the external embedding matrix embeddings_matrix = np.array(list( tokens_with_embeddings.values()), dtype=np.float32) emb_size = embeddings_matrix.shape[1] emb_mean = np.mean(embeddings_matrix, axis=0) emb_stdev = np.std(embeddings_matrix, axis=0) for i in range(dataset_vocab.size()): dataset_token = dataset_vocab.int2word(i, capitalize_i=False) if dataset_token not in tokens_with_embeddings: tokens_with_embeddings[ dataset_token] = np.random.normal( emb_mean, emb_stdev, emb_size) if len(tokens_with_embeddings) == 0: raise ValueError( "Imported vocabulary size is 0. Try a different VocabularyImportMode (currently {0})" .format(VocabularyImportMode(import_mode).name)) tokens, embeddings_matrix = zip(*tokens_with_embeddings.items()) embeddings_matrix = np.array(embeddings_matrix, dtype=np.float32) #Create the vocabulary instance vocabulary = Vocabulary(external_embeddings=embeddings_matrix) for i in range(len(tokens)): vocabulary.load_word(tokens[i], i) vocabulary.compile(loading=True) return vocabulary, import_stats
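# Hypothetical usage of import_vocabulary above (the `importer` instance,
# directory, and `dataset_vocab` are stand-ins): intersect an external
# embedding vocabulary with the dataset's own.
vocabulary, stats = importer.import_vocabulary(
    'external_embeddings_dir',
    normalize=True,
    import_mode=VocabularyImportMode.ExternalIntersectDataset,
    dataset_vocab=dataset_vocab)
print(stats.external_vocabulary_size, stats.intersection_size)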
class Model(object):
    def __init__(self):
        self.vocab = Vocabulary()
        self.language_module = dataset.LSTMLanguageModule(
            message_flags.flattened_message_size(),
            self.vocab.get_vocab_size()).to(device)
        self.training_examples = []

        self.encoder = pretrain.load_saved_encoder().to(device)
        self.encoder.eval()
        self.decoder = pretrain.load_saved_decoder().to(device)
        self.decoder.eval()

        params_to_train = list(self.language_module.parameters())
        if FLAGS.model_train_decoder:
            params_to_train.extend(list(self.decoder.parameters()))
        self.optimizer = optim.Adam(params_to_train, weight_decay=1e-5)

    def predict(self, state, command):
        self.language_module.eval()
        self.decoder.eval()
        token_ids = self.vocab.token_ids(command)
        command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        state_variable = dataset.state_to_variable(state).to(device)
        encoder_output = self.language_module.forward(command_variable)
        decoder_input = (encoder_output if FLAGS.continuous_message else
                         discrete_util.discrete_transformation(
                             encoder_output))
        prediction = self.decoder.forward(state_variable, decoder_input)
        return dataset.output_from_variable(prediction, state)

    def optimizer_step(self):
        self.language_module.train()
        self.decoder.train()
        random.shuffle(self.training_examples)
        for batch in util.batch_iterator(self.training_examples,
                                         FLAGS.model_batch_size):
            states = [s for s, c, t, m in batch]
            commands = [c for s, c, t, m in batch]
            targets = [t for s, c, t, m in batch]
            target_messages = [m for s, c, t, m in batch]

            self.optimizer.zero_grad()
            target_message = torch.from_numpy(
                np.concatenate(target_messages, 0)).to(device)
            state_variable = dataset.state_to_variable_batch(states).to(
                device)
            target_variable = dataset.output_to_variable_batch(
                targets, states).to(device)

            # Left-pad token ids so all commands in the batch share a length.
            max_command_len = max(len(c) for c in commands)
            token_ids = np.zeros((len(commands), max_command_len),
                                 dtype=np.int64)
            for i, command in enumerate(commands):
                ids = self.vocab.token_ids(command)
                token_ids[i, -len(ids):] = ids
            command_variable = torch.from_numpy(token_ids).to(device)

            encoder_output = self.language_module.forward(command_variable)
            decoder_input = (encoder_output if FLAGS.continuous_message else
                             discrete_util.discrete_transformation(
                                 encoder_output))
            prediction = self.decoder.forward(state_variable, decoder_input,
                                              target_variable)

            if FLAGS.continuous_message:
                error = encoder_output - target_message
                message_loss = (error * error).sum()
            else:
                log_message_probs = F.log_softmax(
                    encoder_output.view(-1, FLAGS.discrete_message_size,
                                        FLAGS.discrete_message_symbols), 2)
                target_message_reshaped = target_message.view(
                    -1, FLAGS.discrete_message_size,
                    FLAGS.discrete_message_symbols)
                message_loss = -(log_message_probs *
                                 target_message_reshaped).sum()

            loss = dataset.loss(
                prediction, target_variable
            ) + FLAGS.model_message_loss_weight * message_loss
            loss = loss / len(batch)  # Average instead of sum.
            loss.backward()
            self.optimizer.step()

    def training_accuracy(self):
        n_correct = 0
        for state, command, target, target_message in self.training_examples:
            prediction = self.predict(state, command)
            if prediction == target:
                n_correct += 1
        return n_correct / len(self.training_examples)

    def update(self, state, command, target_output, num_updates=None):
        if num_updates is None:
            num_updates = FLAGS.model_max_updates
        state_variable = dataset.state_to_variable(state).to(device)
        target_variable = dataset.output_to_variable(target_output,
                                                     state).to(device)
        encoder_output = self.encoder.forward(state_variable, target_variable)
        target_message = (encoder_output if FLAGS.continuous_message else
                          discrete_util.discrete_transformation(
                              encoder_output))
        target_message = target_message.cpu().detach().numpy()
        self.training_examples.append(
            (state, command, target_output, target_message))
        for _ in range(num_updates):
            self.optimizer_step()
    if title is not None:
        ax.set_title(title)
    ax.imshow(image)
    return ax


vocab_threshold = 5
vocab_file = './vocab.pkl'
start_word = "<start>"
end_word = "<end>"
unk_word = "<unk>"
annotations_file = os.path.join(
    '/home/george/', 'cocoapi/annotations/image_info_test2014.json')
vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                   unk_word, annotations_file, True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder_file = 'encoder-2.pkl'
decoder_file = 'decoder-2.pkl'

embed_size = 256
hidden_size = 512
vocab_size = len(vocab)

encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()
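# The snippet defines encoder_file and decoder_file but does not show the
# load step. A typical continuation (the './models' directory is an
# assumption, not from the original):
encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file),
                                   map_location=device))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file),
                                   map_location=device))
encoder.to(device)
decoder.to(device)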
from vocabulary import Vocabulary UD_DEPS = Vocabulary('UD_DEPS', [ 'ROOT', 'mark', 'obj', 'amod', 'dep', 'cop', 'appos', 'advmod', 'conj', 'cc', 'nsubjpass', 'compound', 'aux:pass', 'iobj', 'nsubj', 'root', 'nmod:tmod', 'ccomp', 'aux', 'cc:preconj', 'nsubj:pass', 'nmod', 'neg', 'acl', 'fixed', 'dobj', 'xcomp', 'auxpass', 'reparandum', 'det', 'discourse', 'vocative', 'flat', 'csubj:pass', 'obl', 'obl:tmod', 'punct', 'compound:prt', 'csubjpass', 'nummod', 'mwe', 'csubj', 'list', 'nmod:poss', 'advcl', 'obl:npmod', 'dislocated', 'orphan', 'expl', 'acl:relcl', 'nmod:npmod', 'goeswith', 'det:predet', 'case', 'parataxis', None ])
    def chat(self, question, chat_settings):
        if chat_settings.enable_auto_punctuation:
            question = Vocabulary.auto_punctuate(question)
        question = Vocabulary.clean_text(
            question,
            normalize_words=chat_settings.inference_hparams.normalize_words)
        question = self.input_vocabulary.words2ints(question)
        question_with_history = []
        for i in range(len(self.conversation_history)):
            question_with_history += self.conversation_history[i] + [
                self.input_vocabulary.eos_int()
            ]
        question_with_history += question

        # Get the answer prediction.
        batch = np.zeros((1, len(question_with_history)))
        batch[0] = question_with_history
        # + 1 since the EOS token is counted as a timestep.
        max_output_sequence_length = \
            chat_settings.inference_hparams.max_answer_words + 1
        predicted_answer_info = self.predict_batch(
            inputs=batch,
            input_sequence_length=np.array([len(question_with_history)]),
            max_output_sequence_length=max_output_sequence_length,
            beam_length_penalty_weight=chat_settings.inference_hparams.
            beam_length_penalty_weight,
            sampling_temperature=chat_settings.inference_hparams.
            sampling_temperature,
            log_summary=chat_settings.inference_hparams.log_summary)

        # Read the answer prediction.
        answer_beams = []
        if self.beam_width > 0:
            # For beam search decoding: if show_all_beams is enabled, output
            # all beams (sequences); otherwise take the first beam. The beams
            # (in the "predictions" matrix) are ordered with the highest
            # ranked beams first.
            beam_count = 1 if not chat_settings.show_all_beams else len(
                predicted_answer_info["predictions_seq_lengths"][0])
            for i in range(beam_count):
                # -1 to exclude the EOS token.
                predicted_answer_seq_length = predicted_answer_info[
                    "predictions_seq_lengths"][0][i] - 1
                predicted_answer = predicted_answer_info["predictions"][
                    0][:predicted_answer_seq_length, i].tolist()
                answer_beams.append(predicted_answer)
        else:
            # For greedy / sampling decoding: only one beam (sequence) is
            # returned, based on the argmax for greedy decoding or the
            # sampling distribution for sampling decoding. Return this beam.
            beam_count = 1
            # -1 to exclude the EOS token.
            predicted_answer_seq_length = predicted_answer_info[
                "predictions_seq_lengths"][0] - 1
            predicted_answer = predicted_answer_info["predictions"][
                0][:predicted_answer_seq_length].tolist()
            answer_beams.append(predicted_answer)

        # Add new conversation steps to the end of the history and trim from
        # the beginning if it is longer than conv_history_length. Answers
        # need to be converted from output_vocabulary ints to
        # input_vocabulary ints (since they will be fed back in to the
        # encoder).
        self.conversation_history.append(question)
        answer_for_history = self.output_vocabulary.ints2words(
            answer_beams[0], is_punct_discrete_word=True, capitalize_i=False)
        answer_for_history = self.input_vocabulary.words2ints(
            answer_for_history)
        self.conversation_history.append(answer_for_history)
        self.trim_conversation_history(
            chat_settings.inference_hparams.conv_history_length)

        # Convert the answer(s) to text and return.
        answers = []
        for i in range(beam_count):
            answer = self.output_vocabulary.ints2words(answer_beams[i])
            answers.append(answer)

        q_with_hist = None if not chat_settings.show_question_context else \
            self.input_vocabulary.ints2words(question_with_history)
        if chat_settings.show_all_beams:
            return q_with_hist, answers
        else:
            return q_with_hist, answers[0]
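# Hypothetical driver for chat() above (the `chatbot` and `chat_settings`
# objects come from the model/settings snippets elsewhere in this file):
question_context, answer = chatbot.chat("How are you today?", chat_settings)
print(answer)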
class Model(object): def __init__(self): self.vocab = Vocabulary() self.language_module = dataset.LSTMLanguageModule( message_flags.flattened_message_size(), self.vocab.get_vocab_size()).to(device) self.decoder = dataset.Decoder( message_flags.flattened_message_size()).to(device) all_params = list(self.language_module.parameters()) + list( self.decoder.parameters()) self.optimizer = optim.Adam(all_params, weight_decay=1e-5) self.training_examples = [] def predict(self, state, command): self.language_module.eval() self.decoder.eval() token_ids = self.vocab.token_ids(command) command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device) state_variable = dataset.state_to_variable(state).to(device) encoder_output = self.language_module.forward(command_variable) decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation( encoder_output) prediction = self.decoder.forward(state_variable, decoder_input) return dataset.output_from_variable(prediction, state) def optimizer_step(self): self.language_module.train() self.decoder.train() random.shuffle(self.training_examples) for state, command, target in self.training_examples: self.optimizer.zero_grad() state_variable = dataset.state_to_variable(state).to(device) target_variable = dataset.output_to_variable(target, state).to(device) token_ids = self.vocab.token_ids(command) command_variable = torch.LongTensor(token_ids).unsqueeze(0).to( device) encoder_output = self.language_module.forward(command_variable) decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation( encoder_output) prediction = self.decoder.forward(state_variable, decoder_input, target_variable) loss = dataset.loss(prediction, target_variable) loss.backward() self.optimizer.step() def training_accuracy(self): n_correct = 0 for state, command, target in self.training_examples: prediction = self.predict(state, command) if prediction == target: n_correct += 1 return n_correct / len(self.training_examples) def update(self, state, command, target_output, num_updates=None): if num_updates is None: num_updates = FLAGS.baseline_max_updates self.training_examples.append((state, command, target_output)) for _ in range(num_updates): self.optimizer_step()
    def rewrite(self):
        """Rewrite the flight according to self.vocabulary (a Vocabulary)."""
        rw = []
        for part in self.vocabulary.getPartitions():
            for partelt in part.getModalities():
                val = self.getValue(part.getAttName())
                mu = partelt.getMu(val)
                rw.append(mu)
        return rw

    def satisfaisant(self, conditions):
        """Return True if the flight satisfies every (attribute, modality,
        threshold) condition."""
        for condition in conditions:
            part = self.vocabulary.getPartition(condition[0])
            partelt = part.getModality(condition[1])
            val = self.getValue(part.getAttName())
            mu = partelt.getMu(val)
            if mu < condition[2]:
                return False
        return True


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python flight.py <vocfile.csv>")
    else:
        if os.path.isfile(sys.argv[1]):
            voc = Vocabulary(sys.argv[1])
            line = "2008,1,3,4,1103,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA"
            f = Flight(line, voc)
            print(f.rewrite())
if __name__ == '__main__':
    arguments = parse_args()

    logger.info('Loading config')
    with open(arguments.config) as config_file:
        # safe_load avoids executing arbitrary tags in the YAML config.
        config = yaml.safe_load(config_file)

    logger.info('Initializing input stream')
    input_stream = LineSentence(
        arguments.corpus,
        max_sentence_length=config['sliding_window']['change_every_words'])

    min_word_freq = config['vocabulary']['min_freq']
    logger.info('Building vocabulary with min_freq={}'.format(min_word_freq))
    vocab = Vocabulary.from_documents(input_stream, min_word_freq)
    vocabulary_size = len(vocab)
    logger.info('Vocabulary size: {}'.format(vocabulary_size))

    logger.info('Building negative sampling distribution')
    negative_sampler = HierarchicalSampler(
        vocab=vocab,
        alpha=config['negative_sampling']['alpha'],
        chunks_num=config['negative_sampling']['vocab_chunks_num'])

    logger.info('Building model computation graph')
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=config['training_params']['initial_learning_rate'])
    negative_samples_num = config['sliding_window']['max_size'] * \