def displayFirstNExamples(self, n): if self.src_window < 0: return src_vocab, src_vocab_size = io_vocab.load_vocab(self.src_vocab_file) tgt_vocab, tgt_vocab_size = io_vocab.load_vocab(self.tgt_vocab_file) src_inverse_vocab = io_vocab.inverse_vocab(src_vocab) tgt_inverse_vocab = io_vocab.inverse_vocab(tgt_vocab) assert(n <= self.chunk_size) for i in xrange(n): example_x = self.data_x[i] example_y = self.data_y[i] sent_idx = example_x[-1] src_sent_vector = self.data_sm[sent_idx] src_sent_length = src_sent_vector[0] src_sent_vector = src_sent_vector[1:src_sent_length+1] src_window_vector = example_x[:self.src_window*2 + 1] tgt_gram_vector = example_x[self.src_window*2 + 1:-1] src_sent_words = io_vocab.getWordsFromIndeces(src_sent_vector, src_inverse_vocab, self.tgt_vocab_size) src_window_words = io_vocab.getWordsFromIndeces(src_window_vector, src_inverse_vocab, self.tgt_vocab_size) tgt_gram_words = io_vocab.getWordsFromIndeces(tgt_gram_vector, tgt_inverse_vocab, 0) output = "" count = 0 for w in src_window_words: count += 1 if count == self.src_window + 1: output += "[" + w + "] " else: output += w + " " output += "|| " output += " ".join(tgt_gram_words) + " " output += "===> " + tgt_inverse_vocab[example_y] output += " |||| " output += " ".join(src_sent_words) + " " print output
def get_tfidf(input_file, vocab_file):
    """Compute tf-idf statistics for the vocabulary over a corpus file.

    Args:
        input_file: path to a text corpus, one document/sentence per line.
        vocab_file: path to a vocab file loadable by io_vocab.load_vocab.

    Returns:
        The tf-idf map produced by compute_tfidf (word -> tf-idf score).

    Fixes over the previous version:
      - removed leftover debug code (`print vocab_map` followed by `exit()`)
        that terminated the whole process and made everything after it
        unreachable;
      - the input file is now closed deterministically via `with`;
      - the computed tfidf_map is returned instead of being discarded.
    """
    (vocab_map, vocab_size) = io_vocab.load_vocab(vocab_file)
    tf_map, df_map = initialize_tfidf(vocab_map)
    # Stream the corpus line by line, accumulating term and document
    # frequencies; `with` guarantees the handle is closed even on error.
    with open(input_file, 'r') as infile:
        for line in infile:
            update_tfidf(line, tf_map, df_map)
    tfidf_map = compute_tfidf(tf_map, df_map)
    return tfidf_map
# all the global context (sentence) will be extended to this length to # ensure a uniform length max_src_sent_length = args.sentence_vector_length # often around 100 #################################### # LOAD VACAB # <words> is a list of words as in string # <vocab_map> is a dict mapping from word string to integer number of 1,2,...|Vocab| # <vocab_size> is the size of vocab == len(words) == len(vocab_map). src_vocab_file = args.vocab_file + '.' + \ str(args.vocab_size) + '.vocab.' + src_lang tgt_vocab_file = args.vocab_file + '.' + \ str(args.vocab_size) + '.vocab.' + tgt_lang (src_vocab_map, src_vocab_size) = io_vocab.load_vocab( src_vocab_file) (tgt_vocab_map, tgt_vocab_size) = io_vocab.load_vocab( tgt_vocab_file) ####################################### # LOAD VALID NGRAMS, LOAD TEST NGRAMS # <valid_set_x> is a list of list, each of the list in valid_set_x is a n-gram of word, each word is represented by an integer # for e.g. [128, 11, 13, 33, 17, 22, 0, 0, 11, 3] # <valid_set_y> is a list of integers each represent a next-word following the list of word in valid_set_x src_valid_file = args.valid_file + '.' + \ str(args.vocab_size) + '.id.' + src_lang tgt_valid_file = args.valid_file + '.' + \ str(args.vocab_size) + '.id.' + tgt_lang # valid_set_sm is the sentence matrix (valid_set_x, valid_set_y, valid_set_sm) = io_read_ngram.get_all_joint_ngrams_with_src_global_matrix(src_valid_file, tgt_valid_file, args.valid_file + '.align',