def main(argv=None):
    """Count whitespace-separated tokens of a text file.

    See the argument parser description for the command line options.

    Every line of ``args.infile`` is stripped, optionally lowercased
    (``args.lowercase``) and split on whitespace. The resulting counts are
    written to ``args.outfile`` as ``<token>\\t<count>`` lines, in the order
    returned by ``sort_dict_by_label(counter, True)``.

    Parameters
    ----------
    argv : list(str), optional
        command line arguments; defaults to ``sys.argv[1:]``
    """
    options = parser.parse_args(sys.argv[1:] if argv is None else argv)
    log.info('start parameters: ' + str(options))

    token_counts = Counter()

    with utf8_file_open(options.infile, 'r') as infile:
        for raw_line in infile:
            text = raw_line.strip()

            if options.lowercase:
                text = text.lower()

            tokens = text.strip().split()
            token_counts.update(tokens)

    with utf8_file_open(options.outfile, 'w') as outfile:
        for (token, frequency) in sort_dict_by_label(token_counts, True):
            outfile.write(u'%s\t%i\n' % (token, frequency))

    log.info('finished')
def write_vocabulary_file(output_file, vocab):
    """Write the given vocabulary to the given file.

    The vocabulary items are stored in order of the vocab values, i.e., in the
    same order as they have been read by read_vocabulary_id_file. One item is
    written per line and the file always ends with a newline.

    Parameters
    ----------
    output_file : str
        filename of the output
    vocab : dict(str, int)
        vocabulary that has been read by read_vocabulary_id_file
    """
    with utf8_file_open(output_file, 'w') as vocab_file:
        # Only the first element (the word) of every sorted entry is written.
        words = [entry[0] for entry in sort_dict_by_label(vocab)]

        # Use unicode literals: the stream is a UTF-8 text stream, and byte
        # string separators would rely on implicit ASCII coercion (and for an
        # empty vocabulary the joined result would be a byte string as well).
        vocab_file.write(u'\n'.join(words))
        vocab_file.write(u'\n')
def write_vocabulary_file(output_file, vocab):
    """Store a vocabulary in a text file, one item per line.

    The items are written in the order of their vocab values, which is the
    order in which read_vocabulary_id_file has read them. The output is
    terminated by a newline.

    Parameters
    ----------
    output_file : str
        filename of the output
    vocab : dict(str, int)
        vocabulary that has been read by read_vocabulary_id_file
    """
    with utf8_file_open(output_file, 'w') as vocab_file:
        ordered_entries = sort_dict_by_label(vocab)
        # keep only the word of every sorted entry
        words = [entry[0] for entry in ordered_entries]
        content = u'\n'.join(words)
        vocab_file.write(content)
        vocab_file.write(u'\n')
def main(argv=None):
    """Export the word embeddings of a stored model as text files.

    The model is loaded from ``args.model_file`` and the vocabulary from
    ``args.vocabulary``. Two files are written:

    * ``args.result_file + ".in"``  -- the input embeddings (``model.R`` for
      model type ``vlbl``; ``D . R`` for model type ``vlbl_dist``)
    * ``args.result_file + ".out"`` -- the output embeddings (``model.Q``)

    Every line has the form ``<word> <v1> <v2> ...`` where the values are the
    matrix row addressed by the word's vocabulary index.

    Parameters
    ----------
    argv : list(str), optional
        command line arguments; defaults to ``sys.argv[1:]``

    Raises
    ------
    ValueError
        if ``args.model_type`` is neither ``vlbl`` nor ``vlbl_dist``
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    model = load_object_from_file(args.model_file)

    # read vocabulary from file
    vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary))

    # get matrices from model
    r_matrix = model.R.get_value()
    q_matrix = model.Q.get_value()

    # get input embeddings
    if args.model_type == 'vlbl':
        in_we = r_matrix
    elif args.model_type == 'vlbl_dist':
        # this will not work with the old versions of models - because of sparsity
        d_matrix = model.D.get_value().todense()
        in_we = np.dot(d_matrix, r_matrix)
        # need to convert from numpy.matrix to numpy.ndarray; indexing a
        # numpy.matrix row yields a 2-d matrix again, which would not be
        # serialised value by value below
        in_we = in_we.view(type=np.ndarray)
    else:
        # Fail before any output file is created instead of hitting an
        # unbound ``in_we`` while writing.
        raise ValueError('unsupported model type: %r' % (args.model_type,))

    # Both files share the same layout; only suffix and matrix differ.
    for suffix, matrix in (('.in', in_we), ('.out', q_matrix)):
        with utf8_file_open(args.result_file + suffix, 'w') as outfile:
            for (word, ind) in vocab:
                outfile.write(
                    unicode(word) + u' ' + u' '.join(map(str, matrix[ind])) + u'\n')

    log.info('finished')
def _write_embeddings(filename, vocab, matrix):
    """Write one ``<word> <v1> <v2> ...`` line per (word, index) pair of vocab."""
    with utf8_file_open(filename, 'w') as outfile:
        for (word, ind) in vocab:
            outfile.write(unicode(word) + u' ' + u' '.join(map(str, matrix[ind])) + u'\n')


def main(argv=None):
    """Write the input and output embeddings of a stored model to text files.

    The model is loaded from ``args.model_file``, the vocabulary from
    ``args.vocabulary``. The input embeddings go to
    ``args.result_file + ".in"`` and the output embeddings (``model.Q``) to
    ``args.result_file + ".out"``. For model type ``vlbl`` the input
    embeddings are ``model.R``; for ``vlbl_dist`` they are the product of
    ``model.D`` and ``model.R``.

    Parameters
    ----------
    argv : list(str), optional
        command line arguments; defaults to ``sys.argv[1:]``

    Raises
    ------
    ValueError
        if ``args.model_type`` is neither ``vlbl`` nor ``vlbl_dist``
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    model = load_object_from_file(args.model_file)

    # read vocabulary from file
    vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary))

    # get matrices from model
    r_matrix = model.R.get_value()
    q_matrix = model.Q.get_value()

    # get input embeddings
    if args.model_type == 'vlbl':
        in_we = r_matrix
    elif args.model_type == 'vlbl_dist':
        # this will not work with the old versions of models - because of sparsity
        d_matrix = model.D.get_value().todense()
        in_we = np.dot(d_matrix, r_matrix)
        # need to convert from numpy.matrix to numpy.ndarray, otherwise a row
        # lookup returns a 2-d matrix instead of a flat vector of values
        in_we = in_we.view(type=np.ndarray)
    else:
        # Without this branch ``in_we`` would be unbound and the failure would
        # only surface after the ".in" file had already been truncated.
        raise ValueError('unsupported model type: %r' % (args.model_type,))

    _write_embeddings(args.result_file + ".in", vocab, in_we)
    _write_embeddings(args.result_file + ".out", vocab, q_matrix)

    log.info('finished')
def run(self):
    """Run the predictor on ``self.predict_file`` and write the results.

    One line is written to ``self.result_file`` for every batch yielded by
    ``self.next_batch``. Its content depends on the instance flags:

    * ``store_rank``    -- ranks of the vocabulary items (rank 1 = most similar)
    * ``store_argmax``  -- the vocabulary word with the highest probability
    * ``store_softmax`` -- the probability vector (root-normalised if
      ``normalize_with_root`` is set)
    * ``perplexity``    -- the probabilities of the target words, together
      with the words themselves if ``save_word`` is set
    * ``predictions``   -- the first row of the first prediction output

    With ``information`` set, a debug line is additionally printed to stdout.
    With ``perplexity`` set, the perplexity over all counted examples is
    logged at the end.
    """
    vocab = dict(self.vocab)

    # Get a mapping from index to word
    vocab_entries = sort_dict_by_label(vocab)
    vocab_entries = zip(*vocab_entries)[0]

    # Label -> word lookup, built once instead of scanning the whole
    # vocabulary for every target word. setdefault keeps the first word seen
    # for a label, which is what list.index() on the values would find.
    word_by_label = {}
    if self.perplexity and self.save_word:
        for word, label in zip(self.vocab.keys(), self.vocab.values()):
            word_by_label.setdefault(label, word)

    log_probabs = 0.
    num_ppl_examples = 0
    num_examples = 0

    with utf8_file_open(self.result_file, 'w') as outfile:
        for batch, _ in self.next_batch(self.predict_file):
            log_iterations(log, num_examples, 10000)
            num_examples += len(batch)

            if self.perplexity:
                batch = zip(*batch)
                # Pass only the context, not the target word
                predictions = self.predictor_method(batch[0])
            else:
                # NOTE(review): the result is discarded here, so every branch
                # below that reads ``predictions`` fails with a NameError when
                # ``perplexity`` is off -- confirm whether the result should
                # be kept.
                self.predictor_method(batch)

            if self.store_softmax or self.store_rank or self.store_argmax \
                    or self.information or self.perplexity:
                sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                    self._calc_probabilities_from_similarity(batch[1], predictions[1])
                num_ppl_examples += cur_num_ppl_examples

            if self.store_rank or self.information:
                # rankdata sorts ascending, i.e., distances, but we have
                # similarities, hence, 1-sm
                ranks = rankdata(1 - sm, method='min').astype(int)

                if self.store_rank:
                    outfile.write(ndarray_to_string(ranks))

                if self.information:
                    hard_idx = vocab[u'hard']
                    # The column of unique ranks is always emitted empty.
                    sorted_unique_ranks = ''
                    top_ten_entries = ' '.join(
                        [vocab_entries[i] for i in np.argsort(1 - sm)[:10]])
                    # Single parenthesised argument: valid as Python 2 print
                    # statement and as Python 3 print function.
                    print('#%d\t%s\t%s' % (ranks[hard_idx], sorted_unique_ranks,
                                           top_ten_entries))

            if self.store_argmax:
                maximum = np.argmax(sm)
                outfile.write(vocab_entries[maximum])

            if self.store_softmax:
                if self.normalize_with_root:
                    sm = np.sqrt(sm)
                    sm = sm / np.linalg.norm(sm, 2, axis=-1)

                outfile.write(ndarray_to_string(sm))

            if self.perplexity:
                if self.save_word:
                    indices_in_predict_vocab = [
                        self.vocab_mapping[target] for target in batch[1]]
                    indices_in_original_vocab = [
                        self.vocab_mapping_list[i] for i in indices_in_predict_vocab]
                    words = [word_by_label[i] for i in indices_in_original_vocab]
                    outfile.write(u'\n'.join(
                        "%s %s" % (x, y)
                        for x, y in zip(map(unicode, probabs), words)))
                else:
                    outfile.write(u'\n'.join(map(unicode, probabs)))

                # NaN must be detected by value: an identity comparison with
                # np.nan misses NaNs that result from a computation.
                if not np.isnan(cur_log_probabs):
                    log_probabs += cur_log_probabs

            if self.predictions:
                outfile.write(ndarray_to_string(predictions[0][0]))

            outfile.write(u'\n')

    if self.perplexity:
        if num_ppl_examples:
            ppl = np.exp(-1. / num_ppl_examples * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
        else:
            # Avoid a division by zero when nothing was counted.
            log.warning('Perplexity is undefined: no examples were evaluated')
def run(self):
    """Run the predictor on ``self.predict_file`` and write the results.

    One line is written to ``self.result_file`` for every batch yielded by
    ``self.next_batch``. Its content depends on the instance flags:

    * ``store_rank``    -- ranks of the vocabulary items (rank 1 = most similar)
    * ``store_argmax``  -- the vocabulary word with the highest probability
    * ``store_softmax`` -- the probability vector (root-normalised if
      ``normalize_with_root`` is set)
    * ``perplexity``    -- the probabilities of the target words, together
      with the words themselves if ``save_word`` is set
    * ``predictions``   -- the first row of the first prediction output

    With ``information`` set, a debug line is additionally printed to stdout.
    With ``perplexity`` set, the perplexity over all counted examples is
    logged at the end.
    """
    vocab = dict(self.vocab)

    # Get a mapping from index to word
    vocab_entries = sort_dict_by_label(vocab)
    vocab_entries = zip(*vocab_entries)[0]

    # Label -> word lookup, built once instead of scanning the whole
    # vocabulary for every target word. setdefault keeps the first word seen
    # for a label, which is what list.index() on the values would find.
    word_by_label = {}
    if self.perplexity and self.save_word:
        for word, label in zip(self.vocab.keys(), self.vocab.values()):
            word_by_label.setdefault(label, word)

    log_probabs = 0.
    num_ppl_examples = 0
    num_examples = 0

    with utf8_file_open(self.result_file, 'w') as outfile:
        for batch, _ in self.next_batch(self.predict_file):
            log_iterations(log, num_examples, 10000)
            num_examples += len(batch)

            if self.perplexity:
                batch = zip(*batch)
                # Pass only the context, not the target word
                predictions = self.predictor_method(batch[0])
            else:
                # NOTE(review): the result is discarded here, so every branch
                # below that reads ``predictions`` fails with a NameError when
                # ``perplexity`` is off -- confirm whether the result should
                # be kept.
                self.predictor_method(batch)

            if self.store_softmax or self.store_rank or self.store_argmax \
                    or self.information or self.perplexity:
                sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                    self._calc_probabilities_from_similarity(batch[1], predictions[1])
                num_ppl_examples += cur_num_ppl_examples

            if self.store_rank or self.information:
                # rankdata sorts ascending, i.e., distances, but we have
                # similarities, hence, 1-sm
                ranks = rankdata(1 - sm, method='min').astype(int)

                if self.store_rank:
                    outfile.write(ndarray_to_string(ranks))

                if self.information:
                    hard_idx = vocab[u'hard']
                    # The column of unique ranks is always emitted empty.
                    sorted_unique_ranks = ''
                    top_ten_entries = ' '.join(
                        [vocab_entries[i] for i in np.argsort(1 - sm)[:10]])
                    # Single parenthesised argument: valid as Python 2 print
                    # statement and as Python 3 print function.
                    print('#%d\t%s\t%s' % (ranks[hard_idx], sorted_unique_ranks,
                                           top_ten_entries))

            if self.store_argmax:
                maximum = np.argmax(sm)
                outfile.write(vocab_entries[maximum])

            if self.store_softmax:
                if self.normalize_with_root:
                    sm = np.sqrt(sm)
                    sm = sm / np.linalg.norm(sm, 2, axis=-1)

                outfile.write(ndarray_to_string(sm))

            if self.perplexity:
                if self.save_word:
                    indices_in_predict_vocab = [
                        self.vocab_mapping[target] for target in batch[1]]
                    indices_in_original_vocab = [
                        self.vocab_mapping_list[i] for i in indices_in_predict_vocab]
                    words = [word_by_label[i] for i in indices_in_original_vocab]
                    outfile.write(u'\n'.join(
                        "%s %s" % (x, y)
                        for x, y in zip(map(unicode, probabs), words)))
                else:
                    outfile.write(u'\n'.join(map(unicode, probabs)))

                # NaN must be detected by value: an identity comparison with
                # np.nan misses NaNs that result from a computation.
                if not np.isnan(cur_log_probabs):
                    log_probabs += cur_log_probabs

            if self.predictions:
                outfile.write(ndarray_to_string(predictions[0][0]))

            outfile.write(u'\n')

    if self.perplexity:
        if num_ppl_examples:
            ppl = np.exp(-1. / num_ppl_examples * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
        else:
            # Avoid a division by zero when nothing was counted.
            log.warning('Perplexity is undefined: no examples were evaluated')