Example #1
import time
import numpy as np

# Project-specific modules assumed by this example: MLP, eval (the evaluator
# module), VocabManager, NBestList, U (windowing utilities), and L (logger).

def augment(model_path, input_nbest_path, vocab_path, output_nbest_path):
    """Append a neural n-gram log-probability feature to each hypothesis."""
    classifier = MLP(model_path=model_path)
    evaluator = eval.Evaluator(None, classifier)

    vocab = VocabManager(vocab_path)

    ngram_size = classifier.ngram_size

    def get_ngrams(tokens):
        # Pad with start symbols so the first word has a full left context;
        # optionally close the hypothesis with an end symbol.
        for _ in range(ngram_size - 1):
            tokens.insert(0, '<s>')
        if vocab.has_end_padding:
            tokens.append('</s>')
        indices = vocab.get_ids_given_word_list(tokens)
        return U.get_all_windows(indices, ngram_size)

    input_nbest = NBestList(input_nbest_path, mode='r')
    output_nbest = NBestList(output_nbest_path, mode='w')

    L.info('Augmenting: ' + input_nbest_path)

    start_time = time.time()

    cache = dict()
    for group in input_nbest:
        # Collect every n-gram in this group that has not been scored yet.
        ngram_list = []
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            for ngram in ngrams:
                if str(ngram) not in cache:  # dict.has_key() is Python 2 only
                    ngram_list.append(ngram)
                    cache[str(ngram)] = 1000  # placeholder until scored below
        if len(ngram_list) > 0:
            # Score all new n-grams in one batch: the first n-1 columns are
            # the context words, the last column is the predicted word.
            ngram_array = np.asarray(ngram_list, dtype='int32')
            ngram_log_prob_list = evaluator.get_ngram_log_prob(
                ngram_array[:, 0:-1], ngram_array[:, -1])
            for i in range(len(ngram_list)):
                cache[str(ngram_list[i])] = ngram_log_prob_list[i]
        for item in group:
            tokens = item.hyp.split()
            ngrams = get_ngrams(tokens)
            # Sum the cached log-probabilities and append them as a feature.
            sum_ngram_log_prob = 0
            for ngram in ngrams:
                sum_ngram_log_prob += cache[str(ngram)]
            item.append_feature(sum_ngram_log_prob)
            output_nbest.write(item)
    output_nbest.close()

    L.info("Ran for %.2fs" % (time.time() - start_time))
Example #2
	# Read up to `threads` n-best groups; clear `flag` to end the outer loop
	# once the input is exhausted.
	group_list = []
	for i in range(args.threads):
		try:
			group_list.append(next(input_nbest))  # .next() is Python 2 only
		except StopIteration:
			flag = False
	if len(group_list) > 0:
		# Score the groups in parallel, then emit results in input order.
		outputs = pool.map(process_group, group_list)
		for i in range(len(group_list)):
			scores = outputs[i]
			group = group_list[i]
			# Hypothesis indices, best score first.
			sorted_indices = sorted(scores, key=scores.get, reverse=True)
			if args.out_scores_path:
				for idx in scores:
					output_scores.write(str(group.group_index) + ' ' +
						str(idx) + ' ' + str(scores[idx]) + "\n")
			if args.out_nbest_path:
				for idx in sorted_indices:
					output_nbest.write(group[idx])
			# The highest-scoring hypothesis becomes the 1-best output.
			output_1best.write(group[sorted_indices[0]].hyp + "\n")
		counter += 1
		group_counter += len(group_list)
		if counter % 5 == 0:
			L.info("%i groups processed" % group_counter)
L.info("Finished processing %i groups" % (group_counter))

if args.out_scores_path:
	output_scores.close()
if args.out_nbest_path:
	output_nbest.close()
output_1best.close()
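
Example #2 is an excerpt from the middle of a rescoring script: args, pool,
process_group, flag, the counters, and the output handles are all defined
earlier in the file. A minimal sketch of the setup the excerpt appears to
assume, reconstructed only from the names it uses (every default, path, and
flag spelling below is an assumption):

# Sketch of the assumed surrounding scaffolding for Example #2.
import argparse
import multiprocessing

parser = argparse.ArgumentParser()
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--out-scores-path', dest='out_scores_path')
parser.add_argument('--out-nbest-path', dest='out_nbest_path')
args = parser.parse_args()

# Worker pool that runs process_group (defined elsewhere) on each group.
pool = multiprocessing.Pool(args.threads)

input_nbest_path = 'data/dev.nbest'    # assumed placeholder path
out_1best_path = 'data/dev.1best'      # assumed placeholder path
input_nbest = NBestList(input_nbest_path, mode='r')
output_1best = open(out_1best_path, 'w')
if args.out_scores_path:
    output_scores = open(args.out_scores_path, 'w')
if args.out_nbest_path:
    output_nbest = NBestList(args.out_nbest_path, mode='w')

counter = 0
group_counter = 0
flag = True
while flag:
    # ... the body shown in Example #2 runs here ...
    pass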