def get_complex_words(tokenised_string):
    """Score a list of tokens with the module-level ``model``.

    The tokens are written to ``./complex_word.txt`` (tab-separated, the
    token in the first column and a constant ``'N'`` in the second), read
    back with ``experiment.read_input_files`` and pushed through
    ``model.process_batch`` in inference mode.

    Args:
        tokenised_string: Tokens to score; stored as the ``word`` column of
            the temporary input file.

    Returns:
        list: Element ``[1]`` of every entry in ``predicted_probs[0]`` for the
        last batch that was processed. An empty list is returned when the
        input produced no batches.
    """
    input_path = './complex_word.txt'

    dataframe = pd.DataFrame()
    dataframe['word'] = tokenised_string
    dataframe['binary'] = 'N'
    dataframe.to_csv(input_path, sep='\t', index=False, header=False, quotechar=' ')

    sentences_test = experiment.read_input_files(input_path)
    batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
        sentences_test, config["batch_equal_size"], config['max_batch_size'])

    # Stays empty when there are no batches, so the function returns [] rather
    # than failing on an unbound name.
    first_sentence_probs = []
    for sentence_ids_in_batch in batches_of_sentence_ids:
        batch = [sentences_test[i] for i in sentence_ids_in_batch]
        _, predicted_labels, predicted_probs = model.process_batch(
            batch, is_training=False, learningrate=0.0)

        # Explicit comparison instead of assert-inside-try: an assert is
        # removed under ``python -O``. A mismatch is reported but, as before,
        # does not abort the run.
        if len(sentence_ids_in_batch) != len(predicted_labels):
            print('cw error')

        # NOTE(review): only the first sentence of the batch is used;
        # presumably the temporary file always holds one sentence - confirm.
        first_sentence_probs = predicted_probs[0]

    return [prob_pair[1] for prob_pair in first_sentence_probs]
def get_prob_labels(self):
    """Run ``self.model`` over ``self.temp_file`` and collect probabilities.

    Returns:
        list | str: Element ``[1]`` of every entry in ``predicted_probs[0]``
        for the last batch that was processed, or an empty list when there
        were no batches. The string ``'error'`` is returned when the input
        cannot be read or batched, or when a batch yields a different number
        of label sequences than sentences.
    """
    try:
        sentences_test = experiment.read_input_files(self.temp_file)
        batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
            sentences_test, self.config["batch_equal_size"], self.config['max_batch_size'])
    except Exception:
        # Callers rely on the 'error' sentinel. ``Exception`` (not a bare
        # except) keeps KeyboardInterrupt/SystemExit propagating.
        return 'error'

    # Stays empty when there are no batches, so the method returns [] rather
    # than failing on an unbound name.
    first_sentence_probs = []
    for sentence_ids_in_batch in batches_of_sentence_ids:
        batch = [sentences_test[i] for i in sentence_ids_in_batch]
        _, predicted_labels, predicted_probs = self.model.process_batch(
            batch, is_training=False, learningrate=0.0)

        # Explicit comparison instead of assert-inside-try: an assert is
        # removed under ``python -O``, which would silently skip the check.
        if len(sentence_ids_in_batch) != len(predicted_labels):
            return 'error'

        # NOTE(review): only the first sentence of the batch is used;
        # presumably the temporary file always holds one sentence - confirm.
        first_sentence_probs = predicted_probs[0]

    return [prob_pair[1] for prob_pair in first_sentence_probs]
def get_dataframe():
    """Run the module-level ``model`` over ``temp_file`` and tabulate output.

    Returns:
        pandas.DataFrame: Columns ``index`` (sentence ids), ``sentences``,
        ``labels`` and ``probs`` for the last batch that was processed. When
        the input produced no batches an empty frame with those four columns
        is returned.
    """
    sentences_test = experiment.read_input_files(temp_file)
    batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
        sentences_test, config["batch_equal_size"], config['max_batch_size'])

    last_batch = None
    for sentence_ids_in_batch in batches_of_sentence_ids:
        batch = [sentences_test[i] for i in sentence_ids_in_batch]
        _, predicted_labels, predicted_probs = model.process_batch(
            batch, is_training=False, learningrate=0.0)

        # Explicit comparison instead of assert-inside-try: an assert is
        # removed under ``python -O``. A mismatch is reported but, as before,
        # does not abort the run.
        if len(sentence_ids_in_batch) != len(predicted_labels):
            print('batch size error')

        last_batch = (sentence_ids_in_batch, batch, predicted_labels, predicted_probs)

    if last_batch is None:
        # No batches at all: return an empty table with the usual columns
        # instead of failing on unbound names.
        return pd.DataFrame(columns=['index', 'sentences', 'labels', 'probs'])

    sentence_ids, sentences, labels, probs = last_batch

    # The per-token probability list that used to be built here was never
    # read, so it has been dropped; the raw ``predicted_probs`` are stored.
    annotated_sentences = pd.DataFrame()
    annotated_sentences['index'] = sentence_ids
    annotated_sentences['sentences'] = sentences
    annotated_sentences['labels'] = labels
    annotated_sentences['probs'] = probs
    return annotated_sentences
import sys

from model import MLTModel
from evaluator import MLTEvaluator
from experiment import read_input_files

if __name__ == "__main__":
    # argv[1]: path handed to MLTModel.load; argv[2]: input data file.
    model = MLTModel.load(sys.argv[1])
    data = read_input_files(sys.argv[2], -1)
    batch_size = 32

    # Collects the per-batch outputs so a summary can be reported at the end.
    evaluator = MLTEvaluator(model.config)

    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        cost, sentence_scores, token_scores_list = model.process_batch_inference(
            batch, False, 0.0)

        # One stdout row per token: the token's fields joined by spaces, then
        # its token score and the score of the sentence it belongs to.
        for sent_idx, sentence in enumerate(batch):
            for tok_idx, token_fields in enumerate(sentence):
                token_text = " ".join(map(str, token_fields))
                token_score = str(token_scores_list[0][sent_idx][tok_idx])
                sentence_score = str(sentence_scores[sent_idx])
                print("\t".join((token_text, token_score, sentence_score)))
            # An empty line terminates every sentence.
            print()

        evaluator.append_data(cost, batch, sentence_scores, token_scores_list)

    # The summary goes to stderr so it does not mix with the predictions.
    results = evaluator.get_results("test")
    for key in results:
        sys.stderr.write("".join((key, ": ", str(results[key]), "\n")))
def print_predictions(print_probs, model_path, input_file):
    """Label ``input_file`` with a saved model and print one row per token.

    Each sentence read by ``experiment.read_input_files`` is concatenated
    (``axis=1``) with the extra features loaded through
    ``experiment.load_sentence_id`` before being passed to the model. The
    input file is then re-read line by line and, for every non-empty line,
    ``<col1>\\t<col2>\\tNaN\\t<prediction>`` is printed, where ``col1`` and
    ``col2`` are the first two tab-separated fields of that line. Every empty
    input line produces an empty output line. Timing information is written
    to stderr.

    Args:
        print_probs: If truthy, the prediction column holds tab-separated
            ``label:probability`` pairs for every label; otherwise it holds
            the predicted label only.
        model_path: Path passed to ``labeler.SequenceLabeler.load``.
        input_file: Tab-separated input file, sentences separated by empty
            lines. Every non-empty line needs at least two fields.

    Raises:
        AssertionError: If the model output does not line up with the
            sentences/tokens found in ``input_file``.
    """
    time_loading = time.time()
    model = labeler.SequenceLabeler.load(model_path)
    time_noloading = time.time()

    config = model.config
    num_additional_features = config['num_additional_features']
    num_additional_feature_vectors = config.get('num_additional_feature_vectors', 1)

    # Reverse lookup: label id -> label string.
    id2label = {model.label2id[label]: label for label in model.label2id}

    sentences_test = experiment.read_input_files(input_file)
    batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
        sentences_test, config["batch_equal_size"], config['max_batch_size'])
    feature_path = experiment.read_input_features(input_file, 'models/features/')

    # str(sentence id) -> list with one prediction string per token.
    predictions_cache = {}
    for sentence_ids_in_batch in batches_of_sentence_ids:
        batch = [
            numpy.concatenate(
                (
                    sentences_test[i],
                    experiment.load_sentence_id(
                        feature_path, i, num_additional_features,
                        num_additional_feature_vectors),
                ),
                axis=1)
            for i in sentence_ids_in_batch
        ]
        cost, predicted_labels, predicted_probs = model.process_batch(
            batch, is_training=False, learningrate=0.0)
        assert len(sentence_ids_in_batch) == len(predicted_labels)

        for i, batch_sentence_id in enumerate(sentence_ids_in_batch):
            # Truthiness instead of ``== True`` / ``== False``: previously a
            # value equal to neither left the predictions empty and tripped
            # an unrelated assertion further down.
            if print_probs:
                predictions = [
                    "\t".join(
                        str(id2label[k]) + ":" + str(prob)
                        for k, prob in enumerate(token_probs)
                    ).strip()
                    for token_probs in predicted_probs[i]
                ]
            else:
                predictions = [id2label[label_id] for label_id in predicted_labels[i]]
            predictions_cache[str(batch_sentence_id)] = predictions

    # Second pass over the raw file so the output keeps the input's layout.
    sentence_id = 0
    word_id = 0
    with open(input_file, "r") as f:
        for line in f:
            stripped = line.strip()

            if not stripped:
                print("")
                # Repeated empty lines do not start a new sentence.
                if word_id == 0:
                    continue
                predicted_count = len(predictions_cache[str(sentence_id)])
                assert predicted_count == word_id, str(predicted_count) + " " + str(word_id)
                sentence_id += 1
                word_id = 0
                continue

            key = str(sentence_id)
            assert key in predictions_cache
            assert len(predictions_cache[key]) > word_id
            t, g, *_ = stripped.split('\t')
            print('{}\t{}\tNaN\t{}'.format(t, g, predictions_cache[key][word_id].strip()))
            word_id += 1

    sys.stderr.write("Processed: " + input_file + "\n")
    sys.stderr.write("Elapsed time with loading: " + str(time.time() - time_loading) + "\n")
    sys.stderr.write("Elapsed time without loading: " + str(time.time() - time_noloading) + "\n")