def __init__(self, input_file_path: str, gold_file_path: str, max_char_len: int = 256,
             is_crf: bool = False, TASK: str = 'BIS'):
    configure_workspace()
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.max_len = max_char_len
    self.parse_dataset(input_file_path, gold_file_path)
    self.get_unigrams()
    self.create_vocabulary(is_crf)
    self.encode_labels(TASK)
    self.encoded_data = None
    self.vocab_size = len(self.char2idx)
    self.out_vocab_size = len(self.label2idx)
    print(f'Input Vocabulary Size: {self.vocab_size}')
    print(f'Output Vocabulary Size: {self.out_vocab_size}')
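# --- Hedged sketch (not part of the original class) ---
# Minimal illustration of how the character vocabulary built above might be
# used to turn one raw character sequence into a fixed-length index tensor of
# size max_char_len. The helper name `encode_chars` and the '<PAD>'/'<UNK>'
# keys are assumptions for illustration, not this repository's actual API;
# torch is assumed to be imported at module level, as in the constructor.
def encode_chars(chars, char2idx, max_char_len):
    pad_idx = char2idx.get('<PAD>', 0)
    unk_idx = char2idx.get('<UNK>', 1)
    # Map each character to its index, truncating to the maximum length and
    # falling back to the unknown index for out-of-vocabulary characters.
    indices = [char2idx.get(c, unk_idx) for c in chars[:max_char_len]]
    # Right-pad so every sample has the same length before batching.
    indices += [pad_idx] * (max_char_len - len(indices))
    return torch.tensor(indices, dtype=torch.long)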
print("==========Validation Dataset==========") dev_file_path = join(DATA_PATH, 'dev.tsv') validation_set = TSVDatasetParser(dev_file_path, max_len=80, is_crf=crf_model) validation_set.encode_dataset(training_set.word2idx, training_set.labels2idx, training_set.char2idx) print("==========Testing Dataset==========") test_file_path = join(DATA_PATH, 'test.tsv') testing_set = TSVDatasetParser(test_file_path, max_len=80, is_crf=crf_model) testing_set.encode_dataset(training_set.word2idx, training_set.labels2idx, training_set.char2idx) return training_set, validation_set, testing_set if __name__ == '__main__': RESOURCES_PATH = join(getcwd(), 'resources') configure_workspace(seed=1873337) crf_model = False train_dataset, dev_dataset, test_dataset = prepare_data(crf_model) batch_size = 64 pretrained_embeddings = None embeddings_path = join(RESOURCES_PATH, 'wiki.en.vec') pretrained_embeddings = load_pretrained_embeddings(embeddings_path, train_dataset.word2idx, 300, is_crf=crf_model) name_ = 'LSTM' hp = HyperParameters(name_, train_dataset.word2idx, train_dataset.labels2idx, pretrained_embeddings,
    dataset.update(train_x=bert_inputs, train_y=train_labels, tokenizer=tokenizer)
    return dataset


if __name__ == '__main__':
    # Initialize the TF1 session and set the logging verbosity.
    sess = tf.Session()
    tf.logging.set_verbosity(tf.logging.INFO)

    params = parse_args()
    config_params = configure_workspace()
    elmo = config_params["use_elmo"]

    # Build BERT-ready inputs plus the vocabulary/output sizes needed by the model.
    dataset = process_bert_data()
    vocabulary_size = dataset.get("vocabulary_size")
    output_size = dataset.get("output_size")
    pos_vocab_size = dataset.get("pos_vocab_size")
    lex_vocab_size = dataset.get("lex_vocab_size")

    # Multitask model predicting senses, POS tags and lexicographer classes.
    model = multitask_attention_model(output_size, pos_vocab_size, lex_vocab_size, config_params)
    # max_seq_len = 512
    # model = attention_model(output_size, max_seq_len, config_params)

    # Instantiate variables
    initialize_vars(sess)
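# --- Hedged sketch (not the repository's actual helper) ---
# `initialize_vars(sess)` is called above but its body is not part of this
# excerpt. In TF1 scripts that mix a tf.Session with Keras/TF-Hub layers it
# commonly looks like the sketch below; treat this as an assumption about its
# shape rather than the original implementation.
def initialize_vars(sess):
    # Initialise local/global variables and the lookup tables created by the
    # graph (e.g. by a TF-Hub BERT or ELMo module), then register the session
    # with Keras so subsequent model.fit() calls run inside it.
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    tf.keras.backend.set_session(sess)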
    # Fetch the ground truth for the evaluation data.
    ground_truth = []
    ground_truth_path = input_path.replace("data.xml", "gold.key.txt")
    with open(ground_truth_path, encoding='utf-8', mode='r') as ground_truth_file:
        lines = ground_truth_file.read().splitlines()
        for line in lines:
            sense_key = line.split()[1]
            ground_truth.append(sense_key)

    # Compute the micro-averaged F1 score.
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')


def predictions_scorer():
    cmd = f"javac {os.path.join(os.getcwd(), 'resources', 'scorer.java')}"
    process = Popen(cmd.split(), stdout=PIPE)
    outs, errs = process.communicate()
    print(outs, errs)
    # TODO: COMPLETE IMPLEMENTATION OF FUNCTION SCORER


if __name__ == '__main__':
    configure_workspace()
    input_path = os.path.join(os.getcwd(), 'data', 'evaluation', 'multilingual_eval', 'semeval2013.de.data.xml')
    output_path = os.path.join(os.getcwd(), 'resources', 'multilingual_output.txt')
    resources_path = os.path.join(os.getcwd(), 'resources')
    lang = 'en'
    predict_multilingual(input_path, output_path, resources_path, lang)
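# --- Hedged sketch (not the repository's implementation) ---
# The TODO above leaves predictions_scorer unfinished. Assuming the standard
# WSD evaluation framework scorer, which is typically invoked as
# `java Scorer <gold_keys> <predicted_keys>`, a possible completion looks like
# the following. The class name `Scorer` and the key-file arguments are
# assumptions; adjust them to whatever scorer.java actually defines. os, Popen
# and PIPE are already imported by this module.
def run_java_scorer(gold_keys_path, predictions_path, resources_path):
    # Compile the scorer once, then run it on the gold and predicted key files.
    compile_cmd = f"javac {os.path.join(resources_path, 'scorer.java')}"
    Popen(compile_cmd.split(), stdout=PIPE, stderr=PIPE).communicate()
    score_cmd = f"java -cp {resources_path} Scorer {gold_keys_path} {predictions_path}"
    outs, errs = Popen(score_cmd.split(), stdout=PIPE, stderr=PIPE).communicate()
    print(outs.decode(), errs.decode())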