def main():
    """CLI entry point: evaluate a trained sentiment model on a test split.

    Loads a model archive from --save, picks a hierarchical or flat dataset
    reader based on --config, predicts a rating for every test instance, and
    writes accuracy / macro-F1 plus per-document predictions to results.txt
    inside the --save directory.
    """
    parser = ArgumentParser()
    parser.add_argument('--lang', action='store')
    parser.add_argument('--config', action='store', default='configs/HAN.jsonnet')
    parser.add_argument('--save', action='store', default='experiments/models/HAN')
    parser.add_argument('--dataset', default="conllu.tar.gz")
    args = parser.parse_args()

    # Register project readers/models with AllenNLP's registry before loading.
    import_submodules("loader")
    import_submodules("models")

    test_path = "data/{0}/{1}/test".format(args.lang, args.dataset)
    model = load_archive(os.path.join(args.save, "model.tar.gz")).model

    # Hierarchical configs (HAN / "hier") need the hierarchical reader.
    if "HAN" in args.config or "hier" in args.config:
        reader = NorecReaderHierarchical()
    else:
        reader = NorecReader_Flat()
    p = Predictor(model, reader)

    predictions = []
    gold_labels = []
    # Collect per-document rows and join once at the end instead of the
    # original quadratic `output_text += ...` concatenation.
    rows = ["doc_id\tgold\tpred\tnum_sents\tnum_tokens"]
    for instance in reader.read(test_path):
        metadata = instance.fields['meta'].metadata
        try:
            pred = p.predict_instance(instance)['prediction']
        except Exception:
            # Best-effort fallback: if prediction fails, default to rating 1.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pred = 1
        predictions.append(pred)
        gold_label = instance["rating"].label
        gold_labels.append(gold_label)
        rows.append("{}\t{}\t{}\t{}\t{}".format(
            metadata["doc_id"], gold_label, pred,
            metadata["sentences"], metadata["tokens"]))

    acc = accuracy_score(gold_labels, predictions)
    f1 = f1_score(gold_labels, predictions, average="macro")
    print("Acc score: {0:.3f}\nF1 score: {1:.3f}\n".format(acc, f1))

    final_output = "Acc score: {0:.3f}\nF1 score: {1:.3f}\n\n".format(acc, f1)
    final_output += "\n".join(rows) + "\n"
    with open(os.path.join(args.save, "results.txt"), "w") as outfile:
        outfile.write(final_output)
def _read(self, file_path: str, annotator: Predictor = None) -> Iterator[Instance]: """ take a file name of an amazon dataset .tsv file review is on 13 column and the label is in and process the file and stream Instances """ with open(file_path) as f: for l in f: l = l.strip() l = l.split("\t") try: if len( l ) != 2: # check only two columns per line to avoid reading malformed lines continue # remove empty reviews in amazon dataset less than two letters if len(l[0].strip()) < 2: continue if l[1] not in "12345": continue except: continue # get the sentence text sentence = self.tokenize(l[0].strip().lower()) # Binarize the output usual score is from 1->5 # Make 1,2,3 negative -- 4,5 positive # (this is to accommodate the unbalance between positive and neg) if annotator is None: # get the label review score label = l[1] if self.binary_output: if label in "45": label = "1" else: label = "0" else: label = annotator.predict_instance( self.tokens_to_instance(sentence)) label = str(label["class"]) try: assert len(sentence) > 2 assert label in ["1", "0"] except AssertionError as e: continue yield self.tokens_to_instance(sentence, label)
# ---------------------------------------------------------------------------
# Evaluate a trained LSTM language model on a small validation segment set:
# rebuild the architecture, load the saved weights, then run the predictor
# over every validation instance.
# ---------------------------------------------------------------------------

# The language model was trained with GloVe vectors, but here we only need an
# embedder of the matching shape so the trained parameters can be loaded.
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})

# Encoder wrapping a unidirectional LSTM feature extractor.
# NOTE(review): "ed_ncoder_size" looks like a typo for "encoder_size" on the
# hyper-parameter object -- confirm against its definition before renaming.
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))

model = LanguageModel(vocab=vocabulary,
                      text_field_embedder=token_embedder,
                      contextualizer=contextualizer,
                      dropout=combination.dropout,
                      regularizer=RegularizerApplicator(
                          [('l2', L2Regularizer(alpha=combination.l2))]),
                      ) \
    .cuda(device)

# Close the checkpoint file deterministically: the original passed a bare
# open() into torch.load and leaked the handle.
with open(language_model_path, 'rb') as checkpoint:
    model.load_state_dict(torch.load(checkpoint), strict=True)

dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model, dataset_reader=dataset_reader)

val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]