Example no. 1
0
def main():
    """Evaluate a trained document-classification model on a test split.

    Loads the archived model from ``--save``, picks the dataset reader
    matching the ``--config`` name, predicts a rating for every test
    instance, and writes accuracy / macro-F1 plus one row per document
    to ``<save>/results.txt``.
    """
    parser = ArgumentParser()
    parser.add_argument('--lang', action='store')
    parser.add_argument('--config', action='store', default='configs/HAN.jsonnet')
    parser.add_argument('--save', action='store', default='experiments/models/HAN')
    parser.add_argument('--dataset', default="conllu.tar.gz")
    args = parser.parse_args()

    # Register the project's custom dataset readers and models with AllenNLP.
    import_submodules("loader")
    import_submodules("models")

    test_path = "data/{0}/{1}/test".format(args.lang, args.dataset)

    model = load_archive(os.path.join(args.save, "model.tar.gz")).model
    # Hierarchical configs (HAN / "hier") need the hierarchical reader.
    if "HAN" in args.config or "hier" in args.config:
        reader = NorecReaderHierarchical()
    else:
        reader = NorecReader_Flat()

    predictor = Predictor(model, reader)

    predictions = []
    gold_labels = []
    # Collect per-document rows and join once at the end instead of
    # quadratic `+=` string concatenation inside the loop.
    rows = ["doc_id\tgold\tpred\tnum_sents\tnum_tokens"]

    for instance in reader.read(test_path):
        metadata = instance.fields['meta'].metadata

        try:
            pred = predictor.predict_instance(instance)['prediction']
        except Exception:
            # Best-effort: fall back to a default class rather than
            # aborting the whole evaluation on one failing instance.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pred = 1
        predictions.append(pred)

        gold_label = instance["rating"].label
        gold_labels.append(gold_label)

        rows.append("{}\t{}\t{}\t{}\t{}".format(metadata["doc_id"],
                                                gold_label,
                                                pred,
                                                metadata["sentences"],
                                                metadata["tokens"]))

    acc = accuracy_score(gold_labels, predictions)
    f1 = f1_score(gold_labels, predictions, average="macro")

    summary = "Acc score: {0:.3f}\nF1 score: {1:.3f}\n".format(acc, f1)
    print(summary)

    with open(os.path.join(args.save, "results.txt"), "w") as outfile:
        outfile.write(summary + "\n" + "\n".join(rows) + "\n")
Example no. 2
0
    def _read(self,
              file_path: str,
              annotator: Predictor = None) -> Iterator[Instance]:
        """
        take a file name of an amazon dataset .tsv file
        review is on 13 column and the label is in
        and process the file and stream Instances
        """

        with open(file_path) as f:

            for l in f:
                l = l.strip()
                l = l.split("\t")
                try:
                    if len(
                            l
                    ) != 2:  # check only two columns per line to avoid reading malformed lines
                        continue

                    # remove empty reviews in amazon dataset less than two letters
                    if len(l[0].strip()) < 2:
                        continue

                    if l[1] not in "12345":
                        continue
                except:
                    continue

                # get the sentence text
                sentence = self.tokenize(l[0].strip().lower())

                # Binarize the output usual score is from 1->5
                # Make 1,2,3 negative  -- 4,5 positive
                # (this is to accommodate the unbalance between positive and neg)
                if annotator is None:
                    # get the label review score
                    label = l[1]

                    if self.binary_output:
                        if label in "45":
                            label = "1"
                        else:
                            label = "0"
                else:

                    label = annotator.predict_instance(
                        self.tokens_to_instance(sentence))
                    label = str(label["class"])

                try:
                    assert len(sentence) > 2
                    assert label in ["1", "0"]
                except AssertionError as e:
                    continue

                yield self.tokens_to_instance(sentence, label)
Example no. 3
0
''' The language model was trained with GloVe; here we only build an embedder
of matching shape so the trained parameters can be loaded into it. '''
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})
''' Define an encoder wrapping an LSTM feature extractor. '''
# NOTE(review): `ed_ncoder_size` looks like a typo for `encoder_size`, but it
# must match whatever attribute is actually defined on `combination` — confirm
# against the class that builds these hyperparameter combinations.
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))
model = LanguageModel(vocab=vocabulary,
                      text_field_embedder=token_embedder,
                      contextualizer=contextualizer,
                      dropout=combination.dropout,
                      regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
                      ) \
    .cuda(device)
# FIX: torch.load accepts a file path directly; the original passed an open
# file object that was never closed (file-handle leak).
model.load_state_dict(torch.load(language_model_path), strict=True)
dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model,
                                     dataset_reader=dataset_reader)
val_data_path = os.path.join('.', 'data_seg_val_toytoy')
# Run the predictor over every validation instance.
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]