Example #1
    def read(self, file_path: str):

        instances = []

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            # Document-level evidence: concatenate the full text of every
            # page cited as evidence.
            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                premise = " ".join([self.db.get_doc_text(p) for p in pages])
            # Sentence-level evidence: concatenate only the cited lines.
            else:
                lines = set(
                    self.get_doc_line(d[0], d[1]) for d in instance["evidence"]
                )
                premise = " ".join(lines)

            # Normalise whitespace-only premises to the empty string.
            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]
            instances.append(self.text_to_instance(premise, hypothesis, label))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
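
The sentence-level branch above relies on a get_doc_line(page, line_no) helper that is not shown on this page. A minimal sketch of what it plausibly does, assuming the FEVER database convention that a page's lines are newline-joined, tab-separated "<line_no>\t<sentence>\t..." rows and that a negative line number means no specific evidence line (all assumptions, not the confirmed implementation):

    import random

    def get_doc_line(self, doc, line_no):
        # Raw page lines as stored in the FEVER database (assumed format).
        lines = self.db.get_doc_lines(doc)
        if line_no > -1:
            # The sentence text is the second tab-separated field.
            return lines.split("\n")[line_no].split("\t")[1]
        # Negative line number: no gold evidence; fall back to a random
        # non-empty sentence from the page (assumed behaviour).
        non_empty = [row.split("\t")[1] for row in lines.split("\n")
                     if len(row.split("\t")) > 1 and row.split("\t")[1].strip()]
        return random.choice(non_empty) if non_empty else ""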
Example #2
    def _read(self, file_path: str) -> Iterator[Instance]:

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                premise = " ".join([self.db.get_doc_text(p) for p in pages])
            else:
                lines = set(
                    self.get_doc_line(d[0], d[1]) for d in instance["evidence"]
                )
                premise = " ".join(lines)
                if self._ner_facts:
                    # Join with a separating space so the last evidence line
                    # and the first fact are not fused together.
                    premise = premise + " " + " ".join(instance["fact"])

            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]
            ner_missing = instance["ner_missing"]
            yield self.text_to_instance(premise, hypothesis, label,
                                        ner_missing)
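
Since _read is a generator, it only does work when iterated; a minimal usage sketch, with the reader's class name and constructor arguments assumed rather than taken from the source:

    # Hypothetical construction; the real constructor is not shown here.
    reader = FEVERReader(db, sentence_level=True, ner_facts=False,
                         reader=JSONLineReader(),
                         formatter=FEVERGoldFormatter(None, FEVERLabelSchema()))
    instances = list(reader._read("data/fever-data/dev.jsonl"))
    print("read {0} instances".format(len(instances)))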
Example #3
    def annotation_on_the_fly(self, file_path, run_name, objUOFADataReader):
        print("do_annotation_on_the_fly == true")

        # DELETE THE annotated file IF IT EXISTS every time before the loop
        # self.delete_if_exists(head_file)
        # self.delete_if_exists(body_file)
        if run_name == "train":
            print("run_name == train")
            head_file = objUOFADataReader.ann_head_tr
            body_file = objUOFADataReader.ann_body_tr
        elif run_name == "dev":
            print("run_name == dev")
            head_file = objUOFADataReader.ann_head_dev
            body_file = objUOFADataReader.ann_body_dev
        else:
            # Guard: head_file and body_file would otherwise be unbound below.
            raise ValueError("run_name must be 'train' or 'dev': " + run_name)

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()
        instances = []
        counter = 0

        for instance in tqdm.tqdm(ds.data):
            counter += 1

            if instance is None:
                continue

            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                premise = " ".join([self.db.get_doc_text(p) for p in pages])
            else:
                lines = set([
                    self.get_doc_line(d[0], d[1]) for d in instance['evidence']
                ])
                premise = " ".join(lines)

            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]

            premise_ann, hypothesis_ann = self.uofa_annotate(
                hypothesis, premise, counter, objUOFADataReader, head_file,
                body_file)
            instances.append(
                self.text_to_instance(premise_ann, hypothesis_ann, label))
        return instances
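
The commented-out lines at the top of Example #3 refer to a delete_if_exists cleanup helper; a minimal sketch of such a helper (hypothetical, not shown in the source):

    import os

    def delete_if_exists(self, path):
        # Remove a stale annotation cache so the loop can regenerate it.
        if os.path.exists(path):
            os.remove(path)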
Example #4
    def read(self, file_path: str):

        instances = []

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            for page in set([ev[0] for ev in instance['evidence']]):
                claim = instance['claim'].strip()
                paragraph = self.db.get_doc_text(page)
                tokenized_paragraph = self._wiki_tokenizer.tokenize(paragraph)

                evidences = set(ev[1] for ev in instance["evidence"]
                                if ev[0] == page)

                lines = self.db.get_doc_lines(page)
                # Negative line numbers mark instances with no usable
                # evidence on this page: emit a single empty, zero-length span.
                if any(ev < 0 for ev in evidences):
                    span_starts = [0]
                    span_ends = [0]
                    evidence_texts = [""]
                else:
                    evidence_texts = [lines.split("\n")[line].split("\t")[1]
                                      for line in evidences]
                    # Locate each evidence sentence in the page text; the end
                    # offset is the start plus the length of that sentence.
                    span_starts = [paragraph.index(evidence_text)
                                   for evidence_text in evidence_texts]
                    span_ends = [start + len(evidence_text)
                                 for start, evidence_text in zip(
                                     span_starts, evidence_texts)]
                inst = self.text_to_instance(claim,
                                             paragraph,
                                             zip(span_starts, span_ends),
                                             evidence_texts,
                                             tokenized_paragraph)
                instances.append(inst)
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
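
A quick self-contained check of the span arithmetic used above: each end offset is the start plus the length of that evidence sentence, so slicing the paragraph recovers the sentence exactly.

    paragraph = "Alpha beta gamma. Delta epsilon."
    evidence_texts = ["Delta epsilon."]
    span_starts = [paragraph.index(t) for t in evidence_texts]
    span_ends = [s + len(t) for s, t in zip(span_starts, evidence_texts)]
    assert paragraph[span_starts[0]:span_ends[0]] == "Delta epsilon."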
Example #5
    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    jlr = JSONLineReader()

    formatter = FEVERGoldFormatter(None, FEVERLabelSchema(),
                                   filtering=args.filtering)

    train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
    dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    test_ds = None
    if args.test is not None:
        test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
        test_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
    f.save_vocab(mname)

    input_shape = train_feats[0].shape[1]

    model = SimpleMLP(input_shape, 100, 3)

    if gpu():
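
Example #5 is cut off right after the gpu() check; in the other examples on this page the pattern continues with model.cuda(). A minimal sketch of the helper the check assumes, inferred from its call sites rather than the source:

    import torch

    def gpu():
        # True when a CUDA device is available; callers then move the
        # model onto it with model.cuda().
        return torch.cuda.is_available()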
Example #6
    waseem_de_composite = CompositeDataset(name="waseem_composite_dev")
    for dataset in datasets_de:
        dataset.read()
        waseem_de_composite.add(dataset)

    waseem_te_composite = CompositeDataset(name="waseem_composite_test")
    for dataset in datasets_te:
        dataset.read()
        waseem_te_composite.add(dataset)

    davidson_tr = DataSet(os.path.join("data", "davidson.tr.csv"),
                          reader=csvreader,
                          formatter=df,
                          name="davidson_train")
    davidson_tr.read()

    davidson_dv = DataSet(os.path.join("data", "davidson.dv.csv"),
                          reader=csvreader,
                          formatter=df,
                          name="davidson_dev")
    davidson_dv.read()

    davidson_te = DataSet(os.path.join("data", "davidson.te.csv"),
                          reader=csvreader,
                          formatter=df,
                          name="davidson_test")
    davidson_te.read()

    features = Features(get_feature_functions(mname))
    primary_train_fs, aux_train_fs, dev_fs, test_fs_primary, test_fs_aux = features.load(
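
CompositeDataset itself is not shown; a minimal sketch of the behaviour the loops above rely on, i.e. pooling the .data of several already-read datasets under one name (hypothetical implementation):

    class CompositeDataset:
        def __init__(self, name):
            self.name = name
            self.data = []

        def add(self, dataset):
            # Pool the instances of an already-read dataset.
            self.data.extend(dataset.data)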
Example #7
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())

jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl",
                 reader=jlr,
                 formatter=formatter)

dev_ds.read()


def doc_lines(db, doc):
    # Return the sentence text of each line of a page; rows without a
    # tab-separated text field become empty strings.
    lines = db.get_doc_lines(doc)
    return [
        line.split("\t")[1] if len(line.split("\t")) > 1 else ""
        for line in lines.split("\n")
    ]


#thresh = 0.8

y_true = []
y_scores = []
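
With y_true and y_scores populated (the loop that fills them is not shown here), a threshold-free evaluation would typically go through scikit-learn; a sketch of that final step, assuming y_true holds 0/1 relevance labels and y_scores the similarity scores computed above:

    from sklearn.metrics import average_precision_score, roc_auc_score

    print("AP : {0:.4f}".format(average_precision_score(y_true, y_scores)))
    print("AUC: {0:.4f}".format(roc_auc_score(y_true, y_scores)))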
Example #8
        waseem_tr_composite.add(dataset)

    waseem_de_composite = CompositeDataset(name="waseem_composite_dev")
    for dataset in datasets_de:
        dataset.read()
        waseem_de_composite.add(dataset)

    waseem_te_composite = CompositeDataset(name="waseem_composite_test")
    for dataset in datasets_te:
        dataset.read()
        waseem_te_composite.add(dataset)



    davidson_te = DataSet(os.path.join("data","davidson.te.csv"),reader=csvreader,formatter=df,name="davidson_test")
    davidson_te.read()

    features = Features(get_feature_functions(mname))
    train_fs, dev_fs, test_fs_primary, test_fs_aux = features.load(
        waseem_tr_composite, waseem_de_composite, waseem_te_composite,
        davidson_te)

    print("Number of features in primary: {0}".format(train_fs[0].shape[1]))

    model = MLP(train_fs[0].shape[1], get_model_shape(), 3)

    if gpu():
        model.cuda()

    # Default to "" so an unset TRAIN env var does not crash .lower().
    if model_exists(mname) and os.getenv("TRAIN", "").lower() not in ["y", "1", "t", "yes"]:
        model.load_state_dict(torch.load("models/{0}.model".format(mname)))
    else:
        train(model, train_fs, 50, 1e-3, 45, dev=dev_fs,
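
The model_exists guard used here (and again in Example #9) is not defined on this page; a minimal sketch consistent with the models/<name>.model paths these scripts load from (an assumption):

    import os

    def model_exists(mname):
        # The scripts persist weights to models/<name>.model via torch.save.
        return os.path.exists("models/{0}.model".format(mname))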
Example #9
    logger.info("Model name is {0}".format(mname))

    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db,
                                                              naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    f.load_vocab(mname)

    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(None, FEVERLabelSchema())

    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()
    feats = f.lookup(test_ds)

    input_shape = feats[0].shape[1]
    model = SimpleMLP(input_shape, 100, 3)

    if gpu():
        model.cuda()

    model.load_state_dict(torch.load("models/{0}.model".format(mname)))
    print_evaluation(model, feats, FEVERLabelSchema(), args.log)
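
print_evaluation is likewise external to this snippet; a minimal sketch of the evaluation it presumably wraps, assuming feats unpacks into a feature matrix and gold label ids (the data layout and every name here are assumptions):

    import torch

    def print_evaluation(model, feats, label_schema, log_path=None):
        # Sketch only: assumes feats = (feature_matrix, gold_label_ids).
        model.eval()
        xs, ys = feats[0], feats[1]
        with torch.no_grad():
            preds = model(torch.as_tensor(xs, dtype=torch.float)).argmax(dim=1)
        acc = (preds == torch.as_tensor(ys)).float().mean().item()
        print("accuracy: {0:.4f}".format(acc))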
Example #10
    df = DavidsonFormatter(DavidsonToZLabelSchema(), preprocessing=pp)

    davidson_tr_dataset = DataSet(os.path.join("data", "davidson.tr.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_train")
    davidson_dv_dataset = DataSet(os.path.join("data", "davidson.dv.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_dev")
    davidson_te_dataset = DataSet(os.path.join("data", "davidson.te.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_test")

    davidson_tr_dataset.read()
    davidson_dv_dataset.read()
    davidson_te_dataset.read()

    features = Features(get_feature_functions(mname))
    train_fs, dev_fs, test_fs = features.load(davidson_tr_dataset,
                                              davidson_dv_dataset,
                                              davidson_te_dataset)

    print("Number of features: {0}".format(train_fs[0].shape[1]))
    model = MLP(train_fs[0].shape[1], get_model_shape(), 3)

    if gpu():
        model.cuda()

    if model_exists(mname) and os.getenv("TRAIN", "").lower() not in [