Example #1
    def read(self, file_path: str):

        instances = []

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            if not self._sentence_level:
                # Document level: concatenate the full text of every evidence page.
                pages = set(ev[0] for ev in instance["evidence"])
                # get_doc_text may return None for missing pages; treat as empty.
                premise = " ".join(self.db.get_doc_text(p) or "" for p in pages)
            else:
                # Sentence level: keep only the annotated evidence lines.
                lines = set(
                    self.get_doc_line(d[0], d[1]) for d in instance['evidence'])
                premise = " ".join(lines)

            # Normalise a whitespace-only premise to the empty string.
            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]
            instances.append(self.text_to_instance(premise, hypothesis, label))
        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)
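Both branches above call a get_doc_line helper that the snippet does not show. A minimal sketch of what it plausibly does, inferred from the tab-separated line format that Examples #4 and #9 split on; the body and the negative-index handling are assumptions, not the repository's code:

    def get_doc_line(self, doc, line):
        # Sketch only: assumes each page's sentences are stored as
        # newline-separated "<index>\t<sentence>\t..." records, the same
        # format Examples #4 and #9 split on.
        lines = self.db.get_doc_lines(doc)
        if line > -1:
            return lines.split("\n")[line].split("\t")[1]
        # Mirrors Example #4's ev < 0 branch: no annotated sentence.
        return ""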
Example #2
    def _read(self, file_path: str) -> Iterator[Instance]:

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                # get_doc_text may return None for missing pages; treat as empty.
                premise = " ".join(self.db.get_doc_text(p) or "" for p in pages)
            else:
                lines = set(
                    self.get_doc_line(d[0], d[1]) for d in instance['evidence'])
                premise = " ".join(lines)
                if self._ner_facts:
                    # Keep a separator between the evidence and the appended facts.
                    premise = premise + " " + " ".join(instance['fact'])

            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]
            ner_missing = instance["ner_missing"]
            yield self.text_to_instance(premise, hypothesis, label,
                                        ner_missing)
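Unlike read in Example #1, which materialises every instance and wraps the list in a Dataset, _read follows the generator convention (note the Iterator[Instance] return type) and yields instances one at a time. A minimal, self-contained sketch of that difference, not project code:

from typing import Iterator

def eager(items) -> list:
    # Example #1 style: build the whole list in memory, then return it.
    return [x for x in items if x is not None]

def lazy(items) -> Iterator:
    # Example #2 style: stream instances; callers never hold the full list.
    for x in items:
        if x is not None:
            yield x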
Example #3
    def annotation_on_the_fly(self, file_path, run_name, objUOFADataReader):
        print("do_annotation_on_the_fly == true")

        # DELETE THE annotated file IF IT EXISTS every time before the loop
        # self.delete_if_exists(head_file)
        # self.delete_if_exists(body_file)
        if run_name == "train":
            print("run_name == train")
            head_file = objUOFADataReader.ann_head_tr
            body_file = objUOFADataReader.ann_body_tr
        elif run_name == "dev":
            print("run_name == dev")
            head_file = objUOFADataReader.ann_head_dev
            body_file = objUOFADataReader.ann_body_dev
        else:
            # Fail fast: otherwise head_file/body_file would be unbound below.
            raise ValueError("unexpected run_name: {0}".format(run_name))

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()
        instances = []
        counter = 0

        for instance in tqdm.tqdm(ds.data):
            counter = counter + 1

            if instance is None:
                continue

            if not self._sentence_level:
                pages = set(ev[0] for ev in instance["evidence"])
                premise = " ".join([self.db.get_doc_text(p) for p in pages])
            else:
                lines = set(
                    self.get_doc_line(d[0], d[1]) for d in instance['evidence'])
                premise = " ".join(lines)

            if len(premise.strip()) == 0:
                premise = ""

            hypothesis = instance["claim"]
            label = instance["label_text"]

            premise_ann, hypothesis_ann = self.uofa_annotate(
                hypothesis, premise, counter, objUOFADataReader, head_file,
                body_file)
            instances.append(
                self.text_to_instance(premise_ann, hypothesis_ann, label))
        return instances
Example #4
    def read(self, file_path: str):

        instances = []

        ds = FEVERDataSet(file_path,
                          reader=self.reader,
                          formatter=self.formatter)
        ds.read()

        for instance in tqdm.tqdm(ds.data):
            if instance is None:
                continue

            for page in set([ev[0] for ev in instance['evidence']]):
                claim = instance['claim'].strip()
                paragraph = self.db.get_doc_text(page)
                tokenized_paragraph = self._wiki_tokenizer.tokenize(paragraph)

                evidences = set(ev[1] for ev in instance['evidence']
                                if ev[0] == page)

                lines = self.db.get_doc_lines(page)
                if any(ev < 0 for ev in evidences):
                    # A negative line index means no specific sentence was annotated.
                    span_ends = [0]
                    span_starts = [0]
                    evidence_texts = [""]
                else:
                    evidence_texts = [lines.split("\n")[line].split("\t")[1]
                                      for line in evidences]
                    span_starts = [paragraph.index(evidence_text)
                                   for evidence_text in evidence_texts]
                    # End offset = start offset + length of that evidence sentence.
                    span_ends = [start + len(evidence_text)
                                 for start, evidence_text in zip(span_starts,
                                                                 evidence_texts)]
                inst = self.text_to_instance(claim,
                                             paragraph,
                                             zip(span_starts, span_ends),
                                             evidence_texts,
                                             tokenized_paragraph)
                instances.append(inst)
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances)
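A quick, standalone sanity check of the span arithmetic used above: each span's end offset is its start offset plus the length of that evidence sentence, so slicing the paragraph with the span recovers the sentence exactly.

paragraph = "Berlin is a city. It is the capital of Germany."
evidence_texts = ["It is the capital of Germany."]

span_starts = [paragraph.index(t) for t in evidence_texts]
span_ends = [start + len(t) for start, t in zip(span_starts, evidence_texts)]

assert paragraph[span_starts[0]:span_ends[0]] == evidence_texts[0]
print(list(zip(span_starts, span_ends)))  # [(18, 47)]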
Example #5
    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    jlr = JSONLineReader()

    formatter = FEVERGoldFormatter(None, FEVERLabelSchema(), filtering=args.filtering)

    train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
    dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)

    train_ds.read()
    dev_ds.read()

    test_ds = None
    if args.test is not None:
        test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
        test_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
    f.save_vocab(mname)

    input_shape = train_feats[0].shape[1]
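The snippet stops once the feature width is known; in Example #10 that width is fed straight into the classifier. A plausible continuation along those lines (the hidden size 100 and the 3-way output are copied from Example #10, not from this snippet):

    model = SimpleMLP(input_shape, 100, 3)

    if gpu():
        model.cuda()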
Example #6
        '-o',
        '--overrides',
        type=str,
        default="",
        help='a HOCON structure used to override the experiment configuration')

    args = parser.parse_args()

    logger.info("Load DB")
    db = FeverDocDB(args.db)

    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    logger.info("Read datasets")
    train_ds = DataSet(file="data/fever/train.ns.pages.p{0}.jsonl".format(1),
                       reader=jlr,
                       formatter=formatter)
    dev_ds = DataSet(file="data/fever/dev.ns.pages.p{0}.jsonl".format(1),
                     reader=jlr,
                     formatter=formatter)

    train_ds.read()
    dev_ds.read()

    logger.info("Generate vocab for TF-IDF")
    tf = XTermFrequencyFeatureFunction(db)
    tf.inform(train_ds.data, dev_ds.data)

    logger.info("Eval")
    eval_model(db, args)
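The inform call lets the TF-IDF feature function build its term vocabulary over every split up front, so train and dev share one term-index mapping. As a rough, self-contained analogy using scikit-learn's TfidfVectorizer rather than the project's XTermFrequencyFeatureFunction:

from sklearn.feature_extraction.text import TfidfVectorizer

train_claims = ["Berlin is the capital of Germany.", "The sky is green."]
dev_claims = ["Paris is in France."]

# Fit one vocabulary on all the text the model will ever featurise,
# which is the role inform() plays above.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_claims + dev_claims)
print(len(vectorizer.vocabulary_))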
Example #7
                                             0: 0,
                                             1: 1,
                                             2: 2,
                                             3: 0
                                         })
    df = DavidsonFormatter(DavidsonToZLabelSchema(),
                           preprocessing=pp,
                           mapping={
                               0: 0,
                               1: 1,
                               2: 2
                           })

    datasets_tr = [
        DataSet(file=sexism_file_tr,
                reader=jlr,
                formatter=formatter,
                name=None),
        DataSet(file=racism_file_tr,
                reader=jlr,
                formatter=formatter,
                name=None),
        DataSet(file=neither_file_tr,
                reader=jlr,
                formatter=formatter,
                name=None),
        DataSet(file=waseem_hovy_tr,
                reader=jlr,
                formatter=formatter2,
                name=None)
    ]
Example #8
            "claim": line["Headline"],
            "evidence": line["Body ID"],
            "label": annotation
        }


if __name__ == "__main__":
    bodies = Bodies("data/fnc-1/train_bodies.csv",
                    "data/fnc-1/competition_test_bodies.csv")

    f = Features([FNCTermFrequencyFeatureFunction(bodies)])
    csvr = CSVReader()
    formatter = FNCFormatter(FNCLabelSchema())

    train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                       reader=csvr,
                       formatter=formatter)
    test_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                      reader=csvr,
                      formatter=formatter)

    train_ds.read()
    test_ds.read()

    train_feats, _, test_feats = f.load(train_ds, None, test_ds)

    input_shape = train_feats[0].shape[1]
    model = SimpleMLP(input_shape, 100, 4)

    if gpu():
        model.cuda()
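None of the examples show the training loop between building SimpleMLP and evaluating it. A generic, self-contained PyTorch loop over stand-in data, purely illustrative and not the repository's trainer:

import torch
import torch.nn as nn

# Stand-in for SimpleMLP(input_shape, 100, 4): one hidden layer, 4-way output.
model = nn.Sequential(nn.Linear(50, 100), nn.ReLU(), nn.Linear(100, 4))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

features = torch.randn(32, 50)        # stand-in for train_feats
labels = torch.randint(0, 4, (32,))   # stand-in for FNC stance labels

for epoch in range(5):
    optimizer.zero_grad()
    loss = loss_fn(model(features), labels)
    loss.backward()
    optimizer.step()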
Example #9
def wmd_sim(claim, lines):
    # Despite the name, this scores spaCy's vector similarity (cosine over
    # averaged word vectors), not word mover's distance; `nlp` is the spaCy
    # pipeline loaded elsewhere in this module.
    cl = nlp(claim)
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores
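A usage sketch for wmd_sim; it assumes a spaCy model with real word vectors (e.g. en_core_web_md) is installed, since .similarity over the small models' vectors is much less meaningful:

import spacy

nlp = spacy.load("en_core_web_md")

claim = "Berlin is the capital of Germany."
lines = ["Berlin is Germany's capital city.", "Bananas are yellow."]
print(wmd_sim(claim, lines))  # the related first line should score higher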


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())

jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl",
                 reader=jlr,
                 formatter=formatter)

dev_ds.read()


def doc_lines(db, doc):
    lines = db.get_doc_lines(doc)
    return [
        line.split("\t")[1] if len(line.split("\t")) > 1 else ""
        for line in lines.split("\n")
    ]


#thresh = 0.8
Example #10
    logger.info("Model name is {0}".format(mname))

    ffns = []

    if args.sentence:
        logger.info("Model is Sentence level")
        ffns.append(SentenceLevelTermFrequencyFeatureFunction(db,
                                                              naming=mname))
    else:
        logger.info("Model is Document level")
        ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

    f = Features(mname, ffns)
    f.load_vocab(mname)

    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(None, FEVERLabelSchema())

    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()
    feats = f.lookup(test_ds)

    input_shape = feats[0].shape[1]
    model = SimpleMLP(input_shape, 100, 3)

    if gpu():
        model.cuda()

    model.load_state_dict(torch.load("models/{0}.model".format(mname)))
    print_evaluation(model, feats, FEVERLabelSchema(), args.log)
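One portability caveat on the torch.load line above: tensors are restored to the device they were saved from, so a checkpoint written on a GPU machine will not load on a CPU-only one without a map_location. A defensive variant of the same call:

    state = torch.load("models/{0}.model".format(mname),
                       map_location=None if gpu() else "cpu")
    model.load_state_dict(state)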
Example #11
    waseem_hovy_de = os.path.join("data", "amateur_expert.dv.json")

    sexism_file_te = os.path.join("data", "waseem_s.te.json")
    racism_file_te = os.path.join("data", "waseem_r.te.json")
    neither_file_te = os.path.join("data", "waseem_n.te.json")
    waseem_hovy_te = os.path.join("data", "amateur_expert.te.json")

    csvreader = CSVReader(encoding="ISO-8859-1")
    jlr = JSONLineReader()
    formatter = TextAnnotationFormatter(WaseemLabelSchema(), preprocessing=pp)
    formatter2 = TextAnnotationFormatter(WaseemHovyLabelSchema(),
                                         preprocessing=pp,
                                         mapping={0: 0, 1: 1, 2: 2, 3: 0})
    df = DavidsonFormatter(DavidsonToZLabelSchema(),
                           preprocessing=pp,
                           mapping={0: 0, 1: 1, 2: 2})

    datasets_tr = [
        DataSet(file=sexism_file_tr, name=None, reader=jlr, formatter=formatter),
        DataSet(file=racism_file_tr, name=None, reader=jlr, formatter=formatter),
        DataSet(file=neither_file_tr, name=None, reader=jlr, formatter=formatter),
        DataSet(file=waseem_hovy_tr, name=None, reader=jlr, formatter=formatter2)
    ]

    datasets_de = [
        DataSet(file=sexism_file_de, name=None, reader=jlr, formatter=formatter),
        DataSet(file=racism_file_de, name=None, reader=jlr, formatter=formatter),
        DataSet(file=neither_file_de, name=None, reader=jlr, formatter=formatter),
        DataSet(file=waseem_hovy_de, name=None, reader=jlr, formatter=formatter2)
    ]

    datasets_te = [
        DataSet(file=sexism_file_te,  name=None, reader=jlr, formatter=formatter),
        DataSet(file=racism_file_te,  name=None, reader=jlr, formatter=formatter),
Example #12
    maxdoc = sys.argv[1]
    ns_docsize = sys.argv[2]

    db = FeverDocDB("data/fever/fever.db")
    idx = set(db.get_doc_ids())

    mname = "2way-p{0}-{1}".format(maxdoc, ns_docsize)

    f = Features([SentenceTermFrequencyFeatureFunction(db, naming=mname)])
    jlr = JSONLineReader()

    formatter = FEVERSentenceFormatter(idx, db, RelatedLabelSchema())

    train_ds = DataSet(
        file="data/fever/train.ns.pages.p{0}.jsonl".format(ns_docsize),
        reader=jlr,
        formatter=formatter)
    dev_ds = DataSet(file="data/fever/dev.pages.p{0}.jsonl".format(maxdoc),
                     reader=jlr,
                     formatter=formatter)
    test_ds = DataSet(file="data/fever/test.pages.p{0}.jsonl".format(maxdoc),
                      reader=jlr,
                      formatter=formatter)

    train_ds.read()
    dev_ds.read()
    test_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
    input_shape = train_feats[0].shape[1]
    if not os.path.exists("models"):
        os.mkdir("models")
    return os.path.exists(os.path.join("models", "{0}.model".format(mname)))


if __name__ == "__main__":

    SimpleRandom.set_seeds()
    mname = "expt7" + ("emb" if is_embedding_model() else
                       "") + ("large" if is_large_model() else "")

    csvreader = CSVReader(encoding="ISO-8859-1")
    df = DavidsonFormatter(DavidsonToZLabelSchema(), preprocessing=pp)

    davidson_tr_dataset = DataSet(os.path.join("data", "davidson.tr.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_train")
    davidson_dv_dataset = DataSet(os.path.join("data", "davidson.dv.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_dev")
    davidson_te_dataset = DataSet(os.path.join("data", "davidson.te.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_test")

    davidson_tr_dataset.read()
    davidson_dv_dataset.read()
    davidson_te_dataset.read()

    features = Features(get_feature_functions(mname))
    db = FeverDocDB("data/fever/drqa.db")
    idx = set(db.get_doc_ids())

    fnc_bodies = Bodies("data/fnc-1/train_bodies.csv",
                        "data/fnc-1/competition_test_bodies.csv")
    fever_bodies = db

    f = Features(
        [FeverOrFNCTermFrequencyFeatureFunction(fever_bodies, fnc_bodies)])
    csvr = CSVReader()
    jlr = JSONLineReader()
    fnc_formatter = FNCFormatter2(FNCSimpleLabelSchema())
    fever_formatter = FEVERPredictionsFormatter(idx, FEVERLabelSchema())

    train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                       reader=csvr,
                       formatter=fnc_formatter)
    dev_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                     reader=csvr,
                     formatter=fnc_formatter)
    test_ds = DataSet(file="data/fever/fever.dev.pages.p5.jsonl",
                      reader=jlr,
                      formatter=fever_formatter)

    train_ds.read()
    test_ds.read()
    dev_ds.read()

    train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)

    input_shape = train_feats[0].shape[1]