Ejemplo n.º 1
0
def conllup_to_starsem(fname, sherlock_train="data/sherlock/cdt.conllup", semcue=False):
    """Print the sentences of a column file in a *SEM-style negation layout.

    The scope-label vocabulary is rebuilt from ``sherlock_train`` on every
    call; its ``w2i`` mapping supplies the integer codes that the sentence
    matrices are compared against ("cue", "event", "scope", "mwc").

    Args:
        fname: Path of the column file to convert.
        sherlock_train: Path of the file the label vocabulary is built from.
        semcue: When true, cue columns are taken from the "sem" matrix
            (columns containing an "mwc" code are skipped) instead of the
            "scope" matrix.

    Output is written to stdout: one tab-separated line per token, with a
    (cue, scope, event) column triple per cue, or a single ``***`` column
    when the sentence has no cue. Each sentence is followed by an empty line.
    """
    sentences = cd.read_col_data(fname)
    training = cd.read_col_data(sherlock_train)
    w2i = vcb.Vocabs(*vcb.make_vocabs(training, 0)).scoperels.w2i

    for sentence in sentences:
        story = "_"
        sid = sentence.id.split()[0]
        matrix = sentence.make_matrix("scope", label=True, w2i=w2i)

        # A cue is a column whose row-0 cell carries the "cue" code.
        if semcue:
            cmatrix = sentence.make_matrix("sem", label=True, w2i=w2i)
            cues = [
                col for col in range(len(cmatrix))
                if cmatrix[0, col] == w2i["cue"]
                and w2i["mwc"] not in cmatrix[:, col]
            ]
        else:
            cues = [
                col for col in range(len(matrix))
                if matrix[0, col] == w2i["cue"]
            ]

        if not cues:
            # Sentence without negation: a single "***" marker column.
            for word in sentence:
                print("\t".join([story, sid, str(word.id - 1), word.form,
                                 word.lemma, word.xpos, "_", "***"]))
        else:
            for word in sentence:
                # Three columns per cue, in order: cue, scope, event.
                negs = ["_"] * (3 * len(cues))
                for slot, cue in enumerate(cues):
                    base = 3 * slot
                    myev = word.form
                    if cue == word.id:
                        is_incue, mycue, myev = check_cue(word.form)
                        if not is_incue:
                            mycue = word.form
                        negs[base] = mycue

                    rel = matrix[cue, word.id]
                    if rel == w2i["event"]:
                        # An event token fills both the scope and event columns.
                        negs[base + 1] = myev
                        negs[base + 2] = myev
                    elif rel == w2i["scope"]:
                        negs[base + 1] = myev
                    elif rel == w2i["mwc"]:
                        # Further part of a multi-word cue.
                        negs[base] = myev
                    elif semcue and cmatrix[cue, word.id] == w2i["mwc"]:
                        negs[base] = myev
                print("\t".join([story, sid, str(word.id - 1), word.form,
                                 word.lemma, word.xpos, "_", *negs]))

        print()
Ejemplo n.º 2
0
def conllup_to_epe(fname):
    """Print every sentence of a column file as one JSON object per line.

    Each object has the sentence ``id`` and a list of ``nodes``; a node holds
    the token's form, character span, POS/lemma properties, outgoing
    dependency ``edges`` and its ``negation`` annotations.

    Args:
        fname: Path of the column file to convert.

    Notes:
        * ``start``/``end`` offsets advance by ``len(form) + 1`` per token and
          are not reset between sentences.
        * The sentence id must split into at least two whitespace-separated
          parts, otherwise unpacking raises ``ValueError``.
    """
    import col_data as cd

    offset = 0
    for sentence in cd.read_col_data(fname):
        sid, _story = sentence.id.split(maxsplit=1)
        epe = {"id": sid, "nodes": []}
        nodes = epe["nodes"]
        cues = {}       # token id -> running negation index
        next_cue = 0

        # First pass: create the nodes and number the cues.
        for token in sentence:
            span_end = offset + len(token.form)
            nodes.append({
                "id": token.id,
                "form": token.form,
                "start": offset,
                "end": span_end,
                "properties": {
                    "xpos": token.xpos,
                    "upos": token.upos,
                    "lemma": token.lemma,
                },
                "edges": [],
                "negation": [],
            })
            offset = span_end + 1

            labels = [label for _, label in token.scope]
            if "cue" not in labels:
                continue
            if "mwc" not in labels:
                if token.id not in cues:
                    cues[token.id] = next_cue
                    next_cue += 1
            else:
                # Multi-word cue: share the index of the token it points to,
                # allocating that index first if it does not exist yet.
                anchor = [h for h, label in token.scope if label == "mwc"][0]
                if anchor not in cues:
                    cues[anchor] = next_cue
                    next_cue += 1
                cues[token.id] = cues[anchor]

        # Second pass: attach edges and negation annotations.
        for token in sentence:
            if token.head == 0:
                nodes[token.id - 1]["top"] = True
            elif token.head > 0:
                nodes[token.head - 1]["edges"].append(
                    {"label": token.deprel, "target": token.id})
            for head, label in token.deps:
                nodes[head - 1]["edges"].append(
                    {"label": label, "target": token.id})

            for head, label in token.scope:
                if head == token.id:
                    # Token points at itself: split the form via check_cue.
                    is_incue, mycue, myev = check_cue(token.form)
                    if not is_incue:
                        mycue = token.form
                        myev = token.form
                    if label in ("scope", "event"):
                        negation = nodes[token.id - 1]["negation"]
                        entry = {"id": cues[token.id], "cue": mycue,
                                 "scope": myev}
                        if label == "event":
                            entry["event"] = myev
                        negation.append(entry)
                elif label == "cue":
                    cue_id = cues[token.id]
                    negation = nodes[token.id - 1]["negation"]
                    # Do not record the same negation index twice on a token.
                    if cue_id not in [item["id"] for item in negation]:
                        negation.append({"id": cues[token.id],
                                         "cue": token.form})
                elif label in ("scope", "event"):
                    negation = nodes[token.id - 1]["negation"]
                    entry = {"id": cues[head], "scope": token.form}
                    if label == "event":
                        entry["event"] = token.form
                    negation.append(entry)
        print(json.dumps(epe))
Ejemplo n.º 3
0
def predict(model, settings, to_predict, elmo, vocabs):
    """Run ``model`` on a file, report F1 and write the predicted parses.

    Args:
        model: Object whose ``predict(to_predict, elmo)`` returns
            ``(entries, predicted, other_predicted)``.
        settings: Run configuration; ``dir``, ``pt``, ``ot``, ``td``,
            ``target_style`` and ``other_target_style`` are read.
        to_predict: Path of the column file to parse.
        elmo: Passed through to ``model.predict``.
        vocabs: Indexable collection of vocabularies.

    Returns:
        Always ``True``.

    The predictions are written to ``settings.dir`` + basename of
    ``to_predict`` + ``".pred"``.
    """
    pred_path = settings.dir + to_predict.split("/")[-1] + ".pred"
    entries, predicted, other_predicted = model.predict(to_predict, elmo)

    primary_pairs = [
        (entry[1][settings.pt].numpy(), predicted[entry[0]].numpy())
        for entry in entries
    ]
    f1, _ = sc.score(*zip(*primary_pairs))
    print("F1 is {:.2%}".format(f1))

    if len(other_predicted) > 0:
        secondary_pairs = [
            (entry[1][settings.ot].numpy(), other_predicted[entry[0]].numpy())
            for entry in entries
        ]
        other_f1, _ = sc.score(*zip(*secondary_pairs))
        print("Other F1 is {:.2%}".format(other_f1))

    with open(pred_path, "w") as fh:
        for sentence in cd.read_col_data(to_predict):
            primary = predicted[sentence.id].numpy()
            if settings.target_style == "scope-":
                # Merge the sentence's cue matrix into the prediction.
                cue_vocab = vocabs[settings.td["cue"]]
                cue_matrix = sentence.make_matrix("cues", True, cue_vocab.w2i)
                primary = np.maximum(primary, cue_matrix)
            sentence.update_parse(primary, settings.target_style,
                                  vocabs[settings.pt].i2w)

            if len(other_predicted) > 0:
                secondary = other_predicted[sentence.id].numpy()
                # When both targets share a style, the secondary parse is
                # stored under "syn" instead.
                if settings.target_style == settings.other_target_style:
                    style = "syn"
                else:
                    style = settings.other_target_style
                # NOTE(review): the primary vocabulary (settings.pt) is used
                # for the secondary parse as well -- confirm this is intended.
                sentence.update_parse(secondary, style,
                                      vocabs[settings.pt].i2w)
            print(sentence, file=fh)
    return True
Ejemplo n.º 4
0
    def _load_data(self, data_path, pos_style, target_style,
                   other_target_style, elmo):
        """Read ``data_path`` and fill ``self.index_entries``.

        One ``IndexEntry`` is created per sentence. When ``self.use_elmo`` is
        set, the HDF5 file ``elmo`` is opened read-only and the dataset stored
        under the sentence id is handed to the entry together with
        ``self.vec_dim``; otherwise ``None`` is passed instead.

        Args:
            data_path: Path of the column file to load.
            pos_style: Not used in this method.
            target_style: Not used in this method.
            other_target_style: Not used in this method.
            elmo: Path of the HDF5 file; only opened when ``self.use_elmo``.
        """
        print("Loading data from {}".format(data_path))
        data = cd.read_col_data(data_path)

        felmo = h5py.File(elmo, "r") if self.use_elmo else None

        self.index_entries = []
        try:
            for sentence in data:
                if felmo is not None:
                    entry = IndexEntry(sentence, self.vocabs, self.external,
                                       self.settings, felmo[sentence.id],
                                       self.vec_dim)
                else:
                    entry = IndexEntry(sentence, self.vocabs, self.external,
                                       self.settings, None)
                self.index_entries.append(entry)
        finally:
            # Close the HDF5 handle even if building an entry fails, so the
            # file is not left open.
            if felmo is not None:
                felmo.close()
        print("Done")
Ejemplo n.º 5
0
def coldata_to_starsem(fn_in: str, fn_out: str) -> None:
    """Convert a column file to a *SEM-style negation file.

    For every sentence the negations are derived with
    ``negations_from_matrix(negation_matrix(sentence), sentence)``. Each
    token becomes one tab-separated line; a sentence without negations gets a
    single ``***`` column, otherwise every negation contributes three columns
    (cue, scope, event). The event column is always ``_`` here.

    Args:
        fn_in: Path of the column file to read.
        fn_out: Path of the file to write (overwritten).
    """
    with open(fn_out, "w") as fh_out:
        for sentence in read_col_data(fn_in):
            negs = negations_from_matrix(negation_matrix(sentence), sentence)
            pieces = []
            for token in sentence:
                pieces.append(
                    f"_\t{sentence.id}\t{token.id-1}\t{token.form}\t_\t_\t_\t")
                if not negs:
                    pieces.append("***\n")
                    continue

                columns = []
                for neg in negs.values():
                    is_cue = token.id in neg["Cue"]

                    # --- cue column ---
                    if not is_cue:
                        columns.append("_")
                    elif token.form in no_affix:
                        columns.append(token.form)
                    else:
                        cue_form, _span = affixer(token.form)
                        columns.append(cue_form)

                    # --- scope column ---
                    in_scope = token.id in neg["Scope"]
                    if is_cue:
                        # For a cue token the scope column holds what is left
                        # of the word once a prefix/suffix cue is removed.
                        _affix, (start, end) = affixer(token.form)
                        word_len = len(token.form)
                        if token.form in no_affix:
                            scope_form = "_"
                        elif start == 0 and end != word_len:  # prefix
                            scope_form = token.form[end:]
                        elif start != 0 and end == word_len:  # suffix
                            scope_form = token.form[:start]
                        else:
                            scope_form = "_"
                    elif in_scope:
                        scope_form = token.form
                    else:
                        scope_form = "_"
                    columns.append(scope_form)

                    # --- event column (never filled) ---
                    columns.append("_")

                pieces.append("\t".join(columns))
                pieces.append("\n")

            # print() adds the newline that separates sentences.
            print("".join(pieces), file=fh_out)
Ejemplo n.º 6
0
        for i in range(n):
            for j in range(n):
                C[int(gl[i, j]), int(pl[i, j])] += 1
    print(C)

    for i in range(len(C)):
        print(i2w[i])
        fscore(i, C)
        #for j in range(len(C)):
        #    print("\t", i2w[j], C[i,j])


if __name__ == "__main__":
    # Script entry point: build scope-label matrices for a gold and a
    # predicted column file and hand them to confuse().
    # Arguments: sys.argv[1] = training file (only read when no cached
    # vocabulary exists), sys.argv[2] = gold file, sys.argv[3] = predicted file.
    import col_data as cd
    import vocab as vcb
    import sys
    # Prefer the pickled vocabularies in the working directory; fall back to
    # rebuilding them from the training file when "vocabs.pk" is missing.
    # NOTE(review): pickle.load can execute arbitrary code -- only use a
    # vocabs.pk from a trusted source.
    try:
        with open("vocabs.pk", "rb") as fh:
            vocabs = pickle.load(fh)
    except FileNotFoundError:
        train = cd.read_col_data(sys.argv[1])
        _vocabs = vcb.make_vocabs(train, 0)
        vocabs = vcb.Vocabs(*_vocabs)

    gold = cd.read_col_data(sys.argv[2])
    pred = cd.read_col_data(sys.argv[3])
    # One labelled "scope" matrix per sentence, encoded with the scope
    # relation vocabulary.
    gms = [g.make_matrix("scope", True, vocabs.scoperels.w2i) for g in gold]
    pms = [p.make_matrix("scope", True, vocabs.scoperels.w2i) for p in pred]
    confuse(gms, pms, vocabs.scoperels.i2w)
Ejemplo n.º 7
0
def run_parser(args):
    """Set up, optionally train, and run a parser as configured by ``args``.

    Steps, in order:
      1. Seed torch (a random seed is drawn when ``args.seed == -1``) and
         pick the device; both are stored back on ``args``.
      2. Map the target/helper style names to integer ids
         (``args.td``, ``args.pt``, ``args.ot``, ``args.helpers``).
      3. Either load a saved model (``args.load``) with the vocabularies
         pickled in ``args.dir``, or build/load vocabularies for a new model
         and pickle them to ``args.dir``.
      4. Optionally copy layers from another run (``args.recycle``) and
         freeze parameters (``args.freeze``).
      5. Train when no model was loaded, or when ``args.cont`` is set, then
         reload ``best_model.save`` and predict on ``args.val``.
      6. Always predict on ``args.predict_file``.

    Args:
        args: Namespace-like settings object; it is mutated in place
            (``seed``, ``device``, ``td``, ``ot``, ``pt``, ``helpers``,
            ``dir``).
    """
    # For now, assume there always is train, val, and glove data
    if args.seed == -1:
        args.seed = np.random.randint(1234567890)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    device = torch.device("cuda:0" if torch.cuda.is_available()
                          and not args.force_cpu else "cpu")
    print(device)
    args.device = device
    # Query the capability only for an actual CUDA device: with force_cpu on
    # a CUDA machine the device is "cpu", which torch.cuda does not accept.
    if device.type == "cuda":
        print(torch.cuda.get_device_capability(device))

    args.td = {None: 0, "syn": 1, "sem": 2, "cue": 3, "scope": 4, "scope-": 5}
    args.ot = args.td[args.other_target_style]
    args.pt = args.td[args.target_style]

    args.helpers = None
    if args.help_style:
        args.helpers = [args.td[x] for x in args.help_style.split(",")]

    # Paths below are built by plain concatenation, so the directory must end
    # with a slash.
    if not args.dir.endswith("/"):
        args.dir += "/"

    # NOTE: pickle.load can execute arbitrary code; only point this at vocab
    # files produced by a trusted run.
    if args.load:
        with open(args.dir + "vocabs.pk", "rb") as fh:
            vocabs = pickle.load(fh)
        model = ModelInteractor.factory(args, vocabs)
        model.load(args.load)
    else:
        sentences = cd.read_col_data(args.train)
        if args.vocab is not None:
            with open(args.vocab, "rb") as fh:
                vocabs = pickle.load(fh)
        else:
            _vocabs = make_vocabs(sentences, 0)
            vocabs = Vocabs(*_vocabs)
        # Store the vocabularies next to the model so a later load finds them.
        with open(args.dir + "vocabs.pk", "wb") as fh:
            pickle.dump(vocabs, fh)
        model = ModelInteractor.factory(args, vocabs)

    if args.recycle is not None:
        # Rebuild the other run's model from its own vocabularies and
        # settings, then copy the requested layers into the current model.
        with open(args.recycle + "vocabs.pk", "rb") as fh:
            other_vocabs = pickle.load(fh)
        with open(args.recycle + "settings.json") as fh:
            other_settings = json.load(fh)
        other_settings = Namespace(**other_settings)
        other_settings.device = args.device
        other = ModelInteractor.factory(other_settings, other_vocabs)
        other.load(args.recycle + "best_model.save")
        model.upd_from_other(other, *args.recycle_layers.split(","))

    if args.freeze is not None:
        model.freeze_params(*args.freeze.split(","))

    if (args.load and args.cont) or args.load is None:
        model.train()

        # load the best_model.save instead of using the current one
        model = ModelInteractor.factory(args, vocabs)
        model.load(args.dir + "best_model.save")

        predict(model, args, args.val, args.elmo_dev, vocabs)
    predict(model, args, args.predict_file, args.elmo_test, vocabs)
Ejemplo n.º 8
0
        return len(self.rels)

    def __getitem__(self, index):
        """Return ``self.rels[index]``."""
        return self.rels[index]

    def __setitem__(self, index, value):
        """Store ``value`` at ``self.rels[index]``."""
        self.rels[index] = value


if __name__ == "__main__":
    # Script entry point: build vocabularies from one or more column files
    # and pickle them.
    # Arguments: sys.argv[1] = output pickle path, sys.argv[2:] = input files.
    import sys
    import col_data as cd
    import pickle
    # Pool the sentences of all input files before building the vocabularies.
    sentences = []
    for fn in sys.argv[2:]:
        sentences.extend(cd.read_col_data(fn))
    #sentences = cd.read_col_data(sys.argv[1])
    forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels = make_vocabs(
        sentences)
    # Report the size of each vocabulary.
    print([
        len(v.w2i) for v in [
            forms, norms, lemmas, uposs, xposs, synrels, semrels, chars,
            scoperels
        ]
    ])
    vocabs = Vocabs(forms, norms, lemmas, uposs, xposs, synrels, semrels,
                    chars, scoperels)
    #print(synrels.w2i, semrels.w2i, scoperels.w2i)
    with open(sys.argv[1], "wb") as fh:
        pickle.dump(vocabs, fh)
Ejemplo n.º 9
0
    for scope in scopes.values():
        scope = sorted(scope)
        #print(scope)
        for i, j in enumerate(scope):
            if i < len(scope) - 1:
                next_n = scope[i + 1]
                dist = next_n - j
                #print(dist)
                if dist > 1:
                    return True
    return False


if __name__ == "__main__":

    gold = dict([(l.id, l) for l in cd.read_col_data(
        "../data/neg_graphs/point_to_root/test.conllu")])
    pred = dict([(l.id, l) for l in cd.read_col_data(
        "../experiments/point_to_root/2/test.conllu.pred")])

    gold_neg = get_only_negated(gold)
    pred_neg = get_only_negated(pred)

    # which cues in gold but not predicted?
    print("Which cues in gold but not predicted?")
    gmissed, gsids = which_gold_cues_are_missed(gold_neg, pred)
    for cue, count in gmissed.most_common():
        print("-- {}:{}".format(cue, count))

    # --subquestion: does the model EVER predict these cues?
    never_predicted = [cue for cue in gmissed]
    for sent_id, sent in pred_neg.items():
Ejemplo n.º 10
0
    # Find which experiments have been run
    #experiment_names = set(name_map.keys())
    #experiments_run = set(os.listdir(args.preddir))
    #to_check = experiment_names.intersection(experiments_run)

    for setup in args.experiments:
        metric = []
        metric.append("")
        metric.append(name_map[setup])

        #goldfile = os.path.join(args.golddir, setup, "test.conllu")
        #predfile = os.path.join(args.preddir, setup, "test.conllu.pred")
        goldfile = os.path.join(args.golddir, setup, "dev.conllu")
        predfile = os.path.join(args.preddir, setup, "dev.conllu.pred")

        gold = list(cd.read_col_data(goldfile))
        pred = list(cd.read_col_data(predfile))
        for label in ["holder", "targ", "exp"]:
            prec, rec, f1 = span_f1(gold, pred, mapping, test_label=label)
            metric.append(f1 * 100)
            #print("{0}: {1:.1f}".format(label, f1 * 100))

        lgold = read_labeled(goldfile)
        lpred = read_labeled(predfile)

        ugold = read_unlabeled(goldfile)
        upred = read_unlabeled(predfile)

        #print("Targeted F1")
        f1 = targeted_f1(lgold, lpred)
        metric.append(f1 * 100)