def load(fname):
    """Open the VW model stored at ``fname`` and attach a search task to it.

    Rebinds the module-level globals ``vw`` and ``sequenceLabeler``.

    If opening ``fname`` as an existing model fails, a fresh search model is
    created with ``-f fname``, finished, and then ``fname`` is opened again
    with the same arguments as the first attempt.

    Parameters
    ----------
    fname : str
        Path used both as the initial model (``-i``) and as the output
        model (``-f``).
    """
    global vw, sequenceLabeler
    reload_args = "--quiet -i " + fname + " -f " + fname
    try:
        vw = pyvw.vw(reload_args)
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit are no
        # longer swallowed and mistaken for "model file is missing".
        bootstrap = pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + fname)
        # presumably finish() is what writes the new model to fname - confirm
        bootstrap.finish()
        vw = pyvw.vw(reload_args)
    sequenceLabeler = vw.init_search_task(SequenceLabeler3)
Exemple #2
0
    def get_vw(self):
        """
        Return the vw instance held in ``self.vw_``, building it on first use

        The instance is constructed from ``self.params`` only when
        ``self.vw_`` is still None; later calls return the stored object.

        Returns
        -------
        pyvw.vw instance
        """
        if self.vw_ is not None:
            return self.vw_
        self.vw_ = vw(**self.params)
        return self.vw_
Exemple #3
0
    def get_vw(self):
        """
        Lazily construct the vw instance and hand it back

        ``self.vw_`` is filled from ``self.params`` the first time this is
        called with ``self.vw_`` set to None.

        Returns
        -------
        pyvw.vw instance
        """
        already_built = self.vw_ is not None
        if not already_built:
            self.vw_ = vw(**self.params)
        return self.vw_
    def get_vw(self):
        """Return the stored vw instance, creating it on first use

        When ``self.vw_`` is None a new instance is built from
        ``self.params`` and ``self.label_type_`` is set from it.

        Returns
        -------
        pyvw.vw instance
        """
        if self.vw_ is not None:
            return self.vw_

        self.vw_ = vw(**self.params)
        # remember the label type reported by the freshly built instance
        self.label_type_ = self.vw_.get_label_type()
        return self.vw_
Exemple #5
0
    def get_vw(self):
        """Lazily build the vw instance and return it

        On the first call (``self.vw_`` is None) the instance is created
        from ``self.params`` and its label type is stored in
        ``self.label_type_``.

        Returns
        -------
        pyvw.vw instance
        """
        needs_init = self.vw_ is None
        if needs_init:
            self.vw_ = vw(**self.params)
            # label type comes from the instance that was just created
            self.label_type_ = self.vw_.get_label_type()
        return self.vw_
def main():
    vw = []
    sl = []
    while True:
        inp = raw_input("> ")

        inp = inp.strip()
        words = inp.split()

        cmd = words[0]
        if cmd == "/save":
            for temp in vw:
                temp.finish()
            sys.exit(1)
        if cmd == "/train":
            data = " ".join(words[1:]).strip()
            for i in range(10):
                for temp in sl:
                    temp.learn(preprocess([data]))
        elif cmd == "/query":
            data = " ".join(words[1:]).strip()
            output = set()
            for s in sl:
                output.add(postprocess(query(s, data)))
            for out in output:
                print "\t", out
        elif cmd == "/start":
            data = " ".join(words[1:]).strip()
            if os.path.isfile(data + ".1") and os.path.isfile(data + ".2") and os.path.isfile(
                            data + ".3") and os.path.isfile(data + ".4"):
                vw = [
                    pyvw.vw("--quiet -i " + data + ".1 -f "+data + ".1"),
                    pyvw.vw("--quiet -i " + data + ".2 -f "+data + ".2"),
                    pyvw.vw("--quiet -i " + data + ".3 -f "+data + ".3"),
                    pyvw.vw("--quiet -i " + data + ".4 -f "+data + ".4")
                ]
            else:
                vw = [
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".1"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".2"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".3"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".4")
                ]
            sl = [
                vw[0].init_search_task(SequenceLabeler),
                vw[1].init_search_task(SequenceLabeler2),
                vw[2].init_search_task(SequenceLabeler3),
                vw[3].init_search_task(SequenceLabeler4)
            ]
Exemple #7
0
def mini_vw(inputFile, numPasses, otherArgs):
    vw = pyvw.vw(otherArgs)
    for p in range(numPasses):
        print 'pass', (p+1)
        h = open(inputFile, 'r')
        for l in h.readlines():
            if learnFromStrings:
                vw.learn(l)
            else:
                ex = vw.example(l)
                vw.learn(ex)
                ex.finish()
        h.close()
    vw.finish()
Exemple #8
0
def mini_vw(inputFile, numPasses, otherArgs):
    vw = pyvw.vw(otherArgs)
    for p in range(numPasses):
        print 'pass', (p + 1)
        h = open(inputFile, 'r')
        for l in h.readlines():
            if learnFromStrings:
                vw.learn(l)
            else:
                ex = vw.example(l)
                vw.learn(ex)
                ex.finish()

        h.close()
    vw.finish()
Exemple #9
0
def do_work(train_instances, dev_instances, test_instances, sample_size, samples_per_event,
        gold_probs, iters, l2, log_time, semsims, dfdeltas,
        use_best_feats, use_i_only, use_abs_df, doc_condition, output_dir):
    """Train a Summarizer search task, select the best iterations on dev, and
    write test-set summaries, weights and scores under ``output_dir``.

    For every iteration 1..``iters`` the training instances are shuffled in
    place and learned from, every dev instance is scored with ``predict``,
    and the task's feature weights are snapshotted. The iterations with the
    highest mean dev F1, E[gain] and Comp. and the lowest mean dev Loss are
    then selected; for each of those four weight sets the test instances are
    predicted and every SELECT action becomes one summary row.

    Files written: submission.tsv (no header, no text columns),
    summaries.tsv, weights.f1.tsv, weights.loss.tsv, weights.egain.tsv,
    weights.comp.tsv and scores.tsv.

    ``sample_size``, ``samples_per_event``, ``gold_probs``, ``log_time``,
    ``semsims`` and ``dfdeltas`` are accepted but not used in this body.

    Returns None.

    NOTE(review): ``df_u`` is only bound inside the training loop, so
    ``iters`` < 1 fails with an unbound-variable error right after the loop.
    """
    

    vw = pyvw.vw(
        ("-l .001 --l2 {} --search 2 --search_task hook --ring_size 1024 " + \
         "--search_no_caching --noconstant --quiet").format(l2)) 
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    task._doc_condition = doc_condition
    print "use best?", task.use_best_feats 
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    print "use doc condition?", task._doc_condition



    # all_scores: one dict per (iteration, dev instance);
    # all_weights: two dataframes (SELECT and NEXT) per iteration.
    all_scores = []
    all_weights = []


    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0    

        # shuffles the caller's list in place
        random.shuffle(train_instances)
        print "iter", n_iter
        task.learn(train_instances)
        for i, inst in enumerate(dev_instances):
            egain, comp, f1, loss, _ = predict(task, inst, n_iter)
            print egain, comp, f1, loss
            all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
        # Mean dev score per iteration so far; the index is reset to 0..k-1,
        # so row r holds iteration r + 1.
        df = pd.DataFrame(all_scores)
        df_u = df.groupby("iter").mean().reset_index(drop=True)
        print df_u    

        select_df, next_df = task.get_feature_weights()

        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter

        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)



    # "+ 1" converts the 0-based row of df_u back to the 1-based iteration.
    best_f1_iter = df_u["F1"].argmax() + 1
    best_egain_iter = df_u["E[gain]"].argmax() + 1
    best_comp_iter = df_u["Comp."].argmax() + 1
    best_loss_iter = df_u["Loss"].argmin() + 1

    weights_df = pd.concat(all_weights)


    all_summaries = []
#    all_scores = []

    # Weight snapshots of the four selected iterations.
    F1_weights = weights_df[weights_df["iter"] == best_f1_iter]
    loss_weights = weights_df[weights_df["iter"] == best_loss_iter]
    egain_weights = weights_df[weights_df["iter"] == best_egain_iter]
    comp_weights = weights_df[weights_df["iter"] == best_comp_iter]

    def get_summaries(weights, run):
        """Load ``weights`` into the task, predict every test instance and
        append one row to ``all_summaries`` per SELECT action, tagged with
        the run name ``run``."""
        print "Best", run
        task.set_weights(weights)
        for test_instance in test_instances:
            event = test_instance[0]
            df = test_instance[1]
            print event
            # ask the task to record one score per predicted action
            task._keep_scores = True
            task._scores = []
            predictions = task.predict(test_instance)
            assert len(predictions) == len(task._scores)

            for action, (_, row), ascore in zip(predictions, df.iterrows(), task._scores):
                if action == SELECT:
                  #  assert ascore["SELECT"] <= ascore["NEXT"]
                    # "update id" is split on "-": the first two parts form
                    # the stream id, the third is the sentence id.
                    print "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        event.query_num, "CUNLP", run, 
                        "-".join(row["update id"].split("-")[0:2]), 
                        row["update id"].split("-")[2], 
                        row["timestamp"], ascore)
                    all_summaries.append(
                        {"event": event.query_num, 
                         "team": "CUNLP",
                         "run": run,
                         "stream id": "-".join(row["update id"].split("-")[0:2]),
                         "sentence id": row["update id"].split("-")[2], 
                         "timestamp": row["timestamp"],
                         "confidence": row["probs"],
                         "partial": ascore,
                         "text": row["sent text"],
                         "pretty text": row["pretty text"]
                        })
                #else:
                  #  assert ascore["SELECT"] >= ascore["NEXT"]
#    all_scores = []
#    task.set_weights(F1_weights)
#    for i, inst in enumerate(dev_instances):
#        egain, comp, f1, loss, _ = predict(task, inst, best_f1_iter)
#        print egain, comp, f1, loss
#        all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
#    df = pd.DataFrame(all_scores)
#    df_u = df.groupby("iter").mean().reset_index(drop=True)
#    print df_u    
#
#    all_scores = []
#    task.set_weights(egain_weights)
#    for i, inst in enumerate(dev_instances):
#        egain, comp, f1, loss, _ = predict(task, inst, best_egain_iter)
#        print egain, comp, f1, loss
#        all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
#    df = pd.DataFrame(all_scores)
#    df_u = df.groupby("iter").mean().reset_index(drop=True)
#    print df_u    



    # Each call overwrites the task's weights, so the order only affects the
    # order of rows in all_summaries.
    get_summaries(F1_weights, "L2S.F1")
    get_summaries(loss_weights, "L2S.Loss")
    get_summaries(egain_weights, "L2S.E[gain]")
    get_summaries(comp_weights, "L2S.Comp.")

            
    df = pd.DataFrame(all_summaries, 
        columns=["event", "team", "run", "stream id", "sentence id", 
                 "timestamp", "confidence", "partial", "pretty text", "text"])
    submission_path = os.path.join(output_dir, "submission.tsv")
    summary_path = os.path.join(output_dir, "summaries.tsv")
    f1_weights_path = os.path.join(output_dir, "weights.f1.tsv")
    loss_weights_path = os.path.join(output_dir, "weights.loss.tsv")
    egain_weights_path = os.path.join(output_dir, "weights.egain.tsv")
    comp_weights_path = os.path.join(output_dir, "weights.comp.tsv")

    scores_path = os.path.join(output_dir, "scores.tsv")

    # columns of the submission file (everything except the text columns)
    no_text = ["event", "team", "run", "stream id", "sentence id", 
               "timestamp", "confidence", "partial"]
    
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # clamp negative confidences to 0
    df["confidence"] = df["confidence"].apply(lambda x: max(x, 0))
    with open(submission_path, "w") as f:
        df[no_text].to_csv(f, index=False, header=False, sep="\t")
    with open(summary_path, "w") as f:
        df.to_csv(f, index=False, sep="\t")

    with open(f1_weights_path, "w") as f:
        F1_weights.to_csv(f, index=False, sep="\t")
    with open(loss_weights_path, "w") as f:
        loss_weights.to_csv(f, index=False, sep="\t")
    with open(egain_weights_path, "w") as f:
        egain_weights.to_csv(f, index=False, sep="\t")
    with open(comp_weights_path, "w") as f:
        comp_weights.to_csv(f, index=False, sep="\t")
    with open(scores_path, "w") as f:
        df_u.to_csv(f, sep="\t", index=False)
Exemple #10
0
def do_work(training_events, test_event, sample_size, samples_per_event,
        gold_probs, iters, l2, log_time, semsims, dfdeltas,
        use_best_feats, use_i_only, use_abs_df):
    """Train a Summarizer search task on sampled training streams and
    evaluate it on ``test_event`` after every iteration.

    Training instances are built once, before the first iteration:
    ``samples_per_event`` samples are drawn from each training stream with
    ``ds``, and a draw is repeated until at least one of its rows has a
    non-empty "nuggets" entry. Each instance is the tuple
    ``(event, sampled stream, semsims[event.type].transform(...),
    dfdeltas(event))``.

    Every iteration shuffles the instances, learns from them one at a time,
    averages the ``predict`` metrics over the training instances, predicts
    the test stream, snapshots the feature weights and records the test
    scores and the selected sentences.

    Returns
    -------
    (scores_df, weights_df, summary_df) : tuple of pandas DataFrames
        Per-iteration scores, per-iteration feature weights, and the
        selected test sentences; the first two also get an "event" column
        set to ``test_event.query_id``.
    """
    
    training_streams = []
    summary = []

    for event in training_events:
        df = get_input_stream(event, gold_probs)
        training_streams.append((event, df))

    test_df = get_input_stream(test_event, gold_probs)
    # each "stems" entry is joined with spaces before being transformed
    test_X_l = semsims[test_event.type].transform(
        test_df["stems"].apply(lambda x: ' '.join(x)).tolist())
    test_stream = (test_event, test_df, test_X_l, dfdeltas(test_event))

    vw = pyvw.vw(
        ("--l2 {} --search 2 --search_task hook --ring_size 1024 " + \
         "--search_no_caching --noconstant --quiet").format(l2)) 
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    print "use best?", task.use_best_feats 
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    task.log_time = log_time
    all_scores = []
    all_weights = []

    instances = []
    for sample in xrange(samples_per_event):
        for event, stream in training_streams:
            # Redraw until the sample contains at least one row with nuggets.
            # NOTE(review): loops forever if the stream has no nuggets at all.
            while 1:
                sample_stream = ds(stream, sample_size=sample_size)
                if (sample_stream["nuggets"].apply(len) > 0).any():
                    break
            X_l = semsims[event.type].transform(
                sample_stream["stems"].apply(lambda x: ' '.join(x)).tolist())  
            instances.append((event, sample_stream, X_l, dfdeltas(event)))



    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0    

        
        #instances = [(event, ds(stream, sample_size=sample_size))
        #             for event, stream in training_streams
        #             for sample in xrange(samples_per_event)]
        random.shuffle(instances)
        # learn from one instance per call so progress can be printed
        for i, inst in enumerate(instances):
            print "{}.{}.{}/{}".format(
                test_event.fs_name(), n_iter, i, len(instances))
            task.learn([inst])
        print "{}.{}.p".format(
            test_event.fs_name(), n_iter)
        
        # Average the predict() metrics over all training instances.
        train_egain = 0
        train_comp = 0
        train_f1 = 0
        train_loss = 0
        for i, inst in enumerate(instances):
            egain, comp, f1, loss, train_sum = predict(task, inst, n_iter)
            train_egain += egain
            train_comp += comp
            train_f1 += f1
            train_loss += loss
        train_egain = train_egain / float(len(instances))
        train_comp = train_comp / float(len(instances))
        train_f1 = train_f1 / float(len(instances))
        train_loss = train_loss / float(len(instances))
        print "{} {} train loss {}".format(test_event.query_id, n_iter, train_loss)


        pred = task.predict(test_stream)

        select_df, next_df = task.get_feature_weights()

        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter

        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)

        # map action ids to labels: SELECT stays "SELECT", anything else is "SKIP"
        pred = ["SELECT" if p == SELECT else "SKIP" for p in pred]
        # every nugget that occurs anywhere in the test stream
        all_nuggets = set()
        for nuggets in test_stream[1]["nuggets"].tolist():
            all_nuggets.update(nuggets)

        loss = 0        
        y_int_y_hat = 0
        size_y = 0
        size_y_hat = 0

        # nuggets covered so far by the selected sentences
        nuggets = set()
        for action, (_, sent) in izip(pred, test_stream[1].iterrows()):
            # gain = number of nuggets in this sentence not yet covered
            gain = len(sent["nuggets"] - nuggets)
            if action == "SELECT": 
                if gain == 0:
                    loss += 1
                summary.append({
                    "event": test_event.query_id,
                    "iter": n_iter,
                    "update id": sent["update id"],
                    "timestamp": sent["timestamp"],
                    "gain": gain, 
                    "nuggets": ",".join(sent["nuggets"]), 
                    "update text": sent["pretty text"]
                })
                nuggets.update(sent["nuggets"])
            else: 
                if gain > 0:
                    loss += 1
            # the oracle selects exactly the sentences that add a new nugget
            if gain > 0:
                oracle = "SELECT"
            else:
                oracle = "SKIP"

            # size_y counts oracle SELECTs, size_y_hat predicted SELECTs,
            # y_int_y_hat the sentences where both agree on SELECT.
            if action == "SELECT" and oracle == "SELECT":
                y_int_y_hat += 1
                size_y += 1
                size_y_hat += 1
            elif action == "SELECT" and oracle == "SKIP":
                size_y_hat += 1
            elif action == "SKIP" and oracle == "SELECT":
                size_y += 1


        if size_y_hat == 0:
            print test_event
            print (test_stream[1]["nuggets"].apply(len) > 0).any()
        # This replaces the mistake count accumulated in ``loss`` above.
        # NOTE(review): raises ZeroDivisionError when both counts are 0, and
        # the ratio has no factor of 2 as F1/Dice would - confirm intent.
        loss = 1 - float(y_int_y_hat) / (size_y + size_y_hat)
        

        # E[gain] = covered nuggets per selected sentence
        if len(nuggets) > 0:
            egain = len(nuggets) / sum([1.0 if a == "SELECT" else 0.0 for a in pred])
        else:
            egain = 0        
        # Comp. = fraction of all test-stream nuggets that were covered.
        # NOTE(review): raises ZeroDivisionError if the test stream has no nuggets.
        comp = len(nuggets) / float(len(all_nuggets)) 
        
        all_scores.append({"iter": n_iter, "Comp.": comp,
                           "E[gain]": egain, "Loss": loss, 
                           "Avg. Train Loss": train_loss,
                           "Avg. Train E[gain]": train_egain,
                           "Avg. Train Comp.": train_comp,
                           "Avg. Train F1": train_f1,
        })        

        print "{}.{}.p E[gain]={:0.6f} Comp.={:0.6f} Train Loss={:0.6f}".format(
            test_event.fs_name(), n_iter, egain, comp, train_loss)

    scores_df = pd.DataFrame(all_scores, columns=["iter", "E[gain]", "Comp.", "Loss", "Avg. Train Loss", "Avg. Train E[gain]", "Avg. Train Comp.", "Avg. Train F1"])
    weights_df = pd.concat(all_weights)
    weights_df["event"] = test_event.query_id
    scores_df["event"] = test_event.query_id
    summary_df = pd.DataFrame(
        summary, 
        columns=["iter", "event", "update id", "timestamp", "gain", 
                 "update text", "nuggets"])
    return scores_df, weights_df, summary_df

def train(sequenceLabeler, data):
    """Run one ``learn`` call of ``sequenceLabeler`` on the single sample ``data``.

    ``data`` is wrapped in a one-element list and passed through
    ``preprocess`` before being handed to the labeler.
    """
    batch = preprocess([data])
    sequenceLabeler.learn(batch)


def testit(sequenceLabeler):
    passed = 0
    for sample in samples:
        pred = postprocess(test(sequenceLabeler, sample[0]))
        if pred == sample[1]:
            passed += 1

    print "\n======== ACCURACY:[", (passed*1.0/len(samples))*100, "% ] ======"
    print "====================================\n"

if __name__ == "__main__":
    # Build a fresh search model that writes to tagger2.bin.
    fname = "tagger2.bin"
    vw = pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f {}".format(fname))
    sequenceLabeler = vw.init_search_task(SequenceLabeler3)

    # Teach one sentence at a time and re-measure accuracy after each.
    for sentence in (
        "watch_N big_B bang_I",
        "harry_B potter_I",
        "show_N me_N action_B movies_B",
    ):
        train(sequenceLabeler, sentence)
        testit(sequenceLabeler)

    vw.finish()
Exemple #12
0
def main(learner, training_ids, test_ids, sample_size, n_iters,
         report_dir_base):

    extractor = "goose"
    topk = 20
    delay = None
    threshold = .8
    res = InputStreamResource()

    events = [
        e for e in cuttsum.events.get_events()
        if e.query_num in training_ids or e.query_num in test_ids
    ]
    training_insts = []
    test_insts = []
    for event in events:
        print "Loading event", event.fs_name()
        corpus = cuttsum.corpora.get_raw_corpus(event)

        # A list of dataframes. Each dataframe is a document with =< 20 sentences.
        # This is the events document stream.
        dataframes = res.get_dataframes(event, corpus, extractor, threshold,
                                        delay, topk)

        if event.query_num in training_ids:
            training_insts.append((event, dataframes))

        if event.query_num in test_ids:
            test_insts.append((event, dataframes))

    # Init l2s task.
    vw = pyvw.vw(
        "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024  --quiet  --search_no_caching"
    )

    #task = vw.init_search_task(UpdateSummarizer)
    if learner == "PerfectOracle":
        task = vw.init_search_task(PerfectOracle)
    elif learner == "LessPerfectOracle":
        task = vw.init_search_task(LessPerfectOracle)
    elif learner == "SelectLexNextOracle":
        task = vw.init_search_task(SelectLexNextOracle)
    elif learner == "SelectLexNextLex":
        task = vw.init_search_task(SelectLexNextLex)
    elif learner == "SelectLexNextLexCache":
        task = vw.init_search_task(SelectLexNextLexCache)
    elif learner == "SelectLexGenericNextOracle":
        task = vw.init_search_task(SelectLexGenericNextOracle)
    elif learner == "SelectBasicNextBias":
        task = vw.init_search_task(SelectBasicNextBias)
    elif learner == "SelectBasicNextBiasDocAvg":
        task = vw.init_search_task(SelectBasicNextBiasDocAvg)

    for n_iter in range(n_iters):
        print "iter", n_iter + 1
        ds = downsample(training_insts, size=sample_size)
        task.learn(ds)
        all_train_df = [df for inst in training_insts for df in inst[1]]
        feature_weights = task.get_feature_weights(all_train_df)

        write_model(feature_weights, report_dir_base, n_iter)

        for event, dataframes in training_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "train", n_iter,
                        report_dir_base)

        for event, dataframes in test_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "test", n_iter,
                        report_dir_base)
Exemple #13
0
                (VERB, 'ate'),
                (DET , 'a'),
                (ADJ , 'big'),
                (NOUN, 'sandwich')],
               [(DET , 'the'),
                (NOUN, 'sandwich'),
                (VERB, 'was'),
                (ADJ , 'tasty')],
               [(NOUN, 'it'),
                (VERB, 'ate'),
                (NOUN, 'it'),
                (ADJ , 'all')] ]


# initialize VW as usual, but use 'hook' as the search_task
vw = pyvw.vw("--search 4 --quiet --search_task hook --ring_size 1024")

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)

# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
print >>sys.stderr, 'training!'
i = 0
while i < 10:
    sequenceLabeler.learn(my_dataset)
    i += 1

# now see the predictions on a test sentence
print >>sys.stderr, 'predicting!'
print sequenceLabeler.predict( [(0,w) for w in "the sandwich ate a monster".split()] )
print 'should have printed: [1, 2, 3, 1, 2]'
        return output


# wow! your data can be ANY type you want... does NOT have to be VW examples
DET = 1
NOUN = 2
VERB = 3
ADJ = 4
my_dataset = [
    [(DET, "the"), (NOUN, "monster"), (VERB, "ate"), (DET, "a"), (ADJ, "big"), (NOUN, "sandwich")],
    [(DET, "the"), (NOUN, "sandwich"), (VERB, "was"), (ADJ, "tasty")],
    [(NOUN, "it"), (VERB, "ate"), (NOUN, "it"), (ADJ, "all")],
]


# initialize VW as usual, but use 'python_hook' as the search_task
vw = pyvw.vw("--search 4 --quiet --search_task python_hook --search_no_snapshot --ring_size 1024")

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)

# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
print >> sys.stderr, "training!"
for curPass in range(10):
    sequenceLabeler.learn(my_dataset.__iter__)

# now see the predictions on a test sentence
print >> sys.stderr, "predicting!"
print sequenceLabeler.predict([(0, w) for w in "the sandwich ate a monster".split()])
print "should have printed: [1, 2, 3, 1, 2]"
Exemple #15
0
                                    my_tag=n + 1,
                                    oracle=oracle,
                                    condition=[(n, 'p'), (n - 1, 'q')])

            output[
                n] = pred - 1 if pred < n else pred  # have to +1 because n==m excluded

        return output


# TODO: if they make sure search=0 <==> ldf <==> csoaa_ldf

# demo the non-ldf version:

print 'training non-LDF'
vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --quiet")
task = vw.init_search_task(CovingtonDepParser)
for p in range(2):  # do two passes over the training data
    task.learn(my_dataset)
print 'testing non-LDF'
# each test token is paired with -1 in the slot the parser fills in
print task.predict([(w, -1) for w in "the monster ate a sandwich".split()])
print 'should have printed [ 1 2 -1 4 2 ]'

# demo the ldf version:
# (vw and task are rebound here; the non-LDF instance is not finished explicitly)
print 'training LDF'
vw = pyvw.vw(
    "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet")
task = vw.init_search_task(CovingtonDepParserLDF)
for p in range(100):  # do 100 passes over the training data
    task.learn(my_dataset)
print 'testing LDF'
Exemple #16
0
def do_work(
    train_instances,
    dev_instances,
    test_instances,
    sample_size,
    samples_per_event,
    gold_probs,
    iters,
    l2,
    log_time,
    semsims,
    dfdeltas,
    use_best_feats,
    use_i_only,
    use_abs_df,
    doc_condition,
    output_dir,
):
    """Train a Summarizer search task, select the best iterations on dev, and
    write test-set summaries, weights and scores under ``output_dir``.

    For every iteration 1..``iters`` the training instances are shuffled in
    place and learned from, every dev instance is scored with ``predict``,
    and the task's feature weights are snapshotted. The iterations with the
    highest mean dev F1, E[gain] and Comp. and the lowest mean dev Loss are
    then selected; for each of those four weight sets the test instances are
    predicted and every SELECT action becomes one summary row.

    Files written: submission.tsv (no header, no text columns),
    summaries.tsv, weights.f1.tsv, weights.loss.tsv, weights.egain.tsv,
    weights.comp.tsv and scores.tsv.

    ``sample_size``, ``samples_per_event``, ``gold_probs``, ``log_time``,
    ``semsims`` and ``dfdeltas`` are accepted but not used in this body.

    Returns None.

    NOTE(review): ``df_u`` is only bound inside the training loop, so
    ``iters`` < 1 fails with an unbound-variable error right after the loop.
    """

    vw = pyvw.vw(
        (
            "-l .001 --l2 {} --search 2 --search_task hook --ring_size 1024 "
            + "--search_no_caching --noconstant --quiet"
        ).format(l2)
    )
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    task._doc_condition = doc_condition
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    print "use doc condition?", task._doc_condition

    # all_scores: one dict per (iteration, dev instance);
    # all_weights: two dataframes (SELECT and NEXT) per iteration.
    all_scores = []
    all_weights = []

    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0

        # shuffles the caller's list in place
        random.shuffle(train_instances)
        print "iter", n_iter
        task.learn(train_instances)
        for i, inst in enumerate(dev_instances):
            egain, comp, f1, loss, _ = predict(task, inst, n_iter)
            print egain, comp, f1, loss
            all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
        # Mean dev score per iteration so far; the index is reset to 0..k-1,
        # so row r holds iteration r + 1.
        df = pd.DataFrame(all_scores)
        df_u = df.groupby("iter").mean().reset_index(drop=True)
        print df_u

        select_df, next_df = task.get_feature_weights()

        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter

        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)

    # "+ 1" converts the 0-based row of df_u back to the 1-based iteration.
    best_f1_iter = df_u["F1"].argmax() + 1
    best_egain_iter = df_u["E[gain]"].argmax() + 1
    best_comp_iter = df_u["Comp."].argmax() + 1
    best_loss_iter = df_u["Loss"].argmin() + 1

    weights_df = pd.concat(all_weights)

    all_summaries = []
    #    all_scores = []

    # Weight snapshots of the four selected iterations.
    F1_weights = weights_df[weights_df["iter"] == best_f1_iter]
    loss_weights = weights_df[weights_df["iter"] == best_loss_iter]
    egain_weights = weights_df[weights_df["iter"] == best_egain_iter]
    comp_weights = weights_df[weights_df["iter"] == best_comp_iter]

    def get_summaries(weights, run):
        """Load ``weights`` into the task, predict every test instance and
        append one row to ``all_summaries`` per SELECT action, tagged with
        the run name ``run``."""
        print "Best", run
        task.set_weights(weights)
        for test_instance in test_instances:
            event = test_instance[0]
            df = test_instance[1]
            print event
            # ask the task to record one score per predicted action
            task._keep_scores = True
            task._scores = []
            predictions = task.predict(test_instance)
            assert len(predictions) == len(task._scores)

            for action, (_, row), ascore in zip(predictions, df.iterrows(), task._scores):
                if action == SELECT:
                    #  assert ascore["SELECT"] <= ascore["NEXT"]
                    # "update id" is split on "-": the first two parts form
                    # the stream id, the third is the sentence id.
                    print "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        event.query_num,
                        "CUNLP",
                        run,
                        "-".join(row["update id"].split("-")[0:2]),
                        row["update id"].split("-")[2],
                        row["timestamp"],
                        ascore,
                    )
                    all_summaries.append(
                        {
                            "event": event.query_num,
                            "team": "CUNLP",
                            "run": run,
                            "stream id": "-".join(row["update id"].split("-")[0:2]),
                            "sentence id": row["update id"].split("-")[2],
                            "timestamp": row["timestamp"],
                            "confidence": row["probs"],
                            "partial": ascore,
                            "text": row["sent text"],
                            "pretty text": row["pretty text"],
                        }
                    )
                # else:
                #  assert ascore["SELECT"] >= ascore["NEXT"]

    #    all_scores = []
    #    task.set_weights(F1_weights)
    #    for i, inst in enumerate(dev_instances):
    #        egain, comp, f1, loss, _ = predict(task, inst, best_f1_iter)
    #        print egain, comp, f1, loss
    #        all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
    #    df = pd.DataFrame(all_scores)
    #    df_u = df.groupby("iter").mean().reset_index(drop=True)
    #    print df_u
    #
    #    all_scores = []
    #    task.set_weights(egain_weights)
    #    for i, inst in enumerate(dev_instances):
    #        egain, comp, f1, loss, _ = predict(task, inst, best_egain_iter)
    #        print egain, comp, f1, loss
    #        all_scores.append({"iter": n_iter, "E[gain]": egain, "Comp.": comp, "F1": f1, "Loss": loss})
    #    df = pd.DataFrame(all_scores)
    #    df_u = df.groupby("iter").mean().reset_index(drop=True)
    #    print df_u

    # Each call overwrites the task's weights, so the order only affects the
    # order of rows in all_summaries.
    get_summaries(F1_weights, "L2S.F1")
    get_summaries(loss_weights, "L2S.Loss")
    get_summaries(egain_weights, "L2S.E[gain]")
    get_summaries(comp_weights, "L2S.Comp.")

    df = pd.DataFrame(
        all_summaries,
        columns=[
            "event",
            "team",
            "run",
            "stream id",
            "sentence id",
            "timestamp",
            "confidence",
            "partial",
            "pretty text",
            "text",
        ],
    )
    submission_path = os.path.join(output_dir, "submission.tsv")
    summary_path = os.path.join(output_dir, "summaries.tsv")
    f1_weights_path = os.path.join(output_dir, "weights.f1.tsv")
    loss_weights_path = os.path.join(output_dir, "weights.loss.tsv")
    egain_weights_path = os.path.join(output_dir, "weights.egain.tsv")
    comp_weights_path = os.path.join(output_dir, "weights.comp.tsv")

    scores_path = os.path.join(output_dir, "scores.tsv")

    # columns of the submission file (everything except the text columns)
    no_text = ["event", "team", "run", "stream id", "sentence id", "timestamp", "confidence", "partial"]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # clamp negative confidences to 0
    df["confidence"] = df["confidence"].apply(lambda x: max(x, 0))
    with open(submission_path, "w") as f:
        df[no_text].to_csv(f, index=False, header=False, sep="\t")
    with open(summary_path, "w") as f:
        df.to_csv(f, index=False, sep="\t")

    with open(f1_weights_path, "w") as f:
        F1_weights.to_csv(f, index=False, sep="\t")
    with open(loss_weights_path, "w") as f:
        loss_weights.to_csv(f, index=False, sep="\t")
    with open(egain_weights_path, "w") as f:
        egain_weights.to_csv(f, index=False, sep="\t")
    with open(comp_weights_path, "w") as f:
        comp_weights.to_csv(f, index=False, sep="\t")
    with open(scores_path, "w") as f:
        df_u.to_csv(f, sep="\t", index=False)
import pyvw

# Build the same example two ways and learn from it at each step, with
# vw started in --audit mode.
vw = pyvw.vw('--audit')
# Reference example: namespaces 'a' and 'x' supplied together.
full = vw.example( { 'a': ['b'], 'x': ['y'] } )
full.learn()

# Incremental example: start with namespace 'a' only ...
part = vw.example( {'a': ['b'] } )
part.learn()

# ... then add feature 'y' to namespace 'x' and learn again.
part.push_features('x', ['y'])
part.learn()

# Drop namespace 'x' (passed as its character code, hence ord) and refill
# it with feature 'z' instead.
part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
Exemple #18
0
    def _run(self, y_x):
        """Make two chained predictions for one ``(label, (x0, x1))`` item.

        The first prediction is made without an oracle and rescaled with
        ``* 2 - 3`` (so 1 becomes -1 and 2 becomes +1). The second sees the
        extra feature ``x0 * h``, uses the label as oracle and is
        conditioned on the first. A loss of 0 is declared when the second
        prediction equals the label, otherwise 1. Returns the second
        prediction.
        """
        label, features = y_x
        x0, x1 = features

        # Step 1: latent decision, no supervision.
        ex = self.example({'x': [('x0', x0), ('x1', x1)]})
        latent = self.sch.predict(examples=ex, my_tag=1, oracle=None)
        h = latent * 2 - 3

        # Step 2: final decision; ``ex`` is deliberately rebound so the
        # first example is released at the same point as before.
        ex = self.example({'x': [('x0', x0), ('x1', x1), ('x0h', x0 * h)]})
        guess = self.sch.predict(examples=ex, my_tag=2, oracle=label, condition=(1, 'h'))

        if guess == label:
            self.sch.loss(0.)
        else:
            self.sch.loss(1.)
        return guess

# Each item is (label, (x0, x1)): label 1 when x0 and x1 have the same sign,
# label 2 when the signs differ.
my_dataset = [ (1, (-1, -1)),
               (1, (+1, +1)),
               (2, (-1, +1)),
               (2, (+1, -1)) ]

    
vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --search_alpha 1e-2")
lv = vw.init_search_task(LatentVariableClassifier)

# 100 passes over the four training items
print 'training'
for i in range(100):
    lv.learn(my_dataset)

# Predict each input again; the true label y is replaced by 0 in the item
# handed to predict().
print 'testing'
for (y,x) in my_dataset:
    print 'pred =', lv.predict( (0,x) )
    
Exemple #19
0
import pyvw

# Demo: learn on a complete example, then build the same example
# incrementally and learn after each modification.
vw = pyvw.vw('--audit')

# Full example: feature 'b' in namespace 'a' and feature 'y' in namespace 'x'.
full = vw.example({'a': ['b'], 'x': ['y']})
full.learn()

# Partial example: only namespace 'a' to begin with.
part = vw.example({'a': ['b']})
part.learn()

# Add namespace 'x' to the same example object and learn again.
part.push_features('x', ['y'])
part.learn()

# Drop namespace 'x' (erase_namespace is given the character code, hence
# ord) and replace its content with feature 'z' before learning once more.
part.erase_namespace(ord('x'))
part.push_features('x', ['z'])
part.learn()
Exemple #20
0
        label, ex = parseExample(tokens)
        ldf_example.append((label, ex))
    else:
        if ldf_example:
            shared = ldf_example[0]
            ldf = ldf_example[1:]
            for l in ldf:
                for k in shared[1]:
                    l[1][k] = shared[1][k]
            sentence.append(ldf)
            ldf_example = []


# initialize VW as usual, but use 'hook' as the search_task
# (earlier configuration with per-namespace interactions and n-grams, kept for reference)
# vw = pyvw.vw("--search 0 --hash all -b 31 --csoaa_ldf mc --quiet --search_task hook -q t: -q m: --ngram t2 --ngram m2 --ngram g2 --ngram c2")
vw = pyvw.vw("--search 0 --hash all -b 31 --csoaa_ldf mc --quiet --search_task hook -q ::")

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)


# Fixed split: the first 77072 entries of `data` are used for training and
# the remainder for testing.
train = data[:77072]
test = data[77072:]


def prepare(test):
    for s in range(len(test)):
        sentence = test[s]
        oracle = []
        for w in range(len(sentence)):
            word = sentence[w]
Exemple #21
0
def main(input_path, features, loss_metric, fold,
        lemma_length_cutoff, use_interactions, max_iters,
        output_dir):

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    converters = {"lemmas": eval, "tokens": eval}

    with open(input_path, "rb") as f:
        all_inputs = pd.read_csv(f, sep="\t", converters=converters)
        print("Read {} input sentences from {}".format(
            len(all_inputs), input_path))

    if lemma_length_cutoff > 0:
        all_inputs = lemma_filter(all_inputs, lemma_length_cutoff)

    vw_str = "--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 " \
             "--search_no_caching --quiet --noconstant"
    vw = pyvw.vw(vw_str)
    instances = make_instances(vw, all_inputs, features, use_interactions)

    chunk_size = 20
    chunks = [instances[i:i+chunk_size] for i in range(0, 80, 20)]
    instances_train = []
    for i, chunk in enumerate(chunks):
        if i != fold:
            instances_train.extend(chunk)
        else:
            instances_dev = chunks[i]
    print "Fold {}".format(fold)

    task = vw.init_search_task(L2SSum)
    task.set_loss_func(loss_metric)
    print task.get_loss_func()

    from datetime import datetime, timedelta
    now = datetime.now()
    total_train_time = timedelta(0)
    for num_iter in range(1, max_iters + 1):    
        print("iter {}/{}".format(num_iter, max_iters))
        task.learn(instances_train)
        dur = datetime.now() - now
        total_train_time += dur
        print("took {}".format(dur))
        now = datetime.now()
        write_weights(output_dir, num_iter, vw, instances[0][-1])
        
        output_paths = []
        for instance in instances_train:
            docset, year, opath = write_output(
                task.predict(instance), num_iter, instance, output_dir)
            output_paths.append((docset, year, opath))
        write_eval(output_paths, num_iter, output_dir)

        output_paths_dev = []
        for instance in instances_dev:
            docset, year, opath = write_output(
                task.predict(instance), num_iter, instance, output_dir,
                dev=True)
            output_paths_dev.append((docset, year, opath))
        write_eval_dev(output_paths_dev, num_iter, output_dir)



    print total_train_time, 
    print timedelta(seconds=total_train_time.total_seconds() / 10.)
Exemple #22
0
def main(learner, training_ids, test_ids, sample_size, 
         n_iters, report_dir_base):

    extractor = "goose" 
    topk = 20
    delay = None
    threshold = .8
    res = InputStreamResource()

    events = [e for e in cuttsum.events.get_events()
              if e.query_num in training_ids or e.query_num in test_ids]
    training_insts = []
    test_insts = []
    for event in events:
        print "Loading event", event.fs_name()
        corpus = cuttsum.corpora.get_raw_corpus(event)

        # A list of dataframes. Each dataframe is a document with =< 20 sentences.
        # This is the events document stream.
        dataframes = res.get_dataframes(event, corpus, extractor, threshold,
                delay, topk)

        if event.query_num in training_ids:
            training_insts.append((event, dataframes))    
           
        if event.query_num in test_ids:
            test_insts.append((event, dataframes))    

    # Init l2s task.
    vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024  --quiet  --search_no_caching")

    #task = vw.init_search_task(UpdateSummarizer)
    if learner == "PerfectOracle":
        task = vw.init_search_task(PerfectOracle)
    elif learner == "LessPerfectOracle":
        task = vw.init_search_task(LessPerfectOracle)
    elif learner == "SelectLexNextOracle":
        task = vw.init_search_task(SelectLexNextOracle)
    elif learner == "SelectLexNextLex":
        task = vw.init_search_task(SelectLexNextLex)
    elif learner == "SelectLexNextLexCache":
        task = vw.init_search_task(SelectLexNextLexCache)
    elif learner == "SelectLexGenericNextOracle":
        task = vw.init_search_task(SelectLexGenericNextOracle)
    elif learner == "SelectBasicNextBias":
        task = vw.init_search_task(SelectBasicNextBias)
    elif learner == "SelectBasicNextBiasDocAvg":
        task = vw.init_search_task(SelectBasicNextBiasDocAvg)
    
    for n_iter in range(n_iters):
        print "iter", n_iter + 1
        ds = downsample(training_insts, size=sample_size)
        task.learn(ds)
        all_train_df = [df for inst in training_insts for df in inst[1]]
        feature_weights = task.get_feature_weights(all_train_df)

        write_model(feature_weights, report_dir_base, n_iter)

        for event, dataframes in training_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "train", n_iter,
                report_dir_base)


        for event, dataframes in test_insts:
            # Predict a sequence for this training examples and see if it is sensible.
            print "PREDICTING", event.fs_name()
            sequence, scores = task.predict_with_scores((event, dataframes))
            print sequence
            make_report(event, dataframes, sequence, scores, "test", n_iter,
                report_dir_base)

# wow! your data can be ANY type you want... does NOT have to be VW examples
# Numeric ids for the four part-of-speech tags used below.
DET = 1
NOUN = 2
VERB = 3
ADJ = 4
# Three training sentences, each a list of (tag, word) pairs.
my_dataset = [[(DET, 'the'), (NOUN, 'monster'), (VERB, 'ate'), (DET, 'a'),
               (ADJ, 'big'), (NOUN, 'sandwich')],
              [(DET, 'the'), (NOUN, 'sandwich'), (VERB, 'was'),
               (ADJ, 'tasty')],
              [(NOUN, 'it'), (VERB, 'ate'), (NOUN, 'it'), (ADJ, 'all')]]

# initialize VW as usual, but use 'hook' as the search_task
vw = pyvw.vw(
    "--search 4 --quiet --search_task hook --search_no_snapshot --ring_size 1024"
)

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)

# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
print >> sys.stderr, 'training!'
for curPass in range(10):
    sequenceLabeler.learn(my_dataset.__iter__)

# now see the predictions on a test sentence
# (the tag slot of each test pair is filled with 0)
print >> sys.stderr, 'predicting!'
print sequenceLabeler.predict([(0, w)
                               for w in "the sandwich ate a monster".split()])
print 'should have printed: [1, 2, 3, 1, 2]'
        ex = self.example({'w': [word + '_' + str(p)]}, labelType=self.vw.lCostSensitive)
        ex.set_label_string(str(p) + ':0')
        return ex
        
    def _run(self, sentence):
        """Label every (pos, word) pair of ``sentence`` and return the tags.

        Named ``_run`` as a reminder that it is not meant to be called
        directly.  For each word one example per candidate tag is built with
        ``makeExample`` and the list is handed to the searcher; the oracle
        and the returned prediction are 0-based, so both are shifted by one
        relative to the 1-based tag ids.
        """
        candidate_tags = [DET, NOUN, VERB, ADJ]
        tags = []
        for idx, (pos, word) in enumerate(sentence):
            candidates = [self.makeExample(word, tag) for tag in candidate_tags]
            choice = self.sch.predict(examples=candidates,
                                      my_tag=idx + 1,
                                      oracle=pos - 1,
                                      condition=(idx, 'p'))
            tags.append(choice + 1)
        return tags

# initialize VW as usual, but use 'hook' as the search_task
vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 1024")

# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(SequenceLabeler)

# train on the above dataset: the loop below makes 100000000 passes, handing
# the my_dataset list itself to learn() each time
# NOTE(review): this many passes looks like a stress/soak run rather than
# ordinary training - confirm the intended pass count.
print >>sys.stderr, 'training!'
i = 0
while i < 100000000:
    sequenceLabeler.learn(my_dataset)
    i += 1

# now see the predictions on a test sentence
# (the tag slot of each test pair is filled with 1)
print >>sys.stderr, 'predicting!'
print sequenceLabeler.predict( [(1,w) for w in "the sandwich ate a monster".split()] )
print 'should have printed: [1, 2, 3, 1, 2]'
Exemple #25
0
def compute_reference(prev, truth):
    """Return the list of reference BIO actions for the current position.

    The original body mixed tabs and spaces so that the ``if`` bodies lined
    up with their headers; it is re-indented here to the structure the
    statements evidently form.

    * If ``truth`` equals ``BIO('O')`` or ``BIO('B')``, the result holds
      ``BIO('O', key)`` (respectively ``BIO('B', key)``) for every key in
      ``valid_labels``.
    * Otherwise, if ``prev`` is neither ``BIO('I')`` nor ``BIO('B')``, the
      result holds ``BIO('O', key)`` for every key.
    * Otherwise it holds ``BIO('I', key)`` for every key.

    ``BIO`` and ``valid_labels`` are module-level names defined elsewhere.
    """
    if truth == BIO('O') or truth == BIO('B'):
        ret = []
        # Two independent checks (not elif), exactly as in the original.
        if truth == BIO('O'):
            ret.extend(BIO('O', key) for key in valid_labels)
        if truth == BIO('B'):
            ret.extend(BIO('B', key) for key in valid_labels)
        return ret  # TODO

    if prev != BIO('I') and prev != BIO('B'):
        # An earlier version appended BIO('B') here; that line was already
        # commented out.
        return [BIO('O', key) for key in valid_labels]
    return [BIO('I', key) for key in valid_labels]
		

        
class MWE(pyvw.SearchTask):
    """Search task that predicts one BIO label per token of a sentence."""

    def __init__(self, vw, sch, num_actions):
        # The parent initializer is mandatory: it stores vw and sch on self.
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)

        # Hamming loss and automatic conditioning features are used for now;
        # in Part II this should be replaced by a more task-focused loss such
        # as one-minus-F-measure.
        options = sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES
        sch.set_options(options)

    def _run(self, sentence):
        """Predict and return a list of BIO labels for ``sentence``.

        ``sentence`` is a sequence of (label, word, lemma, pos) tuples, where
        label is a BIO and the other fields are strings.
        """
        predictions = []
        previous = BIO('O')   # label chosen at the previous position
        for idx, (label, word, lemma, pos) in enumerate(sentence):
            with self.make_example(word, lemma, pos) as ex:
                # numeric ids of every acceptable reference action
                oracle_ids = [ref.numeric_label
                              for ref in compute_reference(previous, label)]
                # numeric ids of the actions permitted after `previous`
                allowed_ids = [nxt.numeric_label
                               for nxt in previous.valid_next()]
                choice = self.sch.predict(examples=ex,
                                          my_tag=idx + 1,
                                          oracle=oracle_ids,
                                          condition=[(idx, 'p'), (idx - 1, 'q')],
                                          allowed=allowed_ids)
                # convert the numeric prediction back to a BIO label, which
                # also becomes the context for the next position
                previous = numeric_label_to_BIO(choice)
                predictions.append(previous)

        return predictions

    def make_example(self, word, lemma, pos):
        """Build a cost-sensitive example with namespaces 'w', 'l' and 'p'."""
        features = {'w': [word], 'l': [lemma], 'p': [pos]}
        ex = self.example(features, labelType=self.vw.lCostSensitive)
        ex.set_label_string(str(pos) + ':0')
        return ex
        


def make_data(BIO,filename):
    """Read a tab-separated, blank-line-delimited file into sentences.

    Every non-blank line must hold exactly nine tab-separated fields:
    offset, word, lemma, pos, mwe, parent, strength, ssense, sid.  Each
    such line becomes the tuple ``(BIO(mwe), word, lemma, pos)``.  A blank
    line closes the current sentence.

    Parameters
    ----------
    BIO : callable applied to the ``mwe`` field of every token.
    filename : path of the file to read.

    Returns
    -------
    list of sentences, each a list of ``(BIO(mwe), word, lemma, pos)``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly nine fields.
    """
    data = []
    sentence = []
    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line == "":
                # at end of sentence
                data.append(sentence)
                sentence = []
            else:
                (_offset, word, lemma, pos, mwe,
                 _parent, _strength, _ssense, _sid) = line.split('\t')
                sentence.append((BIO(mwe), word, lemma, pos))
    # Keep the final sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
    return data



# Script entry point: train the MWE sequence labeler on the first 80% of the
# training file and report Hamming loss on the remaining 20%.
if __name__ == "__main__":
    # input/output files
    # NOTE(review): testfilename and outfilename are not used in this block.
    trainfilename='dimsum16.p3.train.contiguous'
    testfilename='dimsum16.p3.test.contiguous'
    outfilename='dimsum16.p3.test.contiguous.out'

    # read in some examples to be used as training/dev set
    train_data = make_data(BIO,trainfilename)

    # initialize VW and sequence labeler as learning to search
    vw = pyvw.vw(search=9, quiet=True, search_task='hook', ring_size=1024, \
                 search_rollin='learn', search_rollout='none')

    # tell VW to construct your search task object
    sequenceLabeler = vw.init_search_task(MWE)

    # train!
    # we make 5 passes over the training data, training on the first 80%
    # examples (we retain the last 20% as development data)
    print 'training!'
    N = int(0.8 * len(train_data))
    for i in xrange(5):
        print 'iteration ', i, ' ...'
        sequenceLabeler.learn(train_data[0:N])
        
    # now see the predictions on 20% held-out sentences 
    print 'predicting!' 
    hamming_loss, total_words = 0,0
    for n in range(N, len(train_data)):
        truth = [label for label,word,lemma,pos in train_data[n]]
        # the gold label of every token is replaced by BIO('O') before predicting
        pred  = sequenceLabeler.predict( [(BIO('O'),word,lemma,pos) for label,word,lemma,pos in train_data[n]] )
        # count the positions where the prediction differs from the truth
        for i,t in enumerate(truth):
            if t != pred[i]:
                hamming_loss += 1
            total_words += 1
    #    print 'predicted:', '\t'.join(map(str, pred))
    #    print '    truth:', '\t'.join(map(str, truth))
    #    print ''
    print 'total hamming loss on dev set:', hamming_loss, '/', total_words
Exemple #26
0
    def _run(self, y_x):
        y,(x0,x1) = y_x

        ex = self.example({'x': [('x0',x0), ('x1',x1)]})
        h  = self.sch.predict(examples=ex, my_tag=1, oracle=None) * 2 - 3
        
        ex = self.example({'x': [('x0',x0), ('x1',x1), ('x0h',x0*h)]})
        p  = self.sch.predict(examples=ex, my_tag=2, oracle=y, condition=(1,'h'))
        
        self.sch.loss( 0. if p == y else 1. )
        return p

# Each instance is (label, (x0, x1)): the label is 1 when both inputs have
# the same sign and 2 when the signs differ.
my_dataset = [ (1, (-1, -1)),
               (1, (+1, +1)),
               (2, (-1, +1)),
               (2, (+1, -1)) ]


# Create VW with the 'hook' search task and attach the Python task class.
vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --search_alpha 1e-2")
lv = vw.init_search_task(LatentVariableClassifier)

# 100 passes over the four training instances.
print('training')
for i in range(100):
    lv.learn(my_dataset)

# Predict on the same inputs; 0 is passed in the label slot.
print('testing')
for (y,x) in my_dataset:
    print('pred =', lv.predict( (0,x) ))
    
Exemple #27
0
            sentence.append((BIO(mwe),word,lemma,pos))
    return data



# Script entry point: train the MWE sequence labeler on the first 80% of the
# training file.
if __name__ == "__main__":
    # input/output files
    # NOTE(review): testfilename and outfilename are not used in the part of
    # this block that is visible here.
    trainfilename='dimsum16.p3.train.contiguous'
    testfilename='dimsum16.p3.test.contiguous'
    outfilename='dimsum16.p3.test.contiguous.out'

    # read in some examples to be used as training/dev set
    train_data = make_data(BIO,trainfilename)

    # initialize VW and sequence labeler as learning to search
    vw = pyvw.vw(search=9, quiet=True, search_task='hook', ring_size=1024, \
                 search_rollin='learn', search_rollout='none')

    # tell VW to construct your search task object
    sequenceLabeler = vw.init_search_task(MWE)

    # train!
    # we make 5 passes over the training data, training on the first 80%
    # examples (we retain the last 20% as development data)
    print 'training!'
    N = int(0.8 * len(train_data))
    for i in xrange(5):
        print 'iteration ', i, ' ...'
        sequenceLabeler.learn(train_data[0:N])
        
    # now see the predictions on 20% held-out sentences 
    print 'predicting!' 
Exemple #28
0
import pyvw


def my_predict(vw, ex):
    """Return the sum of ``vw.get_weight(f) * v`` over ``ex.iter_features()``.

    The accumulator starts at ``0.`` so an example without features yields
    a float zero.
    """
    total = 0.
    for feature, value in ex.iter_features():
        weight = vw.get_weight(feature)
        total = total + weight * value
    return total

def ensure_close(a,b,eps=1e-6):
    """Raise ``Exception`` when ``a`` and ``b`` differ by more than ``eps``."""
    gap = abs(a - b)
    if gap <= eps:
        return
    raise Exception(
        "test failed: expected {!s} and {!s} to be {!s}-close, but they differ by {!s}".format(
            a, b, eps, gap))

###############################################################################
# Create a quiet VW instance.
vw = pyvw.vw("--quiet")


###############################################################################
# Learn directly from a VW-format text line.
vw.learn("1 |x a b")


###############################################################################
print '# do some stuff with a read example:'
# Parse an example with namespaces x and y, then learn on it four times.
ex = vw.example("1 |x a b |y c")
ex.learn() ; ex.learn() ; ex.learn() ; ex.learn()
updated_pred = ex.get_updated_prediction()
print 'current partial prediction =', updated_pred

# compute our own prediction
print '        my view of example =', str(list(ex.iter_features()))
my_pred = my_predict(vw, ex)
Exemple #29
0
                # print 'pred', pred
                # print 'performing action', action
                # print
                parser.perform_action(action)
                output.append(pred)
                n += 1
        loss = parser.loss()
        self.sch.loss(loss)
        parser.stop()
        print 'parsed doc', doc_id, 'with loss', loss
        return output

    def get_label(self, action):
        """Return 1 when ``action['action']`` is 'shift', otherwise 2."""
        if action['action'] == 'shift':
            return 1
        return 2

    def get_action(self, label):
        """Return ``self.SHIFT`` for label 1 and ``self.REDUCE`` for any other label."""
        if label == 1:
            return self.SHIFT
        return self.REDUCE

# Script entry point: train the discourse parser on every document id of the
# session and save the model.
if __name__ == '__main__':
    disco = DiscoSession()
    # the dataset handed to the learner is the list of document ids
    dataset = disco.get_doc_ids()
    # -f disco.vw names the model file VW writes
    vw = pyvw.vw("--search 2 --quiet --search_task hook --ring_size 1024 --search_no_caching -f disco.vw")
    parser = vw.init_search_task(DiscourseParser)

    # five passes over the dataset
    print 'training ...'
    for i in xrange(5):
        parser.learn(dataset)

    vw.finish()
    print 'done!'
Exemple #30
0
def do_work(
    training_events,
    test_event,
    sample_size,
    samples_per_event,
    gold_probs,
    iters,
    l2,
    log_time,
    semsims,
    dfdeltas,
    use_best_feats,
    use_i_only,
    use_abs_df,
):
    """Train a Summarizer search task and score it on ``test_event`` each iteration.

    Training instances are sampled once up front (``samples_per_event``
    samples per training event).  For each of ``iters`` iterations the
    instances are shuffled and learned one at a time, average training
    metrics are computed via ``predict``, and the test stream is predicted
    and scored.

    Parameters
    ----------
    training_events : iterable of events used for training.
    test_event : the held-out event that is evaluated.
    sample_size : forwarded to ``ds`` when sampling a training stream.
    samples_per_event : number of sampled streams per training event.
    gold_probs : forwarded to ``get_input_stream``.
    iters : number of training iterations.
    l2 : value substituted into the ``--l2`` VW option.
    log_time, use_best_feats, use_i_only, use_abs_df : stored as attributes
        on the task object.
    semsims : mapping indexed by ``event.type`` whose values expose
        ``transform``.
    dfdeltas : callable invoked with an event.

    Returns
    -------
    (scores_df, weights_df, summary_df) : three pandas DataFrames holding
    the per-iteration scores, the per-iteration feature weights, and the
    sentences selected on the test stream.
    """

    training_streams = []
    summary = []

    for event in training_events:
        df = get_input_stream(event, gold_probs)
        training_streams.append((event, df))

    # Test instance: (event, stream dataframe, transformed stems, dfdeltas(event)).
    test_df = get_input_stream(test_event, gold_probs)
    test_X_l = semsims[test_event.type].transform(test_df["stems"].apply(lambda x: " ".join(x)).tolist())
    test_stream = (test_event, test_df, test_X_l, dfdeltas(test_event))

    vw = pyvw.vw(
        ("--l2 {} --search 2 --search_task hook --ring_size 1024 " + "--search_no_caching --noconstant --quiet").format(
            l2
        )
    )
    task = vw.init_search_task(Summarizer)
    task.use_best_feats = use_best_feats
    task.use_i_only = use_i_only
    task.use_abs_df = use_abs_df
    print "use best?", task.use_best_feats
    print "use i only?", task.use_i_only
    print "use abs df?", task.use_abs_df
    task.log_time = log_time
    all_scores = []
    all_weights = []

    instances = []
    for sample in xrange(samples_per_event):
        for event, stream in training_streams:
            # Resample until the sample has at least one row with a non-empty
            # "nuggets" entry.
            # NOTE(review): this never terminates if ds() cannot produce such
            # a sample for the stream.
            while 1:
                sample_stream = ds(stream, sample_size=sample_size)
                if (sample_stream["nuggets"].apply(len) > 0).any():
                    break
            X_l = semsims[event.type].transform(sample_stream["stems"].apply(lambda x: " ".join(x)).tolist())
            instances.append((event, sample_stream, X_l, dfdeltas(event)))

    for n_iter in xrange(1, iters + 1):
        task.total_loss = 0

        # instances = [(event, ds(stream, sample_size=sample_size))
        #             for event, stream in training_streams
        #             for sample in xrange(samples_per_event)]
        random.shuffle(instances)
        # Learn on one instance at a time, logging progress.
        for i, inst in enumerate(instances):
            print "{}.{}.{}/{}".format(test_event.fs_name(), n_iter, i, len(instances))
            task.learn([inst])
        print "{}.{}.p".format(test_event.fs_name(), n_iter)

        # Average the training metrics over all instances.
        # NOTE(review): divides by len(instances); fails if there are none.
        train_egain = 0
        train_comp = 0
        train_f1 = 0
        train_loss = 0
        for i, inst in enumerate(instances):
            egain, comp, f1, loss, train_sum = predict(task, inst, n_iter)
            train_egain += egain
            train_comp += comp
            train_f1 += f1
            train_loss += loss
        train_egain = train_egain / float(len(instances))
        train_comp = train_comp / float(len(instances))
        train_f1 = train_f1 / float(len(instances))
        train_loss = train_loss / float(len(instances))
        print "{} {} train loss {}".format(test_event.query_id, n_iter, train_loss)

        pred = task.predict(test_stream)

        # Record this iteration's feature weights for both classes.
        select_df, next_df = task.get_feature_weights()

        select_df["class"] = "SELECT"
        select_df["iter"] = n_iter

        next_df["class"] = "NEXT"
        next_df["iter"] = n_iter
        all_weights.append(select_df)
        all_weights.append(next_df)

        # Every prediction other than SELECT is treated as SKIP.
        pred = ["SELECT" if p == SELECT else "SKIP" for p in pred]
        all_nuggets = set()
        for nuggets in test_stream[1]["nuggets"].tolist():
            all_nuggets.update(nuggets)

        loss = 0
        y_int_y_hat = 0
        size_y = 0
        size_y_hat = 0

        # Walk the test stream in order; `nuggets` holds everything already
        # covered by selected sentences.
        nuggets = set()
        for action, (_, sent) in izip(pred, test_stream[1].iterrows()):
            # number of nuggets in this sentence that are not yet covered
            # (presumably sent["nuggets"] is a set - verify against caller)
            gain = len(sent["nuggets"] - nuggets)
            if action == "SELECT":
                if gain == 0:
                    loss += 1
                summary.append(
                    {
                        "event": test_event.query_id,
                        "iter": n_iter,
                        "update id": sent["update id"],
                        "timestamp": sent["timestamp"],
                        "gain": gain,
                        "nuggets": ",".join(sent["nuggets"]),
                        "update text": sent["pretty text"],
                    }
                )
                nuggets.update(sent["nuggets"])
            else:
                if gain > 0:
                    loss += 1
            # The oracle selects exactly the sentences that add new nuggets.
            if gain > 0:
                oracle = "SELECT"
            else:
                oracle = "SKIP"

            if action == "SELECT" and oracle == "SELECT":
                y_int_y_hat += 1
                size_y += 1
                size_y_hat += 1
            elif action == "SELECT" and oracle == "SKIP":
                size_y_hat += 1
            elif action == "SKIP" and oracle == "SELECT":
                size_y += 1

        if size_y_hat == 0:
            print test_event
            print (test_stream[1]["nuggets"].apply(len) > 0).any()
        # The per-sentence loss counter accumulated above is discarded here
        # and replaced by an overlap-based loss.
        # NOTE(review): raises ZeroDivisionError when size_y + size_y_hat == 0.
        loss = 1 - float(y_int_y_hat) / (size_y + size_y_hat)

        if len(nuggets) > 0:
            egain = len(nuggets) / sum([1.0 if a == "SELECT" else 0.0 for a in pred])
        else:
            egain = 0
        # NOTE(review): raises ZeroDivisionError when the test stream has no nuggets.
        comp = len(nuggets) / float(len(all_nuggets))

        all_scores.append(
            {
                "iter": n_iter,
                "Comp.": comp,
                "E[gain]": egain,
                "Loss": loss,
                "Avg. Train Loss": train_loss,
                "Avg. Train E[gain]": train_egain,
                "Avg. Train Comp.": train_comp,
                "Avg. Train F1": train_f1,
            }
        )

        print "{}.{}.p E[gain]={:0.6f} Comp.={:0.6f} Train Loss={:0.6f}".format(
            test_event.fs_name(), n_iter, egain, comp, train_loss
        )

    scores_df = pd.DataFrame(
        all_scores,
        columns=[
            "iter",
            "E[gain]",
            "Comp.",
            "Loss",
            "Avg. Train Loss",
            "Avg. Train E[gain]",
            "Avg. Train Comp.",
            "Avg. Train F1",
        ],
    )
    weights_df = pd.concat(all_weights)
    weights_df["event"] = test_event.query_id
    scores_df["event"] = test_event.query_id
    summary_df = pd.DataFrame(
        summary, columns=["iter", "event", "update id", "timestamp", "gain", "update text", "nuggets"]
    )
    return scores_df, weights_df, summary_df
Exemple #31
0
def my_predict(vw, ex):
    """Compute a prediction by hand from the example's features.

    Adds up ``vw.get_weight(f) * v`` for every ``(f, v)`` pair produced by
    ``ex.iter_features()``, starting from ``0.``.
    """
    contributions = (vw.get_weight(feat) * val
                     for feat, val in ex.iter_features())
    acc = 0.
    for term in contributions:
        acc += term
    return acc


def ensure_close(a, b, eps=1e-6):
    """Check that ``a`` and ``b`` are within ``eps`` of each other.

    Raises ``Exception`` with a descriptive message when they are not.
    """
    difference = abs(a - b)
    if not difference > eps:
        return
    message = ("test failed: expected %s and %s to be %s-close, but they differ by %s"
               % (a, b, eps, difference))
    raise Exception(message)


###############################################################################
# Create a quiet VW instance.
vw = pyvw.vw("--quiet")

###############################################################################
# Learn directly from a VW-format text line.
vw.learn("1 |x a b")

###############################################################################
print '# do some stuff with a read example:'
# Parse an example with namespaces x and y, then learn on it four times.
ex = vw.example("1 |x a b |y c")
ex.learn()
ex.learn()
ex.learn()
ex.learn()
updated_pred = ex.get_updated_prediction()
print 'current partial prediction =', updated_pred

# compute our own prediction
def main(argv):
	if len(argv) < -1:
		print "usage python concept_relation_joint_learning.py concept_training_dataset_p concept_test_dataset_p span_concept_dict_p vnpb_words_concepts_dict_p relation_train_dataset_p relation_test_dataset_p kbest_dep_parse_p original_amr_aligned nodes_relation_dict_p"
		return

	global edgeLabelsList
	global span_concept_dict_p
	global vnpb_words_concepts_dict_p
	global dep_parse
	global gold_relation_dict
	global concept_labels
	global concept_map
	global relation_map
	global training_id_dict
	global flag
	global seen_dict
	global prev_sent_id
	global nodes_relation_dict_in
	global nodes_relation_dict_out
	global nodes_relation_dict_pair
	global dep_parse_nx

	#change this is you want to use debugger
	debug = False

	if debug:
		concept_training_dataset_p = "../data/amr-release-1.0-training-proxy/concept_dataset.p"
		concept_test_dataset_p = "../data/amr-release-1.0-test-proxy/concept_dataset.p"
		span_concept_dict_p = "../data/amr-release-1.0-training-proxy/span_concept_dict.p"
		vnpb_words_concepts_dict_p = "../data/amr-release-1.0-training-proxy/vnpb_words_concepts_dict.p"
		relation_training_dataset_p = "../data/amr-release-1.0-training-proxy/relation_dataset.p"
		kbest_dep_parse_p_train = "../data/amr-release-1.0-training-proxy/dep_parse.p"
		kbest_dep_parse_p_test = "../data/amr-release-1.0-test-proxy/dep_parse.p"
		original_amr_aligned = "../data/amr-release-1.0-test-proxy/amr-release-1.0-test-proxy.aligned"
		nodes_relation_dict_p = "../data/amr-release-1.0-training-proxy/nodes_relation_dict.p"
		amr_out_file_name = "proxy-out-temp_quant_d"

	else:
		concept_training_dataset_p = argv[0]
		concept_test_dataset_p = argv[1]
		span_concept_dict_p = argv[2]
		vnpb_words_concepts_dict_p = argv[3]
		relation_training_dataset_p = argv[4]
		kbest_dep_parse_p_train = argv[5]
		kbest_dep_parse_p_test = argv[6]
		original_amr_aligned = argv[7]
		nodes_relation_dict_p = argv[8]
		amr_out_file_name = argv[9]

	print "Starting Up!"
	nodes_relation_dict_out, nodes_relation_dict_in, nodes_relation_dict_pair = pickle.load(open(nodes_relation_dict_p))

	#Format of concept_training_dataset
	#concept_training_dataset = {id: [span, pos, concept]}
	#concept_training_dataset = pickle.load(open("data/amr-release-1.0-training-proxy/concept_training_dataset_p", "rb"))

	#Read original amr
	amr_dict, ids_in_order, id_lines_in_order, sent_lines_in_order = read_amr(open(original_amr_aligned))
	#print len(amr_dict)

	#Prepare training data
	concept_training_dataset = pickle.load(open(concept_training_dataset_p, "rb"))
	gold_relation_dict = pickle.load(open(relation_training_dataset_p, "rb"))
	dep_parse = pickle.load(open(kbest_dep_parse_p_train, "rb"))

	dep_parse_nx = {}
	for each_id in dep_parse:
		each_dp = dep_parse[each_id]
		dep_parse_graph_u = nx.Graph()
		dep_parse_graph_d = nx.DiGraph()
		for each_src in each_dp:
			for each_tgt in each_dp[each_src]:
				dep_parse_graph_u.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]})
				dep_parse_graph_d.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]})

		dep_parse_nx[each_id] = (dep_parse_graph_u, dep_parse_graph_d)


	training_id_dict = {}
	count = 0
	subcount = 0
	lens = []
	training_sentences = []
	for id, concept_training_data in concept_training_dataset.iteritems():
		current_spans = []
		training_id_dict[id] = count
		i = 0
		for span_index, [span, pos, concept, name, ner] in enumerate(concept_training_data):
			num_words = len(span.split())
			current_spans.append(Span(span, pos, range(i, i+num_words), ner, concept))
			i += num_words
		training_sentence = Sentence(id, current_spans)
		lens.append(len(training_sentence.spans))
		count += 1
		if len(training_sentence.spans) <= 10:
			subcount += 1

		training_sentences.append(training_sentence)
	#print subcount, count
	#print sorted(lens, reverse=True)[:100]

	amr_out_file = open(amr_out_file_name, 'w')

	#Prepare vw parameters
	N = len(training_sentences)
	#N = 1
	#N = 10
	vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 2048 --search_no_caching -q a: ")
	task = vw.init_search_task(ConceptRelationLearning)
	prev_sent_id = training_sentences[0].id

	#Start training
	print "Learning.."
	start_time = time.time()
	for p in range(1):
		seen_dict = {}
		task.learn(training_sentences[:])

	print "Time taken: " + str(time.time() - start_time)

	flag = False
	#Prepare test data
	concept_test_dataset = pickle.load(open(concept_test_dataset_p, "rb"))
	dep_parse = pickle.load(open(kbest_dep_parse_p_test, "rb"))

	dep_parse_nx = {}
	for each_id in dep_parse:
		each_dp = dep_parse[each_id]
		dep_parse_graph_u = nx.Graph()
		dep_parse_graph_d = nx.DiGraph()
		for each_src in each_dp:
			for each_tgt in each_dp[each_src]:
				dep_parse_graph_u.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]})
				dep_parse_graph_d.add_edge(each_src, each_tgt[0], {'label': each_tgt[1]})

		dep_parse_nx[each_id] = (dep_parse_graph_u, dep_parse_graph_d)

	gold_relation_dict = {}
	test_sentences = []
	for id, concept_test_data in concept_test_dataset.iteritems():
		current_spans = []
		i = 0
		for span_index, [span, pos, concept, name, ner] in enumerate(concept_test_data):
			num_words = len(span.split())
			current_spans.append(Span(span, pos, range(i, i+num_words), ner, concept))
			#print current_spans[-1].word_positions
			i += num_words
		test_sentence = Sentence(id, current_spans)
		test_sentences.append(test_sentence)

	#test_sentences = test_sentences[:10]
	#Start testing
	start_time = time.time()
	print "Testing.."

	#print len(test_sentences)
	predictions = []
	t2 = []

	test_sentences_dict = {}
	for test_sentence in test_sentences:
		id = test_sentence.id
		test_sentences_dict[id] = test_sentence
	i = 0
	for id in ids_in_order:
		#print id
		id_line = id_lines_in_order[i]
		sent_line = sent_lines_in_order[i]
		test_sentence = test_sentences_dict[id]
		predicted, node_exp, root_index = task.predict(eraseAnnotations(test_sentence))
		predictions.append(predicted)
		t2.append(test_sentence)
		write_amr_to_file(test_sentence.id, predicted, node_exp, concept_map, relation_map, amr_out_file,
								nodes_relation_dict_out, nodes_relation_dict_in, nodes_relation_dict_pair, root_index, id_line, sent_line)
		predictions[i] = predicted
		i+=1

	print "Time taken: " + str(time.time() - start_time)
	amr_out_file.close()
Exemple #33
0
                        'p':
                    [wordN + '_' + wordM, dir + '_' + wordN + '_' + wordM],
                        'd': [
                            str(m - n <= d) + '<=' + str(d)
                            for d in [-8, -4, -2, -1, 1, 2, 4, 8]
                        ] + [
                            str(m - n >= d) + '>=' + str(d)
                            for d in [-8, -4, -2, -1, 1, 2, 4, 8]
                        ]
                }) as ex:
                    pred = self.sch.predict(examples=ex,
                                            my_tag=(m + 1) * N + n + 1,
                                            oracle=isParent,
                                            condition=[
                                                (max(0, (m) * N + n + 1), 'p'),
                                                (max(0, (m + 1) * N + n), 'q')
                                            ])
                    if pred == 2:
                        output[n] = m
                        break
        return output


# Create VW with the 'hook' search task and attach the dependency parser task.
vw = pyvw.vw("--search 2 --quiet --search_task hook --ring_size 1024")
task = vw.init_search_task(CovingtonDepParser)
for p in range(10):  # do ten passes over the training data
    task.learn(my_dataset.__iter__)
print 'testing'
# each test token is paired with -1 in the slot after the word
print task.predict([(w, -1) for w in "the monster ate a sandwich".split()])
print 'should have printed [ 1 2 -1 4 2 ]'