Exemple #1
0
def perc_train(train_data, tagset, numepochs):
    """Train a perceptron tagger and return its weight vector.

    Args:
        train_data: iterable of sentences. Each sentence is indexed as
            ``[0]`` = list of ``"word postag chunktag"`` label strings and
            ``[1]`` = the feature list handed to ``perc.perc_test`` and
            ``perc.feats_for_word``.
        tagset: sequence of output tags; ``tagset[0]`` is passed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of full passes over ``train_data`` (an int).

    Returns:
        ``defaultdict(int)`` mapping ``(feature, tag)`` keys to weights.

    Raises:
        ValueError: if a label string does not split into exactly three
            space-separated fields.
    """
    feat_vec = defaultdict(int)

    for _ in range(numepochs):
        for sentence_data in train_data:
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            # Only the gold chunk tag is needed; the 3-way unpack is kept so
            # malformed label lines still fail loudly.
            truetags = []
            for label in label_list:
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            # Use the tagset handed in by the caller instead of re-reading the
            # tagset file (via a global) once per sentence.
            default_tag = tagset[0]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)

            feat_index = 0
            for i, tru in enumerate(truetags):
                # feats_for_word must be called for every word, right or
                # wrong, because it advances feat_index through feat_list.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                # Per-word features: penalise the predicted tag, reward the
                # true tag.
                for f in feats_for_this_word:
                    feat_vec[f, argmax] -= 1
                    feat_vec[f, tru] += 1

                # Bigram feature "B:<previous tag>"; "B_-1" marks the start of
                # the sentence.
                # NOTE(review): the bigram weight is only touched when the
                # current tag is wrong; a correct tag that follows a wrong
                # previous tag gets no bigram update -- confirm intended.
                if i == 0:
                    prev_pred = prev_true = "B_-1"
                else:
                    prev_pred = argmaxtags[i - 1]
                    prev_true = truetags[i - 1]
                feat_vec["B:" + prev_pred, argmax] -= 1
                feat_vec["B:" + prev_true, tru] += 1

    return feat_vec
Exemple #2
0
        help=
        "precomputed features for the input data, i.e. the values of \phi(x,_) without y"
    )
    # -e/--numepochs: number of training epochs. The default is the int 10;
    # no optparse type is declared, so a value given on the command line
    # arrives as a string and is converted with int() further down.
    optparser.add_option(
        "-e",
        "--numepochs",
        dest="numepochs",
        default=int(10),
        help=
        "number of epochs of training; in each epoch we iterate over over all the training examples"
    )
    # -m/--modelfile: path the trained weights are written to.
    optparser.add_option("-m",
                         "--modelfile",
                         dest="modelfile",
                         default=os.path.join("data", "default.model"),
                         help="weights for all features stored on disk")
    # Positional arguments are ignored.
    (opts, _) = optparser.parse_args()

    # each element in the feat_vec dictionary is:
    # key=feature_id value=weight
    # The three placeholders below are overwritten before they are read.
    feat_vec = {}
    tagset = []
    train_data = []

    tagset = perc.read_tagset(opts.tagsetfile)
    # NOTE(review): `print >> sys.stderr, ...` is the Python 2 print
    # statement; under Python 3 this line raises TypeError at runtime.
    print >> sys.stderr, "reading data ..."
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    print >> sys.stderr, "done."
    # Train, then persist the resulting weights to the model file.
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs))
    perc.perc_write_to_file(feat_vec, opts.modelfile)
Exemple #3
0
def perc_train(train_data, tagset, numepochs):
    """Train an averaged perceptron tagger and return the averaged weights.

    The returned weight of a feature is the mean, over every processed
    sentence, of the value that feature had after that sentence. The mean is
    computed lazily: a running total is only brought up to date when a weight
    is about to change, plus one final flush once training has finished.

    Args:
        train_data: sized sequence of sentences. Each sentence is indexed as
            ``[0]`` = list of ``"word postag chunktag"`` label strings and
            ``[1]`` = the feature list handed to ``perc.perc_test`` and
            ``perc.feats_for_word``.
        tagset: sequence of output tags; ``tagset[0]`` is passed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of full passes over ``train_data`` (an int).

    Returns:
        ``defaultdict(float)`` mapping ``(feature, tag)`` keys to averaged
        weights; empty when no sentence was processed.

    Raises:
        ValueError: if a label string does not split into exactly three
            space-separated fields.
    """
    feat_vec = defaultdict(int)  # current (un-averaged) weights
    cumulative_feat_vec = defaultdict(float)  # per-key sum of weights over steps
    last_change = {}  # key -> step from which its current weight counts
    numsen = len(train_data)
    step = 0  # number of sentences fully processed so far

    def bump(key, delta):
        # Credit the current weight for every step it stayed unchanged, then
        # apply the update. A second change within the same sentence has an
        # idle time of 0, so only the final value counts for that step.
        idle = step - last_change.get(key, step)
        cumulative_feat_vec[key] += feat_vec[key] * idle
        last_change[key] = step
        feat_vec[key] += delta

    for epoch in range(numepochs):
        print(epoch)
        for sen, sentence_data in enumerate(train_data):
            label_list = sentence_data[0]
            feat_list = sentence_data[1]

            # Only the gold chunk tag is needed; the 3-way unpack is kept so
            # malformed label lines still fail loudly.
            truetags = []
            for label in label_list:
                (_word, _postag, chunktag) = label.split(" ")
                truetags.append(chunktag)

            # Use the tagset handed in by the caller instead of re-reading the
            # tagset file (via a global) once per sentence.
            default_tag = tagset[0]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)

            feat_index = 0
            for i, tru in enumerate(truetags):
                # feats_for_word must be called for every word, right or
                # wrong, because it advances feat_index through feat_list.
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                argmax = argmaxtags[i]
                if argmax == tru:
                    continue

                # Per-word features: penalise the predicted tag, reward the
                # true tag.
                for f in feats_for_this_word:
                    bump((f, argmax), -1)
                    bump((f, tru), +1)

                # Bigram feature "B:<previous tag>"; "B_-1" marks the start of
                # the sentence.
                # NOTE(review): the bigram weight is only touched when the
                # current tag is wrong; a correct tag that follows a wrong
                # previous tag gets no bigram update -- confirm intended.
                if i == 0:
                    prev_pred = prev_true = "B_-1"
                else:
                    prev_pred = argmaxtags[i - 1]
                    prev_true = truetags[i - 1]
                bump(("B:" + prev_pred, argmax), -1)
                bump(("B:" + prev_true, tru), +1)

            if sen % 1000 == 0:
                print(str(sen) + "/" + str(numsen))
            step += 1

    # Final flush: every weight has held its last value since its last
    # change, so credit those remaining steps before dividing by the number
    # of processed sentences. Skipped when nothing was processed, which also
    # avoids dividing by zero.
    if step:
        total_steps = float(step)
        for key in feat_vec:
            idle = step - last_change.get(key, step)
            cumulative_feat_vec[key] += feat_vec[key] * idle
            cumulative_feat_vec[key] /= total_steps

    return cumulative_feat_vec
Exemple #4
0
from collections import defaultdict

def perc_train(train_data, tagset, numepochs):
    """Placeholder trainer: return an empty weight vector.

    No training is performed and none of the arguments are read.

    Args:
        train_data: training sentences (unused).
        tagset: output tags (unused).
        numepochs: requested number of training epochs (unused).

    Returns:
        An empty ``defaultdict(int)``.
    """
    # insert your code here
    # please limit the number of iterations of training to n iterations
    return defaultdict(int)

if __name__ == '__main__':
    # Command-line entry point: parse options, load the tagset and the
    # labelled training data, train, and write the weights to the model file.
    optparser = optparse.OptionParser()
    # Backslashes in the help texts are doubled so the strings still contain
    # a literal "\phi" without relying on an invalid escape sequence.
    optparser.add_option(
        "-t", "--tagsetfile", dest="tagsetfile",
        default=os.path.join("data", "tagset.txt"),
        help="tagset that contains all the labels produced in the output, i.e. the y in \\phi(x,y)")
    optparser.add_option(
        "-i", "--trainfile", dest="trainfile",
        default=os.path.join("data", "train.txt.gz"),
        help="input data, i.e. the x in \\phi(x,y)")
    optparser.add_option(
        "-f", "--featfile", dest="featfile",
        default=os.path.join("data", "train.feats.gz"),
        help="precomputed features for the input data, i.e. the values of \\phi(x,_) without y")
    # type="int" lets optparse validate the value and report a usage error
    # instead of failing later with a ValueError traceback.
    optparser.add_option(
        "-e", "--numepochs", dest="numepochs", type="int", default=10,
        help="number of epochs of training; in each epoch we iterate over over all the training examples")
    optparser.add_option(
        "-m", "--modelfile", dest="modelfile",
        default=os.path.join("data", "default.model"),
        help="weights for all features stored on disk")
    # Positional arguments are ignored.
    (opts, _) = optparser.parse_args()

    tagset = perc.read_tagset(opts.tagsetfile)
    # sys.stderr.write behaves the same on Python 2 and Python 3, unlike the
    # Python-2-only `print >>sys.stderr` statement.
    sys.stderr.write("reading data ...\n")
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    sys.stderr.write("done.\n")

    # each element in the feat_vec dictionary is:
    # key=feature_id value=weight
    feat_vec = perc_train(train_data, tagset, opts.numepochs)
    perc.perc_write_to_file(feat_vec, opts.modelfile)

Exemple #5
0
                           help='number of layers')
    # --pos-dim: integer option, default 64 (POS tag embedding dimension).
    argparser.add_argument('--pos-dim',
                           type=int,
                           default=64,
                           help='POS tag embedding dimension')
    # -r/--resume: optional value; None when the flag is not given.
    argparser.add_argument('-r',
                           '--resume',
                           help='resume training from saved model')
    # --prototype: boolean flag, False unless passed.
    argparser.add_argument('--prototype',
                           default=False,
                           action='store_true',
                           help='prototyping mode')
    args = argparser.parse_args()

    # Select the first CUDA device when one is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tagset = perc.read_tagset(args.tagsetfile)
    print("reading data ...", file=sys.stderr)

    test_data = perc.read_labeled_data(args.inputfile,
                                       args.featfile,
                                       verbose=False)
    print("done.", file=sys.stderr)
    # Prototyping mode keeps only the first 8 entries of the loaded data.
    if args.prototype:
        test_data = test_data[0:8]

    print('loading model...', file=sys.stderr)
    model_data = load_model(args.modelfile)

    # Pull the three index entries out of the loaded model data.
    # NOTE(review): presumably these are token/tag -> integer id mappings;
    # confirm against load_model.
    word_idx = model_data['word_index']
    speech_tag_idx = model_data['speech_tag_index']
    tag2idx = model_data['tag_index']