Example #1
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # Initialize a zero weight for every (feature, tag) pair seen in the data.
    for i in range(len(train_data)):
        for feat in train_data[i][1]:
            for tag in tagset:
                feat_vec[feat, tag] = 0

    for i in range(numepochs):
        for j in range(len(train_data)):
            # Decode the sentence with the current weights.
            defaultTag = getDefault(train_data[j][0])
            z = perc.perc_test(feat_vec, train_data[j][0],
                               train_data[j][1], tagset, defaultTag)

            # t holds the true tags for the sentence.
            t = []
            for line in train_data[j][0]:
                t.append(line.split()[2])
            # Update weights wherever z and t disagree; word a owns the 20
            # features at positions 20*a .. 20*a + 19 of the feature list.
            for a in range(len(z)):
                if z[a] != t[a]:
                    for b in range(20 * a, 20 * a + 20):
                        try:
                            feat_vec[(train_data[j][1][b], t[a])] += 1
                            feat_vec[(train_data[j][1][b], z[a])] -= 1
                        except IndexError:
                            print("feature index out of range: "
                                  "j = %d, a = %d, b = %d" % (j, a, b))
                            exit()

    return feat_vec
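All of the examples on this page are variants of one skeleton: decode each training sentence with the current weights, then apply additive updates where the predicted tag sequence disagrees with the gold one. Below is a minimal sketch of that skeleton, assuming the `perc` helper module the examples rely on (`perc_test` for Viterbi decoding under the current weights, `feats_for_word` for fetching the next word's feature slice) and labeled lines of the form `word POS chunk-tag`.

from collections import defaultdict

import perc  # assumed helper module, as used by every example on this page


def perc_train_sketch(train_data, tagset, numepochs):
    # Minimal unaveraged structured perceptron: decode each sentence, then
    # reward gold features and penalize predicted features on every word
    # where the two tag sequences disagree.
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for epoch in range(numepochs):
        for (labeled_list, feat_list) in train_data:
            pred = perc.perc_test(feat_vec, labeled_list, feat_list,
                                  tagset, default_tag)
            gold = [line.split()[2] for line in labeled_list]
            if pred == gold:
                continue
            feat_index = 0
            for i in range(len(gold)):
                # feats_for_word must be called for every word so that
                # feat_index keeps advancing through feat_list.
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if pred[i] != gold[i]:
                    for feat in feats:
                        feat_vec[(feat, gold[i])] += 1
                        feat_vec[(feat, pred[i])] -= 1
    return feat_vec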
Example #2
def perc_train(train_data, tagset, numepochs):
    print len(train_data)
    feat_vec = defaultdict(int)
    defaultTag = tagset[0]
    for i in range(numepochs):
        print i
        k = 0
        feat_index = 0
        for (labeled_list, feat_list) in train_data:
            if k % 100 == 0: print "     ", k
            k += 1
            z = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                               defaultTag)

            # get the augmented labels and feats for the word
            labels = copy.deepcopy(labeled_list)
            (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
            labels.insert(0, '_B-1 _B-1 _B-1')
            z.insert(0, '_B-1')

            # update weights when t != labels[j]
            N = len(labels)
            for j in range(1, N - 1):
                if x(labels, j, 2) != z[j]:
                    updateWeights(feat_vec, labels, z, j, feats)
    return feat_vec
Example #3
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    sigma = defaultdict(int)
    mistakes = 0
    # For each epoch we go through all the sentences in the training set.
    for epoch in range(numepochs):
        mistakes = 0
        # Each sentence is passed through the viterbi to get the argmax output of the labels.
        for sentence in train_data:
            # The prediction from viterbi is stored in pred.
            pred = perc.perc_test(feat_vec, sentence[0], sentence[1], tagset,
                                  tagset[0])
            true = [word.split()[2] for word in sentence[0]]
            count = 0
            # Each label returned from the result of viterbi is checked with the true label.
            for i in range(len(pred)):
                count += 20
                if (pred[i] != true[i]):
                    # Record the mistakes in all epochs
                    mistakes += 1
                    # For features of each of the word in the sentence, we make the updates in weight vector.
                    for j in range(count - 20, count):
                        # We give a -1 update to the features of the wrong label
                        # and a +1 update to the features of the true label.
                        # feat_vec is a defaultdict(int), so missing keys
                        # default to 0.
                        feat_vec[(sentence[1][j], pred[i])] -= 1
                        feat_vec[(sentence[1][j], true[i])] += 1
                    if i > 0:
                        # Similarly, we give a -1 update to the wrong bigram
                        # feature and a +1 update to the true bigram feature.
                        feat_vec[("B:" + pred[i - 1], pred[i])] -= 1
                        feat_vec[("B:" + true[i - 1], true[i])] += 1
            # After going through each sentence, we aggregate the weights for all the features as mentioned in
            # http://www.cs.sfu.ca/~anoop/papers/pdf/syntax-parsing-survey-2011.pdf
            for feat, weight in feat_vec.items():
                sigma[feat] += weight

        print('Mistakes in epoch :', epoch, ' are: ', mistakes)

    # We average the weight parameter using the formula γ = σ/(mT) mentioned in the above mentioned paper.
    for feat, weight in sigma.items():
        sigma[feat] = weight / (len(train_data) * numepochs)

    # insert your code here
    # please limit the number of iterations of training to n iterations
    return sigma
Example #4
def perc_train(train_data, tagset, numepochs):
    """
    :current_global_vector: a dict of features for the predicted labels
    :gold_global_vector: a dict of features for the standard
    """
    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    default_tag = tagset[0]
    for t in range(numepochs):
        error_num = 0
        for (labeled_list, feat_list) in train_data:
            std_labels = get_labels(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            if std_labels != output:
                error_num += 1
            gold_global_vector = get_global_vector(std_labels, feat_list)
            current_global_vector = get_global_vector(output, feat_list)
            add_vector(feat_vec, gold_global_vector, 1)
            add_vector(feat_vec, current_global_vector, -1)

        print >> sys.stderr, "Epoch", t + 1, "done. # of incorrect sentences: ", error_num
        # Supposedly we should average over all epoch * len(train_data) feature vectors,
        # but that would lead to too many long-vector additions and is painfully slow.
        add_vector(avg_vec, feat_vec, 1)
        perc.perc_write_to_file(
            {key: float(avg_vec[key]) / (t + 1)
             for key in avg_vec}, opts.modelfile + str(t))

    return {key: float(avg_vec[key]) / numepochs for key in avg_vec}
Example #5
def perc_train(train_data, tagset, numepochs):
    # perceptron train
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for epoch in range(0, numepochs):
        for (label_list, feat_list) in train_data:
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')
                cur.append('B_+1')
                gold.append('B_+1')
                cur_len = len(cur)
                gold_len = len(gold)
                if cur_len != gold_len:
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                feat_index = 0
                # perceptron update
                for i in range(1, cur_len):
                    (feat_index,
                     features) = perc.feats_for_word(feat_index, feat_list)
                    for f in features:
                        feat_vec[(f, cur[i])] = feat_vec[(f, cur[i])] - 1
                        feat_vec[(f, gold[i])] = feat_vec[(f, gold[i])] + 1
        print >> sys.stderr, "iteration %d done." % i

    return feat_vec
Example #6
def argmax(feat_vec, data, tagset, default_tag):
    labeled_list, feat_list = data

    local_output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                  default_tag)

    local_output.insert(0, 'B_-1')
    local_output.append('B_+1')
    return retrieve_feature(local_output, feat_list)
Example #7
def perc_train(train_data, tagset, numepochs, pos_dict):
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = 20  # fixed epoch count; overrides the numepochs argument
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0
        # Count sentence
        print 'Iteration#',t,' is processing now.'
        for (labeled_list, feat_list) in train_data:
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag, pos_dict)
            # compare current output and true result
            # correct_flag = True
            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                
                fields = labels[i].split()
                label = fields[2]
                if i > 0: 
                    label_pre = labels[i-1].split()[2]
                    if output[i-1] != label_pre or output[i] != label:
                        for feat in feats:
                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                                feat_vec[feat_out, label] = feat_vec[feat_out, label] + 1
                                feat_vec[feat_lab, output[i]] = feat_vec[feat_lab, output[i]] - 1
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1
                            else: # for U00 to U22 feature
                                feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                                feat_vec[feat, label] = feat_vec[feat, label] + 1
                else:  # for i==0 case, all the first word in each sentence
                    label_pre = 'B_-1'  # previous label will be denoted by B_-1
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature case
                            feat = feat + ":" + label_pre
                        feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                        feat_vec[feat, label] = feat_vec[feat, label] + 1

        perc.perc_write_to_file(feat_vec, 'model_' + str(t))

    # please limit the number of iterations of training to n iterations
    return feat_vec
Example #8
def perc_train(train_data, tagset, n):
    feat_vec = defaultdict(int)
    feat_avg_vec = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations
    default_tag = tagset[0]  # tag any word with 'B-NP' in the beginning
    num_sentence = len(train_data)
    num_words = 0
    count = 0
    for iteration in range(n):
        sent_index = 0
        for sentence in train_data:  #sentence = (labeled_list, feat_list) for each sentence
            sent_index += 1
            print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" %
                                 (iteration + 1, n, sent_index, num_sentence)),

            (labeled_list, feat_list) = sentence
            num_words += len(labeled_list)

            #compute tags based on current weights
            estimated_tags = perc.perc_test(feat_vec, labeled_list, feat_list,
                                            tagset, default_tag)
            #the target 'right' tag list
            standard_tags = [item.split()[2] for item in labeled_list]

            if estimated_tags != standard_tags:
                st_prev = es_prev = 'B_-1'
                index = 0
                #reference: http://gul.gu.se/public/pp/public_courses/course38351/published/1360057354030/resourceId/19456476/content/9adb1f1e-52e4-48b4-8001-ada93be18089/9adb1f1e-52e4-48b4-8001-ada93be18089.html
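                #each update is scaled by the fraction of training still to
                #come, so earlier updates, which persist through more of the
                #intermediate weight vectors, weigh more in the average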
                step = (n * num_sentence - count) * 1.0 / (n * num_sentence)
                for (st_tag, es_tag) in zip(standard_tags, estimated_tags):
                    (index, feats) = perc.feats_for_word(index, feat_list)

                    for feat in feats:
                        #deal with feat B: according to the given output example.
                        if feat == 'B':
                            if st_prev != es_prev or st_tag != es_tag:
                                feat_vec[('B:' + es_prev, es_tag)] -= 1
                                feat_vec[('B:' + st_prev, st_tag)] += 1
                                feat_avg_vec[('B:' + es_prev, es_tag)] -= step
                                feat_avg_vec[('B:' + st_prev, st_tag)] += step
                        else:
                            if st_tag != es_tag:
                                feat_vec[(feat, es_tag)] -= 1
                                feat_vec[(feat, st_tag)] += 1
                                feat_avg_vec[(feat, es_tag)] -= step
                                feat_avg_vec[(feat, st_tag)] += step
                    #advance the previous tags after every word so the next
                    #bigram feature compares against the right context
                    es_prev = es_tag
                    st_prev = st_tag
            count += 1
        perc.perc_write_to_file(
            feat_avg_vec, 'models/n' + str(iteration) + 'avg_params.model')

    return feat_avg_vec
Example #9
def perc_train(train_data, tagset, n):
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    feat_vec = defaultdict(int)

    epochs = n
    for round in range(0, epochs):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s" % (" ".join(output)))
            logging.info("truth: %s" % (" ".join(true_output)))
            #print >>sys.stderr, "\noutput:", output, "\ntruth:", true_output
            if output != true_output:
                num_mistakes += 1
                output.insert(0, 'B_-1')
                output.append('B_+1')
                true_output.insert(0, 'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1, len(output) - 1):
                    #print >>sys.stderr, output[i], true_output[i]
                    (feat_index,
                     feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >> sys.stderr, " ".join(labeled_list), " ".join(
                            feat_list), "\n"
                        raise ValueError(
                            "features do not align with input sentence")
                    #print >>sys.stderr, feats
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        # the bigram feature 'B' must be expanded with the
                        # previous tag; unigram features are shared between
                        # output and truth
                        if feat == 'B':
                            output_feat = 'B:' + output[i - 1]
                            truth_feat = 'B:' + true_output[i - 1]
                        else:
                            output_feat = truth_feat = feat

                        feat_vec_update[output_feat, output[i]] += -1
                        feat_vec_update[truth_feat, true_output[i]] += 1

                    for (upd_feat, upd_tag) in feat_vec_update:
                        if feat_vec_update[upd_feat, upd_tag] != 0:
                            feat_vec[upd_feat,
                                     upd_tag] += feat_vec_update[upd_feat,
                                                                 upd_tag]
                            logging.info(
                                "updating feat_vec with feature_id: (%s, %s) value: %d"
                                % (upd_feat, upd_tag,
                                   feat_vec_update[upd_feat, upd_tag]))
        print >> sys.stderr, "number of mistakes:", num_mistakes
        logging.info("current number of mistakes: %d" % (num_mistakes))
    return feat_vec
Example #10
def perc_train(train_data, tagset, numepochs):
    # perceptron train
    T = float(len(train_data))
    step = numepochs * T
    feat_vec_cache = defaultdict(int)
    # feat_vec stores the weights for the features of a sentence, initially all weights are 0
    feat_vec = defaultdict(int)
    # default_tag = 'B-NP'
    default_tag = tagset[0]
    # for each epoch/iteration
    for epoch in range(0, numepochs):
        # for each item (e.g tuple=([labeled words for each sentence],[features for those words of sentence])) in train_data
        for (label_list, feat_list) in train_data:
            # cur = list of best tag for each word in sentence found using viterbi algo
            cur = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                                 default_tag)
            # gold = list of reference/true tag for each word in sentence
            gold = [entry.split()[2] for entry in label_list]
            if cur != gold:
                cur.insert(0, 'B_-1')
                gold.insert(0, 'B_-1')
                cur.append('B_+1')
                gold.append('B_+1')
                cur_len = len(cur)
                gold_len = len(gold)
                if cur_len != gold_len:
                    raise ValueError(
                        "output length is not the same with the input sentence"
                    )
                feat_index = 0
                # perceptron update
                # for each tag/word of a sentence
                for i in range(1, cur_len):
                    # for each word in a sentence, (feat_index, features) is a tuple, where feat_index=endindex of the list of features for that word, and features=list of features for that word
                    (feat_index,
                     features) = perc.feats_for_word(feat_index, feat_list)
                    # update the weights of the features for that word, by rewarding the features seen in reference, while penalizing the ones not seen in reference but returned by viterbi
                    for f in features:
                        feat_vec[(f, cur[i])] = feat_vec[(f, cur[i])] - 1
                        feat_vec[(f, gold[i])] = feat_vec[(f, gold[i])] + 1
                        # averaged perceptron
                        # usual way of averaging over all intermediate weight vectors is:
                        # w = (w0 + w1 + w2 + ...... + wt) / (numepochs * T)
                        # But we can also average in an efficient way:
                        # w = w1*(step/(numepochs*T)) + w2*((step-1)/(numepochs*T)) + ...... + wt*(1/(numepochs*T))
                        # step counts down, so each update is pre-scaled by
                        # the fraction of training it persists through
                        feat_vec_cache[(f, cur[i])] -= step / (numepochs * T)
                        feat_vec_cache[(f, gold[i])] += step / (numepochs * T)
            step -= 1
        print >> sys.stderr, "iteration %d done." % i
    return feat_vec_cache
Example #11
def perc_avg_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    avg_feat_vec = defaultdict(float)
    default_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        print(f"Running on epoch {epoch+1}......")
        tic = time.time()
        for _, (labeled_list, feat_list) in enumerate(train_data):
            pred_output = perc.perc_test(feat_vec, labeled_list, feat_list,
                                         tagset, default_tag)
            true_output = [x.split()[2] for x in labeled_list]

            if pred_output != true_output:
                count_mistake += 1
                feat_index = 0

                for w_index in range(len(pred_output)):
                    pred_tag = pred_output[w_index]
                    true_tag = true_output[w_index]
                    (feat_index,
                     feats) = perc.feats_for_word(feat_index, feat_list)
                    for feat in feats:
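                        # bigram feature: compare the (previous, current) tag
                        # pairs; unigram features: compare current tags only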
                        if feat == 'B' and w_index > 0:
                            if true_output[w_index - 1] != pred_output[
                                    w_index - 1] or pred_tag != true_tag:
                                feat_vec['B:' + true_output[w_index - 1],
                                         true_tag] += 1
                                feat_vec['B:' + pred_output[w_index - 1],
                                         pred_tag] -= 1
                        elif pred_tag != true_tag:
                            feat_vec[feat, true_tag] += 1
                            feat_vec[feat, pred_tag] -= 1

            for key in feat_vec.keys():
                # γ = σ/(mT)
                avg_feat_vec[key] += feat_vec[key]

        toc = time.time()
        print(
            f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.'
        )

    for key in avg_feat_vec.keys():
        avg_feat_vec[key] /= (numepochs * len(train_data))
    return avg_feat_vec
Example #12
def perc_train(train_data, tagset, iterations=1):
    feat_vec = FeatureVector()
    default_tag = tagset[0]

    for iteration in range(iterations):
        # Number of Sentences
        sentence_total = len(train_data)
        sentence_count = 0

        for (labeled_list, feat_list) in train_data:
            # Print out information
            sentence_count += 1
            print "iteration", iteration, "sentence", sentence_count, "of", sentence_total

            # Retrieve Gold Output
            gold_output = []
            gold_output.append('B_-1')
            for i in labeled_list:
                (w, t, label) = i.split()
                gold_output.append(label)
            gold_output.append('B_+1')

            # Retrieve Local Output
            local_output = perc.perc_test(feat_vec,
                                          labeled_list,
                                          feat_list,
                                          tagset,
                                          default_tag)
            local_output.insert(0, 'B_-1')
            local_output.append('B_+1')

            print gold_output
            print local_output

            # When Outputs are different, update feature vector
            if local_output != gold_output:
                # Extract features from both outputs
                local_vec = retrieve_feature(local_output, feat_list)
                gold_vec  = retrieve_feature(gold_output, feat_list)

                feat_vec += gold_vec - local_vec

        # Dump every iteration
        feat_vec.dump("models/jetic_Iter_" + str(iteration+1) + ".model")

    return feat_vec.export()
Example #14
def perc_train(train_data, tagset, n):
	feat_vec = defaultdict(int)
	# insert your code here
	# please limit the number of iterations of training to n iterations
	
	n_sentences = len(train_data)
	for i in range (0,n):
		for j in range(0,n_sentences):
			print '{0}\r'.format("\rIteration: %d/%d. Sentence: %d/%d\t" %(i+1, n, j+1, n_sentences)),
			
			labeled_list = train_data[j][0]
			feat_list = train_data[j][1]	
	
			# Extract the labels from training data
			toutput = [tags.split(' ')[2] for tags in labeled_list ]	
			
			# Output from Viterbi Algorithm	
			zoutput = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, 'B-NP')
			
			# Compare outputs
			if toutput != zoutput:
				index = 0
				for k in range(0, len(zoutput)):							
					predicted_tag = zoutput[k]
					true_tag = toutput[k]
					(index, feats) = perc.feats_for_word(index, feat_list)
					for feat in feats:
						s1 = s2 = ''
						if feat == 'B':
							if k >= 1:
								zprevtag = zoutput[k-1]
								tprevtag = toutput[k-1]
							else:
								zprevtag = tprevtag = 'B_-1'
							s1 = (feat+':'+zprevtag, predicted_tag)
							s2 = (feat+':'+tprevtag, true_tag)
						else:
							s1 = (feat, predicted_tag)
							s2 = (feat, true_tag)
							
						if s1 != s2:
							feat_vec[s1] -= 1
							feat_vec[s2] += 1
	return feat_vec
Example #15
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    for t in range(numepochs):
        error_num = 0
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            expected = [i.split()[2] for i in labeled_list]
            if output != expected:
                vec_output = global_feature_vector(feat_list, output)
                vec_expected = global_feature_vector(feat_list, expected)
                update_weight_vector(feat_vec, vec_output, -1)
                update_weight_vector(feat_vec, vec_expected, 1)
                error_num += 1
        print "Number of mistakes: ", error_num

    return feat_vec
Example #16
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    tags = {}

    for i in range(0, numepochs):
        for j in range(0, len(train_data)):
            label_list = train_data[j][0]
            feat_list = train_data[j][1]

            z = perc.perc_test(feat_vec, label_list, feat_list, tagset,
                               tagset[0])

            # compute the true tags once per sentence rather than once per word
            temp = train_tags(label_list)
            for k in range(0, len(z)):
                if (z[k] != temp[k]):
                    check_and_change(feat_vec, word_list(label_list),
                                     pos_list(label_list), temp, z, k)

    return feat_vec
Example #17
def perc_train(train_data, tagset, numepochs):
    """
    :current_global_vector: a dict of features for the predicted labels
    :gold_global_vector: a dict of features for the standard
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    for t in range(numepochs):
        for (labeled_list, feat_list) in train_data:
            std_labels = get_labels(labeled_list)
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            gold_global_vector = get_global_vector(std_labels, feat_list)
            current_global_vector = get_global_vector(output, feat_list)
            add_vector(feat_vec, gold_global_vector, 1)
            add_vector(feat_vec, current_global_vector, -1)

        perc.perc_write_to_file(feat_vec, opts.modelfile + str(t))

    return feat_vec
Example #18
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    # insert your code here
    # please limit the number of iterations of training to n iterations
    for i in range(numepochs):
        numOfError = 0
        output = []
        for (labeled_list, feat_list) in train_data:
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            elements = [element.split(" ")[2] for element in labeled_list]
            for j in range(len(elements)):
                trueLabel = elements[j]
                argMaxLabel = output[j]
                if (trueLabel != argMaxLabel):
                    numOfError = numOfError + 1
                    for feat in feat_list[j * 20:j * 20 + 20]:
                        if (feat == "B") & j > 0:
                            trueLabel_prev = elements[j - 1]
                            argMaxLabel_prev = output[j - 1]
                            feat_vec["B:" + trueLabel_prev,
                                     trueLabel] = feat_vec["B:" +
                                                           trueLabel_prev,
                                                           trueLabel] + 1
                            feat_vec["B:" + argMaxLabel_prev,
                                     argMaxLabel] = feat_vec["B:" +
                                                             argMaxLabel_prev,
                                                             argMaxLabel] - 1
                        else:
                            feat_vec[feat,
                                     trueLabel] = feat_vec[feat, trueLabel] + 1
                            feat_vec[feat,
                                     argMaxLabel] = feat_vec[feat,
                                                             argMaxLabel] - 1

        print "Number of error in Epoch", i + 1, " ", numOfError
    return feat_vec
Example #19
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    changed_vec = {
    }  # records when vectors were changed, value is (feature/x, epoch/count, totalfeatures)
    print 'numepochs = ', numepochs
    for count in range(0, numepochs):
        print "Epoch: " + str(count)
        for x in range(0, len(train_data)):
            true_features = create_featureSchema(train_data[x][1],
                                                 train_data[x][0])
            output_label = perc.perc_test(feat_vec, train_data[x][0],
                                          train_data[x][1], tagset, tagset[0])
            for y in range(0, len(output_label)):
                true_label = train_data[x][0][y].split(' ')[2]
                if output_label[y] != true_label:
                    changed_vec_value = [count, x, len(train_data)]
                    apply_change = (count == (numepochs - 1)
                                    and x == (len(train_data) - 1))
                    previous_true_label = train_data[x][0][y - 1].split(' ')[2]
                    update_featVector(output_label[y], output_label[y - 1],
                                      true_features[y], true_label,
                                      previous_true_label, feat_vec, avg_vec,
                                      changed_vec, changed_vec_value,
                                      apply_change)

    for k in changed_vec.iterkeys():
        this_feat = len(train_data)
        this_epoch = numepochs
        last_feat = changed_vec[k][0]
        last_epoch = changed_vec[k][1]
        num_feat = changed_vec[k][2]
        multiplier = (this_feat * num_feat + this_epoch -
                      last_feat * num_feat - last_epoch)
        avg_vec[k] += (feat_vec[k] * multiplier)

    for k in avg_vec.iterkeys():
        avg_vec[k] = 1.0 * avg_vec[k] / (numepochs * len(train_data))
    return avg_vec
Example #20
def perc_train(train_data, tagset, n):
	feat_vec = defaultdict(int)
	sigma_feat_vec = defaultdict(float)
	# insert your code here
	# please limit the number of iterations of training to n iterations
	
	print >> sys.stderr, "training data ..."
	n_sentences = len(train_data)
	for i in range (0,n):		
		for j in range(0,n_sentences):
			print >> sys.stderr, '\r{0}'.format("Iteration: %d/%d. Sentence: %d/%d\t" %(i+1, n, j+1, n_sentences)),
			
			labeled_list = train_data[j][0]
			feat_list = train_data[j][1]
	
			# Extract the labels from training data
			toutput = [tags.split(' ')[2] for tags in labeled_list ]	
			
			# Output from Viterbi Algorithm	
			zoutput = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, 'B-NP')
			
			# Compare outputs
			if toutput != zoutput:
				index = 0
				for p in range(0, len(zoutput)):							
					predicted_tag = zoutput[p]
					true_tag = toutput[p]
					(index, feats) = perc.feats_for_word(index, feat_list)				
					
					for feat in feats:
						s1 = s2 = ''
						if feat == 'B':
							if p >= 1:
								zprevtag = zoutput[p-1]
								tprevtag = toutput[p-1]
							else:
								zprevtag = tprevtag = 'B_-1'
							s1 = (feat+':'+zprevtag, predicted_tag)
							s2 = (feat+':'+tprevtag, true_tag)
						else:
							s1 = (feat, predicted_tag)
							s2 = (feat, true_tag)
							
						if s1 != s2:
							feat_vec[s1] -= 1
							feat_vec[s2] += 1
			
			# Compute average vector			
			for f in feat_vec:
				sigma_feat_vec[f] += feat_vec[f] 
							
	print >> sys.stderr, "\ndone"
	zerokeys = []
	for f in sigma_feat_vec:
		if sigma_feat_vec[f] == 0:
			zerokeys.append(f)
		else:
			sigma_feat_vec[f] = sigma_feat_vec[f]/(n*n_sentences)
	for k in zerokeys:
		del sigma_feat_vec[k]
	return sigma_feat_vec
Example #21
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()

    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    default_tag = tagset[0]
    m = len(train_data) # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):

            labels = copy.deepcopy(labeled_list)
            # print 'sentence[',j,']'
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result

            if j != m - 1 or t != numepochs - 1:
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    label = labels[i].split()[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                if output[i] != label or feat_out != feat_lab:

                                    # lazily update the tau vector value
                                    lazy_update_vect(feat_out, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                    lazy_update_vect(feat_lab, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)


                                    # update original feature vector, if feat_out == feat_lab perform 2nd type updating
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)

                                    # if feat_out == feat_lab then update twice for the same tau
                                    tau_feat_vec[feat_out, output[i]] = (j, t)
                                    tau_feat_vec[feat_lab, label] = (j, t)



                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                
                                # for U00 to U22 feature                                
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                                # update vector
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)


                    else:  # for i==0 case, all the first word in each sentence
                        label_pre = '_B-2'  # previous label will be denoted by _B-2
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:
                                # bigram feature case
                                feat = feat + ":" + label_pre

                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)  
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)  

                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)

                                tau_feat_vec[feat, label] = (j, t)
                                tau_feat_vec[feat, output[i]] = (j, t)


                            elif output[i] != label:
                                lazy_update_vect(feat, output[i], tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                lazy_update_vect(feat, label, tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)
                                
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label] = (j, t)


            else:
                final_lazy_update_vect(tau_feat_vec, feat_vec, avg_feat_vec, t, j, m)

                # special case for the last sentence 
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    label = labels[i].split()[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"
                                if output[i] != label:
                                    # update original feature vector
                                    update_bigram_vect(feat_vec, avg_feat_vec, feat_out, feat_lab, output[i], label)

                            elif output[i] != label:                                
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

                    else:  
                        # for i==0 case, all the first word in each sentence
                        label_pre = '_B-1'  # previous label will be denoted by _B-1
                        for feat in feats:
                            if feat[0] == 'B' and output[i] != label:  
                                # bigram feature case
                                feat = feat + ":" + label_pre
                                update_bigram_vect(feat_vec, avg_feat_vec, feat, feat, output[i], label)

                            elif output[i] != label:
                                # for U00 to U22 feature
                                update_unigram_vect(feat_vec, avg_feat_vec, feat, output[i],label)

        # end of iteration

    # averaging perceptron
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # please limit the number of iterations of training to n iterations
    perc.perc_write_to_file(feat_vec, 'model_feat_vec')
    return avg_feat_vec
Example #22
def avg_perc_train(train_data, tagset, n):
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    last_iter = {}

    epochs = n
    num_updates = 0
    for round in range(0,epochs):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s" % (" ".join(output)))
            logging.info("truth: %s" % (" ".join(true_output)))
            #print >>sys.stderr, "\noutput:", output, "\ntruth:", true_output
            if output != true_output:
                num_mistakes += 1
                output.insert(0,'B_-1')
                output.append('B_+1')
                true_output.insert(0,'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1,len(output)-1):
                    #print >>sys.stderr, output[i], true_output[i]
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labeled_list), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    #print >>sys.stderr, feats
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        if feat == 'B':
                            output_feat = 'B:' + output[i-1]
                            truth_feat = 'B:' + true_output[i-1]
                        else:
                            output_feat = truth_feat = feat

                        feat_vec_update[output_feat, output[i]] += -1
                        feat_vec_update[truth_feat, true_output[i]] += 1
                        #reason: if output[i]==true_output[i] update = 0
                    for (upd_feat, upd_tag) in feat_vec_update:
                        if feat_vec_update[upd_feat, upd_tag] != 0:
                            feat_vec[upd_feat, upd_tag] += feat_vec_update[upd_feat, upd_tag]
                            logging.info("updating feat_vec with feature_id: (%s, %s) value: %d" % (upd_feat, upd_tag, feat_vec_update[upd_feat, upd_tag]))
                            if (upd_feat, upd_tag) in last_iter:
                                avg_vec[upd_feat, upd_tag] += (num_updates - last_iter[upd_feat, upd_tag]) * feat_vec[upd_feat, upd_tag]
                            else:
                                avg_vec[upd_feat, upd_tag] = feat_vec[upd_feat, upd_tag]
                            last_iter[upd_feat, upd_tag] = num_updates
        print >>sys.stderr, "number of mistakes:", num_mistakes
    for (feat, tag) in feat_vec:
        if (feat, tag) in last_iter:
            avg_vec[feat, tag] += (num_updates - last_iter[feat, tag]) * feat_vec[feat, tag]
        else:
            avg_vec[feat, tag] = feat_vec[feat, tag]
        feat_vec[feat, tag] = 1.0 * avg_vec[feat, tag] / num_updates
    return feat_vec
Example #23
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(float)
    avg_feat_vec = defaultdict(float)
    tau_feat_vec = dict()
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = 2  # fixed epoch count; overrides the numepochs argument
    default_tag = tagset[0]
    m = len(train_data) # length of training data
    for t in range(numepochs):
        print 'Iteration#',t,' is processing now.'
        for j, (labeled_list, feat_list) in enumerate(train_data):
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result
            # correct_flag = True
            if j != m - 1 or t != numepochs - 1:
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    label = labels[i].split()[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:

                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                if   output[i-1] != label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                    # update original feature vector
                                    feat_vec[feat_out, output[i]]   -= 1.0
                                    feat_vec[feat_lab, output[i]]   -= 1.0
                                    feat_vec[feat_out, label]       += 1.0
                                    feat_vec[feat_lab, label]       += 1.0

                                    # update avg feature vector
                                    avg_feat_vec[feat_out, output[i]]   -= 1.0
                                    avg_feat_vec[feat_lab, output[i]]   -= 1.0
                                    avg_feat_vec[feat_out, label]       += 1.0
                                    avg_feat_vec[feat_lab, label]       += 1.0

                                    tau_feat_vec[feat_out] = (j, t)
                                    tau_feat_vec[feat_lab] = (j, t)

                                elif output[i-1] == label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                    feat_vec[feat_lab, output[i]]   -= 2.0
                                    feat_vec[feat_lab, label]       += 2.0

                                    avg_feat_vec[feat_lab, output[i]]   -= 2.0
                                    avg_feat_vec[feat_lab, label]       += 2.0
                                    
                                    tau_feat_vec[feat_lab] = (j, t)

                                elif output[i-1] != label_pre and output[i] == label:
                                    pass

                                elif output[i-1] == label_pre and output[i] == label:
                                    pass

                            else: # for U00 to U22 feature

                                if output[i] != label and feat in tau_feat_vec:
                                    (js, ts) = tau_feat_vec[feat]
                                    for (feature, tag) in avg_feat_vec.keys():
                                        if feature == feat:
                                            avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)

                                feat_vec[feat, output[i]] -= 1.0
                                feat_vec[feat, label] += 1.0

                                avg_feat_vec[feat, output[i]] -= 1.0
                                avg_feat_vec[feat, label] += 1.0

                                # update vector
                                tau_feat_vec[feat, output[i]] = (j, t)
                                tau_feat_vec[feat, label]     = (j, t)


                    else:  # for i==0 case, all the first word in each sentence
                        label_pre = 'B_-1'  # previous label will be denoted by B_-1
                        for feat in feats:


                            if feat[0] == 'B':  # bigram feature case
                                feat = feat + ":" + label_pre

                            if output[i] != label and feat in tau_feat_vec:
                                (js, ts) = tau_feat_vec[feat]
                                for (feature, tag) in avg_feat_vec.keys():
                                    if feature == feat:
                                        avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)


                            feat_vec[feat, output[i]] -= 1.0
                            feat_vec[feat, label] += 1.0

                            avg_feat_vec[feat, output[i]] -= 1.0
                            avg_feat_vec[feat, label] += 1.0

                            tau_feat_vec[feat, output[i]] = (j, t)
                            tau_feat_vec[feat, label] = (j, t)


            else:
                # last sentence of each iteration
                feat_index = 0
                # check word by word if the predicted tag is equal to the true tag
                for i, v in enumerate(output):
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    # retrieve the feature for a word
                    if len(feats) == 0:
                        print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                        raise ValueError("features do not align with input sentence")
                    
                    fields = labels[i].split()
                    label = fields[2]
                    if i > 0: 
                        label_pre = labels[i-1].split()[2]
                        for feat in feats:
                            if feat[0] == 'B': # for bigram feature
                                feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                                if   output[i-1] != label_pre and output[i] != label:

                                    if feat in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat]
                                        for (feature, tag) in avg_feat_vec.keys():
                                            if feature == feat:
                                                avg_feat_vec[feat, tag] = avg_feat_vec[feat, tag] + feat_vec[feat, tag] * (t*m + j - ts*m - js)

                                    # update original feature vector
                                    feat_vec[feat_out, output[i]]   -= 1.0
                                    feat_vec[feat_lab, output[i]]   -= 1.0
                                    feat_vec[feat_out, label]       += 1.0
                                    feat_vec[feat_lab, label]       += 1.0

                                    # update avg feature vector
                                    avg_feat_vec[feat_out, output[i]]   -= 1.0
                                    avg_feat_vec[feat_lab, output[i]]   -= 1.0
                                    avg_feat_vec[feat_out, label]       += 1.0
                                    avg_feat_vec[feat_lab, label]       += 1.0

                                elif output[i-1] == label_pre and output[i] != label:

                                    # flush pending updates for the keys that
                                    # the update below changes
                                    for key in ((feat_lab, output[i]), (feat_lab, label)):
                                        if key in tau_feat_vec:
                                            (js, ts) = tau_feat_vec[key]
                                            avg_feat_vec[key] += feat_vec[key] * (t*m + j - ts*m - js)


                                    feat_vec[feat_lab, output[i]]   -= 2.0
                                    feat_vec[feat_lab, label]       += 2.0

                                    avg_feat_vec[feat_lab, output[i]]   -= 2.0
                                    avg_feat_vec[feat_lab, label]       += 2.0
                                    

                                elif output[i-1] != label_pre and output[i] == label:
                                    pass

                                elif output[i-1] == label_pre and output[i] == label:
                                    pass

                            else: # for U00 to U22 feature
                            
                                if output[i] != label:
                                    # flush per-(feature, tag) pending updates,
                                    # matching how tau_feat_vec is keyed
                                    for tag in (output[i], label):
                                        if (feat, tag) in tau_feat_vec:
                                            (js, ts) = tau_feat_vec[feat, tag]
                                            avg_feat_vec[feat, tag] += feat_vec[feat, tag] * (t*m + j - ts*m - js)


                                feat_vec[feat, output[i]] -= 1.0
                                feat_vec[feat, label] += 1.0

                                avg_feat_vec[feat, output[i]] -= 1.0
                                avg_feat_vec[feat, label] += 1.0


                    else:  # for i==0 case, all the first word in each sentence
                        label_pre = 'B_-1'  # previous label will be denoted by B_-1
                        for feat in feats:
                            if feat[0] == 'B':  # bigram feature case
                                feat = feat + ":" + label_pre
                            
                            if output[i] != label:
                                # flush per-(feature, tag) pending updates,
                                # matching how tau_feat_vec is keyed
                                for tag in (output[i], label):
                                    if (feat, tag) in tau_feat_vec:
                                        (js, ts) = tau_feat_vec[feat, tag]
                                        avg_feat_vec[feat, tag] += feat_vec[feat, tag] * (t*m + j - ts*m - js)


                            feat_vec[feat, output[i]] -= 1.0
                            feat_vec[feat, label] += 1.0

                            avg_feat_vec[feat, output[i]] -= 1.0
                            avg_feat_vec[feat, label] += 1.0


        # end of iteration

    # averaging perceptron
    for key in avg_feat_vec.keys():
        avg_feat_vec[key] = avg_feat_vec[key]/float(numepochs*m)
    # please limit the number of iterations of training to n iterations
    return avg_feat_vec
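The `(t*m + j - ts*m - js)` factor used throughout this snippet is the number of sentences processed since a weight last changed: each weight contributes its old value once per idle sentence. A minimal self-contained sketch (toy update schedule, hypothetical epoch and sentence counts) checking that this lazy bookkeeping matches the naive sum-after-every-sentence average:

def lazy_average_demo():
    T, m = 2, 4                      # toy: 2 epochs, 4 sentences per epoch
    deltas = {1: +1, 3: -1, 6: +1}   # linear step -> weight update

    # naive: accumulate the weight after every step
    w, naive_sum = 0, 0
    for step in range(T * m):
        w += deltas.get(step, 0)
        naive_sum += w

    # lazy: only touch the running sum when the weight actually changes
    w, lazy_sum, last = 0, 0, 0
    for step in sorted(deltas):
        lazy_sum += w * (step - last)    # idle steps at the old value
        w += deltas[step]
        lazy_sum += w                    # the updating step at the new value
        last = step + 1
    lazy_sum += w * (T * m - last)       # final flush, as after the last sentence

    assert naive_sum == lazy_sum
    return naive_sum / float(T * m)      # the averaged weight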
Exemple #24
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = 1  # NOTE: overrides the numepochs argument; training always runs one epoch
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0
        # Count sentence
        print 'Iteration #', t, 'is now being processed.'
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            print 'Sentence [', cnt, '] is now being processed...'
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)

            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                
                label = labels[i].split()[2]
                if i > 1: 
                    label_i_1 = labels[i-1].split()[2]
                    label_i_2 = labels[i-2].split()[2]
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-2] != label_i_2 and output[i-1] != label_i_1: 
                                # trigram case 
                                feat_out = feat + ":" + output[i-2] + "," + output[i-1]  
                                # feat_out is the "B:<previous output>"
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                # feat_lab is the "B:<previous label>"
                                # reward best condition
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                                # penalize condition
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            elif feat[0] == 'B' and output[i-1] != label_i_1:
                                # bigram case
                                feat_out = feat + ":" + output[i-1]  
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            else: 
                            # for U00 to U22 feature
                                feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                                feat_vec[feat, label] = feat_vec[feat, label] + 1
                elif i == 1:
                    # i == 1 case: the second word of each sentence
                    label_i_2 = '_B-1'  # the tag two positions back is the start padding _B-1
                    label_i_1 = labels[i-1].split()[2]
                    if  output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T' and output[i-1] != label_i_1:
                            # trigram case 
                                feat_out = feat + ":" + label_i_2 + "," + output[i-1]  
                                feat_lab = feat + ":" + label_i_2 + "," + label_i_1
                                # reward best condition
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                                # penalize condition
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                            
                            elif feat[0] == 'B':
                                feat_out = feat + ":" + output[i-1]  
                                feat_lab = feat + ":" + label_i_1
                                feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1
                                feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1

                            else: 
                            # for U00 to U22 feature
                                feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                                feat_vec[feat, label] = feat_vec[feat, label] + 1
                elif i == 0:
                    label_i_2 = '_B-2'
                    label_i_1 = '_B-1'
                    if output[i] != label:
                        for feat in feats:
                            if feat[0] == 'T':
                            # trigram case 
                                feat = feat + ":" + label_i_2 + "," + label_i_1
                            
                            elif feat[0] == 'B':
                            #bigram case
                                feat = feat + ":" + label_i_1

                            feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                            feat_vec[feat, label] = feat_vec[feat, label] + 1

        filename = 'mid_model_iter' + str(t)
        perc.perc_write_to_file(feat_vec, filename)


    # prune zero weights; snapshot the items so deleting while
    # iterating is safe under both Python 2 and 3
    for (k1, k2), v in list(feat_vec.items()):
        if v == 0:
            del feat_vec[k1, k2]


    # please limit the number of iterations of training to n iterations
    return feat_vec
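Several snippets in this collection call `perc.feats_for_word` without showing it. A hedged stand-in, assuming `feat_list` is a flat list in which each word contributes a run of unigram templates (U00 to U22) closed by the bigram marker `B` (the 20-features-per-word layout that `feat_list[j*20:j*20+20]` in a later snippet also relies on):

def feats_for_word(start, feat_list):
    # collect features from start until the 'B' marker that, under the
    # assumed layout, terminates the current word's feature group
    feats = []
    i = start
    while i < len(feat_list):
        feats.append(feat_list[i])
        i += 1
        if feats[-1] == 'B':
            break
    return (i, feats)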
Exemple #25
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    cumulative_feat_vec = defaultdict(float)
    index_dec = dict()
    epoch = 0
    count = 0
    numsen = len(train_data)
    while (epoch < numepochs):
        print(epoch)
        mistakes = 0
        correct = 0
        #print(numsen)
        sen = 0
        for sentence_data in train_data:
            # lazy-update path for every sentence except the last one of the
            # last epoch (epoch and sen are zero-based, so compare against
            # numepochs - 1 and numsen - 1)
            if (epoch != numepochs - 1 or sen != numsen - 1):

                words = []
                postags = []
                truetags = []
                label_list = sentence_data[0]
                feat_list = sentence_data[1]
                for label in label_list:
                    (word, postag, chunktag) = label.split(" ")
                    words.append(word)
                    postags.append(postag)
                    truetags.append(chunktag)
                # re-reads the tagset file for every sentence; hoisting this
                # outside the loop would avoid the repeated I/O
                tagset = perc.read_tagset(opts.tagsetfile)
                default_tag = tagset[0]
                argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                            tagset, default_tag)
                feat_index = 0
                i = 0

                for word in words:
                    (feat_index, feats_for_this_word) = perc.feats_for_word(
                        feat_index, feat_list)
                    # print(len(feats_for_this_word))
                    argmax = argmaxtags[i]
                    tru = truetags[i]
                    if (argmax == tru):
                        i += 1
                        continue
                    for f in feats_for_this_word:
                        wrongkey = f, argmax
                        rightkey = f, tru
                        if (wrongkey in index_dec):
                            (index_epoch, index_sen) = index_dec[wrongkey]
                            idletime = (epoch * numsen + sen -
                                        index_epoch * numsen - index_sen)
                            cumulative_feat_vec[
                                wrongkey] = cumulative_feat_vec.get(
                                    wrongkey,
                                    0) + feat_vec.get(wrongkey, 0) * idletime
                        if (rightkey in index_dec):
                            (index_epoch, index_sen) = index_dec[rightkey]
                            idletime = (epoch * numsen + sen -
                                        index_epoch * numsen - index_sen)
                            cumulative_feat_vec[
                                rightkey] = cumulative_feat_vec.get(
                                    rightkey,
                                    0) + feat_vec.get(rightkey, 0) * idletime

                        feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                        feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                        cumulative_feat_vec[
                            wrongkey] = cumulative_feat_vec.get(
                                wrongkey, 0) + feat_vec[wrongkey]
                        cumulative_feat_vec[
                            rightkey] = cumulative_feat_vec.get(
                                rightkey, 0) + feat_vec[rightkey]
                        index_dec[wrongkey] = (epoch, sen)
                        index_dec[rightkey] = (epoch, sen)
                    i += 1
                i = 0

                for word in words:
                    argmax = argmaxtags[i]
                    tru = truetags[i]
                    if (argmax == tru):
                        i += 1
                        correct += 1
                        continue
                    else:
                        mistakes += 1
                    argmaxprev = "B:"
                    truprev = "B:"
                    if (i == 0):
                        argmaxprev += "B_-1"
                        truprev += "B_-1"
                    else:
                        argmaxprev += argmaxtags[i - 1]
                        truprev += truetags[i - 1]
                    wrongkey = argmaxprev, argmax
                    rightkey = truprev, tru

                    if (wrongkey in index_dec):
                        (index_epoch, index_sen) = index_dec[wrongkey]
                        idletime = (epoch * numsen + sen -
                                    index_epoch * numsen - index_sen)
                        cumulative_feat_vec[wrongkey] = cumulative_feat_vec.get(
                            wrongkey, 0) + feat_vec.get(wrongkey, 0) * idletime

                    if (rightkey in index_dec):
                        (index_epoch, index_sen) = index_dec[rightkey]
                        idletime = (epoch * numsen + sen -
                                    index_epoch * numsen - index_sen)
                        cumulative_feat_vec[rightkey] = cumulative_feat_vec.get(
                            rightkey, 0) + feat_vec.get(rightkey, 0) * idletime

                    feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                    feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                    cumulative_feat_vec[wrongkey] = cumulative_feat_vec.get(
                        wrongkey, 0) + feat_vec[wrongkey]
                    cumulative_feat_vec[rightkey] = cumulative_feat_vec.get(
                        rightkey, 0) + feat_vec[rightkey]
                    index_dec[wrongkey] = (epoch, sen)
                    index_dec[rightkey] = (epoch, sen)
                    i += 1

                #keys=feat_vec.keys()
                #for key in keys:
                #cumulative_feat_vec[key]=cumulative_feat_vec.get(key,0)+feat_vec[key]
                count += 1
            else:

                words = []
                postags = []
                truetags = []
                label_list = sentence_data[0]
                feat_list = sentence_data[1]
                for label in label_list:
                    (word, postag, chunktag) = label.split(" ")
                    words.append(word)
                    postags.append(postag)
                    truetags.append(chunktag)
                tagset = perc.read_tagset(opts.tagsetfile)
                default_tag = tagset[0]
                argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                            tagset, default_tag)
                feat_index = 0
                i = 0

                for word in words:
                    (feat_index, feats_for_this_word) = perc.feats_for_word(
                        feat_index, feat_list)
                    # print(len(feats_for_this_word))
                    argmax = argmaxtags[i]
                    tru = truetags[i]
                    for f in feats_for_this_word:
                        wrongkey = f, argmax
                        rightkey = f, tru

                        # guard the lookups: a key that was never updated has
                        # no pending contribution to flush
                        if (wrongkey in index_dec):
                            (index_epoch, index_sen) = index_dec[wrongkey]
                            idletime = (epoch * numsen + sen -
                                        index_epoch * numsen - index_sen)
                            cumulative_feat_vec[wrongkey] = cumulative_feat_vec.get(
                                wrongkey, 0) + feat_vec.get(wrongkey, 0) * idletime

                        if (rightkey in index_dec):
                            (index_epoch, index_sen) = index_dec[rightkey]
                            idletime = (epoch * numsen + sen -
                                        index_epoch * numsen - index_sen)
                            cumulative_feat_vec[rightkey] = cumulative_feat_vec.get(
                                rightkey, 0) + feat_vec.get(rightkey, 0) * idletime

                    if (argmax == tru):
                        i += 1
                        continue
                    for f in feats_for_this_word:
                        wrongkey = f, argmax
                        rightkey = f, tru

                        feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                        feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                        cumulative_feat_vec[
                            wrongkey] = cumulative_feat_vec.get(
                                wrongkey, 0) + feat_vec[wrongkey]
                        cumulative_feat_vec[
                            rightkey] = cumulative_feat_vec.get(
                                rightkey, 0) + feat_vec[rightkey]
                        index_dec[wrongkey] = (epoch, sen)
                        index_dec[rightkey] = (epoch, sen)
                    i += 1
                i = 0

                for word in words:
                    argmax = argmaxtags[i]
                    tru = truetags[i]

                    argmaxprev = "B:"
                    truprev = "B:"
                    if (i == 0):
                        argmaxprev += "B_-1"
                        truprev += "B_-1"
                    else:
                        argmaxprev += argmaxtags[i - 1]
                        truprev += truetags[i - 1]
                    wrongkey = argmaxprev, argmax
                    rightkey = truprev, tru

                    if (wrongkey in index_dec):
                        (index_epoch, index_sen) = index_dec[wrongkey]
                        idletime = (epoch * numsen + sen - index_epoch * numsen -
                                    index_sen)
                        cumulative_feat_vec[wrongkey] = cumulative_feat_vec.get(
                            wrongkey, 0) + feat_vec.get(wrongkey, 0) * idletime

                    if (rightkey in index_dec):
                        (index_epoch, index_sen) = index_dec[rightkey]
                        idletime = (epoch * numsen + sen - index_epoch * numsen -
                                    index_sen)
                        cumulative_feat_vec[rightkey] = cumulative_feat_vec.get(
                            rightkey, 0) + feat_vec.get(rightkey, 0) * idletime

                    if (argmax == tru):
                        i += 1
                        correct += 1
                        continue
                    else:
                        mistakes += 1

                    feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                    feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                    cumulative_feat_vec[wrongkey] = cumulative_feat_vec.get(
                        wrongkey, 0) + feat_vec[wrongkey]
                    cumulative_feat_vec[rightkey] = cumulative_feat_vec.get(
                        rightkey, 0) + feat_vec[rightkey]
                    index_dec[wrongkey] = (epoch, sen)
                    index_dec[rightkey] = (epoch, sen)
                    i += 1

                # keys=feat_vec.keys()
                # for key in keys:
                # cumulative_feat_vec[key]=cumulative_feat_vec.get(key,0)+feat_vec[key]
                count += 1

            if (sen % 1000 == 0):
                print(str(sen) + "/" + str(len(train_data)))
            sen += 1

        #print(mistakes)
        #print(correct)
        epoch += 1

    keys = cumulative_feat_vec.keys()
    for key in keys:
        cumulative_feat_vec[key] = float(
            cumulative_feat_vec[key]) / float(count)

    # please limit the number of iterations of training to n iterations

    return cumulative_feat_vec
Exemple #26
0
def avg_perc_train(train_data, tagset, iterations=1):
    feat_vec = FeatureVector()
    feat_vec_sum = FeatureVector()
    last_change_dict = FeatureVector()
    total_sentence_count = 0
    default_tag = tagset[0]
    import random
    for iteration in range(iterations):
        # Number of Sentences

        # mini-batch training: sample 128 random sentences per iteration
        batch_train_data = random.sample(train_data, 128)

        sentence_total = len(batch_train_data)
        sentence_count = 0

        for (labeled_list, feat_list) in batch_train_data:
            # For averaged perceptron, we need to know exactly how many
            # sentences we have used during training
            total_sentence_count += 1

            # Print out information
            sentence_count += 1
            print "iteration", iteration, "sentence", sentence_count, "of", sentence_total

            # Retrieve Gold Output
            gold_output = []
            gold_output.append('B_-1')
            for i in labeled_list:
                (w, t, label) = i.split()
                gold_output.append(label)
            gold_output.append('B_+1')

            # Retrieve Local Output
            local_output = perc.perc_test(feat_vec,
                                          labeled_list,
                                          feat_list,
                                          tagset,
                                          default_tag)
            local_output.insert(0, 'B_-1')
            local_output.append('B_+1')

            print gold_output
            print local_output

            # Extract features from both outputs
            local_vec = retrieve_feature(local_output, feat_list)
            gold_vec  = retrieve_feature(gold_output, feat_list)

            # Calculate delta
            delta_vec = gold_vec - local_vec

            # This is the key to the averaged perceptron: it sums up all the
            # feat_vec values we have used, then returns the average by
            # dividing that sum by total_sentence_count (the total number
            # of sentences used during training, including duplicates
            # across multiple iterations).

            #    feat_vec += delta_vec
            #    feat_vec_sum += feat_vec

            # The following is the optimisation for the averaged perceptron,
            # which does exactly the same thing as the two lines above.
            # Instead of updating feat_vec_sum every time we train on a
            # new sentence, we do a lazy update.
            if sentence_count != sentence_total:
                # Not the last sentence of current iteration
                if gold_vec != local_vec:
                    for key in delta_vec:
                        # Only update the changed values, and record when
                        # they were last updated
                        feat_vec_sum[key] += feat_vec[key] * (total_sentence_count - last_change_dict[key])
                        last_change_dict[key] = total_sentence_count

                    feat_vec += delta_vec
                    # Because feat_vec is updated here by adding delta_vec, we
                    # do exactly the same thing to feat_vec_sum, because it is
                    # in its nature, a sum of feat_vecs
                    feat_vec_sum += delta_vec
            else:
                # Is the last sentence of current iteration, we need to apply
                # all pending updates to feat_vec_sum
                for key in set(last_change_dict.keys() + feat_vec.keys()):
                    # Just to make sure we have indeed updated every key.
                    feat_vec_sum[key] += feat_vec[key] * (total_sentence_count - last_change_dict[key])
                    last_change_dict[key] = total_sentence_count

                if gold_vec != local_vec:
                    # Last but not least, don't forget the current delta_vec
                    feat_vec += delta_vec
                    feat_vec_sum += delta_vec

        # Dump every iteration
        tmp = feat_vec_sum / total_sentence_count
        tmp.dump("models/jetic_avg_Iter_" + str(iteration+1) + ".model")

    # Finalisation, divide feat_vec_sum with total_sentence_count
    feat_vec = feat_vec_sum / total_sentence_count

    return feat_vec.export()
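This snippet leans on a `FeatureVector` class that is not shown. A minimal sketch of what the training loop assumes: a defaultdict-backed sparse vector supporting `+=`, `-`, scalar division, `dump`, and `export`. The pickled dump format here is a guess, not the author's:

import pickle
from collections import defaultdict

class FeatureVector(defaultdict):
    # sparse vector over (feature, tag) keys with default weight 0
    def __init__(self, *args):
        super(FeatureVector, self).__init__(int, *args)

    def __iadd__(self, other):
        for key, value in other.items():
            self[key] += value
        return self

    def __sub__(self, other):
        result = FeatureVector(self)
        for key, value in other.items():
            result[key] -= value
        return result

    def __div__(self, scalar):      # Python 2 '/'
        result = FeatureVector()
        for key, value in self.items():
            result[key] = value / float(scalar)
        return result
    __truediv__ = __div__           # same behaviour under Python 3

    def dump(self, path):
        # hypothetical on-disk format: a pickled plain dict
        with open(path, 'wb') as handle:
            pickle.dump(dict(self), handle)

    def export(self):
        return dict(self)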
Exemple #27
0
def perc_train(train_data, tagset, numepochs):
    starttime = datetime.now()
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    sigma = defaultdict(int)
    gamma = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations
    for i in range(numepochs):
        epochstarttime = datetime.now()
        numOfError = 0
        argMaxoutput = []
        for (labeled_list, feat_list) in train_data:
            argMaxoutput = perc.perc_test(feat_vec, labeled_list, feat_list,
                                          tagset, default_tag)
            expectedOutput = [
                element.split(" ")[2] for element in labeled_list
            ]
            for j in range(len(expectedOutput)):
                trueLabel = expectedOutput[j]
                argMaxLabel = argMaxoutput[j]
                # note: at j == 0 these index position -1 (the last word);
                # they are only consumed in branches guarded by j > 0
                trueLabel_prev = expectedOutput[j - 1]
                argMaxLabel_prev = argMaxoutput[j - 1]
                if (trueLabel != argMaxLabel):
                    numOfError = numOfError + 1
                    # assumes exactly 20 feature templates per word in the
                    # flat feat_list
                    for feat in feat_list[j * 20:j * 20 + 20]:
                        if feat == "B" and j > 0:
                            trueLabel_prev = expectedOutput[j - 1]
                            argMaxLabel_prev = argMaxoutput[j - 1]
                            feat_vec["B:" + trueLabel_prev,
                                     trueLabel] = feat_vec["B:" +
                                                           trueLabel_prev,
                                                           trueLabel] + 1
                            feat_vec["B:" + argMaxLabel_prev,
                                     argMaxLabel] = feat_vec["B:" +
                                                             argMaxLabel_prev,
                                                             argMaxLabel] - 1
                            #sigma["B:"+trueLabel_prev,trueLabel] = sigma["B:"+ trueLabel_prev,trueLabel] + feat_vec["B:"+ trueLabel_prev,trueLabel]
                            #sigma["B:"+argMaxLabel_prev,argMaxLabel] = sigma["B:"+argMaxLabel_prev,argMaxLabel] + feat_vec["B:"+argMaxLabel_prev,argMaxLabel]
                        else:
                            feat_vec[feat,
                                     trueLabel] = feat_vec[feat, trueLabel] + 1
                            feat_vec[feat,
                                     argMaxLabel] = feat_vec[feat,
                                                             argMaxLabel] - 1
                            #sigma[feat,trueLabel] =  sigma[feat,trueLabel] + feat_vec[feat,trueLabel];
                            #sigma[feat,argMaxLabel] = sigma[feat,argMaxLabel] + feat_vec[feat,argMaxLabel];

                elif j > 0 and trueLabel == argMaxLabel and trueLabel_prev != argMaxLabel_prev:
                    feat_vec["B:" + trueLabel_prev,
                             trueLabel] = feat_vec["B:" + trueLabel_prev,
                                                   trueLabel] + 1
                    feat_vec["B:" + argMaxLabel_prev,
                             argMaxLabel] = feat_vec["B:" + argMaxLabel_prev,
                                                     argMaxLabel] - 1
                    #sigma["B:"+trueLabel_prev,trueLabel] = sigma["B:"+ trueLabel_prev,trueLabel] + feat_vec["B:"+ trueLabel_prev,trueLabel]
                    #sigma["B:"+argMaxLabel_prev,argMaxLabel] = sigma["B:"+argMaxLabel_prev,argMaxLabel] + feat_vec["B:"+argMaxLabel_prev,argMaxLabel]
                '''
                elif (j > 1) & (trueLabel == argMaxLabel) & (trueLabel_prev == argMaxLabel_prev) & (expectedOutput[j-2] == argMaxoutput[j-2]):
                        feat_vec["B:"+expectedOutput[j-2],trueLabel_prev,trueLabel] = feat_vec["B:"+ expectedOutput[j-2],trueLabel_prev,trueLabel] + 1
                        feat_vec["B:"+argMaxLabel_prev,argMaxLabel] = feat_vec["B:"+argMaxoutput[j-2],argMaxLabel_prev,argMaxLabel] - 1
                        #sigma["B:"+trueLabel_prev,trueLabel] = sigma["B:"+ trueLabel_prev,trueLabel] + feat_vec["B:"+ trueLabel_prev,trueLabel]
                        #sigma["B:"+argMaxLabel_prev,argMaxLabel] = sigma["B:"+argMaxLabel_prev,argMaxLabel] + feat_vec["B:"+argMaxLabel_prev,argMaxLabel]
                '''
            #sigma = dict(Counter(sigma)+Counter(feat_vec))

            for key in feat_vec:
                sigma[key] = sigma[key] + feat_vec[key]

        epochendtime = datetime.now()
        print "Number of error in Epoch", i + 1, " ", numOfError, " Time Taken:", epochendtime - epochstarttime

    for key, value in sigma.items():
        # float division: sigma holds integer sums, and Python 2's '/'
        # would otherwise truncate the averaged weights
        gamma[key] = value / float(numepochs * len(train_data))
        #gamma[key] = value/(numepochs)

    endtime = datetime.now()
    print "Total Time taken to train:", endtime - starttime
    return gamma
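Unlike the tau-stamped variants elsewhere in this collection, this snippet averages naively: sigma absorbs the whole weight vector after every sentence, a full pass over the accumulated features each time. The final division is then just the following (a sketch assuming the same defaultdict shapes):

def average_weights(sigma, numepochs, num_sentences):
    # average = sum of per-sentence weight snapshots / number of snapshots
    return dict((key, value / float(numepochs * num_sentences))
                for key, value in sigma.items())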
Exemple #28
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    sigma_vec = defaultdict(int)
    gamma_vec = defaultdict(int)
    tau_vec = defaultdict(int)
    T = numepochs
    m = len(train_data)

    # Main loop
    for t in range(T):
        print('EPOCH:', t + 1)
        mistakes = 0

        for i in range(m):
            if i % 1000 == 0:
                print(i, end=' ')

            # Get output chunk tags from Viterbi
            labeled_list = train_data[i][0]
            feat_list = train_data[i][1]

            gold_tags = [ll.split()[2] for ll in labeled_list]
            output_tags = perc.perc_test(feat_vec,
                                         labeled_list,
                                         feat_list,
                                         tagset,
                                         default_tag=tagset[0])

            # every sentence except the last one of the last epoch
            if t != T - 1 or i != m - 1:

                # Update weight vector if the output is incorrect
                if output_tags != gold_tags:

                    # Get feature IDs: from training data and from Viterbi output
                    feat_ids_gold, feat_ids_output = get_feature_ids(
                        feat_list, gold_tags, output_tags)

                    for k in range(len(gold_tags)):

                        if output_tags[k] != gold_tags[k]:

                            for f in feat_ids_gold[k]:

                                if f in tau_vec.keys():
                                    sigma_vec[f] = sigma_vec[f] + feat_vec[
                                        f] * (t * m + i - tau_vec[f][1] * m -
                                              tau_vec[f][0])

                                feat_vec[f] = feat_vec[f] + 1
                                sigma_vec[f] = sigma_vec[f] + 1
                                # record the location where the dimension tag is updated
                                tau_vec[f] = [i, t]

                            for f in feat_ids_output[k]:

                                if f in tau_vec.keys():
                                    sigma_vec[f] = sigma_vec[f] + feat_vec[
                                        f] * (t * m + i - tau_vec[f][1] * m -
                                              tau_vec[f][0])

                                feat_vec[f] = feat_vec[f] - 1
                                sigma_vec[f] = sigma_vec[f] - 1
                                # record the location where the dimension tag is updated
                                tau_vec[f] = [i, t]
                        else:
                            # tags agree here; flush any pending contribution
                            # before re-stamping, otherwise the idle time since
                            # the last real update is lost
                            for f in feat_ids_gold[k] + feat_ids_output[k]:
                                if f in tau_vec:
                                    sigma_vec[f] = sigma_vec[f] + feat_vec[f] * (
                                        t * m + i - tau_vec[f][1] * m -
                                        tau_vec[f][0])
                                tau_vec[f] = [i, t]

                    mistakes += 1
            else:
                # to deal with the last sentence in the last iteration
                # Get feature IDs: from training data and from Viterbi output
                feat_ids_gold, feat_ids_output = get_feature_ids(
                    feat_list, gold_tags, output_tags)

                # flush every key up to the final step; stamps are the
                # zero-based (i, t) of the last update, and the current
                # (last) step is t*m + i = T*m - 1
                for f in tau_vec.keys():
                    sigma_vec[f] = sigma_vec[f] + feat_vec[f] * (
                        T * m - 1 - tau_vec[f][1] * m - tau_vec[f][0])

                for k in range(len(gold_tags)):
                    if output_tags[k] != gold_tags[k]:
                        for g in feat_ids_gold[k]:
                            feat_vec[g] = feat_vec[g] + 1
                            sigma_vec[g] = sigma_vec[g] + 1
                        for g in feat_ids_output[k]:
                            feat_vec[g] = feat_vec[g] - 1
                            sigma_vec[g] = sigma_vec[g] - 1

        print('\nMistakes in epoch {0}: {1} out of {2} sentences'.format(
            t + 1, mistakes, len(train_data)))

    for key in sigma_vec.keys():
        sigma_vec[key] = sigma_vec[key] / (m * T)

    return sigma_vec
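The `get_feature_ids` helper this snippet calls is not shown. A hypothetical sketch consistent with how the other snippets instantiate templates, reusing `perc.feats_for_word` and expanding the bigram template `B` to `B:<previous tag>`; it returns, per word, the lists of `(feature, tag)` keys for the gold and predicted taggings:

def get_feature_ids(feat_list, gold_tags, output_tags):
    feat_ids_gold, feat_ids_output = [], []
    feat_index = 0
    for k in range(len(gold_tags)):
        (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
        gold_keys, output_keys = [], []
        for feat in feats:
            if feat == 'B':  # expand the bigram template with the previous tag
                gold_prev = gold_tags[k - 1] if k > 0 else 'B_-1'
                output_prev = output_tags[k - 1] if k > 0 else 'B_-1'
                gold_keys.append(('B:' + gold_prev, gold_tags[k]))
                output_keys.append(('B:' + output_prev, output_tags[k]))
            else:            # unigram templates pair directly with the tag
                gold_keys.append((feat, gold_tags[k]))
                output_keys.append((feat, output_tags[k]))
        feat_ids_gold.append(gold_keys)
        feat_ids_output.append(output_keys)
    return feat_ids_gold, feat_ids_output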
Exemple #29
0
def perc_train(train_data, tagset, T):
	feat_vec = defaultdict(int)
	sigma_feat_vec = defaultdict(float)
	tau = {}
	# insert your code here
	# please limit the number of iterations of training to n iterations
	
	print >> sys.stderr, "training data ..."
	M = len(train_data)
	for t in range(0, T):
		for i in range(0,M):
			print >> sys.stderr, '\r{0}'.format("Iteration: %d/%d. Sentence: %d/%d\t" %(t+1, T, i+1, M)),
			
			labeled_list = train_data[i][0]
			feat_list = train_data[i][1]
	
			# Extract the labels from training data
			toutput = [tags.split(' ')[2] for tags in labeled_list ]	
			
			# Output from Viterbi Algorithm	
			zoutput = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, 'B-NP')
			
			# Compare outputs
			diff = toutput != zoutput
			if t != T-1 or i != M-1:
				if diff:				
					index = 0
					for p in range(0, len(zoutput)):							
						predicted_tag = zoutput[p]
						true_tag = toutput[p]
						(index, feats) = perc.feats_for_word(index, feat_list)										
						for feat in feats:
							s1 = s2 = ''
							if feat == 'B':
								if p >= 1:
									zprevtag = zoutput[p-1]
									tprevtag = toutput[p-1]
								else:
									zprevtag = tprevtag = 'B_-1'
								s1 = (feat+':'+zprevtag, predicted_tag)
								s2 = (feat+':'+tprevtag, true_tag)
							else:
								s1 = (feat, predicted_tag)
								s2 = (feat, true_tag)

							# flush and stamp only when the weights change;
							# touching tau without a flush (or flushing without
							# a stamp) would corrupt the lazy average
							if s1 != s2:
								if s1 in tau:
									sigma_feat_vec[s1] = sigma_feat_vec[s1] + feat_vec[s1] * ((t+1) * M + (i+1) - tau[s1][1] * M - tau[s1][0])
								if s2 in tau:
									sigma_feat_vec[s2] = sigma_feat_vec[s2] + feat_vec[s2] * ((t+1) * M + (i+1) - tau[s2][1] * M - tau[s2][0])
								feat_vec[s1] -= 1
								feat_vec[s2] += 1
								sigma_feat_vec[s1] -= 1
								sigma_feat_vec[s2] += 1
								tau[s1] = (i+1,t+1)
								tau[s2] = (i+1,t+1)
			else:
				for s in tau:					
					sigma_feat_vec[s] = sigma_feat_vec[s] + feat_vec[s] * (T * M + M - tau[s][1] * M - tau[s][0])
				if diff:				
					index = 0
					for p in range(0, len(zoutput)):							
						predicted_tag = zoutput[p]
						true_tag = toutput[p]
						(index, feats) = perc.feats_for_word(index, feat_list)										
						for feat in feats:
							s1 = s2 = ''
							if feat == 'B':
								if p >= 1:
									zprevtag = zoutput[p-1]
									tprevtag = toutput[p-1]
								else:
									zprevtag = tprevtag = 'B_-1'
								s1 = (feat+':'+zprevtag, predicted_tag)
								s2 = (feat+':'+tprevtag, true_tag)
							else:
								s1 = (feat, predicted_tag)
								s2 = (feat, true_tag)
							
							if s1 != s2:
								feat_vec[s1] -= 1
								feat_vec[s2] += 1
								sigma_feat_vec[s1] -= 1
								sigma_feat_vec[s2] += 1
							
	print >> sys.stderr, "\ndone"
	print >> sys.stderr, "computing average vector ..."
	zerokeys = []
	for f in sigma_feat_vec:
		if sigma_feat_vec[f] == 0:
			zerokeys.append(f)
		else:
			sigma_feat_vec[f] = sigma_feat_vec[f]/(T*M)	
	for k in zerokeys:
		del sigma_feat_vec[k]
	print >> sys.stderr, "done"
	return sigma_feat_vec
Exemple #30
0
def avg_perc_train(train_data, tagset, n):
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]

    feat_vec = defaultdict(int)
    avg_vec = defaultdict(int)
    last_iter = {}

    epochs = n
    num_updates = 0
    for epoch in range(0, epochs):
        num_mistakes = 0
        for (labeled_list, feat_list) in train_data:
            num_updates += 1
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset,
                                    default_tag)
            true_output = get_truth(labeled_list)
            logging.info("arg max output: %s" % (" ".join(output)))
            logging.info("truth: %s" % (" ".join(true_output)))
            #print >>sys.stderr, "\noutput:", output, "\ntruth:", true_output
            if output != true_output:
                num_mistakes += 1
                output.insert(0, 'B_-1')
                output.append('B_+1')
                true_output.insert(0, 'B_-1')
                true_output.append('B_+1')
                feat_index = 0
                for i in range(1, len(output) - 1):
                    #print >>sys.stderr, output[i], true_output[i]
                    (feat_index,
                     feats) = perc.feats_for_word(feat_index, feat_list)
                    if len(feats) == 0:
                        print >> sys.stderr, " ".join(labeled_list), " ".join(
                            feat_list), "\n"
                        raise ValueError(
                            "features do not align with input sentence")
                    #print >>sys.stderr, feats
                    feat_vec_update = defaultdict(int)
                    for feat in feats:
                        if feat == 'B':
                            output_feat = 'B:' + output[i - 1]
                            truth_feat = 'B:' + true_output[i - 1]
                        else:
                            output_feat = truth_feat = feat

                        feat_vec_update[output_feat, output[i]] += -1
                        feat_vec_update[truth_feat, true_output[i]] += 1
                        # if output[i] == true_output[i], the +1 and -1
                        # cancel, so the net update is 0
                    for (upd_feat, upd_tag) in feat_vec_update:
                        if feat_vec_update[upd_feat, upd_tag] != 0:
                            feat_vec[upd_feat,
                                     upd_tag] += feat_vec_update[upd_feat,
                                                                 upd_tag]
                            logging.info(
                                "updating feat_vec with feature_id: (%s, %s) value: %d"
                                % (upd_feat, upd_tag,
                                   feat_vec_update[upd_feat, upd_tag]))
                            if (upd_feat, upd_tag) in last_iter:
                                avg_vec[upd_feat, upd_tag] += (
                                    num_updates - last_iter[upd_feat, upd_tag]
                                ) * feat_vec[upd_feat, upd_tag]
                            else:
                                avg_vec[upd_feat, upd_tag] = feat_vec[upd_feat,
                                                                      upd_tag]
                            last_iter[upd_feat, upd_tag] = num_updates
        print >> sys.stderr, "number of mistakes:", num_mistakes
    for (feat, tag) in feat_vec:
        if (feat, tag) in last_iter:
            avg_vec[feat, tag] += (num_updates -
                                   last_iter[feat, tag]) * feat_vec[feat, tag]
        else:
            avg_vec[feat, tag] = feat_vec[feat, tag]
        # float division: avg_vec holds integer sums and Python 2's '/'
        # would truncate the averaged weight
        feat_vec[feat, tag] = avg_vec[feat, tag] / float(num_updates)
    return feat_vec
Exemple #31
0
def perc_train(train_data, tagset, numepochs):
    # feat_vec = perc.perc_read_from_file(opts.modelfile)
    # print len(feat_vec)
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    numepochs = 1  # NOTE: overrides the numepochs argument; training always runs one epoch
    default_tag = tagset[0]
    for t in range(numepochs):

        print "Iteration#", t, " is processing now."
        counter = 0
        for (labeled_list, feat_list) in train_data:

            counter += 1
            print counter

            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare current output and true result
            # correct_flag = True
            feat_index = 0

            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)

                # retrieve the feature for a word
                if len(feats) == 0:
                    print >> sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")

                fields = labels[i].split()
                label = fields[2]
                if i > 1:
                    pre_label = labels[i - 1].split()[2]
                    pre_pre_label = labels[i - 2].split()[2]

                    if output[i - 2] != pre_pre_label or output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":  # for bigram feature
                                feat_out = "B:" + output[i - 1]  # feat_out is the "B:<previous output>"
                                feat_lab = "B:" + pre_label  # feat_lab is the "B:<previous label>"
                                if output[i - 1] != pre_label and output[i] != label:
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, output[i]] -= 1
                                    feat_vec[feat_out, label] += 1
                                    feat_vec[feat_lab, label] += 1
                                elif output[i - 1] == pre_label and output[i] != label:
                                    feat_vec[feat_lab, output[i]] -= 2
                                    feat_vec[feat_lab, label] += 2
                                elif output[i - 1] != pre_label and output[i] == label:
                                    pass
                                elif output[i - 1] == pre_label and output[i] == label:
                                    pass

                                feat_out = "T:" + output[i - 2] + "/" + output[i - 1]
                                feat_lab = "T:" + pre_pre_label + "/" + pre_label
                                # when the trigram context matches but the
                                # current tag is wrong, the guarded update
                                # doubles the unconditional one below
                                if output[i - 2] == pre_pre_label and output[i - 1] == pre_label and output[i] != label:
                                    feat_vec[feat_out, output[i]] -= 1
                                    feat_vec[feat_lab, label] += 1
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1

                            else:  # for U00 to U22 feature
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1

                elif i == 1:
                    pre_label = labels[i - 1].split()[2]
                    if output[i - 1] != pre_label or output[i] != label:
                        for feat in feats:
                            if feat[0] == "B":  # for bigram feature
                                feat_out = "B:" + output[i - 1]  # feat_out is the "B:<previous output>"
                                feat_lab = "B:" + pre_label  # feat_lab is the "B:<previous label>"
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, label] += 1
                            else:  # for U00 to U22 feature
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                        feat_out = "T:B_-1/" + output[i - 1]
                        feat_lab = "T:B_-1/" + pre_label
                        feat_vec[feat_out, output[i]] -= 1
                        feat_vec[feat_lab, label] += 1
                else:  # for i==0 case, all the first word in each sentence
                    for feat in feats:
                        if feat[0] == "B":  # bigram feature case
                            feat = "B:B_-1"
                        feat_vec[feat, output[i]] -= 1
                        feat_vec[feat, label] += 1
                    feat = "T:B_-2/B_-1"
                    feat_vec[feat, output[i]] -= 1
                    feat_vec[feat, label] += 1

    # please limit the number of iterations of training to n iterations
    return feat_vec
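This snippet builds its transition features by hand in several branches. A small hedged helper pair mirroring the key formats used above, `B:<prev>` and `T:<prev2>/<prev1>` with `B_-1` and `B_-2` as start-of-sentence padding:

def bigram_key(tags, i):
    # 'B:<tag at i-1>', padded with B_-1 before the sentence start
    prev = tags[i - 1] if i >= 1 else 'B_-1'
    return 'B:' + prev

def trigram_key(tags, i):
    # 'T:<tag at i-2>/<tag at i-1>', padded with B_-1 / B_-2
    prev1 = tags[i - 1] if i >= 1 else 'B_-1'
    prev2 = tags[i - 2] if i >= 2 else ('B_-1' if i == 1 else 'B_-2')
    return 'T:' + prev2 + '/' + prev1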
Exemple #32
0
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    epoch = 0

    while (epoch < numepochs):
        #print(epoch)
        mistakes = 0
        correct = 0
        #print(len(train_data))
        #sen=0
        for sentence_data in train_data:
            words = []
            postags = []
            truetags = []
            label_list = sentence_data[0]
            feat_list = sentence_data[1]
            for label in label_list:
                (word, postag, chunktag) = label.split(" ")
                words.append(word)
                postags.append(postag)
                truetags.append(chunktag)
            # re-reads the tagset file for every sentence; hoisting this
            # outside the loop would avoid the repeated I/O
            tagset = perc.read_tagset(opts.tagsetfile)
            default_tag = tagset[0]
            argmaxtags = perc.perc_test(feat_vec, label_list, feat_list,
                                        tagset, default_tag)
            feat_index = 0
            i = 0

            for word in words:
                (feat_index, feats_for_this_word) = perc.feats_for_word(
                    feat_index, feat_list)
                # print(len(feats_for_this_word))
                argmax = argmaxtags[i]
                tru = truetags[i]
                if (argmax == tru):
                    i += 1
                    continue
                for f in feats_for_this_word:
                    wrongkey = f, argmax
                    rightkey = f, tru
                    feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                    feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                i += 1
            i = 0

            for word in words:
                argmax = argmaxtags[i]
                tru = truetags[i]
                if (argmax == tru):
                    i += 1
                    correct += 1
                    continue
                else:
                    mistakes += 1
                argmaxprev = "B:"
                truprev = "B:"
                if (i == 0):
                    argmaxprev += "B_-1"
                    truprev += "B_-1"
                else:
                    argmaxprev += argmaxtags[i - 1]
                    truprev += truetags[i - 1]
                wrongkey = argmaxprev, argmax
                rightkey = truprev, tru
                feat_vec[wrongkey] = feat_vec.get(wrongkey, 0) - 1
                feat_vec[rightkey] = feat_vec.get(rightkey, 0) + 1
                i += 1
            #if(sen%1000==0):
            #print(str(sen)+"/"+str(len(train_data)))
            #sen+=1
        #print(mistakes)
        #print(correct)
        epoch += 1
    # please limit the number of iterations of training to n iterations
    return feat_vec
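A hypothetical driver for the standard-signature variants in this collection; `perc.read_tagset`, `perc.perc_test`, and `perc.perc_write_to_file` all appear in the snippets above, while `read_labeled_data` and the file names are assumptions:

tagset = perc.read_tagset('data/tagset.txt')             # shown in the snippets
train_data = perc.read_labeled_data('data/train.txt',
                                    'data/train.feats')  # assumed reader
feat_vec = perc_train(train_data, tagset, 5)             # 5 epochs
perc.perc_write_to_file(feat_vec, 'model')               # shown in the snippets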
Exemple #33
0
def perc_train(train_data, tagset, numepochs, word_set):
    feat_vec = defaultdict(int)
    # insert your code here
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")

    # numepochs = int(50)
    default_tag = tagset[0]
    for t in range(numepochs):
        tmp = 0
        # Count sentence
        print 'Iteration #', t, 'is now being processed.'
        cnt = 0
        for (labeled_list, feat_list) in train_data:
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print 'current status: ', str(round(100.0 * cnt / len(train_data), 2)), '%'
            labels = copy.deepcopy(labeled_list)
            # add in the start and end buffers for the context
            # for every sentence in the training set, iterate numepochs times
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag, word_set)

            feat_index = 0
            # check word by word if the predicted tag is equal to the true tag
            for i, v in enumerate(output):
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                # retrieve the feature for a word
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                
                fields = labels[i].split()
                label = fields[2]

                if i > 0: 
                    label_pre = labels[i-1].split()[2]
                    for feat in feats:
                        if feat[0] == 'B': # for bigram feature
                            feat_out = feat + ":" + output[i-1]  # feat_out is the "B:<previous output>"
                            feat_lab = feat + ":" + label_pre  # feat_lab is the "B:<previous label>"

                            if   output[i-1] != label_pre and output[i] != label:
                                feat_vec[feat_out, output[i]]   -= 1
                                feat_vec[feat_lab, output[i]]   -= 1
                                feat_vec[feat_out, label]       += 1
                                feat_vec[feat_lab, label]       += 1

                            elif output[i-1] == label_pre and output[i] != label:
                                feat_vec[feat_lab, output[i]]   -= 2
                                feat_vec[feat_lab, label]       += 2

                            elif output[i-1] != label_pre and output[i] == label:
                                pass

                            elif output[i-1] == label_pre and output[i] == label:
                                pass

                            # feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                            # feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                            # feat_vec[feat_out, label] = feat_vec[feat_out, label] + 1
                            # feat_vec[feat_lab, output[i]] = feat_vec[feat_lab, output[i]] - 1

                        else: 
                            # for U00 to U22 feature
                            # if the condition does not hold, there is no penalty or reward
                            feat_vec[feat, output[i]] -= 1
                            feat_vec[feat, label]     += 1
                else:  # for i==0 case, all the first word in each sentence
                    label_pre = '_B-1'  # previous label will be denoted by _B-1
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature case
                            feat = feat + ":" + label_pre
                        feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                        feat_vec[feat, label] = feat_vec[feat, label] + 1

                # if i > 0: 
                #     label_pre = labels[i-1].split()[2]
                #     if output[i-1] != label_pre or output[i] != label:
                #         for feat in feats:
                #             if feat[0] == 'B': 
                #             # for bigram feature
                #                 feat_out = feat + ":" + output[i-1]  
                #                 # feat_out is the "B:<previous output>"
                #                 feat_lab = feat + ":" + label_pre  
                #                 # feat_lab is the "B:<previous label>"
                #                 # reward best condition

                #                 feat_vec[feat_lab, label] = feat_vec[feat_lab, label] + 1

                #                 # penalize condition
                #                 feat_vec[feat_out, output[i]] = feat_vec[feat_out, output[i]] - 1
                                
                #             else: 
                #             # for U00 to U22 feature
                #                 feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                #                 feat_vec[feat, label] = feat_vec[feat, label] + 1
                # else:
                #     # for i==0 case, all the first word in each sentence
                #     label_pre = '_B-1'  # previous label will be denoted by _B-1
                #     for feat in feats:
                #         if feat[0] == 'B':  
                #         # bigram feature case
                #             feat = feat + ":" + label_pre
                #         feat_vec[feat, output[i]] = feat_vec[feat, output[i]] - 1
                #         feat_vec[feat, label] = feat_vec[feat, label] + 1

        if t % 3 == 0:
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))

        perc.perc_write_to_file(feat_vec, 'model')
        os.system('python perc.py -m model | python score-chunks.py')

    # please limit the number of iterations of training to n iterations
    return feat_vec