# Example no. 1
0
def _featurize_split(sentences, pos_dict, window):
    """Build one labelled feature vector per token of every sentence.

    Args:
        sentences: sequence of sentences; each sentence is a list of token
            records indexed as ``record[0]`` (word), ``record[1]`` (POS) and
            ``record[2]`` (tag).
        pos_dict: sorted POS inventory handed to ``Features.vectorize_pos``.
        window: number of neighbouring tokens taken on each side of the
            current token as context.

    Returns:
        A list with one vector per token.  Each vector is the concatenation
        of the ``Features`` outputs followed by a final label element:
        ``1`` when the tag equals ``"met"``, otherwise ``0``.
    """
    vecs = []
    for sent in sentences:
        for i, word_data in enumerate(sent):
            word = word_data[0]
            pos = word_data[1]
            tag = word_data[2]

            # Symmetric context around position i.  ``end`` is an exclusive
            # slice bound, so it is clamped to len(sent) (not len(sent) - 1);
            # otherwise the right side would be one token shorter than the
            # left and the sentence-final token would never appear as context.
            start = max(0, i - window)
            end = min(len(sent), i + window + 1)
            context = [w[0] for w in sent[start:i] + sent[i + 1:end]]

            # NOTE(review): ``+=`` extends the object returned by
            # Features.ic in place -- assumes every Features.* call returns
            # a fresh list; confirm against the Features module.
            current_vec = Features.ic(word)
            current_vec += Features.ic_average(context)
            current_vec += Features.vectorize_pos(pos, pos_dict)
            current_vec += Features.skip_w2v(context, word_data)
            current_vec += Features.word_w2v(word)

            # The label is stored as the last element of the vector.
            current_vec += [1] if tag == "met" else [0]

            vecs.append(current_vec)
    return vecs


def featurize(dataset, window=3):
    """Split ``dataset`` positionally and featurize the train and test parts.

    The first 3/5 of ``dataset`` is the training split, the next 1/5 the
    validation split and the final 1/5 the test split.  No shuffling is done
    here, so callers that need a random split must shuffle beforehand.  The
    sizes of the training and validation splits are printed; the validation
    split itself is neither featurized nor returned.

    Args:
        dataset: list of sentences, each a list of token records
            (word, POS, tag, ...).
        window: context size on each side of a token (default 3).

    Returns:
        A ``(train_vectors, test_vectors)`` tuple of per-token feature
        vectors, each ending in a 0/1 label.  The POS inventory used for
        both splits is built from the training split only.
    """
    total = len(dataset)
    train_end = int(total * (3. / 5.))
    validation_end = int(total * (4. / 5.))

    train_data = dataset[0:train_end]
    validation_data = dataset[train_end:validation_end]
    test_data = dataset[validation_end:]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(train_data))
    print(len(validation_data))

    pos_dict = sorted(Features.get_pos_dict(train_data).keys())

    # Train is featurized before test, matching the original processing order.
    train_vecs = _featurize_split(train_data, pos_dict, window)
    test_vecs = _featurize_split(test_data, pos_dict, window)
    return train_vecs, test_vecs