# Example #1
# 0
    def build_feature_vectors(self, verbose=True, test_specific=(None, None), debug=False):
        vec_length = None
        word2vec_dict = truth.loadword2vecs()
        all_pos_tags = truth.extract_data_from_file(truth.EACH_UNIQUE_POS_FILE)  # We only want to import this file once.

        print 'Building feature vectors...'

        all_dep_names = list(self.sentences.get_all_dependencies()) + [vpe.EMPTY_DEP]
        lemma_list = self.sentences.get_frequent_lemmas(limit=100)
        dep_names = ('prep','nsubj','dobj','nmod','adv','conj','vmod','amod','csubj')
        print len(dep_names),dep_names

        bar = ProgBar(len(self.train_triggers) + len(self.val_triggers) + len(self.test_triggers))
        for trigger in self.train_triggers + self.val_triggers + self.test_triggers:
            if not (test_specific[0] and test_specific[1]) \
                    or (test_specific[0] == trigger.sentnum and test_specific[1] == trigger.wordnum):

                alignment_matrix(self.sentences, trigger, word2vec_dict, all_dep_names, lemma_list,
                                 dep_names=dep_names, pos_tags=all_pos_tags, debug=debug)

                if trigger == self.train_triggers[0]:
                    vec_length = len(trigger.gold_ant.x)
                    print 'Feature vector length: %d' % vec_length

                if vec_length:
                    assert len(trigger.gold_ant.x) == vec_length
                    for ant in trigger.possible_ants:
                        assert len(ant.x) == vec_length

                bar.update()

        return
# Example #2
# 0
    return word_vecs


def get_vectors(vocab, bin_location=None):
    """Load word2vec embeddings for the words in *vocab*.

    Parameters:
        vocab        -- iterable of words to look up.
        bin_location -- optional path to the GoogleNews binary vectors file;
                        defaults to the Mac location used during development.

    Returns the dict produced by ``load_bin_vec`` (word -> vector).
    """
    if bin_location is None:
        # Mac bin location (default).
        bin_location = '/Users/kian/Documents/HONOR/project/word2vec/GoogleNews-vectors-negative300.bin'
        # PC bin location (raw string so backslashes are not escapes):
        # bin_location = r'C:\Users\Kian\Sura\project\word2vec\GoogleNews-vectors-negative300.bin'
    return load_bin_vec(bin_location, vocab, len(vocab))


start = time.clock()
print 'Getting vectors...'
vocab = extract_data_from_file(EACH_UNIQUE_WORD_FILE)
print 'Vocabulary size', len(vocab)
dics = get_vectors(vocab)
print 'Got vectors!'
print 'Time taken: ', (time.clock() - start)
print 'Writing new file...'
f = open(SVM_FILE_LOCATIONS + WORD2VEC_FILE, 'w')
for k in dics:
    f.write('%s' % k)
    for val in dics[k]:
        f.write(',%f' % val)
    f.write('\n')
f.close()
print 'Finished writing new file!'
# Example #3
# 0
                vocab.remove(word)
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)

    return word_vecs

def get_vectors(vocab, bin_location=None):
    """Load word2vec embeddings for the words in *vocab*.

    Parameters:
        vocab        -- iterable of words to look up.
        bin_location -- optional path to the GoogleNews binary vectors file;
                        defaults to the Mac location used during development.

    Returns the dict produced by ``load_bin_vec`` (word -> vector).
    """
    if bin_location is None:
        # Mac bin location (default).
        bin_location = '/Users/kian/Documents/HONOR/project/word2vec/GoogleNews-vectors-negative300.bin'
        # PC bin location (raw string so backslashes are not escapes):
        # bin_location = r'C:\Users\Kian\Sura\project\word2vec\GoogleNews-vectors-negative300.bin'
    return load_bin_vec(bin_location, vocab, len(vocab))

start = time.clock()
print 'Getting vectors...'
vocab = extract_data_from_file(EACH_UNIQUE_WORD_FILE)
print 'Vocabulary size',len(vocab)
dics = get_vectors(vocab)
print 'Got vectors!'
print 'Time taken: ',(time.clock()-start)
print 'Writing new file...'
f = open(SVM_FILE_LOCATIONS+WORD2VEC_FILE, 'w')
for k in dics:
    f.write('%s'%k)
    for val in dics[k]:
        f.write(',%f'%val)
    f.write('\n')
f.close()
print 'Finished writing new file!'