def build_feature_vectors(self, verbose=True, test_specific=(None, None), debug=False):
    """Build the feature vector (`.x`) for every trigger's antecedents.

    Iterates over all train/val/test triggers and calls alignment_matrix()
    on each, which populates trigger.gold_ant.x and trigger.possible_ants[i].x.

    test_specific -- a (sentnum, wordnum) pair; when BOTH are truthy, only
                     the trigger at that position is processed.
    debug        -- forwarded to alignment_matrix().
    NOTE(review): `verbose` is accepted but never read in the visible body
    — confirm against callers before removing it.
    NOTE(review): this block was recovered from whitespace-mangled source;
    the indentation below is a reconstruction, tokens are unchanged.
    """
    vec_length = None
    word2vec_dict = truth.loadword2vecs()
    all_pos_tags = truth.extract_data_from_file(truth.EACH_UNIQUE_POS_FILE)  # We only want to import this file once.
    print 'Building feature vectors...'
    all_dep_names = list(self.sentences.get_all_dependencies()) + [vpe.EMPTY_DEP]
    lemma_list = self.sentences.get_frequent_lemmas(limit=100)
    # Restricted dependency set actually passed to alignment_matrix.
    dep_names = ('prep', 'nsubj', 'dobj', 'nmod', 'adv', 'conj', 'vmod', 'amod', 'csubj')
    print len(dep_names), dep_names
    # Progress bar over every trigger in all three splits.
    bar = ProgBar(len(self.train_triggers) + len(self.val_triggers) + len(self.test_triggers))
    for trigger in self.train_triggers + self.val_triggers + self.test_triggers:
        # Process every trigger, unless a specific (sentnum, wordnum) was requested.
        if not (test_specific[0] and test_specific[1]) \
                or (test_specific[0] == trigger.sentnum and test_specific[1] == trigger.wordnum):
            alignment_matrix(self.sentences, trigger, word2vec_dict, all_dep_names, lemma_list,
                             dep_names=dep_names, pos_tags=all_pos_tags, debug=debug)
            # The first training trigger fixes the expected vector length.
            if trigger == self.train_triggers[0]:
                vec_length = len(trigger.gold_ant.x)
                print 'Feature vector length: %d' % vec_length
            # Sanity check: every antecedent's vector matches that length.
            if vec_length:
                assert len(trigger.gold_ant.x) == vec_length
                for ant in trigger.possible_ants:
                    assert len(ant.x) == vec_length
        # NOTE(review): bar.update() placement reconstructed — assumed once
        # per trigger (inside the loop, outside the filter) — confirm.
        bar.update()
    return
# --- tail of load_bin_vec(): its 'def' line lies before this chunk; the
# --- indentation below is reconstructed from the mangled source.
    return word_vecs


def get_vectors(vocab):
    """Return the word2vec embedding dict for `vocab` from the local
    GoogleNews binary (path is machine-specific; see comments)."""
    # Mac Bin location
    bin_location = '/Users/kian/Documents/HONOR/project/word2vec/GoogleNews-vectors-negative300.bin'
    # PC Bin location
    #bin_location = 'C:\Users\Kian\Sura\project\word2vec\GoogleNews-vectors-negative300.bin'
    return load_bin_vec(bin_location, vocab, len(vocab))  #extractdatafromfile(EACH_UNIQUE_WORD_FILE))


# Script body: look up a vector for every vocabulary word and dump them to
# a CSV-style file, one line per word: word,val1,val2,...
start = time.clock()
print 'Getting vectors...'
vocab = extract_data_from_file(EACH_UNIQUE_WORD_FILE)
print 'Vocabulary size', len(vocab)
dics = get_vectors(vocab)
print 'Got vectors!'
print 'Time taken: ', (time.clock() - start)
print 'Writing new file...'
f = open(SVM_FILE_LOCATIONS + WORD2VEC_FILE, 'w')
for k in dics:
    f.write('%s' % k)
    for val in dics[k]:
        f.write(',%f' % val)
    f.write('\n')
f.close()
print 'Finished writing new file!'
# --- tail of load_bin_vec(): the 'def' line, the read loop and the 'if'
# --- that this 'else' pairs with all lie before this chunk; indentation
# --- below is reconstructed — verify the if/else pairing against the head.
            vocab.remove(word)
            # Decode binary_len raw bytes from the .bin file into float32s.
            word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
        else:
            # Word not in the wanted vocab: skip over its vector bytes.
            f.read(binary_len)
    return word_vecs


# NOTE(review): everything below appears to be a duplicate of the
# get_vectors() definition and dump script that already occur earlier in
# this file — confirm and delete one copy.
def get_vectors(vocab):
    """Return the word2vec embedding dict for `vocab` from the local
    GoogleNews binary (path is machine-specific; see comments)."""
    # Mac Bin location
    bin_location = '/Users/kian/Documents/HONOR/project/word2vec/GoogleNews-vectors-negative300.bin'
    # PC Bin location
    #bin_location = 'C:\Users\Kian\Sura\project\word2vec\GoogleNews-vectors-negative300.bin'
    return load_bin_vec(bin_location, vocab, len(vocab))  #extractdatafromfile(EACH_UNIQUE_WORD_FILE))


# Script body: look up a vector for every vocabulary word and dump them to
# a CSV-style file, one line per word: word,val1,val2,...
start = time.clock()
print 'Getting vectors...'
vocab = extract_data_from_file(EACH_UNIQUE_WORD_FILE)
print 'Vocabulary size', len(vocab)
dics = get_vectors(vocab)
print 'Got vectors!'
print 'Time taken: ', (time.clock() - start)
print 'Writing new file...'
f = open(SVM_FILE_LOCATIONS + WORD2VEC_FILE, 'w')
for k in dics:
    f.write('%s' % k)
    for val in dics[k]:
        f.write(',%f' % val)
    f.write('\n')
f.close()
print 'Finished writing new file!'