# NOTE(review): this chunk arrived collapsed onto a single physical line; the
# original `def` header of the tokenizer was lost in the paste. The name and
# signature below are reconstructed from the body (it takes a byte-string
# `txt` and uses module-level `nlp`) -- confirm against version control.
def tokenize(txt):
    ''' Takes a text and returns a list of tokens '''
    # Python-2 style: `txt` is a byte string, decoded defensively so that
    # non-ASCII bytes are dropped rather than raising. Newline pseudo-tokens
    # emitted by spaCy for line breaks are filtered out.
    return [tx
            for tx in (t.text for t in nlp(u'' + txt.decode('ascii', errors='ignore')))
            if tx != '\n']


if __name__ == '__main__':
    log('Checking data integrity...')
    data_integrity()

    # Build the (task-specific) word-vector box. `.index()` was already
    # commented out in the original -- kept disabled here.
    log('Building word vectors from {}'.format(WV_FILE))
    gb = GloVeBox(WV_FILE)
    gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)  # .index()

    # Same build for the global (domain-independent) vectors.
    log('Building global word vectors from {}'.format(GLOBAL_WV_FILE))
    global_gb = GloVeBox(GLOBAL_WV_FILE)
    global_gb.build(zero_token=True, normalize_variance=False, normalize_norm=True)  # .index()

    # FIX: the original passed open(...) directly to pickle.dump, leaking the
    # file handles; `with` guarantees they are flushed and closed.
    log('writing GloVeBox pickle...')
    with open(WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as out:
        pickle.dump(gb, out, pickle.HIGHEST_PROTOCOL)
    with open(GLOBAL_WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as out:
        pickle.dump(global_gb, out, pickle.HIGHEST_PROTOCOL)

    log('Getting training examples')
    train_neg = get_data(positive=False)
    train_pos = get_data()

    train, test = {}, {}
# NOTE(review): this chunk arrived collapsed onto a single physical line and
# has been re-formatted; statement order is preserved exactly.

# Fixed geometry used when padding/truncating documents downstream -- the
# exact consumers are not visible in this chunk.
WORDS_PER_SENTENCE = 20
SENTENCES_PER_PARAGRAPH = 20
WV_FILE = './data/wv/glove.42B.300d.120000.txt'

# spaCy import is deliberately deferred to after the log call so the (slow)
# model load is visible in the log; kept mid-file as in the original.
log('Importing spaCy...')
from spacy.en import English
log('Initializing spaCy...')
nlp = English()

if __name__ == '__main__':
    log('Building word vectors from {}'.format(WV_FILE))
    gb = GloVeBox(WV_FILE)
    gb.build(zero_token=True).index()

    # FIX: the original passed open(...) directly to pickle.dump, leaking the
    # file handle; `with` guarantees it is flushed and closed.
    log('writing GloVeBox pickle...')
    with open(WV_FILE.replace('.txt', '-glovebox.pkl'), 'wb') as out:
        pickle.dump(gb, out, pickle.HIGHEST_PROTOCOL)

    # FIX: renamed the loop-local `file` (shadowed the builtin) to `fh`.
    # NOTE(review): pickles are opened in text mode here -- harmless on
    # Python 2, but these should be 'rb' if the code is ever ported to 3.
    log('Loading train and test pickles...')
    with open(TRAIN_FILE) as fh:
        [train_reviews, train_labels] = pickle.load(fh)
    with open(DEV_FILE) as fh:
        [dev_reviews, dev_labels] = pickle.load(fh)
    with open(TEST_FILE) as fh:
        [test_reviews, test_labels] = pickle.load(fh)

    # Merge train and dev
    train_reviews.extend(dev_reviews)