from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# LemmaTokenizer, StemTokenizer, WordNetVectorizer, and size_mb are
# project-local helpers assumed to be defined elsewhere in this package.


def get_X_train(data, wn=False, ignore=False, max_n_gram=1, lowercase=True,
                nopunc=False, lemmatize=False, stem=False,
                remove_stop_words=True, tfidf=False, verbose=True):
    """Vectorize the training documents and return (X_train, vectorizer)."""
    if verbose:
        print('Using n-grams of up to %d words in length' % max_n_gram)
    if lowercase and verbose:
        print('Converting all text to lowercase')

    # Choose a tokenizer: lemmatization takes precedence over stemming;
    # with neither, the vectorizer's default tokenizer is used.
    if lemmatize:
        tokenizer = LemmaTokenizer(nopunc)
        if verbose:
            print('Lemmatizing all words')
    elif stem:
        tokenizer = StemTokenizer(nopunc)
        if verbose:
            print('Stemming all words')
    else:
        tokenizer = None

    if remove_stop_words:
        stop_words = 'english'
        if verbose:
            print('Removing English stop words')
    else:
        stop_words = None

    t0 = time()
    if tfidf:
        if verbose:
            print()
            print('Extracting features from the training data using a '
                  'tfidf vectorizer')
        vectorizer = TfidfVectorizer(lowercase=lowercase, tokenizer=tokenizer,
                                     stop_words=stop_words,
                                     ngram_range=(1, max_n_gram))
        X_train = vectorizer.fit_transform(data)
    else:
        if verbose:
            print('Extracting features from the training data using a '
                  'count vectorizer')
        vectorizer = CountVectorizer(lowercase=lowercase, tokenizer=tokenizer,
                                     stop_words=stop_words,
                                     ngram_range=(1, max_n_gram))
        if wn:
            # Fit the count vectorizer first to learn the vocabulary, then
            # wrap it in a WordNet-based vectorizer to build the features.
            if verbose:
                print('Learning a vocabulary dictionary with a count '
                      'vectorizer')
            vectorizer.fit(data)
            if verbose:
                print('Done learning vocabulary dictionary')
            vectorizer = WordNetVectorizer(vectorizer)
            if verbose:
                print('Getting wordnet based feature vectors...')
            X_train = vectorizer.get_word_net_feature_vecs(data, ignore)
            if verbose:
                print('Done getting wordnet based feature vectors')
        else:
            X_train = vectorizer.fit_transform(data)

    duration = time() - t0
    if verbose:
        data_train_size_mb = size_mb(data)
        print('done in %fs at %0.3fMB/s'
              % (duration, data_train_size_mb / duration))
        print('n_samples: %d, n_features: %d' % X_train.shape)
        print()

    return X_train, vectorizer
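

# A minimal usage sketch with hypothetical sample documents: extract tf-idf
# features over unigrams and bigrams, then inspect the matrix shape.
# verbose=False is used here so the sketch does not depend on the
# project-local size_mb helper.
if __name__ == '__main__':
    docs = ['The quick brown fox jumps over the lazy dog.',
            'A lazy dog naps while the fox runs.']
    X_train, vectorizer = get_X_train(docs, max_n_gram=2, tfidf=True,
                                      verbose=False)
    print('n_samples: %d, n_features: %d' % X_train.shape)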