from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# LemmaTokenizer, StemTokenizer, and WordNetVectorizer are assumed to be
# defined elsewhere in this project; a sketch of LemmaTokenizer is given
# after the function.


def size_mb(docs):
    # Size of a collection of strings in megabytes. Assumed helper; this
    # matches the size_mb used in scikit-learn's document classification
    # example.
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6


def get_X_train(data, wn=False, ignore=False, max_n_gram=1, lowercase=True,
                nopunc=False, lemmatize=False, stem=False,
                remove_stop_words=True, tfidf=False, verbose=True):
    """Vectorize raw training documents and return (X_train, vectorizer)."""

    if verbose:
        print('Using n-grams of up to %d words in length' % max_n_gram)

    if lowercase and verbose:
        print('Converting all text to lowercase')

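    # Optionally swap in a lemmatizing or stemming tokenizer; None keeps the
    # vectorizer's default regexp tokenizer.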
    if lemmatize:
        tokenizer = LemmaTokenizer(nopunc)
        if verbose:
            print('Lemmatizing all words')
    elif stem:
        tokenizer = StemTokenizer(nopunc)
        if verbose:
            print('Stemming all words')
    else:
        tokenizer = None

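    # Passing the string 'english' tells scikit-learn to use its built-in
    # English stop word list.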
    if remove_stop_words:
        stop_words = 'english'
        if verbose:
            print('Removing English stop words')
    else:
        stop_words = None

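    # Fit the chosen vectorizer and transform the documents, timing the step.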
    t0 = time()
    if tfidf:
        if verbose:
            print()
            print('Extracting features from the training data using a TF-IDF vectorizer')
        vectorizer = TfidfVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram))
        X_train = vectorizer.fit_transform(data)
    else:
        if verbose:
            print('Extracting features from the training data using a count vectorizer')
        vectorizer = CountVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram))
        if wn:
            if verbose:
                print('Learning a vocabulary dictionary with a count vectorizer')
            vectorizer.fit(data)
            if verbose:
                print('Done learning vocabulary dictionary')
            # Wrap the fitted vocabulary in the WordNet-based vectorizer.
            vectorizer = WordNetVectorizer(vectorizer)
            if verbose:
                print('Getting WordNet-based feature vectors...')
            X_train = vectorizer.get_word_net_feature_vecs(data, ignore)
            if verbose:
                print('Done getting WordNet-based feature vectors')
        else:
            X_train = vectorizer.fit_transform(data)
    duration = time() - t0
    if verbose:
        data_train_size_mb = size_mb(data)
        print('done in %fs at %0.3fMB/s' % (duration, data_train_size_mb / duration))
        print('n_samples: %d, n_features: %d' % X_train.shape)
        print()
    return X_train, vectorizer
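

# For reference: the LemmaTokenizer used above is defined elsewhere in this
# project, but a minimal version, adapted from the custom-tokenizer example
# in the scikit-learn docs, could look like the sketch below. The nopunc
# handling is an assumption about what that flag does (dropping
# punctuation-only tokens); requires NLTK with the punkt and wordnet data.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer:
    def __init__(self, nopunc=False):
        self.wnl = WordNetLemmatizer()
        self.nopunc = nopunc

    def __call__(self, doc):
        tokens = word_tokenize(doc)
        if self.nopunc:
            # Drop tokens that are pure punctuation.
            tokens = [t for t in tokens if t.isalnum()]
        return [self.wnl.lemmatize(t) for t in tokens]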
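

# A minimal usage sketch (not part of the original source): extract TF-IDF
# features with unigrams and bigrams from a toy corpus. Assumes
# scikit-learn >= 1.0 for get_feature_names_out().
if __name__ == '__main__':
    corpus = [
        'The quick brown fox jumps over the lazy dog.',
        'A lazy dog naps in the sun all afternoon.',
        'Foxes are quick and surprisingly clever animals.',
    ]
    X_train, vectorizer = get_X_train(corpus, max_n_gram=2, tfidf=True)
    print(vectorizer.get_feature_names_out()[:10])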