Beispiel #1
0
def lemmatize_article(article):
    '''
    INPUT: article (str) - raw text from the article (where text has been lowered and punctuation removed already)
    OUTPUT: lemmatized_article - article text with all stopwords removed and the remaining text lemmatized
    '''
    # Load in stopwords from load_data
    stopwords = stop_words()
    # Load Dictionary to fix commonly mislemmatized words
    correct_lemma = fix_lemmatized_words()
    # Lemmatize article by running each word through the pattern.en lemmatizer and only including it in the resulting text if the word doesn't appear in the set of stopwords
    article = ' '.join([en.lemma(w) for w in article.split() if w not in stopwords])
    # Return the article text after fixing common mislemmatized words
    return ' '.join([correct_lemma[w] if w in correct_lemma else w for w in article.split()])
Beispiel #2
0
def create_document_vector(df, max_features=5000, max_df=1, min_df=1):
    '''
    INPUTS: df - df['lemmatized_text'] will be what is vectorized
            max_features - number of words to be kept in the TfidfVector
            max_df - Cut off for words appearing in a given threshold of documents. (i.e. 1 = no limit, 0.95 will exclude words appearing in at least 95% of documents from being included in the resulting vector)
            min_df - Cut off for words appearing in a minimum number of documents. (i.e. 2 = term must appear in at least two documents)
    OUTPUT: TfidfVector (X)
            Feature Names Array
            Reverse_lookup Dictionary - Used to quickly and efficiently return the index of a given word in the Feature Names Array
    '''
    stopwords = stop_words()
    # Create TfidfVectorizer
    tfid = TfidfVectorizer(input='content', stop_words=stopwords, use_idf=True, lowercase=True, max_features=max_features, max_df=max_df, min_df=min_df)
    X = tfid.fit_transform(df['lemmatized_text'].values)
    feature_names = np.array(tfid.get_feature_names())
    reverse_lookup = {word: idx for idx, word in enumerate(feature_names)}
    return X, feature_names, reverse_lookup