example2 = example.lower()
print(example2)


# In[ ]:


example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)


# In[ ]:


pd.DataFrame([[x, example3.count(x)] for x in set(example3)], columns=['Word', 'Count'])


# Were you able to see everything that changed?
# The process involved:
# - Converting the headline to lowercase letters
# - Splitting the sentence into a list of words
# - Removing punctuation and meaningless words
# - Transforming that list into a table of counts
#
# What started as a relatively "messy" sentence has now become a neatly organized table!
# And while this may not be exactly what goes on behind the scenes with scikit-learn, this example should give you a pretty good idea of how it works (see the sketch below).
# So now that you've seen what the text processing looks like, let's get started on the fun part: modeling!
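# In[ ]:


# Everything above did the counting by hand. As a minimal sketch of the same
# idea, CountVectorizer can lowercase, tokenize, and count in a single
# fit_transform call (this assumes `example` is still in scope; note that
# get_feature_names_out requires scikit-learn >= 1.0, while older releases
# use get_feature_names):

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform([example])  # fit on a one-document "corpus"
for word, count in zip(vectorizer.get_feature_names_out(), counts.toarray()[0]):
    print(word, count)


# ----------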
# encoding=utf-8
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# pd.set_option("display.max_columns", None)
data = pd.read_csv("G:\\datas\\ai\\Combined_News_DJIA.csv")
# print(data.head())
print(type(data))

# Dates before 2015 form the training set; dates after 2014-12-31 form the test set
train = data[data["Date"] < "2015-01-01"]
test = data[data["Date"] > "2014-12-31"]

example = train.iloc[3, 10]
print(example)

# Convert everything to lowercase
example2 = example.lower()
print(example2)

example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)

df = pd.DataFrame([[x, example3.count(x)] for x in set(example3)], columns=["Word", "Count"])
print(df)
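# A quick sanity check on the split (a sketch, using `train` and `test` from
# above): the two date ranges should not overlap, and ISO date strings compare
# correctly as plain strings.
print(train["Date"].min(), "to", train["Date"].max())
print(test["Date"].min(), "to", test["Date"].max())
assert train["Date"].max() < test["Date"].min()
print(len(train), "training rows,", len(test), "test rows")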
# Imports needed by this version of the script (train_test_split,
# FunctionTransformer, and Pipeline are all part of scikit-learn;
# combine_text_columns and SparseInteractions are custom helpers,
# sketched after this block)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

df.index = pd.DatetimeIndex(df.index)

# Set X and y
X = df.drop('Label', axis=1)
y = df['Label']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show tokenized words for the first row
X_combined = combine_text_columns(X)
tokens = CountVectorizer().build_tokenizer()(X_combined.iloc[0])
df_counts = pd.DataFrame([[x, tokens.count(x)] for x in set(tokens)], columns=['Word', 'Count'])
df_counts.sort_values('Count', inplace=True, ascending=False)
print(X.iloc[0].name, '\n')
print(X_combined.iloc[0], '\n')
print(df_counts.head(15), '\n')

# Create a FunctionTransformer to combine the text columns of each row
combine_text_ft = FunctionTransformer(combine_text_columns, validate=False)

# Create pipeline: combine text, count bigrams, add feature interactions, classify
pl = Pipeline([('cmb', combine_text_ft),
               ('vct', CountVectorizer(ngram_range=(2, 2))),
               ('int', SparseInteractions(degree=2)),
               ('clf', LogisticRegression(C=.027, solver='sag'))])
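# The pipeline above depends on two helpers that this script never defines.
# What follows is a minimal sketch of what they might look like; the exact
# behavior of combine_text_columns and SparseInteractions is an assumption
# here, and neither is part of scikit-learn itself.
from itertools import combinations

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin


def combine_text_columns(data_frame):
    """Join all text columns of each row into one space-separated string (assumed helper)."""
    text_data = data_frame.select_dtypes(include=["object"]).fillna("")
    return text_data.apply(lambda row: " ".join(row), axis=1)


class SparseInteractions(BaseEstimator, TransformerMixin):
    """Append element-wise products of feature combinations to a sparse matrix (assumed helper)."""

    def __init__(self, degree=2):
        self.degree = degree

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = sparse.csc_matrix(X)
        new_cols = [X]
        # degree=2 adds all pairwise products; this grows quadratically with
        # the number of features, so it is only practical for small matrices
        for d in range(2, self.degree + 1):
            for idx in combinations(range(X.shape[1]), d):
                col = X[:, [idx[0]]]
                for j in idx[1:]:
                    col = col.multiply(X[:, [j]])
                new_cols.append(col)
        return sparse.hstack(new_cols).tocsr()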
print(data.head())

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

example = train.iloc[3, 10]
print(example)

example2 = example.lower()
print(example2)

example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)

print(pd.DataFrame([[x, example3.count(x)] for x in set(example3)],
                   columns=['Word', 'Count']))

# Join the 25 headline columns (columns 2 through 26) of each training row into one string
trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row, 2:27]))
print(trainheadlines)

# Bag-of-words over the combined headlines, dropping English stop words
basicvectorizer = CountVectorizer(stop_words='english')
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["Label"])
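# The script stops at fitting the basic model. A natural next step, sketched
# here using `test`, `basicvectorizer`, and `basicmodel` from above, is to
# build the test headlines the same way, reuse the fitted vocabulary, and
# compare predictions against the true labels.
testheadlines = []
for row in range(0, len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row, 2:27]))

basictest = basicvectorizer.transform(testheadlines)  # transform, not fit_transform
predictions = basicmodel.predict(basictest)

# Cross-tabulate actual vs. predicted labels
print(pd.crosstab(test["Label"], predictions,
                  rownames=["Actual"], colnames=["Predicted"]))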