Example #1
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# `example` is a single headline string pulled from the DataFrame earlier
# in the notebook; start by converting it to lowercase.
example2 = example.lower()
print(example2)


# In[ ]:


example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)


# In[ ]:


pd.DataFrame([[x, example3.count(x)] for x in set(example3)], columns=['Word', 'Count'])


# Were you able to see everything that changed?  
# The process involved:  
# - Converting the headline to lowercase letters  
# - Splitting the sentence into a list of words  
# - Removing punctuation and one-letter tokens  
# - Transforming that list into a table of counts

# What started as a relatively "messy" sentence has now become a neatly organized table!  
# And while this may not be exactly what goes on behind the scenes with scikit-learn, this example should give you a pretty good idea about how it works.
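
# As a quick sanity check (a sketch, not part of the original walkthrough), CountVectorizer can reproduce the same table in a single `fit_transform` call on `example2`; `get_feature_names_out` assumes scikit-learn >= 1.0 (older versions use `get_feature_names`).

# In[ ]:


vectorizer = CountVectorizer()
counts = vectorizer.fit_transform([example2])  # 1 x vocabulary sparse matrix
pd.DataFrame({'Word': vectorizer.get_feature_names_out(),
              'Count': counts.toarray()[0]})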

# So now that you've seen what the text processing looks like, let's get started on the fun part, modeling!

# ----------
Example #2
# encoding=utf-8
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# pd.set_option("display.max_columns",None)
data = pd.read_csv("G:\\datas\\ai\\Combined_News_DJIA.csv")
# print(data.head())
print(type(data))
train = data[data["Date"] < "2015-01-01"]
test = data[data["Date"] > "2014-12-31"]  # hold out the dates after the training window

example = train.iloc[3, 10]  # one headline cell (row 4, column "Top9") as a working example
print(example)

# Convert the whole headline to lowercase
example2 = example.lower()
print(example2)

example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)

df = pd.DataFrame([[x, example3.count(x)] for x in set(example3)], columns=["Word", "Count"])
print(df)
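
# A small follow-up sketch: sorting the count table surfaces the most
# frequent tokens first (the same sort_values idea appears in Example #3).
print(df.sort_values("Count", ascending=False).head(10))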
Example #3
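# This fragment calls two helpers defined elsewhere in the original script.
# Below is a minimal, hedged sketch of both: the exact signatures and the
# 'Label' drop column are assumptions, and SparseInteractions follows the
# custom transformer of the same name from the DrivenData "box-plots for
# education" benchmark code.
from itertools import combinations

import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


def combine_text_columns(data_frame, to_drop=('Label',)):
    """Join all text columns of each row into one space-separated string."""
    text_data = data_frame.drop(columns=[c for c in to_drop
                                         if c in data_frame.columns])
    return text_data.fillna('').astype(str).apply(' '.join, axis=1)


class SparseInteractions(BaseEstimator, TransformerMixin):
    """Append elementwise products of feature combinations to a sparse matrix."""

    def __init__(self, degree=2):
        self.degree = degree

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = sparse.csc_matrix(X)
        blocks = [X]
        for deg in range(2, self.degree + 1):
            for idx in combinations(range(X.shape[1]), deg):
                col = X[:, idx[0]]
                for j in idx[1:]:
                    col = col.multiply(X[:, j])
                blocks.append(col)
        return sparse.hstack(blocks).tocsr()
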
    df.index = pd.DatetimeIndex(df.index)

    # Set X and y
    X = df.drop('Label', axis=1)
    y = df['Label']

    # Split data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Show tokenized words for the first row
    X_combined = combine_text_columns(X)
    tokenizer = CountVectorizer().build_tokenizer()(X_combined.iloc[0])
    df = pd.DataFrame([[x, tokenizer.count(x)] for x in set(tokenizer)],
                      columns=['Word', 'Count'])
    df.sort_values('Count', inplace=True, ascending=False)
    print(X.iloc[0].name, '\n')
    print(X_combined.iloc[0], '\n')
    print(df.head(15), '\n')

    # Create a FunctionTransformer to combine text columns in a row
    combine_text_ft = FunctionTransformer(combine_text_columns, validate=False)

    # Create pipeline
    pl = Pipeline([('cmb', combine_text_ft),
                   ('vct', CountVectorizer(ngram_range=(2, 2))),
                   ('int', SparseInteractions(degree=2)),
                   ('clf', LogisticRegression(C=.027, solver='sag'))])
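
    # Minimal usage sketch (assumption): fit the pipeline on the raw training
    # frame and score it on the held-out split created above; the pipeline
    # handles text combination, vectorization, and interactions internally.
    # Note: degree-2 interactions over a bigram vocabulary can be very
    # memory-hungry on a large corpus.
    pl.fit(X_train, y_train)
    print('Test accuracy:', pl.score(X_test, y_test))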
Example #4
# `data` is the Combined_News_DJIA DataFrame loaded as in Example #2.
print(data.head())

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

example = train.iloc[3, 10]
print(example)

example2 = example.lower()
print(example2)

example3 = CountVectorizer().build_tokenizer()(example2)
print(example3)

print(
    pd.DataFrame([[x, example3.count(x)] for x in set(example3)],
                 columns=['Word', 'Count']))

# Combine the 25 daily headline columns (Top1..Top25) into one string per day
trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row, 2:27]))

print(trainheadlines)

basicvectorizer = CountVectorizer(stop_words='english')  # drop common English stop words
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train["Label"])
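
# Minimal next-step sketch (assumption, mirroring the training side): apply
# the fitted vectorizer to the held-out headlines and tabulate predictions.
testheadlines = []
for row in range(0, len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row, 2:27]))
basictest = basicvectorizer.transform(testheadlines)
predictions = basicmodel.predict(basictest)
print(pd.crosstab(test["Label"], predictions,
                  rownames=["Actual"], colnames=["Predicted"]))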