Exemple #1
0
# Replace the training feature matrix with its dense form.
# NOTE(review): presumably `vect_train` is the SciPy sparse output of the
# vectoriser's fit step, which happens before this excerpt - confirm.
vect_train = vect_train.todense()
# Bare expression: shows the vocabulary in an interactive session and has no
# effect when run as a script.
vectoriser.vocabulary_

# Clean the tweet text in a single pass: lower-case it, then remove the '#'
# and '@' characters.
text_test = dfreduced['twitter.text'].apply(
    lambda tweet: tweet.lower().replace('#', '').replace('@', ''))

# Run the cleaned tweets through the vectoriser and densify the result.
# NOTE(review): `vectoriser` is presumably fitted earlier in the file - confirm.
vect_test = vectoriser.transform(text_test.tolist()).todense()
vect_test  # bare expression: interactive display only

# Make into df
# `vocabulary_` is a term -> column-index mapping. Iterating the dict does not
# follow the column order of `vect_test`, so order the terms by their index to
# label each column with the term it actually counts.
colnames = sorted(vectoriser.vocabulary_, key=vectoriser.vocabulary_.get)
# Rows of `vect_test` are positionally aligned with `text_test`; reuse its index
# so the index-based merge below matches every row even when `dfreduced` does
# not carry a default 0..n-1 index.
df_features_reduced = pd.DataFrame(vect_test,
                                   columns=colnames,
                                   index=text_test.index)
df_features_reduced.head()  # bare expression: interactive display only

# Merge back to original df (index-on-index join)
dfreduced_added = pd.merge(dfreduced,
                           df_features_reduced,
                           how='inner',
                           left_index=True,
                           right_index=True)
dfreduced_added.info()

len(dfreduced_added.columns)  # bare expression: interactive display only
# Write to db - known problem: the frame has more columns than the stock SQLite
# build accepts, so SQLite has to be recompiled with a higher column limit.
# The raw text column is dropped before export. `axis` is passed by keyword
# because newer pandas releases no longer accept it positionally.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'FeaturesReduced3000T10000FwithWords')
dfreduced_added.columns  # bare expression: interactive display only
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')
# Run the current `text_test` tweets through the vectoriser again and densify
# the result in one step.
vect_test = vectoriser.transform(text_test.tolist()).todense()
vect_test  # bare expression: interactive display only


# Make into df
# `vocabulary_` is a term -> column-index mapping. Iterating the dict does not
# follow the column order of `vect_test`, so order the terms by their index to
# label each column with the term it actually counts.
colnames = sorted(vectoriser.vocabulary_, key=vectoriser.vocabulary_.get)
# Rows of `vect_test` are positionally aligned with `text_test`; reuse its index
# so the index-based merge below matches every row even when `dfreduced` does
# not carry a default 0..n-1 index.
df_features_reduced = pd.DataFrame(vect_test,
                                   columns=colnames,
                                   index=text_test.index)
df_features_reduced.head()  # bare expression: interactive display only

# Merge back to original df (index-on-index join)
dfreduced_added = pd.merge(dfreduced,
                           df_features_reduced,
                           how='inner',
                           left_index=True,
                           right_index=True)
dfreduced_added.info()

len(dfreduced_added.columns)  # bare expression: interactive display only
# Write to db - known problem: the frame has more columns than the stock SQLite
# build accepts, so SQLite has to be recompiled with a higher column limit.
# The raw text column is dropped before export. `axis` is passed by keyword
# because newer pandas releases no longer accept it positionally.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
# NOTE(review): this table is named "training" although the frame is built from
# `dfreduced` / `text_test` - confirm the intended source data.
pu.toDB(con, dfreduced_added, 'features_training3000T10000FwithWords')
dfreduced_added.columns  # bare expression: interactive display only
# NOTE(review): same output file name as the other export calls in this file -
# verify that reusing it is intended.
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')
    


#Evaluate and look at predicted tf

    #Output just match_rowid + text + t|f
    
    
    
    
# Remove the '#' and '@' characters from every tweet in a single pass.
text_test = text_test.apply(
    lambda tweet: tweet.replace('#', '').replace('@', ''))

# Run the cleaned tweets through the vectoriser and densify the result.
vect_test = vectoriser.transform(text_test.tolist()).todense()
vect_test  # bare expression: interactive display only


# Make into df
# `vocabulary_` is a term -> column-index mapping. Iterating the dict does not
# follow the column order of `vect_test`, so order the terms by their index to
# label each column with the term it actually counts.
colnames = sorted(vectoriser.vocabulary_, key=vectoriser.vocabulary_.get)
# Rows of `vect_test` are positionally aligned with `text_test`; reuse its index
# so the index-based merge below matches every row even when `dfreduced` does
# not carry a default 0..n-1 index.
df_features_reduced = pd.DataFrame(vect_test,
                                   columns=colnames,
                                   index=text_test.index)
df_features_reduced.head()  # bare expression: interactive display only

# Merge back to original df (index-on-index join)
dfreduced_added = pd.merge(dfreduced,
                           df_features_reduced,
                           how='inner',
                           left_index=True,
                           right_index=True)
dfreduced_added.info()

len(dfreduced_added.columns)  # bare expression: interactive display only
# Write to db - known problem: the frame has more columns than the stock SQLite
# build accepts, so SQLite has to be recompiled with a higher column limit.
# The raw text column is dropped before export. `axis` is passed by keyword
# because newer pandas releases no longer accept it positionally.
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'FeaturesReduced3000T10000FwithWords')
dfreduced_added.columns  # bare expression: interactive display only
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')