vect_test = vectoriser.transform(list(text_test))  # text_test: held-out text, loaded elsewhere
vect_test = vect_test.todense()
vect_test

# Make into a DataFrame. Use get_feature_names() for the column order;
# vocabulary_ is an unordered term -> index dict.
colnames = vectoriser.get_feature_names()
df_features_reduced = pd.DataFrame(vect_test, columns=colnames)
df_features_reduced.head()

# Merge back to the original df
dfreduced_added = pd.merge(dfreduced, df_features_reduced, how='inner',
                           left_index=True, right_index=True)
dfreduced_added.info()
len(dfreduced_added.columns)

# Write to db - has a problem: have to recompile sqlite with a higher column limit
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'features_training3000T10000FwithWords')
dfreduced_added.columns
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')

# TODO: Evaluate and look at the predicted t/f
# TODO: Output just match_rowid + text + t|f
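# A minimal sketch of the two TODOs above, assuming a classifier `clf` has
# already been fit on vect_train (e.g. the sklearn svm imported later) and
# that df_test holds the held-out rows with their 'match_rowid' and
# 'twitter.text' columns. `clf` and `df_test` are illustrative names, not
# from the codebase.
predicted = clf.predict(vect_test)
results = pd.DataFrame({'match_rowid': df_test['match_rowid'],
                        'text': df_test['twitter.text'],
                        'predicted_tf': predicted})
print results.head()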
import os
import sqlite3
import pandas as pd
import PSQLUtils as pu  # project utility module (readDB/toDB/to_weka); alias assumed
from sklearn.feature_extraction.text import CountVectorizer

os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)
startDate = "2011-03-03"
endDate = "2015-03-05"

# First make the vocab from the reduced features with which we'll train
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con, 'MasterData', startDate, endDate, fields=['match_rowid', 'twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced, maintwitter, on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

fin = open('Results/unique.txt')
text_train = fin.readlines()
fin.close()

# Fit the vocabulary
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(text_train)
# Change from sparse matrix to dense matrix
vect_train = vect_train.todense()
vectoriser.vocabulary_
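# Note: vectoriser.vocabulary_ is a dict mapping term -> column index, not an
# ordered list of names, so get_feature_names() is what gives columns in
# matrix order. A fitted vocabulary can also be passed straight back into a
# new CountVectorizer to guarantee aligned columns elsewhere; a small sketch:
feature_names = vectoriser.get_feature_names()  # list, in column order
vect_same = CountVectorizer(stop_words='english',
                            vocabulary=vectoriser.vocabulary_)
# vect_same.transform(...) now yields columns aligned with vect_train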
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
import pandas as pd
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)
import sqlite3
import pickle

pd.set_option('max_colwidth', 400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)

df = PSQLUtils.readDB(con, 'FeaturesReduced3000T10000FwithWords', 0, 0)

# Split the data into features and target
len(df.columns)
target = df.pop('Newsworthy')
len(df.columns)
features = df
features.pop('rowid')
features.pop('match_rowid')

# Find char features: the type of each value in the first row
typeFirstCol = [type(it) for it in features.values[0]]
# Select the unicode (string) columns; .index() gives only the first match
indofunicode = typeFirstCol.index(unicode)
indofunicode
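# The sklearn imports above point at the next step; a minimal sketch, assuming
# we drop the unicode (string) columns so only numeric features remain. The
# split fraction and SVC gamma are illustrative values.
char_cols = [features.columns[i] for i, t in enumerate(typeFirstCol)
             if t == unicode]
numeric = features.drop(char_cols, axis=1)

n_train = int(0.8 * len(numeric))
clf = svm.SVC(gamma=0.001)
clf.fit(numeric.values[:n_train], target.values[:n_train])
predicted = clf.predict(numeric.values[n_train:])
print metrics.classification_report(target.values[n_train:], predicted)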
def create_user_based_features(data, output_type='return', outputpath=None,
                               out_table_name='features_user_based'):
    # Uses the module-level `con` and the helpers countlist, isTF, wordcount
    # and replaceNullwithZero defined elsewhere in this module.
    if output_type not in set(['weka', 'sqlite', 'return']):
        raise ValueError('output_type {} not supported'.format(output_type))

    # Count items in string-list fields
    fieldstocount = ['twitter.links',
                     'twitter.mentions',
                     'twitter.hashtags']
    counts = data[fieldstocount].applymap(countlist)
    counts['match_rowid'] = data['match_rowid']

    # Check fields that are t/f
    fieldsTF = ['twitter.links',
                'twitter.user.verified',
                'twitter.mentions',
                'twitter.user.geo_enabled']
    truefalse = data[fieldsTF].applymap(isTF)
    truefalse['match_rowid'] = data['match_rowid']

    # Create twitter age. Dates are read in as unicode strings of the form
    # u'2013-03-19 13:00:16+00:00', so take the integer year from each.
    twitterAgeFields = ['twitter.created_at',
                        'twitter.user.created_at']
    #dt = data[twitterAgeFields].applymap(pd.to_datetime)
    dt = data[twitterAgeFields].applymap(lambda x: int(x.split('-')[0]))
    twitterage = pd.DataFrame(dt['twitter.created_at'] - dt['twitter.user.created_at'])
    twitterage.columns = ['twitterage']
    twitterage['match_rowid'] = data['match_rowid']

    # Word count of the tweet text
    wordcounts = pd.DataFrame(data['twitter.text']).applymap(wordcount)
    wordcounts['match_rowid'] = data['match_rowid']
    wordcounts.columns = ['wordcounts', 'match_rowid']

    # Fields passed through untouched
    rawfields = ['match_rowid',
                 'twitter.user.listed_count',
                 'klout.score',
                 'twitter.user.statuses_count',
                 'twitter.user.followers_count',
                 'twitter.user.friends_count',
                 'language.confidence',
                 'twitter.user.lang',
                 'Newsworthy']
    untouchedfields = data[rawfields]

    features = pd.merge(counts, truefalse, on='match_rowid')
    features = pd.merge(features, twitterage, on='match_rowid')
    features = pd.merge(features, wordcounts, on='match_rowid')
    features = pd.merge(features, untouchedfields, on='match_rowid')
    print len(features)

    # Add column names
    colnames = ['links_number', 'mentions_number', 'Hashtags_number',
                'match_rowid', 'links_exist', 'user_verified', 'mentions_exist',
                'geo_location_exist', 'twitter_age', 'wordcounts']
    colnames += rawfields[1:]  # drop match_rowid, the first element of rawfields
    features.columns = colnames

    # Replace empty values in the numeric columns with zero, e.g. for count vars
    numfields = rawfields[:-2]
    features[numfields] = features[numfields].applymap(replaceNullwithZero)

    # Add twitter text at the end
    features = pd.merge(features, data[['twitter.text', 'match_rowid']], on='match_rowid')

    if output_type == 'weka':
        con.close()
        # Write to a Weka-friendly csv using the custom function above
        outfile = outputpath + 'CSV/{}.csv'.format(out_table_name)
        pu.to_weka(features, outfile)
    elif output_type == 'sqlite':
        # Write to db, dropping any existing table first
        cur = con.cursor()
        sql = 'DROP TABLE "main"."{}"'.format(out_table_name)
        try:
            cur.execute(sql)
        except sqlite3.OperationalError as e:
            print 'got error {}'.format(e)
        con.commit()
        psql.write_frame(features, out_table_name, con)
        con.commit()
        con.close()
    elif output_type == 'return':
        con.close()
        return features
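# Example usage: a minimal sketch, assuming the module-level `con`,
# `startDate` and `endDate` from above, and that the source table holds all
# the columns the function references (the table name here is illustrative).
if __name__ == '__main__':
    data = pu.readDB(con, 'MasterData', startDate, endDate)
    user_features = create_user_based_features(data, output_type='return')
    print user_features.head()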