vect_test = vectoriser.transform(list(text_test))
vect_test = vect_test.todense()
vect_test
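#Note: transform() reuses the vocabulary fitted on the training text, so the
#test matrix has the same columns in the same order; tokens unseen during
#fitting are simply ignored.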


#Make into df
#vocabulary_ is a dict of {token: column index}; use get_feature_names() to get
#the column labels in column order
colnames = vectoriser.get_feature_names()
df_features_reduced = pd.DataFrame(vect_test, columns=colnames)
df_features_reduced.head()

#Merge back to original df
dfreduced_added = pd.merge(dfreduced,df_features_reduced,how='inner',left_index=True,right_index=True)
dfreduced_added.info()

len(dfreduced_added.columns)
#Write to db - has a problem: SQLite must be recompiled with a higher column limit
dfreduced_added = dfreduced_added.drop('twitter.text', axis=1)
pu.toDB(con, dfreduced_added, 'features_training3000T10000FwithWords')
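#Note on the column-limit problem mentioned above: SQLite's default
#SQLITE_MAX_COLUMN is 2000 (a compile-time limit that can be raised to at most
#32767), so writing a frame this wide fails on a stock build. A minimal guard
#one could run before pu.toDB, as a sketch:
if len(dfreduced_added.columns) > 2000:
    print 'frame has {} columns, over the default SQLite limit'.format(len(dfreduced_added.columns))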
dfreduced_added.columns
pu.to_weka(dfreduced_added, outfile='features_reduced.csv')
    


#Evaluate and look at predicted tf

    #Output just match_rowid + text + t|f
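    #A possible sketch for the step above (hedged: 'df_predicted' and its
    #'predicted' column are hypothetical names, not part of this snippet):
    #out = df_predicted[['match_rowid', 'twitter.text', 'predicted']]
    #out.to_csv('Results/predicted_tf.csv', index=False, encoding='utf-8')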
    
    
    
    
Example #2
import os
import sqlite3
import PSQLUtils as pu   #assumed alias; pu.readDB/pu.toDB come from PSQLUtils
from sklearn.feature_extraction.text import CountVectorizer
os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)

startDate = "2011-03-03"
endDate = "2015-03-05"
#First make vocab from reduced features with which we'll train
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con,'MasterData',startDate,endDate, fields=['match_rowid','twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced,maintwitter,on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

fin = open('Results/unique.txt')
text_train = fin.readlines()
fin.close()

#This is fitting vocab
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(text_train)
#Change from sparse matrix to dense matrix
vect_train = vect_train.todense()
vectoriser.vocabulary_
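#vocabulary_ is a dict mapping each token to its column index in the count
#matrix; for the feature names in column order use:
feature_names = vectoriser.get_feature_names()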
Example #3
import os
import pandas as pd
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)
import sqlite3

import pickle



pd.set_option('display.max_colwidth', 400)
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)
df = PSQLUtils.readDB(con,'FeaturesReduced3000T10000FwithWords', 0,0)


#Splitting data to features and target
len(df.columns)
target = df.pop('Newsworthy')
len(df.columns)
features = df
features.pop('rowid')
features.pop('match_rowid')



#Find char features
typeFirstCol = [type(it) for it in features.values[0]]
from sklearn.feature_extraction.text import CountVectorizer
os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':
        
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path+ dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)

startDate = "2011-03-03"
endDate = "2015-03-05"
#First make vocab from reduced features with which we'll train
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con,'MasterData',startDate,endDate, fields=['match_rowid','twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced,maintwitter,on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

fin = open('Results/unique.txt')
text_train = fin.readlines()
fin.close()

#This is fitting vocab
vectoriser = CountVectorizer(min_df=1,stop_words='english')
vect_train = vectoriser.fit_transform(text_train)
#Change from sparse matrix to dense matrix
vect_train = vect_train.todense()
vectoriser.vocabulary_
Example #5
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics

import pandas as pd
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)
import sqlite3

import pickle

pd.set_option('display.max_colwidth', 400)
path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)
df = PSQLUtils.readDB(con, 'FeaturesReduced3000T10000FwithWords', 0, 0)

#Splitting data to features and target
len(df.columns)
target = df.pop('Newsworthy')
len(df.columns)
features = df
features.pop('rowid')
features.pop('match_rowid')

#Find char features
typeFirstCol = [type(it) for it in features.values[0]]
#Select just the unicodes
indofunicode = typeFirstCol.index(unicode)
indofunicode
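#A possible next step (a sketch, not part of the original snippet): gather every
#unicode-typed column and drop it so only numeric features go to the classifier.
unicode_cols = [features.columns[i] for i, t in enumerate(typeFirstCol) if t == unicode]
features_numeric = features.drop(unicode_cols, axis=1)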
Example #6
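#The function below uses small helpers (countlist, isTF, wordcount,
#replaceNullwithZero) that are not shown in this snippet. Minimal sketches,
#assuming the semantics implied by how they are used, might look like:
def countlist(x):
    #count items in a comma-separated string list; empty/missing -> 0 (assumed format)
    if x is None or x == '':
        return 0
    return len(str(x).split(','))

def isTF(x):
    #'t' if the field is non-empty/true-ish, 'f' otherwise (assumed encoding)
    return 'f' if x in (None, '', 0, False, u'false') else 't'

def wordcount(x):
    #number of whitespace-separated tokens in the tweet text
    return len(x.split()) if x else 0

def replaceNullwithZero(x):
    #replace empty/missing numeric values with zero
    return 0 if x in (None, '') else x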
def create_user_based_features(data, output_type='return', outputpath=None, out_table_name='features_user_based'):
    if output_type not in ('weka', 'sqlite', 'return'):
        raise ValueError('output_type {} not supported'.format(output_type))
    
    #count items in string lists
    fieldstocount = ['twitter.links',\
    'twitter.mentions',\
    'twitter.hashtags']
    counts = data[fieldstocount].applymap(countlist)
    counts['match_rowid'] = data['match_rowid']
    
    #Check fields that are t/f
    fieldsTF = ['twitter.links' ,\
                'twitter.user.verified',\
                'twitter.mentions',\
                'twitter.user.geo_enabled']
    data[fieldsTF].head()
    truefalse = data[fieldsTF].applymap(isTF)
    truefalse['match_rowid']=data['match_rowid']
    
    
    #Create twitter age
    twitterAgeFields = ['twitter.created_at' ,\
                'twitter.user.created_at']
    #Convert the unicode date strings to years
    data[twitterAgeFields].head()
    data[twitterAgeFields].ix[0,0]
    dt = pd.to_datetime(data[twitterAgeFields].ix[0,0])
    #dt = data[twitterAgeFields].applymap(pd.to_datetime)
    #Dates are read in as unicode strings of the form u'2013-03-19 13:00:16+00:00'
    dt = data[twitterAgeFields].applymap(lambda x: int(x.split('-')[0])) #convert date string to int year
    dt.ix[0,0]                                     
    twitterage = pd.DataFrame()
    twitterage= dt['twitter.created_at'] - dt['twitter.user.created_at'] 
    twitterage = pd.DataFrame(twitterage)
    twitterage.columns = ['twitterage']
    twitterage['match_rowid']=data['match_rowid']
    #print len(twitterage)
    #twitterage.head()
    
    #Word count
    wordcounts = pd.DataFrame(data['twitter.text']).applymap(wordcount)
    wordcounts['match_rowid'] = data['match_rowid']
    wordcounts.columns= ['wordcounts','match_rowid']
    #print len(wordcounts)
    #wordcounts.head()
    
    rawfields = ['match_rowid',
    'twitter.user.listed_count',\
    'klout.score',\
    'twitter.user.statuses_count',\
    'twitter.user.followers_count',\
    'twitter.user.friends_count',\
    'language.confidence',\
    'twitter.user.lang',\
    'Newsworthy']
    
    untouchedfields = data[ rawfields]
    features = pd.DataFrame()
    features = pd.merge(counts,truefalse ,on= 'match_rowid')
    features = pd.merge(features,twitterage ,on= 'match_rowid')
    features = pd.merge(features,wordcounts ,on= 'match_rowid')
    features = pd.merge(features,untouchedfields ,on= 'match_rowid')
    print len(features)
    
    #Add column names
    colnames = ['links_number','mentions_number','Hashtags_number',\
                        'match_rowid','links_exist','user_verified','mentions_exist',\
                        'geo_location_exist', 'twitter_age','wordcounts']
    
    colnames += rawfields[1:] #Drop match_rowid from rawfields names, 1st element
    features.columns = colnames
    
    #Replace empty values in numeric data with zero, e.g. for count vars
    numericfields = rawfields[:-2] #all fields except 'twitter.user.lang' and 'Newsworthy'
    features[numericfields] = features[numericfields].applymap(replaceNullwithZero)
    
    #Add twitter text at the end
    features = pd.merge(features,data[['twitter.text','match_rowid']] ,on= 'match_rowid')
    
    if output_type == 'weka':
        con.close()
        #Write to weka friendly csv using custom function above
        outfile = outputpath + 'CSV/{}.csv'.format(out_table_name)
        put.to_weka(features, outfile )
        
        
    elif output_type == 'sqlite':
        #Write to db
        cur = con.cursor()
        sql = 'DROP TABLE "main"."{}"'.format(out_table_name)
        try:
            cur.execute(sql)
        except sqlite3.OperationalError as e:
            print 'got error {}'.format(e)
        con.commit()
        psql.write_frame(features, out_table_name , con)#, append='replace')
        con.commit()
        con.close()
        
    elif output_type == 'return':
        con.close()
        return features