# Build a bag-of-words vocabulary from the de-duplicated tweet text
# (Results/unique.txt) using scikit-learn's CountVectorizer.
import os
import sqlite3

from sklearn.feature_extraction.text import CountVectorizer

os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)

startDate = "2011-03-03"
endDate = "2015-03-05"

# First make vocab from reduced features with which we'll train.
# NOTE(review): 'pu' is not imported in this chunk — presumably the
# project's PSQLUtils helper module; confirm the upstream imports.
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con,'MasterData',startDate,endDate, fields=['match_rowid','twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced,maintwitter,on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

# FIX: use a context manager so the file handle is closed even if
# readlines() raises (original used bare open()/close()).
with open('Results/unique.txt') as fin:
    text_train = fin.readlines()

# This is fitting the vocab.
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(text_train)

# Change from sparse matrix to dense matrix.
vect_train = vect_train.todense()
vectoriser.vocabulary_
# Build a bag-of-words vocabulary from the de-duplicated tweet text
# (Results/unique.txt) using scikit-learn's CountVectorizer.
# NOTE(review): this chunk is a near-duplicate of the previous one —
# consider consolidating.
import os
import sqlite3

from sklearn.feature_extraction.text import CountVectorizer

os.getcwd()
os.chdir('/home/phcostello/Documents/workspace/iHubCrowdSourcing')

#if __name__ == '__main__':

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile, detect_types=sqlite3.PARSE_DECLTYPES)
#pu.showTables(con)

startDate = "2011-03-03"
endDate = "2015-03-05"

# First make vocab from reduced features with which we'll train.
# NOTE(review): 'pu' is not imported in this chunk — presumably the
# project's PSQLUtils helper module; confirm the upstream imports.
dfreduced = pu.readDB(con, 'FeaturesReduced2500T10000F', startDate, endDate)
#maintwitter = pu.readDB(con,'MasterData',startDate,endDate, fields=['match_rowid','twitter.text'])
#dfreduced_withtwitter = pd.merge(dfreduced,maintwitter,on='match_rowid')
#pu.toDB(con, dfreduced_withtwitter, 'FeaturesHT')

# FIX: use a context manager so the file handle is closed even if
# readlines() raises (original used bare open()/close()).
with open('Results/unique.txt') as fin:
    text_train = fin.readlines()

# This is fitting the vocab.
vectoriser = CountVectorizer(min_df=1, stop_words='english')
vect_train = vectoriser.fit_transform(text_train)

# Change from sparse matrix to dense matrix.
vect_train = vect_train.todense()
vectoriser.vocabulary_
# Load the reduced-feature table from SQLite and split it into a feature
# frame and the 'Newsworthy' target column.
import sqlite3
import pickle

import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)  # Python 2 builtin reload; file targets Python 2

# Widen column display so long tweet text is fully visible.
# NOTE(review): 'pd' is never bound in this chunk (only pandas.io.sql as
# psql) — presumably 'import pandas as pd' exists upstream; confirm.
pd.set_printoptions(max_colwidth=400)
# BUG FIX: set_option takes the option name as a positional string;
# the original pd.set_option(max_colwidth=400) raises TypeError.
pd.set_option('display.max_colwidth', 400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)

df = PSQLUtils.readDB(con, 'FeaturesReduced3000T10000FwithWords', 0, 0)

# Splitting data to features and target.
len(df.columns)
target = df.pop('Newsworthy')
len(df.columns)
features = df
# Drop id columns — they carry no predictive signal.
features.pop('rowid')
features.pop('match_rowid')

# Find char features: inspect the Python type of each value in row 0.
typeFirstCol = [type(it) for it in features.values[0]]
# Select just the unicodes
# Load the reduced-feature table from SQLite, split it into a feature
# frame and the 'Newsworthy' target, and locate the unicode (text)
# columns for conversion to nominal features.
# NOTE(review): near-duplicate of the previous chunk — consider
# consolidating.
import sqlite3
import pickle

from sklearn import datasets, svm, metrics
import pandas.io.sql as psql
import PSQLUtils
reload(PSQLUtils)  # Python 2 builtin reload; file targets Python 2

# Widen column display so long tweet text is fully visible.
# NOTE(review): 'pd' is never bound in this chunk (only pandas.io.sql as
# psql) — presumably 'import pandas as pd' exists upstream; confirm.
pd.set_printoptions(max_colwidth=400)
# BUG FIX: set_option takes the option name as a positional string;
# the original pd.set_option(max_colwidth=400) raises TypeError.
pd.set_option('display.max_colwidth', 400)

path = "/home/phcostello/Documents/Data/iHub/S3_RawData/"
dbfile = "CrowdSourcingData.sqlite"
con = sqlite3.connect(path + dbfile)
PSQLUtils.showTables(con, display=True)

df = PSQLUtils.readDB(con, 'FeaturesReduced3000T10000FwithWords', 0, 0)

# Splitting data to features and target.
len(df.columns)
target = df.pop('Newsworthy')
len(df.columns)
features = df
# Drop id columns — they carry no predictive signal.
features.pop('rowid')
features.pop('match_rowid')

# Find char features: inspect the Python type of each value in row 0.
typeFirstCol = [type(it) for it in features.values[0]]
# Select just the unicodes (Python 2 'unicode' type).
indofunicode = typeFirstCol.index(unicode)
indofunicode
## ConvertThese to nominal