print "Number of bigram types extracted = ", len(typesDictBigrams) print "Number of unigram types extracted = ", len(typesDictUnigrams) #To print the types and their total number of occurrences... uncomment for speed #for key in typesDict: # print key+" : ",typesDict[key]," "; import extractTrigramFeatureVecX #importing file for extracting trigram feature vector from training data import extractBigramFeatureVecX #importing file for extracting bigram feature vector X from training data import extractFeatureVecX #importing file for extracting unigram feature vector from training data print "Feature Vector Extraction Started" XFeaturesUnigrams = extractFeatureVecX.extractFeatureVecX( filename, startColIdx, typesDictUnigrams) XFeaturesBigrams = extractBigramFeatureVecX.extractBigramFeatureVecX( filename, startColIdx, typesDictBigrams) XFeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX( filename, startColIdx, typesDictTrigrams) #combining the two feature vectors below XFeatures = [] Ndata = len(XFeaturesUnigrams) #number of sentences in training data lUTypes = len(XFeaturesUnigrams[0]) #number of unigram types lBTypes = len(XFeaturesBigrams[0]) #number of bigram types lTTypes = len(XFeaturesTrigrams[0]) #number of trigram types
filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest.tsv" startColIdx = 3 typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx) typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx) print "Number of trigram types extracted = ", len(typesDictTrigrams) print "Number of unigram types extracted = ", len(typesDictUnigrams) import extractTrigramFeatureVecX #importing file for extracting trigram feature vector from training data import extractFeatureVecX #importing file for extracting unigram feature vector from training data print "Feature Vector Extraction Started" XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX( filename, startColIdx, typesDictUnigrams) XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX( filename, startColIdx, typesDictTrigrams) #combining the two feature vectors below Xfeatures = [] Ndata = len(XfeaturesUnigrams) #number of sentences in training data lUTypes = len(XfeaturesUnigrams[0]) #number of unigram types lTTypes = len(XfeaturesTrigrams[0]) #number of trigram types for i in range(0, NData): XFeatures.append(XFeaturesUnigrams[i]) for j in range(0, lTTypes): XFeatures[i].append(XfeaturesTrigrams[i][j]) #combining of feature vectors finished
#filename = "addedTest.tsv"; filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest.tsv"; startColIdx = 3; typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx); typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx); print "Number of trigram types extracted = ", len(typesDictTrigrams); print "Number of unigram types extracted = ", len(typesDictUnigrams); import extractTrigramFeatureVecX; #importing file for extracting trigram feature vector from training data import extractFeatureVecX; #importing file for extracting unigram feature vector from training data print "Feature Vector Extraction Started"; XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDictUnigrams); XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(filename, startColIdx, typesDictTrigrams); #combining the two feature vectors below Xfeatures=[]; Ndata=len(XfeaturesUnigrams)#number of sentences in training data lUTypes=len(XfeaturesUnigrams[0]); #number of unigram types lTTypes=len(XfeaturesTrigrams[0]); #number of trigram types for i in range (0,NData): XFeatures.append(XFeaturesUnigrams[i]); for j in range (0,lTTypes): XFeatures[i].append(XfeaturesTrigrams[i][j]);#combining of feature vectors finished #print "Feature Vector of size ", len(Xfeatures), " extracted"; labelIdx = 2;
#filename = "test_input.tsv"; filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest_new.tsv"; startColIdx = 3; typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx); typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx); print "Number of trigram types extracted = ", len(typesDictTrigrams); print "Number of unigram types extracted = ", len(typesDictUnigrams); import extractTrigramFeatureVecX; #importing file for extracting trigram feature vector from training data import extractFeatureVecX; #importing file for extracting unigram feature vector from training data print "Feature Vector Extraction Started"; XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDictUnigrams); XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(filename, startColIdx, typesDictTrigrams); #combining the two feature vectors below Xfeatures=[]; Ndata=len(XfeaturesUnigrams)#number of sentences in training data lUTypes=len(XfeaturesUnigrams[0]); #number of unigram types lTTypes=len(XfeaturesTrigrams[0]); #number of trigram types for i in range (0,Ndata): XFeatures.append(XFeaturesUnigrams[i]); for j in range (0,lTTypes): XFeatures[i].append(XfeaturesTrigrams[i][j]);#combining of feature vectors finished #print "Feature Vector of size ", len(Xfeatures), " extracted"; labelIdx = 2;
typesDict = extractTypes.extractTypes(filename, startColIdx) #The line below does NOT work! #OrderedDict(sorted(typesDict.items(), key=lambda t: t[0])); print "Number of types extracted = ", len(typesDict) #To print the types and their total number of occurrences... uncomment for speed #for key in typesDict: # print key+" : ",typesDict[key]," "; import extractFeatureVecX #importing file for extracting feature vector X from training data print "Feature Vector X Extraction Started" Xfeatures = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDict) print "Feature Vector X of size ", len(Xfeatures), " extracted" import addPOStagsFeature #print "Adding NounNum to Xfeatures ", len(Xfeatures); #Xfeatures = addPOStagsFeature.addNounNum(filename, Xfeatures); #print "Adding VerbNum to Xfeatures ", len(Xfeatures); #Xfeatures = addPOStagsFeature.addVerbNum(filename, Xfeatures#); #print "Adding AdjAdvNum to Xfeatures ", len(Xfeatures); #Xfeatures = addPOStagsFeature.addAdjAdvNum(filename, Xfeatures); #print "Adding NounAdjRatio to Xfeatures ", len(Xfeatures); #Xfeatures = addPOStagsFeature.addNounAdjRatio(filename, Xfeatures);
startColIdx = 3; typesDict = extractTypes.extractTypes(filename, startColIdx); #The line below does NOT work! #OrderedDict(sorted(typesDict.items(), key=lambda t: t[0])); print "Number of types extracted = ", len(typesDict); #To print the types and their total number of occurrences... uncomment for speed #for key in typesDict: # print key+" : ",typesDict[key]," "; import extractFeatureVecX; #importing file for extracting feature vector X from training data print "Feature Vector X Extraction Started"; Xfeatures = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDict); print "Feature Vector X of size ", len(Xfeatures), " extracted"; labelIdx = 2; import handleClassLabels; print "Class Label Vector Y Extraction Started"; YLabels = handleClassLabels.extractClassLabels(filename, labelIdx); print "Class Label Vector Y of size ", len(YLabels), " extracted"; #Setting up scaler for standardisation from sklearn import preprocessing scaler = preprocessing.StandardScaler(); import numpy as np #from sklearn.cross_validation import StratifiedKFold from sklearn.grid_search import GridSearchCV