Exemple #1
0
# Report how many distinct bigram and unigram types were collected.
# typesDictBigrams and typesDictUnigrams are not assigned in this part of the
# script, so they must already exist when these lines run.
print "Number of bigram types extracted = ", len(typesDictBigrams)
print "Number of unigram types extracted = ", len(typesDictUnigrams)

#To print the types and their total number of occurrences (left commented out for speed)
#for key in typesDict:
#    print key+" : ",typesDict[key]," ";

import extractTrigramFeatureVecX
#importing file for extracting trigram feature vector from training data
import extractBigramFeatureVecX
#importing file for extracting bigram feature vector X from training data
import extractFeatureVecX
#importing file for extracting unigram feature vector from training data
print "Feature Vector Extraction Started"

# One extraction call per n-gram order, all reading the same input file.
# filename, startColIdx and typesDictTrigrams are also expected to be set
# before this point; they are not assigned above these calls.
XFeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(
    filename, startColIdx, typesDictUnigrams)
XFeaturesBigrams = extractBigramFeatureVecX.extractBigramFeatureVecX(
    filename, startColIdx, typesDictBigrams)
XFeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(
    filename, startColIdx, typesDictTrigrams)

#set-up for combining the feature vectors: XFeatures starts empty and the
#row count and per-vector widths are recorded from the first row of each
#NOTE(review): no combining loop follows these assignments in this section
XFeatures = []
Ndata = len(XFeaturesUnigrams)  #number of sentences in training data
lUTypes = len(XFeaturesUnigrams[0])
#number of unigram types
lBTypes = len(XFeaturesBigrams[0])
#number of bigram types
lTTypes = len(XFeaturesTrigrams[0])
#number of trigram types
filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest.tsv"
startColIdx = 3

typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx)
typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx)

print "Number of trigram types extracted = ", len(typesDictTrigrams)
print "Number of unigram types extracted = ", len(typesDictUnigrams)

import extractTrigramFeatureVecX
#importing file for extracting trigram feature vector from training data
import extractFeatureVecX
#importing file for extracting unigram feature vector from training data

print "Feature Vector Extraction Started"
XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(
    filename, startColIdx, typesDictUnigrams)
XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(
    filename, startColIdx, typesDictTrigrams)

#combining the two feature vectors below
Xfeatures = []
Ndata = len(XfeaturesUnigrams)  #number of sentences in training data
lUTypes = len(XfeaturesUnigrams[0])
#number of unigram types
lTTypes = len(XfeaturesTrigrams[0])
#number of trigram types
for i in range(0, NData):
    XFeatures.append(XFeaturesUnigrams[i])
    for j in range(0, lTTypes):
        XFeatures[i].append(XfeaturesTrigrams[i][j])
        #combining of feature vectors finished
#filename = "addedTest.tsv";
filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest.tsv";
startColIdx = 3;

typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx);
typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx);

print "Number of trigram types extracted = ", len(typesDictTrigrams);
print "Number of unigram types extracted = ", len(typesDictUnigrams);

import extractTrigramFeatureVecX; #importing file for extracting trigram feature vector from training data
import extractFeatureVecX; #importing file for extracting unigram feature vector from training data

print "Feature Vector Extraction Started";
XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDictUnigrams);
XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(filename, startColIdx, typesDictTrigrams);

#combining the two feature vectors below
Xfeatures=[];
Ndata=len(XfeaturesUnigrams)#number of sentences in training data 
lUTypes=len(XfeaturesUnigrams[0]); #number of unigram types
lTTypes=len(XfeaturesTrigrams[0]); #number of trigram types
for i in range (0,NData):
    XFeatures.append(XFeaturesUnigrams[i]);
    for j in range (0,lTTypes):
        XFeatures[i].append(XfeaturesTrigrams[i][j]);#combining of feature vectors finished

#print "Feature Vector of size ", len(Xfeatures), " extracted";

labelIdx = 2;
#filename = "test_input.tsv";
filename = "../rawdata/train/twitter-train-cleansed-B_rmnotav_ADDEDtest_new.tsv";
startColIdx = 3;

typesDictTrigrams = extractTrigrams.extractTrigrams(filename, startColIdx);
typesDictUnigrams = extractTypes.extractTypes(filename, startColIdx);

print "Number of trigram types extracted = ", len(typesDictTrigrams);
print "Number of unigram types extracted = ", len(typesDictUnigrams);

import extractTrigramFeatureVecX; #importing file for extracting trigram feature vector from training data
import extractFeatureVecX; #importing file for extracting unigram feature vector from training data

print "Feature Vector Extraction Started";
XfeaturesUnigrams = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDictUnigrams);
XfeaturesTrigrams = extractTrigramFeatureVecX.extractTrigramFeatureVecX(filename, startColIdx, typesDictTrigrams);

#combining the two feature vectors below
Xfeatures=[];
Ndata=len(XfeaturesUnigrams)#number of sentences in training data 
lUTypes=len(XfeaturesUnigrams[0]); #number of unigram types
lTTypes=len(XfeaturesTrigrams[0]); #number of trigram types
for i in range (0,Ndata):
    XFeatures.append(XFeaturesUnigrams[i]);
    for j in range (0,lTTypes):
        XFeatures[i].append(XfeaturesTrigrams[i][j]);#combining of feature vectors finished

#print "Feature Vector of size ", len(Xfeatures), " extracted";

labelIdx = 2;
# Collect the types found in `filename` (columns from startColIdx on are
# handed to the extractor) and build the feature vectors X from them.
# filename and startColIdx are whatever values are current when this runs.
typesDict = extractTypes.extractTypes(filename, startColIdx)

#The line below does NOT work!
#NOTE(review): as written the expression's result is discarded, so typesDict
#would be left unchanged even if the line were uncommented.
#OrderedDict(sorted(typesDict.items(), key=lambda t: t[0]));

print "Number of types extracted = ", len(typesDict)

#To print the types and their total number of occurrences (left commented out for speed)
#for key in typesDict:
#    print key+" : ",typesDict[key]," ";

import extractFeatureVecX
#importing file for extracting feature vector X from training data
print "Feature Vector X Extraction Started"
Xfeatures = extractFeatureVecX.extractFeatureVecX(filename, startColIdx,
                                                  typesDict)
print "Feature Vector X of size ", len(Xfeatures), " extracted"

# Optional POS-tag based features. The module is imported, but every call
# below is currently disabled; each one would replace Xfeatures with the
# value returned by the corresponding addPOStagsFeature function.
import addPOStagsFeature
#print "Adding NounNum to Xfeatures ", len(Xfeatures);
#Xfeatures = addPOStagsFeature.addNounNum(filename, Xfeatures);

#print "Adding VerbNum to Xfeatures ", len(Xfeatures);
#Xfeatures = addPOStagsFeature.addVerbNum(filename, Xfeatures);

#print "Adding AdjAdvNum to Xfeatures ", len(Xfeatures);
#Xfeatures = addPOStagsFeature.addAdjAdvNum(filename, Xfeatures);

#print "Adding NounAdjRatio to Xfeatures ", len(Xfeatures);
#Xfeatures = addPOStagsFeature.addNounAdjRatio(filename, Xfeatures);
Exemple #6
0
# Extract the type dictionary, the feature vectors X and the class labels Y
# from `filename`, then set up the scaler and model-selection imports.
# filename is not assigned in this section and must already be defined.
# NOTE(review): presumably the index of the first text column - confirm
startColIdx = 3;

typesDict = extractTypes.extractTypes(filename, startColIdx);

#The line below does NOT work!
#NOTE(review): as written the expression's result is discarded, so typesDict
#would be left unchanged even if the line were uncommented.
#OrderedDict(sorted(typesDict.items(), key=lambda t: t[0]));

print "Number of types extracted = ", len(typesDict);

#To print the types and their total number of occurrences (left commented out for speed)
#for key in typesDict:
#    print key+" : ",typesDict[key]," ";

import extractFeatureVecX; #importing file for extracting feature vector X from training data
print "Feature Vector X Extraction Started";
Xfeatures = extractFeatureVecX.extractFeatureVecX(filename, startColIdx, typesDict);
print "Feature Vector X of size ", len(Xfeatures), " extracted";

# Class labels are read from the same file, using labelIdx to select them.
# NOTE(review): presumably labelIdx is the label's column index - confirm
labelIdx = 2;
import handleClassLabels;
print "Class Label Vector Y Extraction Started";
YLabels = handleClassLabels.extractClassLabels(filename, labelIdx);
print "Class Label Vector Y of size ", len(YLabels), " extracted";

#Setting up scaler for standardisation
#(the scaler is only constructed here; it is not fitted or applied in this section)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler();

import numpy as np
#from sklearn.cross_validation import StratifiedKFold
#NOTE(review): sklearn.grid_search is a deprecated module path; newer
#scikit-learn releases expose GridSearchCV from sklearn.model_selection -
#confirm against the installed version before upgrading.
from sklearn.grid_search import GridSearchCV