def GetXYVocab(NumpSamples=-1):
    """Build the training matrix, labels and vocabulary from the NRC files.

    Reads ';'-separated "sample;label" lines from 'AllNRCFeats.txt' and the
    matching '^'-separated sentence rows from 'ForNRCFeats.csv', drops every
    row whose label maps to 0 (per GetYFromStringLabels), and stacks
    L2-normalised bag-of-words counts with the per-sentence lexicon features.

    Args:
        NumpSamples: Maximum number of leading rows to read from each file.
            A negative value (the default, -1) or None means "use all rows".
            Previously the value was used directly as a slice bound, so the
            default of -1 silently dropped the last row of both files.

    Returns:
        A tuple (X, Y, Vocab, CountVecter):
        X is the horizontally stacked feature matrix (counts + lexicon
        features), Y is the output of GetYFromStringLabels for the kept rows,
        Vocab is the vectorizer's feature names followed by the nine lexicon
        feature names, and CountVecter is the fitted CountVectorizer.

    NOTE(review): both input files are assumed to be row-aligned (row i of
    one corresponds to row i of the other) - confirm against the data.
    """
    # Negative / None -> no limit. Slicing with None keeps every row, whereas
    # slicing with -1 would cut off the final one.
    if NumpSamples is None or NumpSamples < 0:
        Limit = None
    else:
        Limit = NumpSamples

    StemmedLexicons = LoadStemmedLex()

    # Context managers guarantee the handles are closed; iterating the file
    # object replaces the long-deprecated xreadlines().
    with open('AllNRCFeats.txt') as FeatsFile:
        Lines = [Row.strip() for Row in FeatsFile][:Limit]
    with open('ForNRCFeats.csv') as SentFile:
        Sentences = [
            ''.join(Row.strip().split('^')[:-2]) for Row in SentFile
        ][:Limit]

    # Keep only the rows whose numeric label is non-zero.
    Labels = [Row.split(';')[-1] for Row in Lines]
    YY = GetYFromStringLabels(Labels)
    KeptIdx = [i for i, y in enumerate(YY) if y != 0]
    new_Lines = [Lines[i] for i in KeptIdx]
    new_Sentences = [Sentences[i] for i in KeptIdx]

    LexFeats = csr_matrix(
        np.array([GetLexFeats(Sent, StemmedLexicons) for Sent in new_Sentences])
    )
    print('loaded lexicon features of shape {}'.format(LexFeats.shape))

    # Each kept line is expected to split into exactly (sample, label).
    Samples, new_Labels = zip(*[tuple(Row.split(';')) for Row in new_Lines])
    Y = GetYFromStringLabels(new_Labels)
    print('loaded {} samples'.format(len(Samples)))
    print('Label dist:  {}'.format(Counter(Y)))

    CountVecter = CountVectorizer(
        lowercase=True,
        dtype=np.float64,
        encoding='utf-8',
        tokenizer=None,
        max_features=20000,
    )
    X = CountVecter.fit_transform(Samples)
    X = Normalizer().fit_transform(X)
    print('shape of X matrix before adding lex feats {}'.format(X.shape))

    X = hstack([X, LexFeats])
    print('shape of X matrix after adding lex feats {}'.format(X.shape))
    print('*' * 80)

    # Column names: bag-of-words features first, then the lexicon features in
    # the same order they were appended to X.
    LexFeatNames = [
        'HLPos', 'HLNeg', 'HLSum',
        'NrcPos', 'NrcNeg', 'NrcSum',
        'SubjPos', 'SubjNeg', 'SubjSum',
    ]
    Vocab = CountVecter.get_feature_names() + LexFeatNames
    print('number of vocabulary (adding lex feats) {}'.format(len(Vocab)))
    print('*' * 80)
    return X, Y, Vocab, CountVecter
def GetXYVocab(NumpSamples=-1):
    """Build the training matrix, labels and vocabulary for the ABSA data.

    Reads ';'-separated "sample;label" lines from
    '../../Data/AllNRCFeatsRestAspCatABSA.txt' and the matching sentence rows
    from '../../Data/RestAspCatABSA.csv', then stacks L2-normalised
    bag-of-words counts with the per-sentence lexicon features.

    Args:
        NumpSamples: Maximum number of leading rows to read from each file.
            A negative value (the default, -1) or None means "use all rows".
            Previously the value was used directly as a slice bound, so the
            default of -1 silently dropped the last row of both files.

    Returns:
        A tuple (X, Y, Vocab): X is the horizontally stacked feature matrix
        (counts + lexicon features), Y is the output of GetYFromStringLabels,
        and Vocab is the vectorizer's feature names followed by the nine
        lexicon feature names.

    NOTE(review): a function with this same name but a four-value return
    appears earlier in this source; if both live in one module this later
    definition replaces the earlier one - confirm which is intended.
    NOTE(review): both input files are assumed to be row-aligned - confirm.
    """
    # Negative / None -> no limit. Slicing with None keeps every row, whereas
    # slicing with -1 would cut off the final one.
    if NumpSamples is None or NumpSamples < 0:
        Limit = None
    else:
        Limit = NumpSamples

    StemmedLexicons = LoadStemmedLex()

    # Context managers guarantee the handles are closed; iterating the file
    # object replaces the long-deprecated xreadlines().
    with open('../../Data/AllNRCFeatsRestAspCatABSA.txt') as FeatsFile:
        Lines = [Row.strip() for Row in FeatsFile][:Limit]
    with open('../../Data/RestAspCatABSA.csv') as SentFile:
        # The last two ';'-separated fields are dropped and the remaining
        # fields are concatenated without a separator.
        Sentences = [
            ''.join(Row.strip().split(';')[:-2]) for Row in SentFile
        ][:Limit]

    LexFeats = csr_matrix(
        np.array([GetLexFeats(Sent, StemmedLexicons) for Sent in Sentences])
    )
    print('loaded lexicon features of shape {}'.format(LexFeats.shape))

    # Each line is expected to split into exactly (sample, label).
    Samples, Labels = zip(*[tuple(Row.split(';')) for Row in Lines])
    Y = GetYFromStringLabels(Labels)
    print('loaded {} samples'.format(len(Samples)))
    print('Label dist:  {}'.format(Counter(Y)))

    CountVecter = CountVectorizer(
        lowercase=False, dtype=np.float64, binary=False
    )
    X = CountVecter.fit_transform(Samples)
    X = Normalizer().fit_transform(X)
    print('shape of X matrix before adding lex feats {}'.format(X.shape))

    X = hstack([X, LexFeats])
    print('shape of X matrix after adding lex feats {}'.format(X.shape))

    # Column names: bag-of-words features first, then the lexicon features in
    # the same order they were appended to X.
    LexFeatNames = [
        'HLPos', 'HLNeg', 'HLSum',
        'NrcPos', 'NrcNeg', 'NrcSum',
        'SubjPos', 'SubjNeg', 'SubjSum',
    ]
    Vocab = CountVecter.get_feature_names() + LexFeatNames
    return X, Y, Vocab
def GetTestXY(fname, CountVecter):
    """Build the test feature matrix and labels from a single file.

    Reads ';'-separated "sample;label" lines from `fname`, drops every row
    whose label maps to 0 (per GetYFromStringLabels), vectorises the kept
    samples with the already-fitted `CountVecter`, L2-normalises the counts
    and appends the per-sentence lexicon features.

    Args:
        fname: Path of the test file to read.
        CountVecter: A fitted vectorizer; only its `transform` method is
            used, so the training vocabulary is reused unchanged.

    Returns:
        A tuple (X, Y): X is the horizontally stacked feature matrix
        (counts + lexicon features) and Y is the output of
        GetYFromStringLabels for the kept rows.

    NOTE(review): sentences for the lexicon features are obtained by
    splitting each line on '^', while samples/labels come from splitting the
    same line on ';'. If the file contains no '^', every sentence is the
    empty string - confirm the expected file format.
    """
    StemmedLexicons = LoadStemmedLex()

    # Read the file once and close it; the original opened it twice (once
    # via readlines, once via the deprecated xreadlines) and never closed
    # either handle.
    with open(fname) as TestFile:
        Lines = [Row.strip() for Row in TestFile]
    Sentences = [''.join(Row.split('^')[:-2]) for Row in Lines]

    # Keep only the rows whose numeric label is non-zero.
    Labels = [Row.split(';')[-1] for Row in Lines]
    YY = GetYFromStringLabels(Labels)
    KeptIdx = [i for i, y in enumerate(YY) if y != 0]
    new_Lines = [Lines[i] for i in KeptIdx]
    new_Sentences = [Sentences[i] for i in KeptIdx]

    LexFeats = csr_matrix(
        np.array([GetLexFeats(Sent, StemmedLexicons) for Sent in new_Sentences])
    )
    print('loaded lexicon features of shape {}'.format(LexFeats.shape))

    # Each kept line is expected to split into exactly (sample, label).
    Samples, new_Labels = zip(*[tuple(Row.split(';')) for Row in new_Lines])
    Y = GetYFromStringLabels(new_Labels)
    print('#' * 20)
    print(fname)
    print('loaded {} samples from {}'.format(len(Samples), fname))
    print('Label dist:  {}'.format(Counter(Y)))

    X = CountVecter.transform(Samples)
    X = Normalizer().fit_transform(X)
    X = hstack([X, LexFeats])
    return X, Y