Beispiel #1
0
def GetXYVocab(NumpSamples=-1):
    StemmedLexicons = LoadStemmedLex()
    Lines = [l.strip()
             for l in open('AllNRCFeats.txt').readlines()][:NumpSamples]
    Sentences = [
        ''.join(l.strip().split('^')[:-2])
        for l in open('ForNRCFeats.csv').xreadlines()
    ][:NumpSamples]

    Labels = [L.split(';')[-1] for L in Lines]
    YY = GetYFromStringLabels(Labels)
    new_Lines = []
    new_Sentences = []
    for i, y in enumerate(YY):
        if y != 0:
            new_Lines.append(Lines[i])
            new_Sentences.append(Sentences[i])

    LexFeats = [GetLexFeats(Sent, StemmedLexicons) for Sent in new_Sentences]
    LexFeats = np.array(LexFeats)
    LexFeats = csr_matrix(LexFeats)
    print 'loaded lexicon features of shape', LexFeats.shape

    Samples, new_Labels = zip(*[tuple(L.split(';')) for L in new_Lines])
    Y = GetYFromStringLabels(new_Labels)
    print 'loaded {} samples'.format(len(Samples))
    print 'Label dist: ', Counter(Y)

    CountVecter = CountVectorizer(lowercase=True,
                                  dtype=np.float64,
                                  encoding='utf-8',
                                  tokenizer=None,
                                  max_features=20000)  #,max_df=0.95)
    X = CountVecter.fit_transform(Samples)
    X = Normalizer().fit_transform(X)
    print 'shape of X matrix before adding lex feats', X.shape
    # Select_Feats = SelectPercentile(f_classif, percentile=9)
    # X_new = Select_Feats.fit_transform(X, Y)
    # print 'shape of X matrix after selecting KBest feats', X_new.shape

    X = hstack([X, LexFeats])
    print 'shape of X matrix after adding lex feats', X.shape

    print '*' * 80
    feature_names = CountVecter.get_feature_names()
    # print 'number of features before selection', len(feature_names)
    # mask = Select_Feats.get_support()  # list of booleans
    # new_features = []  # The list of your K best features
    # for bool, feature in zip(mask, feature_names):
    #     if bool:
    #         new_features.append(feature)
    # print 'number of features after selection', len(new_features)

    Vocab = feature_names + [
        'HLPos', 'HLNeg', 'HLSum', 'NrcPos', 'NrcNeg', 'NrcSum', 'SubjPos',
        'SubjNeg', 'SubjSum'
    ]
    print 'number of vocabulary (adding lex feats)', len(Vocab)
    print '*' * 80
    return X, Y, Vocab, CountVecter
Beispiel #2
0
def GetXYVocab(NumpSamples=-1):
    StemmedLexicons = LoadStemmedLex()
    Lines = [
        l.strip()
        for l in open('../../Data/AllNRCFeatsRestAspCatABSA.txt').xreadlines()
    ][:NumpSamples]
    Sentences = [
        ''.join(l.strip().split(';')[:-2])
        for l in open('../../Data/RestAspCatABSA.csv').xreadlines()
    ][:NumpSamples]
    LexFeats = [GetLexFeats(Sent, StemmedLexicons) for Sent in Sentences]
    LexFeats = np.array(LexFeats)
    LexFeats = csr_matrix(LexFeats)
    print 'loaded lexicon features of shape', LexFeats.shape

    Samples, Labels = zip(*[tuple(L.split(';')) for L in Lines])
    Y = GetYFromStringLabels(Labels)
    print 'loaded {} samples'.format(len(Samples))
    print 'Label dist: ', Counter(Y)

    CountVecter = CountVectorizer(lowercase=False,
                                  dtype=np.float64,
                                  binary=False)  #,max_df=0.95)
    X = CountVecter.fit_transform(Samples)
    X = Normalizer().fit_transform(X)
    print 'shape of X matrix before adding lex feats', X.shape
    X = hstack([X, LexFeats])
    print 'shape of X matrix after adding lex feats', X.shape
    Vocab = CountVecter.get_feature_names() + [
        'HLPos', 'HLNeg', 'HLSum', 'NrcPos', 'NrcNeg', 'NrcSum', 'SubjPos',
        'SubjNeg', 'SubjSum'
    ]
    return X, Y, Vocab
Beispiel #3
0
def GetTestXY(fname, CountVecter):
    StemmedLexicons = LoadStemmedLex()
    Lines = [l.strip() for l in open(fname).readlines()]
    Sentences = [
        ''.join(l.strip().split('^')[:-2]) for l in open(fname).xreadlines()
    ]

    Labels = [L.split(';')[-1] for L in Lines]
    YY = GetYFromStringLabels(Labels)
    new_Lines = []
    new_Sentences = []
    for i, y in enumerate(YY):
        if y != 0:
            new_Lines.append(Lines[i])
            new_Sentences.append(Sentences[i])

    LexFeats = [GetLexFeats(Sent, StemmedLexicons) for Sent in new_Sentences]
    LexFeats = np.array(LexFeats)
    LexFeats = csr_matrix(LexFeats)
    print 'loaded lexicon features of shape', LexFeats.shape

    Samples, new_Labels = zip(*[tuple(L.split(';')) for L in new_Lines])
    Y = GetYFromStringLabels(new_Labels)
    print '#' * 20
    print fname
    print 'loaded {} samples from {}'.format(len(Samples), fname)
    print 'Label dist: ', Counter(Y)

    X = CountVecter.transform(Samples)
    X = Normalizer().fit_transform(X)
    X = hstack([X, LexFeats])
    return X, Y