# Beispiel #1  ("Example #1" -- header text from the site this snippet was copied from)
# 0
def generateSentementFeatures(x_text, max_length):
    """Compute sentiment features for a collection of texts.

    Parameters
    ----------
    x_text : iterable of str
        Raw (unstemmed) input texts, one per entry.
    max_length : int
        Sequence length forwarded to the sentiment feature extractor
        (historically ``max(len(x) for x in x_text)``).

    Returns
    -------
    Whatever ``FeatureExtractionUtilities.getsentimentfeatures`` returns
    for the raw texts -- the project-defined sentiment feature matrix.
    """
    sequence_length = max_length

    # The original implementation tokenized each line (nltk.word_tokenize)
    # and stemmed every token with PorterStemmer, but the stemmed string was
    # never used -- only the raw line was collected.  That dead work (and the
    # quadratic string concatenation it used) has been removed; the raw texts
    # are simply copied.
    unstemmed_texts = list(x_text)

    # Parenthesized print works identically under Python 2 and 3 for a
    # single argument.
    print('Generating training set sentiment features .. ')
    # Fixed: the original had a second, unreachable `return loaded_data_set`
    # after this return, referencing a name not defined in this scope.
    return FeatureExtractionUtilities.getsentimentfeatures(
        unstemmed_texts, sequence_length)


if __name__ == '__main__':
    #LOAD THE FEATURE EXTRACTION RESOURCES
    loadFeatureExtractionModuleItems()

    #LOAD THE DATA -- *SAMPLE SCRIPT USES THE SAME DATA FOR TRAINING AND TESTING*
    data_set_filename = 'binary_downloaded.tsv'
    training_data = loadData(data_set_filename)
    testing_data = loadData(data_set_filename)

    #GENERATE THE TRAINING SET FEATURES
    print 'GENERATING TRAINING SET FEATURES.. '
    training_data[
        'sentiments'] = FeatureExtractionUtilities.getsentimentfeatures(
            training_data['unstemmed_text'])
    training_data[
        'structuralfeatures'] = FeatureExtractionUtilities.getstructuralfeatures(
            training_data['unstemmed_text'])
    training_data[
        'adrlexicon'] = FeatureExtractionUtilities.getlexiconfeatures(
            training_data['unstemmed_text'])
    training_data['topictexts'], training_data[
        'topics'] = FeatureExtractionUtilities.gettopicscores(
            training_data['text'])
    training_data['goodbad'] = FeatureExtractionUtilities.goodbadFeatures(
        training_data['text'])

    #SCALE THE STRUCTURAL FEATURES
    scaler1 = preprocessing.StandardScaler().fit(
        training_data['structuralfeatures'])
                # NOTE(review): everything from here to the end of this chunk appears
                # to be a fragment of a different (older) version of this script that
                # was pasted in at the wrong indentation level: the
                # `except UnicodeDecodeError` below has no matching `try`, and
                # `training_set`, `stemmed_text`, `_class`, `infile` and
                # `unstemmed_texts` are not defined in this scope.  As written this
                # region is a syntax error -- confirm against the upstream source and
                # either delete it or re-merge it with its original function.
                # training_set['synsets'].append(FeatureExtractionUtilities.getSynsetString(_text, None))
                # print 'synsets==>'+str(training_set['synsets'])
                # training_set['clusters'].append(FeatureExtractionUtilities.getclusterfeatures(_text))
                # print 'clusters==>' + str(training_set['clusters'])
                training_set['text'].append(stemmed_text)
                training_set['class'].append(_class)
                # print 'class==>' + str(training_set['class'])
        except UnicodeDecodeError:
            print 'please convert to correct encoding..'

    infile.close()

    print 'Generating training set sentiment features .. '
    training_set[
        'sentiments'] = FeatureExtractionUtilities.getsentimentfeatures(
            unstemmed_texts)
    # print 'training_set[sentiments]'+str(training_set['sentiments'])
    # training_set['structuralfeatures'] = FeatureExtractionUtilities.getstructuralfeatures(unstemmed_texts)
    # print 'training_set[structuralfeatures]'+str(training_set['structuralfeatures'])
    # scaler1 = preprocessing.StandardScaler().fit( training_set['structuralfeatures'])
    # train_structural_features = scaler1.transform( training_set['structuralfeatures'])
    # training_set['adrlexicon'] = FeatureExtractionUtilities.getlexiconfeatures(unstemmed_texts)
    # print 'adrlexicon==>' + str(training_set['adrlexicon'])
    # training_set['topictexts'],training_set['topics'] = FeatureExtractionUtilities.gettopicscores(training_set['text'])
    # print 'topictexts==>' + str(training_set['topictexts'])
    # training_set['goodbad'] = FeatureExtractionUtilities.goodbadFeatures(training_set['text'])
    # print 'goodbad==>' + str(training_set['goodbad'])
    '''
        Initialize the vectorizers
    '''
    print 'Initialize the vectorizers..'