def evaluate_classifier_Decision(featx):

    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    # use floor division so the cutoffs are valid slice indices (train on 1% of the data)
    train_negcutoff = len(train_negfeats) // 100
    train_poscutoff = len(train_posfeats) // 100
    trainfeats_Decision = train_negfeats[:train_negcutoff] + train_posfeats[:train_poscutoff]
    DecisionTree_classifier = DecisionTreeClassifier.train(trainfeats_Decision)
    refsets = collections.defaultdict(set)
    testsets_Decision = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed_Decision = DecisionTree_classifier.classify(feats)
        testsets_Decision[observed_Decision].add(i)

    accuracy3 = nltk.classify.util.accuracy(DecisionTree_classifier, testfeats)  
    pos_precision3 = nltk.metrics.precision(refsets['pos'], testsets_Decision['pos'])
    pos_recall3 = nltk.metrics.recall(refsets['pos'], testsets_Decision['pos'])
    neg_precision3 = nltk.metrics.precision(refsets['neg'], testsets_Decision['neg'])
    neg_recall3 = nltk.metrics.recall(refsets['neg'], testsets_Decision['neg'])

    return ['DecisionTree', accuracy3, pos_precision3, pos_recall3, neg_precision3, neg_recall3]
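# Usage sketch (not part of the original): `featx` is any NLTK-style feature
# extractor, and `train`/`test` are assumed to be categorized corpus readers
# already loaded with 'neg'/'pos' file IDs.
def bag_of_words(words):
    # simplest featureset: mark every token as present
    return dict((word, True) for word in words)

results = evaluate_classifier_Decision(bag_of_words)
print(dict(zip(['name', 'accuracy', 'pos_prec', 'pos_rec', 'neg_prec', 'neg_rec'], results)))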
Example #2
def build_dt_model(data_file, class_file):
    #import data and class files
    data = []
    classes = []

    with open(data_file) as dFile:
        reader = csv.DictReader(dFile)
        data = [row for row in reader]

    with open(class_file) as cFile:
        for line in cFile.readlines():
            classes.append(int(line.strip()))

    #create classifier input
    DT_Input = []

    if len(data) == len(classes):
        for i in range(len(data)):
            DT_Input.append((data[i], classes[i]))

    #Train Classifier
    classifier = DecisionTreeClassifier.train(DT_Input)

    #Create orderedDict
    dict_keys = data[0].keys()

    ordDict = OrderedDict(zip(dict_keys, np.repeat('0', len(dict_keys))))

    return (classifier, ordDict)
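# Usage sketch (file names and contents are illustrative assumptions):
# build_dt_model expects a CSV whose rows become feature dicts, plus a
# parallel file with one integer class label per data row.
#
# features.csv          classes.txt
#   color,shape           1
#   red,round             0
#   green,long
classifier, template = build_dt_model('features.csv', 'classes.txt')
template.update({'color': 'red', 'shape': 'round'})  # fill the OrderedDict template with a new observation
print(classifier.classify(template))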
Example #3
def searchDecisionTreeClassifier(title, train_departments):
    """

    :param title:
    :param train_departments:
    :return:
    """
    timeTraining = time.time()
    classifier = DecisionTreeClassifier.train(train_departments)
    timeTraining = time.time() - timeTraining

    test_sent_features = word_feats(title)

    timeClassify = time.time()
    found_department = classifier.classify(test_sent_features)
    timeClassify = time.time() - timeClassify

    #probability = classifier.prob_classify_many(test_sent_features)
    #print(probability.prob(found_department))

    return [
        found_department,
        #probability.prob(found_department),
        0,
        accuracy(classifier, train_departments[1000:]),
        timeClassify,
        timeTraining,
    ]
def main_function():
    conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter_analysis")
    hq_conn = MySQLdb.connect(host="localhost", user="******", passwd="tanzania", db="twitter")

    training_tweets = get_test_tweets(conn)
    training_feature_set = process_tweets(training_tweets)

    classifier = DecisionTreeClassifier.train(training_feature_set)

    test_tweets = get_training_tweets(conn)
    test_feature_set = process_tweets(test_tweets)

    classifier_accuracy = accuracy(classifier, test_feature_set)

    alt_full_matrix = {'+': {'+': 0, '-': 0, 'E': 0},
                       '-': {'+': 0, '-': 0, 'E': 0},
                       'E': {'+': 0, '-': 0, 'E': 0}}

    #for f in test_tweets:
    #f = test_tweets[0]

    #print f
    #guess = classifier.classify(process_tweet(f[1]))
    #print guess
    #    update_tweet_polarity(f[0], guess, conn)
    ##    pl = classifier.prob_classify(process_tweet(f[1]))
    #    idx = f[2]
    #    if idx == 'I' or idx == 'O':
    #        idx = 'E'
    #    alt_full_matrix[idx][guess] += 1

    #print alt_full_matrix

    print("classifier accuracy: " + repr(classifier_accuracy))
Example #5
def dt_sentiment():
    training_data = [('I love this sandwich.', 'pos'),
                    ('This is an amazing place!', 'pos'),
                    ('I feel very good about these beers.', 'pos'),
                    ('This is my best work.', 'pos'),
                    ("What an awesome view", 'pos'),
                    ('I do not like this restaurant', 'neg'),
                    ('I am tired of this stuff.', 'neg'),
                    ("I can't deal with this", 'neg'),
                    ('He is my sworn enemy!', 'neg'),
                    ('My boss is horrible.', 'neg')]

    vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))
    print(vocabulary)
    #for i in vocabulary:
        #print "voc:",i

    feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]
    lfs=len(feature_set)
    #print lfs
    #print feature_set
    
    dt_classifier = DecisionTreeClassifier.train(feature_set, binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
    test_sentence = "This is the best work I deal with!"
    featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}
    #print featurized_test_sentence
    #for i1 in featurized_test_sentence:
        #print i1

    print "test_sent:",test_sentence
    print "tag:",dt_classifier.classify(featurized_test_sentence)
Example #6
def evaluate_classifier(featureX):

    negIds = app_reviews.fileids('neg')
    posIds = app_reviews.fileids('pos')

    negFeatures = [(featureX(app_reviews.words(fileids=[f])), 'neg')
                   for f in negIds]
    posFeatures = [(featureX(app_reviews.words(fileids=[f])), 'pos')
                   for f in posIds]

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures) * 3 / 4))
    negCutoff = int(math.floor(len(negFeatures) * 3 / 4))

    trainFeatures = negFeatures[:negCutoff] + posFeatures[:posCutoff]
    testFeatures = negFeatures[negCutoff:] + posFeatures[posCutoff:]

    #trains a Naive Bayes Classifier
    NBclassifier = NaiveBayesClassifier.train(trainFeatures)
    #trains a Maximum Entropy or Logistic Regression Classifier
    MEclassifier = MaxentClassifier.train(trainFeatures,
                                          algorithm='gis',
                                          trace=0,
                                          max_iter=10,
                                          min_lldelta=0.5)
    #trains a DecisionTree Classifier
    DTclassifier = DecisionTreeClassifier.train(trainFeatures,
                                                binary=True,
                                                entropy_cutoff=0.5,
                                                depth_cutoff=70,
                                                support_cutoff=10)

    #Combining Classifiers with Voting
    classifier = MaxVoteClassifier(NBclassifier, MEclassifier, DTclassifier)

    #initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        observed = classifier.classify(features)
        testSets[observed].add(i)

    #prints metrics to show how well the feature selection performed
    print('train on %d instances, test on %d instances' %
          (len(trainFeatures), len(testFeatures)))
    print('Accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos Precision:', nltk.metrics.precision(referenceSets['pos'],
                                                   testSets['pos']))
    print('pos Recall:', nltk.metrics.recall(referenceSets['pos'],
                                             testSets['pos']))
    print('neg Precision:', nltk.metrics.precision(referenceSets['neg'],
                                                   testSets['neg']))
    print('neg Recall:', nltk.metrics.recall(referenceSets['neg'],
                                             testSets['neg']))
def decisionTree(features_train, features_test):
    print('train on %d instances, test on %d instances' % (len(features_train), len(features_test)))
    classifier = DecisionTreeClassifier.train(features_train,
                                              binary=True,
                                              entropy_cutoff=0.8,
                                              depth_cutoff=5,
                                              support_cutoff=30)
    print('accuracy:', nltk.classify.util.accuracy(classifier, features_test))
    precisions, recalls = precision_recall(classifier, features_test)
    print('precisions:', precisions, 'recalls:', recalls)
Example #8
def classify(inputdir):
    #filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    sets = []
    for name in filenames:
        # print name
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training', sense)

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_words(line), sense))
                #feat_set.append((get_feature2(line), sense))
            else:
                words = []
                tags = []
        file.close()

    random.shuffle(feat_set)
    random.shuffle(feat_set)
    #random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    #classifier = MaxentClassifier.train(train_data)
    nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis', trace=0, max_iter=1, min_lldelta=0.5)
    print("nb accuracy " + str(accuracy(nb_classifier, test_data) * 100))
    print("dt accuracy " + str(accuracy(dt_classifier, test_data) * 100))
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))
    mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, entropy_classifier)
    print("max vote accuracy " + str(accuracy(mv_classifier, test_data) * 100))
Example #9
    def create_trained_model(self,
                             dataset=None,
                             train_data=None,
                             train_target=None):
        """ Creates and trains new model

        Args:
            dataset: dataset with reviews and positive or negative labels
        Returns:
            A trained model
        """

        if self.classifier_type == 'nb':
            model = NaiveBayesClassifier.train(dataset)
        elif self.classifier_type == 'dt':
            model = DecisionTreeClassifier.train(dataset)
        elif self.classifier_type == 'rf':
            forest = RandomForestClassifier(n_estimators=100, random_state=0)
            model = forest.fit(train_data, train_target)
        elif self.classifier_type == 'svm':
            model = svm.LinearSVC(max_iter=10000)
            model.fit(train_data, train_target)
        elif self.classifier_type == 'nn':
            input_dimension = len(train_data[0])

            model = Sequential()
            model.add(Dense(20, input_dim=input_dimension, activation='relu'))
            model.add(Dropout(0.5))
            model.add(Dense(30, activation='relu'))
            model.add(Dropout(0.5))
            model.add(Dense(20, activation='relu'))
            model.add(Dropout(0.5))
            model.add(Dense(1, activation='sigmoid'))

            print('Compiling model...')
            model.compile(
                optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

            print('Training model...')
            model.fit(
                train_data,
                train_target,
                epochs=50,
                verbose=0,
                class_weight={
                    0: 3,
                    1: 1
                })

            print('Model trained!')

        return model
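# Usage sketch -- the wrapper class name and variables below are assumptions,
# since the original shows only this method. For 'nb'/'dt', only `dataset`
# (a list of (featureset, label) pairs) is used; the 'rf'/'svm'/'nn' branches
# expect array-like train_data/train_target instead.
trainer = ReviewModelTrainer(classifier_type='dt')  # hypothetical class exposing create_trained_model
model = trainer.create_trained_model(dataset=labeled_featuresets)  # labeled_featuresets assumed in scope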
Example #10
def main():
    log.info('loading negative_tweets')
    neg_data = load_twitter_data('negative_tweets')
    neg_tokens = tokenized_tweets(neg_data)
    log.info('tokenized negative_tweets')

    log.info('loading positive_tweets')
    pos_data = load_twitter_data('positive_tweets')
    pos_tokens = tokenized_tweets(pos_data)
    log.info('tokenized positive_tweets')

    log.info('extracting the top 5000 words')
    top_5000 = top_n_tokens(5000)

    log.info(
        'creating feature set from negative tweets - labeled with {}'.format(
            NEGATIVE))
    neg_features = [(find_features(tkns, top_5000), NEGATIVE)
                    for tkns in neg_tokens]

    log.info(
        'creating feature set from positive tweets - labeled with {}'.format(
            POSITIVE))
    pos_features = [(find_features(tkns, top_5000), POSITIVE)
                    for tkns in pos_tokens]

    log.info('combining and shuffling feature sets')
    all_features = neg_features + pos_features
    random.shuffle(all_features)

    log.info('splitting the combined feature sets ~(70/30)')
    split_idx = int(len(all_features) * .7)
    training_set = all_features[:split_idx]
    testing_set = all_features[split_idx:]
    log.info('training data set contains {} features'.format(
        len(training_set)))
    log.info('testing data set contains {} features'.format(len(testing_set)))

    log.info("\n#####  STARTING CLASSIFIER TRAINING #####")
    log.info("this will take a while\n")

    DT_classifier = DecisionTreeClassifier.train(training_set)
    log.info(("DT_classifier accuracy percent:",
              (nltk.classify.accuracy(DT_classifier, testing_set)) * 100))
    save_classifier(DT_classifier, 'dt_classifier.pickle')

    NB_classifier = nltk.NaiveBayesClassifier.train(training_set)
    log.info(("Original Naive Bayes Algo accuracy percent:",
              (nltk.classify.accuracy(NB_classifier, testing_set)) * 100))
    NB_classifier.show_most_informative_features(15)
    save_classifier(NB_classifier, 'nb_classifier.pickle')

    save_tokens(top_5000, 'top_5000.pickle')
Example #11
def run(training):
    """
    To create and train a DecisionTreeClassifier
    :return: a trained Classifier
    """
    print "Training DT Classifier..."
    # feats = label_feat_from_corps(movie_reviews)
    # training, testing = split_label_feats(feats)

    dt_classifier = DecisionTreeClassifier.train(training, binary=True, entropy_cutoff=0.8, depth_cutoff=10, support_cutoff=30)
    print "DT Classifier trained..."
    return save_classifier(dt_classifier)
Example #12
def trainDT(featuresets):
    #idx = 2*len(featuresets) / ratio
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    train_set = featuresets
    #max_iter=20
    classifier = DecisionTreeClassifier.train(train_set)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    #train_set, test_set = featuresets[idx:], featuresets[:idx]
    #classifier.train(train_set, algo, max_iter=20)
    #print accuracy(classifier, test_set)
    #classifier.show_most_informative_features(100)
    return classifier
Example #14
    def classify_decision_tree(self):

        print("training decision tree")
        classifier = DecisionTreeClassifier.train(self.feature_vectors_tuples_for_train, depth_cutoff=200, entropy_cutoff=0.1)
        print("testing classifier")
        # classify_many replaces the deprecated batch_classify
        classified_labels = classifier.classify_many([feature_set_tuple[0] for feature_set_tuple in self.feature_vectors_tuples_for_test])
        correct = 0
        wrong = 0
        for i in range(len(classified_labels)):
            # compare label values with ==, not identity with `is`
            if classified_labels[i] == self.feature_vectors_tuples_for_test[i][1]:
                correct += 1
            else:
                wrong += 1
        print(correct, wrong)
Example #15
def evaluate_features(feature_select):
    #reading pre-labeled input and splitting into lines
    negSentences = open(os.path.join(__location__, 'rt-polarity-neg.txt'), 'r', encoding='utf8')
    posSentences = open(os.path.join(__location__, 'rt-polarity-pos.txt'), 'r', encoding='utf8')
    # split() so stopwords is a list of words rather than a string iterated character by character
    stopwords = open(os.path.join(__location__, 'stopwords.txt'), 'r', encoding='utf8').read().split()
    negSentences = re.split(r'\n', negSentences.read())
    posSentences = re.split(r'\n', posSentences.read())
    
    posFeatures = []
    negFeatures = []
    counter = 0
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i)
        if any(stopword in posWords for stopword in stopwords):
            pass
        else: 
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i)
        if any(stopword in negWords for stopword in stopwords):
            pass
        else:
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    
    #Runs the classifier on the testFeatures
    classifier = DecisionTreeClassifier.train(trainFeatures)
    
    #Sets up labels to look at output
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures): # enumerate adds number-count to each item
        referenceSets[label].add(i)               # recorded polarity for these test sentences
        predicted = classifier.classify(features) # classifiers' proposed polarity for tests
        testSets[predicted].add(i)

    #Outputs
    print('train on %s instances, test on %s instances'% (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', scores.precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', scores.recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', scores.precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', scores.recall(referenceSets['neg'], testSets['neg']))
    # DecisionTreeClassifier has no show_most_informative_features(); print the learned tree instead
    print(classifier.pseudocode(depth=4))
def decision_tree_classifier(feature_vector_train, feature_vector_test):
    features_train, topics_train = zip(*feature_vector_train)
    features_test, topics_test = zip(*feature_vector_test)

    # training (NLTK's train expects labeled (featureset, topic) pairs)

    classifier2 = DecisionTreeClassifier.train(feature_vector_train, depth_cutoff=250, entropy_cutoff=0.1)

    # Kept an entropy cutoff in order to improve the training time (this might lead to loss in accuracy though)
    # Same goes for depth cutoff (for refining the tree). Kept it as 250.

    # testing

    predicted_topics = classifier2.classify_many(features_test)

    print(classification_report(topics_test, predicted_topics, target_names=sorted(set(topics_test))))
Example #17
def classifier(train_feats, model='nb_classifier'):
    # compare strings with ==, not identity with `is`
    if model == 'nb_classifier':
        return NaiveBayesClassifier.train(train_feats)

    elif model == 'dt_classifier':
        return DecisionTreeClassifier.train(train_feats,
                                            binary=True,
                                            entropy_cutoff=0.8,
                                            depth_cutoff=5,
                                            support_cutoff=30)

    elif model == 'me_classifier':
        return MaxentClassifier.train(train_feats,
                                      algorithm='gis',
                                      trace=0,
                                      max_iter=10,
                                      min_lldelta=0.5)
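# Usage sketch (train_feats/test_feats assumed to be lists of
# (featureset, label) pairs prepared elsewhere):
dt = classifier(train_feats, model='dt_classifier')
print(nltk.classify.util.accuracy(dt, test_feats))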
def dt_classify(filename):
    raw_sample_stream = get_samples_stream(filename)
    all_samples = list( binary_bow_feature(raw_sample_stream) )

    # filter out two classes of outliers
    # these two categories contain too few examples, so the word frequency in these two categories
    # cannot reflect the true probability
    # all_samples = [(features,aspect) for features,aspect in all_samples if aspect != common.AspectNothing and aspect != common.AspectBusiness]

    test_sample_ratio = 0.25
    train_samples, test_samples = split_samples(all_samples, test_sample_ratio)
    print("training set has {} samples, test set has {} samples".format(len(train_samples), len(test_samples)))

    classifier = DecisionTreeClassifier.train(train_samples, binary=True, depth_cutoff=15, verbose=True)
    print("training completes")

    print("training accuracy: {}".format(accuracy(classifier, train_samples)))
    print("test accuracy: {}".format(accuracy(classifier, test_samples)))

    return classifier
# Recalls Neg: 0.896
print("######################################################################")
me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)
print("Accuracy Max Entropy: " + str(accuracy(me_classifier, test_feats)))
# Accuracy Max Entropy: 0.912
me_precisions, me_recalls = precision_recall(me_classifier, test_feats)
print("Precisions Max Entropy Pos: " + str(me_precisions['pos']))
# Precisions Max Entropy Pos: 0.8992248062015504
print("Precisions Max Entropy Neg: " + str(me_precisions['neg']))
# Precisions Max Entropy Neg: 0.9256198347107438
print("Recalls Max Entropy Pos: " + str(me_recalls['pos']))
# Recalls Max Entropy Pos: 0.928
print("Recalls Max Entropy Neg: " + str(me_recalls['neg']))
# Recalls Max Entropy Neg: 0.896
print("######################################################################")
dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, depth_cutoff=20, support_cutoff=20, entropy_cutoff=0.01)
print("Accuracy Decision Tree: " + str(accuracy(dt_classifier, test_feats)))
# Accuracy Decision Tree: 0.68600000000000005
dt_precisions, dt_recalls = precision_recall(dt_classifier, test_feats)
print("Precisions Decision Tree Pos: " + str(dt_precisions['pos']))
# Precisions Decision Tree Pos: 0.6741573033707865
print("Precisions Decision Tree Neg: " + str(dt_precisions['neg']))
# Precisions Decision Tree Neg: 0.69957081545064381
print("Recalls Decision Tree Pos: " + str(dt_recalls['pos']))
# Recalls Decision Tree Pos: 0.71999999999999997
print("Recalls Decision Tree Neg: " + str(dt_recalls['neg']))
# Recalls Decision Tree Neg: 0.65200000000000002
print("######################################################################")
sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats)
print("Accuracy Sklearn Linear SVC: " + str(accuracy(sk_classifier, test_feats)))
# Accuracy Sklearn Linear SVC: 0.86
Example #20
def decision_tree(train_data):
    training_data = []
    for data in train_data:
        training_data.append(preprocess(data[0],label=data[1]))
    cl = DecisionTreeClassifier.train(training_data)
    return cl
Example #22
    def train(self, features_label):
        self._classifier = DecisionTreeClassifier.train(features_label,
                                                        entropy_cutoff=0.05,
                                                        depth_cutoff=200,
                                                        support_cutoff=20)
        return None
Example #23
def classify(inputdir):
    #filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    sets = []

    for name in filenames:
        # print name
        labeledlist = []
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print('training', sense)

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words), sense))
                # feat_set.append((context_feature(line), sense))
            else:
                words = []
                tags = []
        print(lineno)
        labeledlist.append((sense, allwords))

        # feat_set.append((bigram_feature(allwords), sense))
        file.close()
    high_info_words = set(high_information_words(labeledlist))
    for item in high_info_words:
        print(item)

    random.shuffle(feat_set)
    random.shuffle(feat_set)
    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print("training on " + str(len(train_data)) + " instances")
    print("testing on " + str(len(test_data)) + " instances")
    #classifier = MaxentClassifier.train(train_data)
    #nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data, entropy_cutoff=0.8, depth_cutoff=7, support_cutoff=10)
    #print dt_classifier.pp()
    #pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data, algorithm='iis', trace=0, max_iter=2, min_lldelta=0.5)
    print("nb accuracy ")
    #print accuracy(nb_classifier, test_data) * 100
    #print "nb precision and recall"
    #print precision_recall(nb_classifier, test_data)

    #print nb_classifier.show_most_informative_features()
    #for item in nb_classifier.most_informative_features():
    #    print item
    #print "dt accuracy " + str(accuracy(dt_classifier, test_data) * 100)
    print("entropy accuracy " + str(accuracy(entropy_classifier, test_data) * 100))
Example #24
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


featuresets = [(document_features(d), c) for (d, c) in documents]
train_set = featuresets[:1000]
test_set = featuresets[1000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("NaiveBayesClassifier Accuracy     =>" +
      str(nltk.classify.accuracy(classifier, test_set) * 100))
classifier.show_most_informative_features(5)

classifier = DecisionTreeClassifier.train(train_set,
                                          binary=False,
                                          entropy_cutoff=0.4,
                                          depth_cutoff=20,
                                          support_cutoff=50)
print("DecisionTreeClassifier Accuracy     =>" +
      str(nltk.classify.accuracy(classifier, test_set) * 100))
# To test this application, put text into 1.txt and make it as large as possible, because the feature set is small (small data set)
#InputList=[]
#with open("1.txt", 'r') as f:
#    for line in f:
#        for word in line.split():
#            InputList.append(word)
#            words.append(word)
#print(classifier.classify(document_features(InputList)))
Example #26
raw_data_names = ([(name, 'male') for name in names.words('male.txt')] +
                  [(name, 'female') for name in names.words('female.txt')])

random.shuffle(raw_data_names)


def genderFeatures(word):
    return {
        'last_letter': word[-1],
        'first_letter': word[0],
        'last_two_letters': word[-2:],
        'last_three_letters': word[-3:]
    }


#Create training and test sets; some of this code is from NLTK book chapter 6, section 1.1
features = [(genderFeatures(n), gender) for (n, gender) in raw_data_names]
testSet, trainSet = features[:1000], features[1000:]

#Train classifier
nbc = NaiveBayesClassifier.train(trainSet)
tc = DecisionTreeClassifier.train(trainSet)
ec = MaxentClassifier.train(trainSet, trace=0)

print("Please Wait.")
print("Naive Bayes: ", nltk.classify.accuracy(nbc, testSet), "%")
print("Please Wait..")
print("Tree Classifier", nltk.classify.accuracy(tc, testSet), "%")
print("Please Wait...")
print("Entropy Classifier: ", nltk.classify.accuracy(ec, testSet), "%")
Example #27
    X = [x[0] for x in dataset]
    Y = [x[1] for x in dataset]
    kfold = StratifiedKFold(n_splits=int(args.z),
                            shuffle=True,
                            random_state=seed)
    cvscores = []
    for train, test in kfold.split(X, Y):
        # print(dataset[train[0]])
        train_data = []
        for i in range(len(train)):
            train_data.append(dataset[train[i]])
        test_data = []
        for i in range(len(test)):
            test_data.append(dataset[test[i]])
        model = DecisionTreeClassifier.train(train_data)
        scores = nltk.classify.util.accuracy(model, test_data)
        print("{}%".format(scores * 100))
        cvscores.append(scores * 100)
        # plot_model(model, to_file='model.png')

    print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))
    ### create training and test sets
    ## set the cutoffs
    # negcutoff = math.floor(len(neg_list)*3/4)
    # poscutoff = math.floor(len(pos_list)*3/4)
    #
    # top10list = []
    # avgAccuracy = 0
    # for z in range(int(args.z)):
    #     #train = neg_list[:negcutoff] + pos_list[:poscutoff]
Example #28
def getClassifier(tweetfile, cfg):
    degreesToUse = cfg['NLPnGrams']
    print "DEBOOOOO", degreesToUse, type(degreesToUse)
    classMode = cfg['NLPMode'].replace('-', ' ').replace('_', ' ')
    shortClass = classMode.replace(' ', '').lower()
    loadNeeded = True

    if 'NLPTEST' not in cfg.keys():
        degreeString = '-'.join([str(degree) for degree in degreesToUse])
        pickleFile = 'nlpTrainers/' + tweetfile.replace(
            '.csv', '.' + shortClass + degreeString + '.pickle')
        if isfile(pickleFile):
            print "Loading pickled", shortClass, "classifier"
            fileIn = open(pickleFile)
            classifier = cPickle.load(fileIn)
            fileIn.close()
            loadNeeded = False

    if loadNeeded:
        if 'NLPTEST' in cfg.keys():
            content = prepText(tweetfile)
            categorized = prepClassifications(content)
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
        else:
            print "Loading content & preparing text"
            content = prepText(loadFile(tweetfile))
            print "Categorizing contents"
            categorized = prepClassifications(content)
            print "Deriving NGrams of length(s)", degreesToUse
            NGrammized = collectNGrams(categorized, degreesToUse, cfg)
            print "Compiling Results"
        readyToSend = []
        allCats = [str(key) for key in NGrammized.keys()]
        for category in allCats:
            readyToSend += NGrammized[category]

        print "Attempting Classification by mode", classMode, degreesToUse
        if classMode == 'naive bayes':
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }
        elif classMode == 'positive naive bayes':
            from nltk.classify import PositiveNaiveBayesClassifier
            classifier = {
                'class': PositiveNaiveBayesClassifier.train(readyToSend),
                'mode': 'pnb'
            }
        elif classMode == 'max ent':
            #import nltk.classify
            #from sklearn.linear_model import LogisticRegression
            #from nltk.classify import SklearnClassifier
            #classifier = {'class':LogisticRegression.train(readyToSend),'mode':'me'}
            from nltk.classify import MaxentClassifier
            classifier = {
                'class': MaxentClassifier.train(readyToSend, algorithm='iis'),
                'mode': 'me'
            }
        elif classMode == 'decision tree':
            from nltk.classify import DecisionTreeClassifier
            classifier = {
                'class': DecisionTreeClassifier.train(readyToSend),
                'mode': 'dt'
            }
        elif classMode == 'svm':
            if "SVMOrder" in cfg.keys():
                priority = cfg['SVMOrder']
            else:
                priority = "ABCDEFGHIJKLMNOPQRSTUVWXYZ9876543210"
            if type(priority) is str:
                priority = list(priority)
            priority = [entry for entry in priority if entry in allCats]
            preppedSVM = prepSVMAll(readyToSend, priority, allCats, cfg)
            classifier = {
                'class': preppedSVM,
                'mode': 'svm',
                'priority': priority
            }
        else:
            from nltk.classify import NaiveBayesClassifier
            classifier = {
                'class': NaiveBayesClassifier.train(readyToSend),
                'mode': 'nb'
            }

        if 'NLPTEST' not in cfg.keys():
            print "Pickling Classifier"
            fileOut = open(pickleFile, 'wb')
            cPickle.dump(classifier, fileOut)
            fileOut.close()

    if 'NLPTEST' not in cfg.keys():
        if classMode != 'svm':
            classifier['class'].show_most_informative_features(n=150)
        """else:
		for key in classifier['class'].keys():
			print classifier		
			print classifier.keys()
			classifier['class'][key].show_most_informative_features(n=150/len(classifier['class'].keys()))"""

    return classifier
tgd = brown.tagged_words(categories="news")

'''
print(tgd[:3])
[(u'The', u'AT'), (u'Fulton', u'NP-TL'), (u'County', u'NN-TL')]
'''
feats = [(pos_feats(w), c) for (w, c) in tgd]
lens = int(len(feats) * 0.2)
train, test = feats[lens:], feats[:lens]
'''
print(train[:10])

[({u'endswith med': False, u'endswith mee': False, ..., u'endswith kup': False}, u'AT'),
 ({...}, u'CD'), ({...}, u'NN'), ({...}, u'NN'), ({...}, u'VBG'),
 ({...}, u'IN'), ({...}, u'NN-TL'), ({...}, u'NP'), ...]

(output truncated: each training item pairs a large dict of boolean
"endswith ..." suffix features with the word's Brown POS tag)
'''
cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'CS'), ({u'endswith med': False, u'endswith mee': False, u'endswith meg': False, u'endswith 343': False, u'endswith hce': False, u'endswith 348': False, u'endswith ala': False, u'endswith ghn': False, u'endswith nw.': False, u'endswith .k.': False, u'endswith vre': False, u'endswith 3a': False, u'endswith xth': False, u'endswith lbs': False, u'endswith xty': False, u'endswith oze': False, u'endswith nen': False, u'endswith xts': False, u'endswith aur': False, u'endswith yms': False, u'endswith aut': False, u'endswith aux': False, u'endswith pth': False, u"endswith k's": False, u'endswith aud': False, u'endswith ptu': False, u'endswith aui': False, u'endswith aul': False, u'endswith aum': False, u'endswith aun': False, u'endswith upi': False, u'endswith air': False, u'endswith upa': False, u'endswith aix': False, u'endswith upy': False, u'endswith ups': False, u'endswith ain': False, u'endswith cwt': False, u'endswith upt': False, u'endswith new': False, u'endswith neu': False, u'endswith net': False, u'endswith nes': False, u'endswith ner': False, u'endswith nez': False, u'endswith ney': False, u'endswith nex': False, u'endswith nee': False, u'endswith ned': False, u'endswith nec': False, u'endswith .e.': False, u'endswith nel': False, u'endswith nek': False, u'endswith nmr': False, u'endswith nei': False, u'endswith aus': False, u"endswith th'": False, u'endswith 7th': False, u'endswith -ho': False, u'endswith fha': False, u'endswith zur': False, u"endswith y'n": False, u'endswith mny': False, u'endswith mns': False, u'endswith ucy': False, u'endswith oeb': False, u'endswith rek': False, u'endswith g/l': False, u'endswith ak.': False, u'endswith 271': False, u'endswith 270': False, u'endswith 273': False, u'endswith 275': False, u'endswith 274': False, u'endswith 276': False, u'endswith sce': False, u'endswith 27%': False, u'endswith sca': False, u'endswith 606': False, u'endswith sch': False, u'endswith 298': False, u'endswith 297': False, u'endswith 290': False, u'endswith 29%': False, u'endswith cm.': False, u'endswith .32': False, u'endswith ajk': False, u'endswith 362': False, u'endswith jac': False, u'endswith *yt': False, u'endswith *yr': False, u'endswith *yp': False, u'endswith bl': False, u'endswith /3%': False, u'endswith $45': False, u'endswith $40': False, u'endswith cth': False, u'endswith fur': False, u'endswith kus': False, u'endswith kup': False}, u'NN')]

'''
clf = DecisionTreeClassifier.train(train)
print accuracy(clf, test)
print clf.classify(pos_feats("dogs"))
#0.144952759821
#NN

'''
from nltk.corpus import brown

for w in brown.words()[:10]:
    print w
    print w[-2:]

The
he
Fulton
on
County
Beispiel #30
0
import pickle

from nltk.classify import NaiveBayesClassifier, DecisionTreeClassifier, SklearnClassifier
from nltk.classify.util import accuracy
from sklearn.svm import LinearSVC

save_train_feats = open("pickled_algos/train_feats", "wb")
pickle.dump(train_feats, save_train_feats)
save_train_feats.close()

save_test_feats = open("pickled_algos/test_feats", "wb")
pickle.dump(test_feats, save_test_feats)
save_test_feats.close()

nb_classifier = NaiveBayesClassifier.train(train_feats)

print(accuracy(nb_classifier, test_feats))

save_nb_classifier = open("pickled_algos/nb_classifier", "wb")
pickle.dump(nb_classifier, save_nb_classifier)
save_nb_classifier.close()

dt_classifier = DecisionTreeClassifier.train(train_feats)

print(accuracy(dt_classifier, test_feats))

save_dt_classifier = open("pickled_algos/dt_classifier", "wb")
pickle.dump(dt_classifier, save_dt_classifier)
save_dt_classifier.close()

sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats)

print(accuracy(sk_classifier, test_feats))

save_sk_classifier = open("pickled_algos/sk_classifier", "wb")
pickle.dump(sk_classifier, save_sk_classifier)
save_sk_classifier.close()
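# Aside (not from the original example): a minimal sketch of restoring one
# of the pickled classifiers on a later run, assuming the same paths.
import pickle

with open("pickled_algos/dt_classifier", "rb") as f:
    dt_classifier = pickle.load(f)
# The restored object classifies like the original, e.g.:
# dt_classifier.classify(feature_dict)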
Beispiel #31
0
    def train(self, reviews_file):
        """ Trains a classifier based on drug reviews with ratings

        Args:
            reviews_file: Reviews file to use for training.
        """
        ## Parse data from files
        reviews = self.parse_reviews(reviews_file)

        with open('stopwords.txt') as stop_words_file:
            text = self.clean_text(stop_words_file.read())
            stop_words = text.splitlines()

        ## Parse and convert positive and negative examples
        positive_comments = []
        negative_comments = []

        for review in reviews:
            comment = review['comment']
            rating = review['rating']

            comment = self.format_text(comment, stop_words)

            if float(rating) <= self.negative_threshold:
                negative_comments.append((comment, 'neg'))
            if float(rating) >= self.positive_threshold:
                positive_comments.append((comment, 'pos'))

        seed = 123
        random.seed(seed)  # also seed stdlib random, used by random.sample below
        numpy.random.seed(seed)

        print("Total Negative Instances:" + str(len(negative_comments)))
        print("Total Positive Instances:" + str(len(positive_comments)))

        # keep all examples (adjust the factor to subsample)
        negcutoff = math.floor(len(negative_comments) * 1)
        poscutoff = math.floor(len(positive_comments) * 1)

        neg_idx_train = sorted(
            random.sample(range(len(negative_comments)), negcutoff))
        neg_train = [negative_comments[i] for i in neg_idx_train]

        pos_idx_train = sorted(
            random.sample(range(len(positive_comments)), poscutoff))
        pos_train = [positive_comments[i] for i in pos_idx_train]

        dataset = neg_train + pos_train

        comments = [x[0] for x in dataset]
        ratings = [x[1] for x in dataset]
        kfold = StratifiedKFold(n_splits=self.iterations,
                                shuffle=True,
                                random_state=seed)
        cvscores = []
        for train, test in kfold.split(comments, ratings):
            train_data = []
            for item in train:
                train_data.append(dataset[item])
            test_data = []
            for item in test:
                test_data.append(dataset[item])

            if self.classifier_type == 'nb':
                self.model = NaiveBayesClassifier.train(train_data)
            elif self.classifier_type == 'dt':
                self.model = DecisionTreeClassifier.train(train_data)

            scores = nltk.classify.util.accuracy(self.model, test_data)
            print("{}%".format(scores * 100))
            cvscores.append(scores * 100)
            # plot_model(model, to_file='model.png')

            if self.classifier_type == 'nb':
                self.model.show_most_informative_features()

        print("%.2f%% (+/- %.2f%%)" %
              (numpy.mean(cvscores), numpy.std(cvscores)))
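# Aside (not from the original): StratifiedKFold yields integer index
# arrays, which is why the loop above indexes back into `dataset`.
# A self-contained illustration:
from sklearn.model_selection import StratifiedKFold

X = ['a', 'b', 'c', 'd', 'e', 'f']              # any indexable samples
y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']  # labels used for stratification

for train_idx, test_idx in StratifiedKFold(
        n_splits=3, shuffle=True, random_state=123).split(X, y):
    print(train_idx, test_idx)  # each test fold keeps one 'pos' and one 'neg'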
Beispiel #32
0
def words_bag(words):
    return dict([(word,True) for word in words])

neg_list = movie_reviews.fileids('neg')
pos_list = movie_reviews.fileids('pos')
 
negfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'neg') for f in neg_list]
posfeats = [(words_bag(movie_reviews.words(fileids=[f])), 'pos') for f in pos_list]

'''gathering training and test data for decision trees''' 
negcutoff_train_dt = len(negfeats)
poscutoff_train_dt = len(posfeats)
training_data_dt = negfeats[:negcutoff_train_dt] + posfeats[:poscutoff_train_dt]


classifier_dt = DecisionTreeClassifier.train(training_data_dt)

print "Decision Trees"    
print 'train on %d instances:' % (len(training_data_dt))

sentence_list = []

#comments = "first half was good but oh boy the second was shit, overall good movie. no matter how many times I watch this, I still like it. must watch movie, i just want to touch the sweet panda"
while 1:
    comments = raw_input("Enter a review comment ending with a dot :")
    sentence_list = sent_tokenize(comments)
    for sentence in sentence_list:
        word_punct = wordpunct_tokenize(sentence)
        # one bag-of-words dict per sentence; passing a single token here
        # would make words_bag iterate over its characters
        input_cl = words_bag(word_punct)
        print sentence + "--->" + classifier_dt.classify(input_cl)
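# Aside: words_bag must receive a token list; handed a bare string it would
# iterate over characters rather than words, e.g.
#   words_bag(['good', 'movie'])  ->  {'good': True, 'movie': True}
#   words_bag('good')             ->  {'g': True, 'o': True, 'd': True}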
Beispiel #33
0
def classify(inputdir):
    #filenames = os.listdir('d:\\shir\\')
    filenames = os.listdir(inputdir)

    feat_set = []
    labeledlist = []  # collect (sense, words) across all files, not per file

    for name in filenames:
        # print name
        lineno = 0
        path = os.path.join(inputdir, name)
        sense = name.split('\\')[-1].split('.')[0]
        print 'training', sense

        file = codecs.open(path, 'r', 'utf-8')
        allwords = []
        for line in file:
            if len(line.split()) > 2:
                lineno += 1
                line = line.strip()
                words = []
                tags = []
                tokens = line.split()

                for item in tokens:
                    if len(item.split('\\')) == 2:
                        word = item.split('\\')[0]
                        tag = item.split('\\')[1]
                        words.append(word)
                        tags.append(tag)
                        allwords.append(word)
                feat_set.append((bag_of_bigrams_words(words), sense))
            # feat_set.append((context_feature(line),sense))
            else:
                words = []
                tags = []
        print lineno
        labeledlist.append((sense, allwords))

        #                feat_set.append((bigram_feature(allwords),sense))
        file.close()
    high_info_words = set(high_information_words(labeledlist))
    for item in high_info_words:
        print item

    random.shuffle(feat_set)

    train_data = train_feats(feat_set)
    test_data = test_feats(feat_set)
    print "training on " + str(len(train_data)) + " instances"
    print "testting on " + str(len(test_data)) + " instances"
    #classifier=  MaxentClassifier.train(train_data)
    # nb_classifier = NaiveBayesClassifier.train(train_data)
    dt_classifier = DecisionTreeClassifier.train(train_data,
                                                 entropy_cutoff=0.8,
                                                 depth_cutoff=7,
                                                 support_cutoff=10)
    # print dt_classifier.pp()
    # pickle.dump(classifier, classifier_save_file)
    entropy_classifier = MaxentClassifier.train(train_data,
                                                algorithm='iis',
                                                trace=0,
                                                max_iter=2,
                                                min_lldelta=0.5)
    print "nb accuracy "
    # print accuracy(nb_classifier, test_data) * 100
    # print "nb precision and recall"
    #        print precision_recall(nb_classifier,test_data)

    #    print   nb_classifier.show_most_informative_features()
    #        for item in  nb_classifier.most_informative_features():
    #            print item
    #   print "dt accuracy "+ str(accuracy(dt_classifier, test_data) * 100)
    print "entropy accuracy " + str(
        accuracy(entropy_classifier, test_data) * 100)
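# Aside (illustration, not from the original): the keyword arguments used
# above control when NLTK's decision tree stops growing -- roughly,
# entropy_cutoff stops refining a node whose label entropy is already low,
# depth_cutoff caps the tree depth, and support_cutoff stops refining nodes
# with too few training examples. A toy run:
from nltk.classify import DecisionTreeClassifier

toy_train = [({'endswith ing': True}, 'VBG'),
             ({'endswith ing': False}, 'NN'),
             ({'endswith ing': True}, 'VBG'),
             ({'endswith ing': False}, 'NN')]
toy_clf = DecisionTreeClassifier.train(toy_train, entropy_cutoff=0.8,
                                       depth_cutoff=7, support_cutoff=2)
print toy_clf.classify({'endswith ing': True})  # -> VBG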
Beispiel #34
0
'''
Created on Apr 25, 2010

@author: Ben
'''
from nltk import classify
from nltk.classify import DecisionTreeClassifier
from edu.zoller.nlp import common      

print 'Reading feature words...'
feature_words = common.read_tf_feature_words()

print 'Assembling training feature sets...'
train_set = []
for filename in common.train:
    year_class = common.get_40_year_class(filename)
    features = common.get_tf_features(filename, feature_words)
    train_set.append((features, year_class))
    
print 'Training classifier...'
classifier = DecisionTreeClassifier.train(train_set)
    
print 'Assembling test feature sets...'
test_set = []
for filename in common.test:
    year_class = common.get_40_year_class(filename)
    features = common.get_tf_features(filename, feature_words)
    test_set.append((features, year_class))

print 'Evaluating test accuracy...'
print classify.accuracy(classifier, test_set)
Beispiel #35
0
# In[31]:

#find most informative features
classifier.show_most_informative_features(n=10)


# In[32]:

#Run Decision Tree for Unigrams to find recall

from nltk.classify import DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier.train(train_set, 
                                             binary=True, 
                                             entropy_cutoff=0.8, 
                                             depth_cutoff=5, 
                                             support_cutoff=30)
refset = collections.defaultdict(set)
testset = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_set):
    refset[label].add(i)
    observed = dt_classifier.classify(feats)
    testset[observed].add(i)
print("UnigramDT Recall")
print('Bullying recall:', recall(refset['Bullying'], testset['Bullying']))
print("")


# In[33]:
Beispiel #36
0
    def train(self, features_label):
        self._classifier = DecisionTreeClassifier.train(
            features_label, entropy_cutoff=0.05, depth_cutoff=200, support_cutoff=20
        )
        return None
Beispiel #37
0
                          )  # recall = actual female / (actual female + missed female)
    recall_male = tn / (tn + fp)
    # F1 = 2PR / (P + R); the division must be parenthesised
    fscore_female = 2 * precision_female * recall_female / (precision_female + recall_female)
    fscore_male = 2 * precision_male * recall_male / (precision_male + recall_male)

    print(f"Precision for female names: {round(precision_female,2)}")
    print(f"Precision for male names: {round(precision_male,2)}")
    print(f"Recall for female names: {round(recall_female,2)}")
    print(f"Recall for male names: {round(recall_male,2)}")
    print(f"F-score for female names: {round(fscore_female,2)}")
    print(f"F-score for male names: {round(fscore_male,2)}")
    print("\n")


# Decision Tree
dec_tree = DecisionTreeClassifier.train(train_features)
print(
    f"The accuracy of the Decision Tree classifier is {round(accuracy(dec_tree, test_features),2)}"
)
performance(dec_tree)

# Naive Bayes
nb = NaiveBayesClassifier.train(train_features)
print(
    f"The accuracy of the Naive Bayes classifier is {round(accuracy(nb, test_features),2)}"
)
performance(nb)

# Maximum Entropy
max_ent = MaxentClassifier.train(
    train_features, trace=0)  # set trace to 0 to not print the log
Beispiel #38
0
        neg_features.append(k)

negcutoff = len(neg_features)*3//4
poscutoff = len(pos_features)*3//4
trainfeats = neg_features[:negcutoff] + pos_features[:poscutoff]
testfeats = neg_features[negcutoff:] + pos_features[poscutoff:]
print ('\n')
print('Total Training Instances - '+ str(len(trainfeats)))
print( 'Total Testing Instances - ' + str(len(testfeats)))

classifier = NaiveBayesClassifier.train(trainfeats)
print ('\n')
print('NaiveBayesClassifier accuracy:', nltk.classify.util.accuracy(classifier, testfeats))


classifier1 = DecisionTreeClassifier.train(trainfeats,entropy_cutoff=0)
print ('\n')
print('DecisionTreeClassifier accuracy:', nltk.classify.util.accuracy(classifier1, testfeats))

feature_names = ["polarity_nature","polarity_value"]
X = df[feature_names]
X.polarity_nature = X.polarity_nature.apply(lambda i: 0.0 if i == "neutral" else (1.0 if i == "positive" else -1.0))
df["status1"] = df.status.apply(lambda i: 0.0 if i==({u'fair': u'neutral'}, 1) else ( 1.0 if i==({u'fair': u'positive'}, 1) else -1.0))
y = df.status1
print (y.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.2)
print (len(X_train), len(X_test))


linreg = LinearRegression()