Example #1
def main():
	me=Classifier()
	feature_counter=Counter()
	feature_set=pickle.load(open('validation_set.pkl', 'rb'))
	feature_set_labels=[]
	for tweet, rating in feature_set:
		print rating
		try:
			float(rating)
		except:
			continue
		if float(rating)>0:
			label='positive'
		elif float(rating)<0:
			label='negative'
		else:
			label='neutral'
		feature_set_labels.append((tweet, label))
	feature_list=chain.from_iterable([word_tokenize(process_tweet(tweet)) for tweet, sentiment in feature_set_labels])
	for feat in feature_list:
		feature_counter[feat]+=1
	me.feature_list=[feat for feat, count in feature_counter.most_common(1000)]
	ts=[(me.extract_features(tweet), label) for tweet, label in feature_set_labels]
	print 'training Maxent'
	me.classifier=MaxentClassifier.train(ts)
	return me
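The Classifier class itself is not shown in this example; a minimal sketch of what its extract_features method presumably does with feature_list (an assumption, following the tokenization used above) is:

def extract_features(self, tweet):
    # assumed: binary presence features over the stored most-common-token vocabulary
    tokens = set(word_tokenize(process_tweet(tweet)))
    return {feat: (feat in tokens) for feat in self.feature_list}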
def main():
    #INTAKE DATA & BUILD TRAINING/TEST SETS
    reviews_corpus = parseData()
    set_corpus_frequencies(reviews_corpus)
    training_data = reviews_corpus[:26]
    test_data = reviews_corpus[26:]

    #BUILD MAXENT MODEL
    training_set = get_training_feats(training_data)
    classifier = MaxentClassifier.train(training_set)

    #CLASSIFY, EXTRACT & EVAL BY TOPIC
    scores = []
    baselines = []

    for topic in test_data:
        extracted_summary = extract_summary_for_topic(topic, classifier)
        random_summary = random.sample(topic['lines'], len(extracted_summary))

        score = RougeCalculator().score(extracted_summary, topic['gold_std'])
        baseline = RougeCalculator().score(random_summary, topic['gold_std'])

        scores.append(score)
        baselines.append(baseline)

        print "Summary for " + topic['topic'] + ':'
        print ''.join(extracted_summary)
        print "Rouge Score: " + str(score)

    print "Extracted Summary Rouge Average"
    print sum(scores) / len(scores)

    print "Baseline Summary Rouge Average"
    print sum(baselines) / len(baselines)
Example #3
def trainClassifier(data, config):
    words = []
    labels = []
    for sentenceData in data:
        words += sentenceToDictList(sentenceData[0], config)
        labels += sentenceData[1]
    classifier = MaxentClassifier.train(list(zip(words, labels)), algorithm, trace=0, max_iter=1000)
    return classifier
Example #4
def trainMaxentropy(trainFeatures, trainLabels):
    import shorttext
    from shorttext.classifiers import MaxEntClassifier

    classifier = MaxEntClassifier()
    clf = make_pipeline(DictVectorizer(sparse=True),
                        MaxentClassifier(encoding=None, weights=0))
    scores = cross_val_score(clf, trainFeatures, trainLabels, cv=5)
    clf.fit(trainFeatures, trainLabels)
    return clf, scores.mean(), scores
Example #5
def trainClassifier(data, config):
    words = []
    labels = []
    wordsSet = set()
    for sentenceDataList in data:
        for sentenceData in sentenceDataList:
            wordsSet |= set(sentenceData[0])
            words += sentenceToDictList(sentenceData[0], config)
            labels += sentenceData[1]
    classifier = MaxentClassifier.train(list(zip(words, labels)), algorithm, trace=0, max_iter=1000)
    return (classifier, wordsSet)
    def train(self):
        for sentence,tags in self.datasource:
            sentence_processed = self.nlp(u' '.join(sentence))
            for token in range(len(sentence)):
                self.featuresets.append((features.feature_compiler(token,sentence_processed),tags[token]))

        train_set, test_set = self.featuresets[0:-1000], self.featuresets[-1000:]
        pprint(train_set[:10])
        self.classifier = MaxentClassifier.train(train_set)

        #Saving the classifier
        self.save()
Example #7
def main():
	me=Classifier()
	feature_counter=Counter()
	feature_set=pickle.load(open('undersampled_emoticon.pkl', 'rb'))
	feature_list=chain.from_iterable([word_tokenize(process_tweet(tweet)) for tweet, sentiment in feature_set])
	for feat in feature_list:
		feature_counter[feat]+=1
	me.feature_list=[feat for feat, count in feature_counter.most_common(1000)]
	ts=[(me.extract_features(tweet), label) for tweet, label in feature_set]
	print 'training Maxent, algorithm CG'
	me.classifier=MaxentClassifier.train(ts)
	return me
def training(list_filename, model_name):
    # training in large data
    context_data = []
    for filename in list_filename:
        json_objects = read_jsonfile(filename)
        for json_object in json_objects:
            context_data.extend(get_context_sentence(json_object, 0))
    print('Done get contexts')
    m = MaxentClassifier.train(context_data, max_iter=100)
    with open(model_name, 'wb') as fmodel:
        pickle.dump(m, fmodel)
    print('Finish training maxent model')
Example #9
def trainGenderClassifier(model="NB"):
    my_names = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])
    shuffle(my_names)
    train_set = [(gender_features(n), g) for (n, g) in my_names]
    if model == "NB":
        nb_classifier = NaiveBayesClassifier.train(train_set)
        joblib.dump(nb_classifier, 'nb_gender_classifier.pkl')
    elif model == "ME":
        me_classifier = MaxentClassifier.train(train_set, "megam")
        joblib.dump(me_classifier, 'me_gender_classifier.pkl')
    else:
        raise ValueError(
            "Enter Model Type: Naive Bayes (NB) or Maximum Entropy (ME)")
Example #10
    def train(self, corpus_path, model_path):
        with open(corpus_path, "rb") as corpus_file:
            corpus = pickle.load(corpus_file)

        train_set = []
        for row in corpus:
            sentence = [value for (value, _) in row]
            history = []
            for i, (value, column) in enumerate(row):
                feature_set = self.db_row_features(sentence, i, history)
                train_set.append((feature_set, column))
                history.append(column)

        classifier = MaxentClassifier.train(train_set, max_iter=20)

        with open(model_path, "wb") as model_file:
            pickle.dump(classifier, model_file)
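A hedged sketch (not part of the original class) of greedy decoding with the saved model, assuming the same db_row_features(sentence, i, history) helper:

    def tag(self, sentence, model_path):
        # decode left to right, feeding previously predicted labels back in as history
        with open(model_path, "rb") as model_file:
            classifier = pickle.load(model_file)
        history = []
        for i in range(len(sentence)):
            history.append(classifier.classify(self.db_row_features(sentence, i, history)))
        return list(zip(sentence, history))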
def model_dev(func_name): 
	from nltk.corpus import names   
	names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
	random.shuffle(names)
	print "Length of dataset %d"%len(names)
	random.shuffle(names)
	random.shuffle(names)
	print "How the data set looks"
	print names[0:10]
	print "Testing the output of feature extraction"
	print "For name Gary -- %s"%func_name('Gary')
	featuresets = [(func_name(n), g) for (n, g) in names]
	print "length of featureset data %d"%len(featuresets)
	print featuresets[0:10]
	train_set, test_set = featuresets[500:], featuresets[:500]
	print "Length of train data %d"%len(train_set)
	print "length of test data %d"%len(test_set)
	time.sleep(10)
	os.system('clear')

	print "\n\nNaive Bayes Classification\n\n"
	nb_classifier = NaiveBayesClassifier.train(train_set)
	check_list=['Gary', 'Shivam', 'Grace', 'Sarah', 'Shaym', 'Richa', 'Abhisheyk']
	for name in check_list:
		print "Naive gender classification of ---%s --is-- %s---"%(name,nb_classifier.classify(func_name(name)))
	print "The accuracy of the naive classifier is"
	print classify.accuracy(nb_classifier, test_set)
	print "The most informative features are:"
	print nb_classifier.show_most_informative_features(5)

	time.sleep(10)
	os.system('clear')
	print "\n\nMaxent Classification\n\n"
	mod=MaxentClassifier.train(train_set)
	for name in check_list:
		print "Maxent gender classification of ---%s --is-- %s---"%(name,mod.classify(func_name(name)))
	print "The accuracy of maxent is"
	print classify.accuracy(mod, test_set)
	print "The most informative features are:"
	print mod.show_most_informative_features(5)
Example #12
def train(features, samples_proportion, classifier_choose):
    train_size = int(len(features) * samples_proportion)
    train_set, test_set = features[:train_size], features[train_size:]
    print('Training set size = ' + str(len(train_set)) + ' emails')
    print('Test set size = ' + str(len(test_set)) + ' emails')
    classifier = NaiveBayesClassifier.train(train_set)
    if classifier_choose == 1:
        classifier = NaiveBayesClassifier.train(train_set)
    elif classifier_choose == 2:
        classifier = SklearnClassifier(MultinomialNB()).train(train_set)
    elif classifier_choose == 3:
        classifier = SklearnClassifier(GaussianNB()).train(train_set)
    elif classifier_choose == 4:
        classifier = SklearnClassifier(BernoulliNB()).train(train_set)
    elif classifier_choose == 5:
        classifier = SklearnClassifier(SVC(), sparse=False).train(train_set)
    elif classifier_choose == 6:
        #You can choose the algorithm, and a third parameter sets how many iterations to run
        #More iterations tends to give better accuracy (probably; not entirely certain)
        classifier = MaxentClassifier.train(train_set,
                                            MaxentClassifier.ALGORITHMS[0])

    return train_set, test_set, classifier
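A minimal usage sketch (the feature list, split proportion, and classifier id are assumptions):

from nltk.classify.util import accuracy

train_set, test_set, classifier = train(features, 0.8, 6)  # 6 selects MaxentClassifier
print('Accuracy = ' + str(accuracy(classifier, test_set)))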
Example #13
def trainMaxEnt(fp):
    """
	Function that extracts all features from a training json file at [fp], train a MaxEnt 
	classifier based on these features, and return that classifier

	"""
    with open(fp, "r") as fileHandle:
        test_set = json.load(fileHandle)
    fileHandle.close()

    maxEntCorpus = []

    print("Extracting features from training file...")

    for title in test_set["data"]:
        for paragraph in title["paragraphs"]:
            sents = sent_tokenize(paragraph["context"])

            for question in paragraph["qas"]:
                q = question["question"]

                simFeature, candSent = genSimFeature(q, sents)
                atFeature = matchAT(extractAT(q), candSent)
                focusFeature = genFocusFeature(q, sents)

                features = {
                    simFeature: True,
                    atFeature: True,
                    focusFeature: True
                }

                if question["is_impossible"]:
                    maxEntCorpus.append((features, 0))
                else:
                    maxEntCorpus.append((features, 1))

    return MaxentClassifier.train(maxEntCorpus, max_iter=30)
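A hedged usage sketch (the file path is an assumption; the feature helpers are the same ones used above):

classifier = trainMaxEnt("train-v2.0.json")
sents = sent_tokenize("The quick brown fox jumps over the lazy dog. It was very quick.")
question = "How quick was the fox?"
simFeature, candSent = genSimFeature(question, sents)
feats = {simFeature: True,
         matchAT(extractAT(question), candSent): True,
         genFocusFeature(question, sents): True}
print(classifier.classify(feats))  # 1 = answerable, 0 = impossible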
Example #14
	def training_weight_iis(self, paragraph):
		train = []
		for index, data in enumerate(paragraph):
			sentence = sent_tokenize(data)
		# 1. Split the paragraph into sentences
			for index, data in enumerate(sentence):	
		# 2. Convert sentence to lower
				sent_lower = data.lower()
		# 3. Convert spelled-out numbers to digits
				sent_conv = self.func.terbilang_to_number(sent_lower)
				print "training kata [%s]"%sent_conv
		# 4. Stemming
				tokenize = word_tokenize(sent_conv)

				div_sentence = []
				for data in tokenize:
					if "/" not in data:
						# reduce the word to its root form
						sent_stem = self.stemmer.stem(data)
						data = sent_stem
					elif "/con" in data:
						# reduce to the root form, then match against the condition gazetteer
						sent_stem = self.stemmer.stem(self.w.search(data).group(1))
						data = sent_stem+"/CON"
					elif "/" in data:
						word = self.w.search(data).group(1)
						label = self.lbl.search(data).group(1)
						data = word+"/"+label.upper()
					div_sentence.append(data)
				train.append(" ".join(div_sentence))

		#print train
		#train on the sentences that have been reduced to root-word form
		me_classifier = MaxentClassifier.train(self.binary_feature(train, "train_iis"), 'iis', trace=100, max_iter=2000, min_lldelta=0.5)
		#print me_classifier.show_most_informative_features()
		return me_classifier
        keep_dup=False)
    print(len(X_train))

    X_test, Y_test = get_data_for_cognitive_classifiers(
        threshold=[0.75],
        what_type=['ada', 'os', 'bcl'],
        what_for='test',
        keep_dup=False)
    print('Loaded/Preprocessed data')

    train_set = [(features(X_train[i]), Y_train[i])
                 for i in range(len(X_train))]
    test_set = [(features(X_test[i]), Y_test[i]) for i in range(len(X_test))]

    if TRAIN:
        classifier = MaxentClassifier.train(train_set, max_iter=100)
        classifier.predict_proba = classifier.prob_classify
        pickle.dump(
            classifier,
            open(
                os.path.join(os.path.dirname(__file__),
                             'models/MaxEnt/maxent.pkl'), 'wb'))

    if not TRAIN:
        classifier = pickle.load(
            open(
                os.path.join(os.path.dirname(__file__),
                             'models/MaxEnt/maxent_85.pkl'), 'rb'))

    pred = []
    actual = [x[1] for x in test_set]
Example #16
    if sent:
        check = True
        for char in sent:
            if char not in alphabet:
                check = False
        if check:
            sent = sent + stop_char
            sentences.append(sent)
                
print('%d sentences after cleanup' %(len(sentences)))
print("")

print('getting data...')
sys.stdout.flush()
train_data = get_training_data(sentences)
print('done.')
print('training model...')
sys.stdout.flush()
model = MaxentClassifier.train(train_data, labels=alphabet)
print('done.')
print('pickling...')
sys.stdout.flush()
with open('maxentmodel.pickle', 'wb') as f:
    pickle.dump(model, f)
print('done')
sys.stdout.flush()

end = time.time()
print('%.2f seconds' %(end-start))
sys.stdout.flush()
    testSet.append(each[0])

#change these ints to change the test entries: should be safe up to 500 or so
#first one is used by bayes, second is used by maxent
a = 10
b = 111

print('Training Naive Bayes')
bayesClassifier = NaiveBayesClassifier.train(trainingSet)
print('Naive Bayes training complete')

print('Naive Bayes most important features:')
bayesClassifier.show_most_informative_features(5)

print('Dialogue:')
print(prelimTestSet[a])
print('Bayes classification:')
print(bayesClassifier.classify(testSet[a]))

print('Training Maximum Entropy')
maxEntClassifier = MaxentClassifier.train(trainingSet, max_iter=30)
print('Maximum Entropy training complete')

print('Maximum Entropy most important features:')
maxEntClassifier.show_most_informative_features(5)

print('Dialogue:')
print(prelimTestSet[b])
print('MaxEnt classification:')
print(maxEntClassifier.classify(testSet[b]))
Example #18
    if len(args) > 0:
        path = args[0]
    else:
        print "Usage: python train.py -m <path/to/model/file> path/to/training/data"
        sys.exit(2)

    # Check whether the path exists
    if not os.path.exists(path):
        print "The path \'%s\' does not exist. Try again!" % path
        sys.exit(2)
    elif not os.path.isfile(path):
        print "The path \'%s\' is not a file. Try again!" % path
        sys.exit(2)

    # Load dataset
    print "Loading training data..."
    dataset = np.load(path)

    # Training processing
    print "Training Maximum Entropy Model from the dataset \'%s\'" % path
    maxent = MaxentClassifier.train(dataset, max_iter=10)

    # Save model
    print "Saving model into file %s" % model
    with io.open(model, 'wb') as fmodel:
        pickle.dump(maxent, fmodel)

    # Finished?
    print "DONE!!"
Example #19
                f.write(tag)
                f.write("\n")


if __name__ == "__main__":
    # load files
    trainfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_train.pos-chunk-name"
    testfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_dev.pos-chunk"
    predictfilePath = "CONLL_NAME_CORPUS_FOR_STUDENTS/CONLL_test.pos-chunk"

    trainwords, trainTokenList = loadTrainData(trainfilePath)
    trainWord2VecFeature = generateWord2Vec(trainwords)

    testwords, testTokenList = loadTestData(testfilePath)
    testWord2VecFeature = generateWord2Vec(testwords)

    predictwords, predictTokenList = loadTestData(predictfilePath)
    predictWord2VecFeature = generateWord2Vec(predictwords)

    # train model
    trainToks = create_trainToks(trainTokenList,trainWord2VecFeature)
    model = MaxentClassifier.train(trainToks)

    # predict
    testFeatureSet = create_testFeatureSet(testTokenList, testWord2VecFeature)
    labels = predict(model, testFeatureSet)
    write_out(labels,"response.name")

    predictFeatureSet = create_testFeatureSet(predictTokenList, predictWord2VecFeature)
    labels = predict(model,predictFeatureSet)
    write_out(labels,"CONLL_test.name")
Example #20
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import sys
import math
from collections import defaultdict
from nltk import MaxentClassifier

# play outlook temperature humidity windy



class MaxEnt:
    def __init__(self):
        self._samples = []  # sample set; each element is a tuple [y, x1, x2, ..., xn]
        self._Y = set([])  # label set (the distinct values of y)
        self._numXY = defaultdict(int)  # key is an (xi, yi) pair, value is count(xi, yi)
        self._N = 0  # number of samples
        self._n = 0  # total number of (xi, yi) feature pairs
        self._xyID = {}  # sequential ID for each (x, y) pair; key is (xi, yi), value is the ID
        self._C = 0  # largest number of features in any sample, used in the IIS parameter iteration
        self._ep_ = []  # empirical (sample-distribution) expectation of each feature
        self._ep = []  # model-distribution expectation of each feature
        self._w = []  # weights for the n features
        self._lastw = []  # weights from the previous iteration
        self._EPS = 0.01  # convergence threshold

    def load_data(self, filename):
        for line in open(filename, "r"):
            sample = line.strip().split("\t")
            if len(sample) < 2:  # at least a label plus one feature
for index, tuples in df[["words", "pos", "tags"]].iterrows():
    word_tuple, pos_tuple, tag_tuple = tuples
    word_num = 0
    prev_tag = prev_tag1 = prev_tag2 = None
    for word_num in range(len(word_tuple)):
        feature = (extract_features(word_num, word_tuple, pos_tuple,
                                    [prev_tag2, prev_tag1, prev_tag]),
                   tag_tuple[word_num])
        features.append(feature)
        prev_tag2 = prev_tag1
        prev_tag1 = prev_tag
        prev_tag = tag_tuple[word_num]

# In[11]:

memm_classifier = MaxentClassifier.train(features, "megam")

# ## Testing Phase

# In[12]:

text = open("./goldoutput.txt").read()
lines = [y.strip() for y in text.split("\n\n")]
test_df = pd.DataFrame(lines, columns=["sentence"])
# test_df = dev_df.copy()
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(
    lambda x: tuple(y.split("\t") for y in x.split("\n")))
test_df.loc[:, "words_"] = test_df["sentence_token"].apply(
    lambda x: tuple(y[1] for y in x))
test_df.loc[:, "pos"] = test_df["words_"].apply(
    lambda x: tuple(x[1] for x in nltk.pos_tag(x)))
def build_model(training_features,preprocessed_validation_data ):
    algorithm = MaxentClassifier.ALGORITHMS[0]
    MaxEntClassifier = MaxentClassifier.train(training_features, algorithm,max_iter=10)
    predictions =  [MaxEntClassifier.classify(extract_tweet_features(tweet[0])) for tweet in preprocessed_validation_data]
    return MaxEntClassifier, predictions
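A hedged usage sketch (assuming each validation entry is a (text, label) pair, consistent with tweet[0] above):

model, predictions = build_model(training_features, preprocessed_validation_data)
gold_labels = [tweet[1] for tweet in preprocessed_validation_data]
correct = sum(1 for p, g in zip(predictions, gold_labels) if p == g)
print('Validation accuracy: %.3f' % (correct / float(len(gold_labels))))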
Example #23
def main():
    # if preprocessed data was stored previously, just load it
    if os.path.isfile('./data/processed/preptrainingdata.pickle') \
            and os.path.isfile('./data/processed/preptestdata.pickle'):
        preptrainingdata_f = open('./data/processed/preptrainingdata.pickle', 'r')
        preptrainingdata = pickle.load(preptrainingdata_f)

        preptestdata_f = open('./data/processed/preptestdata.pickle', 'r')
        preptestdata = pickle.load(preptestdata_f)

        preptrainingdata_f.close()
        preptestdata_f.close()

    else:
        # preprocess training and test data and store them
        trainingdatapath = './data/original/origintrainingdata.csv'
        testdatapath = './data/original/origintestdata.csv'

        preprocessor = Preprocessor(trainingdatapath, testdatapath)

        [training, test] = preprocessor.read_data(2000, 2000)

        # preprocessing step
        for row in training+test:
            row[0] = preprocessor.preprocess(row[0])

        preptrainingdata = training
        preptestdata = test

        # store preprocessed training data
        save_documents = open('./data/processed/preptrainingdata.pickle', 'w')
        pickle.dump(preptrainingdata, save_documents)
        save_documents.close()

        # store preprocessed test data
        save_documents = open('./data/processed/preptestdata.pickle', 'w')
        pickle.dump(preptestdata, save_documents)
        save_documents.close()

    if os.path.isfile('./data/processed/trainingfeaset.pickle') \
            and os.path.isfile('./data/processed/testfeaset.pickle')\
            and os.path.isfile('./data/processed/word_features.pickle'):

        trainingfeaset_f = open('./data/processed/trainingfeaset.pickle', 'r')
        trainingfeaset = pickle.load(trainingfeaset_f)

        testfeaset_f = open('./data/processed/testfeaset.pickle', 'r')
        testfeaset = pickle.load(testfeaset_f)

        word_features_f = open('./data/processed/word_features.pickle', 'r')
        word_features = pickle.load(word_features_f)

        trainingfeaset_f.close()
        testfeaset_f.close()
        word_features_f.close()

    else:
        # feature extraction and feature set construction and store them
        fea_extractor = FeatureExtractor()
        all_words = []

        for row in preptrainingdata+preptestdata:
            all_words.extend(fea_extractor.getfeavector(row[0]))

        word_features = fea_extractor.getfeatures(all_words, 4000)

        del all_words  # release some memory

        trainingfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1]) for row in preptrainingdata]
        testfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1]) for row in preptestdata]

        # random.shuffle(trainingfeaset)
        # random.shuffle(testfeaset)

        save_documents = open('./data/processed/word_features.pickle', 'w')
        pickle.dump(word_features, save_documents)
        save_documents.close()

        save_documents = open('./data/processed/trainingfeaset.pickle', 'w')
        pickle.dump(trainingfeaset, save_documents)
        save_documents.close()

        save_documents = open('./data/processed/testfeaset.pickle', 'w')
        pickle.dump(testfeaset, save_documents)
        save_documents.close()

    # Naive Bayes
    if os.path.isfile('./data/processed/NB_classifier.pickle'):
        NB_classifier_f = open("./data/processed/NB_classifier.pickle", "r")
        NB_classifier = pickle.load(NB_classifier_f)
        NB_classifier_f.close()

    else:
        NB_classifier = nltk.NaiveBayesClassifier.train(trainingfeaset)
        save_classifier = open("./data/processed/NB_classifier.pickle", "w")
        pickle.dump(NB_classifier, save_classifier)
        save_classifier.close()

    print("Naive Bayes Classifier accuracy percent:", (nltk.classify.accuracy(NB_classifier, testfeaset)) * 100)
    print NB_classifier.show_most_informative_features(10)

    # Maximum Entropy
    if os.path.isfile('./data/processed/MaxEntClassifier.pickle'):
        MaxEntClassifier_f = open('./data/processed/MaxEntClassifier.pickle','r')
        MaxEntClassifier = pickle.load(MaxEntClassifier_f)
        MaxEntClassifier_f.close()

    else:
        MaxEntClassifier = MaxentClassifier.train(trainingfeaset, algorithm='GIS', max_iter=10)
        save_classifier = open("./data/processed/MaxEntClassifier2.pickle", "w")
        pickle.dump(MaxEntClassifier, save_classifier)
        save_classifier.close()

    print "MaxEnt Classifier accuracy percent:", nltk.classify.accuracy(MaxEntClassifier, testfeaset)
    print MaxEntClassifier.show_most_informative_features(10)

    fea_extractor = FeatureExtractor()
    trainingset = fea_extractor.construct_svm_feaset(preptrainingdata, word_features)
    problem = svm_problem(trainingset['labels'], trainingset['feature_vectors'])
    param = svm_parameter('-q')
    param.kernel_type = LINEAR
    svm_classifier = svm_train(problem, param)
    svm_save_model('./data/svm_classifier', svm_classifier)

    testset = fea_extractor.construct_svm_feaset(preptestdata, word_features)
    p_labels, p_accs, p_vals = svm_predict(testset['labels'], testset['feature_vectors'], svm_classifier)

    print p_labels
    print p_accs
Example #24
 def train(self):
     feature_set = list()
     for prop in self.word_prop:
         for feats in self.word_prop[prop]:
             feature_set.append((feats, prop))
     self.model = MaxentClassifier.train(feature_set, "gis", max_iter=10)
Example #25
## lists
# merge lists: http://stackoverflow.com/questions/252703/python-append-vs-extend
x = [1,2,3]
x.append([4,5])  # append a list as an item in list x
x.extend([4,5])  # adds 4 and 5 as separate elements in list x

# frequency count over a list
from collections import Counter
Counter(['apple','red','apple','red','red','pear'])


## to train Max Entropy classifier using MegaM
import nltk
from nltk import MaxentClassifier
nltk.config_megam('/Users/andrewcaines/Downloads/megam_0.92/megam')
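# trainfeats is assumed to be the usual NLTK list of (featureset, label) pairs, e.g.
trainfeats = [({'contains(good)': True}, 'pos'),
              ({'contains(awful)': True}, 'neg')]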
classifier = MaxentClassifier.train(trainfeats, 'megam')


## strings with (u'x') for unicode
[item.decode('UTF-8') if isinstance(item, basestring) else item for item in listx]

## range of numbers
range(0, 10)

## sequence of numbers
import numpy
numpy.arange(0, 10, 2)


## average
numpy.mean([1, 2, 3])
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features


def gender_features3(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}


if __name__ == '__main__':
    print("Lab 3 - Exercise 2")
    data = get_data()
    train_set = apply_features(gender_features3, data[500:])
    test_set = apply_features(gender_features3, data[:500])

    print("Training classifiers")
    # Train the different classifiers on the training set
    classifiers = [(NaiveBayesClassifier.train(train_set), "NaiveBayes"),
                   (DecisionTreeClassifier.train(train_set), "DecisionTree"),
                   (MaxentClassifier.train(train_set, max_iter=10, trace=0), "MaxEntropy")]

    # Test all classifiers on the test set
    for classifier, name in classifiers:
        acc = accuracy(classifier, test_set)
        print("{} classifier test accuracy: {}".format(name, acc))
def main():
    data = []
    with open('data-1_train.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            data.append(row)
    fields = data[0]
    data = np.array(data[1:], dtype=object)
    print(data.shape, fields)
    words = filterData(data)
    print(words.shape)

    x_train = []
    y_train = []
    x_train_aspect = []
    for i in range(len(data)):
        x_train.append(words[i][1])
        y_train.append(data[i][4])
        x_train_aspect.append(data[i][2])

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train_aspect = np.array(x_train_aspect)

    print('here')
    print(x_train[0])
    print('here')
    print(y_train[0:10])

    features = Features(x_train, x_train_aspect)
    print('printing features')
    print(features)
    print('Length: ', len(features), type(features))

    features = set(features)
    print('Length2: ', len(features))

    # 10-Fold Cross Validation
    kf = KFold(n_splits=10)
    kf.get_n_splits(x_train)

    for train_index, test_index in kf.split(x_train):
        print(type(train_index))
        print(type(x_train))
        errors = 0
        x_train_kf, x_test_kf = x_train[train_index], x_train[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
        print(type(x_train_aspect))
        x_train_aspect_kf = x_train_aspect[train_index]

        fv = Features(x_train_kf, x_train_aspect_kf)
        x_train_maxent = train_data(fv, x_train_kf, y_train_kf)
        print('Train feature vectors created')

        x_test_maxent = test_data(fv, x_test_kf, y_test_kf)
        print('Test feature vectors created')

        mec = MaxentClassifier.train(x_train_maxent)
        print('train finish')

        for featureset, label in zip(x_test_maxent, y_test_kf):
            if (mec.classify(featureset) != label):
                errors += 1

        print("Accuracy: %f" % (1 - (errors / float(len(y_test_kf)))))
import pickle


if __name__ == "__main__":
    '''
    init the program, prepare input, output
    '''
    in_file =  codecs.open(sys.argv[1],encoding='utf-8',mode='r')

    
    '''
    label our train data
    '''
    lines = in_file.readlines()
    labeled_entries = flat_list(map(get_labeled, lines))
    
    '''
    train a classifier
    '''
    mx_classifier = MaxentClassifier.train(labeled_entries)
    
    '''
    save the classifier to the disk
    '''
    mx_file = open('mx_classifier.pkl', 'wb')
    pickle.dump(mx_classifier, mx_file)
    mx_file.close()

    in_file.close()

    mx_classifier.show_most_informative_features(5)
 def _train(self, txs, tys):
     #rid2shard = ST.random_shardlize(10, len(self._train_xs))
     train_set = [(self._feature_encoding(txt), tag) for txt, tag in zip(txs, tys)]
     return MaxentClassifier.train(train_set, algorithm='iis', max_iter=4)
Example #31
 def __init__(self):
     self.train_set, self.test_set = utils.chunked_training_dataset()
     print('Ngram chunk tagger training started')
     self.classifier = MaxentClassifier.train(
         self.__transformed_training_set())
     print('Ngram chunk tagger training completed')
    else:
        features['left_neighbor_len'] = "0"
        features['left_neighbor_digit'] = "False"
        features['left_neighbor_title'] = "False"

    if nxt != PARAGRAPH:
        features['right_neighbor_len'] = "%s" % len(nxt)
        features['right_neighbor_digit'] = "%s" % isdigit(nxt)
        features['right_neighbor_title'] = "%s" % title(nxt)
        features['paragraph_end'] = "False"
    else:
        features['right_neighbor_len'] = "0"
        features['right_neighbor_digit'] = "False"
        features['right_neighbor_title'] = "False"
        features['paragraph_end'] = "True"

    return features


if __name__ == "__main__":
    tree = "try_train.xml"

    data = collect_classified_data(tree)

    train_set, test_set = data, data
    me_classifier = MaxentClassifier.train(train_set)

    test_ex = test_set[0][0]
    print(test_ex)
    print(me_classifier.classify(test_ex))
print(gender_features('Gary'))

featuresets = [(gender_features(n), g) for (n, g) in names]
print(featuresets[0:10])

train_set, test_set = featuresets[500:], featuresets[:500]
print(len(train_set), len(test_set))

nb_classifier = NaiveBayesClassifier.train(train_set)
print(nb_classifier.classify(gender_features('Gary')))
print(nb_classifier.classify(gender_features('Grace')))

print(classify.accuracy(nb_classifier, test_set))
nb_classifier.show_most_informative_features(5)

me_classifier = MaxentClassifier.train(train_set)

me_classifier.classify(gender_features('Gary'))
me_classifier.classify(gender_features('Grace'))
print(classify.accuracy(me_classifier, test_set))

me_classifier.show_most_informative_features(5)

def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
def main():
    # for feature extraction
    fea_extractor = FeatureExtractor()

    # if preprocessed data was stored previously, just load it
    # for what is mean by "preprocessed", refer to preprocess method in preproc_fea_extraction.py
    if os.path.isfile('./data/processed/preptrainingdata.pickle') \
            and os.path.isfile('./data/processed/preptestdata.pickle'):

        print "preptrainingdata and preptestdata detected, load files..."

        preptrainingdata_f = open('./data/processed/preptrainingdata.pickle', 'r')
        preptrainingdata = pickle.load(preptrainingdata_f)

        preptestdata_f = open('./data/processed/preptestdata.pickle', 'r')
        preptestdata = pickle.load(preptestdata_f)

        preptrainingdata_f.close()
        preptestdata_f.close()

    else:

        print "no preptrainingdata and preptestdata detected, create from scratch..."

        # preprocess training and test data and store them
        trainingdatapath = './data/original/origintrainingdata.csv'
        testdatapath = './data/original/origintestdata.csv'

        preprocessor = Preprocessor(trainingdatapath, testdatapath)

        [training, test] = preprocessor.read_data(2000, 2000)

        print "reading training data and all test data done..."

        print "length of training", len(training)

        # preprocessing step
        for row in training+test:
            row[0] = preprocessor.preprocess(row[0])

        preptrainingdata = training
        preptestdata = test

        print "preprocessing done..."

        # store preprocessed training data
        save_documents = open('./data/processed/preptrainingdata.pickle', 'w')
        pickle.dump(preptrainingdata, save_documents)
        save_documents.close()

        # store preprocessed test data
        save_documents = open('./data/processed/preptestdata.pickle', 'w')
        pickle.dump(preptestdata, save_documents)
        save_documents.close()

    # if training feature set and test feature set are stored previously, just load them
    # these feature set are used by Naive Bayes and Maximum Entropy
    # word_features contains the names of features (which are words)
    # e.g. a word is a feature, feature name is the word, value is True or False
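    # Illustrative shapes only (an assumption, not part of the original pipeline):
    #   word_features     -> ['good', 'bad', 'movie', ...]
    #   one training item -> ({'good': True, 'bad': False, ...}, 'positive')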
    if os.path.isfile('./data/processed/trainingfeaset.pickle') \
            and os.path.isfile('./data/processed/testfeaset.pickle')\
            and os.path.isfile('./data/processed/word_features.pickle'):

        print "trainingfeaset, testfeaset and word_features detected, load files..."

        trainingfeaset_f = open('./data/processed/trainingfeaset.pickle', 'r')
        trainingfeaset = pickle.load(trainingfeaset_f)

        testfeaset_f = open('./data/processed/testfeaset.pickle', 'r')
        testfeaset = pickle.load(testfeaset_f)

        word_features_f = open('./data/processed/word_features.pickle', 'r')
        word_features = pickle.load(word_features_f)

        trainingfeaset_f.close()
        testfeaset_f.close()
        word_features_f.close()

    else:

        print "no trainingfeaset, testfeaset and word_features detected, create from scratch..."

        # feature extraction and feature set construction and store them
        all_words = []

        for row in preptrainingdata+preptestdata:
            all_words.extend(fea_extractor.get_feavector(row[0]))

        print "generating all_words done..."
        print "start generating word_features..."

        # set desired # of features in the second parameter
        word_features = fea_extractor.get_features(all_words, 5000)

        print "generating word_features done..."

        del all_words  # release some memory

        trainingfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1]) for row in preptrainingdata]
        testfeaset = [(fea_extractor.construct_feaset(row[0], word_features), row[1]) for row in preptestdata]

        print "generating trainingfeaset and testfeaset done... great progress!"

        # random.shuffle(trainingfeaset)
        # random.shuffle(testfeaset)

        save_documents = open('./data/processed/word_features.pickle', 'w')
        pickle.dump(word_features, save_documents)
        save_documents.close()

        save_documents = open('./data/processed/trainingfeaset.pickle', 'w')
        pickle.dump(trainingfeaset, save_documents)
        save_documents.close()

        save_documents = open('./data/processed/testfeaset.pickle', 'w')
        pickle.dump(testfeaset, save_documents)
        save_documents.close()

        print "storing training and test featureset files done..."

    # Naive Bayes
    print "Naive Bayes start..."

    if os.path.isfile('./data/processed/NB_classifier.pickle'):
        NB_classifier_f = open("./data/processed/NB_classifier.pickle", "r")
        NB_classifier = pickle.load(NB_classifier_f)
        NB_classifier_f.close()

    else:
        start = time.time()
        NB_classifier = nltk.NaiveBayesClassifier.train(trainingfeaset)
        NB_trainingtime = time.time() - start

        print "Naive Bayes training time:", NB_trainingtime

        save_classifier = open("./data/processed/NB_classifier.pickle", "w")
        pickle.dump(NB_classifier, save_classifier)
        save_classifier.close()

    print "Naive Bayes Classifier accuracy percent:", (nltk.classify.accuracy(NB_classifier, testfeaset)) * 100
    print NB_classifier.show_most_informative_features(10)

    # Maximum Entropy
    print "Maximum Entropy start..."

    if os.path.isfile('./data/processed/MaxEntClassifier.pickle'):
        MaxEntClassifier_f = open('./data/processed/MaxEntClassifier.pickle','r')
        MaxEntClassifier = pickle.load(MaxEntClassifier_f)
        MaxEntClassifier_f.close()

    else:
        start = time.time()
        MaxEntClassifier = MaxentClassifier.train(trainingfeaset, algorithm='GIS', max_iter=10)
        MaxEnt_trainingtime = time.time() - start

        print "Maximum Entropy training time:", MaxEnt_trainingtime

        save_classifier = open("./data/processed/MaxEntClassifier.pickle", "w")
        pickle.dump(MaxEntClassifier, save_classifier)
        save_classifier.close()

    print "MaxEnt Classifier accuracy percent:", nltk.classify.accuracy(MaxEntClassifier, testfeaset)
    print MaxEntClassifier.show_most_informative_features(10)

    # SVM
    print "SVM start..."

    testset = fea_extractor.construct_svm_feaset(preptestdata, word_features)

    if os.path.isfile('./data/processed/svm_classifier.model'):

        svm_classifier = svm_load_model('./data/processed/svm_classifier.model')

    else:

        trainingset = fea_extractor.construct_svm_feaset(preptrainingdata, word_features)

        problem = svm_problem(trainingset['labels'], trainingset['feature_vectors'])
        param = svm_parameter('-q')
        param.kernel_type = LINEAR

        start = time.time()
        svm_classifier = svm_train(problem, param)
        svm_trainingtime = time.time() - start

        print "SVM training time:", svm_trainingtime

        svm_save_model('./data/processed/svm_classifier.model', svm_classifier)

    p_labels, p_accs, p_vals = svm_predict(testset['labels'], testset['feature_vectors'], svm_classifier)

    print p_labels
    print p_accs
Example #35
def train_classifier(data):
    me_classifier = MaxentClassifier.train(data)
    return me_classifier