Beispiel #1
0
def main_svmlight():
    # copied:
    import svmlight
    import pdb
    
    training_data = syntheticData(30, 1)
    test_data     = syntheticData(30, 1)
    #training_data = __import__('data').train0
    #test_data = __import__('data').test0

    print 'HERE 0'
    print 'training_data is', training_data
    print 'test_data is', test_data

    # train a model based on the data
    #pdb.set_trace()
    print 'HERE 1'
    model = svmlight.learn(training_data, type='regression', kernelType=2, verbosity=3)
    print 'HERE 2'

    # model data can be stored in the same format SVM-Light uses, for interoperability
    # with the binaries.
    svmlight.write_model(model, 'my_model.dat')
    print 'HERE 3'

    # classify the test data. this function returns a list of numbers, which represent
    # the classifications.
    #predictions = svmlight.classify(model, test_data)
    pdb.set_trace()
    predictions = svmlight.classify(model, training_data)
    print 'HERE 4'
    
    for p,example in zip(predictions, test_data):
        print 'pred %.8f, actual %.8f' % (p, example[0])
Beispiel #2
0
def training_model(ind, n=3):
    """Train a binary SVM-Light classifier on n-gram counts of "pos"/"neg" files.

    Args:
        ind: object providing ``get_pos_train_ind()`` and
            ``get_neg_train_ind()``, each yielding integer positions into the
            listing of the ``pos`` and ``neg`` directories respectively.
        n: n-gram order handed to ``load_features`` and ``ngrams.ngrams``.

    Returns:
        The model returned by ``svmlight.learn``; it is also written to
        ``my_model.dat``.
    """
    def labelled_examples(label, directory, indices):
        # Build (label, [(feature_id, count), ...]) pairs for the chosen files.
        # The directory is listed once so every index refers to the same
        # snapshot (and we avoid re-listing it for each file).
        filenames = os.listdir(directory)
        examples = []
        for i in indices:
            # Close each file as soon as it has been read.
            with open(os.path.join(directory, filenames[i])) as handle:
                counts = ngrams.ngrams(n, handle.read())
            examples.append(
                (label, [(fmap.getID(gram), count)
                         for gram, count in counts.items()
                         if fmap.hasFeature(gram)]))
        return examples

    print("Loading features")
    load_features(n, fmap)
    print("Feature map size: %s" % fmap.getSize())
    print("Getting training data")
    train = labelled_examples(1, "pos", ind.get_pos_train_ind())
    train += labelled_examples(-1, "neg", ind.get_neg_train_ind())
    print("Training model")
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
Beispiel #3
0
def run_svm(article_count, feature_functions, kernel='polynomial', split=0.9, model_path='svm.model'):
    """Train an SVM-Light DEF/INDEF classifier and report how it scores on held-out data.

    Tokens tagged 'DEF' become +1 examples and tokens tagged 'INDEF' become -1
    examples; every other tag is ignored.  Each example's features are the
    distinct ids returned by the dictionary, all with value 1.  The data is
    divided by ``bifurcate(data, split, shuffle=True)``, a classification
    model is trained on the first part, saved to ``model_path`` and evaluated
    on the second part by comparing the sign of each prediction with the sign
    of the gold label.

    Returns a dict with the article/token/train/test counts, the kernel name,
    and the ``correct`` / ``wrong`` / ``total`` tallies.
    """
    # https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count, feature_functions)

    dictionary = Dictionary()
    dictionary.add_one('ZZZZZ')  # so that no features are labeled 0

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag != 'DEF' and tag != 'INDEF':
                    continue
                unique_ids = sorted(set(dictionary.add(token_features)))
                feature_values = zip(unique_ids, [1] * len(unique_ids))
                if tag == 'DEF':
                    label = +1
                else:
                    label = -1
                data.append((label, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    #####################
    # do svm in Python...
    #
    # svmlight.learn options
    # type: select between 'classification', 'regression', 'ranking' (preference ranking), and 'optimization'.
    # kernel: select between 'linear', 'polynomial', 'rbf', and 'sigmoid'.
    # verbosity: set the verbosity level (default 0).
    # C: trade-off between training error and margin.
    # poly_degree: parameter d in polynomial kernel.
    # rbf_gamma: parameter gamma in rbf kernel.
    # coef_lin
    # coef_const
    # costratio (corresponds to -j option to svm_learn)
    model = svmlight.learn(train, type='classification', kernel=kernel)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)

    # The classifier is given a placeholder label of 0 for every test vector.
    test_pairs = []
    for feature_values in test_feature_values:
        test_pairs.append((0, feature_values))
    predictions = svmlight.classify(model, test_pairs)

    gold_is_positive = [(gold > 0) for gold in gold_labels]
    predicted_positive = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_is_positive, predicted_positive)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
Beispiel #4
0
    def train(self):
        """Learn model weights from training instances.

        Trains an SVM-Light ranking model on ``self._training_data`` and
        stores it in ``self._svmmodel``.  The model is then written to a
        temporary file so that ``self._recover_weights`` can read it back;
        the temporary file is removed afterwards, even if writing or weight
        recovery fails.
        """

        # Train using svmlight
        self._svmmodel = svmlight.learn(self._training_data, type='ranking')

        # Write svmlight output to a temp file and recover weights.
        # delete=False keeps the file on disk after close() so it can be
        # re-read by name; we are therefore responsible for removing it.
        modelout = NamedTemporaryFile(delete=False)
        try:
            svmlight.write_model(self._svmmodel, modelout.name)
            modelout.close()
            self._recover_weights(modelout.name)
        finally:
            # close() is safe to call twice; always clean up the temp file.
            modelout.close()
            remove(modelout.name)
Beispiel #5
0
def trainall():
    """
    Train one SVM model for each of the 10 digits 0-9.
    :return:
    """
    for i in range(10):
        print "training ", i
        # totrain(i) supplies the training data for digit i.
        training_data = totrain(i)
        model = svmlight.learn(training_data, type="classification", verbosity=0)
        # Each digit's model is stored in its own file: model/0 ... model/9.
        model_name = "model/" + str(i)
        svmlight.write_model(model, model_name)  # write model

        """
Beispiel #6
0
def train(fnames, topics):
    """Build ranking training data, dump it to TRAINING_DATA and train a linear model.

    The training data comes from ``init_train_data(fnames, topics)``.  A
    pretty-printed copy is written to the file named by ``TRAINING_DATA``,
    the trained model is saved as ``ef_model.dat`` and finally
    ``ZC.dump_cache()`` is called.
    """
    training_data = init_train_data(fnames, topics)
    print ('[ train ] ===================')

    # Keep a human-readable copy of exactly what the model was trained on.
    with open(TRAINING_DATA, 'w') as dump_file:
        dump_file.write(pprint.pformat(training_data) + '\n')

    # train a model based on the data
    learn_options = {'type': 'ranking', 'kernel': 'linear', 'verbosity': 0}
    model = svmlight.learn(training_data, **learn_options)

    # model data can be stored in the same format SVM-Light uses, for interoperability
    # with the binaries.
    svmlight.write_model(model, 'ef_model.dat')
    ZC.dump_cache()
Beispiel #7
0
def training_model(ind,n=3):
    """Train a binary SVM-Light classifier on n-gram counts of "pos"/"neg" files.

    Args:
        ind: object providing ``get_pos_train_ind()`` and
            ``get_neg_train_ind()``, each yielding integer positions into the
            listing of the ``pos`` and ``neg`` directories respectively.
        n: n-gram order handed to ``load_features`` and ``ngrams.ngrams``.

    Returns:
        The model returned by ``svmlight.learn``; it is also written to
        ``my_model.dat``.
    """
    def labelled_examples(label, directory, indices):
        # Build (label, [(feature_id, count), ...]) pairs for the chosen files.
        # The directory is listed once so every index refers to the same
        # snapshot (and we avoid re-listing it for each file).
        filenames = os.listdir(directory)
        examples = []
        for i in indices:
            # Close each file as soon as it has been read.
            with open(os.path.join(directory, filenames[i])) as handle:
                counts = ngrams.ngrams(n, handle.read())
            examples.append(
                (label, [(fmap.getID(gram), count)
                         for gram, count in counts.items()
                         if fmap.hasFeature(gram)]))
        return examples

    print("Loading features")
    load_features(n, fmap)
    print("Feature map size: %s" % fmap.getSize())
    print("Getting training data")
    train = labelled_examples(1, "pos", ind.get_pos_train_ind())
    train += labelled_examples(-1, "neg", ind.get_neg_train_ind())
    print("Training model")
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
Beispiel #8
0
def train(fnames, topics):
    """Build ranking training data, dump it to TRAINING_DATA and train a linear model.

    The training data comes from ``init_train_data(fnames, topics)``.  A
    pretty-printed copy is written to the file named by ``TRAINING_DATA``,
    the trained model is saved as ``ef_model.dat`` and finally
    ``ZC.dump_cache()`` is called.
    """
    training_data = init_train_data(fnames, topics)
    print('[ train ] ===================')

    # Keep a human-readable copy of exactly what the model was trained on.
    with open(TRAINING_DATA, 'w') as dump_file:
        dump_file.write(pprint.pformat(training_data) + '\n')

    # train a model based on the data
    learn_options = {'type': 'ranking', 'kernel': 'linear', 'verbosity': 0}
    model = svmlight.learn(training_data, **learn_options)

    # model data can be stored in the same format SVM-Light uses, for interoperability
    # with the binaries.
    svmlight.write_model(model, 'ef_model.dat')
    ZC.dump_cache()
Beispiel #9
0
def test_svmlight():
    """Smoke-test pysvmlight on a tiny hand-written data set.

    Trains a classification model on four +1 and six -1 examples, saves it to
    'my_model.dat', then prints the score of two unlabelled (label 0) test
    vectors with eight decimal places.
    """
    # Feature vectors are lists of (feature_id, value) pairs.
    positives = [
        [(1,2),(2,5),(3,6),(5,1),(4,2),(6,1)],
        [(1,2),(2,1),(3,4),(5,3),(4,1),(6,1)],
        [(1,2),(2,2),(3,4),(5,1),(4,1),(6,1)],
        [(1,2),(2,1),(3,3),(5,1),(4,1),(6,1)],
    ]
    negatives = [
        [(1,2),(2,1),(3,1),(5,3),(4,2),(6,1)],
        [(1,1),(2,1),(3,1),(5,3),(4,1),(6,1)],
        [(1,1),(2,2),(3,1),(5,3),(4,1),(6,1)],
        [(1,1),(2,1),(3,1),(5,1),(4,3),(6,1)],
        [(1,2),(2,1),(3,1),(5,2),(4,1),(6,5)],
        [(7,10)],
    ]
    unlabelled = [
        [(1,2),(2,6),(3,4),(5,1),(4,1),(6,1)],
        [(1,2),(2,6),(3,4)],
    ]

    training_data = [(1, vec) for vec in positives] + [(-1, vec) for vec in negatives]
    test_data = [(0, vec) for vec in unlabelled]

    model = svmlight.learn(training_data, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    for score in svmlight.classify(model, test_data):
        print('%.8f' % score)
Beispiel #10
0
def test_svmlight():
    """Smoke-test pysvmlight on a tiny hand-written data set.

    Trains a classification model on four +1 and six -1 examples, saves it to
    'my_model.dat', then prints the score of two unlabelled (label 0) test
    vectors with eight decimal places.
    """
    # Feature vectors are lists of (feature_id, value) pairs.
    positives = [
        [(1, 2), (2, 5), (3, 6), (5, 1), (4, 2), (6, 1)],
        [(1, 2), (2, 1), (3, 4), (5, 3), (4, 1), (6, 1)],
        [(1, 2), (2, 2), (3, 4), (5, 1), (4, 1), (6, 1)],
        [(1, 2), (2, 1), (3, 3), (5, 1), (4, 1), (6, 1)],
    ]
    negatives = [
        [(1, 2), (2, 1), (3, 1), (5, 3), (4, 2), (6, 1)],
        [(1, 1), (2, 1), (3, 1), (5, 3), (4, 1), (6, 1)],
        [(1, 1), (2, 2), (3, 1), (5, 3), (4, 1), (6, 1)],
        [(1, 1), (2, 1), (3, 1), (5, 1), (4, 3), (6, 1)],
        [(1, 2), (2, 1), (3, 1), (5, 2), (4, 1), (6, 5)],
        [(7, 10)],
    ]
    unlabelled = [
        [(1, 2), (2, 6), (3, 4), (5, 1), (4, 1), (6, 1)],
        [(1, 2), (2, 6), (3, 4)],
    ]

    training_data = [(1, vec) for vec in positives] + [(-1, vec) for vec in negatives]
    test_data = [(0, vec) for vec in unlabelled]

    model = svmlight.learn(training_data, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    for score in svmlight.classify(model, test_data):
        print('%.8f' % score)
Beispiel #11
0
def main_svmlight():
    # copied:
    import svmlight
    import pdb

    training_data = syntheticData(30, 1)
    test_data = syntheticData(30, 1)
    #training_data = __import__('data').train0
    #test_data = __import__('data').test0

    print 'HERE 0'
    print 'training_data is', training_data
    print 'test_data is', test_data

    # train a model based on the data
    #pdb.set_trace()
    print 'HERE 1'
    model = svmlight.learn(training_data,
                           type='regression',
                           kernelType=2,
                           verbosity=3)
    print 'HERE 2'

    # model data can be stored in the same format SVM-Light uses, for interoperability
    # with the binaries.
    svmlight.write_model(model, 'my_model.dat')
    print 'HERE 3'

    # classify the test data. this function returns a list of numbers, which represent
    # the classifications.
    #predictions = svmlight.classify(model, test_data)
    pdb.set_trace()
    predictions = svmlight.classify(model, training_data)
    print 'HERE 4'

    for p, example in zip(predictions, test_data):
        print 'pred %.8f, actual %.8f' % (p, example[0])
Beispiel #12
0
def run_svm(article_count,
            feature_functions,
            kernel='polynomial',
            split=0.9,
            model_path='svm.model'):
    """Train an SVM-Light DEF/INDEF classifier and report how it scores on held-out data.

    Tokens tagged 'DEF' become +1 examples and tokens tagged 'INDEF' become -1
    examples; every other tag is ignored.  Each example's features are the
    distinct ids returned by the dictionary, all with value 1.  The data is
    divided by ``bifurcate(data, split, shuffle=True)``, a classification
    model is trained on the first part, saved to ``model_path`` and evaluated
    on the second part by comparing the sign of each prediction with the sign
    of the gold label.

    Returns a dict with the article/token/train/test counts, the kernel name,
    and the ``correct`` / ``wrong`` / ``total`` tallies.
    """
    # https://bitbucket.org/wcauchois/pysvmlight
    articles, total_token_count = preprocess_wsj(article_count,
                                                 feature_functions)

    dictionary = Dictionary()
    dictionary.add_one('ZZZZZ')  # so that no features are labeled 0

    data = []
    for article in articles:
        for sentence in article:
            for tag, token_features in zip(sentence.def_tags, sentence.data):
                # only use def / indef tokens
                if tag != 'DEF' and tag != 'INDEF':
                    continue
                unique_ids = sorted(set(dictionary.add(token_features)))
                feature_values = zip(unique_ids, [1] * len(unique_ids))
                if tag == 'DEF':
                    label = +1
                else:
                    label = -1
                data.append((label, feature_values))

    train, test = bifurcate(data, split, shuffle=True)

    #####################
    # do svm in Python...
    #
    # svmlight.learn options
    # type: select between 'classification', 'regression', 'ranking' (preference ranking), and 'optimization'.
    # kernel: select between 'linear', 'polynomial', 'rbf', and 'sigmoid'.
    # verbosity: set the verbosity level (default 0).
    # C: trade-off between training error and margin.
    # poly_degree: parameter d in polynomial kernel.
    # rbf_gamma: parameter gamma in rbf kernel.
    # coef_lin
    # coef_const
    # costratio (corresponds to -j option to svm_learn)
    model = svmlight.learn(train, type='classification', kernel=kernel)
    svmlight.write_model(model, model_path)

    gold_labels, test_feature_values = zip(*test)

    # The classifier is given a placeholder label of 0 for every test vector.
    test_pairs = []
    for feature_values in test_feature_values:
        test_pairs.append((0, feature_values))
    predictions = svmlight.classify(model, test_pairs)

    gold_is_positive = [(gold > 0) for gold in gold_labels]
    predicted_positive = [(prediction > 0) for prediction in predictions]
    correct, wrong = matches(gold_is_positive, predicted_positive)

    return {
        'total_articles_count': len(articles),  # int
        'total_token_count': total_token_count,  # int
        'train_count': len(train),  # int
        'test_count': len(test),  # int
        'kernel': kernel,
        'correct': correct,
        'wrong': wrong,
        'total': correct + wrong,
    }
Beispiel #13
0
def save_classifier(clf, clf_i):
    """Write an SVM-Light model into ``args.output_directory``.

    Args:
        clf: model object accepted by ``svmlight.write_model``.
        clf_i: identifier of the classifier; ``str(clf_i)`` is the file name.

    Raises:
        OSError: if the output directory cannot be created and does not
            already exist as a directory.
    """
    directory = args.output_directory
    # Create-then-handle instead of check-then-create: the directory may
    # appear between an existence check and makedirs (e.g. parallel runs).
    try:
        os.makedirs(directory)
    except OSError:
        if not os.path.isdir(directory):
            raise
    svmlight.write_model(clf, os.path.join(directory, str(clf_i)))
Beispiel #14
0
		if val in percentages and percentages[val]:
			print " Progress: %i %s" %(val, "%")
			percentages[val] = False

		try:
			source = open(directory + filename, 'r')
			train_type = int(source.readline())
			train_num_dimensions = int(source.readline())
			train_dimensions = source.readline().strip().split()
			source.close()

		   	num = 1
		   	vals=[]
			for val in train_dimensions:
				vals.append((num, float(val)))
				num += 1
			
			training_data.append((train_type, vals))

		except Exception as e:
			print "ERROR:", e
			break
		counter += 1

# Report how many training examples were collected (the trailing "\n"
# argument leaves a blank line after the count).
print "Imported:", len(training_data), "\n"
print "Building Model"
# Train a classification model on everything that was imported.
model = svmlight.learn(training_data, type='classification', verbosity=0)
print "Write Model"
# Persist the model in SVM-Light's own file format.
svmlight.write_model(model, 'svm-model.dat')

Beispiel #15
0
import svmlight

# Load the example data sets from the `data` module, imported by name at
# runtime; `train0` is used for training and `test0` for classification.
training_data = __import__('data').train0
test_data = __import__('data').test0

# train a model based on the data
model = svmlight.learn(training_data, type='classification', verbosity=0)

# model data can be stored in the same format SVM-Light uses, for
# interoperability with the binaries.
svmlight.write_model(model, 'my_model.dat')

# classify the test data. this function returns a list of numbers, which
# represent the classifications.
predictions = svmlight.classify(model, test_data)
# Print one prediction per line with eight decimal places.
for p in predictions:
    print('%.8f' % p)
Beispiel #16
0
import anglepy.ndict as ndict
from anglepy.misc import lazytheanofunc

import svmlight

'''
To install pysvmlight (on MAC):
1) cd to pysvmlight dir
> export CFLAGS=-Qunused-arguments
> export CPPFLAGS=-Qunused-arguments
> chmod +x setup.py
> ./setup.py build
> sudo ./setup.py install
'''

'''
===> Example from pysvmlight doc:
# train a model based on the data
model = svmlight.learn(training_data, type='classification', verbosity=0)

# model data can be stored in the same format SVM-Light uses, for interoperability
# with the binaries.
svmlight.write_model(model, 'my_model.dat')

# classify the test data. this function returns a list of numbers, which represent
# the classifications.
predictions = svmlight.classify(model, test_data)
for p in predictions:
    print '%.8f' % p
'''
Beispiel #17
0
# NOTE(review): `test_data` is loaded here but not referenced again in the
# visible script; the hand-written `test` list below is what gets classified.
test_data = __import__('data').test0

# Hand-written training set: (label, [(feature_id, value), ...]) pairs with
# labels 1 and 0 and two features per example.
train = [ 
(1,[(1,0.5),(2,0.125)]),
(1,[(1,0.25),(2,0.125)]),
(1,[(1,1.75),(2,0.0)]),
(0,[(1,0.125),(2,0.25)]),
(0,[(1,0.5),(2,1)]),
(0,[(1,0.3),(2,0.4)])]
#(3,[(1,0.125),(2,0.2)]),
#(3,[(1,0),(2,0)]),
#(3,[(1,1),(2,1.1)])]


# Two examples to score with the trained model.
test = [
(1,[(1,1.0),(2,0.1)]),
(-1,[(1,0.1),(2,2.1)])]

# train a model based on the data
model = svmlight.learn(train, type='ranking', verbosity=0)

# model data can be stored in the same format SVM-Light uses, for interoperability
# with the binaries.
svmlight.write_model(model, 'my_model.dat')

# classify the test data. this function returns a list of numbers, which represent
# the classifications.
predictions = svmlight.classify(model, test)
# Print one prediction per line with eight decimal places.
for p in predictions:
    print '%.8f' % p
Beispiel #18
0
	def train_binary(self, x, y):
		"""Fit an RBF-kernel SVM-Light classifier on (x, y) and save it.

		The inputs are converted with ``svmlfeaturisexy``; ``self.C`` and
		``self.gamma`` supply the ``C`` and ``rbf_gamma`` options.  The model
		is written to 'tsvm_mnist.dat' and nothing is returned.
		"""
		features = svmlfeaturisexy(x, y)
		learn_options = {
			'type': 'classification',
			'verbosity': 0,
			'kernel': 'rbf',
			'C': self.C,
			'rbf_gamma': self.gamma,
		}
		trained = svmlight.learn(features, **learn_options)
		svmlight.write_model(trained, 'tsvm_mnist.dat')
Beispiel #19
0
import collections as C
import anglepy as ap
import anglepy.ndict as ndict
from anglepy.misc import lazytheanofunc

import svmlight
'''
To install pysvmlight (on MAC):
1) cd to pysvmlight dir
> export CFLAGS=-Qunused-arguments
> export CPPFLAGS=-Qunused-arguments
> chmod +x setup.py
> ./setup.py build
> sudo ./setup.py install
'''
'''
===> Example from pysvmlight doc:
# train a model based on the data
model = svmlight.learn(training_data, type='classification', verbosity=0)

# model data can be stored in the same format SVM-Light uses, for interoperability
# with the binaries.
svmlight.write_model(model, 'my_model.dat')

# classify the test data. this function returns a list of numbers, which represent
# the classifications.
predictions = svmlight.classify(model, test_data)
for p in predictions:
    print '%.8f' % p
'''
				print '%s f-measure: %f' % (label, f_measure(ref, test) or 0)

# Print the classifier's most informative features when requested and when the
# classifier offers that method; skipped for multi+binary and cross-fold runs.
if args.show_most_informative and hasattr(classifier, 'show_most_informative_features') and not (args.multi and args.binary) and not args.cross_fold:
	print '%d most informative features' % args.show_most_informative
	classifier.show_most_informative_features(args.show_most_informative)

##############
## pickling ##
##############

# Persist the trained classifier unless pickling is disabled or this is a
# cross-fold run.
if not args.no_pickle and not args.cross_fold:
	# Use args.filename when given; otherwise derive a lower-cased name from
	# the corpus and classifier names under ~/nltk_data/classifiers.
	if args.filename:
		fname = os.path.expanduser(args.filename)
	else:
		name = '%s_%s.pickle' % (args.corpus, '_'.join(args.classifier))
		fname = os.path.join(os.path.expanduser('~/nltk_data/classifiers'), name.lower())
	
	# We can't persist the SVM classifier directly since it contains
	# C-objects. We need to save the model separately.
	if classifier.__class__.__name__ == "SvmClassifier":
		import svmlight
		
		# The model file always goes under ~/nltk_data/classifiers, regardless
		# of args.filename.
		model_name = '%s_%s_model.dat' % (args.corpus, '_'.join(args.classifier))
		model_fname = os.path.join(os.path.expanduser('~/nltk_data/classifiers'), model_name.lower())
		svmlight.write_model(classifier._model, model_fname)
		
		# Remove the model from the classifier so it can be saved.
		classifier._model = None
		
	dump_object(classifier, fname, trace=args.trace)
Beispiel #21
0
        train = svm_parse('aux/train_' + ts + '.txt')
        aux = svm_parse('aux/test_' + ts + '.txt')
        test, val = adapt_to_svmlight_format(aux)

        print("Training it=", it, "cost-factor=", cost_factor + 1)

        model = svmlight.learn(list(train),
                               type='classification',
                               verbosity=0,
                               costratio=cost_factor +
                               1)  ## costratio = cost-factor

        if dump == "yes":
            svmlight.write_model(
                model,
                "models/model_" + dataset + "_" + features + "_it" + str(it) +
                "_cost_fact" + str(cost_factor + 1) + "_" + ts + ".dat")

        predictions = svmlight.classify(model, test)
        print("Predicting it=", it, "cost-factor=", cost_factor + 1)

        tp, tn, fp, fn = evaluate(predictions)
        accuracies.append(
            weighted_accuracy(cost_factor + 1, tn, tp, fn, fp) * 100)
        predictions = np.array(predictions)
        predictions[predictions < 0] = -1
        predictions[predictions > 0] = 1
        f1_micro.append(
            f1_score(val, predictions, average='micro')
        )  # micro: calculates metrics totally by counting the total true positives, false negatives and false positives
        cl = f1_score(val, predictions,
			nskipped = 1
			if len(sentences) > 1:  # because there have to be transitions
				docModel = DummyDocModel(sentences)
				grid = TextrazorEntityGrid(docModel.cleanSentences(), 1, textrazorEntities, textrazorSentences)
				if grid.valid and len(grid.matrixIndices) > 0:
					grid.printMatrix()
					featureVector = FeatureVector(grid, clusterIndex)
					featureVector.printVector()
					featureVector.printVectorWithIndices()
					vector = featureVector.getVector(qualityScore)
					featureVectors.append(vector)
					docIndex += 1

			else:
				print "SKIPPING (not enough sentences) %s, nskipped=(%d)" % (fileName, nskipped)
				nskipped += 1
		else:
			print "SKIPPING (no pickle file)%s, nskipped=(%d)" % (fileName, nskipped)
			nskipped += 1

		# pickleFile = open("../cache/svmlightCache/featureVectors.pickle", 'wb')
		# pickle.dump(featureVectors, pickleFile, pickle.HIGHEST_PROTOCOL)
		# pickleFile.close()
		# if docIndex >= maxN:
		#	break
		numDocsTried += 1
	clusterIndex += 1
# Now train a ranking model on all feature vectors collected above.
model = svmlight.learn(featureVectors, type='ranking', verbosity=0)
# Save the model in SVM-Light's file format under the cache directory.
svmlight.write_model(model, '../cache/svmlightCache/svmlightModel.dat')
Beispiel #23
0
def svm():
	"""Train and run an SVM-Light classifier on per-review sentiment scores.

	Loads a ``(word, pos) -> (posScore, negScore)`` dictionary from
	'sentiment_score.pickle' and a list of ``(review, sentiment)`` pairs from
	'sent_400_wspos.pickle'.  Each review is reduced to one summed
	``(posScore, negScore)`` pair, with the two scores swapped for words that
	``negate.negating`` marks with 'NOT'.  The first three quarters of each
	sentiment class form the training set and the remainder the test set.
	The trained model is written to 'my_model1.dat'.  Nothing is returned.
	"""
	# load the sentiment score file with (word,pos) -> (posScore,negScore) dictionary
	# and the (review,sentiment) pair list
	# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
	with open('sentiment_score.pickle','rb') as handle:
		synDict = pickle.load(handle)
	with open('sent_400_wspos.pickle','rb') as handle:
		annot = pickle.load(handle)

	print(annot)

	# 0 -> pos, 1 -> neg, 2 -> both, 3 -> neut
	# NOTE(review): four labels (0-3) are passed to type='classification';
	# confirm the SVM-Light binding accepts labels other than +1/-1.
	data = {'pos':[],'neg':[],'both':[],'neut':[]}
	strToNum = {'pos':0,'neg':1,'both':2,'neut':3}

	for line,sent in annot:
		tokens = line.split()

		# Rebuild the plain text (the part of each token before the first '#').
		string = ' '.join(word.split('#')[0] for word in tokens)
		neg = negate.negating(string.strip(' '))

		# catch empty case, simpler than re-pickling
		if neg == []:
			continue

		# calculate (posScore, negScore) for each word in line
		posScore = negScore = 0
		for i,word in enumerate(tokens):
			tri = word.split('#')
			if len(tri) != 3:
				continue # malformed token contributes nothing
			tempscore = synDict.get((tri[0]+'#'+tri[2],tri[1]),(0,0))

			# assumes neg has one entry per token -- TODO confirm
			if 'NOT' in neg[i]:
				tempscore = (tempscore[1],tempscore[0]) # set to reverse value b/c inverted meaning

			posScore += tempscore[0]
			negScore += tempscore[1]

		data[sent.strip(' ')].append((posScore,negScore))

	# convert to feature lists, one list per sentiment class
	featList = []
	for key in data.keys():
		featList.append([(strToNum[key],[(1,a),(2,b)]) for a,b in data[key]])

	# construct train and test sets: first 3/4 of every class for training,
	# the rest for testing
	train = []
	test = []
	for feats in featList:
		cut = 3*len(feats)//4
		train += feats[:cut]
		test += feats[cut:]

	for element in train:
		print(element)

	# train and test model
	model = svmlight.learn(train, type='classification', verbosity=0)
	svmlight.write_model(model, 'my_model1.dat')
	# The predictions are computed but not reported.
	predictions = svmlight.classify(model, test)
Beispiel #24
0
        val = int(float(counter) / len(filenames) * 100)
        if val in percentages and percentages[val]:
            print " Progress: %i %s" % (val, "%")
            percentages[val] = False

        try:
            source = open(directory + filename, 'r')
            train_type = int(source.readline())
            train_num_dimensions = int(source.readline())
            train_dimensions = source.readline().strip().split()
            source.close()

            num = 1
            vals = []
            for val in train_dimensions:
                vals.append((num, float(val)))
                num += 1

            training_data.append((train_type, vals))

        except Exception as e:
            print "ERROR:", e
            break
        counter += 1

# Report how many training examples were collected (the trailing "\n"
# argument leaves a blank line after the count).
print "Imported:", len(training_data), "\n"
print "Building Model"
# Train a classification model on everything that was imported.
model = svmlight.learn(training_data, type='classification', verbosity=0)
print "Write Model"
# Persist the model in SVM-Light's own file format.
svmlight.write_model(model, 'svm-model.dat')