def test_svm():
    """Smoke-test LinearSVMClassifier on three tiny labelled samples.

    Feeds three ``ngrams(1, text)`` feature vectors with labels into a fresh
    classifier, then prints the classification of two probe texts.  Nothing
    is asserted; the printed output has to be inspected by hand.
    """
    # NOTE(review): samples 0 and 2 are built from the same text but carry
    # opposite labels (1 and -1) -- confirm this is intentional.
    trainingset = [
        ngrams(1, "foo foo bar baz"),
        ngrams(1, "foo foo bar bar baz baz"),
        ngrams(1, "foo foo bar baz"),
    ]
    labels = [1, -1, -1]

    lsc = LinearSVMClassifier()
    for features, label in zip(trainingset, labels):
        lsc.addFeatureVector(features, label)

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(lsc.classify(ngrams(1, "foo foo bar bar baz baz")))
    print(lsc.classify(ngrams(1, "foo foo foo bar baz")))
def test_svm():
    """Train a LinearSVMClassifier on three samples and print two predictions.

    Each sample is the result of ``ngrams(1, text)`` paired with a label of
    1 or -1.  The function only prints what ``classify`` returns for the two
    probe texts; it makes no assertions.
    """
    trainingset = [
        ngrams(1, "foo foo bar baz"),
        ngrams(1, "foo foo bar bar baz baz"),
        ngrams(1, "foo foo bar baz"),
    ]
    # NOTE(review): the first and third samples use identical text with
    # different labels -- verify that the conflicting label is deliberate.
    labels = [1, -1, -1]

    lsc = LinearSVMClassifier()
    for sample, label in zip(trainingset, labels):
        lsc.addFeatureVector(sample, label)

    # Function-call form of print: valid on Python 3 and, with exactly one
    # argument, prints the same text on Python 2.
    print(lsc.classify(ngrams(1, "foo foo bar bar baz baz")))
    print(lsc.classify(ngrams(1, "foo foo foo bar baz")))
from cuckoovec import CuckooVector
import numpy as np
import string
from ngrams import *

# Demo script: build two batches of random sparse vectors, pass each batch
# through ngrams(), wrap the results in CuckooVector and print their dot
# product.

# letters (uppercase first, then lowercase; order matters for seeded sampling)
vocab = list(string.ascii_uppercase + string.ascii_lowercase)
n = 5
t = 2


# form random sparse vectors from <= t random words and weights
def randsparse(t):
    """Return a dict pairing ``t`` sampled vocab entries with normal weights.

    Keys are drawn with ``np.random.choice`` (with replacement), so repeated
    draws collapse and the dict may hold fewer than ``t`` entries.
    """
    words = np.random.choice(vocab, t)
    weights = np.random.randn(t)
    return dict(zip(words, weights))


s1 = [randsparse(t) for _ in range(n)]
s2 = [randsparse(t) for _ in range(n)]

t1 = ngrams(s1)
print(t1)
t2 = ngrams(s2)

v1 = CuckooVector(t1)
v2 = CuckooVector(t2)

# these should be close to orthogonal.
print(v1.dot(v2))
def hashindex(elements: List, hashsize: int, gramsize: int):
    """Hash every n-gram of ``elements`` into a bucket index.

    Args:
        elements: Sequence handed to ``ngrams`` as its first argument.
        hashsize: Modulus applied to each hash; for a positive value every
            index lies in ``[0, hashsize)``.
        gramsize: Forwarded to ``ngrams`` as its second argument.

    Returns:
        A list with one ``hash(gram) % hashsize`` value per gram, in the
        order ``ngrams`` yields them.
    """
    # The third positional argument (True) is forwarded to ngrams unchanged;
    # presumably a padding flag -- confirm against the ngrams helper.
    grams = ngrams(elements, gramsize, True)
    # NOTE(review): built-in hash() is salted per process for str/bytes, so
    # if grams contain strings the indices may differ between runs -- confirm
    # that is acceptable.
    return [hash(gram) % hashsize for gram in grams]
def _wordchar_to_idx(self, words: List, hashsize: int, gramsize: int):
    """Hash the character n-grams of every word into bucket indices.

    Args:
        words: Iterable of words; each word is split into a list of its
            characters before being passed to ``ngrams``.
        hashsize: Modulus applied to each gram's hash.
        gramsize: Forwarded to ``ngrams`` as its second argument.

    Returns:
        One flat list of ``hash(gram) % hashsize`` values covering the grams
        of all words, in word order.
    """
    chars = []
    for word in words:
        # The original comprehension had no element expression
        # (``[for tp in ...]``), which is a SyntaxError; collecting every
        # gram is the evident intent.
        chars.extend(ngrams(list(word), gramsize, True))
    return [hash(gram) % hashsize for gram in chars]
# Print every 6-gram of a whitespace-tokenised example sentence.
#
# A plain ``import ngrams`` binds the module object, and calling a module
# raises TypeError, so the callable is imported by name instead.
# NOTE(review): assumes the ``ngrams`` module exports a callable named
# ``ngrams`` -- confirm against that module.
from ngrams import ngrams

sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 6

sixgrams = ngrams(sentence.split(), n)
for grams in sixgrams:
    print(grams)