def cost(self):
    # Sum of distances from each document to its cluster's center document.
    cost = 0
    for i, center in enumerate(self.centroids):
        for doc in self.clusters[i]:
            cost += distance(self.corpus.document(doc).vector,
                             self.corpus.document(center).vector)
    return cost
def run(self, minePackage):
    q = Vector(minePackage['searchKey'])
    clouds = minePackage['clouds']
    for cloud in clouds:
        for n in cloud.graph.nodes():
            methodData = cloud.graph.node[n]['methodData']
            v = Vector(methodData.getData())
            # Weight each node by its cosine similarity to the query vector.
            cloud.graph.node[n]['weight'] = 1 - distance(v, q)
def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
    """Cluster the corpus with KMEANS by default, choosing the initial k
    centers with KMPP. k is the number of clusters. p controls the error
    of KMEANS; p=1.0 is faster, with a small error.
    """
    if self.method == "kmeans":
        from pattern.vector import KMEANS, KMPP
        self.clusters = self.corpus.cluster(method=KMEANS, k=k, seed=KMPP,
                                            p=p, iterations=10)
        doc_list = []
        # For each cluster, compute the centroid and pick the document
        # whose vector lies nearest to it.
        for cluster in self.clusters:
            c = centroid(cluster)
            d_min = distance(cluster[0].vector, c)
            doc_min = cluster[0]
            for doc in cluster:
                d = distance(doc.vector, c)
                if d < d_min:
                    d_min = d
                    doc_min = doc
            doc_list.append(doc_min)
        self.centroids = [i.name for i in doc_list]
        self.clusters = [[i.name for i in cluster] for cluster in self.clusters]
    elif self.method == 'covertree':
        def mydistance(doc_name1, doc_name2):
            v1 = self.corpus.document(doc_name1).vector
            v2 = self.corpus.document(doc_name2).vector
            return distance(v1, v2)
        self.covertree = Covertree(mydistance, maxlevel)
        for i, doc in enumerate(self.corpus):
            tree_node = myTree(doc.name)
            self.covertree.insert(tree_node, self.covertree.ct, 0)
        self.covertree.merge_levels()
        self.centroids, self.clusters = self.covertree.clustering_from_ct(k)
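The nearest-to-centroid pick in the kmeans branch above can be isolated into a small helper. A minimal sketch using pattern.vector (the corpus/Covertree plumbing is project-specific and omitted here):

from pattern.vector import centroid, distance

def medoid(cluster):
    # Return the document whose vector lies closest to the cluster's centroid.
    c = centroid(cluster)
    return min(cluster, key=lambda doc: distance(doc.vector, c))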
def similarity(a, b):
    # Cosine similarity between two strings via their document vectors.
    # (normalize() is assumed to be defined elsewhere in the project.)
    docA = Document(a)
    docB = Document(b)
    vecA = normalize(docA.vector)
    vecB = normalize(docB.vector)
    return 1 - distance(vecA, vecB)
def test_distance(self):
    # Assert distance metrics.
    v1 = vector.Vector({"cat": 1})
    v2 = vector.Vector({"cat": 0.5, "dog": 1})
    for d, method in (
            (0.55, vector.COSINE),     # 1 - ((1*0.5 + 0*1) / (sqrt(1**2 + 0**2) * sqrt(0.5**2 + 1**2)))
            (1.25, vector.EUCLIDEAN),  # (1-0.5)**2 + (0-1)**2
            (1.50, vector.MANHATTAN),  # abs(1-0.5) + abs(0-1)
            (1.00, vector.HAMMING),    # (True + True) / 2
            (1.11, lambda v1, v2: 1.11)):
        self.assertAlmostEqual(vector.distance(v1, v2, method), d, places=2)
    print("pattern.vector.distance()")
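The 0.55 COSINE expectation above is easy to verify by hand; a standalone check in plain Python:

from math import sqrt

# cosine distance = 1 - dot(v1, v2) / (|v1| * |v2|)
dot = 1 * 0.5 + 0 * 1                           # "cat" is the only shared feature
norms = sqrt(1 ** 2) * sqrt(0.5 ** 2 + 1 ** 2)  # |v1| * |v2|
print(round(1 - dot / norms, 2))                # 0.55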
def mydistance(doc_name1, doc_name2):
    v1 = self.corpus.document(doc_name1).vector
    v2 = self.corpus.document(doc_name2).vector
    return distance(v1, v2)
def calcularVectorSpaceModel(self, string, unaQuery):
    # distance() expects vectors, not raw strings: vectorize first.
    return distance(Document(string).vector, Document(unaQuery).vector, method=COSINE)
def variance(cluster):
    # Average distance of the cluster's vectors to their centroid.
    return avg([distance(centroid(cluster), v) for v in cluster])
def recommend_game(this_game):
    games = recommendable_games(this_game)
    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ": " + str(total_recommendable)
    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary, top=None, threshold=0, stemmer=None,
                                exclude=[], stopwords=False, language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))
    # format: {"id": id, "score": SUM(similarity * weight)}
    game_similarities = []
    summary_documents = []
    for game in games:
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]
        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])).vector)
        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + \
                (0.2 * publisher_similarity) + (0.2 * keywords_similarity)
        summary_documents.append(Document(game.summary, top=None, threshold=0, stemmer=None,
                                          exclude=[], stopwords=False, language='en', name=game.id))
        game_similarities.append({"id": game.id, "score": score})
    to_compare = Document(document_summary)
    model = Model(documents=summary_documents, weight=TFIDF)
    neighbours = model.neighbors(to_compare, top=total_recommendable)
    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]
    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]
    # Guard against fewer than 25 candidates before sampling.
    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], 25)
    else:
        random_selection = random.sample(recommended, min(25, len(recommended)))
    recommended_ids = [g['id'] for g in random_selection]
    return recommended_ids
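The hard-coded field weights above (0.15 title, 0.20 genre, 0.20 publisher, 0.20 keywords, plus 0.25 for the summary neighbours) sum to 1.0. A minimal sketch of the same weighted blend, with hypothetical similarity values:

WEIGHTS = {"title": 0.15, "genre": 0.20, "publisher": 0.20,
           "keywords": 0.20, "summary": 0.25}

def blended_score(similarities):
    # similarities: field name -> cosine similarity in [0.0, 1.0]
    return sum(WEIGHTS[field] * sim for field, sim in similarities.items())

print(blended_score({"title": 0.4, "genre": 0.9, "publisher": 0.1,
                     "keywords": 0.7, "summary": 0.6}))  # 0.55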
# test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy =', saccuracy

# classifier training example with test classification
nb2 = NB()
for review, rating in data:
    v = Document(review, type=int(rating))
    nb2.train(v)
print 'nb2 classes', nb2.classes
print 'test classification', nb2.classify(Document('A poor movie!'))

# cosine similarity example
from pattern.vector import Vector, distance
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 'cosine similarity between two vectors', 1 - distance(v1, v2)

# a model is a collection of Document objects
# todo: now that we've built SVM, create documents out of dummy tweets
# todo: then add them to the SVM classifier as training and test sets
# todo: build db schema to save tweets and relationships to db
# todo: get some sample twitter data in db
print document.wordcount          # Total word count.
print document.vector             # Cached Vector (read-only dict).
print document.tf('conclude')     # Frequency of a word, a number between 0.0-1.0.
print document.tfidf('conclude')  # The word's relevancy as tf-idf.
                                  # Note: simply yields tf if model is None.
print document.keywords(top=10, normalized=True)  # Sorted list of (weight, word)-tuples.
                                                  # With normalized=True the weights are
                                                  # between 0.0-1.0 (their sum is 1.0).
print document.copy()

# document vector
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 1 - distance(v1, v2)

# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document('A lion is a big yellow cat with manes.', type='lion')
d3 = Document('An elephant is a big grey animal with a slurf.', type='elephant')
print d1.vector
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant

# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
def calcularVectorSpaceModel(self, string, unaQuery):
    # Guard against empty input before computing cosine distance.
    if string:
        return distance(string.vector, unaQuery.vector, method=COSINE)
    else:
        return 0
def calcularVectorSpaceModel(self, doc1, doc2):
    # Cosine distance between the vectors of two pattern.vector Documents.
    return distance(doc1.vector, doc2.vector, method=COSINE)
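A quick usage sketch for the method above, assuming two plain pattern.vector Documents:

from pattern.vector import Document, distance, COSINE

d1 = Document('The quick brown fox jumps.')
d2 = Document('The quick brown dog sleeps.')
print(1 - distance(d1.vector, d2.vector, method=COSINE))  # cosine similarity, 0.0-1.0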