Example No. 1
    def cost(self):
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(self.corpus.document(doc).vector, self.corpus.document(center).vector)

        return cost
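Example 1 sums, for every cluster, the distance between each member document's vector and the vector of the document chosen as that cluster's center. A minimal self-contained sketch of the same computation, with made-up centroids and clusters standing in for the attributes used above:

# Minimal sketch of the cost computation above; the centroids/clusters
# structures here are hypothetical stand-ins for self.centroids/self.clusters.
from pattern.vector import Vector, distance

centroids = [Vector({"cat": 1.0}), Vector({"dog": 1.0})]  # one center vector per cluster
clusters = [[Vector({"cat": 0.8, "pet": 0.2})],           # members of cluster 0
            [Vector({"dog": 0.7, "pet": 0.3})]]           # members of cluster 1

cost = 0
for center, members in zip(centroids, clusters):
    for v in members:
        cost += distance(v, center)  # pattern's default metric is COSINE
print(cost)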
Example No. 2
 def run(self,minePackage):
     q=Vector(minePackage['searchKey'])
     clouds=minePackage['clouds']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData=cloud.graph.node[n]['methodData']
             v=Vector(methodData.getData())
             cloud.graph.node[n]['weight']=1-distance(v,q)
Example No. 3
 def run(self, minePackage):
     q = Vector(minePackage['searchKey'])
     clouds = minePackage['clouds']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             v = Vector(methodData.getData())
             cloud.graph.node[n]['weight'] = 1 - distance(v, q)
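Examples 2 and 3 weight each graph node by 1 - distance(v, q). With pattern's default COSINE metric that is simply the cosine similarity between the node's vector and the query vector; a small hedged illustration with toy vectors:

# Toy illustration of the node-weighting step above: with the default
# COSINE metric, 1 - distance() is the cosine similarity to the query.
from pattern.vector import Vector, distance

q = Vector({"search": 1, "key": 1})     # stand-in for Vector(minePackage['searchKey'])
v = Vector({"search": 1, "engine": 1})  # stand-in for a node's methodData vector
weight = 1 - distance(v, q)             # higher weight = closer to the query
print(weight)                           # 0.5 for these two toy vectors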
Example No. 4
    def cost(self):
        cost = 0
        for i, center in enumerate(self.centroids):
            for doc in self.clusters[i]:
                cost += distance(
                    self.corpus.document(doc).vector,
                    self.corpus.document(center).vector)

        return cost
Example No. 5
    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by KMPP method.
        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS,
                                                k=k,
                                                seed=KMPP,
                                                p=p,
                                                iterations=10)
            doc_list = []
            # For each cluster, compute the centroid and find the document (vector) nearest to it.
            for cluster in self.clusters:
                c = centroid(cluster)
                d_min = distance(cluster[0].vector, c)
                doc_min = cluster[0]
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if d < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster]
                             for cluster in self.clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(
                k)
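The k-means branch of Example 5 relies on pattern's corpus.cluster() call with KMPP seeding. A hedged sketch of the same call on a tiny made-up model (using pattern's newer Model class; the older Corpus class used here exposes the same cluster() signature):

# Hedged sketch of the KMEANS/KMPP clustering call used above, on a tiny
# made-up model; document names and texts are illustrative only.
from pattern.vector import Document, Model, TFIDF, KMEANS, KMPP

docs = [Document("tigers are big striped cats", name="d1"),
        Document("lions are big maned cats", name="d2"),
        Document("dogs wag their tails", name="d3"),
        Document("puppies are young dogs", name="d4")]

m = Model(documents=docs, weight=TFIDF)
clusters = m.cluster(method=KMEANS, k=2, seed=KMPP, iterations=10)
for cluster in clusters:
    print([doc.name for doc in cluster])  # each cluster is a list of Documents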
Example No. 6
def similarity(a, b):

  docA = Document(a)
  docB = Document(b)

  vecA = normalize(docA.vector)
  vecB = normalize(docB.vector)

  #print docA.vector
  return 1 - distance(vecA, vecB)
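Example 6 normalizes both vectors before computing 1 - distance. Since pattern's default COSINE distance is insensitive to vector length, a version without the project-specific normalize() helper should give the same similarity; a hedged sketch:

# Hedged sketch: cosine distance ignores vector magnitude, so dropping the
# normalize() step from Example 6 does not change the resulting similarity.
from pattern.vector import Document, distance

docA = Document("the quick brown fox")
docB = Document("the quick brown dog")
print(1 - distance(docA.vector, docB.vector))  # cosine similarity in [0, 1]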
Example No. 7
 def test_distance(self):
     # Assert distance metrics.
     v1 = vector.Vector({"cat":1})
     v2 = vector.Vector({"cat":0.5, "dog":1})
     for d, method in (
       (0.55, vector.COSINE),    # 1 - ((1*0.5 + 0*1) / (sqrt(1**2 + 0**2) * sqrt(0.5**2 + 1**2)))
       (1.25, vector.EUCLIDEAN), # (1-0.5)**2 + (0-1)**2
       (1.50, vector.MANHATTAN), # abs(1-0.5) + abs(0-1)
       (1.00, vector.HAMMING),   # (True + True) / 2
       (1.11, lambda v1, v2: 1.11)):
         self.assertAlmostEqual(vector.distance(v1, v2, method), d, places=2)
     print("pattern.vector.distance()")
Example No. 8
 def test_distance(self):
     # Assert distance metrics.
     v1 = vector.Vector({"cat":1})
     v2 = vector.Vector({"cat":0.5, "dog":1})
     for d, method in (
       (0.55, vector.COSINE),    # 1 - ((1*0.5 + 0*1) / (sqrt(1**2 + 0**2) * sqrt(0.5**2 + 1**2)))
       (1.25, vector.EUCLIDEAN), # (1-0.5)**2 + (0-1)**2
       (1.50, vector.MANHATTAN), # abs(1-0.5) + abs(0-1)
       (1.00, vector.HAMMING),   # (True + True) / 2
       (1.11, lambda v1, v2: 1.11)):
         self.assertAlmostEqual(vector.distance(v1, v2, method), d, places=2)
     print("pattern.vector.distance()")
Example No. 9
    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Use KMEANS method by default, and choose the initial k values by KMPP method.
        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            self.clusters = self.corpus.cluster(method=KMEANS, k=k, seed=KMPP, p=p, iterations=10)
            doc_list = []
            # For each cluster, compute the centroid and find the document (vector) nearest to it.
            for cluster in self.clusters:
                c = centroid(cluster)
                d_min = distance(cluster[0].vector, c)
                doc_min = cluster[0]
                for doc in cluster:
                    d = distance(doc.vector, c)
                    if d < d_min:
                        d_min = d
                        doc_min = doc
                doc_list.append(doc_min)
            self.centroids = [i.name for i in doc_list]
            self.clusters = [[i.name for i in cluster] for cluster in self.clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for i, doc in enumerate(self.corpus):
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(k)
Example No. 10
 def mydistance(doc_name1, doc_name2):
     v1 = self.corpus.document(doc_name1).vector
     v2 = self.corpus.document(doc_name2).vector
     return distance(v1, v2)
Example No. 11
 def calcularVectorSpaceModel(self, string, unaQuery):
     return distance(string, unaQuery, method=COSINE)
Example No. 12
 def variance(cluster):
     return avg([distance(centroid(cluster), v) for v in cluster])
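Example 12 defines within-cluster variance as the average distance from each vector to the cluster centroid. A hedged standalone version on toy vectors, computing the mean inline instead of relying on the avg() helper from the original snippet:

# Hedged standalone version of the variance() helper above, on toy vectors.
from pattern.vector import Vector, centroid, distance

cluster = [Vector({"cat": 1.0}),
           Vector({"cat": 0.8, "dog": 0.2}),
           Vector({"cat": 0.6, "dog": 0.4})]

c = centroid(cluster)
variance = sum(distance(v, c) for v in cluster) / len(cluster)
print(variance)  # average distance to the centroid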
Example No. 13
def recommend_game(this_game):
    games = recommendable_games(this_game)

    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ': ' + str(total_recommendable)

    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary,
                                top=None,
                                threshold=0,
                                stemmer=None,
                                exclude=[],
                                stopwords=False,
                                language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))

    # format: {"id":id, socre:"SUM(dist*pond)"}
    game_similarities = []
    summary_documents = []
    for game in games:
        score = 0
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]

        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])
        ).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])
        ).vector)

        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + (0.2 * publisher_similarity) + (
            0.20 * keywords_similarity)

        summary_documents.append(Document(game.summary,
                                          top=None,
                                          threshold=0,
                                          stemmer=None,
                                          exclude=[],
                                          stopwords=False,
                                          language='en',
                                          name=game.id))

        game_similarities.append({"id": game.id, "score": score})

    to_compare = Document(document_summary)

    model = Model(documents=summary_documents, weight=TFIDF)

    neighbours = model.neighbors(to_compare, top=total_recommendable)

    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]

    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]

    # Guard against fewer than 25 candidates, which would make random.sample() raise a ValueError.
    sample_size = min(25, len(recommended))
    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], sample_size)
    else:
        random_selection = random.sample(recommended, sample_size)

    recommended_ids = [g['id'] for g in random_selection]

    return recommended_ids
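recommend_game() combines per-field cosine similarities into one weighted score (title 0.15, genres 0.20, publisher 0.20, keywords 0.20, plus 0.25 for the summary neighbours, summing to 1.0). A hedged sketch of just that scoring step, with made-up similarity values:

# Hedged sketch of the weighted-score combination used above; the
# similarity values are made-up placeholders.
FIELD_WEIGHTS = {"title": 0.15, "genres": 0.20, "publisher": 0.20,
                 "keywords": 0.20, "summary": 0.25}  # sums to 1.0

similarities = {"title": 0.4, "genres": 0.9, "publisher": 0.1,
                "keywords": 0.7, "summary": 0.6}

score = sum(FIELD_WEIGHTS[f] * similarities[f] for f in FIELD_WEIGHTS)
print(score)  # weighted score, stays in [0, 1]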
Example No. 14
 def mydistance(doc_name1, doc_name2):
     v1 = self.corpus.document(doc_name1).vector
     v2 = self.corpus.document(doc_name2).vector
     return distance(v1, v2)
Example No. 15
    recall

#test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy =', saccuracy


#classifier training example with test classification
nb2 = NB()
for review, rating in data:
    v = Document(review, type=int(rating))
    #print v.vector
    nb2.train(v)

print 'nb2 classes', nb2.classes
print 'test classification', nb2.classify(Document('A poor movie!'))

#cosine similarity example
from pattern.vector import Vector, distance
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 'cosine similarity between two vectors', 1 - distance(v1, v2)

# a model is a collection of Document objects

#todo now that we've built SVM, needs to create documents out of dummy tweets
#todo then add them to the SVM classifier as a training and test set
#todo build db schema to save tweets and relationships to db
#todo get some sample twitter data in db
Example No. 16
print document.wordcount  # Total word count.
print document.vector  # Cached Vector (read-only dict).
print document.tf('conclude')  # returns the frequency of a word as a number between 0.0-1.0.
print document.tfidf('conclude')  # returns the word's relevancy as tf-idf. Note: simply yields tf if model is None.
print document.keywords(top=10, normalized=True)  # returns a sorted list of (weight, word)-tuples. With normalized=True the weights will be between 0.0-1.0 (their sum is 1.0).
print document.copy()
# document vector
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 1 - distance(v1, v2)
# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document(
    'A lion is a big yellow cat with manes.',
    type='lion',
)
d3 = Document('An elephant is a big grey animal with a slurf.',
              type='elephant')
print d1.vector
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant
# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
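The snippet ends where the LSA concept-space example begins. A hedged continuation, following the LSA example from the pattern.vector documentation (Model.reduce() projects the document vectors onto a smaller number of latent concepts):

# Hedged sketch of the LSA concept-space step hinted at above, based on the
# pattern.vector documentation's example.
from pattern.vector import Document, Model

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model(documents=[d1, d2, d3, d4])
m.reduce(2)                  # LSA: keep 2 latent "concept" dimensions
print m.similarity(d1, d2)   # cat documents end up close together
print m.similarity(d1, d3)   # cat vs. dog documents stay further apart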
Example No. 17
 def calcularVectorSpaceModel(self, string, unaQuery):
     if string:
         return distance(string.vector, unaQuery.vector, method=COSINE)
     else:
         return 0
Example No. 18
 def calcularVectorSpaceModel(self, doc1, doc2):
     return distance(doc1.vector, doc2.vector, method=COSINE)
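Examples 17 and 18 both wrap distance(..., method=COSINE) over two Document vectors. A hedged usage sketch of the same call outside a class, with made-up texts:

# Hedged usage sketch of the cosine-distance call wrapped by Examples 17 and 18.
from pattern.vector import Document, distance, COSINE

doc1 = Document("information retrieval with the vector space model")
doc2 = Document("retrieval of information using vectors")
print(distance(doc1.vector, doc2.vector, method=COSINE))  # 0.0 = identical, 1.0 = orthogonal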