Esempio n. 1
0
 def test_cosine_similarity(self):
     """Assert document cosine similarity, before and after LSA reduction."""
     # Similarity between the model's own documents and an ad-hoc document.
     sim_01 = self.model.similarity(self.model[0], self.model[1])
     sim_02 = self.model.similarity(self.model[0], self.model[2])
     sim_ad_hoc = self.model.similarity(self.model[0], vector.Document("cats cats"))
     for observed, expected in ((sim_01, 0.20), (sim_02, 0.00), (sim_ad_hoc, 0.45)):
         self.assertAlmostEqual(observed, expected, places=2)
     # Model.similarity() must be aware of the LSA-reduced concept space.
     self.model.reduce(2)
     sim_01 = self.model.similarity(self.model[0], self.model[1])
     sim_02 = self.model.similarity(self.model[0], self.model[2])
     self.assertAlmostEqual(sim_01, 1.00, places=2)
     self.assertAlmostEqual(sim_02, 0.00, places=2)
     self.model.lsa = None
     print("pattern.vector.Model.similarity()")
Esempio n. 2
0
 def test_information_gain(self):
     """Assert information-gain weights on a small wind/play corpus.

     Example from
     http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
     """
     v = vector.Corpus([
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 1}, type=True),
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 1}, type=False)
     ])
     self.assertAlmostEqual(v.information_gain("wind"), 0.52, places=2)
     # Fixed: message said "patten" instead of "pattern"; Python-2-only
     # print statement converted to call form (valid on both 2 and 3).
     print("pattern.vector.Corpus.information_gain()")
def get_all_artists_all_words_to_file():
    """Aggregate every artist's lyrics under *basedir* and write the top-20
    corpus keywords (weight, word) to *common_words_file*.

    Side effects: extends the module-level ``rap_exclude_words`` list with
    each artist's name tokens and overwrites ``common_words_file``.
    """
    docs = ""
    for artist_dir in os.listdir(basedir):  # renamed: `dir` shadowed a builtin
        print("artist" + artist_dir)
        if artist_dir != '.git':
            # Exclude the artist's own name tokens from the keyword analysis.
            for name_part in re.split('-', artist_dir):
                rap_exclude_words.append(name_part)
            docs += get_artist_docs(artist_dir)

    corpus = vec.Document(docs, exclude=rap_exclude_words, stop_words=False)
    # Context manager guarantees the file is closed even if keywords() raises
    # (the original left the handle open on any exception).
    with open(common_words_file, "w") as f:
        for ln in corpus.keywords(top=20):
            # NOTE(review): no trailing "\n" — all keywords land on one line
            # in the output file; confirm intent before adding a newline.
            f.write("%0.08f\t%s" % ln)
            print("%0.08f\t%s" % ln)
Esempio n. 4
0
 def test_cosine_similarity(self):
     """Assert document cosine similarity, before and after LSA reduction."""
     v1 = self.model.similarity(self.model[0], self.model[1])
     v2 = self.model.similarity(self.model[0], self.model[2])
     v3 = self.model.similarity(self.model[0], vector.Document("cats cats"))
     self.assertAlmostEqual(v1, 0.20, places=2)
     self.assertAlmostEqual(v2, 0.00, places=2)
     self.assertAlmostEqual(v3, 0.45, places=2)
     # Assert that Model.similarity() is aware of LSA reduction.
     # LSA needs numpy; skip silently when it is not installed.
     try:
         import numpy
         self.model.reduce(2)
         v1 = self.model.similarity(self.model[0], self.model[1])
         v2 = self.model.similarity(self.model[0], self.model[2])
         self.assertAlmostEqual(v1, 1.00, places=2)
         self.assertAlmostEqual(v2, 0.00, places=2)
         self.model.lsa = None
     except ImportError:
         # Fixed: `except ImportError, e:` is Python-2-only syntax and the
         # bound `e` was never used.
         pass
Esempio n. 5
0
 def test_document_vector(self):
     """Assert Vector properties: copy, type, features, weights, update."""
     # Test copy.
     v = vector.Document("the cat sat on the mat").vector
     v = v.copy()
     # Test properties.
     self.assertTrue(isinstance(v, dict))
     self.assertTrue(isinstance(v, vector.Vector))
     self.assertTrue(isinstance(v.id, int))
     self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
     self.assertEqual(v.weight, vector.TF)
     self.assertAlmostEqual(v.norm, 0.58, places=2)
     self.assertAlmostEqual(v["cat"], 0.33, places=2)
     self.assertAlmostEqual(v["sat"], 0.33, places=2)
     self.assertAlmostEqual(v["mat"], 0.33, places=2)
     # Test copy + update.
     v = v({"cat": 1, "sat": 1, "mat": 1})
     self.assertEqual(sorted(v.features), ["cat", "mat", "sat"])
     self.assertAlmostEqual(v["cat"], 1.00, places=2)
     self.assertAlmostEqual(v["sat"], 1.00, places=2)
     self.assertAlmostEqual(v["mat"], 1.00, places=2)
     # Fixed: Python-2-only print statement -> call form (valid on 2 and 3).
     print("pattern.vector.Document.vector")
Esempio n. 6
0
 def test_classifier_vector(self):
     """Assert Classifier._vector() (translates input from train() and
     classify() to a (type, Vector) pair) for Document, dict, list and
     string inputs."""
     v = vector.Classifier()._vector
     self.assertEqual(("cat", {
         "cat": 0.5,
         "purs": 0.5
     }), v(vector.Document("the cat purs", type="cat")))
     self.assertEqual(("cat", {
         "cat": 0.5,
         "purs": 0.5
     }), v({
         "cat": 0.5,
         "purs": 0.5
     }, type="cat"))
     self.assertEqual(("cat", {
         "cat": 0.5,
         "purs": 0.5
     }), v(["cat", "purs"], type="cat"))
     self.assertEqual(("cat", {
         "cat": 0.5,
         "purs": 0.5
     }), v("cat purs", type="cat"))
     # Fixed: Python-2-only print statement -> call form (valid on 2 and 3).
     print("pattern.Classifier._vector()")
Esempio n. 7
0
 def test_information_gain(self):
     """Assert information-gain and gain-ratio weights.

     Boolean example from
     http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
     and a continuous example from
     http://rutcor.rutgers.edu/~amai/aimath02/PAPERS/14.pdf
     """
     m = vector.Model([
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 0}, type=True),
         vector.Document({"wind": 1}, type=True),
         vector.Document({"wind": 1}, type=False),
         vector.Document({"wind": 1}, type=False)], weight=None
     )
     self.assertAlmostEqual(m.information_gain("wind"), 0.52, places=2)
     m = vector.Model([
         vector.Document({"3": 1}, type=True),
         vector.Document({"3": 5}, type=True),
         vector.Document({"3": 1}, type=False),
         vector.Document({"3": 7}, type=True),
         vector.Document({"3": 2}, type=False),
         vector.Document({"3": 2}, type=True),
         vector.Document({"3": 6}, type=False),
         vector.Document({"3": 4}, type=True),
         vector.Document({"3": 0}, type=False),
         vector.Document({"3": 9}, type=True)], weight=None
     )
     self.assertAlmostEqual(m.ig("3"), 0.571, places=3)
     self.assertAlmostEqual(m.gr("3"), 0.195, places=3)
     # Fixed: messages said "patten" instead of "pattern".
     print("pattern.vector.Model.information_gain()")
     print("pattern.vector.Model.gain_ratio()")
def lsa_apply(df):
    """Build a TF-IDF model from df['abstract'] and return its 2-concept LSA reduction."""
    print("Building model")
    documents = [pv.Document(abstract) for abstract in df['abstract']]
    model = pv.Model(documents, weight=pv.TFIDF)
    print("Returning reduction")
    return model.reduce(2)
def get_lsa(texts):
    """Return a 2-concept LSA reduction of a TF-IDF model built from *texts*."""
    model = pv.Model([pv.Document(text) for text in texts], weight=pv.TFIDF)
    return model.reduce(2)
Esempio n. 10
0
def lsa_apply(df):
    """Reduce a TF-IDF model of df['abstract'] to two latent concepts."""
    documents = list(pv.Document(abstract) for abstract in df['abstract'])
    return pv.Model(documents, weight=pv.TFIDF).reduce(2)
Esempio n. 11
0
def create_models(group):
    """Return a TF-IDF Model over *group*, one Document per item (threshold=1)."""
    documents = [pv.Document(item, threshold=1) for item in group]
    return pv.Model(documents, weight=pv.TFIDF)
Esempio n. 12
0
def get():
    """Aggregate per-boat keywords from the "posts" index and write them back.

    Reads up to 10000 posts from Elasticsearch, extracts the top-10 keywords
    of each post, counts them per boat, filters generic/numeric/short terms,
    and indexes one {'keywords', 'boat'} document per boat into "boats".

    Side effects: network round-trips to Elasticsearch (search + index).
    """
    res = es.search(index="posts", body={
        "query": {
            "match_all": {}
        },
        # NOTE(review): hard cap of 10000 hits; switch to the scroll API if
        # the "posts" index can grow beyond that (the commented scroll code
        # that hinted at this has been removed).
        'size': 10000
    })

    # boat name -> Counter of keyword frequencies across that boat's posts.
    # (Removed dead code: unused TextBlob instance, two unused Counters —
    # one of which was immediately shadowed — an unused boat_docs dict, and
    # large blocks of commented-out experiments.)
    things = {}
    for item in res['hits']['hits']:
        raw_doc = item['_source']['source']
        original_boat_name = item['_source']['boat']
        doc = vector.Document(raw_doc, threshold=1, stopwords=False)
        things.setdefault(original_boat_name, collections.Counter())
        # keywords(top=10) yields (weight, word) pairs; count the words.
        things[original_boat_name].update([i[1] for i in doc.keywords(top=10)])

    final = {}
    filterwords = ['boat', 'boats', 'sail', 'sailing', 'template']
    for boat_name, counts in things.items():
        print(boat_name)
        final.setdefault(boat_name, [])
        for word, count in counts.most_common():
            # Keep words that are: not generic boating terms, longer than one
            # character, not part of the boat's own name, seen more than once,
            # and not numeric (parse_int).
            if (word not in filterwords
                    and len(word) > 1
                    and boat_name.lower().find(word) == -1
                    and count > 1
                    and not parse_int(word)):
                final[boat_name].append({
                    'word': word,
                    'count': count,
                })

    for name, common in final.items():
        post_data = {
            'body': {
                'keywords': common,
                'boat': name
            },
            "index": "boats",
            'doc_type': 'boat'
        }
        res = es.index(**post_data)