Esempio n. 1
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Each row is (score, review); a review counts as positive when score > 0.
     path = os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")
     reviews = [(review, int(score) > 0) for score, review in Datasheet.load(path)]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.755)
     self.assertTrue(P > 0.760)
     self.assertTrue(R > 0.747)
     self.assertTrue(F > 0.754)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     path = os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")
     reviews = [(review, int(score) > 0) for score, review in Datasheet.load(path)]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.642)
     self.assertTrue(P > 0.653)
     self.assertTrue(R > 0.607)
     self.assertTrue(F > 0.629)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.en.sentiment()")
Esempio n. 2
0
    def testModel(self, *args):
        ''' Report on the stored Naive Bayes model using the labelled tweet corpora.

            With a positional argument, args[0] is classified with the stored
            model and its class and class probabilities are printed.
            Without arguments, every corpus row is turned into a Document,
            the positive/negative rows are counted and a 10-fold
            cross-validation of NB over those documents is printed.
            On both paths the stored classifier's distribution, classes, AUC,
            test scores and confusion matrix are printed afterwards.
        '''
        documents = []
        # Initialised before branching: the summary below prints both counters
        # on either path (they used to be unbound when args were given).
        pos = neg = 0

        data = Datasheet.load(os.path.join("corpora","twitter","trainer", "tweets_stream_data_nb.csv"))
        data2 = Datasheet.load(os.path.join("corpora","twitter","trainer", "tweets_stream_data_svm.csv"))
        data3 = Datasheet.load(os.path.join("corpora","twitter","trainer", "ensenmble.csv"))

        classifier = Classifier.load('models/nb_model.ept')

        if args:
            document = Document(args[0])
            print("Document class is %s" % classifier.classify(document))
            label = classifier.classify(document, discrete=False)
            # Formatted into one string (two spaces after the colon, as the
            # old two-argument print statement produced) so the output is the
            # same on Python 2 and 3.
            print("Document probability is :  %s" % (label,))

            print(label["positive"])

        else:
            data = shuffled(data)

            for document, label in data[:] + data2[:] + data3[:]:
                documents.append(Document(document, type=str(label), stopwords=True))
                if 'positive' in label:
                    pos += 1
                else:
                    neg += 1

            print("10-fold CV")
            print(k_fold_cv(NB, documents=documents, folds=10))

        # Built once, after any shuffling, and reused by every evaluation below.
        corpus = data[:] + data2[:] + data3[:]

        print("Neg: %s, Pos: %s" % (neg, pos))
        print(classifier.distribution)

        print("Classes in Naive Bayes Classifier")
        print(classifier.classes)

        print("Area Under the Curve: %0.6f" % classifier.auc(documents, k=10))

        # Same report three times: positive target, negative target, overall.
        for heading, options in (
                ("Model Performance (Positive Classifications)", {'target': 'positive'}),
                ("Model Performance(Negative Classifications)", {'target': 'negative'}),
                ("Model Performance", {})):
            print(heading)
            accuracy, precision, recall, f1 = classifier.test(corpus, **options)
            print("Accuracy = %.6f; F-Score = %.6f; Precision = %.6f; Recall = %.6f" % (accuracy, f1, precision, recall))

        print("Confusion Matrix")
        matrix = classifier.confusion_matrix(corpus)
        print(matrix)
        print(matrix('positive'))
        print(matrix('negative'))
Esempio n. 3
0
 def test_sentiment_twitter(self):
     corpus = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
     # Nothing is checked when the corpus file is not present.
     if not os.path.exists(corpus):
         return
     # Accuracy of the sentiment analysis on tweets, scored against the
     # Sanders Twitter Sentiment Corpus:
     # http://www.sananalytics.com/lab/twitter-sentiment/
     # Positive + neutral is taken as polarity >= 0.0,
     # negative is taken as polarity < 0.0.
     # There are a lot of neutral cases and the algorithm predicts 0.0 by
     # default (i.e., the majority class), so the results are good.
     # Telling negative from neutral from positive is a much harder task.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = [
         (tweet, polarity in ("positive", "neutral"))
         for _i, _tweet_id, _date, tweet, polarity, _topic in Datasheet.load(corpus)
         if polarity != "irrelevant"
     ]
     A, P, R, F = test(lambda text: en.positive(text, threshold=0.0), reviews)
     #print(A, P, R, F)
     for measured, floor in zip((A, P, R, F), (0.824, 0.879, 0.911, 0.895)):
         self.assertTrue(measured > floor)
Esempio n. 4
0
 def test_spelling(self):
     # Punctuation, single letters, numbers and both casings of "the"
     # must come back unchanged as the top suggestion.
     for token in (".", "?", "!", "I", "a", "42", "3.14", "The", "the"):
         self.assertEqual(en.suggest(token)[0][0], token)
     # Spelling suggestion accuracy over the corpus: every
     # space-separated misspelling of a row is checked against the row's
     # correct word.
     # Note: simply training on more text will not improve accuracy.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "spelling-birkbeck.csv")
     outcomes = [
         en.suggest(misspelled)[0][0] == correct
         for correct, wrong in Datasheet.load(corpus)
         for misspelled in wrong.split(" ")
     ]
     self.assertTrue(float(sum(outcomes)) / len(outcomes) > 0.70)
     print("pattern.en.suggest()")
Esempio n. 5
0
 def test_modality(self):
     # Modality runs from -1.0 to +1.0 and represents the degree of certainty.
     wish = en.Sentence(en.parse("I wish it would stop raining."))
     self.assertTrue(en.modality(wish) < 0)
     prediction = en.Sentence(en.parse("It will surely stop raining soon."))
     self.assertTrue(en.modality(prediction) > 0)
     # Accuracy of the modality algorithm, scored against the
     # CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is
     # modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     corpus = os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")
     # Each row is (certain, sentence); a sentence is labelled certain when
     # int(certain) > 0.
     sentences = [
         (en.Sentence(en.parse(text, chunks=False, light=True)), int(certain) > 0)
         for certain, text in Datasheet.load(corpus)
     ]
     A, P, R, F = test(lambda s: en.modality(s) > 0.5, sentences)
     #print(A, P, R, F)
     for measured, floor in zip((A, P, R, F), (0.69, 0.72, 0.64, 0.68)):
         self.assertTrue(measured > floor)
     print("pattern.en.modality()")
Esempio n. 6
0
def main():
    """Scrape the pharmacy page of every postal code in CAPS into a text file.

    For each code the page is downloaded, the cells of its sixth <table>
    are converted to plain text and lightly reformatted, and the resulting
    strings are appended to the datasheet as one row. The datasheet is
    saved to files/farmacie_torino.txt once all codes are processed.
    """
    table = Datasheet()

    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        try:
            doc = Document(connection.read())
        finally:
            # Release the HTTP connection even if reading or parsing fails.
            connection.close()
        items = doc.by_tag("table")
        # The replacement text depends only on the postal code, not the cell.
        strcap = "%s, Telefono:" % cap
        row = []
        for td in items[5].by_tag('td'):
            text = plaintext(td.content)
            text = text.replace('\n', ',', 3)
            text = text.replace("Telefono:", strcap)
            text = text.replace(";", "")
            text = text.replace("Partita Iva", ",Partita Iva")
            row.append(text + "\n")
        table.append(row)
        # Index of the last cell; -1 (instead of a NameError) when the table
        # had no cells at all.
        print("%s ----------------------------------------------------------------------------" % (len(row) - 1))

    table.save("files/farmacie_torino.txt")
Esempio n. 7
0
def main():
    """Scrape the Turin hospital listing into files/h_torino.txt.

    Every element with class 'ulamm' except the first is treated as a list
    of entries; the plain text of the element preceding the list is used
    as the kind of its entries. All entries end up as "<entry>, <kind> \\n"
    strings in a single datasheet row.
    """
    table = Datasheet()

    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    try:
        doc = Document(connection.read())
    finally:
        # Release the HTTP connection even if reading or parsing fails.
        connection.close()

    row = []
    for ul in doc.by_class('ulamm')[1:]:
        kind = plaintext(ul.previous.content)
        for el in ul.by_tag('li'):
            # Identity test: does not go through the element's own __ne__.
            if el is not None:
                entry = plaintext(el.content).replace('\n', ',')
                row.append("%s, %s \n" % (entry, kind))
    table.append(row)

    table.save("files/h_torino.txt")
Esempio n. 8
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm:
     # the share of (singular, plural) corpus rows whose plural is
     # singularized back to the listed singular must exceed 95%.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
         if en.inflect.singularize(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.95)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.en.inflect.singularize()")
Esempio n. 9
0
 def test_attributive(self):
     # Assert the accuracy of the attributive algorithm ("fel" => "felle").
     from pattern.db import Datasheet
     # Resolve the corpus against PATH (the module-level test directory the
     # other corpus-based tests use) rather than the current working
     # directory, so the test does not depend on where it is launched from.
     corpus = os.path.join(PATH, "corpora", "celex-wordforms-nl.csv")
     hits, total = 0, 0
     for pred, attr, sg, pl in Datasheet.load(corpus):
         if nl.attributive(pred) == attr:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.96)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.nl.attributive()")
Esempio n. 10
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm:
     # the plural of each corpus row must singularize to the row's singular
     # in more than 84% of the rows.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
         if it.singularize(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.84)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.it.singularize()")
Esempio n. 11
0
 def test_predicative(self):
     # Accuracy of the predicative algorithm ("felle" => "fel"): the
     # attributive form of each corpus row must map back to the row's
     # predicative form in more than 96% of the rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")
     outcomes = [
         nl.predicative(attributive) == predicative
         for predicative, attributive, _sg, _pl in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.96)
     print("pattern.nl.predicative()")
Esempio n. 12
0
def model(top=None):
    """ Builds a vector.Model from the messages in corpora/spam-apache.csv.
        Each row is (score, message); the message becomes a Porter-stemmed
        Document whose type is True when int(score) > 0 (HAM) and False
        otherwise (SPAM). The top argument is passed on to every Document.
        Documents are mostly of a technical nature (developer forum posts).
    """
    corpus = os.path.join(PATH, "corpora", "spam-apache.csv")
    return vector.Model([
        vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        for score, message in Datasheet.load(corpus)
    ])
Esempio n. 13
0
 def test_pluralize(self):
     # Accuracy of the pluralization algorithm: the singular of each corpus
     # row must pluralize to the row's plural in more than 93% of the rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")
     outcomes = [
         it.pluralize(singular) == plural
         for _pos, singular, plural, _mf in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.93)
     print("pattern.it.pluralize()")
Esempio n. 14
0
 def test_singularize(self):
     # Accuracy of the singularization algorithm: the plural of each corpus
     # row must singularize to the row's singular in more than 88% of rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")
     outcomes = [
         nl.singularize(plural) == singular
         for _pred, _attr, singular, plural in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.88)
     print("pattern.nl.singularize()")
Esempio n. 15
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(
         en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(
         en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is
     # modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Each row is (score, review); a review counts as positive when score > 0.
     path = os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")
     reviews = [(review, int(score) > 0) for score, review in Datasheet.load(path)]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print(A, P, R, F)
     self.assertTrue(A > 0.751)
     self.assertTrue(P > 0.770)
     self.assertTrue(R > 0.710)
     self.assertTrue(F > 0.710)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     path = os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")
     reviews = [(review, int(score) > 0) for score, review in Datasheet.load(path)]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print(A, P, R, F)
     self.assertTrue(A > 0.654)
     self.assertTrue(P > 0.660)
     self.assertTrue(R > 0.636)
     self.assertTrue(F > 0.648)
     print("pattern.en.sentiment()")
Esempio n. 16
0
 def test_singularize(self):
     # Accuracy of the singularization algorithm, measured only over the
     # corpus rows tagged "n": the plural must singularize to the row's
     # singular in more than 82% of those rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     outcomes = [
         de.singularize(plural) == singular
         for tag, singular, plural in Datasheet.load(corpus)
         if tag == "n"
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.82)
     print("pattern.de.singularize()")
Esempio n. 17
0
 def test_pluralize(self):
     # Assert the accuracy of the pluralization algorithm:
     # the singular of each corpus row must pluralize to the row's plural
     # in more than 93% of the rows.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pos, sg, pl, mf in Datasheet.load(
             os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
         if it.pluralize(sg) == pl:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.93)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.it.pluralize()")
Esempio n. 18
0
def model(top=None):
    """ Builds a vector.Model from the messages in corpora/spam-apache.csv.
        Each row is (score, message); the message becomes a Porter-stemmed
        Document whose type is True when int(score) > 0 (HAM) and False
        otherwise (SPAM). The top argument is passed on to every Document.
        Documents are mostly of a technical nature (developer forum posts).
    """
    corpus = os.path.join(PATH, "corpora", "spam-apache.csv")
    rows = Datasheet.load(corpus)
    return vector.Model([
        vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        for score, message in rows
    ])
Esempio n. 19
0
 def test_predicative(self):
     # Assert the accuracy of the predicative algorithm ("belles" => "beau").
     # Only corpus rows tagged "a" are scored.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pred, attr, tag in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-fr-lexique.csv")):
         if tag == "a":
             if fr.predicative(attr) == pred:
                 hits += 1
             total += 1
     self.assertTrue(float(hits) / total > 0.95)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.fr.predicative()")
Esempio n. 20
0
 def test_intertextuality(self):
     # Evaluate accuracy for plagiarism detection.
     from pattern.db import Datasheet
     data = Datasheet.load(os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv"))
     # Each row is (text, source, flag); a pair is labelled as plagiarism
     # when int(flag) > 0. The loop variable is not called "plagiarism" so
     # it cannot be confused with the predicate defined below.
     data = [((txt, src), int(flag) > 0) for txt, src, flag in data]
     def plagiarism(txt, src):
         # Predicted as plagiarism when the intertextuality score between
         # the two texts (computed with n=3) exceeds 0.05.
         return metrics.intertextuality([txt, src], n=3)[0,1] > 0.05
     A, P, R, F = metrics.test(lambda x: plagiarism(*x), data)
     self.assertTrue(P > 0.96)
     self.assertTrue(R > 0.94)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.metrics.intertextuality()")
Esempio n. 21
0
 def test_singularize(self):
     # Accuracy of the singularization algorithm: the plural of each corpus
     # row must singularize to the row's singular in more than 84% of rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")
     outcomes = [
         it.singularize(plural) == singular
         for _pos, singular, plural, _mf in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.84)
     print("pattern.it.singularize()")
Esempio n. 22
0
 def load_domains(self):
     """Fill self.cat_dict with the category cells of data/source_data.csv.

     For every row the second cell is used as the key. The value is the
     cells from the third one onwards; the last cell is left out unless
     its text starts with a double quote.
     """
     sources_path = pd('data', 'source_data.csv')
     for entry in Datasheet.load(sources_path, headers=True):
         url = entry[1]
         # NOTE(review): the original condition relied on the truthiness of
         # str.find(), which is falsy only when the quote is at index 0.
         # That is spelled out here as startswith(); if the intent was
         # "the last cell contains a quote", this needs a real fix.
         if str(entry[-1]).startswith("\""):
             categories = entry[2:]
         else:
             categories = entry[2:-1]
         self.cat_dict[url] = categories
Esempio n. 23
0
 def test_spelling(self):
     # Spelling suggestion accuracy: every space-separated misspelling of a
     # corpus row is checked against the row's correct word, and the top
     # suggestion must match in more than 70% of the cases.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "birkbeck-spelling.csv")
     outcomes = [
         en.spelling(misspelled)[0][0] == correct
         for correct, wrong in Datasheet.load(corpus)
         for misspelled in wrong.split(" ")
     ]
     self.assertTrue(float(sum(outcomes)) / len(outcomes) > 0.70)
Esempio n. 24
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm:
     # the share of (singular, plural) corpus rows whose plural is
     # singularized back to the listed singular must exceed 95%.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for sg, pl in Datasheet.load(
             os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
         if en.inflect.singularize(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.95)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.en.inflect.singularize()")
Esempio n. 25
0
 def test_predicative(self):
     # Accuracy of the predicative algorithm ("großer" => "groß"), measured
     # only over the corpus rows tagged "a": more than 98% must match.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     outcomes = [
         de.predicative(attributive) == predicative
         for tag, predicative, attributive in Datasheet.load(corpus)
         if tag == "a"
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.98)
     print("pattern.de.predicative()")
Esempio n. 26
0
 def test_predicative(self):
     # Assert the accuracy of the predicative algorithm ("felle" => "fel").
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pred, attr, sg, pl in Datasheet.load(
             os.path.join(PATH, "corpora", "celex-wordforms-nl.csv")):
         if nl.predicative(attr) == pred:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.96)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.nl.predicative()")
Esempio n. 27
0
 def test_predicative(self):
     # Accuracy of the predicative algorithm ("belles" => "beau"), measured
     # only over the corpus rows tagged "a": more than 95% must match.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-fr-lexique.csv")
     outcomes = [
         fr.predicative(attributive) == predicative
         for predicative, attributive, tag in Datasheet.load(corpus)
         if tag == "a"
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.95)
     print("pattern.fr.predicative()")
Esempio n. 28
0
 def test_intertextuality(self):
     # Precision and recall of plagiarism detection over the corpus.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv")
     # Each row is (text, source, flag); a pair is labelled as plagiarism
     # when int(flag) > 0.
     samples = []
     for text, source, flag in Datasheet.load(corpus):
         samples.append(((text, source), int(flag) > 0))

     def is_plagiarism(pair):
         # Predicted as plagiarism when the intertextuality score between
         # the two texts (computed with n=3) exceeds 0.05.
         text, source = pair
         return metrics.intertextuality([text, source], n=3)[0, 1] > 0.05

     A, P, R, F = metrics.test(is_plagiarism, samples)
     self.assertTrue(P > 0.96)
     self.assertTrue(R > 0.94)
     print("pattern.metrics.intertextuality()")
Esempio n. 29
0
 def test_predicative(self):
     # Assert the accuracy of the predicative algorithm ("großer" => "groß").
     # Only corpus rows tagged "a" are scored.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "a":
             if de.predicative(attr) == pred:
                 hits += 1
             total += 1
     self.assertTrue(float(hits) / total > 0.98)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.de.predicative()")
Esempio n. 30
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm.
     # Only corpus rows tagged "n" are scored.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "n":
             if de.singularize(pl) == sg:
                 hits += 1
             total += 1
     self.assertTrue(float(hits) / total > 0.81)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.de.singularize()")
Esempio n. 31
0
 def test_pluralize(self):
     # Assert the accuracy of the pluralization algorithm.
     # Only corpus rows tagged "n" are scored.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "n":
             if de.pluralize(sg) == pl:
                 hits += 1
             total += 1
     self.assertTrue(float(hits) / total > 0.69)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.de.pluralize()")
Esempio n. 32
0
 def test_attributive(self):
     # Accuracy of the attributive algorithm ("fel" => "felle"): the
     # predicative form of each corpus row must map to the row's
     # attributive form in more than 96% of the rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")
     outcomes = [
         nl.attributive(predicative) == attributive
         for predicative, attributive, _sg, _pl in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.96)
     print("pattern.nl.attributive()")
Esempio n. 33
0
 def test_pluralize(self):
     # Accuracy of the pluralization algorithm, measured only over the
     # corpus rows tagged "n": the singular must pluralize to the row's
     # plural in more than 69% of those rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     outcomes = [
         de.pluralize(singular) == plural
         for tag, singular, plural in Datasheet.load(corpus)
         if tag == "n"
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.69)
     print("pattern.de.pluralize()")
Esempio n. 34
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm:
     # the plural of each corpus row must singularize to the row's singular
     # in more than 88% of the rows.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pred, attr, sg, pl in Datasheet.load(
             os.path.join(PATH, "corpora", "celex-wordforms-nl.csv")):
         if nl.singularize(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.88)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.nl.singularize()")
Esempio n. 35
0
 def test_pluralize(self):
     # "auto" must pluralize to "auto's".
     self.assertEqual("auto's", nl.inflect.pluralize("auto"))
     # Accuracy of the pluralization algorithm: the singular of each corpus
     # row must pluralize to the row's plural in more than 74% of the rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")
     outcomes = [
         nl.pluralize(singular) == plural
         for _pred, _attr, singular, plural in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.74)
     print("pattern.nl.pluralize()")
Esempio n. 36
0
 def test_spelling(self):
     # Spelling suggestion accuracy: every space-separated misspelling of a
     # corpus row is checked against the row's correct word, and the top
     # suggestion must match in more than 65% of the cases.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "spelling-ru.csv")
     outcomes = []
     for correct, wrong in Datasheet.load(corpus):
         for misspelled in wrong.split(" "):
             suggestions = ru.suggest(misspelled)
             outcomes.append(suggestions[0][0] == correct)
     self.assertTrue(float(sum(outcomes)) / len(outcomes) > 0.65)
     print("pattern.ru.suggest()")
Esempio n. 37
0
 def test_spelling(self):
     # Spelling suggestion accuracy: every space-separated misspelling of a
     # corpus row is checked against the row's correct word, and the top
     # suggestion must match in more than 70% of the cases.
     # Note: simply training on more text will not improve accuracy.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "birkbeck-spelling.csv")
     outcomes = []
     for correct, wrong in Datasheet.load(corpus):
         outcomes.extend(
             en.spelling(misspelled)[0][0] == correct
             for misspelled in wrong.split(" "))
     self.assertTrue(float(sum(outcomes)) / len(outcomes) > 0.70)
Esempio n. 38
0
 def test_pluralize(self):
     # Assert "auto's" as plural of "auto".
     self.assertEqual("auto's", nl.inflect.pluralize("auto"))
     # Assert the accuracy of the pluralization algorithm.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
         if nl.pluralize(sg) == pl:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.74)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.nl.pluralize()")
Esempio n. 39
0
 def test_spelling(self):
     # Spelling suggestion accuracy: every space-separated misspelling of a
     # corpus row is checked against the row's correct word, and the top
     # suggestion must match in more than 70% of the cases.
     # Note: simply training on more text will not improve accuracy.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "spelling-birkbeck.csv")
     outcomes = [
         en.spelling(misspelled)[0][0] == correct
         for correct, wrong in Datasheet.load(corpus)
         for misspelled in wrong.split(" ")
     ]
     self.assertTrue(float(sum(outcomes)) / len(outcomes) > 0.70)
Esempio n. 40
0
def scrape_news_text(news_url):
    """Download a news article, record its sentiment and return its text.

    The page at news_url is fetched and parsed. Its publication date is
    taken from the element with itemprop="datePublished" and its body from
    the <div id="articleText"> element. A row of
    [counter, date, url, sentiment] is appended to nasdaq2.csv and the
    module-level counter is incremented.

    Returns the article text. When the date element is missing a
    two-space string is returned, and when the article body is missing a
    three-space string is returned; nothing is recorded in either case.
    """
    global counter

    news_html = requests.get(news_url).content

    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})

    if date_object is None:
        return "  "

    if news_object is None:
        return "   "

    news_date = date_object.get_text()
    news_text = news_object.text

    print(news_url)

    try:
        # Results accumulate in a Datasheet, a table of rows and columns
        # that can be exported as a CSV file.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except Exception:
        # Start with an empty sheet when the existing one cannot be loaded
        # (e.g. on the first run). Unlike a bare except, this does not
        # swallow KeyboardInterrupt or SystemExit.
        table = Datasheet()

    news_sentiment = sentiment(news_text)

    print(news_sentiment)

    table.append([counter, news_date, news_url, news_sentiment])

    table.save(pd("nasdaq2.csv"))

    counter += 1

    return news_text
Esempio n. 41
0
 def test_predicative(self):
     # Assert the accuracy of the predicative algorithm ("cruciali" => "cruciale").
     # Only corpus rows whose pos is "j" are scored.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
         if pos != "j":
             continue
         if it.predicative(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.87)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.it.predicative()")
Esempio n. 42
0
 def test_gender(self):
     # Assert the accuracy of the gender disambiguation algorithm.
     # Every corpus row contributes two checks: the singular must yield the
     # row's gender without it.PLURAL, the plural must yield it with it.PLURAL.
     from pattern.db import Datasheet
     hits, total = 0, 0
     for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
         g = it.gender(sg)
         if mf in g and it.PLURAL not in g:
             hits += 1
         g = it.gender(pl)
         if mf in g and it.PLURAL in g:
             hits += 1
         total += 2
     self.assertTrue(float(hits) / total > 0.92)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.it.gender()")
Esempio n. 43
0
 def test_singularize(self):
     # Assert the accuracy of the singularization algorithm.
     from pattern.db import Datasheet
     # Group the word forms tagged "n" by lemma.
     forms_by_lemma = {}
     for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
         if tag == "n":
             forms_by_lemma.setdefault(lemma, []).append(w)
     hits, total = 0, 0
     for sg, forms in forms_by_lemma.items():
         # The longest form (the first one on ties) stands in for the plural.
         pl = max(forms, key=len)
         if es.singularize(pl) == sg:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.93)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.es.singularize()")
Esempio n. 44
0
 def test_predicative(self):
     # Assert the accuracy of the predicative algorithm ("horribles" => "horrible").
     from pattern.db import Datasheet
     # Group the word forms tagged "j" by lemma.
     forms_by_lemma = {}
     for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
         if tag == "j":
             forms_by_lemma.setdefault(lemma, []).append(w)
     hits, total = 0, 0
     for pred, forms in forms_by_lemma.items():
         # The longest form (the first one on ties) stands in for the
         # attributive form.
         attr = max(forms, key=len)
         if es.predicative(attr) == pred:
             hits += 1
         total += 1
     self.assertTrue(float(hits) / total > 0.92)
     # A single parenthesized argument prints identically on Python 2 and 3.
     print("pattern.es.predicative()")
Esempio n. 45
0
 def test_predicative(self):
     # Accuracy of the predicative algorithm ("horribles" => "horrible").
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-es-davies.csv")
     # Group the word forms tagged "j" by lemma.
     forms_by_lemma = {}
     for word, lemma, tag, _f in Datasheet.load(corpus):
         if tag != "j":
             continue
         forms_by_lemma.setdefault(lemma, []).append(word)
     # For each lemma the longest form (the first one on ties) is fed to
     # es.predicative() and must come back as the lemma.
     outcomes = [
         es.predicative(max(forms, key=len)) == lemma
         for lemma, forms in forms_by_lemma.items()
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.92)
     print("pattern.es.predicative()")
Esempio n. 46
0
 def test_singularize(self):
     # Accuracy of the singularization algorithm.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-es-davies.csv")
     # Group the word forms tagged "n" by lemma.
     forms_by_lemma = {}
     for word, lemma, tag, _f in Datasheet.load(corpus):
         if tag != "n":
             continue
         forms_by_lemma.setdefault(lemma, []).append(word)
     # For each lemma the longest form (the first one on ties) is fed to
     # es.singularize() and must come back as the lemma.
     outcomes = [
         es.singularize(max(forms, key=len)) == lemma
         for lemma, forms in forms_by_lemma.items()
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.93)
     print("pattern.es.singularize()")
Esempio n. 47
0
    def test_predicative(self):
        # Accuracy of the predicative algorithm ("cruciali" => "cruciale"),
        # measured only over the corpus rows whose pos is "j": the plural
        # must map to the row's singular in more than 87% of those rows.
        from pattern.db import Datasheet
        corpus = os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")
        outcomes = [
            it.predicative(plural) == singular
            for pos, singular, plural, _mf in Datasheet.load(corpus)
            if pos == "j"
        ]
        accuracy = float(sum(outcomes)) / len(outcomes)
        self.assertTrue(accuracy > 0.87)
        print("pattern.it.predicative()")
Esempio n. 48
0
 def test_gender(self):
     # Accuracy of the gender disambiguation algorithm.
     # Every corpus row contributes two checks: the singular must yield the
     # row's gender without it.PLURAL, the plural must yield it with it.PLURAL.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")
     hits = total = 0
     for _pos, singular, plural, expected in Datasheet.load(corpus):
         for word, wants_plural in ((singular, False), (plural, True)):
             found = it.gender(word)
             if expected in found and (it.PLURAL in found) == wants_plural:
                 hits += 1
             total += 1
     self.assertTrue(float(hits) / total > 0.92)
     print("pattern.it.gender()")
Esempio n. 49
0
 def test_pluralize(self):
     # "octopus" pluralizes to "octopodes" in classical mode
     # and to "octopuses" in modern mode.
     for expected, classical in (("octopodes", True), ("octopuses", False)):
         self.assertEqual(expected, en.inflect.pluralize("octopus", classical=classical))
     # Accuracy of the pluralization algorithm: the singular of each corpus
     # row must pluralize to the row's plural in more than 95% of the rows.
     from pattern.db import Datasheet
     corpus = os.path.join(PATH, "corpora", "wordforms-en-celex.csv")
     outcomes = [
         en.inflect.pluralize(singular) == plural
         for singular, plural in Datasheet.load(corpus)
     ]
     accuracy = float(sum(outcomes)) / len(outcomes)
     self.assertTrue(accuracy > 0.95)
     print("pattern.en.inflect.pluralize()")
Esempio n. 50
0
 def test_pluralize(self):
     # Assert "octopodes" for classical plural of "octopus".
     # Assert "octopuses" for modern plural.
     self.assertEqual("octopodes", en.inflect.pluralize("octopus", classical=True))
     self.assertEqual("octopuses", en.inflect.pluralize("octopus", classical=False))
     # Assert the accuracy of the pluralization algorithm:
     # more than 95% of the (singular, plural) rows in the CELEX corpus must match.
     from pattern.db import Datasheet
     i, n = 0, 0
     for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
         if en.inflect.pluralize(sg) == pl:
             i += 1
         n += 1
     self.assertTrue(float(i) / n > 0.95)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.en.inflect.pluralize()")
Esempio n. 51
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(nl.sentiment("geweldig")[0] > 0)
     self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 3,000 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Each row is (score, review); a review is labeled positive when its score is > 0.
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: nl.positive(review), reviews)
     self.assertTrue(A > 0.80)
     self.assertTrue(P > 0.77)
     self.assertTrue(R > 0.85)
     self.assertTrue(F > 0.81)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.nl.sentiment()")
Esempio n. 52
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(fr.sentiment("fabuleux")[0] > 0)
     self.assertTrue(fr.sentiment("terrible")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 1,500 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Each row is (review, score); a review is labeled positive when its score is > 0.
     reviews = []
     for review, score in Datasheet.load(os.path.join(PATH, "corpora", "polarity-fr-amazon.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: fr.positive(review), reviews)
     self.assertTrue(A > 0.75)
     self.assertTrue(P > 0.76)
     self.assertTrue(R > 0.73)
     self.assertTrue(F > 0.75)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.fr.sentiment()")
Esempio n. 53
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(fr.sentiment("fabuleux")[0] > 0)
     self.assertTrue(fr.sentiment("terrible")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 1,500 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Rows are read as (review, score); the label is True when the score is > 0.
     reviews = []
     for review, score in Datasheet.load(os.path.join(PATH, "corpora", "polarity-fr-amazon.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: fr.positive(review), reviews)
     self.assertTrue(A > 0.75)
     self.assertTrue(P > 0.76)
     self.assertTrue(R > 0.73)
     self.assertTrue(F > 0.75)
     # print(...) with one argument behaves the same on Python 2 and Python 3,
     # whereas the print statement does not compile on Python 3.
     print("pattern.fr.sentiment()")
Esempio n. 54
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(nl.sentiment("geweldig")[0] > 0)
     self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 3,000 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Rows are read as (score, review); the label is True when the score is > 0.
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: nl.positive(review), reviews)
     self.assertTrue(A > 0.80)
     self.assertTrue(P > 0.77)
     self.assertTrue(R > 0.85)
     self.assertTrue(F > 0.81)
     # print(...) with one argument behaves the same on Python 2 and Python 3,
     # whereas the print statement does not compile on Python 3.
     print("pattern.nl.sentiment()")
Esempio n. 55
0
 def test_spelling(self):
     # Assert case-sensitivity + numbers:
     # each of these inputs must come back unchanged as the top suggestion.
     for a, b in ((".", "."), ("?", "?"), ("!", "!"), ("I", "I"),
                  ("a", "a"), ("42", "42"), ("3.14", "3.14"),
                  ("The", "The"), ("the", "the")):
         self.assertEqual(en.suggest(a)[0][0], b)
     # Assert spelling suggestion accuracy.
     # Note: simply training on more text will not improve accuracy.
     # i counts words whose top suggestion equals the correct word, j counts the rest;
     # both are floats so the ratio below is a true division on Python 2 as well.
     i = j = 0.0
     from pattern.db import Datasheet
     for correct, wrong in Datasheet.load(
             os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
         # `wrong` is split on single spaces; each piece is checked separately.
         for w in wrong.split(" "):
             if en.suggest(w)[0][0] == correct:
                 i += 1
             else:
                 j += 1
     self.assertTrue(i / (i + j) > 0.70)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.en.suggest()")
Esempio n. 56
0
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # The corpus path is relative to the current working directory.
     # Each row is (score, review); a review is labeled positive when its score is > 0.
     reviews = []
     for score, review in Datasheet.load(os.path.join("corpora", "pang&lee-polarity.txt")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.71)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.70)
     self.assertTrue(F > 0.71)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.en.sentiment()")
Esempio n. 57
0
 def test_sentiment_twitter(self):
     # Evaluate en.positive() on tweets from the Sanders Twitter Sentiment Corpus:
     # http://www.sananalytics.com/lab/twitter-sentiment/
     # Nothing is asserted when the corpus file is not present.
     sanders = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
     if not os.path.exists(sanders):
         return
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Positive + neutral tweets are the positive class (polarity >= 0.0),
     # negative tweets are the negative class (polarity < 0.0),
     # and rows marked "irrelevant" are left out.
     # Neutral cases are frequent and the algorithm predicts 0.0 by default
     # (i.e., the majority class), so the scores are good; telling negative,
     # neutral and positive apart is a much harder task.
     labeled = [
         (tweet, polarity in ("positive", "neutral"))
         for _row, _tweet_id, _date, tweet, polarity, _topic in Datasheet.load(sanders)
         if polarity != "irrelevant"
     ]
     A, P, R, F = test(lambda review: en.positive(review, threshold=0.0), labeled)
     for measured, baseline in ((A, 0.824), (P, 0.878), (R, 0.912), (F, 0.895)):
         self.assertTrue(measured > baseline)
Esempio n. 58
0
 def test_modality(self):
     # Assert -1.0 => +1.0 representing the degree of certainty.
     v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
     self.assertTrue(v < 0)
     v = en.modality(en.Sentence(en.parse("It will surely stop raining soon.")))
     self.assertTrue(v > 0)
     # Assert the accuracy of the modality algorithm.
     # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # Each row is (certain, sentence); the text is parsed without chunking
     # and wrapped in a Sentence, labeled True when `certain` is > 0.
     sentences = []
     for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "conll2010-uncertainty.csv")):
         sentence = en.parse(sentence, chunks=False, light=True)
         sentence = en.Sentence(sentence)
         sentences.append((sentence, int(certain) > 0))
     # A sentence is predicted as certain when its modality exceeds 0.5.
     A, P, R, F = test(lambda sentence: en.modality(sentence) > 0.5, sentences)
     self.assertTrue(A > 0.67)
     self.assertTrue(P > 0.69)
     self.assertTrue(R > 0.62)
     self.assertTrue(F > 0.65)
     # Parenthesized single-argument print parses on both Python 2 and Python 3
     # (the bare print statement is a SyntaxError on Python 3).
     print("pattern.en.modality()")
Esempio n. 59
0
sys.path.insert(0, os.path.join("..", ".."))

from pattern.vector import Document, Model, NB
from pattern.db import Datasheet

# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for models
# that have many documents and many features.
# It is outperformed by KNN and SVM, but useful as a baseline for tests.

# We'll test it with a corpus of spam e-mail messages,
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
# The path is resolved relative to this script's own directory.
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora",
                    "spam-apache.csv")
# `data` is rebound from the file path to the loaded Datasheet.
data = Datasheet.load(data)

# Build one Document per (score, message) row;
# its type is True when the score parses to an integer greater than 0.
documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

# Report the size of the model: document count, length of the model vector,
# and the mean number of features per document.
print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
Esempio n. 60
0
# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
#    Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already
    # exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except Exception:
    # Loading failed (presumably there is no usable "opinions.csv" yet):
    # start from an empty table and an empty id index.
    # Catching Exception instead of using a bare except keeps this best-effort
    # fallback while letting KeyboardInterrupt and SystemExit propagate.
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))