def test_sentiment(self):
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    self.assertTrue(en.sentiment("wonderful")[0] > 0)
    self.assertTrue(en.sentiment("horrible")[0] < 0)
    self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
    self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
    # Assert that :) and :( are recognized.
    self.assertTrue(en.sentiment(":)")[0] > 0)
    self.assertTrue(en.sentiment(":(")[0] < 0)
    # Assert the accuracy of the sentiment analysis (for the positive class).
    # Given are the scores for Pang & Lee's polarity dataset v2.0:
    # http://www.cs.cornell.edu/people/pabo/movie-review-data/
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    reviews = []
    for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: en.positive(review), reviews)
    self.assertTrue(A > 0.755)
    self.assertTrue(P > 0.760)
    self.assertTrue(R > 0.747)
    self.assertTrue(F > 0.754)
    # Assert the accuracy of the sentiment analysis on short text (for the positive class).
    # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
    # http://www.cs.cornell.edu/people/pabo/movie-review-data/
    reviews = []
    for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: en.positive(review), reviews)
    self.assertTrue(A > 0.642)
    self.assertTrue(P > 0.653)
    self.assertTrue(R > 0.607)
    self.assertTrue(F > 0.629)
    print("pattern.en.sentiment()")

def testModel(self, *args):
    """ Perform learning of a Model from training data. """
    documents = []
    data = Datasheet.load(os.path.join("corpora", "twitter", "trainer", "tweets_stream_data_nb.csv"))
    data2 = Datasheet.load(os.path.join("corpora", "twitter", "trainer", "tweets_stream_data_svm.csv"))
    data3 = Datasheet.load(os.path.join("corpora", "twitter", "trainer", "ensenmble.csv"))
    if args:
        classifier = Classifier.load('models/nb_model.ept')
        print("Document class is %s" % classifier.classify(Document(args[0])))
        print("Document probability is : ", classifier.classify(Document(args[0]), discrete=False))
        label = classifier.classify(Document(args[0]), discrete=False)
        print(label["positive"])
    else:
        i = n = 0
        pos = neg = 0
        classifier = Classifier.load('models/nb_model.ept')
        data = shuffled(data)
        for document, label in data[:] + data2[:] + data3[:]:
            doc_vector = Document(document, type=str(label), stopwords=True)
            documents.append(doc_vector)
            if 'positive' in label:
                pos += 1
            else:
                neg += 1
        print("10-fold CV")
        print(k_fold_cv(NB, documents=documents, folds=10))
        print("Neg: %s, Pos: %s" % (neg, pos))
        print(classifier.distribution)
        print("Classes in Naive Bayes Classifier")
        print(classifier.classes)
        print("Area Under the Curve: %0.6f" % classifier.auc(documents, k=10))
        print("Model Performance (Positive Classifications)")
        accuracy, precision, recall, f1 = classifier.test(data[:] + data2[:] + data3[:], target='positive')
        print("Accuracy = %.6f; F-Score = %.6f; Precision = %.6f; Recall = %.6f" % (accuracy, f1, precision, recall))
        print("Model Performance (Negative Classifications)")
        accuracy, precision, recall, f1 = classifier.test(data[:] + data2[:] + data3[:], target='negative')
        print("Accuracy = %.6f; F-Score = %.6f; Precision = %.6f; Recall = %.6f" % (accuracy, f1, precision, recall))
        print("Model Performance")
        accuracy, precision, recall, f1 = classifier.test(data[:] + data2[:] + data3[:])
        print("Accuracy = %.6f; F-Score = %.6f; Precision = %.6f; Recall = %.6f" % (accuracy, f1, precision, recall))
        print("Confusion Matrix")
        print(classifier.confusion_matrix(data[:] + data2[:] + data3[:]))
        print(classifier.confusion_matrix(data[:] + data2[:] + data3[:])('positive'))
        print(classifier.confusion_matrix(data[:] + data2[:] + data3[:])('negative'))

def test_sentiment_twitter(self):
    sanders = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
    if os.path.exists(sanders):
        # Assert the accuracy of the sentiment analysis on tweets.
        # Given are the scores for the Sanders Twitter Sentiment Corpus:
        # http://www.sananalytics.com/lab/twitter-sentiment/
        # Positive + neutral is taken as polarity >= 0.0,
        # negative is taken as polarity < 0.0.
        # Since there are a lot of neutral cases, and the algorithm predicts
        # 0.0 by default (i.e., the majority class), the results are good.
        # Distinguishing negative from neutral from positive is a much harder task.
        from pattern.db import Datasheet
        from pattern.metrics import test
        reviews = []
        for i, id, date, tweet, polarity, topic in Datasheet.load(sanders):
            if polarity != "irrelevant":
                reviews.append((tweet, polarity in ("positive", "neutral")))
        A, P, R, F = test(lambda review: en.positive(review, threshold=0.0), reviews)
        #print(A, P, R, F)
        self.assertTrue(A > 0.824)
        self.assertTrue(P > 0.879)
        self.assertTrue(R > 0.911)
        self.assertTrue(F > 0.895)

def test_spelling(self):
    # Assert case-sensitivity + numbers.
    for a, b in (
      (".", "."),
      ("?", "?"),
      ("!", "!"),
      ("I", "I"),
      ("a", "a"),
      ("42", "42"),
      ("3.14", "3.14"),
      ("The", "The"),
      ("the", "the")):
        self.assertEqual(en.suggest(a)[0][0], b)
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.suggest(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)
    print("pattern.en.suggest()")

def test_modality(self):
    # Assert -1.0 => +1.0 representing the degree of certainty.
    v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
    self.assertTrue(v < 0)
    v = en.modality(en.Sentence(en.parse("It will surely stop raining soon.")))
    self.assertTrue(v > 0)
    # Assert the accuracy of the modality algorithm.
    # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
    # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    sentences = []
    for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")):
        sentence = en.parse(sentence, chunks=False, light=True)
        sentence = en.Sentence(sentence)
        sentences.append((sentence, int(certain) > 0))
    A, P, R, F = test(lambda sentence: en.modality(sentence) > 0.5, sentences)
    #print(A, P, R, F)
    self.assertTrue(A > 0.69)
    self.assertTrue(P > 0.72)
    self.assertTrue(R > 0.64)
    self.assertTrue(F > 0.68)
    print("pattern.en.modality()")

def main():
    table = Datasheet()
    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = "%s" % plaintext(td.content).replace('\n', ',', 3).replace("Telefono:", strcap).replace(";", "").replace("Partita Iva", ",Partita Iva") + "\n"
            if save is not None:
                row.append(save)
        table.append(row)
        print("%s ----------------------------------------------------------------------------" % str(j))
    table.save("files/farmacie_torino.txt")

def main():
    table = Datasheet()
    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    doc = Document(connection.read())
    items = doc.by_class('ulamm')[1:]
    row = []
    for ul in items:
        li = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in li:
            if el is not None:
                save = "%s, %s \n" % (plaintext(el.content).replace('\n', ','), kind)
                row.append(save)
    table.append(row)
    table.save("files/h_torino.txt")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
        if en.inflect.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.95)
    print("pattern.en.inflect.singularize()")

def test_attributive(self):
    # Assert the accuracy of the attributive algorithm ("fel" => "felle").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join("corpora", "celex-wordforms-nl.csv")):
        if nl.attributive(pred) == attr:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.96)
    print("pattern.nl.attributive()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
        if it.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.84)
    print("pattern.it.singularize()")

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("felle" => "fel").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.predicative(attr) == pred:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.96)
    print("pattern.nl.predicative()")

def model(top=None):
    """ Returns a Model of e-mail messages.
        Document type=True => HAM, False => SPAM.
        Documents are mostly of a technical nature (developer forum posts).
    """
    documents = []
    for score, message in Datasheet.load(os.path.join(PATH, "corpora", "spam-apache.csv")):
        document = vector.Document(message, stemmer="porter", top=top, type=int(score) > 0)
        documents.append(document)
    return vector.Model(documents)

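# A minimal usage sketch for model() above (an assumption, not part of the
# original snippet): train Pattern's Naive Bayes classifier on the returned
# Documents and classify an unseen message. Assumes `vector` is the
# `pattern.vector` module already imported in this file.
def classify_example():
    m = model(top=1000)
    # Classifier subclasses such as NB accept the training Documents directly.
    nb = vector.NB(train=m.documents)
    # classify() returns the learned type: True => HAM, False => SPAM.
    return nb.classify(vector.Document("Win a free iPhone, click here!"))
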
def test_pluralize(self):
    # Assert the accuracy of the pluralization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
        if it.pluralize(sg) == pl:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.93)
    print("pattern.it.pluralize()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.88)
    print("pattern.nl.singularize()")

def test_sentiment(self):
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    self.assertTrue(en.sentiment("wonderful")[0] > 0)
    self.assertTrue(en.sentiment("horrible")[0] < 0)
    self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
    self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
    # Assert that :) and :( are recognized.
    self.assertTrue(en.sentiment(":)")[0] > 0)
    self.assertTrue(en.sentiment(":(")[0] < 0)
    # Assert the accuracy of the sentiment analysis (for the positive class).
    # Given are the scores for Pang & Lee's polarity dataset v2.0:
    # http://www.cs.cornell.edu/people/pabo/movie-review-data/
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    reviews = []
    for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
        reviews.append((review, int(score) > 0))
    from time import time
    t = time()
    A, P, R, F = test(lambda review: en.positive(review), reviews)
    #print(A, P, R, F)
    self.assertTrue(A > 0.751)
    self.assertTrue(P > 0.770)
    self.assertTrue(R > 0.710)
    self.assertTrue(F > 0.710)
    # Assert the accuracy of the sentiment analysis on short text (for the positive class).
    # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
    # http://www.cs.cornell.edu/people/pabo/movie-review-data/
    reviews = []
    for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: en.positive(review), reviews)
    #print(A, P, R, F)
    self.assertTrue(A > 0.654)
    self.assertTrue(P > 0.660)
    self.assertTrue(R > 0.636)
    self.assertTrue(F > 0.648)
    print("pattern.en.sentiment()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if tag == "n":
            if de.singularize(pl) == sg:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.82)
    print("pattern.de.singularize()")

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("belles" => "beau").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, tag in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-fr-lexique.csv")):
        if tag == "a":
            if fr.predicative(attr) == pred:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.95)
    print("pattern.fr.predicative()")

def test_intertextuality(self):
    # Evaluate accuracy for plagiarism detection.
    from pattern.db import Datasheet
    data = Datasheet.load(os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv"))
    data = [((txt, src), int(plagiarism) > 0) for txt, src, plagiarism in data]

    def plagiarism(txt, src):
        return metrics.intertextuality([txt, src], n=3)[0, 1] > 0.05

    A, P, R, F = metrics.test(lambda x: plagiarism(*x), data)
    self.assertTrue(P > 0.96)
    self.assertTrue(R > 0.94)
    print("pattern.metrics.intertextuality()")

def load_domains(self):
    """ Loads domain information. """
    sources_path = pd('data', 'source_data.csv')
    domain_file = Datasheet.load(sources_path, headers=True)
    for row in domain_file:
        url = row[1]
        # str.find() returns -1 (which is truthy) when the substring is absent,
        # so the original `if str(row[-1]).find("\"")` was almost always true;
        # test for containment instead.
        if '"' in str(row[-1]):
            cats = row[2:-1]
        else:
            cats = row[2:]
        self.cat_dict[url] = cats

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("großer" => "groß").
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if tag == "a":
            if de.predicative(attr) == pred:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.98)
    print("pattern.de.predicative()")

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("felle" => "fel").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-nl.csv")):
        if nl.predicative(attr) == pred:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.96)
    print("pattern.nl.predicative()")

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("großer" => "groß").
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, pred, attr in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
        if tag == "a":
            if de.predicative(attr) == pred:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.98)
    print("pattern.de.predicative()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
        if tag == "n":
            if de.singularize(pl) == sg:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.81)
    print("pattern.de.singularize()")

def test_pluralize(self):
    # Assert the accuracy of the pluralization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
        if tag == "n":
            if de.pluralize(sg) == pl:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.69)
    print("pattern.de.pluralize()")

def test_attributive(self):
    # Assert the accuracy of the attributive algorithm ("fel" => "felle").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.attributive(pred) == attr:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.96)
    print("pattern.nl.attributive()")

def test_pluralize(self):
    # Assert the accuracy of the pluralization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-de-celex.csv")):
        if tag == "n":
            if de.pluralize(sg) == pl:
                i += 1
            n += 1
    self.assertTrue(float(i) / n > 0.69)
    print("pattern.de.pluralize()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-nl.csv")):
        if nl.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.88)
    print("pattern.nl.singularize()")

def test_pluralize(self):
    # Assert "auto's" as plural of "auto".
    self.assertEqual("auto's", nl.inflect.pluralize("auto"))
    # Assert the accuracy of the pluralization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pred, attr, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-nl-celex.csv")):
        if nl.pluralize(sg) == pl:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.74)
    print("pattern.nl.pluralize()")

def test_spelling(self):
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-ru.csv")):
        for w in wrong.split(" "):
            suggested = ru.suggest(w)
            if suggested[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.65)
    print("pattern.ru.suggest()")

def test_spelling(self):
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "birkbeck-spelling.csv")):
        for w in wrong.split(" "):
            if en.spelling(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)

def test_spelling(self):
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.spelling(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)

def scrape_news_text(news_url):
    global counter
    news_html = requests.get(news_url).content
    # print(news_html)
    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')
    # soup.find("div", {"id": "articlebody"})
    # paragraphs = [par.text for par in news_soup.find_all('p')]
    # news_text = '\n'.join(paragraphs)
    # print(news_soup.find("div", {"id": "articleText"}))
    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})
    if date_object is None:
        return " "
    if news_object is None:
        return " "
    news_date = date_object.get_text()  # find("div", {"id": "articleText"}).text
    news_text = news_object.text
    # print(news_date)
    # print(news_text)
    print(news_url)
    try:
        # We'll store the scraped articles in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each row.
        # We only want to add the latest articles, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except Exception:
        table = Datasheet()
    news_sentiment = sentiment(news_text)
    print(news_sentiment)
    table.append([counter, news_date, news_url, news_sentiment])
    table.save(pd("nasdaq2.csv"))
    counter += 1
    return news_text

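# The comments in scrape_news_text() describe skipping rows we have already
# seen, but the function appends unconditionally. A minimal sketch of that
# check (append_if_new is a hypothetical helper, not in the original script):
def append_if_new(table, row, url_column=2):
    # Column 2 of each row holds the article URL; a set gives O(1) membership tests.
    seen = set(table.columns[url_column]) if len(table) > 0 else set()
    if row[url_column] not in seen:
        table.append(row)
        return True
    return False
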
def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("cruciali" => "cruciale").
    from pattern.db import Datasheet
    i, n = 0, 0
    for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
        if pos != "j":
            continue
        if it.predicative(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.87)
    print("pattern.it.predicative()")

def test_gender(self):
    # Assert the accuracy of the gender disambiguation algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for pos, sg, pl, mf in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-it-wiktionary.csv")):
        g = it.gender(sg)
        if mf in g and it.PLURAL not in g:
            i += 1
        g = it.gender(pl)
        if mf in g and it.PLURAL in g:
            i += 1
        n += 2
    self.assertTrue(float(i) / n > 0.92)
    print("pattern.it.gender()")

def test_singularize(self):
    # Assert the accuracy of the singularization algorithm.
    from pattern.db import Datasheet
    test = {}
    for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
        if tag == "n":
            test.setdefault(lemma, []).append(w)
    i, n = 0, 0
    for sg, pl in test.items():
        pl = sorted(pl, key=len, reverse=True)[0]
        if es.singularize(pl) == sg:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.93)
    print("pattern.es.singularize()")

def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("horribles" => "horrible").
    from pattern.db import Datasheet
    test = {}
    for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
        if tag == "j":
            test.setdefault(lemma, []).append(w)
    i, n = 0, 0
    for pred, attr in test.items():
        attr = sorted(attr, key=len, reverse=True)[0]
        if es.predicative(attr) == pred:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.92)
    print("pattern.es.predicative()")

def test_pluralize(self):
    # Assert "octopodes" for classical plural of "octopus".
    # Assert "octopuses" for modern plural.
    self.assertEqual("octopodes", en.inflect.pluralize("octopus", classical=True))
    self.assertEqual("octopuses", en.inflect.pluralize("octopus", classical=False))
    # Assert the accuracy of the pluralization algorithm.
    from pattern.db import Datasheet
    i, n = 0, 0
    for sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-en-celex.csv")):
        if en.inflect.pluralize(sg) == pl:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.95)
    print("pattern.en.inflect.pluralize()")

def test_sentiment(self):
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    self.assertTrue(nl.sentiment("geweldig")[0] > 0)
    self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
    # Assert the accuracy of the sentiment analysis.
    # Given are the scores for 3,000 book reviews.
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    reviews = []
    for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: nl.positive(review), reviews)
    self.assertTrue(A > 0.80)
    self.assertTrue(P > 0.77)
    self.assertTrue(R > 0.85)
    self.assertTrue(F > 0.81)
    print("pattern.nl.sentiment()")

def test_sentiment(self):
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    self.assertTrue(fr.sentiment("fabuleux")[0] > 0)
    self.assertTrue(fr.sentiment("terrible")[0] < 0)
    # Assert the accuracy of the sentiment analysis.
    # Given are the scores for 1,500 book reviews.
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    reviews = []
    for review, score in Datasheet.load(os.path.join(PATH, "corpora", "polarity-fr-amazon.csv")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: fr.positive(review), reviews)
    self.assertTrue(A > 0.75)
    self.assertTrue(P > 0.76)
    self.assertTrue(R > 0.73)
    self.assertTrue(F > 0.75)
    print("pattern.fr.sentiment()")

def test_sentiment(self):
    # Assert < 0 for negative adjectives and > 0 for positive adjectives.
    self.assertTrue(en.sentiment("wonderful")[0] > 0)
    self.assertTrue(en.sentiment("horrible")[0] < 0)
    self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
    self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
    # Assert the accuracy of the sentiment analysis.
    # Given are the scores for Pang & Lee's polarity dataset v2.0:
    # http://www.cs.cornell.edu/people/pabo/movie-review-data/
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    reviews = []
    for score, review in Datasheet.load(os.path.join("corpora", "pang&lee-polarity.txt")):
        reviews.append((review, int(score) > 0))
    A, P, R, F = test(lambda review: en.positive(review), reviews)
    self.assertTrue(A > 0.71)
    self.assertTrue(P > 0.72)
    self.assertTrue(R > 0.70)
    self.assertTrue(F > 0.71)
    print("pattern.en.sentiment()")

def test_sentiment_twitter(self):
    sanders = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
    if os.path.exists(sanders):
        # Assert the accuracy of the sentiment analysis on tweets.
        # Given are the scores for the Sanders Twitter Sentiment Corpus:
        # http://www.sananalytics.com/lab/twitter-sentiment/
        # Positive + neutral is taken as polarity >= 0.0,
        # negative is taken as polarity < 0.0.
        # Since there are a lot of neutral cases, and the algorithm predicts
        # 0.0 by default (i.e., the majority class), the results are good.
        # Distinguishing negative from neutral from positive is a much harder task.
        from pattern.db import Datasheet
        from pattern.metrics import test
        reviews = []
        for i, id, date, tweet, polarity, topic in Datasheet.load(sanders):
            if polarity != "irrelevant":
                reviews.append((tweet, polarity in ("positive", "neutral")))
        A, P, R, F = test(lambda review: en.positive(review, threshold=0.0), reviews)
        self.assertTrue(A > 0.824)
        self.assertTrue(P > 0.878)
        self.assertTrue(R > 0.912)
        self.assertTrue(F > 0.895)

def test_modality(self):
    # Assert -1.0 => +1.0 representing the degree of certainty.
    v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
    self.assertTrue(v < 0)
    v = en.modality(en.Sentence(en.parse("It will surely stop raining soon.")))
    self.assertTrue(v > 0)
    # Assert the accuracy of the modality algorithm.
    # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
    # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
    # The baseline should increase (not decrease) when the algorithm is modified.
    from pattern.db import Datasheet
    from pattern.metrics import test
    sentences = []
    for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "conll2010-uncertainty.csv")):
        sentence = en.parse(sentence, chunks=False, light=True)
        sentence = en.Sentence(sentence)
        sentences.append((sentence, int(certain) > 0))
    A, P, R, F = test(lambda sentence: en.modality(sentence) > 0.5, sentences)
    self.assertTrue(A > 0.67)
    self.assertTrue(P > 0.69)
    self.assertTrue(R > 0.62)
    self.assertTrue(F > 0.65)
    print("pattern.en.modality()")

sys.path.insert(0, os.path.join("..", ".."))

from pattern.vector import Document, Model, NB
from pattern.db import Datasheet

# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for models
# that have many documents and many features.
# It is outperformed by KNN and SVM, but useful as a baseline for tests.
# We'll test it with a corpus of spam e-mail messages,
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "spam-apache.csv")
data = Datasheet.load(data)

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
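# A hedged continuation of the comment above (the snippet stops here, so the
# training step itself is an assumption): Pattern's NB is trained by passing
# the Documents to the constructor.
nb = NB(train=m.documents)
print("classes:", nb.classes)  # [False, True] => spam / real e-mail
# Classify an unseen message; the result is the learned type.
print(nb.classify(Document("Win a free iPhone, click here!")))
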
# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
# Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except Exception:
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
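    # A plausible continuation (an assumption, mirroring the comments above):
    # only keep status updates whose id we haven't seen yet.
    if status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Save the accumulated status updates back to the CSV-file.
table.save(pd("opinions.csv"))
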