def test_search_function(self):
    # Assert search() function.
    s = Sentence(parse("Go on Bors, chop his head off!"))
    m = search.search("PRP*? NN*", s)
    self.assertEqual(m[0].string, "Bors")
    self.assertEqual(m[1].string, "his head")
    print("pattern.search.search()")
def test_document(self):
    # Assert Document properties.
    # Test with different input types.
    for constructor, w in (
      (vector.Document, "The cats sit on the mat."),
      (vector.Document, ["The", "cats", "sit", "on", "the", "mat"]),
      (vector.Document, {"cat": 1, "mat": 1, "sit": 1}),
      (vector.Document, Text(parse("The cats sat on the mat."))),
      (vector.Document, Sentence(parse("The cats sat on the mat.")))):
        # Test copy.
        v = constructor(w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT")
        v = v.copy()
        # Test properties.
        self.assertEqual(v.name, "Cat")
        self.assertEqual(v.type, "CAT")
        self.assertEqual(v.count, 3)
        self.assertEqual(v.terms, {"cat": 1, "mat": 1, "sit": 1})
        # Test iterator decoration.
        self.assertEqual(sorted(v.features), ["cat", "mat", "sit"])
        self.assertEqual(sorted(v), ["cat", "mat", "sit"])
        self.assertEqual(len(v), 3)
        self.assertEqual(v["cat"], 1)
        self.assertEqual("cat" in v, True)
    print("pattern.vector.Document")
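# A minimal usage sketch (not part of the test suite above): Documents collected
# in a Model can be compared by cosine similarity. This assumes the documented
# pattern.vector API, i.e. Model(documents=[...]) and Model.similarity().
from pattern.vector import Document, Model

d1 = Document("The cats sit on the mat.", name="cat1")
d2 = Document("The cats sat on the mat.", name="cat2")
m = Model(documents=[d1, d2])
print(m.similarity(d1, d2)) # Cosine similarity, between 0.0 and 1.0.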
def test_match(self):
    # Assert Constraint-Word matching.
    R = search.Constraint.fromstring
    S = lambda s: Sentence(parse(s, relations=True, lemmata=True))
    W = lambda s, tag=None, index=0: search.Word(None, s, tag, index)
    for constraint, tests in (
      (R("cat|dog"),  [(W("cat"), 1), (W("dog"), 1), (W("fish"), 0)]),
      (R("cat*"),     [(W("cats"), 1)]),
      (R("*cat"),     [(W("tomcat"), 1)]),
      (R("c*t|d*g"),  [(W("cat"), 1), (W("cut"), 1), (W("dog"), 1), (W("dig"), 1)]),
      (R("cats|NN*"), [(W("cats", "NNS"), 1), (W("cats"), 0)]),
      (R("^cat"),     [(W("cat", "NN", index=0), 1), (W("cat", "NN", index=1), 0)]),
      (R("*|!cat"),   [(W("cat"), 0), (W("dog"), 1), (W("fish"), 1)]),
      (R("my cat"),   [(W("cat"), 0)]),
      (R("my cat"),   [(S("my cat").words[1], 1)]), # "my cat" is an overspecification of "cat".
      (R("my_cat"),   [(S("my cat").words[1], 1)]),
      (R("cat|NP"),   [(S("my cat").words[1], 1)]),
      (R("dog|VP"),   [(S("my dog").words[1], 0)]),
      (R("cat|SBJ"),  [(S("the cat is sleeping").words[1], 1)]),
      (R("dog"),      [(S("MY DOGS").words[1], 1)]), # Lemma matches.
      (R("dog"),      [(S("MY DOG").words[1], 1)])): # Case-insensitive.
        for test, b in tests:
            self.assertEqual(constraint.match(test), bool(b))
    # Assert Constraint-Taxa matching.
    t = search.Taxonomy()
    t.append("Tweety", type="bird")
    t.append("Steven", type="bird")
    v = search.Constraint.fromstring("BIRD", taxonomy=t)
    self.assertTrue(v.match(W("bird")))
    self.assertTrue(v.match(S("tweeties")[0]))
    self.assertTrue(v.match(W("Steven")))
    print("pattern.search.Constraint.match()")
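# A minimal usage sketch (not part of the test suite above): taxonomy terms such
# as BIRD can also be used in full patterns. This assumes Pattern.fromstring()
# accepts the same taxonomy keyword as Constraint.fromstring() used above.
from pattern.search import Taxonomy, Pattern
from pattern.en import Sentence, parse

t = Taxonomy()
t.append("tweety", type="bird")
p = Pattern.fromstring("BIRD", taxonomy=t)
s = Sentence(parse("Tweety sings."))
print(p.search(s)) # Expected: a match for "Tweety".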
def myExtract(statement):
    s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    p = Pattern.fromstring('There be DT NN+')
    match = p.search(s)
    #raise Exception(match)
    return match
def basicExtract(statement):
    #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
    s = Sentence(parse(statement, lemmata=True))
    m = search("There be DT {JJ? NN}", s)
    return m
def test_match(self):
    # Assert Match properties.
    s = Sentence(parse("Death awaits you all with nasty, big, pointy teeth."))
    p = search.Pattern(sequence=[
        search.Constraint(tags=["JJ"], optional=True),
        search.Constraint(tags=["NN*"])])
    m = p.search(s)
    self.assertTrue(isinstance(m, list))
    self.assertEqual(m[0].pattern, p)
    self.assertEqual(m[1].pattern, p)
    self.assertEqual(m[0].words, [s.words[0]])
    self.assertEqual(m[1].words, [s.words[-3], s.words[-2]])
    # Assert constraint "NN*" links to "Death" and "teeth", and "JJ" to "pointy".
    self.assertEqual(m[0].constraint(s.words[0]),  p[1])
    self.assertEqual(m[1].constraint(s.words[-3]), p[0])
    self.assertEqual(m[1].constraint(s.words[-2]), p[1])
    # Assert constraints "JJ NN*" link to chunk "pointy teeth".
    self.assertEqual(m[1].constraints(s.chunks[-1]), [p[0], p[1]])
    # Assert Match.constituents() by constraint, constraint index and list of indices.
    self.assertEqual(m[1].constituents(), [s.words[-3], s.words[-2]])
    self.assertEqual(m[1].constituents(constraint=p[0]), [s.words[-3]])
    self.assertEqual(m[1].constituents(constraint=1), [s.words[-2]])
    self.assertEqual(m[1].constituents(constraint=(0, 1)), [s.words[-3], s.words[-2]])
    # Assert Match.string.
    self.assertEqual(m[1].string, "pointy teeth")
    print("pattern.search.Match")
def test_sentence():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint
    sent1 = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ."
    sent2 = "Bachelor's degree in Computer Science is required."
    sent3 = "He created the robot and broke it after making it."
    sent4 = "A Computer Science or related degree "
    sent5 = "bachelors degree in Computer Science or Information Systems and/or related experience required"
    result = parse(sent5,
        tokenize=True,  # Tokenize the input, i.e. split punctuation from words.
        tags=True,      # Find part-of-speech tags.
        chunks=True,    # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations=True, # Find relations between chunks.
        lemmata=True,   # Find word lemmata.
        light=True)
    pprint(result)
    sen = Sentence(result)
    # print(type(sen))
    print(sen)
    for chunk in sen.chunks:
        print(chunk.type, [(w.string, w.type) for w in chunk.words])
def test_match_function(self):
    # Assert match() function.
    s = Sentence(parse("Go on Bors, chop his head off!"))
    m1 = search.match("chop NP off", s, strict=False)
    m2 = search.match("chop NP+ off", s, strict=True)
    self.assertEqual(m1.constituents()[1].string, "his head")
    self.assertEqual(m2.constituents()[1].string, "his head")
    print("pattern.search.match()")
def modality(sentence, type=EPISTEMIC):
    """ Returns the sentence's modality as a weight between -1.0 and +1.0.
        Currently, the only type implemented is EPISTEMIC.
        Epistemic modality is used to express possibility
        (i.e., how truthful is what is being said).
    """
    if isinstance(sentence, basestring):
        try:
            # A Sentence is expected but a string is given.
            # Attempt to parse the string on-the-fly.
            from pattern.en import parse, Sentence
            sentence = Sentence(parse(sentence))
        except ImportError:
            pass
    S, n, m = sentence, 0.0, 0
    if not (hasattr(S, "words") and hasattr(S, "parse_token")):
        raise TypeError("%s object is not a parsed Sentence" % repr(S.__class__.__name__))
    if type == EPISTEMIC:
        r = S.string.rstrip(" .!")
        for k, v in epistemic_weaseling.items():
            for phrase in v:
                if phrase in r:
                    n += k
                    m += 2
        for i, w in enumerate(S.words):
            for type, dict, weight in (
              (  "MD", epistemic_MD, 4),
              (  "VB", epistemic_VB, 2),
              (  "RB", epistemic_RB, 2),
              (  "JJ", epistemic_JJ, 1),
              (  "NN", epistemic_NN, 1),
              (  "CC", epistemic_CC_DT_IN, 1),
              (  "DT", epistemic_CC_DT_IN, 1),
              (  "IN", epistemic_CC_DT_IN, 1),
              ( "PRP", epistemic_PRP, 1),
              ("PRP$", epistemic_PRP, 1),
              (  "WP", epistemic_PRP, 1)):
                # "likely" => weight 1, "very likely" => weight 2.
                if i > 0 and s(S[i - 1]) in MODIFIERS:
                    weight += 1
                # "likely" => score 0.25 (neutral, inclining towards positive).
                if w.type and w.type.startswith(type):
                    for k, v in dict.items():
                        # Prefer lemmata.
                        if (w.lemma or s(w)) in v:
                            # Reverse the score for negated terms.
                            if i > 0 and s(S[i - 1]) in ("not", "n't", "never", "without"):
                                k = -k * 0.5
                            n += weight * k
                            m += weight
                            break
            # Numbers, citations and explanations make the sentence more factual.
            if w.type in ("CD", "\"", "'", ":", "("):
                n += 0.75
                m += 1
    if m == 0:
        return 1.0 # No modal verbs/adverbs used, so the statement must be true.
    return max(-1.0, min(n / (m or 1), +1.0))
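# A minimal usage sketch for modality(), assuming pattern.en is installed:
# sentences without modal markers score close to +1.0, hedged sentences lower.
from pattern.en import parse, Sentence, modality

s1 = Sentence(parse("The sun rises in the east.", lemmata=True))
s2 = Sentence(parse("It might perhaps rain tomorrow.", lemmata=True))
print(modality(s1)) # Close to +1.0: no modal markers.
print(modality(s2)) # Well below +1.0: "might" and "perhaps" signal uncertainty.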
def extractMood(characterSentences):
    """Analyzes each sentence using the grammatical mood module from pattern."""
    characterMoods = defaultdict(list)
    for key, value in characterSentences.items():
        for x in value:
            characterMoods[key].append(
                mood(Sentence(parse(str(x), lemmata=True))))
    return characterMoods
def test_convergence(self):
    # Test with random sentences and random patterns to see if it crashes.
    w = ("big", "white", "rabbit", "black", "cats", "is", "was", "going",
         "to", "sleep", "sleepy", "very", "or")
    x = ("DT?", "JJ?+", "NN*", "VP?", "cat", "[*]")
    for i in range(100):
        s = " ".join(random.choice(w) for _ in range(20))
        s = Sentence(parse(s, lemmata=True))
        p = " ".join(random.choice(x) for _ in range(5))
        p = search.Pattern.fromstring(p)
        p.search(s)
def start(self):
    cloudSize = dameCloudSize(self.id_request)
    cloudSize = cloudSize[0][0]
    searchKey = dameSerchKey(self.id_request)
    searchKey = searchKey[0][0]
    step = 0
    # Later, make get_stop configurable; this sets the number of levels to expand.
    while step <= 5:
        # Get the IDs of the clouds that belong to the project.
        for id_cloud in dameIdCloud(self.id_request):
            print("Id Cloud: " + str(id_cloud[0]))
            cloud = self.generar_cloud(dameNodo(id_cloud[0]))
            true_nodes = self.trueNodesSelection(cloud)
            for n in true_nodes:
                try:
                    cloud.graph.node[n]['select'] = False
                    crawler = SimpleCrawler1(n, delay=0.1)
                    crawler.newStructure(cloud.graph)
                    time = 0
                except:
                    continue
                while len(crawler.visited) < cloudSize:
                    print("Cloudsize = " + str(cloudSize) +
                          " Crawler Visited = " + str(len(crawler.visited)) +
                          " Level = " + str(step))
                    print('Exploring ...')
                    crawler.crawl(method=None)
                    time += 1
                    if time > cloudSize * 10:
                        break
                actualizarSelect(cloud.graph.node[n]['ID'], cloud.graph.node[n]['select'])
            print()
            print('##### Generating documents #####')
            # Build the minePackage.
            clouds = list()
            clouds.append(cloud)
            minePackage = dict()
            minePackage['clouds'] = clouds
            minePackage['searchKey'] = searchKey
            minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
            self.IRController.start(minePackage) # Retrieves information.
            # TODO: the scrapper controller is still missing.
            # Set to None so they do not take up unnecessary space; everything has already been saved to the DB.
            minePackage = None
            cloud = None
            gc.collect()
        step += 1
        print("Exploring level no. " + str(step)) # Controls the levels to expand, in this case 10.
    print("Process finished")
def findVerb(sent):
    result = parse(sent, tokenize=True, tags=True)
    sen = Sentence(result)
    vlist = [word.string for word in sen if word.type.startswith("V")]
    print(vlist)
    return vlist
def nouns_and_adjectives(self, results):
    nouns = []
    adjectives = []
    results_tree = parse(results, chunks=False)
    sentence = Sentence(results_tree)
    for word in sentence:
        if word.type == 'NN':
            nouns.append(word.string)
        elif word.type == 'JJ':
            adjectives.append(word.string)
    return nouns, adjectives
def run(self, minePackage):
    clouds = minePackage['clouds']
    urlContent = UrlToPlainText()
    for cloud in clouds:
        # Iterate over the list of links in the cloud.
        for n in cloud.graph.nodes():
            print(cloud.graph.node[n]['link'])
            pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
            cloud.graph.node[n]['methodData'] = MethodData(
                count(words(Sentence(parse(pageContent))), stemmer=PORTER))
def Sentlist_tokenizedS(text_tokenizedS):
    from pattern.en import Sentence, parse
    print("Processing: tokenizing text by sentence")
    Sent_list = []
    for e in text_tokenizedS:
        s = parse(e, lemmata=False, chunks=True)
        s = Sentence(s)
        Sent_list.append(s)
    print("Completed: text tokenized by sentence")
    return Sent_list
def __init__(self, data, url="", contenidoBd=""):
    if url != "":
        urlContent = UrlToPlainText()
        self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
        self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
    elif contenidoBd != "":
        self.contenidoConEtiquetas = contenidoBd
        self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
    else:
        self.contenido = ""
    self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
def test_group(self):
    # Assert Match groups.
    s = Sentence(parse("the big black cat eats a tasty fish"))
    m = search.search("DT {JJ+} NN", s)
    self.assertEqual(m[0].group(1).string, "big black")
    self.assertEqual(m[1].group(1).string, "tasty")
    # Assert nested groups (and syntax with additional spaces).
    m = search.search("DT { JJ { JJ { NN }}}", s)
    self.assertEqual(m[0].group(1).string, "big black cat")
    self.assertEqual(m[0].group(2).string, "black cat")
    self.assertEqual(m[0].group(3).string, "cat")
    # Assert chunked groups.
    m = search.search("NP {VP NP}", s)
    v = m[0].group(1, chunked=True)
    self.assertEqual(v[0].string, "eats")
    self.assertEqual(v[1].string, "a tasty fish")
    print("pattern.search.Match.group()")
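# A minimal usage sketch: the {} group syntax asserted above is handy for
# extracting just the captured constituents from each match.
from pattern.search import search
from pattern.en import Sentence, parse

s = Sentence(parse("the big black cat eats a tasty fish"))
for m in search("DT {JJ+} NN", s):
    print(m.group(1).string) # "big black", then "tasty".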
def getData(self, params):
    if self.now_cache is not None:
        # Invalidate the cache after 5 minutes.
        if (self.now_cache + datetime.timedelta(minutes=5)) < datetime.datetime.now():
            self.data_cache = None
            self.today_cache = None
            self.now_cache = None
    if self.data_cache is None:
        tweets = []
        for cand in candidates:
            tweets.append({
                'tweets': api.user_timeline(cand['user'], count=20),
                'name': cand['name'],
                'party': cand['party']
            })
        all_tweets = []
        for tweet_data in tweets:
            name = tweet_data['name']
            party = tweet_data['party']
            for tweet in tweet_data['tweets']:
                all_tweets.append({
                    'Name': name,
                    'Tweet': tweet.text,
                    'Favorites': tweet.favorite_count,
                    'Retweets': tweet.retweet_count
                })
        dfs = pd.DataFrame(all_tweets)
        sentiments = [sentiment(tweet) for tweet in dfs['Tweet']]
        dfs['Polarity'] = [sent[0] for sent in sentiments]
        dfs['Subjectivity'] = [sent[1] for sent in sentiments]
        modal = [modality(Sentence(parse(tweet, lemmata=True)))
                 for tweet in dfs['Tweet']]
        dfs['Certainty'] = modal
        today = date.strftime(datetime.datetime.now(), format='%m/%d/%Y, %H:%M')
        now = datetime.datetime.now()
        self.data_cache = dfs
        self.today_cache = today
        self.now_cache = now
    return self.data_cache
def calculate_phrase_sentiment(self, phrases):
    # print("Rating phrases sentiment...")
    valence_list = []
    arousal_list = []
    for p in phrases:
        pol = sentiment(p)[0]
        sent = parse(p, lemmata=True)
        mod = modality(Sentence(sent))
        print(mod)
        valence_list.append(10 * pol)
        arousal_list.append(5 * mod)
    valence = max(valence_list)
    arousal = max(arousal_list)
    print("Valence: " + str(valence))
    print("Arousal: " + str(arousal))
    return (valence, arousal)
def team_sentiment_analysis(stats):
    for s in stats.sentences:
        this_sentiment = sentiment(s)
        polarity = float("{0:.2f}".format(this_sentiment[0]))
        subjectivity = float("{0:.2f}".format(this_sentiment[1]))
        polarity_10 = float("{0:.1f}".format(this_sentiment[0]))
        subjectivity_10 = float("{0:.1f}".format(this_sentiment[1]))
        stats.polarity_counts[polarity] += 1
        stats.subjectivity_counts[subjectivity] += 1
        stats.polarity_counts_10s[polarity_10] += 1
        stats.subjectivity_counts_10s[subjectivity_10] += 1
        s = Sentence(parse(s, lemmata=True))
        stats.mood_counts[mood(s)] += 1
        rounded_modality = float("{0:.2f}".format(modality(s)))
        rounded_modality_10 = float("{0:.1f}".format(modality(s)))
        stats.modality_counts[rounded_modality] += 1
        stats.modality_counts_10s[rounded_modality_10] += 1
def mood(sentence, **kwargs):
    """ Returns IMPERATIVE (command), CONDITIONAL (possibility),
        SUBJUNCTIVE (wish) or INDICATIVE (fact).
    """
    if isinstance(sentence, basestring):
        try:
            # A Sentence is expected but a string is given.
            # Attempt to parse the string on-the-fly.
            from pattern.en import parse, Sentence
            sentence = Sentence(parse(sentence))
        except ImportError:
            pass
    if imperative(sentence, **kwargs):
        return IMPERATIVE
    if conditional(sentence, **kwargs):
        return CONDITIONAL
    if subjunctive(sentence, **kwargs):
        return SUBJUNCTIVE
    return INDICATIVE
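# A minimal usage sketch for mood(), assuming pattern.en is installed:
from pattern.en import parse, Sentence, mood

s = Sentence(parse("Do your homework!", lemmata=True))
print(mood(s)) # Expected: "imperative".
s = Sentence(parse("We could travel by train.", lemmata=True))
print(mood(s)) # Expected: "conditional".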
def test_search(self):
    # Assert one match containing all words.
    v = search.Pattern.fromstring("*+")
    v = v.search("one two three")
    self.assertEqual(v[0].string, "one two three")
    # Assert one match for each word.
    v = search.Pattern.fromstring("*")
    v = v.search("one two three")
    self.assertEqual(v[0].string, "one")
    self.assertEqual(v[1].string, "two")
    self.assertEqual(v[2].string, "three")
    # Assert all variations are matched
    # (the sentence starts with a NN*, which must be caught).
    v = search.Pattern.fromstring("(DT) JJ?+ NN*")
    v = v.search(Sentence(parse("dogs, black cats and a big white rabbit")))
    self.assertEqual(v[0].string, "dogs")
    self.assertEqual(v[1].string, "black cats")
    self.assertEqual(v[2].string, "a big white rabbit")
    print("pattern.search.Pattern.search()")
def extract(statement):
    s = Sentence(parse(statement, lemmata=True))
    '''
    c1 = Constraint.fromstring("There be DT")
    c2 = Constraint.fromstring("NN+")
    c3 = Constraint.fromstring("(DT)")
    c4 = Constraint.fromstring("(RB) (JJ) NNP+")
    c5 = Constraint.fromstring("(call) (DT)")
    c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+")
    p = Pattern(sequence=[c1, c2, c3, c4, c5, c6])
    match = p.search(s)
    '''
    s = find_entities(s)
    # Not sure about this "be" thing - happy to match plural (is/are),
    # but not sure about past tense ...
    match = search(MATCH_STRING, s)
    #raise Exception(match)
    return s, match
# It does not use modal verbs such as "could" and "would":
# "You could eat your dinner!" is not a command but a bubbly suggestion.

# We can create a pattern that scans for infinitive verbs (VB),
# and use "!" to exclude certain words:
# "!could|!would|!should|!to+ VB" = infinitive not preceded by a modal or "to".
# This works fine except in one case: if the sentence starts with a verb.
# So we need a second rule "^VB" to catch this.
# Note that the example below contains a third rule: "^do|VB*".
# This catches all sentences that start with a "do" verb, regardless of whether
# it is infinitive, because the parser sometimes tags infinitive "do" incorrectly.

def imperative(sentence):
    for p in ("!could|!would|!should|!to+ VB", "^VB", "^do|VB*"):
        m = match(p, sentence)
        if m and sentence.string.endswith((".", "!")): # Exclude questions.
            return True
    return False

for s in ("Just stop it!",
          "Look out!",
          "Do your homework!",
          "You should do your homework.",
          "Could you stop it.",
          "To be, or not to be."):
    s = parse(s)
    s = Sentence(s)
    print(s)
    print(imperative(s))
    print("")
print()

# Sentence chunks can be matched by tag (e.g., NP, VP, ADJP).
# The pattern below matches anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP", s)
m = p.search(s)
print(m)
print()

if m:
    for w in m[0].words:
        print(w, " \t=>", m[0].constraint(w))

print()
print("-------------------------------------------------------------")
# Finally, constraints can also include regular expressions.
# To include them we need to use the full syntax instead of the search() function:
import re
r = re.compile(r"[0-9|\.]+") # all numbers
p = Pattern()
p.sequence.append(Constraint(words=[r]))
p.sequence.append(Constraint(tags=["NN*"]))

s = Sentence(parse("I have 9.5 fingers."))
print(s)
print(p.search(s))
print()
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according to the k documents
# that are most similar (by cosine similarity) to the given input document.

m = Model()
t = Twitter()

# First, we mine a model of roughly 1,000 tweets.
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()               # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as a string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data; see the documentation for Classifier.test().
classifier = KNN(baseline=None) # By default, baseline=MAJORITY
for document in m:              # (classify unknown documents with the most frequent type).
    classifier.train(document)
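# A minimal follow-up sketch: classify the adjectives of unseen tweets.
# The input strings and expected labels below are illustrative, not from the original example.
print(classifier.classify(Document('sweet awesome'))) # Expected: 'WIN'.
print(classifier.classify(Document('stupid broken'))) # Expected: 'FAIL'.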
for word, pos in tag(strSentence):
    if pos in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
        word = str(lemma(word))
        if word not in ("be", "do", "let", "begin", "have", "try", "start"):
            verbList.append(word)
            verbSentList.append(sid.polarity_scores(word))
            #con.execute("INSERT OR IGNORE INTO verbList VALUES(?, ?)", (lemma(word), 0,))
            #con.execute("UPDATE verbList SET count = count + 1 WHERE verb=?", (lemma(word),))

a = parse(strSentence, relations=True, lemmata=True)
#pprint(a)
sentence = Sentence(a)
for i in range(len(sentence.verbs)):
    strVP = str(' '.join(sentence.verbs[i].lemmata))
    if strVP not in ("be", "do", "let", "begin", "have", "try", "start"):
        vpList.append(strVP)
        vpSentList.append(sid.polarity_scores(strVP))

#print(sentence.relations)
#print(sentence.subjects)
#print(sentence.objects)
#print(sentence.verbs)
#print(sentence.chunk)

# sqlite3 insert: subject / objects / verbs / CPC / Sentiment
# genre, wordCount, filename, sentence
def load_text(filename):
    lines = [line.strip().split('\t') for line in open(filename)][1:]
    return [Sentence(format_sentence(sentence))
            for sentence in group_sentences(lines)]
from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# This example demonstrates an interesting search pattern that mines for comparisons.
# Notice the use of the constraint "be".
# If the output from the parser includes word lemmas (e.g., "doing" => "do"),
# these will also be matched. Using "be" then matches "is", "being", "are", ...
# and if underspecification is used: "could be", "will be", "definitely was", ...
p = Pattern.fromstring("NP be (more) ADJP|ADVP than NP")

for s in ("the turtle was faster than the hare",
          "Arnold Schwarzenegger is more dangerous than Dolph Lundgren"):
    s = Sentence(parse(s, lemmata=True)) # Parse with lemmas.
    m = p.search(s)
    print(s)
    print()
    print(m)
    print()
    if m:
        print(m[0].constituents())                     # Words grouped by chunk whenever possible.
        print(m[0].constraints(chunk=s.chunks[0]))     # The constraints that match the given chunk.
        print(m[0].constituents(constraint=p[0]))      # Constituents for the given constraint.
        print(m[0].constituents(constraint=[0, 3, 5])) # Constituents for the given constraint indices.
        print()
    print()