def process(sentence):
    t = parsetree(sentence, lemmata=True)
    greeting = t.string.strip()
    # Plain greetings.
    greeting_list_q = ["hola", "buenas"]
    greeting_list_a = ["hola", "buenas"]
    if greeting.lower() in greeting_list_q:
        r = dict()
        r['type'] = 'direct_answer'
        r['message'] = greeting_list_a[randint(0, len(greeting_list_a) - 1)]
        return r
    # "How are you?" variants.
    greeting_list_q = ["que tal", "como estas", u"cómo estás", "como va",
                       "como te encuentras", "va todo bien"]
    greeting_list_a = ["Estoy bien, gracias por preguntar"]
    if greeting.lower() in greeting_list_q:
        r = dict()
        r['type'] = 'direct_answer'
        r['message'] = greeting_list_a[randint(0, len(greeting_list_a) - 1)]
        return r
    # Time-of-day greetings.
    greeting_list_q = ["buenos dias", "buenas tardes", "buenas noches"]
    greeting_list_a = ["hola", "buenas"]
    if greeting.lower() in greeting_list_q:
        r = dict()
        r['type'] = 'direct_answer'
        r['message'] = greeting_list_a[randint(0, len(greeting_list_a) - 1)]
        return r
    return None
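# Hedged usage sketch for process() above. Assumes the imports the snippet
# relies on ("from pattern.es import parsetree", "from random import randint")
# are available; the reply is picked at random from the matching answer list.
print(process("hola"))        # e.g. {'type': 'direct_answer', 'message': 'buenas'}
print(process("como estas"))  # {'type': 'direct_answer', 'message': 'Estoy bien, gracias por preguntar'}
print(process("adios"))       # None: not a recognised greeting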
# pattern_utils is a project-local helper module; conjugate and INFINITIVE come from pattern.es.
def process(sentence):
    t = parsetree(sentence, lemmata=True)
    # "que <verb phrase> <noun phrase>" questions.
    m = pattern_utils.pattern_match("que {VP} {NP}", t)
    if m:
        n, g, noun = pattern_utils.parse_NP(m.group(2))
        r = dict()
        r['type'] = 'query'
        r['question'] = 'que'
        r["relation"] = conjugate(m.group(1).string, INFINITIVE)
        r['gender'] = g
        r['object'] = noun
        return r
    # "como <verb phrase> <noun phrase>" questions.
    m = pattern_utils.pattern_match("como {VP} {NP}", t)
    if m:
        n, g, noun = pattern_utils.parse_NP(m.group(2))
        r = dict()
        r['type'] = 'query'
        r['question'] = 'como'
        r["relation"] = conjugate(m.group(1).string, INFINITIVE)
        r['gender'] = g
        r['object'] = noun
        return r
    return None
def nnps_and_keywords(text):
    s = parsetree(text, relations=True, lemmata=True)
    nnp_kw = {}
    for e in s:
        d = Document(e)
        # Collect proper-noun keywords, joining multi-word proper nouns with "-".
        kw = d.keywords()
        nnp = set()
        for w in kw:
            if w[1].type == 'NNP':
                wdstr = []
                for wd in w[1].phrase.words:
                    if wd.type == 'NNP':
                        wdstr.append(wd.string)
                nnp.add("-".join(wdstr))
        # Collect the top non-proper-noun keywords, preferring lemmas.
        kw = d.keywords(top=5)
        words = set()
        for w in kw:
            if w[1].type != 'NNP':
                if w[1].lemma:
                    words.add(w[1].lemma)
                else:
                    words.add(w[1].string)
        if len(nnp) > 1 and len(words) > 1:
            if tuple(nnp) in nnp_kw:
                nnp_kw[tuple(nnp)].update(words)
            else:
                nnp_kw[tuple(nnp)] = words
    return nnp_kw
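# Hedged usage sketch for nnps_and_keywords() above. Assumes
# "from pattern.es import parsetree" and "from pattern.vector import Document";
# the file name is hypothetical, and whether any entry is produced depends on
# the keywords extracted from the given text.
texto = open("articulo.txt").read()
for nnps, keywords in nnps_and_keywords(texto).items():
    print(nnps, "->", keywords)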
def __call__(self, text, default=None):
    result = []
    for sentence in parsetree(text):
        result.extend(self.entity(sentence))
    result = [item for item in result if item]
    if not result:
        return None
    return result
# Requires codecs, langid, unidecode and pattern.es.parsetree.
def lemma_esp(text):
    text = codecs.decode(text, "utf-8")
    text = unidecode(text)
    try:
        # Only lemmatize text that langid identifies as Spanish.
        if langid.classify(text.encode('utf-8').decode('utf-8'))[0] != "es":
            return text
        else:
            s = parsetree(text, lemmata=True)
            lista_palabras = s.sentences[0].words
            lista_lemas = map(lambda x: x.lemma, lista_palabras)
            texto = ' '.join(lista_lemas)
            return texto
    except:
        # If language identification fails, lemmatize anyway.
        s = parsetree(text, lemmata=True)
        lista_palabras = s.sentences[0].words
        lista_lemas = map(lambda x: x.lemma, lista_palabras)
        texto = ' '.join(lista_lemas)
        return texto
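# Hedged usage sketch for lemma_esp() above: the input is a UTF-8 byte string
# (as the codecs.decode call implies) and the exact lemmas depend on pattern's
# Spanish tagger.
frase = "Los gatos estaban durmiendo".encode("utf-8")
print(lemma_esp(frase))  # roughly "el gato estar dormir"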
def parse(self, text):
    p_tree = parsetree(text, relations=True, lemmata=True)
    sentences = []
    for p_sentence in p_tree:
        sentence = []
        for p_word in p_sentence.words:
            sentence.append(self.classifier.classify(p_word))
        sentences.append({"words": sentence, "string": p_sentence.string})
    return sentences
# Pattern comes from pattern.search; Text is the parse-tree class returned by parsetree.
def pattern_match(pattern, sentence):
    if type(sentence) is not Text:
        sentence = parsetree(sentence, lemmata=True)
    p = Pattern.fromstring(pattern)
    try:
        m = p.match(sentence)
        return m
    except:
        return None
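# Hedged usage sketch for pattern_match() above, following the way the groups
# are used in the query-processing snippet further up: group(1) holds the words
# matched by the verb phrase and group(2) those of the noun phrase. The example
# sentence is arbitrary and whether it matches depends on pattern's tagger.
m = pattern_match("que {VP} {NP}", "que come el perro")
if m:
    print(m.group(1), m.group(2))  # matched verb-phrase and noun-phrase words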
def parse(self, text):
    """
    Parses the text and extracts the sources, reporters and entities.
    """
    self.__sources, self.__reporters, self.__entities = [], [], []
    text = self._clean(text)
    # POS-tagging with relations and lemmas.
    self.__tree = parsetree(text, relations=True, lemmata=True)
    # Extract the information.
    self._extract_sources()
    self._extract_reporters()
def verbosInfinitivos(cadena):
    t = parsetree(cadena)
    verbos = search('VB*', t)
    # lis = verbos.match.string
    # print 'list: ', lis
    # Could this be turned into a list in one step?
    lista = []
    for match in verbos:
        lista.append((match.string, conjugate(match.string, INFINITIVE)))
    # print 'lista for: ', lista
    # print lista[3][1]
    return lista
def verbosInfinitivos(cadena):
    t = parsetree(cadena)
    verbos = search('VB*', t)
    print('verbos =', verbos)
    # lis = verbos.match.string
    # print('list: ', lis)
    # Could this be built in one step with a list comprehension?
    lista = []
    for match in verbos:
        lista.append((match.string, conjugate(match.string, INFINITIVE)))
    # print('lista for: ', lista)
    # print(lista[3][1])
    return lista
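# Hedged usage sketch for verbosInfinitivos() above. Requires parsetree,
# conjugate and INFINITIVE from pattern.es and search from pattern.search;
# the exact pairs depend on pattern's tagger and conjugation tables.
pares = verbosInfinitivos("Los niños corren y saltan en el parque")
print(pares)  # e.g. [('corren', 'correr'), ('saltan', 'saltar')]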
def compute_topics(set_reduce_topics, today):
    # Groups scraped news into topics, based on document similarity and shared words.
    cleanup_topic(today.day, today.month, today.year)
    ScrapedTopicGroups.sync()
    sites = SiteNewsScrapedData.objects.all()
    # Parse every sentence of every scraped site.
    documents = []
    for site in sites:
        for sentence in site.content.split('.'):
            if sentence:
                tree = parsetree(sentence, lemmata=True)
                if len(tree) > 0:
                    documents.append(tree[0])
    # Keep only noun lemmas that are not stop words.
    documents = [[w.lemma for w in document
                  if w.tag.startswith((u'NN', u'NNS', u'NNP', u'NNPS'))
                  and w.lemma not in settings.STOP_WORDS]
                 for document in documents]
    documents = [Document(" ".join(document) + '.')
                 for document in documents if len(document) > 1]
    model = Model_Comp(documents=documents)
    # nearest_neighbors() yields (similarity, Document) pairs.
    documents_analyzed = []
    for document in documents:
        tokens = []
        similar_items_news = model.nearest_neighbors(document)
        for similarity, sim_document in similar_items_news:
            if similarity > 0.95 and sim_document.id not in documents_analyzed:
                tokens.extend([word for word, _ in sim_document.words.iteritems()])
                documents_analyzed.append(sim_document.id)
        # Also add the document itself if it was not already grouped with a similar one.
        if document.id not in documents_analyzed:
            tokens.extend([word for word, _ in document.words.iteritems()])
            documents_analyzed.append(document.id)
        # Keep the most relevant words (by count).
        counter = defaultdict(int)
        for token in tokens:
            counter[token] += 1
        # Sort the counter in descending order and keep the top 3 tokens.
        tokens_org = sorted(counter.items(), key=lambda element: element[1], reverse=True)
        tokens = [token for token, count in tokens_org[:3]]
        if tokens and len(tokens) > 0:
            links = SiteNewsScrapedData.find_coincidences(tokens)
            # Only create a group if it is backed by more than 3 links.
            if len(links) > 3:
                ScrapedTopicGroups.create(tags=tokens, links=links, relevance=len(links),
                                          day=today.day, month=today.month, year=today.year)
    if set_reduce_topics:
        reduce_topics(today.day, today.month, today.year)
    return True
def interactive_loader(file_name):
    GROUP_SIZE = 3
    text = file(file_name).read()
    words = parsetree(text, tags=False, chunks=False).words
    # Ask the user to label the words in groups of GROUP_SIZE.
    for word_group in zip(*[iter(words)] * GROUP_SIZE):
        options = ""
        d_words = []
        for word in word_group:
            d_word = DWords.find_word(word.string.lower()) or DWord(word.string.lower())
            # Skip words that already have a polarity or act as modifiers.
            if not (d_word.has_polarity() or d_word.is_modifier()):
                d_words.append(d_word)
        if len(d_words) == 0:
            continue
        # Keep prompting until one option per word has been entered.
        while len(options) != len(d_words):
            print "\t".join(map((lambda w: w.word), d_words))
            options = list(raw_input("0:neutral\t1:positive\t2: negative\t3:inversor\t5:minimizer\t6:maximizer\td:descartar\n")[:len(d_words)])
        for d_word in d_words:
            option = options.pop(0)
            if option == "0":
                d_word.polarity = 0
            elif option == "1":
                d_word.polarity = 1
            elif option == "2":
                d_word.polarity = -1
            elif option == "3":
                d_word.modifier = -1
            elif option == "5":
                d_word.modifier = 0.5
            elif option == "6":
                d_word.modifier = 2
            # "d" discards the word; anything else is stored.
            if option != "d":
                DWords.insert_word(d_word)
                save_in_file(d_word)
        print "\n\n\n"
def getLemmas(self, words):
    lemmas = []
    for word in words:
        lemmas.append(parsetree(word, lemmata=True)[0].lemma[0])
    return lemmas
from pattern.es import parsetree

s = 'The mobile web is more important than mobile apps.'
s = parsetree(s)
print(s)
for sentence in s:
    print(sentence)
    for chunk in sentence.chunks:
        print(chunk)
        for word in chunk.words:
            print(word)
def verbosInfinitivos(cadena):
    lis = limpiar_str(cadena).split(' ')
    t = parsetree(cadena)
    verbos = search('VB', t)
    print('Verbos :', verbos)
# coding: utf-8
import argparse
from pattern.es import parsetree
from pattern.vector import Document
import json
from operator import itemgetter
from itertools import groupby
from pprint import pprint

parser = argparse.ArgumentParser(description='Find character names in text blobs. Create graph.')
parser.add_argument('--text', type=argparse.FileType('r'), required=True, help='find names here')
args = parser.parse_args()

s = parsetree(args.text.read(), relations=True, lemmata=True)
for i in range(len(s)):
    sentence = s[i]
    print "[%s]" % i, s[i].string.encode('utf8')
                    type=argparse.FileType('w'), required=True,
                    help='pickle to output graph')
args = parser.parse_args()

# Strip accents from the last-name list and upper-case everything.
last_names = []
for f in args.names:
    for name in f.readlines():
        last_names.append(
            name.replace("Á", 'A').replace("á", 'a').replace("É", 'E').replace(
                "é", 'e').replace("Í", 'I').replace("í", 'i').replace(
                "Ó", 'O').replace("ó", 'o').replace("Ú", 'U').replace(
                "ú", 'u').upper().strip())

s = parsetree(args.text.read(), relations=True, lemmata=True)

def names_from_dict(nis):
    # Group consecutive word indexes into multi-word names.
    names_in_sentence = nis.copy()
    indexes = names_in_sentence.keys()
    indexes.sort()
    names = []
    for k, g in groupby(enumerate(indexes), lambda (i, x): i - x):
        name = []
        for i in map(itemgetter(1), g):
            name.append(names_in_sentence[i])
        names.append(" ".join([n.capitalize() for n in name]))
# stemmer is defined elsewhere in the original module.
def stem_lemma(word):
    word = parsetree(word, lemmata=True)[0].lemmata[0]
    word = stemmer.stem(word)
    return word
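# Hedged usage sketch for stem_lemma() above, assuming the missing "stemmer"
# is an NLTK Spanish SnowballStemmer (an assumption; the original module may
# use a different stemmer).
from nltk.stem.snowball import SnowballStemmer
from pattern.es import parsetree

stemmer = SnowballStemmer("spanish")
print(stem_lemma("corriendo"))  # roughly: lemma "correr" -> stem "corr"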
# coding: utf-8
from pattern.es import parsetree

theogony = open('data/narco/SresNarco/narco.txt').read()
s = parsetree(theogony, relations=True, lemmata=True)
for e in s:
    try:
        for v in e.verbs:
            if v.subject and v.object:
                subjects = []
                for w in v.subject:
                    if w.type == 'NNP':
                        subjects.append(w.string)
                objects = []
                for w in v.object:
                    if w.type == 'NNP':
                        objects.append(w.string)
                if objects and subjects:
                    print subjects, v.lemmata, objects
    except:
        pass
def Word_list_to_Text(Word_list):
    string = ''
    for w in Word_list:
        string += w.string + ' '
    return parsetree(string)
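# Hedged usage sketch for Word_list_to_Text() above: rebuild a Text object
# from the Word objects of an already-parsed sentence (re-tokenisation may
# differ slightly from the original text).
from pattern.es import parsetree

original = parsetree("El gato duerme en el sofá")
reparsed = Word_list_to_Text(original[0].words)
print(reparsed[0].string)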