def filter_pos_infinitive(self, s, category_list=None, allowed=False):
    '''
    Filter the words of a string by Part-of-Speech (POS) tag and reduce the
    words that are kept to a base form.

    Every token returned by ``self.pos_tagging(s)`` is split on ':' and its
    first two fields are read as (word, tag).  Kept words are transformed
    according to their tag:
        'NNS'                      -> singularize(word)
        'VB', 'VBD', 'VBG', 'VBN',
        'VBP', 'VBZ'               -> conjugate(word, INFINITIVE)
        'JJ', 'JJR', 'JJS'         -> predicative(word)
        any other tag              -> word unchanged

    If allowed is True, only words whose tag is in category_list are kept.
    If allowed is False, every word is kept except those whose tag is in
    category_list.

    POS tags that can be in category_list:
        nouns        = ['NN', 'NNS', 'NNP', 'NNPS']
        verbs        = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        adjectives   = ['JJ', 'JJR', 'JJS']
        determiners  = ['DT']
        conjunctions = ['IN', 'CC']
        adverbs      = ['RB', 'RBR', 'RBS']
        modals       = ['MD']
        utterances   = ['UH']

    In:  (s: text or UTF-8 encoded bytes,
          category_list: list of tag strings (None or empty = no filter),
          allowed: boolean)
    Out: (string) kept words joined by single spaces; when category_list
         is empty the (decoded) input is returned untouched.
    '''
    # `bytes` is an alias of `str` on Python 2, so this matches the old
    # `isinstance(s, str)` check there while also working on Python 3, where
    # the `unicode` builtin no longer exists.  "replace" is used because
    # "xmlcharrefreplace" is an encode-only handler: decoding malformed input
    # with it raises TypeError instead of substituting anything.
    if isinstance(s, bytes):
        s = s.decode("utf-8", "replace")

    # Nothing to filter on: return before paying for the tagger.
    if not category_list:
        return s

    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adjective_tags = ('JJ', 'JJR', 'JJS')

    # Kept as an equality test (rather than `not allowed`) so that values
    # such as None still select the "only listed tags" mode, as before.
    exclude_listed = allowed == False

    words = []
    for pos in self.pos_tagging(s):
        parts = pos.split(':')
        word, tag = parts[0], parts[1]

        # Exclusion mode drops listed tags; inclusion mode drops unlisted ones.
        if (tag in category_list) == exclude_listed:
            continue

        if tag == 'NNS':
            word = singularize(word)
        elif tag in verb_tags:
            word = conjugate(word, INFINITIVE)
        elif tag in adjective_tags:
            word = predicative(word)
        words.append(word)

    return u' '.join(words)
def pos_tagging_infinitive(self, s):
    '''
    Part-of-Speech (POS) tagging that also reduces words to a base form.

    The text is run through parse(); each resulting token is read as
    (word, tag) from its first two fields and rewritten as follows:
        tag 'NNS'                      -> singularize(word) + ':NN'
        tag 'VB', 'VBD', 'VBG', 'VBN',
            'VBP', 'VBZ'               -> conjugate(word, INFINITIVE) + ':VB'
        tag 'JJ', 'JJR', 'JJS'         -> predicative(word) + ':JJ'
        any other tag                  -> word + ':' + tag (unchanged)

    e.g. a tagging of  ella:PRP maneja:VBD carros:NNS rojos:JJ
         is returned as ella:PRP manejar:VB carro:NN rojo:JJ

    In:  (s: text or UTF-8 encoded bytes)
    Out: (list) strings in the form 'word:category'
    '''
    # Decode BEFORE parsing.  The conversion used to run after parse(s), the
    # only place `s` is consumed, so it never had any effect.  `bytes` is an
    # alias of `str` on Python 2, so this also keeps working on Python 3
    # where `unicode` is gone.  "replace" is used because "xmlcharrefreplace"
    # is an encode-only handler and raises TypeError when decoding bad input.
    if isinstance(s, bytes):
        s = s.decode("utf-8", "replace")

    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adjective_tags = ('JJ', 'JJR', 'JJS')

    tagged = []
    # split() on the parse result yields groups of tokens (presumably one
    # group per sentence -- confirm against the parser in use); each token
    # is indexable with the word at [0] and its tag at [1].
    for sentence in parse(s).split():
        for token in sentence:
            word, tag = token[0], token[1]
            if tag == 'NNS':
                tagged.append(singularize(word) + ":NN")
            elif tag in verb_tags:
                tagged.append(conjugate(word, INFINITIVE) + ":VB")
            elif tag in adjective_tags:
                tagged.append(predicative(word) + ":JJ")
            else:
                tagged.append(word + ':' + tag)
    return tagged
def test_predicative(self):
    # Accuracy check for es.predicative(), e.g. "horribles" => "horrible".
    # Word forms are read from the wordforms-es-davies.csv corpus; only rows
    # tagged "j" are used and they are grouped by lemma.
    from pattern.db import Datasheet
    corpus_path = os.path.join(PATH, "corpora", "wordforms-es-davies.csv")
    forms_by_lemma = {}
    for form, lemma, tag, _unused in Datasheet.load(corpus_path):
        if tag != "j":
            continue
        forms_by_lemma.setdefault(lemma, []).append(form)
    # For every lemma, take its longest word form (first one on ties) and
    # count how often es.predicative() maps it back to the lemma.
    hits = 0
    total = 0
    for lemma, forms in forms_by_lemma.items():
        longest_form = max(forms, key=len)
        if es.predicative(longest_form) == lemma:
            hits += 1
        total += 1
    # More than 92% of the lemmas must be recovered.
    self.assertTrue(float(hits) / total > 0.92)
    print("pattern.es.predicative()")
def test_predicative(self):
    # Assert the accuracy of the predicative algorithm ("horribles" => "horrible").
    # Rows of the corpus tagged "j" are grouped by lemma; the longest word
    # form of each lemma is fed to es.predicative() and the result must match
    # the lemma for more than 92% of the lemmas.
    from pattern.db import Datasheet
    test = {}
    for w, lemma, tag, f in Datasheet.load(os.path.join(PATH, "corpora", "wordforms-es-davies.csv")):
        if tag == "j":
            test.setdefault(lemma, []).append(w)
    i, n = 0, 0
    for pred, attr in test.items():
        # Longest form of the lemma; max() returns the first one on ties,
        # the same element a stable descending sort would put first.
        attr = max(attr, key=len)
        if es.predicative(attr) == pred:
            i += 1
        n += 1
    self.assertTrue(float(i) / n > 0.92)
    # Function-call form: the bare `print "..."` statement is a SyntaxError
    # on Python 3, while this prints the same text on Python 2 and 3.
    print("pattern.es.predicative()")
def unify_tokens(self):
    """
    Rewrite self._tokens in place, position by position, according to the
    first character of the second field of the matching self._analysis
    entry (presumably the POS tag -- confirm against the analyzer):
        'n' -> singularize(token)
        'v' -> conjugate(token, INFINITIVE)
        'a' -> predicative(token)
    Tokens with any other initial are left untouched.

    :return: self._tokens (the same, mutated, list)
    :raises Exception: if self._analysis is None, i.e. no analysis was run
    """
    if self._analysis is None:
        raise Exception('It\'s necessary execute first analize')

    for index, token in enumerate(self._tokens):
        tag_initial = self._analysis[index][1][0]
        if tag_initial == 'n':
            self._tokens[index] = singularize(token)
        elif tag_initial == 'v':
            self._tokens[index] = conjugate(token, INFINITIVE)
        elif tag_initial == 'a':
            self._tokens[index] = predicative(token)

    return self._tokens
def adjetives():
    """Print two adjective conversions as a small demonstration.

    First prints the result of predicative('hermosos'), then the result of
    attributive('hermoso', gender=FEMALE + PLURAL).
    """
    base_form = predicative('hermosos')
    print(base_form)

    inflected = attributive('hermoso', gender=FEMALE + PLURAL)
    print(inflected)
def standard_value(value):
    """Return predicative() of `value` after lower-casing it and stripping
    leading/trailing whitespace."""
    normalized = value.lower().strip()
    return predicative(normalized)