def handle_keywords(text, split=True):
    """
    Automatic handling of keywords from a text.
    It runs the sanitize function for keywords on every keyword.

    If `split` is True, it splits the text by whitespace.

    Returns an :class:`quepy.expression.Expression` that represents the
    fact of having the keywords extracted from the text.
    """
    assert_valid_encoding(text)
    from quepy.semantics import HasKeyword

    if split:
        keywords = [HasKeyword.sanitize(x) for x in text.split()]
    else:
        keywords = (HasKeyword.sanitize(text),)

    if not keywords:
        raise ValueError(u"Couldn't extract any keyword from '%s'" % text)

    expr = None
    for keyword in keywords:
        if expr is not None:
            expr += HasKeyword(keyword)
        else:
            expr = HasKeyword(keyword)
    return expr
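# A minimal usage sketch for handle_keywords, assuming a configured quepy
# application (HasKeyword.sanitize is application-defined, and
# assert_valid_encoding comes from quepy.encodingpolicy):
def _example_handle_keywords():
    # One HasKeyword per whitespace-separated token, combined with `+`.
    expr = handle_keywords(u"acoustic guitar")
    # A single HasKeyword for the whole phrase.
    phrase = handle_keywords(u"acoustic guitar", split=False)
    return expr, phrase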
def penn_to_morphy_tag(tag):
    # Maps a Penn Treebank POS tag to the corresponding wordnet (morphy)
    # constant by prefix, or None if there is no match.
    assert_valid_encoding(tag)

    for penn, morphy in _penn_to_morphy_tag.iteritems():  # Python 2 dict API
        if tag.startswith(penn):
            return morphy
    return None
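# A sketch of how penn_to_morphy_tag resolves tags, assuming the module-level
# _penn_to_morphy_tag cache has been filled the way run_nltktagger below
# fills it (NN/JJ/VB/RB prefixes mapped to wordnet constants):
def _example_penn_to_morphy_tag():
    assert penn_to_morphy_tag(u"NNS") == u"n"  # u"NN" prefix -> wordnet.NOUN
    assert penn_to_morphy_tag(u"VBD") == u"v"  # u"VB" prefix -> wordnet.VERB
    assert penn_to_morphy_tag(u"DT") is None   # no matching prefix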
def wrapper(string):
    assert_valid_encoding(string)
    words = tagger_function(string)
    for word in words:
        if word.pos not in PENN_TAGSET:
            logger.warning("Tagger emitted a non-penn "
                           "POS tag {!r}".format(word.pos))
    return words
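# `wrapper` closes over `tagger_function`, which suggests it is the inner
# function of a validating decorator. A minimal sketch of that pattern (the
# outer name `validated_tagger` is hypothetical):
def validated_tagger(tagger_function):
    def wrapper(string):
        assert_valid_encoding(string)
        words = tagger_function(string)
        for word in words:
            if word.pos not in PENN_TAGSET:
                logger.warning("Tagger emitted a non-penn "
                               "POS tag {!r}".format(word.pos))
        return words
    return wrapper

# run_tagger = validated_tagger(run_nltktagger)
# words = run_tagger(u"what is a blowtorch")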
def adapt(x):
    # Python 2 variant; nodes become SPARQL-style ?xN variables.
    if isnode(x):
        x = u"?x{}".format(x)
        return x

    if isinstance(x, basestring):
        assert_valid_encoding(x)
        if x.startswith(u"\"") or ":" in x:
            return x
        return u'"{}"'.format(x)
    return unicode(x)
def adapt(x):
    # Python 3 port of the adapt function above.
    if isnode(x):
        x = "?x{}".format(x)
        return x

    if isinstance(x, str):
        assert_valid_encoding(x)
        if x.startswith("\"") or ":" in x:
            return x
        return '"{}"'.format(x)
    return str(x)
def adapt(x):
    # Variant that escapes values and uses xN (no "?") node names,
    # as a dot/graphviz-style generator would.
    if isnode(x):
        x = u"x{}".format(x)
        return x

    if isinstance(x, basestring):
        assert_valid_encoding(x)
        x = escape(x)
        if x.startswith(u'"'):
            return x
        return u'"{}"'.format(x)
    return unicode(x)
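# Expected behaviour of the adapt variants above, as a sketch; `isnode` is
# assumed to be true for the integer handles quepy expressions use for nodes:
def _example_adapt():
    adapt(0)           # -> u"?x0" in the first two variants, u"x0" in the
                       #    escaping variant just above
    adapt(u"foo:Bar")  # first two variants return it unchanged because it
                       #    contains ":" (a qualified name)
    adapt(u"guitar")   # -> u'"guitar"' (plain strings get quoted)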
def _read_line(text):
    """
    Parses a line of the freeling command line output.
    """
    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)
    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()

    token_has_spaces = False
    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")
        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")
        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word
    return best_word
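# A hypothetical input in the shape _read_line accepts (the real freeling
# output format is not reproduced here; after the "#", candidate analyses
# come in groups of four fields: lemma, POS tag, probability, sense):
def _example_read_line():
    word = _read_line(u"cats cat NNS # cat NNS 0.97 02121620-n")
    # word.token == u"cats", word.lemma == u"cat",
    # word.pos == u"NNS", word.prob == 0.97
    return word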
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs the nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)
    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet
    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer doesn't handle non-ascii characters very well
    # tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminates stuff like JJ|CC. On Python 2, decode from ascii because
        # these are the Penn-like POS tags (which are ascii).
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case the lemma is example-based; if it were rule-based
            # the result would be unicode (the input was unicode).
            # Since english is ascii the decoding is ok.
            if sys.version_info[0] == 2:
                lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)
    return words
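# Usage sketch (requires nltk plus its POS tagger model and the wordnet
# corpus to be available locally; the question is made up):
def _example_run_nltktagger():
    words = run_nltktagger(u"what is a blowtorch")
    for w in words:
        print(w.token, w.pos, w.lemma)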
def run_freeling(string, freeling_cmd):
    """
    Runs freeling on `string` and returns a list of Word objects.
    """
    assert_valid_encoding(string)

    ctx = sysutils.ExecutionContext()
    base_path = os.path.join(os.path.dirname(__file__), "freeling_data")
    config_path = __get_config_path(base_path)
    cmdline = freeling_cmd + " -f {0} --train".format(config_path)

    stdin = ctx.tmpfile("freeling_input")
    stdin.write(string.encode("utf-8"))
    stdin.seek(0)

    stdout, _ = ctx.runcmd(cmdline, stdin=stdin)
    stdout.seek(0)
    return _parse_freeling_output(stdout)
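# Usage sketch (assumes a working local freeling installation; the analyzer
# path below is hypothetical):
def _example_run_freeling():
    return run_freeling(u"what is a blowtorch", "/usr/local/bin/analyzer")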
def run_spacytagger(string):
    """
    Runs spacy on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)

    # For now, at least, perform our own pre-processing
    # to ensure terms like "presynaptic" are easily found later.
    string = ' '.join(string.split())
    string = collapse(string)
    doc = nlp(string)  # NOTE: spaCy expects and returns unicode

    spans = [(ent_id, nlp.vocab.strings[ent_id], doc[start:end])
             for ent_id, start, end in matcher(doc)]
    for ent_id, label_id, span in spans:
        # tag_ is the "fine-grained" POS
        span.merge(label=label_id, tag='NNP' if label_id else span.root.tag_)

    words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

    # The following is only for logging purposes; if necessary, it could be
    # removed for production.
    log.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
    indent = " "
    longest = max(len(t.text) for t in doc)
    column = (len(doc) - 1) * len(indent) + longest + 2
    wout = '{:' + str(column) + '}| '

    def trav_tree(indents, node):
        log.info(wout.format((indent * indents) + node.text) +
                 ', '.join([str(x) for x in
                            [node.i, node.is_oov, node.lemma_, node.tag_,
                             "<-" + str(node.left_edge),
                             str(node.right_edge) + "->"]]))
        # NOTE: Could also change display based on node.lefts and node.rights
        for el in node.children:
            trav_tree(indents + 1, el)

    for sent in doc.sents:
        trav_tree(0, sent.root)
    log.info('Ents: ' + str(doc.ents))
    log.info('NPs: ' + str(list(doc.noun_chunks)))
    return words
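# The module-level `nlp`, `matcher`, `collapse` and `log` used above are
# assumed to be initialized elsewhere. A rough sketch of that setup (the
# model name and the pattern set are assumptions):
#
#     import spacy
#     from spacy.matcher import Matcher
#     nlp = spacy.load('en')
#     matcher = Matcher(nlp.vocab)  # populated with entity patterns
#
#     words = run_spacytagger(u"what is a blowtorch")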
def __setattr__(self, name, value):
    if name in self._encoding_attrs and value is not None:
        assert_valid_encoding(value)
    object.__setattr__(self, name, value)
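# __setattr__ above belongs to a class that whitelists which attributes must
# carry correctly encoded values. A minimal sketch of such a class, with
# hypothetical attribute names modeled on Word:
class _EncodingCheckedWord(object):
    _encoding_attrs = ("token", "lemma", "pos", "sense")

    def __init__(self, token):
        self.token = token  # routed through __setattr__, so it is validated

    def __setattr__(self, name, value):
        if name in self._encoding_attrs and value is not None:
            assert_valid_encoding(value)
        object.__setattr__(self, name, value)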