Example #1
def handle_keywords(text, split=True):
    """
    Automatic handling of Keywords from a text.
    It runs the sanitize function for keywords on every
    keyword.

    If `split` it's True, it splits the text by white spaces.

    Returns an :class:`quepy.expression.Expression` that represents
    the fact of having the keywords extracted from text.
    """

    assert_valid_encoding(text)

    from quepy.semantics import HasKeyword

    if split:
        keywords = [HasKeyword.sanitize(x) for x in text.split()]
    else:
        keywords = (HasKeyword.sanitize(text),)

    if not keywords:
        raise ValueError(u"Couldn't extract any keyword from '%s'" % text)

    expr = None
    for keyword in keywords:
        if expr is not None:
            expr += HasKeyword(keyword)
        else:
            expr = HasKeyword(keyword)

    return expr
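A hedged usage sketch; the text is illustrative and assumes the quepy DSL is set up as in the example above:

# Each whitespace-separated word is sanitized and wrapped in HasKeyword, and the
# resulting parts are merged with += into a single Expression.
expr = handle_keywords(u"american presidents")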
Example #2
def penn_to_morphy_tag(tag):
    assert_valid_encoding(tag)

    for penn, morphy in _penn_to_morphy_tag.iteritems():
        if tag.startswith(penn):
            return morphy
    return None
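A hedged sketch of the expected mapping, assuming the module-level `_penn_to_morphy_tag` dict has already been populated (it is filled lazily in `run_nltktagger`, shown further down):

penn_to_morphy_tag(u"NNS")   # -> wordnet.NOUN, since "NNS" starts with "NN"
penn_to_morphy_tag(u"FW")    # -> None, no Morphy counterpart registered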
Example #3
 def wrapper(string):
     assert_valid_encoding(string)
     words = tagger_function(string)
     for word in words:
         if word.pos not in PENN_TAGSET:
             logger.warning("Tagger emmited a non-penn " "POS tag {!r}".format(word.pos))
     return words
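This wrapper closes over `tagger_function`, `logger`, and `PENN_TAGSET`, so it presumably sits inside a decorator-style factory; a minimal sketch under that assumption (the factory name is hypothetical):

def checked_tagger(tagger_function):
    # Hypothetical factory name; reuses the module-level logger and PENN_TAGSET.
    def wrapper(string):
        assert_valid_encoding(string)
        words = tagger_function(string)
        for word in words:
            if word.pos not in PENN_TAGSET:
                logger.warning("Tagger emitted a non-penn POS tag {!r}".format(word.pos))
        return words
    return wrapper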
Example #4
def adapt(x):
    if isnode(x):
        x = u"?x{}".format(x)
        return x
    if isinstance(x, basestring):
        assert_valid_encoding(x)
        if x.startswith(u"\"") or ":" in x:
            return x
        return u'"{}"'.format(x)
    return unicode(x)
Example #5
def adapt(x):
    if isnode(x):
        x = "?x{}".format(x)
        return x
    if isinstance(x, str):
        assert_valid_encoding(x)
        if x.startswith("\"") or ":" in x:
            return x
        return '"{}"'.format(x)
    return str(x)
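A hedged sketch of what the Python 3 variant above would return, assuming `isnode` recognizes integer expression-node indices as nodes (the strings are illustrative):

adapt(0)              # -> '?x0'           (a variable for expression node 0)
adapt("Tom Hanks")    # -> '"Tom Hanks"'   (plain strings get quoted)
adapt("foaf:name")    # -> 'foaf:name'     (left alone because it contains ":")
adapt(42.5)           # -> '42.5'          (anything else falls through to str())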
Example #6
def adapt(x):
    if isnode(x):
        x = u"x{}".format(x)
        return x
    if isinstance(x, basestring):
        assert_valid_encoding(x)
        x = escape(x)
        if x.startswith(u'"'):
            return x
        return u'"{}"'.format(x)
    return unicode(x)
Example #7
def _read_line(text):
    """
    Parses a line of the freeling command line output.
    """

    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)

    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()
    token_has_spaces = False

    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")

        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")

        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word

    return best_word
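A hedged example call; the input line is hypothetical and shaped only to match what this parser expects (token fields before "#", then one or more lemma / POS / probability / sense groups after it):

word = _read_line(u"cats cat NNS 1 # cat NNS 0.9 02121620-n")
# word.token == u"cats", word.lemma == u"cat", word.pos == u"NNS", word.prob == 0.9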
Example #8
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # Recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminates compound tags like JJ|CC.
        # Decode as ascii because Penn-style POS tags are plain ASCII.
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case lemma is example-based, because if it's rule based
            # the result should be unicode (input was unicode).
            # Since english is ascii the decoding is ok.
            if sys.version_info[0] == 2:
                lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
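A hedged usage sketch; the sentence is made up, and it assumes the NLTK POS tagger model and the WordNet corpus are available under nltk.data.path:

words = run_nltktagger(u"Where was Isaac Newton born?")
tagged = [(w.token, w.pos, w.lemma) for w in words]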
Example #9
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # Recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminates compound tags like JJ|CC.
        # Decode as ascii because Penn-style POS tags are plain ASCII.
        word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case lemma is example-based, because if it's rule based
            # the result should be unicode (input was unicode).
            # Since english is ascii the decoding is ok.
            lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
Example #10
def run_freeling(string, freeling_cmd):
    """
    Runs freeling on `string` and returns a list of Word objects.
    """
    assert_valid_encoding(string)

    ctx = sysutils.ExecutionContext()
    base_path = os.path.join(os.path.dirname(__file__), "freeling_data")
    config_path = __get_config_path(base_path)

    cmdline = freeling_cmd + " -f {0} --train".format(config_path)
    stdin = ctx.tmpfile("freeling_input")
    stdin.write(string.encode("utf-8"))
    stdin.seek(0)
    stdout, _ = ctx.runcmd(cmdline, stdin=stdin)
    stdout.seek(0)
    return _parse_freeling_output(stdout)
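A hedged usage sketch; the command name below is an assumption about how the FreeLing binary is invoked and may differ on a given installation:

# "analyze" is assumed here to be the local FreeLing command-line tool; pass
# whatever command the installation actually provides.
words = run_freeling(u"Where was Isaac Newton born?", "analyze")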
Example #11
def run_spacytagger(string):
    """
    Runs spacy on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)

    # For now, at least, perform our own pre-processing to ensure
    # terms like "presynaptic" are easily found later.
    string = ' '.join(string.split())
    string = collapse(string)

    doc = nlp(string)  # NOTE: spaCy expects and returns unicode

    spans = [(ent_id, nlp.vocab.strings[ent_id], doc[start:end])
             for ent_id, start, end in matcher(doc)]
    for ent_id, label_id, span in spans:
        span.merge(label=label_id, tag='NNP' if label_id else span.root.tag_)

    # tag_ is the "fine-grained" POS
    words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

    # The following is only for logging purposes; if necessary, it could be removed for production
    log.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
    indent = "  "
    longest = max(len(t.text) for t in doc)
    column = (len(doc) - 1) * len(indent) + longest + 2
    wout = '{:' + str(column) + '}| '

    def trav_tree(indents, node):
        log.info(wout.format((indent * indents) + node.text) + ', '.join(
            str(x) for x in [
                node.i, node.is_oov, node.lemma_, node.tag_,
                "<-" + str(node.left_edge), str(node.right_edge) + "->"]))
        for el in node.children:
            # NOTE: Could also change display based on node.lefts and node.rights
            trav_tree(indents + 1, el)

    for sent in doc.sents:
        trav_tree(0, sent.root)
    log.info('Ents:  ' + str(doc.ents))
    log.info('NPs:   ' + str(list(doc.noun_chunks)))

    return words
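A hedged usage sketch; it assumes the module-level `nlp` model, `matcher`, and `collapse` helper referenced above have been initialized elsewhere in the module:

words = run_spacytagger(u"What is the capital of France?")
tagged = [(w.token, w.pos, w.lemma) for w in words]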
Example #12
 def __setattr__(self, name, value):
     if name in self._encoding_attrs and value is not None:
         assert_valid_encoding(value)
     object.__setattr__(self, name, value)
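A minimal sketch of the kind of class this method belongs to; the attribute names in `_encoding_attrs` are assumptions for illustration, not necessarily the real list:

class Word(object):
    # Hypothetical whitelist: assignments to these attributes are encoding-checked.
    _encoding_attrs = ("token", "lemma", "pos", "sense")

    def __setattr__(self, name, value):
        if name in self._encoding_attrs and value is not None:
            assert_valid_encoding(value)
        object.__setattr__(self, name, value)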