Example #1
    def __call__(self, tweet):
        text = tweet['text']
        tokens = token_re.findall(text)

        # compute the feature lists for each token in this tweet
        tokens_features = featurize(tokens, self.feature_functions)

        null_label = 'None'
        # the CRF predicts on a batch of sequences, so wrap and unwrap
        labels = self.crf.predict([tokens_features])[0]

        if 'sequences' not in tweet:
            tweet['sequences'] = []

        # merge runs of identically labeled tokens into labeled spans
        for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]):
            if sequence_label != null_label:
                # all grouped labels equal sequence_label, so discard them
                # (unpacking into `labels` here would shadow the outer name)
                _, starts, ends = zip(*entries)

                tweet['sequences'].append({
                    'text': sequence_label,
                    'start': starts[0],
                    'end': ends[-1],
                })

        return tweet
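
Example #1 depends on zip_boundaries, which is not shown on this page. Below is a minimal sketch of the shape it plausibly has, assuming it pairs each label with token-index boundaries so the groupby above can merge adjacent identical labels (an assumption, not tweedr's actual implementation):

def zip_boundaries(labels):
    # hypothetical stand-in: yield (label, start, end) per token, using
    # token indices; tweedr's real version may use character offsets
    for index, label in enumerate(labels):
        yield label, index, index + 1

With that shape, grouping on the label collapses consecutive equal labels, and starts[0] / ends[-1] recover each span's outer boundaries.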
Example #2
def main():
    # example usage:
    # echo "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place." | python __init__.py
    import sys
    from tweedr.lib.text import token_re
    from tweedr.ml.features.sets import all_feature_functions
    for line in sys.stdin:
        # tokenize the document on whitespace
        tokens = token_re.findall(line)
        # apply all feature functions
        tokens_features = featurize(tokens, all_feature_functions)
        for i, token_features in enumerate(tokens_features):
            print i, list(token_features)
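
Neither token_re nor featurize is defined on this page. Here is a simplified, hypothetical stand-in consistent with how the examples use them (the regex follows the "tokenize the document on whitespace" comment above; tweedr's real definitions may differ):

import re

# crude stand-in for tweedr.lib.text.token_re (an assumption)
token_re = re.compile(r'\S+')

def featurize(tokens, feature_functions):
    # hypothetical sketch: each feature function maps the token list to
    # one iterable of features per token; merge those token by token
    per_function = [f(tokens) for f in feature_functions]
    for token_features in zip(*per_function):
        yield [feature for features in token_features for feature in features]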
Example #3
def tagger_tag():
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # and an empty string if decoding fails.
    text = request.forms.text
    tokens = token_re.findall(text.encode('utf8'))

    tokens_features = map(list, featurize(tokens, crf_feature_functions))
    tagger = GLOBALS['tagger']
    labels = tagger.predict([tokens_features])[0]

    sequences = [
        {'name': 'tokens', 'values': tokens},
        {'name': 'labels', 'values': labels},
    ]
    for feature_function in crf_feature_functions:
        sequences.append({
            'name': feature_function.__name__,
            'values': [', '.join(features) for features in feature_function(tokens)]})

    return {'sequences': sequences}
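
For context, a handler like this would be registered as a bottle route; the URL path and app wiring below are assumptions for illustration, not taken from tweedr. bottle serializes a returned dict to JSON automatically.

from bottle import Bottle

app = Bottle()
# hypothetical route; tweedr's actual URL may differ
app.post('/tagger/tag')(tagger_tag)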
Example #4
    def tokens(self):
        # Python 2: coerce the tweet to unicode, then to UTF-8 bytes,
        # before matching with token_re
        return token_re.findall(unicode(self.tweet).encode('utf8'))
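
A hypothetical Python 3 equivalent, for comparison (an assumption; the examples here are Python 2, where the encode step produces the byte string the pattern expects):

    def tokens(self):
        # Python 3: str is already unicode, so no encode step is needed,
        # assuming token_re was compiled from a str pattern
        return token_re.findall(str(self.tweet))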
Example #5
    def tokenizer(self, text):
        tokens = token_re.findall(text)
        tokens_features = featurize(tokens, self.feature_functions)
        # flatten the per-token feature lists into a single stream
        for token_features in tokens_features:
            for feature in token_features:
                yield feature
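
Using the stand-in token_re and featurize sketched under Example #2, the flattening this generator performs can be exercised directly; lowercase below is an invented feature function, not one of tweedr's:

def lowercase(tokens):
    # one feature per token: its lowercased form
    return [[token.lower()] for token in tokens]

tokens = token_re.findall('Flooding reported in Atlanta')
for token_features in featurize(tokens, [lowercase]):
    for feature in token_features:
        print feature  # Python 2 print, matching the examples above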