def __call__(self, tweet):
    text = tweet['text']
    tokens = token_re.findall(text)
    tokens_features = featurize(tokens, self.feature_functions)
    null_label = 'None'
    labels = self.crf.predict([tokens_features])[0]
    if 'sequences' not in tweet:
        tweet['sequences'] = []
    # group contiguous runs of identical labels, keeping token boundaries
    for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]):
        if sequence_label != null_label:
            # use a throwaway name so we don't shadow `labels` from the enclosing scope
            _, starts, ends = zip(*entries)
            tweet['sequences'].append({
                'text': sequence_label,
                'start': starts[0],
                'end': ends[-1],
            })
    return tweet
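# Usage sketch (hypothetical): the class name and constructor below are
# assumptions; __call__ only needs a dict with a 'text' key and a trained CRF
# on the instance.
tagger = CRFTagger()  # hypothetical name for the class defined above
tweet = {'text': 'Levee breach reported in Plaquemines Parish, 3 dead'}
tagged = tagger(tweet)
for sequence in tagged['sequences']:
    print sequence['text'], sequence['start'], sequence['end']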
def main():
    # Example usage:
    #   echo "The Fulton County Grand Jury said Friday an investigation of
    #   Atlanta's recent primary election produced no evidence that any
    #   irregularities took place." | python __init__.py
    import sys
    from tweedr.lib.text import token_re
    from tweedr.ml.features.sets import all_feature_functions
    for line in sys.stdin:
        # tokenize the line with the token regular expression
        tokens = token_re.findall(line)
        # apply every feature function to the token sequence
        tokens_features = featurize(tokens, all_feature_functions)
        for i, token_features in enumerate(tokens_features):
            print i, list(token_features)
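# A minimal sketch of the `featurize` helper used throughout (an assumption,
# not the tweedr implementation): each feature function maps the full token
# list to one iterable of string features per token, and featurize merges the
# per-function outputs token by token.
def featurize(tokens, feature_functions):
    feature_lists = [f(tokens) for f in feature_functions]
    for per_token in zip(*feature_lists):
        # concatenate the features every function produced for this token
        yield [feature for features in per_token for feature in features]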
def tagger_tag():
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # (an empty string if decoding fails).
    text = request.forms.text
    tokens = token_re.findall(text.encode('utf8'))
    tokens_features = map(list, featurize(tokens, crf_feature_functions))
    tagger = GLOBALS['tagger']
    labels = tagger.predict([tokens_features])[0]
    sequences = [
        {'name': 'tokens', 'values': tokens},
        {'name': 'labels', 'values': labels},
    ]
    for feature_function in crf_feature_functions:
        sequences.append({
            'name': feature_function.__name__,
            'values': [', '.join(features) for features in feature_function(tokens)],
        })
    return {'sequences': sequences}
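# Serving sketch (assumption): tweedr exposes tagger_tag through bottle; the
# route path and app setup here are illustrative, not the project's actual
# wiring. Bottle serializes the returned dict to JSON automatically.
from bottle import Bottle, request, run

app = Bottle()
app.route('/tagger/tag', method='POST')(tagger_tag)
# e.g.  curl -X POST -d 'text=Water is rising near the levee' http://localhost:8080/tagger/tag
run(app, host='localhost', port=8080)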
def tokens(self):
    # coerce the tweet to a UTF-8 byte string before matching
    return token_re.findall(unicode(self.tweet).encode('utf8'))
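# Stand-in for tweedr.lib.text.token_re, for illustration only: the real
# pattern is presumably more Twitter-aware (usernames, hashtags, URLs).
import re
token_re = re.compile(r'\S+')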
def tokenizer(self, text):
    tokens = token_re.findall(text)
    tokens_features = featurize(tokens, self.feature_functions)
    # flatten the per-token feature lists into a single stream of features
    for token_features in tokens_features:
        for feature in token_features:
            yield feature
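# Usage sketch (assumption): a generator-based tokenizer like the one above can
# be plugged into scikit-learn's CountVectorizer so that each CRF-style feature
# becomes a vocabulary entry; `FeatureExtractor` is a hypothetical wrapper class
# holding feature_functions.
from sklearn.feature_extraction.text import CountVectorizer

extractor = FeatureExtractor()  # hypothetical
vectorizer = CountVectorizer(tokenizer=extractor.tokenizer, lowercase=False)
X = vectorizer.fit_transform(['Flooding reported near the levee'])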