def __call__(self, tweet):
    '''Tag a tweet's text and record each labelled span on the tweet.

    The text in ``tweet['text']`` is tokenized with ``token_re``, featurized
    with ``self.feature_functions`` and labelled by ``self.crf``.  Runs of
    consecutive entries sharing the same label are collapsed into a single
    entry appended to ``tweet['sequences']`` (created if absent); runs
    labelled ``'None'`` are skipped.

    Returns the same ``tweet`` mapping, mutated in place.
    '''
    tokens = token_re.findall(tweet['text'])
    tokens_features = featurize(tokens, self.feature_functions)
    # predict() takes a batch of sequences; we submit one and unwrap its result
    predicted_labels = self.crf.predict([tokens_features])[0]

    null_label = 'None'
    sequences = tweet.setdefault('sequences', [])

    # zip_boundaries yields 3-tuples whose first item is the label; the other
    # two are unpacked as start/end below (units are whatever zip_boundaries
    # reports -- NOTE(review): confirm whether these are token or char offsets)
    grouped = itertools.groupby(zip_boundaries(predicted_labels), lambda tup: tup[0])
    for sequence_label, entries in grouped:
        if sequence_label == null_label:
            continue
        # the per-entry labels all equal sequence_label, so they are discarded
        _, starts, ends = zip(*entries)
        sequences.append({
            'text': sequence_label,
            'start': starts[0],
            'end': ends[-1],
        })
    return tweet
def main():
    '''Run K-fold evaluation of a CRF over TokenizedLabel rows.

    Parses command-line options, featurizes up to ``--max-data`` rows with
    ``crf_feature_functions``, then for every fold builds a new ``CRF`` and
    passes it, the train/test split, the 1-based fold number and the parsed
    options to ``evaluateSequenceClassifier``.
    '''
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    parser.add_argument('--include-none', type=int, default=0,
        help='Include None in Confusion Matrix.')
    # '-threshold' is kept for existing invocations; '--threshold' is the
    # conventional spelling. Both store into opts.threshold.
    parser.add_argument('-threshold', '--threshold', type=int, default=10,
        help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip into two parallel tuples
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = [flatMap(features) for features in X]

    categories = {label.id: label.text for label in DBSession.query(Label)}
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print('categories %s' % (categories,))

    N = len(y)
    folds = cross_validation.KFold(N, opts.k_folds, shuffle=True)
    # fold numbers passed to the evaluator start at 1
    for index, (train_indices, test_indices) in enumerate(folds, 1):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts)
def from_data(cls, data, feature_functions):
    '''Create a new instance of ``cls`` and fit it on ``data``.

    data must be an iterable of objects with .tokens and .labels attributes.
    Each datum's tokens are run through ``featurize`` with
    ``feature_functions``; the results and the labels are passed to ``fit``
    as two parallel tuples.

    Returns the fitted instance.
    Raises ValueError if ``data`` yields nothing.
    '''
    crf = cls()
    X_y = [(featurize(datum.tokens, feature_functions), datum.labels) for datum in data]
    if not X_y:
        # unzipping nothing would otherwise fail with an opaque unpacking error
        raise ValueError('Cannot fit on empty data: no (tokens, labels) items were given')
    # unzip the pairs; X and y are tuples with one entry per datum
    X, y = zip(*X_y)
    logger.debug('Fitting CRF')
    crf.fit(X, y)
    return crf
def main():
    '''Run K-fold evaluation of a CRF over TokenizedLabel rows.

    Parses command-line options, loads up to ``--max-data`` rows whose tweet
    is neither NULL nor empty, featurizes them with ``featurize`` (or
    ``featurize_adjacent`` when ``--adjacent`` is non-zero), and for every
    fold passes a new ``CRF`` plus the train/test split to
    ``evaluateSequenceClassifier``.
    '''
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    parser.add_argument('--adjacent', type=int, default=0,
        help='Set adjacent to 1 if adjacent functions want to be used')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = (
        DBSession.query(TokenizedLabel)
        # `!= None` is deliberate: the column overloads it to emit IS NOT NULL,
        # whereas `is not None` is evaluated by Python and is always True.
        .filter(TokenizedLabel.tweet != None)  # noqa: E711
        .filter(TokenizedLabel.tweet != '')
        .limit(opts.max_data)
    )

    # any non-zero --adjacent selects the adjacent featurizer
    featurizer = featurize if opts.adjacent == 0 else featurize_adjacent
    X_y = ((featurizer(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip into two parallel tuples
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = [flatMap(features) for features in X]

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
def tagger_tag():
    '''Tag the submitted form text and return every per-token sequence.

    The response is ``{'sequences': [...]}`` where each entry has a ``name``
    and a ``values`` list: first the tokens, then the predicted labels, then
    one entry per function in ``crf_feature_functions`` with each token's
    features joined by ``', '``.
    '''
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # and an empty string if decoding fails.
    tokens = token_re.findall(request.forms.text.encode('utf8'))
    tokens_features = [
        list(token_features)
        for token_features in featurize(tokens, crf_feature_functions)]

    # the tagger predicts over a batch; submit one sequence and unwrap it
    labels = GLOBALS['tagger'].predict([tokens_features])[0]

    sequences = [
        {'name': 'tokens', 'values': tokens},
        {'name': 'labels', 'values': labels},
    ]
    sequences.extend(
        {
            'name': fn.__name__,
            'values': [', '.join(features) for features in fn(tokens)],
        }
        for fn in crf_feature_functions)
    return {'sequences': sequences}
def tagger_tag():
    """Tag the submitted form text and return every per-token sequence.

    Returns ``{"sequences": [...]}``; each entry carries a ``name`` and a
    ``values`` list. The tokens come first, then the predicted labels, then
    one entry per function in ``crf_feature_functions`` holding each token's
    features joined by ``", "``.
    """
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # and an empty string if decoding fails.
    raw_text = request.forms.text
    tokens = token_re.findall(raw_text.encode("utf8"))
    tokens_features = [list(row) for row in featurize(tokens, crf_feature_functions)]

    tagger = GLOBALS["tagger"]
    # predict() works on a batch of sequences; we pass exactly one
    (labels,) = tagger.predict([tokens_features])[:1]

    feature_rows = []
    for fn in crf_feature_functions:
        feature_rows.append(
            {
                "name": fn.__name__,
                "values": [", ".join(features) for features in fn(tokens)],
            }
        )

    tokens_row = {"name": "tokens", "values": tokens}
    labels_row = {"name": "labels", "values": labels}
    return {"sequences": [tokens_row, labels_row] + feature_rows}
def main():
    '''Cross-validate a CRF on TokenizedLabel rows from the database.

    Reads ``-k/--k-folds``, ``--max-data`` and ``--adjacent`` from the command
    line, queries up to ``--max-data`` rows with a non-NULL, non-empty tweet,
    featurizes each row's tokens (``featurize_adjacent`` when ``--adjacent``
    is non-zero, ``featurize`` otherwise) and calls
    ``evaluateSequenceClassifier`` once per fold with a new ``CRF``.
    '''
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    parser.add_argument('--adjacent', type=int, default=0,
        help='Set adjacent to 1 if adjacent functions want to be used')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.

    # The NULL check must use `!=` so that it is compiled to SQL (IS NOT NULL);
    # a Python-level `is not None` on the column attribute is simply True and
    # never reaches the database as a NULL test.
    has_tweet = TokenizedLabel.tweet != None  # noqa: E711
    query = DBSession.query(TokenizedLabel).\
        filter(has_tweet).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)

    # any non-zero --adjacent switches to the adjacent featurizer
    if opts.adjacent == 0:
        make_features = featurize
    else:
        make_features = featurize_adjacent
    X_y = ((make_features(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip into two parallel tuples
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = [flatMap(features) for features in X]

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
def main():
    '''Cross-validate a CRF on TokenizedLabel rows from the database.

    Reads the fold count, data cap, ``--include-none`` and the threshold from
    the command line, featurizes up to ``--max-data`` rows with
    ``crf_feature_functions``, prints the label id -> text mapping, and calls
    ``evaluateSequenceClassifier`` once per fold with a new ``CRF``, the
    train/test split, the 1-based fold number and the parsed options.
    '''
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    parser.add_argument('--include-none', type=int, default=0,
        help='Include None in Confusion Matrix.')
    # Accept the conventional '--threshold' as well as the original
    # single-dash '-threshold'; either one sets opts.threshold.
    parser.add_argument('-threshold', '--threshold', type=int, default=10,
        help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip into two parallel tuples
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = [flatMap(features) for features in X]

    categories = {label.id: label.text for label in DBSession.query(Label)}
    # one pre-formatted argument keeps the output identical on Python 2 and 3
    print('categories %s' % (categories,))

    N = len(y)
    splits = cross_validation.KFold(N, opts.k_folds, shuffle=True)
    # the evaluator receives fold numbers starting from 1
    for index, (train_indices, test_indices) in enumerate(splits, 1):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()  # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts)
def main():
    '''Cross-validate a CRF for every subset of ``crf_feature_functions``.

    For each combination of feature functions (from the empty set up to the
    full set) the rows are featurized with that subset and
    ``evaluateSequenceClassifier`` is called once per fold with a new ``CRF``.
    The chosen subset and the label id -> text mapping are printed for each
    combination.
    '''
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument('--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.

    # Fetch the rows once: iterating the query object inside the loops would
    # re-run it for every subset, and an unordered LIMIT is not guaranteed to
    # return the same rows each time, which would make subsets incomparable.
    items = list(DBSession.query(TokenizedLabel).limit(opts.max_data))
    # loop-invariant, so query it once up front
    categories = {label.id: label.text for label in DBSession.query(Label)}

    for subset_size in range(len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, subset_size):
            sub = list(subset)
            print(sub)
            X_y = ((featurize(item.tokens, sub), item.labels) for item in items)
            # unzip into two parallel tuples
            X, y = zip(*X_y)
            # we need to read X multiple times, so make sure it's all static
            X = [flatMap(features) for features in X]
            # one pre-formatted argument prints the same on Python 2 and 3
            print('categories %s' % (categories,))
            N = len(y)
            # tests on different data sets -> k folds defaults to 10
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
def from_path_or_data(cls, data, feature_functions, model_filepath=None):
    '''Return ``cls(model_filepath)``, training and saving a model first if needed.

    If we are given a model_filepath that points to an existing file, use it.
    Otherwise, train on ``data`` and save to ``model_filepath``, or to a
    newly created temporary file when no path was given, because CRFSuite
    doesn't seem to allow us to create a tagger directly from a trained
    trainer object.

    data must be an iterable of objects with .tokens and .labels attributes;
    it is only consumed when a model has to be trained.
    '''
    if model_filepath is not None and os.path.exists(model_filepath):
        logger.debug('Loading existing model from %s', model_filepath)
        return cls(model_filepath)

    if model_filepath is None:
        # Only the name is needed. Close the handle immediately so it is not
        # leaked; delete=False keeps the file on disk for the trainer.
        with tempfile.NamedTemporaryFile(delete=False) as model_file:
            model_filepath = model_file.name

    trainer = Trainer()
    # count from 1 so the log reports the number of instances, and start at 0
    # so that empty data does not leave the counter undefined
    instance_count = 0
    for instance_count, datum in enumerate(data, 1):
        tokens = datum.tokens
        labels = datum.labels
        tokens_features = featurize(tokens, feature_functions)
        trainer.append_raw(tokens_features, labels)
    trainer.save(model_filepath)
    logger.debug('Trained on %d instances and saved to %s', instance_count, model_filepath)
    return cls(model_filepath)
def tokenizer(self, text):
    '''Yield every feature of every token in ``text``, in token order.

    ``text`` is split with ``token_re`` and the tokens are featurized with
    ``self.feature_functions``; the per-token feature collections are then
    flattened into one stream.
    '''
    per_token = featurize(token_re.findall(text), self.feature_functions)
    flattened = (
        feature
        for features_of_token in per_token
        for feature in features_of_token)
    for feature in flattened:
        yield feature