def main():
    """Cross-validate a CRF sequence classifier on TokenizedLabel rows.

    Parses the command line, loads up to ``--max-data`` rows, featurizes
    each row's tokens with ``crf_feature_functions``, prints the label
    categories, and then evaluates a fresh ``CRF`` on every fold. The
    1-based fold number and the parsed options are forwarded to
    ``evaluateSequenceClassifier``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    arg_parser.add_argument(
        '--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    arg_parser.add_argument(
        '--include-none', type=int, default=0,
        help='Include None in Confusion Matrix.')
    arg_parser.add_argument(
        '-threshold', type=int, default=10,
        help='Threshold for number of gold labels classified.')
    opts = arg_parser.parse_args()

    # Example row:
    #   <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5
    #    token_type=i18 token= 89 id=5>
    # Each row is read through its .tokens and .labels attributes.
    rows = DBSession.query(TokenizedLabel).limit(opts.max_data)
    pairs = [(featurize(row.tokens, crf_feature_functions), row.labels)
             for row in rows]

    # Split the (features, labels) pairs into two parallel sequences.
    raw_X, y = zip(*pairs)
    # X is indexed once per fold, so it has to be a concrete list.
    X = [flatMap(token_features) for token_features in raw_X]

    categories = {}
    for label in DBSession.query(Label):
        categories[label.id] = label.text
    # Single pre-formatted string: same output as the two-item print.
    print('%s %s' % ('categories', categories))

    def pick(sequence, indices):
        # Gather the elements of `sequence` at the given positions.
        return [sequence[i] for i in indices]

    folds = cross_validation.KFold(len(y), opts.k_folds, shuffle=True)
    for fold_number, (train_indices, test_indices) in enumerate(folds, 1):
        train_X = pick(X, train_indices)
        train_y = pick(y, train_indices)
        test_X = pick(X, test_indices)
        test_y = pick(y, test_indices)
        classifier = CRF()
        evaluateSequenceClassifier(
            classifier, train_X, train_y, test_X, test_y, fold_number, opts)
def main():
    """Cross-validate a CRF sequence classifier on TokenizedLabel rows.

    Parses the command line, loads up to ``--max-data`` rows whose tweet
    is neither NULL nor the empty string, featurizes each row's tokens
    (with ``featurize`` by default, or ``featurize_adjacent`` when
    ``--adjacent`` is non-zero), and evaluates a fresh ``CRF`` on every
    fold produced by ``cross_validation.KFold``.
    """
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument(
        '--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    parser.add_argument(
        '--adjacent', type=int, default=0,
        help='Set adjacent to 1 if adjacent functions want to be used')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #  tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5
    #  token_type=i18 token= 89 id=5>
    # Each row is read through its .tokens and .labels attributes.
    #
    # The NULL check must go through the column's overloaded `!=`
    # operator. `TokenizedLabel.tweet is not None` is a plain Python
    # identity test that evaluates to True before the query is built,
    # so it never reaches the database as a condition.
    query = DBSession.query(TokenizedLabel).\
        filter(TokenizedLabel.tweet != None).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)  # noqa: E711

    # Choose the featurizer once instead of duplicating the generator.
    featurizer = featurize if opts.adjacent == 0 else featurize_adjacent
    X_y = ((featurizer(item.tokens, crf_feature_functions), item.labels)
           for item in query)

    # unzip into two parallel sequences
    X, y = zip(*X_y)
    # X is indexed once per fold, so make sure it is a concrete list
    X = [flatMap(token_features) for token_features in X]

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
def main():
    """Cross-validate a CRF on every subset of ``crf_feature_functions``.

    Parses the command line, loads up to ``--max-data`` TokenizedLabel
    rows once, and then, for each combination of feature functions
    (including the empty one), featurizes the rows with that subset and
    evaluates a fresh ``CRF`` on every fold produced by
    ``cross_validation.KFold``. The subset and the label categories are
    printed before each subset's folds are run.
    """
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-k', '--k-folds', type=int, default=10,
        help='How many folds of the data to test on')
    parser.add_argument(
        '--max-data', type=int, default=10000,
        help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #  tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5
    #  token_type=i18 token= 89 id=5>
    # Each row is read through its .tokens and .labels attributes.
    #
    # Materialize the rows a single time. Iterating the query inside the
    # subset loop would re-run the same LIMIT query (which has no
    # ORDER BY) once per feature subset.
    items = list(DBSession.query(TokenizedLabel).limit(opts.max_data))

    # Labels and categories do not depend on the feature subset, so
    # they are computed once up front as well.
    y = [item.labels for item in items]
    N = len(y)
    categories = dict((label.id, label.text) for label in DBSession.query(Label))

    for L in range(0, len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, L):
            sub = list(subset)
            print(sub)

            # X is indexed once per fold, so build it as a concrete list
            X = [flatMap(featurize(item.tokens, sub)) for item in items]

            # Single pre-formatted string: same output as the original
            # two-item print.
            print('%s %s' % ('categories', categories))

            # tests on different data sets -> k folds defaults to 10
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
def __init__(self):
    """Build the CRF from this object's feature functions and log it.

    Stores the result of ``CRF.default(self.feature_functions)`` on
    ``self.crf``.
    """
    default_crf = CRF.default(self.feature_functions)
    self.crf = default_crf
    logger.info('SequenceTagger initialized')
def tagger_retrain():
    """Replace the shared tagger with one built with ``retrain=True``.

    Returns:
        A dict with the single entry ``success=True``.
    """
    retrained = CRF.default(crf_feature_functions, retrain=True)
    GLOBALS.update(tagger=retrained)
    return {'success': True}
from tweedr.models import DBSession, TokenizedLabel import logging logger = logging.getLogger(__name__) # tell bottle where to look for templates # We use Mako templates (*.mako) that are in the templates/ directory in the package root. # There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side. bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, 'templates')) # this is the primary export app = bottle.Bottle() # globals are messy, but we don't to retrain a tagger for every request logger.debug('initializing %s (training or loading CRF using defaults)', __name__) GLOBALS = dict(tagger=CRF.default(crf_feature_functions)) @app.get('/') def root(): redirect('/crf') @app.get('/crf') @view('crf.mako') def index(): # effectively static; all the fun stuff happens in the template return dict() @app.get('/tokenized_labels/sample')
def tagger_retrain():
    """Replace the shared tagger with one built with ``retrain=True``.

    Returns:
        A dict with the single entry ``success=True``.
    """
    fresh_tagger = CRF.default(crf_feature_functions, retrain=True)
    GLOBALS.update(tagger=fresh_tagger)
    return {"success": True}
import logging logger = logging.getLogger(__name__) # tell bottle where to look for templates # We use Mako templates (*.mako) that are in the templates/ directory in the package root. # There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side. bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, "templates")) # this is the primary export app = bottle.Bottle() # globals are messy, but we don't to retrain a tagger for every request logger.debug("initializing %s (training or loading CRF using defaults)", __name__) GLOBALS = dict(tagger=CRF.default(crf_feature_functions)) @app.get("/") def root(): redirect("/crf") @app.get("/crf") @view("crf.mako") def index(): # effectively static; all the fun stuff happens in the template return dict() @app.get("/tokenized_labels/sample")