Example #1
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument('--include-none',
                        type=int,
                        default=0,
                        help='Set to 1 to include None in the confusion matrix')
    parser.add_argument('--threshold',
                        type=int,
                        default=10,
                        help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
           for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    categories = dict(
        (label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    N = len(y)
    index = 0
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        index = index + 1
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y, index, opts)
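
The comments in Example #1 point out that featurize only needs objects with CRF-ready .tokens and .labels attributes; the database model itself is incidental. A minimal sketch of that contract, using a hypothetical stand-in class (featurize, crf_feature_functions, and flatMap are assumed to be available exactly as in the snippet):

# Hypothetical stand-in: any object exposing .tokens and .labels will do.
class StubTokenizedLabel(object):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

# Tweet text taken from the comment above; the label strings are purely illustrative.
items = [StubTokenizedLabel(['Tornado', 'Kills', '89', 'in', 'Missouri.'],
                            ['None', 'None', 'casualties', 'None', 'None'])]
X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in items)
X, y = zip(*X_y)     # unzip into feature sequences and label sequences
X = map(flatMap, X)  # as in the snippet: keep X static so it can be indexed per fold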
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument(
        '--adjacent',
        type=int,
        default=0,
        help='Set to 1 to use featurize_adjacent instead of featurize')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    # The NULL check must use SQLAlchemy's column operator; Python's `is not None`
    # is evaluated locally and never reaches the SQL WHERE clause.
    query = (DBSession.query(TokenizedLabel)
             .filter(TokenizedLabel.tweet.isnot(None))
             .filter(TokenizedLabel.tweet != '')
             .limit(opts.max_data))
    if opts.adjacent == 0:
        X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
               for item in query)
    else:
        X_y = ((featurize_adjacent(item.tokens,
                                   crf_feature_functions), item.labels)
               for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y)
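
One SQLAlchemy detail in Example #2: Python's `is not` cannot be overloaded, so `TokenizedLabel.tweet is not None` is evaluated locally (the column object is never None) and the intended IS NOT NULL check never reaches the database, which is why the query above uses the column operator .isnot(None). A short contrast sketch:

# Wrong: evaluated by Python to a plain boolean, so no "tweet IS NOT NULL" is emitted.
DBSession.query(TokenizedLabel).filter(TokenizedLabel.tweet is not None)

# Right: column operators build real SQL predicates; multiple filters are ANDed.
DBSession.query(TokenizedLabel).filter(TokenizedLabel.tweet.isnot(None),
                                       TokenizedLabel.tweet != '')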
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds',
        type=int, default=10, help='How many folds of the data to test on')
    parser.add_argument('--max-data',
        type=int, default=10000, help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)

    for L in range(0, len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, L):
            sub = list(subset)
            print sub
            X_y = ((featurize(item.tokens, sub), item.labels) for item in query)
            # unzip and flatten into static list
            X, y = zip(*X_y)
            # we need to read X multiple times, so make sure it's all static
            X = map(flatMap, X)
            categories = dict((label.id, label.text) for label in DBSession.query(Label))
            print 'categories', categories

            N = len(y)
            # k-fold cross-validation over the data (k defaults to 10 via --k-folds)
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
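
Example #3 is a feature-ablation sweep: itertools.combinations enumerates every subset of crf_feature_functions (including the empty subset when L is 0), and each subset gets its own cross-validation run. A small illustration with hypothetical feature-function names, showing what the two outer loops iterate over:

import itertools

# Hypothetical stand-ins for the real crf_feature_functions list.
feature_functions = ['is_capitalized', 'is_numeric', 'unigram']

for L in range(0, len(feature_functions) + 1):
    for subset in itertools.combinations(feature_functions, L):
        print list(subset)
# []
# ['is_capitalized']
# ['is_numeric']
# ['unigram']
# ['is_capitalized', 'is_numeric']
# ['is_capitalized', 'unigram']
# ['is_numeric', 'unigram']
# ['is_capitalized', 'is_numeric', 'unigram']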
Example #4
def __init__(self):
    self.crf = CRF.default(self.feature_functions)
    logger.info('SequenceTagger initialized')
Example #5
def tagger_retrain():
    GLOBALS['tagger'] = CRF.default(crf_feature_functions, retrain=True)
    return dict(success=True)
Example #6
import os
import logging

import bottle
from bottle import redirect, view

import tweedr
from tweedr.models import DBSession, TokenizedLabel
# CRF and crf_feature_functions come from tweedr's ML helpers; their exact
# import path is not shown in this snippet.

logger = logging.getLogger(__name__)

# tell bottle where to look for templates
# We use Mako templates (*.mako) that are in the templates/ directory in the package root.
# There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side.
bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, 'templates'))

# this is the primary export
app = bottle.Bottle()

# globals are messy, but we don't want to retrain a tagger for every request
logger.debug('initializing %s (training or loading CRF using defaults)', __name__)
GLOBALS = dict(tagger=CRF.default(crf_feature_functions))


@app.get('/')
def root():
    redirect('/crf')


@app.get('/crf')
@view('crf.mako')
def index():
    # effectively static; all the fun stuff happens in the template
    return dict()


@app.get('/tokenized_labels/sample')
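
Example #6 builds and configures the Bottle app but never serves it; that is left to whoever imports `app`. A minimal sketch of running it with Bottle's built-in development server (the host, port, and reloader values are illustrative, not part of the snippet):

# Development-only sketch; a production setup would mount `app` under a WSGI server.
if __name__ == '__main__':
    bottle.run(app, host='127.0.0.1', port=8080, reloader=True)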
Example #7
def tagger_retrain():
    GLOBALS["tagger"] = CRF.default(crf_feature_functions, retrain=True)
    return dict(success=True)
Example #8
import os
import logging

import bottle
from bottle import redirect, view

import tweedr
# CRF and crf_feature_functions come from tweedr's ML helpers; their exact
# import path is not shown in this snippet.

logger = logging.getLogger(__name__)

# tell bottle where to look for templates
# We use Mako templates (*.mako) that are in the templates/ directory in the package root.
# There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side.
bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, "templates"))

# this is the primary export
app = bottle.Bottle()

# globals are messy, but we don't want to retrain a tagger for every request
logger.debug("initializing %s (training or loading CRF using defaults)", __name__)
GLOBALS = dict(tagger=CRF.default(crf_feature_functions))


@app.get("/")
def root():
    redirect("/crf")


@app.get("/crf")
@view("crf.mako")
def index():
    # effectively static; all the fun stuff happens in the template
    return dict()


@app.get("/tokenized_labels/sample")