Example #1
    def __call__(self, tweet):
        text = tweet['text']
        tokens = token_re.findall(text)

        # tokens_features = map(list, featurize(tokens, crf_feature_functions))
        tokens_features = featurize(tokens, self.feature_functions)

        null_label = 'None'
        labels = self.crf.predict([tokens_features])[0]
        # tweet['labels'] = labels

        if 'sequences' not in tweet:
            tweet['sequences'] = []

        for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]):
            if sequence_label != null_label:
                _, starts, ends = zip(*entries)

                tweet['sequences'].append({
                    'text': sequence_label,
                    'start': starts[0],
                    'end': ends[-1],
                })

        return tweet
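
Example #1 merges runs of identical predicted labels into spans using a zip_boundaries helper that is not shown here. Judging only from how its output is consumed (grouped by label, with starts[0] and ends[-1] taken as the span), it presumably pairs each label with a start and end position. A minimal sketch under that assumption (the real helper may use character offsets rather than token indices):

def zip_boundaries(labels):
    # Hypothetical stand-in: pair each predicted label with token-index
    # boundaries so contiguous runs of the same label can be merged above.
    for i, label in enumerate(labels):
        yield label, i, i + 1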
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument('--include-none',
                        type=int,
                        default=0,
                        help='Include None in Confusion Matrix.')
    parser.add_argument('-threshold',
                        type=int,
                        default=10,
                        help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
           for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    categories = dict(
        (label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    N = len(y)
    index = 0
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        index = index + 1
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y, index, opts)
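
Example #2 (and the later variants of main) calls map(flatMap, X) to make the featurized sequences reusable across folds. flatMap is not shown; from the surrounding comments ("flatten into static list", "we need to read X multiple times") it apparently materializes the lazy per-token feature iterables returned by featurize. A plausible stand-in under that assumption:

def flatMap(token_feature_iters):
    # Hypothetical stand-in: turn each token's lazy feature iterable into a
    # concrete list so the same features can be read on every fold.
    return [list(features) for features in token_feature_iters]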
Example #3
    def from_data(cls, data, feature_functions):
        '''data must be an iterable of objects with .tokens and .labels attributes.'''
        crf = cls()
        X_y = ((featurize(datum.tokens, feature_functions), datum.labels) for datum in data)
        X, y = izip(*X_y)
        # X (and y) are iterables, by the way

        logger.debug('Fitting CRF')
        crf.fit(X, y)

        return crf
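
featurize itself is used throughout these examples without being defined. From how the feature functions are applied in the later web-handler examples (#6 and #7), each function maps a token sequence to one iterable of feature strings per token, so a plausible reading is that featurize runs every function and combines the per-token outputs. A minimal sketch under that assumption:

from itertools import chain, izip

def featurize(tokens, feature_functions):
    # Hypothetical sketch: each feature function yields one iterable of
    # feature strings per token; zip the columns and chain them so every
    # token ends up with the union of its features.
    feature_columns = [f(tokens) for f in feature_functions]
    for per_token in izip(*feature_columns):
        yield chain(*per_token)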
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k',
                        '--k-folds',
                        type=int,
                        default=10,
                        help='How many folds of the data to test on')
    parser.add_argument('--max-data',
                        type=int,
                        default=10000,
                        help='Maximum data points to train and test on')
    parser.add_argument(
        '--adjacent',
        type=int,
        default=0,
        help='Set to 1 to use adjacent-token feature functions')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    # Note: the NULL check must be a SQLAlchemy expression (.isnot(None));
    # a plain Python "is not None" comparison would not be compiled into SQL.
    query = DBSession.query(TokenizedLabel).\
        filter(TokenizedLabel.tweet.isnot(None)).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)
    if opts.adjacent == 0:
        X_y = ((featurize(item.tokens, crf_feature_functions), item.labels)
               for item in query)
    else:
        X_y = ((featurize_adjacent(item.tokens,
                                   crf_feature_functions), item.labels)
               for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N,
                                                              opts.k_folds,
                                                              shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X,
                                   test_y)
Example #5
    def from_data(cls, data, feature_functions):
        '''data must be an iterable of objects with .tokens and .labels attributes.'''
        crf = cls()
        X_y = ((featurize(datum.tokens, feature_functions), datum.labels)
               for datum in data)
        X, y = izip(*X_y)
        # X (and y) are iterables, by the way

        logger.debug('Fitting CRF')
        crf.fit(X, y)

        return crf
Example #6
def tagger_tag():
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # and an empty string if decoding fails.
    text = request.forms.text
    tokens = token_re.findall(text.encode('utf8'))

    tokens_features = map(list, featurize(tokens, crf_feature_functions))
    tagger = GLOBALS['tagger']
    labels = tagger.predict([tokens_features])[0]

    sequences = [
        {'name': 'tokens', 'values': tokens},
        {'name': 'labels', 'values': labels},
    ]
    for feature_function in crf_feature_functions:
        sequences.append({
            'name': feature_function.__name__,
            'values': [', '.join(features) for features in feature_function(tokens)]})

    return {'sequences': sequences}
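
Examples #6 and #7 assume a bottle request context and a module-level GLOBALS dict holding a trained tagger. A hypothetical wiring for such a handler (the URL, port, and the load_tagger helper are illustrative, not taken from the original project):

from bottle import Bottle, run

app = Bottle()
# Load a trained CRF tagger once at startup so every request reuses it;
# load_tagger is a hypothetical helper standing in for however the project
# builds its tagger.
GLOBALS['tagger'] = load_tagger()
# Register the handler defined above; the URL path is illustrative.
app.route('/tagger/tag', method='POST', callback=tagger_tag)

if __name__ == '__main__':
    run(app, host='localhost', port=8080)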
Example #7
def tagger_tag():
    # For bottle >= 0.10, request.forms.xyz attributes return unicode strings
    # and an empty string if decoding fails.
    text = request.forms.text
    tokens = token_re.findall(text.encode("utf8"))

    tokens_features = map(list, featurize(tokens, crf_feature_functions))
    tagger = GLOBALS["tagger"]
    labels = tagger.predict([tokens_features])[0]

    sequences = [{"name": "tokens", "values": tokens}, {"name": "labels", "values": labels}]
    for feature_function in crf_feature_functions:
        sequences.append(
            {
                "name": feature_function.__name__,
                "values": [", ".join(features) for features in feature_function(tokens)],
            }
        )

    return {"sequences": sequences}
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds',
        type=int, default=10, help='How many folds of the data to test on')
    parser.add_argument('--max-data',
        type=int, default=10000, help='Maximum data points to train and test on')
    parser.add_argument('--adjacent',
        type=int, default=0, help='Set to 1 to use adjacent-token feature functions')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    # Note: the NULL check must be a SQLAlchemy expression (.isnot(None));
    # a plain Python "is not None" comparison would not be compiled into SQL.
    query = DBSession.query(TokenizedLabel).\
        filter(TokenizedLabel.tweet.isnot(None)).\
        filter(TokenizedLabel.tweet != '').\
        limit(opts.max_data)
    if opts.adjacent == 0:
        X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    else:
        X_y = ((featurize_adjacent(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    N = len(y)
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds',
        type=int, default=10, help='How many folds of the data to test on')
    parser.add_argument('--max-data',
        type=int, default=10000, help='Maximum data points to train and test on')
    parser.add_argument('--include-none', type=int, default=0, help='Include None in Confusion Matrix.')
    parser.add_argument('-threshold', type=int, default=10, help='Threshold for number of gold labels classified.')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)
    X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query)
    # unzip and flatten into static list
    X, y = zip(*X_y)
    # we need to read X multiple times, so make sure it's all static
    X = map(flatMap, X)

    categories = dict((label.id, label.text) for label in DBSession.query(Label))
    print 'categories', categories

    N = len(y)
    index = 0
    for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
        # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices]
        train_X = [X[i] for i in train_indices]
        train_y = [y[i] for i in train_indices]
        test_X = [X[i] for i in test_indices]
        test_y = [y[i] for i in test_indices]
        classifier = CRF()
        # print_gloss=True
        index = index + 1
        evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts)
Example #10
def main():
    parser = argparse.ArgumentParser(
        description='Train CRFSuite on data from the QCRI MySQL database',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--k-folds',
        type=int, default=10, help='How many folds of the data to test on')
    parser.add_argument('--max-data',
        type=int, default=10000, help='Maximum data points to train and test on')
    opts = parser.parse_args()

    # e.g., tokenized_label =
    # <TokenizedLabel dssg_id=23346 token_start=13 token_end=16
    #    tweet=Tornado Kills 89 in Missouri. http://t.co/IEuBas5 token_type=i18 token= 89 id=5>
    # Train and test must be iterables of objects that support CRF-ready
    # .tokens and .labels attributes.
    query = DBSession.query(TokenizedLabel).limit(opts.max_data)

    for L in range(0, len(crf_feature_functions) + 1):
        for subset in itertools.combinations(crf_feature_functions, L):
            sub = list(subset)
            print sub
            X_y = ((featurize(item.tokens, sub), item.labels) for item in query)
            # unzip and flatten into static list
            X, y = zip(*X_y)
            # we need to read X multiple times, so make sure it's all static
            X = map(flatMap, X)
            categories = dict((label.id, label.text) for label in DBSession.query(Label))
            print 'categories', categories

            N = len(y)
            # test on each of the k folds of the data (k_folds defaults to 10)
            for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True):
                train_X = [X[i] for i in train_indices]
                train_y = [y[i] for i in train_indices]
                test_X = [X[i] for i in test_indices]
                test_y = [y[i] for i in test_indices]
                classifier = CRF()
                evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y)
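
One thing to note about Example #10: query is a SQLAlchemy Query object, so the generator expression re-executes the SQL for every feature-function subset. If that is undesirable, the rows can be fetched once with query.all() before the subset loop; a small sketch:

# Fetch the labeled examples once so every feature-function subset reuses
# the same rows instead of re-running the query.
tokenized_labels = query.all()
for L in range(len(crf_feature_functions) + 1):
    for subset in itertools.combinations(crf_feature_functions, L):
        sub = list(subset)
        X_y = ((featurize(item.tokens, sub), item.labels) for item in tokenized_labels)
        # ... cross-validation loop unchanged from Example #10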
Example #11
    def from_path_or_data(cls, data, feature_functions, model_filepath=None):
        '''If we are given a model_filepath that points to an existing file, use it.
        otherwise, create a temporary file to store the model because CRFSuite
        doesn't seem to allow us to create a tagger directly from a trained
        trainer object.'''
        if model_filepath is None or not os.path.exists(model_filepath):
            if model_filepath is None:
                model_filepath = tempfile.NamedTemporaryFile(delete=False).name

            trainer = Trainer()
            for i, datum in enumerate(data):
                tokens = datum.tokens
                labels = datum.labels

                tokens_features = featurize(tokens, feature_functions)
                trainer.append_raw(tokens_features, labels)

            trainer.save(model_filepath)
            logger.debug('Trained on %d instances and saved to %s', i + 1, model_filepath)
        else:
            logger.debug('Loading existing model from %s', model_filepath)

        return cls(model_filepath)
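
A hypothetical call site for from_path_or_data (TaggerClass stands in for whatever class defines the method above, and the data source and model path are illustrative):

# Train a tagger, or reload one if the model file already exists on disk.
data = DBSession.query(TokenizedLabel).limit(10000)
tagger = TaggerClass.from_path_or_data(data, crf_feature_functions,
                                       model_filepath='/tmp/tweedr-crf.model')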
Example #12
    def from_path_or_data(cls, data, feature_functions, model_filepath=None):
        '''If we are given a model_filepath that points to an existing file, use it.
        otherwise, create a temporary file to store the model because CRFSuite
        doesn't seem to allow us to create a tagger directly from a trained
        trainer object.'''
        if model_filepath is None or not os.path.exists(model_filepath):
            if model_filepath is None:
                model_filepath = tempfile.NamedTemporaryFile(delete=False).name

            trainer = Trainer()
            for i, datum in enumerate(data):
                tokens = datum.tokens
                labels = datum.labels

                tokens_features = featurize(tokens, feature_functions)
                trainer.append_raw(tokens_features, labels)

            trainer.save(model_filepath)
            logger.debug('Trained on %d instances and saved to %s', i + 1,
                         model_filepath)
        else:
            logger.debug('Loading existing model from %s', model_filepath)

        return cls(model_filepath)
Example #13
    def tokenizer(self, text):
        tokens = token_re.findall(text)
        tokens_features = featurize(tokens, self.feature_functions)
        for token_features in tokens_features:
            for feature in token_features:
                yield feature
Example #14
File: ml.py Project: Priya22/tweedr
    def tokenizer(self, text):
        tokens = token_re.findall(text)
        tokens_features = featurize(tokens, self.feature_functions)
        for token_features in tokens_features:
            for feature in token_features:
                yield feature
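
The tokenizer generator in Examples #13 and #14 simply flattens the per-token feature iterables produced by featurize into a single stream of feature strings. A tiny illustration of that flattening on hypothetical featurize output:

# Suppose featurize(['89', 'dead'], fns) produced these per-token features:
tokens_features = [['DIGITS', 'LEN=2'], ['LOWER', 'LEN=4']]
flat = [feature for token_features in tokens_features for feature in token_features]
# flat == ['DIGITS', 'LEN=2', 'LOWER', 'LEN=4'], the same stream the
# generator above yields one feature at a time.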