Code example #1
import argparse
import logging

from sklearn import linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

# Quantifier and read_semeval_quantification_classification are project-local
# helpers; they are assumed to be importable from the surrounding package.


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--train', help='Additional training only data')
    parser.add_argument('-f',
                        '--folds',
                        help='Number of folds (default: 50)',
                        type=int,
                        default=50)
    parser.add_argument('-s',
                        '--seed',
                        help='Randomization seed (default: 0)',
                        type=int,
                        default=0)
    parser.add_argument('-v',
                        '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-r',
                        '--reference_label',
                        help='Name of label to be quantified',
                        type=str,
                        default='positive')
    parser.add_argument('-k',
                        '--k',
                        help='Number of features to keep',
                        type=int,
                        default=1000)
    parser.add_argument('-c',
                        '--c',
                        help='C parameter for the SVM (default: 1.0)',
                        type=float,
                        default=1.0)
    parser.add_argument(
        '-l',
        '--learner',
        help='Base learner to use (default: svm, options: svm, lr)',
        type=str,
        default='svm')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input,
                                                      encoding='windows-1252',
                                                      verbose=args.verbose)

    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    print(
        '"Test set","Category","True prevalence","Method","Predicted prevalence"'
    )
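    # Leave-one-out over test sets: each test set in the input is held out in
    # turn, and all remaining test sets (plus any additional training-only
    # data) are used for training.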
    for test_key in data:

        tests = list()
        X = list()
        y = list()
        for key in data:
            if key == test_key:
                tests.append([key, data[key][0], data[key][1]])
            else:
                X.extend(data[key][0])
                y.extend(data[key][1])
        if train is not None:
            for key in train:
                X.extend(train[key][0])
                y.extend(train[key][1])

        logging.debug('Training set size %i' % len(y))

        logging.debug('Number of test sets %i' % len(tests))

        for test in tests:
            logging.debug('Test %s size %i' % (test[0], len(test[1])))

        if args.learner == 'svm':
            learner = svm.SVC(kernel='linear', probability=True, C=args.c)
        else:
            learner = linear_model.LogisticRegression(C=args.c)

        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds,
                                seed=args.seed)

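        # Feature pipeline: bag-of-words counts, tf-idf weighting, chi-squared
        # selection of the top k features, then the quantifier wrapping the
        # base learner.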
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=args.k)),
            ('clf', quantifier),
        ])

        quantifier = pipeline.fit(X, y)

        true_prevalences = dict()
        results = dict()

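        # Record the true prevalence of the reference label in each held-out
        # test set and collect the quantifier's prevalence estimates.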
        for testname, texts, labels in tests:
            true_prevalences[testname] = labels.count(
                args.reference_label) / len(labels)
            results[testname] = quantifier.predict(texts)

        for testname in results:
            for result in results[testname]:
                print('"%s","%s",%0.3f,"%s",%0.3f' %
                      (testname, result[1], true_prevalences[testname],
                       result[0], result[2]))
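

if __name__ == '__main__':
    main()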
Code example #2
import argparse
import logging

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Quantifier, read_semeval_quantification_classification and get_rich_analyzer
# are project-local helpers; they are assumed to be importable from the
# surrounding package.


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds', help='Number of folds (default: leave one out)', type=int)
    parser.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    parser.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    parser.add_argument('-l', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c', '--c', help='C parameter for the SVM (default: 1.0)', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)

    if args.train:
        train = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    test = read_semeval_quantification_classification(args.test, encoding='windows-1252', verbose=args.verbose,
                                                      delimiter='\t')

    X = list()
    y = list()
    for key in data:
        X.extend(data[key][0])
        y.extend(data[key][1])
    if train is not None:
        for key in train:
            X.extend(train[key][0])
            y.extend(train[key][1])

    logging.debug('Training set size %i' % len(y))

    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)

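    # Use the requested number of cross-validation folds; if -f is not given,
    # fall back to leave-one-out (one fold per training example).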
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        quantifier = Quantifier(learner, reference_label=args.reference_label,
                                n_folds=args.folds,
                                seed=args.seed)
    else:
        logging.debug('Leave one out %i' % len(y))
        quantifier = Quantifier(learner, reference_label=args.reference_label,
                                n_folds=len(y),
                                seed=args.seed)

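    # Analyzer combining word n-grams (sizes 2 and 3) with character 4-grams.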
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])

    quantifier = pipeline.fit(X, y)

    true_prevalences = dict()
    results = dict()

    for key in test:
        texts, labels = test[key]
        true_prevalences[key] = labels.count(args.reference_label) / len(labels)
        results[key] = quantifier.predict(texts)

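    # Write one tab-separated output file per quantification method
    # (CC, ACC, PCC, PACC): topic, predicted prevalence of the reference
    # label, and its complement.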
    with open('%sc%f-k%i-cc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as ccfile, \
            open('%sc%f-k%i-acc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as accfile, \
            open('%sc%f-k%i-pcc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as pccfile, \
            open('%sc%f-k%i-pacc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as paccfile:
        topics = list(results)
        topics.sort()
        for topic in topics:
            for result in results[topic]:
                if result[0] == 'CC':
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=ccfile)
                elif result[0] == 'ACC':
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=accfile)
                elif result[0] == 'PCC':
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=pccfile)
                elif result[0] == 'PACC':
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=paccfile)
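

if __name__ == '__main__':
    main()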
Code example #3
import argparse
import logging

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Quantifier, read_semeval_quantification_classification and get_rich_analyzer
# are project-local helpers; they are assumed to be importable from the
# surrounding package.


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f',
                        '--folds',
                        help='Number of folds (default: leave one out)',
                        type=int)
    parser.add_argument('-s',
                        '--seed',
                        help='Randomization seed (default: 0)',
                        type=int,
                        default=0)
    parser.add_argument('-v',
                        '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-l',
                        '--reference_label',
                        help='Name of label to be quantified',
                        type=str,
                        default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c',
                        '--c',
                        help='C parameter for the SVM (default: 1.0)',
                        type=float,
                        default=1.0)
    parser.add_argument('-k',
                        '--k',
                        help='Number of features to keep',
                        type=int,
                        default=1000)
    parser.add_argument('-o',
                        '--output',
                        help='Output filename prefix',
                        required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input,
                                                      encoding='windows-1252',
                                                      verbose=args.verbose)

    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    test = read_semeval_quantification_classification(args.test,
                                                      encoding='windows-1252',
                                                      verbose=args.verbose,
                                                      delimiter='\t')

    X = list()
    y = list()
    for key in data:
        X.extend(data[key][0])
        y.extend(data[key][1])
    if train is not None:
        for key in train:
            X.extend(train[key][0])
            y.extend(train[key][1])

    logging.debug('Training set size %i' % len(y))

    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)

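    # Use the requested number of cross-validation folds; if -f is not given,
    # fall back to leave-one-out (one fold per training example).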
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds,
                                seed=args.seed)
    else:
        logging.debug('Leave one out %i' % len(y))
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=len(y),
                                seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

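    # Feature pipeline: n-gram counts, tf-idf weighting, chi-squared selection
    # of the top k features, then the quantifier wrapping the SVM.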
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])

    quantifier = pipeline.fit(X, y)

    true_prevalences = dict()
    results = dict()

    for key in test:
        texts, labels = test[key]
        true_prevalences[key] = labels.count(
            args.reference_label) / len(labels)
        results[key] = quantifier.predict(texts)

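    # Write one tab-separated output file per quantification method
    # (CC, ACC, PCC, PACC): topic, predicted prevalence of the reference
    # label, and its complement.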
    with open('%sc%f-k%i-cc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as ccfile, \
            open('%sc%f-k%i-acc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as accfile, \
            open('%sc%f-k%i-pcc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as pccfile, \
            open('%sc%f-k%i-pacc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as paccfile:
        topics = list(results)
        topics.sort()
        for topic in topics:
            for result in results[topic]:
                if result[0] == 'CC':
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]),
                          file=ccfile)
                elif result[0] == 'ACC':
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]),
                          file=accfile)
                elif result[0] == 'PCC':
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]),
                          file=pccfile)
                elif result[0] == 'PACC':
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]),
                          file=paccfile)
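

if __name__ == '__main__':
    main()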
Code example #4
import argparse
import logging

from sklearn import linear_model, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

# Quantifier and read_semeval_quantification_classification are project-local
# helpers; they are assumed to be importable from the surrounding package.


def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds', help='Number of folds (default: 50)', type=int, default=50)
    parser.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    parser.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    parser.add_argument('-r', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    parser.add_argument('-c', '--c', help='C parameter for the SVM (default: 1.0)', type=float, default=1.0)
    parser.add_argument('-l', '--learner', help='Base learner to use (default: svm, options: svm, lr)', type=str,
                        default='svm')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)

    if args.train:
        train = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    print('"Test set","Category","True prevalence","Method","Predicted prevalence"')
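    # Leave-one-out over test sets: each test set in the input is held out in
    # turn, and all remaining test sets (plus any additional training-only
    # data) are used for training.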
    for test_key in data:

        tests = list()
        X = list()
        y = list()
        for key in data:
            if key == test_key:
                tests.append([key, data[key][0], data[key][1]])
            else:
                X.extend(data[key][0])
                y.extend(data[key][1])
        if train is not None:
            for key in train:
                X.extend(train[key][0])
                y.extend(train[key][1])

        logging.debug('Training set size %i' % len(y))

        logging.debug('Number of test sets %i' % len(tests))

        for test in tests:
            logging.debug('Test %s size %i' % (test[0], len(test[1])))

        if args.learner == 'svm':
            learner = svm.SVC(kernel='linear', probability=True, C=args.c)
        else:
            learner = linear_model.LogisticRegression(C=args.c)

        quantifier = Quantifier(learner, reference_label=args.reference_label,
                                n_folds=args.folds,
                                seed=args.seed)

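        # Feature pipeline: bag-of-words counts, tf-idf weighting, chi-squared
        # selection of the top k features, then the quantifier wrapping the
        # base learner.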
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=args.k)),
            ('clf', quantifier),
        ])

        quantifier = pipeline.fit(X, y)

        true_prevalences = dict()
        results = dict()

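        # Record the true prevalence of the reference label in each held-out
        # test set and collect the quantifier's prevalence estimates.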
        for testname, texts, labels in tests:
            true_prevalences[testname] = labels.count(args.reference_label) / len(labels)
            results[testname] = quantifier.predict(texts)

        for testname in results:
            for result in results[testname]:
                print('"%s","%s",%0.3f,"%s",%0.3f' % (
                    testname, result[1], true_prevalences[testname], result[0], result[2]))
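

if __name__ == '__main__':
    main()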