Example #1
0
def main():
    """Fit an SVM-based regression pipeline on SemEval training data and
    write tab-separated (id, topic, rate) predictions for the test set
    (subtask C)."""
    # Force UTF-8 on stdout regardless of the console's locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('-i', '--input', help='Input file', required=True)
    arg_parser.add_argument('-t', '--test', help='Test file', required=True)
    arg_parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    arg_parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    arg_parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = arg_parser.parse_args()

    train = read_semeval_regression(args.input, encoding='windows-1252')

    # Word 2/3-grams combined with character 4-grams.
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    # Counts -> tf-idf -> chi2 feature selection -> tree-structured SVM regressor.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(train[0], train[1])

    predictions = regressor.predict(test[2])

    out_name = '%sc%f-k%i-C.output' % (args.output, args.c, args.k)
    with open(out_name, 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], predictions):
            print(id_, topic, rate, sep='\t', file=outfile)
def main():
    """Fit a regression-based quantifier on per-topic SemEval data and write
    plain and corrected quantification estimates for every test topic
    (subtask E)."""
    # Force UTF-8 on stdout regardless of the console's locale.
    sys.stdout = codecs.getwriter("utf8")(sys.stdout.buffer)

    arg_parser = argparse.ArgumentParser(description="")
    arg_parser.add_argument("-i", "--input", help="Input file", required=True)
    arg_parser.add_argument("-t", "--test", help="Test file", required=True)
    arg_parser.add_argument("-o", "--output", help="Output filename prefix", required=True)
    arg_parser.add_argument("-c", "--c", help="C value for SVM", type=float, default=1.0)
    arg_parser.add_argument("-k", "--k", help="Number of features to keep", type=int, default=1000)
    args = arg_parser.parse_args()

    data = read_semeval_quantification_regression(args.input, encoding="windows-1252")

    # Flatten the per-topic dictionary into parallel text/label/topic lists.
    texts = []
    labels = []
    topics = []
    for topic_name in data:
        topic_texts, topic_labels = data[topic_name]
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend([topic_name] * len(topic_labels))

    # Word 2/3-grams combined with character 4-grams.
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ("vect", CountVectorizer(analyzer=analyzer)),
        ("tfidf", TfidfTransformer()),
        ("sel", SelectKBest(chi2, k=args.k)),
        ("clf", BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding="windows-1252")

    quantifier = RegressionQuantifier(pipeline)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    with open("%sc%f-k%i-plain-E.output" % (args.output, args.c, args.k), "w", encoding="utf8") as plainfile, \
            open("%sc%f-k%i-corrected_train-E.output" % (args.output, args.c, args.k), "w",
                 encoding="utf8") as corrected_trainfile, \
            open("%sc%f-k%i-corrected_test-E.output" % (args.output, args.c, args.k), "w",
                 encoding="utf8") as corrected_testfile:
        # One line per topic, in sorted topic order, in each of the three files.
        for topic_name in sorted(quantification):
            plain, corrected_train, corrected_test = quantification[topic_name]
            print(topic_name, *plain, sep="\t", file=plainfile)
            print(topic_name, *corrected_train, sep="\t", file=corrected_trainfile)
            print(topic_name, *corrected_test, sep="\t", file=corrected_testfile)
def main():
    """Train an SVM classifier on SemEval data and write per-tweet polarity
    predictions: subtask B (with topic column) when --binary is given,
    otherwise subtask A."""
    # Force UTF-8 on stdout regardless of the console's locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-b', '--binary',
                        help='Polarity classification, i.e., positive vs negative (default: positive/negative/neutral classification)',
                        action='store_true')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_classification(args.input, encoding='windows-1252')
    if args.binary:
        # Subtask B only distinguishes positive vs negative examples.
        data = filter_polarity_classification(data)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    # Counts -> tf-idf -> chi2 feature selection -> linear SVM.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', LinearSVC(C=args.c)),
    ])

    test = read_test_data(args.test, args.binary, encoding='windows-1252', topic=args.binary)

    # Fit exactly once; the original code fitted the pipeline twice in a row,
    # which doubled training time without changing the resulting model.
    classifier = pipeline.fit(data[0], data[1])

    y = classifier.predict(test[1])

    task = 'B' if args.binary else 'A'

    with open('%sc%f-k%i-%s.output' % (args.output, args.c, args.k, task), 'w', encoding='utf8') as outfile:
        if args.binary:
            # Subtask B output carries an extra topic column.
            for id_, topic, label in zip(test[0], test[2], y):
                print(id_, topic, label, sep='\t', file=outfile)
        else:
            for id_, label in zip(test[0], y):
                print(id_, label, sep='\t', file=outfile)
def main():
    """Train classification-based quantifiers on SemEval data and write
    CC/ACC/PCC/PACC prevalence estimates for every test topic (subtask D)."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds', help='Number of folds (default: leave one out)', type=int)
    parser.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    parser.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    parser.add_argument('-l', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)

    train = None
    if args.train:
        train = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose)

    test = read_semeval_quantification_classification(args.test, encoding='windows-1252', verbose=args.verbose,
                                                      delimiter='\t')

    # Flatten the per-topic dictionaries (training data first, then the
    # optional extra training data) into parallel text/label lists.
    X = []
    y = []
    for corpus in ((data,) if train is None else (data, train)):
        for key in corpus:
            topic_texts, topic_labels = corpus[key]
            X.extend(topic_texts)
            y.extend(topic_labels)

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)

    # Fall back to leave-one-out when no fold count was given.
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        n_folds = args.folds
    else:
        logging.debug('Leave one out %i' % len(y))
        n_folds = len(y)
    quantifier = Quantifier(learner,
                            reference_label=args.reference_label,
                            n_folds=n_folds,
                            seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])

    quantifier = pipeline.fit(X, y)

    true_prevalences = {}
    results = {}
    for key in test:
        texts, labels = test[key]
        true_prevalences[key] = labels.count(args.reference_label) / len(labels)
        results[key] = quantifier.predict(texts)

    with open('%sc%f-k%i-cc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as ccfile, \
            open('%sc%f-k%i-acc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as accfile, \
            open('%sc%f-k%i-pcc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as pccfile, \
            open('%sc%f-k%i-pacc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as paccfile:
        # Route each estimate to the output file of its correction method.
        method_files = {'CC': ccfile, 'ACC': accfile, 'PCC': pccfile, 'PACC': paccfile}
        for topic in sorted(results):
            for result in results[topic]:
                outfile = method_files.get(result[0])
                if outfile is not None:
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=outfile)
def main():
    """Quantify label prevalence per topic with CC/ACC/PCC/PACC corrections
    and write one output file per correction method (subtask D)."""
    argp = argparse.ArgumentParser(description='')
    argp.add_argument('-i', '--input', help='Input file', required=True)
    argp.add_argument('-r', '--train', help='Additional training only data')
    argp.add_argument('-f', '--folds', help='Number of folds (default: leave one out)', type=int)
    argp.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    argp.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    argp.add_argument('-l', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    argp.add_argument('-t', '--test', help='Test file', required=True)
    argp.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    argp.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    argp.add_argument('-o', '--output', help='Output filename prefix', required=True)
    args = argp.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)

    train = None
    if args.train:
        train = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose)

    test = read_semeval_quantification_classification(args.test, encoding='windows-1252', verbose=args.verbose,
                                                      delimiter='\t')

    # Flatten per-topic training dictionaries (extra training data last).
    X = list()
    y = list()
    for corpus in ([data] if train is None else [data, train]):
        for key in corpus:
            X.extend(corpus[key][0])
            y.extend(corpus[key][1])

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)

    # No --folds means leave-one-out cross-validation.
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        fold_count = args.folds
    else:
        logging.debug('Leave one out %i' % len(y))
        fold_count = len(y)
    quantifier = Quantifier(learner, reference_label=args.reference_label,
                            n_folds=fold_count, seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])

    quantifier = pipeline.fit(X, y)

    true_prevalences = dict()
    results = dict()
    for key in test:
        topic_texts, topic_labels = test[key]
        true_prevalences[key] = topic_labels.count(args.reference_label) / len(topic_labels)
        results[key] = quantifier.predict(topic_texts)

    with open('%sc%f-k%i-cc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as ccfile, \
            open('%sc%f-k%i-acc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as accfile, \
            open('%sc%f-k%i-pcc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as pccfile, \
            open('%sc%f-k%i-pacc-D.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as paccfile:

        def emit(handle, topic_name, share):
            # Estimated prevalence of the reference label and its complement.
            print('%s\t%0.3f\t%0.3f' % (topic_name, share, 1 - share), file=handle)

        for topic_name in sorted(results):
            for estimate in results[topic_name]:
                if estimate[0] == 'CC':
                    emit(ccfile, topic_name, estimate[2])
                elif estimate[0] == 'ACC':
                    emit(accfile, topic_name, estimate[2])
                elif estimate[0] == 'PCC':
                    emit(pccfile, topic_name, estimate[2])
                elif estimate[0] == 'PACC':
                    emit(paccfile, topic_name, estimate[2])
def main():
    """Train an SVM classifier on SemEval data and write per-tweet polarity
    predictions: subtask B (with topic column) when --binary is given,
    otherwise subtask A."""
    # Force UTF-8 on stdout regardless of the console's locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument(
        '-b',
        '--binary',
        help=
        'Polarity classification, i.e., positive vs negative (default: positive/negative/neutral classification)',
        action='store_true')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o',
                        '--output',
                        help='Output filename prefix',
                        required=True)
    parser.add_argument('-c',
                        '--c',
                        help='C value for SVM',
                        type=float,
                        default=1.0)
    parser.add_argument('-k',
                        '--k',
                        help='Number of features to keep',
                        type=int,
                        default=1000)
    args = parser.parse_args()

    data = read_semeval_classification(args.input, encoding='windows-1252')
    if args.binary:
        # Subtask B only distinguishes positive vs negative examples.
        data = filter_polarity_classification(data)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    # Counts -> tf-idf -> chi2 feature selection -> linear SVM.
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', LinearSVC(C=args.c)),
    ])

    test = read_test_data(args.test,
                          args.binary,
                          encoding='windows-1252',
                          topic=args.binary)

    # Fit exactly once; the original code fitted the pipeline twice in a row,
    # which doubled training time without changing the resulting model.
    classifier = pipeline.fit(data[0], data[1])

    y = classifier.predict(test[1])

    task = 'B' if args.binary else 'A'

    with open('%sc%f-k%i-%s.output' % (args.output, args.c, args.k, task),
              'w',
              encoding='utf8') as outfile:
        if args.binary:
            # Subtask B output carries an extra topic column.
            for id_, topic, label in zip(test[0], test[2], y):
                print(id_, topic, label, sep='\t', file=outfile)
        else:
            for id_, label in zip(test[0], y):
                print(id_, label, sep='\t', file=outfile)
def main():
    """Fit a regression-based quantifier on per-topic SemEval data and write
    plain and corrected quantification estimates for every test topic
    (subtask E)."""
    # Force UTF-8 on stdout regardless of the console's locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('-i', '--input', help='Input file', required=True)
    arg_parser.add_argument('-t', '--test', help='Test file', required=True)
    arg_parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    arg_parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    arg_parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = arg_parser.parse_args()

    data = read_semeval_quantification_regression(args.input, encoding='windows-1252')

    # Flatten the per-topic dictionary into parallel text/label/topic lists.
    texts = []
    labels = []
    topics = []
    for topic_name in data:
        topic_texts, topic_labels = data[topic_name]
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend([topic_name] * len(topic_labels))

    # Word 2/3-grams combined with character 4-grams.
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding='windows-1252')

    quantifier = RegressionQuantifier(pipeline)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    with open('%sc%f-k%i-plain-E.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as plainfile, \
            open('%sc%f-k%i-corrected_train-E.output' % (args.output, args.c, args.k), 'w',
                 encoding='utf8') as corrected_trainfile, \
            open('%sc%f-k%i-corrected_test-E.output' % (args.output, args.c, args.k), 'w',
                 encoding='utf8') as corrected_testfile:
        # One line per topic, in sorted topic order, in each of the three files.
        for topic_name in sorted(quantification):
            plain, corrected_train, corrected_test = quantification[topic_name]
            print(topic_name, *plain, sep='\t', file=plainfile)
            print(topic_name, *corrected_train, sep='\t', file=corrected_trainfile)
            print(topic_name, *corrected_test, sep='\t', file=corrected_testfile)