def main():
    """Leave-one-topic-out quantification over the topics of a SemEval file.

    Each topic in the input is held out in turn; a quantifier is trained on
    the remaining topics (plus optional training-only data) and its predicted
    prevalence of the reference label on the held-out topic is printed as
    CSV on stdout alongside the true prevalence.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds',
                        help='Number of folds (default: 50)',
                        type=int, default=50)
    parser.add_argument('-s', '--seed',
                        help='Randomization seed (default: 0)',
                        type=int, default=0)
    parser.add_argument('-v', '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-r', '--reference_label',
                        help='Name of label to be quantified',
                        type=str, default='positive')
    parser.add_argument('-k', '--k',
                        help='Number of features to keep',
                        type=int, default=1000)
    parser.add_argument('-c', '--c',
                        help='C factor for svm (default: 1.0)',
                        type=float, default=1.0)
    # FIX: corrected "defalut" typo in the help text, and added choices= so
    # argparse rejects unknown learner names up front; previously any value
    # other than 'svm' silently selected logistic regression.
    parser.add_argument('-l', '--learner', choices=['svm', 'lr'],
                        help='Base learner to use (default: svm, options: svm, lr)',
                        type=str, default='svm')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(
        args.input, encoding='windows-1252', verbose=args.verbose)
    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    # CSV header for the per-topic results printed below.
    print('"Test set","Category","True prevalence","Method","Predicted prevalence"')

    for test_key in data:
        # Hold out test_key; every other topic becomes training data.
        tests = []
        X = []
        y = []
        for key in data:
            if key == test_key:
                tests.append([key, data[key][0], data[key][1]])
            else:
                X.extend(data[key][0])
                y.extend(data[key][1])
        # Training-only data is always added to the training set.
        if train is not None:
            for key in train:
                X.extend(train[key][0])
                y.extend(train[key][1])

        logging.debug('Training set size %i' % len(y))
        logging.debug('Number of test sets %i' % len(tests))
        for test in tests:
            logging.debug('Test %s size %i' % (test[0], len(test[1])))

        if args.learner == 'svm':
            learner = svm.SVC(kernel='linear', probability=True, C=args.c)
        else:
            learner = linear_model.LogisticRegression(C=args.c)

        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds, seed=args.seed)
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=args.k)),
            ('clf', quantifier),
        ])
        quantifier = pipeline.fit(X, y)

        true_prevalences = dict()
        results = dict()
        for testname, texts, labels in tests:
            true_prevalences[testname] = labels.count(
                args.reference_label) / len(labels)
            results[testname] = quantifier.predict(texts)

        for testname in results:
            for result in results[testname]:
                # result layout (from use): [0]=method name, [1]=category,
                # [2]=predicted prevalence.
                print('"%s","%s",%0.3f,"%s",%0.3f' %
                      (testname, result[1], true_prevalences[testname],
                       result[0], result[2]))
def main():
    """Train a quantifier on the input topics and quantify a separate test file.

    Trains on all topics of the input file (plus optional training-only
    data), predicts prevalences on every topic of the test file, and writes
    one tab-separated output file per quantification method (CC, ACC, PCC,
    PACC), each line holding: topic, prevalence, 1 - prevalence.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds',
                        help='Number of folds (default: leave one out)',
                        type=int)
    parser.add_argument('-s', '--seed',
                        help='Randomization seed (default: 0)',
                        type=int, default=0)
    parser.add_argument('-v', '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-l', '--reference_label',
                        help='Name of label to be quantified',
                        type=str, default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM',
                        type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep',
                        type=int, default=1000)
    parser.add_argument('-o', '--output', help='Output filename prefix',
                        required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(
        args.input, encoding='windows-1252', verbose=args.verbose)
    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None
    test = read_semeval_quantification_classification(
        args.test, encoding='windows-1252', verbose=args.verbose,
        delimiter='\t')

    X = []
    y = []
    for key in data:
        X.extend(data[key][0])
        y.extend(data[key][1])
    if train is not None:
        for key in train:
            X.extend(train[key][0])
            y.extend(train[key][1])

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds, seed=args.seed)
    else:
        # No fold count given: leave-one-out, i.e. one fold per example.
        logging.debug('Leave one out %i' % len(y))
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=len(y), seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])
    quantifier = pipeline.fit(X, y)

    true_prevalences = dict()
    results = dict()
    for key in test:
        texts, labels = test[key]
        # NOTE(review): true_prevalences is computed but never written to any
        # output file — presumably kept for debugging; confirm before removal.
        true_prevalences[key] = labels.count(
            args.reference_label) / len(labels)
        results[key] = quantifier.predict(texts)

    # FIX: the output-filename format was copy-pasted four times and an
    # if/elif ladder repeated the identical print for each method; build the
    # common prefix once and dispatch through a method -> file table.
    prefix = '%sc%f-k%i' % (args.output, args.c, args.k)
    with open('%s-cc-D.output' % prefix, 'w', encoding='utf8') as ccfile, \
            open('%s-acc-D.output' % prefix, 'w', encoding='utf8') as accfile, \
            open('%s-pcc-D.output' % prefix, 'w', encoding='utf8') as pccfile, \
            open('%s-pacc-D.output' % prefix, 'w', encoding='utf8') as paccfile:
        method_files = {'CC': ccfile, 'ACC': accfile,
                        'PCC': pccfile, 'PACC': paccfile}
        for topic in sorted(results):
            for result in results[topic]:
                # result layout (from use): [0]=method name,
                # [2]=predicted prevalence.
                outfile = method_files.get(result[0])
                if outfile is not None:  # unknown methods are skipped, as before
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]), file=outfile)
def main():
    """Train a quantifier on the input topics and quantify a separate test file.

    Trains on all topics of the input file (plus optional training-only
    data), predicts prevalences on every topic of the test file, and writes
    one tab-separated output file per quantification method (CC, ACC, PCC,
    PACC), each line holding: topic, prevalence, 1 - prevalence.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds',
                        help='Number of folds (default: leave one out)',
                        type=int)
    parser.add_argument('-s', '--seed',
                        help='Randomization seed (default: 0)',
                        type=int, default=0)
    parser.add_argument('-v', '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-l', '--reference_label',
                        help='Name of label to be quantified',
                        type=str, default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM',
                        type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep',
                        type=int, default=1000)
    parser.add_argument('-o', '--output', help='Output filename prefix',
                        required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(
        args.input, encoding='windows-1252', verbose=args.verbose)
    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None
    test = read_semeval_quantification_classification(
        args.test, encoding='windows-1252', verbose=args.verbose,
        delimiter='\t')

    X = []
    y = []
    for key in data:
        X.extend(data[key][0])
        y.extend(data[key][1])
    if train is not None:
        for key in train:
            X.extend(train[key][0])
            y.extend(train[key][1])

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds, seed=args.seed)
    else:
        # No fold count given: leave-one-out, i.e. one fold per example.
        logging.debug('Leave one out %i' % len(y))
        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=len(y), seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])
    quantifier = pipeline.fit(X, y)

    true_prevalences = dict()
    results = dict()
    for key in test:
        texts, labels = test[key]
        # NOTE(review): true_prevalences is computed but never written to any
        # output file — presumably kept for debugging; confirm before removal.
        true_prevalences[key] = labels.count(
            args.reference_label) / len(labels)
        results[key] = quantifier.predict(texts)

    # FIX: the output-filename format was copy-pasted four times and an
    # if/elif ladder repeated the identical print for each method; build the
    # common prefix once and dispatch through a method -> file table.
    prefix = '%sc%f-k%i' % (args.output, args.c, args.k)
    with open('%s-cc-D.output' % prefix, 'w', encoding='utf8') as ccfile, \
            open('%s-acc-D.output' % prefix, 'w', encoding='utf8') as accfile, \
            open('%s-pcc-D.output' % prefix, 'w', encoding='utf8') as pccfile, \
            open('%s-pacc-D.output' % prefix, 'w', encoding='utf8') as paccfile:
        method_files = {'CC': ccfile, 'ACC': accfile,
                        'PCC': pccfile, 'PACC': paccfile}
        for topic in sorted(results):
            for result in results[topic]:
                # result layout (from use): [0]=method name,
                # [2]=predicted prevalence.
                outfile = method_files.get(result[0])
                if outfile is not None:  # unknown methods are skipped, as before
                    print('%s\t%0.3f\t%0.3f' %
                          (topic, result[2], 1 - result[2]), file=outfile)
def main():
    """Leave-one-topic-out quantification over the topics of a SemEval file.

    Each topic in the input is held out in turn; a quantifier is trained on
    the remaining topics (plus optional training-only data) and its predicted
    prevalence of the reference label on the held-out topic is printed as
    CSV on stdout alongside the true prevalence.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds',
                        help='Number of folds (default: 50)',
                        type=int, default=50)
    parser.add_argument('-s', '--seed',
                        help='Randomization seed (default: 0)',
                        type=int, default=0)
    parser.add_argument('-v', '--verbose',
                        help='Verbose output (default: no)',
                        action='store_true')
    parser.add_argument('-r', '--reference_label',
                        help='Name of label to be quantified',
                        type=str, default='positive')
    parser.add_argument('-k', '--k',
                        help='Number of features to keep',
                        type=int, default=1000)
    parser.add_argument('-c', '--c',
                        help='C factor for svm (default: 1.0)',
                        type=float, default=1.0)
    # FIX: corrected "defalut" typo in the help text, and added choices= so
    # argparse rejects unknown learner names up front; previously any value
    # other than 'svm' silently selected logistic regression.
    parser.add_argument('-l', '--learner', choices=['svm', 'lr'],
                        help='Base learner to use (default: svm, options: svm, lr)',
                        type=str, default='svm')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(
        args.input, encoding='windows-1252', verbose=args.verbose)
    if args.train:
        train = read_semeval_quantification_classification(
            args.train, encoding='windows-1252', verbose=args.verbose)
    else:
        train = None

    # CSV header for the per-topic results printed below.
    print('"Test set","Category","True prevalence","Method","Predicted prevalence"')

    for test_key in data:
        # Hold out test_key; every other topic becomes training data.
        tests = []
        X = []
        y = []
        for key in data:
            if key == test_key:
                tests.append([key, data[key][0], data[key][1]])
            else:
                X.extend(data[key][0])
                y.extend(data[key][1])
        # Training-only data is always added to the training set.
        if train is not None:
            for key in train:
                X.extend(train[key][0])
                y.extend(train[key][1])

        logging.debug('Training set size %i' % len(y))
        logging.debug('Number of test sets %i' % len(tests))
        for test in tests:
            logging.debug('Test %s size %i' % (test[0], len(test[1])))

        if args.learner == 'svm':
            learner = svm.SVC(kernel='linear', probability=True, C=args.c)
        else:
            learner = linear_model.LogisticRegression(C=args.c)

        quantifier = Quantifier(learner,
                                reference_label=args.reference_label,
                                n_folds=args.folds, seed=args.seed)
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('chi2', SelectKBest(chi2, k=args.k)),
            ('clf', quantifier),
        ])
        quantifier = pipeline.fit(X, y)

        true_prevalences = dict()
        results = dict()
        for testname, texts, labels in tests:
            true_prevalences[testname] = labels.count(
                args.reference_label) / len(labels)
            results[testname] = quantifier.predict(texts)

        for testname in results:
            for result in results[testname]:
                # result layout (from use): [0]=method name, [1]=category,
                # [2]=predicted prevalence.
                print('"%s","%s",%0.3f,"%s",%0.3f' %
                      (testname, result[1], true_prevalences[testname],
                       result[0], result[2]))