def main():
    """Train an SVM-based tree regressor on SemEval data and write predictions.

    Command-line driver (task C): reads a training file and a test file,
    fits a CountVectorizer + TF-IDF + chi2 selection + BinaryTreeRegressor
    pipeline, and writes tab-separated (id, topic, rate) rows to an output
    file whose name encodes the C and k hyper-parameters.
    """
    # Make stdout UTF-8 capable regardless of the console's own encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('-i', '--input', help='Input file', required=True)
    arg_parser.add_argument('-t', '--test', help='Test file', required=True)
    arg_parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    arg_parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    arg_parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = arg_parser.parse_args()

    # Encoding matches the SemEval data release — TODO confirm against reader.
    train_data = read_semeval_regression(args.input, encoding='windows-1252')

    feature_analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    model = Pipeline([
        ('vect', CountVectorizer(analyzer=feature_analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test_data = read_test_data(args.test, encoding='windows-1252')
    regressor = model.fit(train_data[0], train_data[1])
    predictions = regressor.predict(test_data[2])

    # Output name encodes hyper-parameters so different runs do not collide.
    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test_data[0], test_data[1], predictions):
            print(id_, topic, rate, sep='\t', file=outfile)
def main():
    """Train a per-topic regression quantifier (task E) and write three outputs.

    Flattens topic-grouped training data, fits a RegressionQuantifier that
    wraps a TF-IDF + chi2 + BinaryTreeRegressor pipeline, then writes plain,
    train-corrected, and test-corrected quantification estimates — one output
    file per correction strategy.
    """
    # Make stdout UTF-8 capable regardless of the console's own encoding.
    sys.stdout = codecs.getwriter("utf8")(sys.stdout.buffer)

    cli = argparse.ArgumentParser(description="")
    cli.add_argument("-i", "--input", help="Input file", required=True)
    cli.add_argument("-t", "--test", help="Test file", required=True)
    cli.add_argument("-o", "--output", help="Output filename prefix", required=True)
    cli.add_argument("-c", "--c", help="C value for SVM", type=float, default=1.0)
    cli.add_argument("-k", "--k", help="Number of features to keep", type=int, default=1000)
    args = cli.parse_args()

    grouped = read_semeval_quantification_regression(args.input, encoding="windows-1252")

    # Flatten the topic -> (texts, labels) mapping into parallel lists.
    texts, labels, topics = [], [], []
    for topic, (topic_texts, topic_labels) in grouped.items():
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend(topic for _ in topic_labels)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    base_pipeline = Pipeline([
        ("vect", CountVectorizer(analyzer=analyzer)),
        ("tfidf", TfidfTransformer()),
        ("sel", SelectKBest(chi2, k=args.k)),
        ("clf", BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding="windows-1252")

    quantifier = RegressionQuantifier(base_pipeline)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    def out_name(variant):
        # One output file per correction variant; name encodes hyper-parameters.
        return "%sc%f-k%i-%s-E.output" % (args.output, args.c, args.k, variant)

    with open(out_name("plain"), "w", encoding="utf8") as plain_file, \
            open(out_name("corrected_train"), "w", encoding="utf8") as train_file, \
            open(out_name("corrected_test"), "w", encoding="utf8") as test_file:
        streams = (plain_file, train_file, test_file)
        for topic in sorted(quantification):
            # quantification[topic] is (plain, corrected_train, corrected_test).
            for stream, estimates in zip(streams, quantification[topic]):
                print(topic, *estimates, sep="\t", file=stream)
def main():
    """Train a sentiment classifier (SemEval task A or B) and write predictions.

    Task A (default) is positive/negative/neutral classification; task B
    (--binary) is topic-based positive-vs-negative. Fits a CountVectorizer +
    TF-IDF + chi2 selection + LinearSVC pipeline and writes tab-separated
    predictions to a file whose name encodes the hyper-parameters and task.

    Fixes over the original: the pipeline was fit twice on identical data
    (the first result was discarded), doubling training time; the redundant
    fit is removed. A typo in the --binary help text is also corrected.
    """
    # Make stdout UTF-8 capable regardless of the console's own encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-b', '--binary', help='Polarity classification, i.e., positive vs negative (default: positive/negative/neutral classification)', action='store_true')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_classification(args.input, encoding='windows-1252')
    if args.binary:
        # Drop neutral examples for the binary (task B) setting.
        data = filter_polarity_classification(data)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', LinearSVC(C=args.c)),
    ])

    test = read_test_data(args.test, args.binary, encoding='windows-1252', topic=args.binary)

    # Fit exactly once (the original called pipeline.fit twice on the same data).
    classifier = pipeline.fit(data[0], data[1])
    y = classifier.predict(test[1])

    task = 'B' if args.binary else 'A'
    with open('%sc%f-k%i-%s.output' % (args.output, args.c, args.k, task), 'w', encoding='utf8') as outfile:
        if args.binary:
            # Task B output includes the topic column.
            for id_, topic, label in zip(test[0], test[2], y):
                print(id_, topic, label, sep='\t', file=outfile)
        else:
            for id_, label in zip(test[0], y):
                print(id_, label, sep='\t', file=outfile)
def main():
    """Run classification-based quantification (task D) and write per-method outputs.

    Trains a probability-calibrated linear SVC inside a cross-validated
    Quantifier (leave-one-out unless --folds is given), predicts prevalences
    per test topic, and writes one output file per quantification method
    (CC, ACC, PCC, PACC).
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('-i', '--input', help='Input file', required=True)
    cli.add_argument('-r', '--train', help='Additional training only data')
    cli.add_argument('-f', '--folds', help='Number of folds (default: leave one out)', type=int)
    cli.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    cli.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    cli.add_argument('-l', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    cli.add_argument('-t', '--test', help='Test file', required=True)
    cli.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    cli.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    cli.add_argument('-o', '--output', help='Output filename prefix', required=True)
    args = cli.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)
    train = None
    if args.train:
        train = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose)
    test = read_semeval_quantification_classification(args.test, encoding='windows-1252', verbose=args.verbose, delimiter='\t')

    # Pool every topic (plus optional extra training data) into one training set.
    X, y = [], []
    for topic_texts, topic_labels in data.values():
        X.extend(topic_texts)
        y.extend(topic_labels)
    if train is not None:
        for topic_texts, topic_labels in train.values():
            X.extend(topic_texts)
            y.extend(topic_labels)

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        n_folds = args.folds
    else:
        # Leave-one-out: one fold per training example.
        logging.debug('Leave one out %i' % len(y))
        n_folds = len(y)
    quantifier = Quantifier(learner, reference_label=args.reference_label, n_folds=n_folds, seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])
    fitted = pipeline.fit(X, y)

    # NOTE(review): true_prevalences is computed but never used downstream.
    true_prevalences = dict()
    results = dict()
    for key, (topic_texts, topic_labels) in test.items():
        true_prevalences[key] = topic_labels.count(args.reference_label) / len(topic_labels)
        results[key] = fitted.predict(topic_texts)

    def out_name(method):
        # One output file per quantification method; name encodes hyper-parameters.
        return '%sc%f-k%i-%s-D.output' % (args.output, args.c, args.k, method)

    with open(out_name('cc'), 'w', encoding='utf8') as ccfile, \
            open(out_name('acc'), 'w', encoding='utf8') as accfile, \
            open(out_name('pcc'), 'w', encoding='utf8') as pccfile, \
            open(out_name('pacc'), 'w', encoding='utf8') as paccfile:
        # Route each result row to the file matching its method tag.
        by_method = {'CC': ccfile, 'ACC': accfile, 'PCC': pccfile, 'PACC': paccfile}
        for topic in sorted(results):
            for result in results[topic]:
                stream = by_method.get(result[0])
                if stream is not None:
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=stream)
def main():
    """Classification-based quantification driver (task D).

    Builds a training pool from topic-grouped SemEval data (optionally
    augmented with --train data), fits a Quantifier over a calibrated linear
    SVC with cross-validation (leave-one-out by default), and emits four
    output files — one per quantification method: CC, ACC, PCC, PACC.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-r', '--train', help='Additional training only data')
    parser.add_argument('-f', '--folds', help='Number of folds (default: leave one out)', type=int)
    parser.add_argument('-s', '--seed', help='Randomization seed (default: 0)', type=int, default=0)
    parser.add_argument('-v', '--verbose', help='Verbose output (default: no)', action='store_true')
    parser.add_argument('-l', '--reference_label', help='Name of label to be quantified', type=str, default='positive')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    data = read_semeval_quantification_classification(args.input, encoding='windows-1252', verbose=args.verbose)
    extra = read_semeval_quantification_classification(args.train, encoding='windows-1252', verbose=args.verbose) if args.train else None
    test = read_semeval_quantification_classification(args.test, encoding='windows-1252', verbose=args.verbose, delimiter='\t')

    # Merge all topic groups (and any extra training-only data) into X/y.
    X = list()
    y = list()
    sources = [data] if extra is None else [data, extra]
    for source in sources:
        for key in source:
            X.extend(source[key][0])
            y.extend(source[key][1])

    logging.debug('Training set size %i' % len(y))
    logging.debug('Number of test sets %i' % len(test))

    learner = SVC(C=args.c, kernel='linear', probability=True)
    if args.folds:
        logging.debug('Folds %i' % args.folds)
        cv_folds = args.folds
    else:
        # Default: leave-one-out cross-validation over the training pool.
        logging.debug('Leave one out %i' % len(y))
        cv_folds = len(y)
    quantifier = Quantifier(learner, reference_label=args.reference_label, n_folds=cv_folds, seed=args.seed)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('chi2', SelectKBest(chi2, k=args.k)),
        ('clf', quantifier),
    ])
    fitted_quantifier = pipeline.fit(X, y)

    # NOTE(review): true_prevalences is collected but never written anywhere.
    true_prevalences = dict()
    results = dict()
    for key in test:
        topic_texts, topic_labels = test[key]
        true_prevalences[key] = topic_labels.count(args.reference_label) / len(topic_labels)
        results[key] = fitted_quantifier.predict(topic_texts)

    def output_path(method):
        return '%sc%f-k%i-%s-D.output' % (args.output, args.c, args.k, method)

    with open(output_path('cc'), 'w', encoding='utf8') as ccfile, \
            open(output_path('acc'), 'w', encoding='utf8') as accfile, \
            open(output_path('pcc'), 'w', encoding='utf8') as pccfile, \
            open(output_path('pacc'), 'w', encoding='utf8') as paccfile:
        method_to_file = {'CC': ccfile, 'ACC': accfile, 'PCC': pccfile, 'PACC': paccfile}
        for topic in sorted(results):
            for result in results[topic]:
                destination = method_to_file.get(result[0])
                if destination is not None:
                    print('%s\t%0.3f\t%0.3f' % (topic, result[2], 1 - result[2]), file=destination)
def main():
    """Train a sentiment classifier (SemEval task A or B) and write predictions.

    Task A (default) is positive/negative/neutral classification; task B
    (--binary) is topic-based positive-vs-negative. Fits a CountVectorizer +
    TF-IDF + chi2 selection + LinearSVC pipeline and writes tab-separated
    predictions to a file whose name encodes the hyper-parameters and task.

    Fixes over the original: the pipeline was fit twice on identical data
    (the first result was discarded), doubling training time; the redundant
    fit is removed. A typo in the --binary help text is also corrected.
    """
    # Make stdout UTF-8 capable regardless of the console's own encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument(
        '-b', '--binary',
        help='Polarity classification, i.e., positive vs negative (default: positive/negative/neutral classification)',
        action='store_true')
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_classification(args.input, encoding='windows-1252')
    if args.binary:
        # Drop neutral examples for the binary (task B) setting.
        data = filter_polarity_classification(data)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', LinearSVC(C=args.c)),
    ])

    test = read_test_data(args.test, args.binary, encoding='windows-1252', topic=args.binary)

    # Fit exactly once (the original called pipeline.fit twice on the same data).
    classifier = pipeline.fit(data[0], data[1])
    y = classifier.predict(test[1])

    task = 'B' if args.binary else 'A'
    with open('%sc%f-k%i-%s.output' % (args.output, args.c, args.k, task), 'w', encoding='utf8') as outfile:
        if args.binary:
            # Task B output includes the topic column.
            for id_, topic, label in zip(test[0], test[2], y):
                print(id_, topic, label, sep='\t', file=outfile)
        else:
            for id_, label in zip(test[0], y):
                print(id_, label, sep='\t', file=outfile)
def main():
    """Per-topic regression quantification driver (task E).

    Reads topic-grouped training data, fits a RegressionQuantifier around a
    TF-IDF + chi2 + BinaryTreeRegressor pipeline, predicts quantification
    estimates for the test topics, and writes three files: plain estimates,
    train-corrected estimates, and test-corrected estimates.
    """
    # Make stdout UTF-8 capable regardless of the console's own encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    by_topic = read_semeval_quantification_regression(args.input, encoding='windows-1252')

    # Unroll the topic -> (texts, labels) mapping into three parallel lists.
    all_texts = list()
    all_labels = list()
    all_topics = list()
    for topic in by_topic:
        t_texts, t_labels = by_topic[topic]
        all_texts.extend(t_texts)
        all_labels.extend(t_labels)
        all_topics.extend(topic for _ in t_labels)

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    inner_pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding='windows-1252')

    quantifier = RegressionQuantifier(inner_pipeline)
    quantifier.fit(all_texts, all_labels, all_topics)
    quantification = quantifier.predict(test_texts, test_topics)

    def output_path(variant):
        # Filenames encode hyper-parameters plus the correction variant.
        return '%sc%f-k%i-%s-E.output' % (args.output, args.c, args.k, variant)

    with open(output_path('plain'), 'w', encoding='utf8') as plainfile, \
            open(output_path('corrected_train'), 'w', encoding='utf8') as corrected_trainfile, \
            open(output_path('corrected_test'), 'w', encoding='utf8') as corrected_testfile:
        for topic in sorted(quantification):
            # Each entry is a (plain, corrected_train, corrected_test) triple.
            plain, corrected_train, corrected_test = quantification[topic]
            print(topic, *plain, sep='\t', file=plainfile)
            print(topic, *corrected_train, sep='\t', file=corrected_trainfile)
            print(topic, *corrected_test, sep='\t', file=corrected_testfile)