def main():
    """Train a regression pipeline on SemEval tweets and write one prediction per line.

    Reads training and test files (windows-1252 encoded), fits a
    CountVectorizer -> tf-idf -> chi2 feature selection -> binary-tree
    regressor pipeline, and writes tab-separated (id, topic, rate) rows.
    """
    # Force UTF-8 output regardless of the terminal locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    train_data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(train_data[0], train_data[1])
    predictions = regressor.predict(test[2])

    # Output name encodes the hyper-parameters so runs don't overwrite each other.
    outname = '%sc%f-k%i-C.output' % (args.output, args.c, args.k)
    with open(outname, 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], predictions):
            print(id_, topic, rate, sep='\t', file=outfile)
def __init__(self, pipeline=None):
    """Store the estimation pipeline.

    When *pipeline* is None, fall back to a BinaryTreeRegressor wrapping a
    LinearSVC with C=100.0 (the module's default configuration).
    """
    if pipeline is None:
        pipeline = BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
    self._pipeline = pipeline
class RegressionQuantifier(object):
    """Group-wise quantifier with prevalence correction for ordinal regression.

    Fits a pipeline on all data and, via leave-one-group-out cross-validation,
    estimates how the classifier's predicted prevalence of each rate deviates
    from the true one; at prediction time this correction is applied per group.
    """

    def __init__(self, pipeline=None):
        """Store the pipeline; default to a BinaryTreeRegressor over LinearSVC(C=100.0)."""
        if pipeline is None:
            pipeline = BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
        self._pipeline = pipeline

    def fit(self, X, y, groups):
        """Fit on (X, y) and learn per-rate prevalence correction factors.

        X, y, groups are parallel sequences; y holds discrete rate values and
        groups assigns each sample to a group (e.g. a topic).
        """
        self._true_global_prevalences = defaultdict(float)
        self._values = sorted(set(y))
        total = len(y)
        for rate in self._values:
            self._true_global_prevalences[rate] = y.count(rate) / total

        # Leave-one-group-out: predict each group's samples with a model
        # trained on every other group, accumulating predicted counts.
        self._estimated_global_prevalences = defaultdict(float)
        for group in set(groups):
            held_out = [text for text, g in zip(X, groups) if g == group]
            rest_X = [text for text, g in zip(X, groups) if g != group]
            rest_y = [label for label, g in zip(y, groups) if g != group]
            fold_model = clone(self._pipeline).fit(rest_X, rest_y)
            predictions = fold_model.predict(held_out)
            # NOTE(review): assumes predict() returns a list (supports .count) — confirm
            for rate in self._values:
                self._estimated_global_prevalences[rate] += predictions.count(rate)
        for rate in self._values:
            self._estimated_global_prevalences[rate] /= total

        self._model = self._pipeline.fit(X, y)

    def predict(self, X, groups):
        """Return {group: (plain, train-corrected, test-corrected)} prevalence lists.

        Each element is a list of prevalences aligned with the sorted rate
        values seen at fit time; corrected variants are renormalized to sum to 1.
        """
        predictions = self._model.predict(X)

        # Prevalence of each rate across the whole test set.
        test_global_prevalences = defaultdict(float)
        for rate in self._values:
            test_global_prevalences[rate] = predictions.count(rate) / len(X)

        quantifications = dict()
        for group in set(groups):
            group_predictions = [p for p, g in zip(predictions, groups) if g == group]
            simple_prevalences = []
            corrected_prevalences = []
            test_corrected_prevalences = []
            for rate in self._values:
                prevalence = group_predictions.count(rate) / len(group_predictions)
                simple_prevalences.append(prevalence)
                # Scale by true/estimated ratio; leave unscaled when the
                # estimate is zero (no information to correct with).
                estimated = self._estimated_global_prevalences[rate]
                if estimated != 0:
                    corrected_prevalences.append(
                        prevalence * self._true_global_prevalences[rate] / estimated)
                else:
                    corrected_prevalences.append(prevalence)
                test_estimate = test_global_prevalences[rate]
                if test_estimate != 0:
                    test_corrected_prevalences.append(
                        prevalence * self._true_global_prevalences[rate] / test_estimate)
                else:
                    test_corrected_prevalences.append(prevalence)
            # Renormalize each corrected distribution to sum to one.
            norm = sum(corrected_prevalences)
            corrected_prevalences = [p / norm for p in corrected_prevalences]
            norm = sum(test_corrected_prevalences)
            test_corrected_prevalences = [p / norm for p in test_corrected_prevalences]
            quantifications[group] = (simple_prevalences,
                                      corrected_prevalences,
                                      test_corrected_prevalences)
        return quantifications
def __init__(self, pipeline=None):
    """Remember the pipeline to quantify with.

    A missing *pipeline* defaults to BinaryTreeRegressor(base_estimator=
    LinearSVC(C=100.0)), matching this module's other constructors.
    """
    self._pipeline = (BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
                      if pipeline is None else pipeline)
class RegressionQuantifier(object):
    """Per-group prevalence quantifier built on a regression pipeline.

    Training computes, through leave-one-group-out cross-validation, the ratio
    between true and predicted global prevalence for every rate; prediction
    reports raw, train-corrected, and test-corrected prevalences per group.
    """

    def __init__(self, pipeline=None):
        """Keep the given pipeline, or a default BinaryTreeRegressor/LinearSVC(C=100.0)."""
        self._pipeline = (BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
                          if pipeline is None else pipeline)

    def fit(self, X, y, groups):
        """Learn correction factors from (X, y, groups) and fit the final model.

        The three arguments are parallel sequences: texts, discrete rate
        labels, and group identifiers.
        """
        n = len(y)
        self._values = sorted(set(y))
        self._true_global_prevalences = defaultdict(float)
        for value in self._values:
            self._true_global_prevalences[value] = y.count(value) / n

        self._estimated_global_prevalences = defaultdict(float)
        # Cross-validate by group: train without a group, predict it, and
        # accumulate the predicted counts of each rate.
        for current in set(groups):
            inside = [t for t, g in zip(X, groups) if g == current]
            outside_X = [t for t, g in zip(X, groups) if g != current]
            outside_y = [lbl for lbl, g in zip(y, groups) if g != current]
            fold = clone(self._pipeline)
            guesses = fold.fit(outside_X, outside_y).predict(inside)
            # NOTE(review): assumes predict() yields a list (has .count) — confirm
            for value in self._values:
                self._estimated_global_prevalences[value] += guesses.count(value)
        for value in self._values:
            self._estimated_global_prevalences[value] /= n

        self._model = self._pipeline.fit(X, y)

    def predict(self, X, groups):
        """Map each group to (plain, train-corrected, test-corrected) prevalences.

        Lists are ordered by the sorted rate values from fit; the two
        corrected lists are normalized to sum to one.
        """
        guesses = self._model.predict(X)

        test_global_prevalences = defaultdict(float)
        for value in self._values:
            test_global_prevalences[value] = guesses.count(value) / len(X)

        result = dict()
        for current in set(groups):
            local = [p for p, g in zip(guesses, groups) if g == current]
            plain = []
            train_corrected = []
            test_corrected = []
            for value in self._values:
                share = local.count(value) / len(local)
                plain.append(share)
                est = self._estimated_global_prevalences[value]
                # A zero estimate gives no basis for correction: pass through.
                train_corrected.append(
                    share * self._true_global_prevalences[value] / est if est != 0 else share)
                test_est = test_global_prevalences[value]
                test_corrected.append(
                    share * self._true_global_prevalences[value] / test_est
                    if test_est != 0 else share)
            # Renormalize both corrected distributions.
            total = sum(train_corrected)
            train_corrected = [v / total for v in train_corrected]
            total = sum(test_corrected)
            test_corrected = [v / total for v in test_corrected]
            result[current] = (plain, train_corrected, test_corrected)
        return result
def main():
    """Train a RegressionQuantifier on SemEval data and write per-topic prevalences.

    Produces three tab-separated output files (plain, train-corrected, and
    test-corrected prevalence estimates), one row per topic.
    """
    # Force UTF-8 output regardless of the terminal locale.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_quantification_regression(args.input, encoding='windows-1252')

    # Flatten the per-topic mapping into parallel text/label/topic lists.
    texts = []
    labels = []
    topics = []
    for topic in data:
        topic_texts, topic_labels = data[topic]
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend([topic] * len(topic_labels))

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test, encoding='windows-1252')

    quantifier = RegressionQuantifier(pipeline)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    sorted_topics = sorted(quantification)
    # File names encode the hyper-parameters so runs don't overwrite each other.
    plain_name = '%sc%f-k%i-plain-E.output' % (args.output, args.c, args.k)
    train_name = '%sc%f-k%i-corrected_train-E.output' % (args.output, args.c, args.k)
    test_name = '%sc%f-k%i-corrected_test-E.output' % (args.output, args.c, args.k)
    with open(plain_name, 'w', encoding='utf8') as plainfile, \
            open(train_name, 'w', encoding='utf8') as corrected_trainfile, \
            open(test_name, 'w', encoding='utf8') as corrected_testfile:
        for topic in sorted_topics:
            plain, corrected_train, corrected_test = quantification[topic]
            print(topic, *plain, sep='\t', file=plainfile)
            print(topic, *corrected_train, sep='\t', file=corrected_trainfile)
            print(topic, *corrected_test, sep='\t', file=corrected_testfile)