def evaluate_segmented_ml(language_model, classifier, dataset_fnames,
                          output_fname, segment_filtering=None):
    """
    Produces an output file containing the ranking of document pairs together
    with predicted relevance labels. Segment-to-segment similarities form a
    feature vector for each document pair; a pre-learned classifier maps that
    vector to a relevance label, and its decision-function value provides the
    ranking score. If segment_filtering is not None, a text summarization
    technique is used for the filtering of <Thread> segments. Note that the ml
    approach expects all training samples to have the same number of active
    segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames,
                                       segment_filtering=segment_filtering)):
            # One similarity per (orgquestion segment, thread segment) pair of
            # active segments, in a fixed traversal order, so the feature
            # vector lines up with the one seen at training time.
            features = [
                language_model.similarity(question_segment, thread_segment)
                for question_segment in orgquestion.segments
                if question_segment.active
                for thread_segment in thread.segments
                if thread_segment.active]
            ranking_score = classifier.decision_function([features])[0]
            predicted_label = classifier.predict([features])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n"
                              % (orgquestion.id, thread.id,
                                 repr(ranking_score),
                                 "true" if predicted_label else "false"))
def train_segmented_ml(language_model, dataset_fnames, segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    Each training sample is the vector of segment-to-segment similarities
    between the active segments of an <OrgQuestion> and a <Thread>; the target
    is the pair's relevance label. If segment_filtering is not None, a text
    summarization technique is used for the filtering of <Thread> segments.
    Note that the ml approach expects all training samples to have the same
    number of active segments.
    """
    feature_vectors = []
    relevance_labels = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        # Fixed traversal order keeps feature positions consistent across
        # samples (and with evaluation time).
        feature_vectors.append([
            language_model.similarity(question_segment, thread_segment)
            for question_segment in orgquestion.segments
            if question_segment.active
            for thread_segment in thread.segments
            if thread_segment.active])
        relevance_labels.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(feature_vectors, relevance_labels)
    return classifier
def produce_gold_results(dataset_fnames, output_fname):
    """
    Produces gold results from an input (dev) datasets and stores the results
    in an output file.

    For each <OrgQuestion>, the attached threads are ranked by their gold
    relevance label, and one tab-separated line per (orgquestion, thread) pair
    is written: id, thread id, 1-based rank, a normalized gold score in
    (0, 1], and "true"/"false" for the relevance label.
    """
    with open(output_fname, "wt") as output_file:
        # Preserve first-seen order of orgquestion ids while grouping all
        # (relevance, thread id) pairs per orgquestion.
        orgquestion_ids = []
        orgquestion_threads = {}
        for orgquestion, (thread, relevant) in zip(
                segment_orgquestions(dataset_fnames),
                segment_threads(dataset_fnames)):
            if orgquestion.id not in orgquestion_threads:
                orgquestion_threads[orgquestion.id] = []
                orgquestion_ids.append(orgquestion.id)
            orgquestion_threads[orgquestion.id].append((relevant, thread.id))
        for orgquestion_id in orgquestion_ids:
            threads = orgquestion_threads[orgquestion_id]
            # First sort: order threads by descending relevance, keeping each
            # thread's original position via enumerate. Elements are
            # (original_index, (relevant, thread_id)).
            sorted_threads = sorted(enumerate(threads),
                                    key=lambda thread: thread[1][0],
                                    reverse=True)
            # Second sort: re-sort by the original position (thread[1][0] is
            # original_index here) so output lines appear in dataset order,
            # while `rank` (the index into sorted_threads) still reflects the
            # relevance ranking.
            for rank, (_, (relevant, thread_id)) in sorted(
                    enumerate(sorted_threads),
                    key=lambda thread: thread[1][0]):
                # Normalized score: best rank -> 1.0, worst -> 1/len.
                gold_score = (len(sorted_threads) - rank) / len(sorted_threads)
                output_file.write(
                    "%s\t%s\t%d\t%s\t%s\n"
                    % (orgquestion_id, thread_id, rank + 1, gold_score,
                       "true" if relevant else "false"))
def evaluate(language_model, dataset_fnames, output_fname):
    """Produces an output file that contains the ranking of document pairs."""
    with open(output_fname, "wt") as output_file:
        document_pairs = zip(segment_orgquestions(dataset_fnames),
                             segment_threads(dataset_fnames))
        for orgquestion, (thread, _) in document_pairs:
            # The language model scores the whole documents directly.
            similarity = language_model.compare(orgquestion, thread)
            record = "%s\t%s\t0\t%s\ttrue\n" % (orgquestion.id, thread.id,
                                                repr(similarity))
            output_file.write(record)
def evaluate_segmented_aggregation(language_model, classifier, dataset_fnames,
                                   output_fname, aggregate_tier1_segments,
                                   aggregate_tier2_segments, thread_first=True,
                                   segment_filtering=None):
    """
    Produces an output file that contains the ranking of document pairs and
    predicted relevance labels. The segmented non-ML version computes
    similarity between segments and then performs a two-level reduction step
    to derive document similarity. If thread_first is True, the reduction is
    first performed over <Thread> segments and then over <OrgQuestion>
    segments rather than the other way around. If segment_filtering is not
    None, a text summarization technique is used for the filtering of
    <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
                in zip(segment_orgquestions(dataset_fnames),
                       segment_threads(dataset_fnames,
                                       segment_filtering=segment_filtering)):
            # tier1 is reduced first (inner), tier2 second (outer).
            if thread_first:
                inner_document, outer_document = thread, orgquestion
            else:
                inner_document, outer_document = orgquestion, thread
            outer_results = []
            for outer_segment in outer_document.segments:
                inner_results = []
                for inner_segment in inner_document.segments:
                    # similarity() is always called as (orgquestion segment,
                    # thread segment), regardless of the reduction order.
                    if thread_first:
                        similarity = language_model.similarity(outer_segment,
                                                               inner_segment)
                    else:
                        similarity = language_model.similarity(inner_segment,
                                                               outer_segment)
                    inner_results.append(
                        [similarity, outer_segment, inner_segment])
                inner_aggregate = aggregate_tier1_segments(inner_results,
                                                           language_model)
                LOGGER.debug("Aggregating subresults: %s -> %s",
                             inner_results, inner_aggregate)
                outer_results.append(inner_aggregate)
            document_aggregate = aggregate_tier2_segments(outer_results,
                                                          language_model)
            LOGGER.debug("Aggregating results: %s -> %s",
                         outer_results, document_aggregate)
            # The aggregate's first element is the document similarity score.
            test_score = document_aggregate[0]
            test_class = classifier.predict([[test_score]])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n"
                              % (orgquestion.id, thread.id, repr(test_score),
                                 "true" if test_class else "false"))
def train_segmented_aggregation(language_model, dataset_fnames,
                                aggregate_tier1_segments,
                                aggregate_tier2_segments, thread_first=True,
                                segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    The segmented non-ML version computes similarity between segments and then
    performs a two-level reduction step to derive document similarity. If
    thread_first is True, the reduction is first performed over <Thread>
    segments and then over <OrgQuestion> segments rather than the other way
    around. If segment_filtering is not None, a text summarization technique
    is used for the filtering of <Thread> segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        # tier1 is reduced first (inner), tier2 second (outer).
        if thread_first:
            inner_document, outer_document = thread, orgquestion
        else:
            inner_document, outer_document = orgquestion, thread
        outer_results = []
        for outer_segment in outer_document.segments:
            inner_results = []
            for inner_segment in inner_document.segments:
                # similarity() is always called as (orgquestion segment,
                # thread segment), regardless of the reduction order.
                if thread_first:
                    similarity = language_model.similarity(outer_segment,
                                                           inner_segment)
                else:
                    similarity = language_model.similarity(inner_segment,
                                                           outer_segment)
                inner_results.append(
                    [similarity, outer_segment, inner_segment])
            inner_aggregate = aggregate_tier1_segments(inner_results,
                                                       language_model)
            LOGGER.debug("Aggregating subresults: %s -> %s",
                         inner_results, inner_aggregate)
            outer_results.append(inner_aggregate)
        document_aggregate = aggregate_tier2_segments(outer_results,
                                                      language_model)
        LOGGER.debug("Aggregating results: %s -> %s",
                     outer_results, document_aggregate)
        training_scores.append(document_aggregate)
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
def evaluate_nonsegmented(language_model, classifier, dataset_fnames,
                          output_fname, segment_filtering=None):
    """
    Produces an output file that contains the ranking of document pairs and
    predicted relevance labels. The non-segmented version disregards
    segmentation and computes similarity directly between documents. If
    segment_filtering is not None, a text summarization technique is used for
    the filtering of <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) in zip(
                segment_orgquestions(dataset_fnames),
                segment_threads(dataset_fnames,
                                segment_filtering=segment_filtering)):
            # Whole-document similarity is both the ranking score and the
            # single feature the classifier labels.
            similarity = language_model.similarity(orgquestion, thread)
            label = "true" if classifier.predict([[similarity]])[0] \
                else "false"
            output_file.write("%s\t%s\t0\t%s\t%s\n"
                              % (orgquestion.id, thread.id, repr(similarity),
                                 label))
def train_nonsegmented(language_model, dataset_fnames, segment_filtering=None):
    """
    Trains a classifier that maps document similarity to relevance labels.
    The non-segmented version disregards segmentation and computes similarity
    directly between documents. If segment_filtering is not None, a text
    summarization technique is used for the filtering of <Thread> segments.
    """
    # FIX: the default was previously False, which is inconsistent with every
    # sibling train_*/evaluate_* function (all default to None) and with the
    # docstring's "is not None" contract — False would be forwarded to
    # segment_threads() as if it were a filtering technique.
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames,
                                   segment_filtering=segment_filtering)):
        # A single whole-document similarity is the only feature per sample.
        training_scores.append(
            [language_model.similarity(orgquestion, thread)])
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier