def evaluate_segmented_ml(language_model,
                          classifier,
                          dataset_fnames,
                          output_fname,
                          segment_filtering=None):
    """
        Produces an output file that contains the ranking of document pairs and
        predicted relevance labels.  This is done by computing similarity
        between segments, approximating the relevance of a document pair using
        a pre-learned classifier, and producing a ranking based on the
        classifier certainty.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments. Note that the ML approach
        expects all training samples to have the same number of active segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
            results = []
            for orgquestion_segment in orgquestion.segments:
                if not orgquestion_segment.active:
                    continue
                for thread_segment in thread.segments:
                    if not thread_segment.active:
                        continue
                    results.append(
                        language_model.similarity(orgquestion_segment,
                                                  thread_segment))
            test_score = classifier.decision_function([results])[0]
            test_class = classifier.predict([results])[0]
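            # Output columns: OrgQuestion ID, Thread ID, rank (a constant 0
            # here), score, and predicted relevance label.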
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))


def train_segmented_ml(language_model, dataset_fnames, segment_filtering=None):
    """
        Trains a classifier that maps document similarity to relevance labels.
        This is done by computing similarity between segments and then
        learning to classify the segment similarities as relevant / non-relevant.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments. Note that the ML approach
        expects all training samples to have the same number of active segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
        in zip(segment_orgquestions(dataset_fnames),
               segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
        results = []
        for orgquestion_segment in orgquestion.segments:
            if not orgquestion_segment.active:
                continue
            for thread_segment in thread.segments:
                if not thread_segment.active:
                    continue
                results.append(
                    language_model.similarity(orgquestion_segment,
                                              thread_segment))
        training_scores.append(results)
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
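

# A minimal usage sketch of the segmented ML approach above (file names are
# hypothetical; any language model exposing a .similarity() method works):
#
#     classifier = train_segmented_ml(language_model, ["train.xml"])
#     evaluate_segmented_ml(language_model, classifier, ["dev.xml"], "dev.pred")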


def produce_gold_results(dataset_fnames, output_fname):
    """
        Produces gold results from the input (dev) datasets and stores them
        in an output file.
    """
    with open(output_fname, "wt") as output_file:
        orgquestion_ids = []
        orgquestion_threads = {}
        for orgquestion, (thread, relevant) in zip(
                segment_orgquestions(dataset_fnames),
                segment_threads(dataset_fnames)):
            if orgquestion.id not in orgquestion_threads:
                orgquestion_threads[orgquestion.id] = []
                orgquestion_ids.append(orgquestion.id)
            orgquestion_threads[orgquestion.id].append((relevant, thread.id))
        for orgquestion_id in orgquestion_ids:
            threads = orgquestion_threads[orgquestion_id]
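            # Sort once by relevance (descending) to assign ranks, then iterate
            # in the original dataset order (the second sort key is the original
            # index), so the output order matches the input while each thread
            # keeps its relevance rank.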
            sorted_threads = sorted(enumerate(threads),
                                    key=lambda item: item[1][0],
                                    reverse=True)
            for rank, (_, (relevant, thread_id)) in sorted(
                    enumerate(sorted_threads), key=lambda item: item[1][0]):
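                # With n threads, the 0-based ranks 0..n-1 map to linearly
                # decreasing gold scores n/n, (n-1)/n, ..., 1/n.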
                gold_score = (len(sorted_threads) - rank) / len(sorted_threads)
                output_file.write(
                    "%s\t%s\t%d\t%s\t%s\n" %
                    (orgquestion_id, thread_id, rank + 1, gold_score,
                     "true" if relevant else "false"))


def evaluate(language_model, dataset_fnames, output_fname):
    """Produces an output file that contains the ranking of document pairs."""
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread,
                          _) in zip(segment_orgquestions(dataset_fnames),
                                    segment_threads(dataset_fnames)):
            test_score = language_model.compare(orgquestion, thread)
            output_file.write("%s\t%s\t0\t%s\ttrue\n" %
                              (orgquestion.id, thread.id, repr(test_score)))


def evaluate_segmented_aggregation(language_model,
                                   classifier,
                                   dataset_fnames,
                                   output_fname,
                                   aggregate_tier1_segments,
                                   aggregate_tier2_segments,
                                   thread_first=True,
                                   segment_filtering=None):
    """
        Produces an output file that contains the ranking of document pairs and
        predicted relevance labels.  The segmented non-ML version computes
        similarity between segments and then performs a reduction step to
        derive document similarity.

        If thread_first is True, the reduction is first performed over <Thread>
        segments and then over <OrgQuestion> segments rather than the other way
        around.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
            results = []
            tier1 = thread if thread_first else orgquestion
            tier2 = orgquestion if thread_first else thread
            for tier2_segment in tier2.segments:
                subresults = []
                for tier1_segment in tier1.segments:
                    orgquestion_segment = tier2_segment if thread_first else tier1_segment
                    thread_segment = tier1_segment if thread_first else tier2_segment
                    subresults.append([
                        language_model.similarity(orgquestion_segment,
                                                  thread_segment),
                        tier2_segment, tier1_segment
                    ])
                subresults_aggregate = aggregate_tier1_segments(
                    subresults, language_model)
                LOGGER.debug("Aggregating subresults: %s -> %s", subresults,
                             subresults_aggregate)
                results.append(subresults_aggregate)
            results_aggregate = aggregate_tier2_segments(
                results, language_model)
            LOGGER.debug("Aggregating results: %s -> %s", results,
                         results_aggregate)
            test_score = results_aggregate[0]
            test_class = classifier.predict([[test_score]])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))


def train_segmented_aggregation(language_model,
                                dataset_fnames,
                                aggregate_tier1_segments,
                                aggregate_tier2_segments,
                                thread_first=True,
                                segment_filtering=None):
    """
        Trains a classifier that maps document similarity to relevance labels.
        The segmented non-ML version computes similarity between segments and
        then performs a reduction step to derive document similarity.

        If thread_first is True, the reduction is first performed over <Thread>
        segments and then over <OrgQuestion> segments rather than the other way
        around.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
        in zip(segment_orgquestions(dataset_fnames),
               segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
        results = []
        tier1 = thread if thread_first else orgquestion
        tier2 = orgquestion if thread_first else thread
        for tier2_segment in tier2.segments:
            subresults = []
            for tier1_segment in tier1.segments:
                orgquestion_segment = tier2_segment if thread_first else tier1_segment
                thread_segment = tier1_segment if thread_first else tier2_segment
                subresults.append([
                    language_model.similarity(orgquestion_segment,
                                              thread_segment), tier2_segment,
                    tier1_segment
                ])
            subresults_aggregate = aggregate_tier1_segments(
                subresults, language_model)
            LOGGER.debug("Aggregating subresults: %s -> %s", subresults,
                         subresults_aggregate)
            results.append(subresults_aggregate)
        results_aggregate = aggregate_tier2_segments(results, language_model)
        LOGGER.debug("Aggregating results: %s -> %s", results,
                     results_aggregate)
        training_scores.append(results_aggregate)
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
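

# A minimal sketch of aggregators compatible with the two aggregation functions
# above (an assumption inferred from how their outputs are consumed): the
# tier-1 aggregator reduces a list of [similarity, tier2_segment,
# tier1_segment] triples to a single such triple, and the tier-2 aggregator
# reduces the surviving triples to a one-element feature list whose index 0 is
# read as the test score.
def example_aggregate_tier1_max(subresults, language_model):
    # Keep the triple with the highest similarity.
    return max(subresults, key=lambda result: result[0])


def example_aggregate_tier2_mean(results, language_model):
    # Average the winning similarities into a single-feature list.
    return [sum(result[0] for result in results) / len(results)]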


def evaluate_nonsegmented(language_model, classifier, dataset_fnames,
                          output_fname, segment_filtering=None):
    """
        Produces an output file that contains the ranking of document pairs and
        predicted relevance labels.  The non-segmented version disregards
        segmentation and computes similarity directly between documents.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments.
    """
    with open(output_fname, "wt") as output_file:
        for orgquestion, (thread, _) \
            in zip(segment_orgquestions(dataset_fnames),
                   segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
            test_score = language_model.similarity(orgquestion, thread)
            test_class = classifier.predict([[test_score]])[0]
            output_file.write("%s\t%s\t0\t%s\t%s\n" %
                              (orgquestion.id, thread.id, repr(test_score),
                               "true" if test_class else "false"))


def train_nonsegmented(language_model,
                       dataset_fnames,
                       segment_filtering=None):
    """
        Trains a classifier that maps document similarity to relevance labels.
        The non-segmented version disregards segmentation and computes
        similarity directly between documents.

        If segment_filtering is not None, a text summarization technique is
        used for the filtering of <Thread> segments.
    """
    training_scores = []
    training_classes = []
    for orgquestion, (thread, relevant) \
        in zip(segment_orgquestions(dataset_fnames),
               segment_threads(dataset_fnames, segment_filtering=segment_filtering)):
        training_scores.append(
            [language_model.similarity(orgquestion, thread)])
        training_classes.append(relevant)
    classifier = LogisticRegression(
        random_state=LOGISTIC_REGRESSION_RANDOM_STATE)
    classifier.fit(training_scores, training_classes)
    return classifier
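

# An end-to-end sketch of the non-segmented variant (file names hypothetical):
#
#     classifier = train_nonsegmented(language_model, ["train.xml"])
#     evaluate_nonsegmented(language_model, classifier, ["dev.xml"], "dev.pred")
#     produce_gold_results(["dev.xml"], "dev.gold")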


# "TfIdfLanguageModel" is a placeholder name: the original class statement is
# not part of this excerpt, and the method below needs one to parse.
class TfIdfLanguageModel:

    def __init__(self,
                 base_term_weighting="tfidf_ntc_ntc",
                 extra_term_weighting=None):
        """
            Sets up a tf-idf language model using the unannotated SemEval 2016/2017 Task 3 dataset.

            base_term_weighting is either "tfidf_xxx_xxx" (optionally with a pivoted
            normalization slope, "tfidf_xxx_s=<slope>_xxx"), where each "xxx" stands for
            the tf-idf SMART notation described in (Salton, Gerard. 1971a), the first
            triple weighting results and the second weighting queries, or it is
            "bm25_k1=<k1>_k3=<k3>_b=<b>", which specifies that probabilistic Okapi BM25
            scoring with the given parameters will be used instead. You can toggle
            between the two via self.use_tfidf.

            extra_term_weighting specifies additional weighting factors, which stack
            multiplicatively on top of the base term weights during token list vectorization.

            If extra_term_weighting is "godwin", the base weight of a term is multiplied by a factor
            inversely proportional to the sum of the positions at which the term appears in the
            document.

            If extra_term_weighting is "murataetal00_A" or "murataetal00_B", the base weight of a
            term t is multiplied by a factor K_location(d, t) described in (Murata et al., 2000)
            with constants taken for system A or B from section 3. The title and body parameters
            then correspond to the title and body token lists.
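
            Illustrative examples (parameter values assumed): base_term_weighting may be
            "tfidf_ntc_ntc" (the default) or "bm25_k1=1.2_k3=1000_b=0.75".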
        """
        file_handler = logging.FileHandler(LOG_FNAME, encoding='utf8')
        logging.getLogger().addHandler(file_handler)

        # Parse the configuration.
        if re.match(r"tfidf_", base_term_weighting):
            self.use_tfidf = True
        else:
            assert re.match(r"bm25", base_term_weighting)
            self.use_tfidf = False
            self.bm25_k1, self.bm25_k3, self.bm25_b = \
                re.match(r"bm25_k1=([0-9](?:\.[0-9]*)?)_k3=([0-9]*(?:\.[0-9]*)?)_b=([0-9](?:\.[0-9]*)?)",
                         base_term_weighting).groups()
            self.bm25_k1 = float(self.bm25_k1)
            self.bm25_k3 = float(self.bm25_k3)
            self.bm25_b = float(self.bm25_b)

        if self.use_tfidf:
            assert extra_term_weighting in (None, "godwin", "murataetal00_A",
                                            "murataetal00_B")
        else:
            assert extra_term_weighting is None
        self.extra_term_weighting = extra_term_weighting

        if self.use_tfidf:
            self.tfidf_result = {}
            self.tfidf_query = {}
            self.tfidf_result["tf"], self.tfidf_result["df"], self.tfidf_result["norm"], \
                self.tfidf_slope, self.tfidf_query["tf"], self.tfidf_query["df"], \
                self.tfidf_query["norm"] = re.match(r"tfidf_(.)(.)(.)(?:_s=([0-9](?:\.[0-9]*)?))?_(.)(.)(.)",
                                                    base_term_weighting).groups()
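            # E.g. the default "tfidf_ntc_ntc" parses to result weights (n, t, c),
            # no pivoted-normalization slope, and query weights (n, t, c).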
            self.tfidf_result["tf"] = TF_WEIGHTING_METHOD_MAP[
                self.tfidf_result["tf"]]
            self.tfidf_result["df"] = DF_WEIGHTING_METHOD_MAP[
                self.tfidf_result["df"]]
            self.tfidf_result["norm"] = NORMALIZATION_METHOD_MAP[
                self.tfidf_result["norm"]]
            if self.tfidf_result["norm"] in (norm_u, norm_b):
                assert self.tfidf_slope is not None
            if self.tfidf_slope is not None:
                self.tfidf_slope = float(self.tfidf_slope)
            self.tfidf_query["tf"] = TF_WEIGHTING_METHOD_MAP[
                self.tfidf_query["tf"]]
            self.tfidf_query["df"] = DF_WEIGHTING_METHOD_MAP[
                self.tfidf_query["df"]]
            self.tfidf_query["norm"] = NORMALIZATION_METHOD_MAP[
                self.tfidf_query["norm"]]
            assert self.tfidf_query["norm"] not in (norm_u, norm_b)

        # Prepare the BM25 scoring model.
        try:
            with open(BM25_STATS_FNAME, "br") as file:
                self.bm25_avdl = load(file)
        except IOError:
            self.bm25_avdl = {}
            LOGGER.info("preparing the bm25 scoring function statistics")

            self.bm25_avdl["documents"] = mean([sum((len(token) for token in document.tokens)) \
                for document, _ in segment_threads([UNANNOTATED_DATASET_FNAME])])
            LOGGER.info("average document length: %f",
                        self.bm25_avdl["documents"])

            self.bm25_avdl["qsubjects"] = mean([sum((len(token) for token in segment.tokens)) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment == segment.document.qsubject])
            LOGGER.info("average qsubject segment length: %f",
                        self.bm25_avdl["qsubjects"])

            self.bm25_avdl["qbodies"] = mean([sum((len(token) for token in segment.tokens)) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment == segment.document.qbody])
            LOGGER.info("average qbody segment length: %f",
                        self.bm25_avdl["qbodies"])

            self.bm25_avdl["comments"] = mean([sum((len(token) for token in segment.tokens)) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment != segment.document.qsubject and segment != segment.document.qbody])
            LOGGER.info("average comment segment length: %f",
                        self.bm25_avdl["comments"])

            with open(BM25_STATS_FNAME, "bw") as file:
                dump(self.bm25_avdl, file)
            LOGGER.info("done preparing the bm25 scoring function statistics")

        # Prepare the pivoted document normalization tf-idf statistics.
        try:
            with open(PIVOT_STATS_FNAME, "rb") as file:
                self.pivot_stats = load(file)
        except IOError:
            self.pivot_stats = {}
            LOGGER.info(
                "preparing the pivoted document normalization tf-idf statistics"
            )

            self.pivot_stats["documents"] = {}
            self.pivot_stats["documents"]["avgb"] = self.bm25_avdl["documents"]
            self.pivot_stats["documents"]["avgu"] = mean([len(document.terms) \
                for document, _ in segment_threads([UNANNOTATED_DATASET_FNAME])])
            LOGGER.info("average document length: %f",
                        self.pivot_stats["documents"]["avgb"])
            LOGGER.info("average document unique terms: %f",
                        self.pivot_stats["documents"]["avgu"])

            self.pivot_stats["qsubjects"] = {}
            self.pivot_stats["qsubjects"]["avgb"] = self.bm25_avdl["qsubjects"]
            self.pivot_stats["qsubjects"]["avgu"] = mean([len(segment.terms) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment == segment.document.qsubject])
            LOGGER.info("average qsubject segment length: %f",
                        self.pivot_stats["qsubjects"]["avgb"])
            LOGGER.info("average qsubject segment unique terms: %f",
                        self.pivot_stats["qsubjects"]["avgu"])

            self.pivot_stats["qbodies"] = {}
            self.pivot_stats["qbodies"]["avgb"] = self.bm25_avdl["qbodies"]
            self.pivot_stats["qbodies"]["avgu"] = mean([len(segment.terms) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment == segment.document.qbody])
            LOGGER.info("average qbody segment length: %f",
                        self.pivot_stats["qbodies"]["avgb"])
            LOGGER.info("average qbody segment unique terms: %f",
                        self.pivot_stats["qbodies"]["avgu"])

            self.pivot_stats["comments"] = {}
            self.pivot_stats["comments"]["avgb"] = self.bm25_avdl["comments"]
            self.pivot_stats["comments"]["avgu"] = mean([len(segment.terms) \
                for segment in chain.from_iterable(document.segments for document, _ \
                                       in segment_threads([UNANNOTATED_DATASET_FNAME])) \
                if segment != segment.document.qsubject and segment != segment.document.qbody])
            LOGGER.info("average comment segment length: %f",
                        self.pivot_stats["comments"]["avgb"])
            LOGGER.info("average comment segment unique terms: %f",
                        self.pivot_stats["comments"]["avgu"])

            with open(PIVOT_STATS_FNAME, "wb") as file:
                dump(self.pivot_stats, file)
            LOGGER.info(
                "done preparing the pivoted document normalization tf-idf statistics"
            )

        # Prepare the dictionary.
        try:
            self.dictionary = corpora.Dictionary.load(DICTIONARY_FNAME,
                                                      mmap='r')
        except IOError:
            self.dictionary = \
                corpora.Dictionary(segment for segment in chain.from_iterable( \
                    document.segments for document, _ \
                                      in segment_threads([UNANNOTATED_DATASET_FNAME])))
            self.dictionary.save(DICTIONARY_FNAME)

        logging.getLogger().removeHandler(file_handler)