def interpret_annotation_assist(annotation_assist, judgment_threshold):
    """
    Convert the file produced by the Annotation Assist tool into a set of judgments that can be used by Themis.

    Convert the in-purview column from an integer value to a boolean. Convert the annotation score column to a
    boolean Correct column by applying a threshold. An answer can only be correct if the question is in purview.
    Drop any Q&A pairs that have multiple annotations.

    :param annotation_assist: Annotation Assist judgments
    :type annotation_assist: pandas.DataFrame
    :param judgment_threshold: threshold above which an answer is deemed correct
    :type judgment_threshold: float
    :return: Annotation Assist judgments with a boolean Correct column
    :rtype: pandas.DataFrame
    """
    qa_duplicates = annotation_assist[[QUESTION, ANSWER]].duplicated()
    if any(qa_duplicates):
        n = sum(qa_duplicates)
        logger.warning("Dropping %d Q&A pairs with multiple annotations (%0.3f%%)"
                       % (n, 100.0 * n / len(annotation_assist)))
        annotation_assist.drop_duplicates([QUESTION, ANSWER], keep=False, inplace=True)
    annotation_assist[IN_PURVIEW] = annotation_assist[IN_PURVIEW].astype("bool")
    annotation_assist[CORRECT] = \
        annotation_assist[IN_PURVIEW] & (annotation_assist[ANNOTATION_SCORE] >= judgment_threshold)
    logger.info("Processed %d judgments" % len(annotation_assist))
    return annotation_assist.drop(ANNOTATION_SCORE, axis="columns")

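# A minimal usage sketch for interpret_annotation_assist. The column-name
# constants (in Themis these appear to be string labels on the data files),
# the logger, and the sample annotations below are illustrative assumptions.
import logging

import pandas

logger = logging.getLogger(__name__)
QUESTION, ANSWER, IN_PURVIEW, CORRECT, ANNOTATION_SCORE = \
    "Question", "Answer", "In Purview", "Correct", "Annotation Score"

raw = pandas.DataFrame({
    QUESTION: ["How do I reset my password?", "What is the meaning of life?"],
    ANSWER: ["Click 'Forgot password'.", "42"],
    IN_PURVIEW: [1, 0],              # integer flags as written by Annotation Assist
    ANNOTATION_SCORE: [75.0, 90.0],  # numeric annotator scores
})

judgments = interpret_annotation_assist(raw, judgment_threshold=50)
# The second answer scores above the threshold but is out of purview, so Correct is False.
print(judgments)
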
def judge_sample_handler(args):
    questions = pandas.concat(args.judgments)[[QUESTION]].drop_duplicates()
    sample = pandas.merge(questions, args.frequency, on=QUESTION, how="left")
    n = len(sample)
    logger.info("%d judged questions" % n)
    m = sum(sample[FREQUENCY].isnull())
    if m:
        logger.warning("Missing frequencies for %d questions (%0.3f%%)" % (m, 100.0 * m / n))
    print_csv(QuestionFrequencyFileType.output_format(sample))

def precision(judgments, t):
    # Frequency-weighted precision: correctly answered frequency over in-purview
    # frequency, among questions answered at or above confidence threshold t.
    s = judgments[judgments[CONFIDENCE] >= t]
    correct = sum(s[s[CORRECT]][FREQUENCY])
    in_purview = sum(s[s[IN_PURVIEW]][FREQUENCY])
    try:
        return correct / float(in_purview)
    except ZeroDivisionError:
        logger.warning("No in-purview questions at threshold level %0.3f" % t)
        return None

def drop_missing(systems_data):
    # systems_data.isnull() returns a DataFrame, so test its values rather than
    # iterating over column labels, which is what any(systems_data.isnull()) does.
    if systems_data.isnull().values.any():
        n = len(systems_data)
        systems_data = systems_data.dropna()
        m = n - len(systems_data)
        if m:
            logger.warning("Dropping %d of %d question/answer pairs missing information (%0.3f%%)"
                           % (m, n, 100.0 * m / n))
    return systems_data

def questions_attempted(judgments, t):
    # Frequency-weighted fraction of all in-purview questions attempted at or
    # above confidence threshold t.
    s = judgments[judgments[CONFIDENCE] >= t]
    in_purview_attempted = sum(s[s[IN_PURVIEW]][FREQUENCY])
    total_in_purview = sum(judgments[judgments[IN_PURVIEW]][FREQUENCY])
    try:
        return in_purview_attempted / float(total_in_purview)
    except ZeroDivisionError:
        logger.warning("No in-purview questions attempted at threshold level %0.3f" % t)
        return None

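# A hedged usage sketch showing how precision and questions_attempted trace out
# a precision-versus-questions-attempted curve as the confidence threshold
# varies. The column-name constants, logger, and sample judgments below are
# illustrative assumptions, not part of the original module.
import logging

import pandas

logger = logging.getLogger(__name__)
CONFIDENCE, CORRECT, IN_PURVIEW, FREQUENCY = "Confidence", "Correct", "In Purview", "Frequency"

judgments = pandas.DataFrame({
    CONFIDENCE: [0.9, 0.7, 0.4, 0.2],
    IN_PURVIEW: [True, True, True, False],
    CORRECT: [True, False, True, False],
    FREQUENCY: [10, 5, 3, 2],
})

for t in sorted(judgments[CONFIDENCE].unique(), reverse=True):
    # At the highest threshold only the most confident (correct) answer remains,
    # so precision is 1.0 while the fraction of questions attempted drops.
    print(t, precision(judgments, t), questions_attempted(judgments, t))
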
def corpus_from_trec(checkpoint_filename, trec_directory, checkpoint_frequency, max_docs):
    trec_filenames = sorted(glob.glob(os.path.join(trec_directory, "*.xml")))[:max_docs]
    checkpoint = get_items("TREC files", trec_filenames,
                           TrecFileCheckpoint(checkpoint_filename, checkpoint_frequency),
                           parse_trec_file, checkpoint_frequency)
    if checkpoint.invalid:
        n = len(trec_filenames)
        # Use 100.0 so the percentage is not truncated by integer division, as elsewhere in this module.
        logger.warning("%d of %d TREC files are invalid (%0.3f%%)"
                       % (checkpoint.invalid, n, 100.0 * checkpoint.invalid / n))
    # I'm not sure why I'm getting duplicates after a restart.
    return from_csv(checkpoint_filename).drop_duplicates().drop(TrecFileCheckpoint.TREC_FILENAME, axis="columns")

def get_pau_mapping(question):
    if "predefinedAnswerUnit" in question:
        return question["predefinedAnswerUnit"]
    elif "mappedQuestion" in question:
        question_id = question["mappedQuestion"]["id"]
        try:
            mapped_question = questions[question_id]
        except KeyError:
            logger.warning("Question %s mapped to non-existent question %s" % (question["id"], question_id))
            return None
        return get_pau_mapping(mapped_question)
    else:
        return None

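# get_pau_mapping reads `questions` as a free variable, so in the source it is
# presumably nested inside a function that builds a {question id: question record}
# mapping from XMGR. The hand-built mapping below is an invented stand-in for
# that environment.
import logging

logger = logging.getLogger(__name__)

questions = {
    "q1": {"id": "q1", "predefinedAnswerUnit": "pau-123"},
    "q2": {"id": "q2", "mappedQuestion": {"id": "q1"}},
    "q3": {"id": "q3", "mappedQuestion": {"id": "missing"}},
}

assert get_pau_mapping(questions["q2"]) == "pau-123"  # follows the mapping chain to q1's PAU
assert get_pau_mapping(questions["q3"]) is None       # logs a warning about the dangling mapping
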
def __call__(self, filename):
    if os.path.isfile(filename):
        collated = super(self.__class__, self).__call__(filename)
        m = sum(collated[collated[IN_PURVIEW] == False][CORRECT])
        if m:
            n = len(collated)
            logger.warning("%d out of %d question/answer pairs in %s are marked as out of purview but correct (%0.3f%%)"
                           % (m, n, filename, 100.0 * m / n))
        return collated
    else:
        logger.info("{0} does not exist".format(filename))
        return None

def corpus_statistics(corpus):
    """
    Generate statistics for the corpus.

    :param corpus: corpus generated by 'xmgr corpus' command
    :type corpus: pandas.DataFrame
    :return: answers in corpus, tokens in the corpus, histogram of answer length in tokens
    :rtype: (int, int, dict(int, int))
    """
    answers = len(corpus)
    token_frequency = FreqDist([len(word_tokenize(BeautifulSoup(answer, "lxml").text)) for answer in corpus[ANSWER]])
    histogram = {}
    for length, count in token_frequency.items():
        histogram[length] = histogram.get(length, 0) + count
    # token_frequency maps an answer length to the number of answers of that length,
    # so the total token count is the sum of length times count, not of the keys alone.
    tokens = sum(length * count for length, count in token_frequency.items())
    n = sum(corpus.duplicated(ANSWER_ID))
    if n:
        logger.warning("%d duplicated answer IDs (%0.3f%%)" % (n, 100.0 * n / answers))
    return answers, tokens, histogram

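# A small smoke test for corpus_statistics. It assumes the surrounding module
# imports pandas, BeautifulSoup, FreqDist, and word_tokenize as below, that
# nltk's "punkt" tokenizer models are installed (nltk.download("punkt")), and
# that ANSWER/ANSWER_ID are the corpus column labels; the two-answer corpus is
# invented.
import logging

import pandas
from bs4 import BeautifulSoup
from nltk import FreqDist
from nltk.tokenize import word_tokenize

logger = logging.getLogger(__name__)
ANSWER, ANSWER_ID = "Answer", "Answer Id"

corpus = pandas.DataFrame({
    ANSWER_ID: ["a1", "a2"],
    ANSWER: ["<p>Reset your password from the login page.</p>", "<p>Contact support.</p>"],
})
answers, tokens, histogram = corpus_statistics(corpus)
print(answers, tokens, histogram)  # 2 answers, total token count, {answer length: answer count}
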
def in_purview_disagreement(systems_data):
    """
    Return collated data where in-purview judgments are not unanimous for a question.

    These questions' purview should be rejudged to make them consistent.

    :param systems_data: collated results for all systems
    :type systems_data: pandas.DataFrame
    :return: subset of collated data where the purview judgments are not unanimous for a question
    :rtype: pandas.DataFrame
    """
    question_groups = systems_data[[QUESTION, IN_PURVIEW]].groupby(QUESTION)
    index = question_groups.filter(lambda qg: len(qg[IN_PURVIEW].unique()) > 1).index
    purview_disagreement = systems_data.loc[index]
    m = len(purview_disagreement[QUESTION].drop_duplicates())
    if m:
        n = len(systems_data[QUESTION].drop_duplicates())
        logger.warning("%d out of %d questions have non-unanimous in-purview judgments (%0.3f%%)"
                       % (m, n, 100.0 * m / n))
    return purview_disagreement

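# Sketch: two systems disagree on whether question q1 is in purview, so its rows
# come back for re-judging while the unanimous q2 does not. The column-name
# constants, logger, and frame are illustrative assumptions.
import logging

import pandas

logger = logging.getLogger(__name__)
QUESTION, IN_PURVIEW = "Question", "In Purview"

systems_data = pandas.DataFrame({
    QUESTION: ["q1", "q1", "q2", "q2"],
    IN_PURVIEW: [True, False, True, True],
})
print(in_purview_disagreement(systems_data))  # only the q1 rows, plus a logged warning
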
def collate_handler(parser, args):
    labeled_qa_pairs = answer_labels(parser, args)
    judgments = pandas.concat(args.judgments)
    all_systems = []
    for label, qa_pairs in labeled_qa_pairs:
        # Only consider the questions listed in the frequency file.
        qa_pairs = qa_pairs[qa_pairs[QUESTION].isin(args.frequency[QUESTION])]
        collated = add_judgments_and_frequencies_to_qa_pairs(qa_pairs, judgments, args.frequency,
                                                             args.remove_newlines)
        collated[SYSTEM] = label
        all_systems.append(collated)
    collated = pandas.concat(all_systems)
    n = len(collated)
    logger.info("%d question/answer pairs" % n)
    for column, s in [(ANSWER, "answers"), (IN_PURVIEW, "in purview judgments"), (CORRECT, "correctness judgments")]:
        m = sum(collated[column].isnull())
        if m:
            logger.warning("%d question/answer pairs out of %d missing %s (%0.3f%%)" % (m, n, s, 100.0 * m / n))
    # This will print a warning if any in-purview judgments are not unanimous for a given question.
    in_purview_disagreement(collated)
    print_csv(CollatedFileType.output_format(collated))

def get_answers_from_usage_log(questions, qa_pairs_from_logs):
    """
    Get answers returned by WEA to questions by looking them up in the usage log.

    Each question in the Q&A pairs must have a unique answer.

    :param questions: questions to look up in the usage logs
    :type questions: pandas.DataFrame
    :param qa_pairs_from_logs: question/answer pairs extracted from user logs
    :type qa_pairs_from_logs: pandas.DataFrame
    :return: Question, Answer, and Confidence
    :rtype: pandas.DataFrame
    """
    answers = pandas.merge(questions, qa_pairs_from_logs, on=QUESTION, how="left")
    missing_answers = answers[answers[ANSWER].isnull()]
    if len(missing_answers):
        logger.warning("%d questions without answers" % len(missing_answers))
    logger.info("Answered %d questions" % len(answers))
    answers = answers[[QUESTION, ANSWER, CONFIDENCE]].sort_values([QUESTION, CONFIDENCE], ascending=[True, False])
    return answers.set_index(QUESTION)

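# Minimal sketch of looking questions up in the usage-log Q&A pairs: q2 has no
# logged answer, so a warning is logged and its row comes back with NaN values.
# Column constants, logger, and frames are illustrative assumptions.
import logging

import pandas

logger = logging.getLogger(__name__)
QUESTION, ANSWER, CONFIDENCE = "Question", "Answer", "Confidence"

questions = pandas.DataFrame({QUESTION: ["q1", "q2"]})
qa_pairs_from_logs = pandas.DataFrame({QUESTION: ["q1"], ANSWER: ["a1"], CONFIDENCE: [0.8]})
print(get_answers_from_usage_log(questions, qa_pairs_from_logs))
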
def extract_question_answer_pairs_from_usage_logs(usage_log):
    """
    Extract questions and answers from usage logs, adding question frequency information.

    We are assuming here that a given question always elicits the same answer. Print a warning if this is not the
    case and drop answers to make the answers unique. It is arbitrary which answer is dropped.

    :param usage_log: QuestionsData.csv usage log
    :type usage_log: pandas.DataFrame
    :return: Q&A pairs with question frequency information
    :rtype: pandas.DataFrame
    """
    frequency = question_frequency(usage_log)
    qa_pairs = usage_log.drop_duplicates(subset=[QUESTION, ANSWER])
    m = sum(qa_pairs.duplicated(QUESTION))
    if m:
        n = len(frequency)
        logger.warning("%d questions of %d have multiple answers (%0.3f%%), only keeping one answer per question"
                       % (m, n, 100.0 * m / n))
        qa_pairs = qa_pairs.drop_duplicates(QUESTION)
    qa_pairs = pandas.merge(qa_pairs, frequency, on=QUESTION)
    logger.info("%d question/answer pairs" % len(qa_pairs))
    return qa_pairs

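# Sketch for extract_question_answer_pairs_from_usage_logs. question_frequency is
# not shown in this section; the stand-in below, which counts usage-log rows per
# question, is an assumption about its behavior, as are the column constants,
# logger, and sample log.
import logging

import pandas

logger = logging.getLogger(__name__)
QUESTION, ANSWER, FREQUENCY = "Question", "Answer", "Frequency"


def question_frequency(usage_log):
    # Hypothetical stand-in: one row per question with its occurrence count.
    return usage_log.groupby(QUESTION).size().reset_index(name=FREQUENCY)


usage_log = pandas.DataFrame({
    QUESTION: ["q1", "q1", "q2"],
    ANSWER: ["a1", "a1 variant", "a2"],  # q1 elicited two different answers
})
print(extract_question_answer_pairs_from_usage_logs(usage_log))  # one answer per question, with Frequency
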