Code example #1
    def filter_messages(cls,
                        data,
                        project_start_date,
                        project_end_date,
                        filter_test_messages=True):
        # Filter out test messages sent by AVF.
        if filter_test_messages:
            data = MessageFilters.filter_test_messages(data)
        else:
            log.debug(
                "Not filtering out test messages (because the pipeline configuration json key "
                "'FilterTestMessages' was set to false)")

        # Filter for runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ])

        # Filter out runs sent outwith the project start and end dates
        time_keys = {
            plan.time_field
            for plan in PipelineConfiguration.RQA_CODING_PLANS
        }
        data = MessageFilters.filter_time_range(data, time_keys,
                                                project_start_date,
                                                project_end_date)

        return data
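
An aside on what the three calls above do conceptually: they behave like simple list filters over message objects. The sketch below is a hypothetical, simplified illustration, not the actual MessageFilters implementation; the function names keep_non_empty and keep_in_time_range are invented here, and it assumes each message object can be read like a dict, the time fields hold ISO 8601 timestamp strings, and start/end are comparable datetime objects.

    from dateutil.parser import isoparse

    def keep_non_empty(messages, raw_fields):
        # Keep message objects that contain a non-empty response to at least one question field.
        return [td for td in messages
                if any(td.get(field) not in (None, "") for field in raw_fields)]

    def keep_in_time_range(messages, time_keys, start, end):
        # Keep message objects whose timestamp, under any of the time keys, falls within the project dates.
        return [td for td in messages
                if any(key in td and start <= isoparse(td[key]) <= end
                       for key in time_keys)]
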
Code example #2
    def auto_code_show_messages(cls, user, data, icr_output_dir,
                                coda_output_dir):
        # Filter out test messages sent by AVF
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Filter for runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

        # Filter out runs sent outwith the project start and end dates
        data = MessageFilters.filter_time_range(
            data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE,
            PipelineConfiguration.PROJECT_END_DATE)

        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Output RQA and follow-up survey messages to Coda
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {},
                    f)

        # Output RQA and follow up messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            rqa_and_follow_up_messages = []
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change
            for td in data:
                if plan.raw_field in td:
                    rqa_and_follow_up_messages.append(td)

            icr_messages = ICRTools.generate_sample_for_icr(
                rqa_and_follow_up_messages, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages,
                    f,
                    headers=[plan.run_id_field, plan.raw_field])

        return data
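
A note on the ICR export above: the sample is drawn with random.Random(cls.ICR_SEED), so the sampling step itself is deterministic and repeated runs over the same data can export the same ICR sample. A minimal standard-library illustration of that property (the population, the seed value 0, and the sample size 10 are arbitrary placeholders):

    import random

    population = ["message_{}".format(i) for i in range(100)]
    # Two generators seeded with the same value draw identical samples.
    sample_a = random.Random(0).sample(population, 10)
    sample_b = random.Random(0).sample(population, 10)
    assert sample_a == sample_b
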
Code example #3
    def filter_rqa_noise_other_project(messages):
        """
        Filters out RQA messages which have been labelled as Noise_Other_Project (NOP).

        :param messages: List of message objects to filter.
        :type messages: list of TracedData
        :return: Filtered list.
        :rtype: list of TracedData
        """

        # Filter out radio question answers labelled as Noise_Other_Project
        data = MessageFilters.filter_rqa_noise_other_project(messages)

        return data
Code example #4
    def generate(data, production_csv_output_path):
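        # Build the production CSV headers: the participant uid plus every raw RQA and survey field, de-duplicated while preserving order.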
        production_keys = ["uid"]
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in production_keys:
                production_keys.append(plan.raw_field)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in production_keys:
                production_keys.append(plan.raw_field)

        not_noise = MessageFilters.filter_noise(data, "noise", lambda x: x)
        with open(production_csv_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(not_noise, f, headers=production_keys)

        return data
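
The filter_noise call above takes a field key and a predicate (here the identity function lambda x: x). Judging only from how it is used in these examples, where the result is named not_noise and the key elsewhere holds a boolean is_noise flag, one plausible reading is that it drops message objects for which the predicate is truthy on that key. The drop_noise function below is a hypothetical standalone equivalent under that assumption, not the library's actual code:

    def drop_noise(messages, noise_key, predicate):
        # Hypothetical sketch: keep message objects that have no noise flag, or whose flag the predicate treats as falsy.
        return [td for td in messages
                if noise_key not in td or not predicate(td[noise_key])]
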
Code example #5
    log.info("Loading the raw data...")
    data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

    log.info("Translating Source Keys...")
    data = TranslateSourceKeys.translate_source_keys(user, data,
                                                     pipeline_configuration)

    if pipeline_configuration.move_ws_messages:
        log.info("Pre-filtering empty message objects...")
        # This is a performance optimisation to save execution time + memory when moving WS messages, by removing
        # the need to mark and process a high volume of empty message objects as 'NR' in WS correction.
        # Empty message objects represent flow runs where the participants never sent a message e.g. from an advert
        # flow run where we asked someone a question but didn't receive a response.
        data = MessageFilters.filter_empty_messages(data, [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ])

        log.info("Moving WS messages...")
        data = WSCorrection.move_wrong_scheme_messages(user, data,
                                                       prev_coded_dir_path)
    else:
        log.info(
            "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
            "json was set to 'false')")

    log.info("Auto Coding...")
    data = AutoCode.auto_code(user, data, pipeline_configuration,
                              icr_output_dir, coded_dir_path)

    log.info("Exporting production CSV...")
Code example #6
        data = WSCorrection.move_wrong_scheme_messages(user, data,
                                                       prev_coded_dir_path)
    else:
        log.info(
            "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
            "json was set to 'false')")

    log.info("Auto Coding...")
    data = AutoCode.auto_code(user, data, pipeline_configuration,
                              icr_output_dir, coded_dir_path)

    log.info("Applying Manual Codes from Coda...")
    data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path)

    log.info("Filtering out Messages labelled as Noise_Other_Channel...")
    data = MessageFilters.filter_noise_other_channel(data)

    log.info("Exporting production CSV...")
    data = ProductionFile.generate(data, production_csv_output_path)

    log.info(
        "Tagging listening group participants & Generating Analysis CSVs...")
    messages_data, individuals_data = AnalysisFile.generate(
        user, data, pipeline_configuration, raw_data_dir,
        csv_by_message_output_path, csv_by_individual_output_path)

    log.info("Writing messages TracedData to file...")
    IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
    with open(messages_json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_jsonl(messages_data, f)
Code example #7
    data = CombineRawDatasets.combine_raw_datasets(user, messages_datasets,
                                                   coalesced_surveys_datasets)

    # Infer which RQA coding plans to use from the operator.
    # This 'hack' is necessary because the RQA coding plans are not yet set in the configuration json.
    if pipeline_configuration.filter_operator == "golis":
        log.info("Running in Bossaso mode")
        PipelineConfiguration.RQA_CODING_PLANS = PipelineConfiguration.BOSSASO_RQA_CODING_PLANS
    else:
        assert pipeline_configuration.filter_operator == "hormud", "FilterOperator must be either 'golis' or 'hormud'"
        log.info("Running in Baidoa mode")
        PipelineConfiguration.RQA_CODING_PLANS = PipelineConfiguration.BAIDOA_RQA_CODING_PLANS

    # Keep only messages coded with the configured mobile network operator.
    if pipeline_configuration.filter_operator is not None:
        data = MessageFilters.filter_operator(
            data, "operator_coded",
            CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(
                pipeline_configuration.filter_operator))

    log.info("Translating Rapid Pro Keys...")
    data = TranslateRapidProKeys.translate_rapid_pro_keys(
        user, data, pipeline_configuration, prev_coded_dir_path)

    log.info("Redirecting WS messages...")
    data = WSCorrection.move_wrong_scheme_messages(user, data,
                                                   prev_coded_dir_path)

    log.info("Auto Coding Messages...")
    data = AutoCodeShowMessages.auto_code_show_messages(
        user, data, pipeline_configuration, icr_output_dir, coded_dir_path)

    log.info("Exporting production CSV...")
Code example #8
    def auto_code_show_messages(cls, user, data, pipeline_configuration,
                                icr_output_dir, coda_output_dir):
        # Filter out test messages sent by AVF.
        if pipeline_configuration.filter_test_messages:
            data = MessageFilters.filter_test_messages(data)
        else:
            log.debug(
                "Not filtering out test messages (because the pipeline configuration json key "
                "'FilterTestMessages' was set to false)")

        # Filter for runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ])

        # Filter out runs sent outwith the project start and end dates
        data = MessageFilters.filter_time_range(
            data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date,
            pipeline_configuration.project_end_date)

        # Skipping auto-assigning noise, as an experiment on this project.
        # If it turns out we need this, uncomment this block.
        # for td in data:
        #     is_noise = True
        #     for rqa_key in cls.RQA_KEYS:
        #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
        #             is_noise = False
        #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

        # TODO: Label each message with channel keys
        # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
        #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

        # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
        not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY,
                                                lambda x: x)

        # Compute the number of RQA messages that were the empty string
        log.debug(
            "Counting the number of empty string messages for each raw radio show field..."
        )
        raw_rqa_fields = []
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in raw_rqa_fields:
                raw_rqa_fields.append(plan.raw_field)
        cls.log_empty_string_stats(data, raw_rqa_fields)

        # Compute the number of survey messages that were the empty string
        log.debug(
            "Counting the number of empty string messages for each survey field..."
        )
        raw_survey_fields = []
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in raw_survey_fields:
                raw_survey_fields.append(plan.raw_field)
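        # De-duplicate to one message object per participant (keyed by uid) before counting empty survey responses.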
        survey_data = dict()
        for td in data:
            survey_data[td["uid"]] = td
        cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

        # Output messages which aren't noise to Coda
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, not_noise,
                                                   plan.raw_field,
                                                   plan.id_field)

            output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field,
                    {}, f)

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = []
            for td in not_noise:
                if plan.raw_field in td:
                    rqa_messages.append(td)

            icr_messages = ICRTools.generate_sample_for_icr(
                rqa_messages, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages,
                    f,
                    headers=[plan.run_id_field, plan.raw_field])

        return data
Code example #9
    log.info("Loading the raw data...")
    data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

    log.info("Translating Rapid Pro Keys...")
    data = TranslateRapidProKeys.translate_rapid_pro_keys(
        user, data, pipeline_configuration)

    if pipeline_configuration.move_ws_messages:
        log.info("Pre-filtering empty message objects...")
        # This is a performance optimisation to save execution time + memory when moving WS messages, by removing
        # the need to mark and process a high volume of empty message objects as 'NR' in WS correction.
        # Empty message objects represent flow runs where the participants never sent a message e.g. from an advert
        # flow run where we asked someone a question but didn't receive a response.
        data = MessageFilters.filter_empty_messages(data, [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ])

        log.info("Moving WS messages...")
        data = WSCorrection.move_wrong_scheme_messages(user, data,
                                                       prev_coded_dir_path)
    else:
        log.info(
            "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
            "json was set to 'false')")

    log.info("Auto Coding...")
    data = AutoCode.auto_code(user, data, pipeline_configuration,
                              icr_output_dir, coded_dir_path)

    log.info("Exporting production CSV...")