def test_read_split_file_invalid_input(self):
        """Malformed split-file input must be rejected with a format error.

        Two fixtures are exercised in turn: ``split_dup_groups_lines`` and
        ``split_dup_objs_lines`` (presumably inputs with a duplicated group
        and a duplicated object, respectively -- the fixtures themselves are
        defined outside this method).
        """
        # Callable form of assertRaises: the parser is invoked by the test
        # framework, so no throwaway binding of the return value is needed.
        self.assertRaises(FeatureMapFileFormatError, read_split_file,
                          self.split_dup_groups_lines)
        self.assertRaises(FeatureMapFileFormatError, read_split_file,
                          self.split_dup_objs_lines)
Example #2
0
def build_problem_data(group_map_files, mapping_file, prediction_field,
                       start_level, include_only, negate, n_processes):
    """Assemble the ProblemData and the initial FeatureVector.

    Every entry of ``group_map_files`` is read with ``read_split_file`` into
    a group-to-objects map and an object-to-group map (one pair per scope).
    Sample names are derived from the objects found at ``start_level`` and
    looked up in the mapping file to fetch each sample's
    ``prediction_field`` value.

    ``include_only`` is either None (every sample is kept) or an indexable
    pair ``(field_name, allowed_values)``. A sample is kept when
    "its ``field_name`` value is in ``allowed_values``" XOR ``negate`` is
    true.

    Returns a ``(problem_data, feature_vector)`` tuple. The feature vector
    holds one FeatureRecord per group at ``start_level``, built from the
    group name, ``start_level`` and the number of objects in that group.
    """
    # One forward map and one reverse map per scope, in file order.
    group_to_object = []
    object_to_group = []
    for scope_file in group_map_files:
        groups_map, objects_map = read_split_file(scope_file)
        group_to_object.append(groups_map)
        object_to_group.append(objects_map)

    start_groups = group_to_object[start_level]

    # Sample names come from the objects themselves rather than from the
    # mapping file, because the mapping file may list samples that have no
    # features at all.
    samplenames = list({parse_object_string_sample(member)
                        for group in start_groups
                        for member in start_groups[group]})

    # Map of sample name -> its properties.
    samplemap = read_mapping_file(mapping_file)

    sample_to_response = {}
    for name in samplenames:
        properties = samplemap[name]
        if include_only is not None:
            matches = properties[include_only[0]] in include_only[1]
            if not (matches ^ negate):
                # Filtered out by the include_only / negate selection.
                continue
        sample_to_response[name] = properties[prediction_field]

    problem_data = ProblemData(group_to_object, object_to_group,
                               sample_to_response, n_processes)

    records = [FeatureRecord(group, start_level, len(start_groups[group]))
               for group in start_groups]
    feature_vector = FeatureVector(records)

    return problem_data, feature_vector
 def test_read_split_file(self):
     """A well-formed split file yields the expected pair of maps.

     The parser result is compared against a 2-tuple: a feature-to-objects
     dict followed by the inverse object-to-feature dict.
     """
     expected_feature_to_objects = {
         'F1': ['A', 'B'],
         'F2': ['C'],
         'F3': ['D', 'E', 'F'],
     }
     expected_object_to_feature = {
         'A': 'F1', 'B': 'F1',
         'C': 'F2',
         'D': 'F3', 'E': 'F3', 'F': 'F3',
     }
     expected = (expected_feature_to_objects, expected_object_to_feature)

     observed = read_split_file(self.split1_lines)
     self.assertEqual(observed, expected)
Example #4
0
def build_problem_data(
    group_map_files,
    mapping_file,
    prediction_field,
    start_level,
    include_only,
    negate,
    n_processes,
    parse_object_string=parse_object_string_sample,
):
    """Validate the inputs and assemble a ProblemData from the input files.

    Parameters
    ----------
    group_map_files : list
        One entry per scope; each entry is handed to ``read_split_file``,
        which must return a (group -> objects dict, object -> group dict)
        pair.
    mapping_file
        Handed to ``read_mapping_file``; the result is indexed by sample
        name and then by field name.
    prediction_field : str
        Mapping-file field whose value becomes each sample's response.
    start_level : int
        Index into ``group_map_files`` of the scope used to discover the
        sample names; must satisfy ``0 <= start_level < len(group_map_files)``.
    include_only : None, list or tuple
        None keeps every sample. Otherwise ``include_only[0]`` is a
        mapping-file field name (string) and ``include_only[1]`` a list of
        strings; a sample is kept when "its field value is in the list"
        XOR ``negate`` is true.
    negate : bool
        Inverts the ``include_only`` selection.
    n_processes : int
        Forwarded unchanged to ``ProblemData``.
    parse_object_string : callable
        Maps an object string to its sample name; also forwarded to
        ``ProblemData``.

    Returns
    -------
    The ``ProblemData`` built from the per-scope maps, the sample -> response
    map, ``n_processes`` and ``parse_object_string``.

    Raises
    ------
    InputTypeError
        If ``include_only`` has the wrong inner types or ``start_level`` is
        out of range. (``check_input_type`` is defined elsewhere and is
        presumably what rejects the simple type mismatches.)
    KeyError
        If a sample derived from the group files is missing from the mapping
        file, or if ``include_only[0]`` / ``prediction_field`` is not a field
        of a sample's mapping-file record.
    """
    # Explicit (name, value, type) triples rather than indexing locals(), so
    # the checks cannot be disturbed by locals introduced above them.
    simple_var_checks = [
        ("n_processes", n_processes, types.IntType),
        ("start_level", start_level, types.IntType),
        ("negate", negate, types.BooleanType),
        ("prediction_field", prediction_field, types.StringType),
        ("include_only", include_only,
         (types.NoneType, types.ListType, types.TupleType)),
        ("group_map_files", group_map_files, types.ListType),
    ]
    for var_name, var_value, var_type in simple_var_checks:
        check_input_type(var_name, var_value, var_type)

    if include_only is not None:
        if not isinstance(include_only[0], types.StringType):
            raise InputTypeError("include_only[0] should be of type string")
        if not isinstance(include_only[1], types.ListType) or not all(
            isinstance(value, types.StringType) for value in include_only[1]
        ):
            raise InputTypeError("include_only[1] should be a list of strings")
    if not 0 <= start_level < len(group_map_files):
        raise InputTypeError(
            "start_level (%s) is not a valid scope index; group_map_files is of length %s"
            % (start_level, len(group_map_files))
        )

    # For each scope, build a map from group to object and vice versa.
    group_to_object = []
    object_to_group = []
    for map_file in group_map_files:
        g_to_o, o_to_g = read_split_file(map_file)
        # Internal sanity checks on the helper's contract (not input checks).
        assert isinstance(g_to_o, types.DictType), "read_split_file did not return a dict type"
        assert isinstance(o_to_g, types.DictType), "read_split_file did not return a dict type"
        group_to_object.append(g_to_o)
        object_to_group.append(o_to_g)

    # Sample names are taken from the objects at start_level rather than
    # from the mapping file, which may hold records without any features.
    samplenames = {
        parse_object_string(obj)
        for objs in group_to_object[start_level].values()
        for obj in objs
    }

    # Map of sample name -> its properties.
    samplemap = read_mapping_file(mapping_file)

    sample_to_response = {}
    for samplename in samplenames:
        # Resolve the sample first, so a sample missing from the mapping file
        # always produces the descriptive error below -- including when an
        # include_only filter is active (previously a bare KeyError escaped
        # from the filter in that case).
        try:
            sample_fields = samplemap[samplename]
        except KeyError:
            raise KeyError(
                "A sample name (%s) found in the group files is not a sample in mapping_file." % samplename
            )

        if include_only is not None:
            try:
                filter_value = sample_fields[include_only[0]]
            except KeyError:
                raise KeyError("include_only[0] is not a field in mapping_file")
            if not ((filter_value in include_only[1]) ^ negate):
                # Excluded by the include_only / negate selection.
                continue

        try:
            sample_to_response[samplename] = sample_fields[prediction_field]
        except KeyError:
            raise KeyError("prediction_field is not a field in mapping_file.")

    problem_data = ProblemData(group_to_object, object_to_group, sample_to_response, n_processes, parse_object_string)

    return problem_data