Example #1
0
 def test_feature_set_csv_converter(self):
     '''
     Tests that FeatureSetCsvConverter turns the dict form of a
     FeatureSet into a comma-delimited string of the names given to
     the Feature constructor, keyed by the input name.
     '''
     f1 = Feature('foo')
     f2 = Feature('bar')
     f_set = FeatureSet([f1, f2])
     d = f_set.to_dict()
     c = FeatureSetCsvConverter()
     converted_input = c.convert('xyz', d, '', '')
     # order doesn't matter, so either ordering is acceptable.
     # assertIn reports the actual converted value on failure, which
     # assertTrue on a combined boolean cannot do.
     self.assertIn(
         converted_input,
         [{'xyz': 'foo,bar'}, {'xyz': 'bar,foo'}]
     )
    def test_metadata_correct(self):
        '''
        Checks that the metadata extracted by a FeatureTable matches a
        FeatureSet built by parsing the same tab-delimited file by hand.
        The observation set and parent operation entries are expected
        to be None.
        '''
        resource_path = os.path.join(TESTDIR, 'gene_annotations.tsv')
        t = FeatureTable()
        column_dict = {}
        feature_list = []
        # a context manager guarantees the file handle gets closed
        with open(resource_path) as fh:
            for i, line in enumerate(fh):
                contents = line.strip().split('\t')
                if i == 0:
                    # header line: map the column position to its name,
                    # skipping the first column (the gene name)
                    for j, c in enumerate(contents[1:]):
                        column_dict[j] = c
                else:
                    gene_name = contents[0]
                    attr_dict = {}
                    for j, v in enumerate(contents[1:]):
                        # values that parse as integers become
                        # IntegerAttribute; all others StringAttribute
                        try:
                            v = int(v)
                            attr = IntegerAttribute(v)
                        except ValueError:
                            attr = StringAttribute(v)

                        attr_dict[column_dict[j]] = attr
                    f = Feature(gene_name, attr_dict)
                    feature_list.append(f)
        expected_feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data
        metadata = t.extract_metadata(resource_path)
        self.assertEqual(metadata[FEATURE_SET_KEY], expected_feature_set)
        self.assertIsNone(metadata[OBSERVATION_SET_KEY])
        self.assertIsNone(metadata[PARENT_OP_KEY])
    def test_metadata_correct_case2(self):
        '''
        Typically, the metadata is collected following a successful
        validation.  Note that this test calls `extract_metadata`
        directly, without an explicit validation call beforehand.

        The expected observations come from the header line of the
        file and the expected features from the first column of the
        remaining lines.
        '''
        m = IntegerMatrix()
        resource_path = os.path.join(TESTDIR, 'test_integer_matrix.tsv')
        metadata = m.extract_metadata(resource_path)

        # Parse the test file to ensure we extracted the right content.
        # A single context-managed handle is used so the file is closed.
        with open(resource_path) as fh:
            # the header holds the sample names after the first column
            header = fh.readline()
            samplenames = header.strip().split('\t')[1:]
            # every remaining line starts with the gene name
            gene_list = [row.split('\t')[0] for row in fh]

        obs_list = [Observation(x) for x in samplenames]
        feature_list = [Feature(x) for x in gene_list]

        obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
        feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data

        self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])
        self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
        self.assertIsNone(metadata[PARENT_OP_KEY])
Example #4
0
    def test_metadata_correct(self):
        '''
        Checks the metadata extracted by a FeatureTable for a
        tab-delimited annotation file. The feature set, observation set
        and parent operation entries are all expected to be None.

        The file is still parsed by hand into `expected_feature_set`,
        but no active assertion currently uses that value (see the
        note next to the commented-out assertion below).
        '''
        resource_path = os.path.join(TESTDIR, 'gene_annotations.tsv')
        t = FeatureTable()
        column_dict = {}
        feature_list = []
        # a context manager guarantees the file handle gets closed
        with open(resource_path) as fh:
            for i, line in enumerate(fh):
                contents = line.strip().split('\t')
                if i == 0:
                    # header line: map the column position to its name,
                    # skipping the first column (the gene name)
                    for j, c in enumerate(contents[1:]):
                        column_dict[j] = c
                else:
                    gene_name = contents[0]
                    attr_dict = {}
                    for j, v in enumerate(contents[1:]):
                        # values that parse as integers become
                        # IntegerAttribute; all others StringAttribute
                        try:
                            v = int(v)
                            attr = IntegerAttribute(v)
                        except ValueError:
                            attr = StringAttribute(v)

                        attr_dict[column_dict[j]] = attr
                    f = Feature(gene_name, attr_dict)
                    feature_list.append(f)
        expected_feature_set = FeatureSetSerializer(
            FeatureSet(feature_list)).data
        metadata = t.extract_metadata(resource_path, 'tsv')
        # Commented out when we removed the automatic creation of Feature metadata
        # for FeatureTable resource types. For large files, it was causing issues
        # with exceptionally large JSON failing to store in db table.
        #self.assertEqual(metadata[FEATURE_SET_KEY], expected_feature_set)
        self.assertIsNone(metadata[FEATURE_SET_KEY])
        self.assertIsNone(metadata[OBSERVATION_SET_KEY])
        self.assertIsNone(metadata[PARENT_OP_KEY])
Example #5
0
    def test_metadata_correct_case2(self):
        '''
        Typically, the metadata is collected following a successful
        validation.  However, here we don't validate.  Check that
        it goes and collects the table in the process.

        The expected observations come from the header line of the
        file. The features are parsed from the first column of the
        remaining lines, but the feature set entry of the metadata is
        expected to be None.
        '''
        m = Matrix()
        resource_path = os.path.join(TESTDIR, 'test_matrix.tsv')
        metadata = m.extract_metadata(resource_path, 'tsv')

        # Parse the test file to ensure we extracted the right content.
        # A single context-managed handle is used so the file is closed.
        with open(resource_path) as fh:
            # the header holds the sample names after the first column
            header = fh.readline()
            samplenames = header.strip().split('\t')[1:]
            # every remaining line starts with the gene name
            gene_list = [row.split('\t')[0] for row in fh]

        obs_list = [Observation(x) for x in samplenames]
        feature_list = [Feature(x) for x in gene_list]

        obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
        feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data

        self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])
        # Commented out when removed the feature metadata, as it was causing database
        # issues due to the size of the json object.
        #self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
        self.assertIsNone(metadata[FEATURE_SET_KEY])
        self.assertIsNone(metadata[PARENT_OP_KEY])
Example #6
0
 def test_feature_set_list_converter(self):
     '''
     Tests that we get properly formatted JSON-compatible
     arrays (of strings in this case). Used when we need to
     supply a WDL job with a list of relevant features as an
     array of strings, for instance.
     '''
     feature1 = Feature('foo')
     feature2 = Feature('bar')
     feature_set = FeatureSet([feature1, feature2])
     d = feature_set.to_dict()
     c = FeatureSetListConverter()
     converted_input = c.convert('xyz', d, '', '')
     # order doesn't matter, so either ordering is acceptable.
     # assertIn reports the actual converted value on failure, which
     # assertTrue on a combined boolean cannot do.
     self.assertIn(
         converted_input,
         [{'xyz': ['foo', 'bar']}, {'xyz': ['bar', 'foo']}]
     )
Example #7
0
    def extract_metadata(self, resource_path, parent_op_pk=None):
        '''
        Fills in and returns `self.metadata` for this resource.

        The parent implementation runs first with the same arguments.
        Afterwards the index of `self.table` is turned into a
        serialized FeatureSet and the columns of `self.table` into a
        serialized ObservationSet.
        '''
        # NOTE(review): presumably the parent call is what populates
        # self.table and self.metadata -- confirm against the base class
        super().extract_metadata(resource_path, parent_op_pk)

        # rows of the table become the Features
        row_features = [Feature(row_id) for row_id in self.table.index]
        serialized_features = FeatureSetSerializer(
            FeatureSet(row_features)).data
        self.metadata[DataResource.FEATURE_SET] = serialized_features

        # columns of the table become the Observations
        col_observations = [
            Observation(col_id) for col_id in self.table.columns
        ]
        serialized_observations = ObservationSetSerializer(
            ObservationSet(col_observations)).data
        self.metadata[DataResource.OBSERVATION_SET] = serialized_observations

        return self.metadata
 def test_merge_of_different_types_fails(self):
     '''
     We cannot merge two different types (e.g. an ObservationSet and
     a FeatureSet). Test that it raises an exception.
     '''
     element_list1 = [self.el1, self.el2]
     some_feature = Feature('geneA', {'oncogene': StringAttribute('Y')})
     element_list2 = [
         some_feature,
     ]
     obs_set = ObservationSet(element_list1)
     feature_set = FeatureSet(element_list2)
     # only the merge itself is expected to raise; its return value is
     # irrelevant, so it is not bound to a name
     with self.assertRaises(Exception):
         merge_element_set([obs_set, feature_set])
Example #9
0
    def extract_metadata(self, resource_path, parent_op_pk=None):
        '''
        Fills in and returns `self.metadata` for a FeatureTable.

        For a FeatureTable we expect the rows to represent the Feature
        instances, with the additional columns giving attributes of
        each Feature, which get incorporated.

        The parent implementation runs first with the same arguments;
        the parent's `prep_metadata` then supplies the list used to
        build the FeatureSet.
        '''
        super().extract_metadata(resource_path, parent_op_pk)

        # the parent builds the list of elements for the Feature type
        features = FeatureSet(super().prep_metadata(Feature))
        serialized = FeatureSetSerializer(features).data
        self.metadata[DataResource.FEATURE_SET] = serialized

        return self.metadata
Example #10
0
 def create(self, validated_data):
     '''
     Builds a FeatureSet instance from the validated data and
     returns it.
     '''
     # each item under 'elements' carries the Feature info (as an
     # OrderedDict); a FeatureSerializer turns it into a proper
     # Feature instance.
     features = [
         FeatureSerializer(data=element).get_instance()
         for element in validated_data['elements']
     ]
     multiple = validated_data['multiple']
     return FeatureSet(features, multiple)
Example #11
0
 def _build_set(self, data):
     '''
     Helper that attempts to construct a FeatureSet from the
     `data` arg. It assumes `data` already has the required
     keys ('elements' and 'multiple').
     '''
     # each item under 'elements' carries the Feature info (as an
     # OrderedDict); a FeatureSerializer turns it into a proper
     # Feature instance.
     features = [
         FeatureSerializer(data=element).get_instance()
         for element in data['elements']
     ]
     multiple = data['multiple']
     return FeatureSet(features, multiple)
    def setUp(self):
        '''
        Prepares the fixtures shared by the tests: two Features with
        their serializers, a third Feature reusing the 'geneA' name,
        the expected serialized form of the set, a FeatureSet holding
        the first two Features, and the tester helper.
        '''
        # two Features to work with, each with a matching serializer
        gene_a = Feature('geneA', {'oncogene': StringAttribute('WT')})
        self.el1 = gene_a
        self.el1_serializer = FeatureSerializer(gene_a)

        gene_b = Feature('geneB', {'oncogene': StringAttribute('KO')})
        self.el2 = gene_b
        self.el2_serializer = FeatureSerializer(gene_b)

        # reuses the 'geneA' name of el1 (with no attributes), for
        # testing addition of duplicate elements
        repeated = Feature('geneA', {})
        self.duplicate_element = repeated
        self.dup_element_serializer = FeatureSerializer(repeated)

        # what a correctly serialized ElementSet instance looks like
        serialized_elements = [
            self.el1_serializer.data,
            self.el2_serializer.data
        ]
        self.expected_element_set_data = {
            'multiple': True,
            'elements': serialized_elements
        }

        # a properly formed FeatureSet instance
        self.element_set = FeatureSet([gene_a, gene_b])

        # the helper class which actually runs the tests
        self.tester_class = ElementSetSerializerTester(FeatureSetSerializer)