def test_feature_set_csv_converter(self):
    '''
    Checks that a serialized FeatureSet is converted into a comma-delimited
    string of the feature identifiers, keyed by the input name.
    '''
    feature_set = FeatureSet([Feature('foo'), Feature('bar')])
    serialized_set = feature_set.to_dict()
    converter = FeatureSetCsvConverter()
    converted_input = converter.convert('xyz', serialized_set, '', '')

    # The order of the elements doesn't matter, so either ordering of the
    # delimited string is accepted.
    acceptable_results = (
        {'xyz': 'foo,bar'},
        {'xyz': 'bar,foo'},
    )
    self.assertTrue(
        any(expected == converted_input for expected in acceptable_results)
    )
def test_metadata_correct(self):
    '''
    Builds the expected FeatureSet by parsing the annotation file directly
    and checks that it matches the feature metadata extracted by a
    FeatureTable. The observation set and parent operation are expected
    to be None.
    '''
    resource_path = os.path.join(TESTDIR, 'gene_annotations.tsv')
    t = FeatureTable()

    column_names = []
    feature_list = []
    # Open the file in a context manager so the handle is closed as soon as
    # parsing finishes rather than being left open.
    with open(resource_path) as fh:
        for i, line in enumerate(fh):
            contents = line.strip().split('\t')
            if i == 0:
                # Header row: everything after the first column names an
                # attribute.
                column_names = contents[1:]
                continue

            # Data row: the first column is the feature identifier and the
            # remaining columns are its attribute values.
            attr_dict = {}
            for j, v in enumerate(contents[1:]):
                # Values that parse as integers become IntegerAttributes;
                # anything else is kept as a StringAttribute.
                try:
                    v = int(v)
                    attr = IntegerAttribute(v)
                except ValueError:
                    attr = StringAttribute(v)
                attr_dict[column_names[j]] = attr
            feature_list.append(Feature(contents[0], attr_dict))

    expected_feature_set = FeatureSetSerializer(
        FeatureSet(feature_list)).data
    metadata = t.extract_metadata(resource_path)
    self.assertEqual(metadata[FEATURE_SET_KEY], expected_feature_set)
    self.assertIsNone(metadata[OBSERVATION_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_metadata_correct_case2(self):
    '''
    Typically, the metadata is collected following a successful validation.
    This test calls `extract_metadata` directly (no separate validation
    call is made here) and compares the result against the observations
    (header columns) and features (first column of each row) parsed
    straight from the same file.
    '''
    m = IntegerMatrix()
    resource_path = os.path.join(TESTDIR, 'test_integer_matrix.tsv')
    metadata = m.extract_metadata(resource_path)

    # Parse the test file to ensure we extracted the right content.
    # A single pass inside a context manager replaces the two separate
    # open() calls, neither of which was ever closed.
    with open(resource_path) as fh:
        header = fh.readline()
        # Every remaining line is a data row; its first field is the gene.
        gene_list = [line.split('\t')[0] for line in fh]

    # The first header field labels the gene column, so it is skipped.
    samplenames = header.strip().split('\t')[1:]
    obs_list = [Observation(x) for x in samplenames]
    feature_list = [Feature(x) for x in gene_list]

    obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
    feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data
    self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])
    self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_metadata_correct(self):
    '''
    Parses the annotation file into the FeatureSet it describes and checks
    the metadata extracted by a FeatureTable. The feature set, observation
    set and parent operation are all expected to be None.
    '''
    resource_path = os.path.join(TESTDIR, 'gene_annotations.tsv')
    t = FeatureTable()

    column_names = []
    feature_list = []
    # Open the file in a context manager so the handle is closed as soon as
    # parsing finishes rather than being left open.
    with open(resource_path) as fh:
        for i, line in enumerate(fh):
            contents = line.strip().split('\t')
            if i == 0:
                # Header row: everything after the first column names an
                # attribute.
                column_names = contents[1:]
                continue

            # Data row: the first column is the feature identifier and the
            # remaining columns are its attribute values.
            attr_dict = {}
            for j, v in enumerate(contents[1:]):
                # Values that parse as integers become IntegerAttributes;
                # anything else is kept as a StringAttribute.
                try:
                    v = int(v)
                    attr = IntegerAttribute(v)
                except ValueError:
                    attr = StringAttribute(v)
                attr_dict[column_names[j]] = attr
            feature_list.append(Feature(contents[0], attr_dict))

    expected_feature_set = FeatureSetSerializer(
        FeatureSet(feature_list)).data
    metadata = t.extract_metadata(resource_path, 'tsv')

    # Commented out when we removed the automatic creation of Feature
    # metadata for FeatureTable resource types. For large files, it was
    # causing issues with exceptionally large JSON failing to store in the
    # db table.
    #self.assertEqual(metadata[FEATURE_SET_KEY], expected_feature_set)
    self.assertIsNone(metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[OBSERVATION_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_metadata_correct_case2(self):
    '''
    Typically, the metadata is collected following a successful validation.
    However, here we don't validate. Check that the metadata extraction
    goes and collects the table in the process, by comparing the extracted
    observation set against the header of the file.
    '''
    m = Matrix()
    resource_path = os.path.join(TESTDIR, 'test_matrix.tsv')
    metadata = m.extract_metadata(resource_path, 'tsv')

    # Parse the test file to ensure we extracted the right content.
    # A single pass inside a context manager replaces the two separate
    # open() calls, neither of which was ever closed.
    with open(resource_path) as fh:
        header = fh.readline()
        # Every remaining line is a data row; its first field is the gene.
        gene_list = [line.split('\t')[0] for line in fh]

    # The first header field labels the gene column, so it is skipped.
    samplenames = header.strip().split('\t')[1:]
    obs_list = [Observation(x) for x in samplenames]
    feature_list = [Feature(x) for x in gene_list]

    obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
    feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data
    self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])

    # Commented out when removed the feature metadata, as it was causing
    # database issues due to the size of the json object.
    #self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_feature_set_list_converter(self):
    '''
    Tests that we get properly formatted JSON-compatible arrays
    (of strings in this case). Used when we need to supply a WDL job
    with a list of relevant samples as an array of strings, for instance.
    '''
    feature_set = FeatureSet([Feature('foo'), Feature('bar')])
    serialized_set = feature_set.to_dict()
    converter = FeatureSetListConverter()
    converted_input = converter.convert('xyz', serialized_set, '', '')

    # The order of the elements doesn't matter, so either ordering of the
    # list is accepted.
    acceptable_results = (
        {'xyz': ['foo', 'bar']},
        {'xyz': ['bar', 'foo']},
    )
    self.assertTrue(
        any(expected == converted_input for expected in acceptable_results)
    )
def extract_metadata(self, resource_path, parent_op_pk=None):
    '''
    Fills in and returns `self.metadata` for this resource.

    After delegating to the parent implementation, the row labels of
    `self.table` are stored as a serialized FeatureSet and the column
    labels as a serialized ObservationSet.
    '''
    super().extract_metadata(resource_path, parent_op_pk)

    # rows -> Features
    row_features = [Feature(row_label) for row_label in self.table.index]
    serialized_features = FeatureSetSerializer(FeatureSet(row_features)).data
    self.metadata[DataResource.FEATURE_SET] = serialized_features

    # columns -> Observations
    column_observations = [
        Observation(column_label) for column_label in self.table.columns
    ]
    serialized_observations = ObservationSetSerializer(
        ObservationSet(column_observations)
    ).data
    self.metadata[DataResource.OBSERVATION_SET] = serialized_observations

    return self.metadata
def test_merge_of_different_types_fails(self):
    '''
    We cannot merge two different types (e.g. an ObservationSet and a
    FeatureSet). Test that attempting to do so raises an exception.
    '''
    gene = Feature('geneA', {'oncogene': StringAttribute('Y')})
    observations = ObservationSet([self.el1, self.el2])
    features = FeatureSet([gene])

    # Only the raised exception matters; the return value is not needed.
    with self.assertRaises(Exception):
        merge_element_set([observations, features])
def extract_metadata(self, resource_path, parent_op_pk=None):
    '''
    Fills in and returns `self.metadata` for a FeatureTable.

    The rows are expected to be the Feature instances, with the
    additional columns giving the attributes of each Feature. The
    resulting FeatureSet is stored in its serialized form.
    '''
    super().extract_metadata(resource_path, parent_op_pk)

    # The parent class assembles the Feature instances.
    features = super().prep_metadata(Feature)
    serialized_features = FeatureSetSerializer(FeatureSet(features)).data
    self.metadata[DataResource.FEATURE_SET] = serialized_features
    return self.metadata
def create(self, validated_data):
    '''
    Returns a FeatureSet instance from the validated data.
    '''
    # The validated data holds each Feature's info as an OrderedDict;
    # FeatureSerializer turns each of those into a proper Feature instance.
    features = [
        FeatureSerializer(data=feature_data).get_instance()
        for feature_data in validated_data['elements']
    ]
    return FeatureSet(features, validated_data['multiple'])
def _build_set(self, data):
    '''
    A helper method which attempts to build a FeatureSet
    from the `data` arg. Assumes `data` has the proper keys
    (`elements` and `multiple`).
    '''
    # Each entry of data['elements'] carries the Feature info as an
    # OrderedDict; FeatureSerializer turns it into a proper Feature instance.
    features = [
        FeatureSerializer(data=element).get_instance()
        for element in data['elements']
    ]
    return FeatureSet(features, data['multiple'])
def setUp(self):
    '''
    Prepares the Feature fixtures, their serializers, the expected
    serialized payload and the tester object used by the tests.
    '''
    # two distinct Features, each with a serializer wrapping it
    self.el1 = Feature('geneA', {'oncogene': StringAttribute('WT')})
    self.el2 = Feature('geneB', {'oncogene': StringAttribute('KO')})
    self.el1_serializer = FeatureSerializer(self.el1)
    self.el2_serializer = FeatureSerializer(self.el2)

    # shares its identifier with el1, for testing addition of
    # duplicate elements
    self.duplicate_element = Feature('geneA', {})
    self.dup_element_serializer = FeatureSerializer(self.duplicate_element)

    # the correct serialized representation of an ElementSet instance
    serialized_elements = [
        self.el1_serializer.data,
        self.el2_serializer.data,
    ]
    self.expected_element_set_data = {
        'multiple': True,
        'elements': serialized_elements,
    }

    # a correctly formed instance of a FeatureSet
    self.element_set = FeatureSet([self.el1, self.el2])

    # the class that will execute the tests
    self.tester_class = ElementSetSerializerTester(FeatureSetSerializer)