def test_observation_set_csv_converter(self):
    '''
    Tests that an ObservationSet is converted into a dict mapping the
    input name to a comma-delimited string of the observation IDs.
    '''
    obs1 = Observation('foo')
    obs2 = Observation('bar')
    obs_set = ObservationSet([obs1, obs2])
    d = obs_set.to_dict()
    c = ObservationSetCsvConverter()
    converted_input = c.convert('xyz', d, '', '')
    # Element order is not guaranteed, so accept either ordering.
    # (assertIn replaces the prior bitwise-OR of equality checks,
    # which was non-idiomatic and gave poor failure messages.)
    self.assertIn(converted_input, [
        {'xyz': 'foo,bar'},
        {'xyz': 'bar,foo'}
    ])
def test_metadata_correct_case2(self):
    '''
    Typically, the metadata is collected following a successful
    validation. Do that here
    '''
    m = IntegerMatrix()
    resource_path = os.path.join(TESTDIR, 'test_integer_matrix.tsv')
    metadata = m.extract_metadata(resource_path)

    # Parse the test file to ensure we extracted the right content.
    # The header row holds the sample names (everything after the
    # first column). Files are opened with `with` so the handles are
    # closed (the originals were leaked).
    with open(resource_path) as fh:
        header = fh.readline()
    samplenames = header.strip().split('\t')[1:]
    obs_list = [Observation(x) for x in samplenames]

    # Gene identifiers are the first column of every non-header row.
    gene_list = []
    with open(resource_path) as fh:
        for i, line in enumerate(fh):
            if i > 0:
                gene_list.append(line.split('\t')[0])
    feature_list = [Feature(x) for x in gene_list]

    obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
    feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data
    self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])
    self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def setUp(self):
    '''
    Prepares two Observations (with serializers), a duplicate of the
    first, the expected serialized form of an ElementSet, a concrete
    ObservationSet, and the tester class driving the assertions.
    '''
    self.el1 = Observation('sampleA', {'phenotype': StringAttribute('WT')})
    self.el2 = Observation('sampleB', {'phenotype': StringAttribute('KO')})
    self.el1_serializer = ObservationSerializer(self.el1)
    self.el2_serializer = ObservationSerializer(self.el2)

    # shares its identifier with el1, for exercising duplicate-element
    # handling:
    self.duplicate_element = Observation('sampleA', {})
    self.dup_element_serializer = ObservationSerializer(
        self.duplicate_element)

    # what a serialized ElementSet instance should look like:
    self.expected_element_set_data = {
        'multiple': True,
        'elements': [
            self.el1_serializer.data,
            self.el2_serializer.data
        ]
    }

    # a well-formed ObservationSet built from the elements above:
    self.element_set = ObservationSet([self.el1, self.el2])

    # the class that will execute the tests
    self.tester_class = ElementSetSerializerTester(
        ObservationSetSerializer)
def test_metadata_correct(self):
    '''
    Checks that the metadata extracted from a three-column annotation
    file equals an ObservationSet built by parsing the file directly.
    '''
    resource_path = os.path.join(TESTDIR, 'three_column_annotation.tsv')
    t = AnnotationTable()
    column_dict = {}
    obs_list = []
    # Build the expected Observations by parsing the file ourselves.
    # `with` ensures the handle is closed (the original leaked it).
    with open(resource_path) as fh:
        for i, line in enumerate(fh):
            contents = line.strip().split('\t')
            if i == 0:
                # header row: map column index -> attribute name
                for j, c in enumerate(contents[1:]):
                    column_dict[j] = c
            else:
                samplename = contents[0]
                attr_dict = {}
                for j, v in enumerate(contents[1:]):
                    attr_dict[column_dict[j]] = \
                        UnrestrictedStringAttribute(v)
                obs_list.append(Observation(samplename, attr_dict))
    expected_obs_set = ObservationSetSerializer(
        ObservationSet(obs_list)).data
    metadata = t.extract_metadata(resource_path, 'tsv')
    self.assertEqual(metadata[OBSERVATION_SET_KEY], expected_obs_set)
    self.assertIsNone(metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_metadata_correct_case2(self):
    '''
    Typically, the metadata is collected following a successful
    validation. However, here we don't validate. Check that it goes
    and collects the table in the process
    '''
    m = Matrix()
    resource_path = os.path.join(TESTDIR, 'test_matrix.tsv')
    metadata = m.extract_metadata(resource_path, 'tsv')

    # Parse the test file to ensure we extracted the right content.
    # Sample names come from the header row (all columns after the
    # first). `with` ensures the handles are closed (the originals
    # were leaked).
    with open(resource_path) as fh:
        header = fh.readline()
    samplenames = header.strip().split('\t')[1:]
    obs_list = [Observation(x) for x in samplenames]

    # Gene identifiers are the first column of every non-header row.
    gene_list = []
    with open(resource_path) as fh:
        for i, line in enumerate(fh):
            if i > 0:
                gene_list.append(line.split('\t')[0])
    feature_list = [Feature(x) for x in gene_list]

    obs_set = ObservationSetSerializer(ObservationSet(obs_list)).data
    feature_set = FeatureSetSerializer(FeatureSet(feature_list)).data
    self.assertEqual(obs_set, metadata[OBSERVATION_SET_KEY])
    # Commented out when removed the feature metadata, as it was
    # causing database issues due to the size of the json object.
    # self.assertEqual(feature_set, metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[FEATURE_SET_KEY])
    self.assertIsNone(metadata[PARENT_OP_KEY])
def test_observation_set_list_converter(self):
    '''
    Tests that we get properly formatted JSON-compatible
    arrays (of strings in this case). Used when we need to
    supply a WDL job with a list of relevant samples as an
    array of strings, for instance.
    '''
    obs1 = Observation('foo')
    obs2 = Observation('bar')
    obs_set = ObservationSet([obs1, obs2])
    d = obs_set.to_dict()
    c = ObservationSetListConverter()
    converted_input = c.convert('xyz', d, '', '')
    # Element order is not guaranteed, so accept either ordering.
    # (assertIn replaces the prior bitwise-OR of equality checks,
    # which was non-idiomatic and gave poor failure messages.)
    self.assertIn(converted_input, [
        {'xyz': ['foo', 'bar']},
        {'xyz': ['bar', 'foo']}
    ])
def _build_set(self, data):
    '''
    A helper method which attempts to build an ObservationSet
    given the `data` arg. Assumes the `data` does have the
    proper keys ('elements' and 'multiple').
    '''
    # deserialize each element dict into a proper Observation:
    elements = [
        ObservationSerializer(data=element_dict).get_instance()
        for element_dict in data['elements']
    ]
    return ObservationSet(elements, data['multiple'])
def extract_metadata(self, resource_path, parent_op_pk=None):
    '''
    Fills in the metadata dict: a FeatureSet built from the table's
    rows and an ObservationSet built from its columns. Returns the
    updated metadata.
    '''
    super().extract_metadata(resource_path, parent_op_pk)

    # rows -> Features
    features = [Feature(row_id) for row_id in self.table.index]
    self.metadata[DataResource.FEATURE_SET] = FeatureSetSerializer(
        FeatureSet(features)).data

    # columns -> Observations
    observations = [Observation(col_id) for col_id in self.table.columns]
    self.metadata[DataResource.OBSERVATION_SET] = ObservationSetSerializer(
        ObservationSet(observations)).data
    return self.metadata
def test_merge_of_different_types_fails(self):
    '''
    We cannot merge two different types (e.g. an Obs. Set and a
    Feat. Set). Test that it raises an exception.
    '''
    element_list1 = [self.el1, self.el2]
    some_feature = Feature('geneA', {'oncogene': StringAttribute('Y')})
    element_list2 = [
        some_feature,
    ]
    obs_set = ObservationSet(element_list1)
    feature_set = FeatureSet(element_list2)
    with self.assertRaises(Exception):
        # unused `new_set =` assignment removed; only the raise matters
        merge_element_set([obs_set, feature_set])
def extract_metadata(self, resource_path, parent_op_pk=None):
    '''
    When we extract the metadata from an AnnotationTable, we expect
    the Observation instances to be the rows. Additional columns
    specify attributes of each Observation, which we incorporate
    '''
    super().extract_metadata(resource_path, parent_op_pk)

    # rows -> Observations (attributes come from the extra columns)
    observations = super().prep_metadata(Observation)
    serialized = ObservationSetSerializer(
        ObservationSet(observations)).data
    self.metadata[DataResource.OBSERVATION_SET] = serialized
    return self.metadata
def create(self, validated_data):
    '''
    Returns an ObservationSet instance from the validated data.
    '''
    # the validated data carries each Observation as an OrderedDict;
    # the ObservationSerializer turns each of those into a proper
    # Observation instance.
    observations = [
        ObservationSerializer(data=element_dict).get_instance()
        for element_dict in validated_data['elements']
    ]
    return ObservationSet(observations, validated_data['multiple'])
def extract_metadata(self, resource_path, file_extension, parent_op_pk=None):
    '''
    Fills in the metadata dict with an ObservationSet built from the
    table's columns and returns the updated metadata.

    Note: FeatureSets (built from the table rows) are intentionally
    NOT added here -- the resulting JSON objects were too large and
    caused problems when inserted into the database.
    '''
    super().extract_metadata(resource_path, file_extension, parent_op_pk)

    # columns -> Observations
    observations = [Observation(col_id) for col_id in self.table.columns]
    serialized = ObservationSetSerializer(
        ObservationSet(observations)).data
    self.metadata[DataResource.OBSERVATION_SET] = serialized
    return self.metadata