def test_cell_values_with_long_name(self):
    """Long comma-separated names with underscores and digits come back as a list."""
    colony_names = [
        'B_subtilis_WT_JH642_Colony_1',
        'B_subtilis_WT_JH642_Colony_2',
        'B_subtilis_WT_JH642_Colony_3',
    ]
    raw_cell = 'B_subtilis_WT_JH642_Colony_1, B_subtilis_WT_JH642_Colony_2, B_subtilis_WT_JH642_Colony_3'
    extracted = tu.extract_name_value(raw_cell)
    self.assertListEqual(colony_names, extracted)
def _process_control_with_captions(self, cell, control_tables):
    """Collect the controls of every table whose caption is named in ``cell``.

    Args:
        cell: Object exposing ``get_text()``; its text is split into
            caption names by ``table_utils.extract_name_value``.
        control_tables: Container indexed by canonicalized caption
            (lower-cased, all whitespace removed); each entry is an
            iterable of controls.

    Returns:
        A flat list of the controls of all matching captions, in the order
        the captions appear in the cell. Captions with no entry in
        ``control_tables`` are skipped.
    """
    matched_controls = []
    captions = table_utils.extract_name_value(cell.get_text())
    for caption in captions:
        # Canonical form: lower-case with every whitespace run removed.
        lookup_key = ''.join(caption.lower().split())
        if lookup_key not in control_tables:
            continue
        matched_controls.extend(control_tables[lookup_key])
    return matched_controls
def _process_reagent_media(self, cell, header_cell):
    """Turn a reagent/media cell into a list of entry dictionaries.

    The header cell is parsed for the reagent name and an optional
    timepoint. The cell text is then handled in one of three ways:

    * valued text (``table_utils.is_valued_cells``): one entry per
      ``(value, unit)`` pair with keys ``name``, ``value`` and ``unit``;
    * a bare number: a validation error is recorded and an empty list is
      returned;
    * anything else: one entry per extracted name with keys ``name`` and
      ``value``.

    A ``timepoint`` key is added to every entry when the header yields a
    truthy timepoint.

    Args:
        cell: Object exposing ``get_text()``.
        header_cell: Header passed to ``process_reagent_header``.

    Returns:
        The list of entry dictionaries (possibly empty). Problems are
        reported through ``self._validation_errors`` rather than raised.
    """
    text = cell.get_text()
    name_dict, timepoint_dict = cell_parser.PARSER.process_reagent_header(
        header_cell, self._timepoint_units, unit_type='timepoints')
    entries = []

    def attach_timepoint(entry):
        # Added last so the key order matches name/value[/unit]/timepoint.
        if timepoint_dict:
            entry['timepoint'] = timepoint_dict
        return entry

    if table_utils.is_valued_cells(text):
        # Numerical cell: every value must come with a unit.
        try:
            pairs = table_utils.transform_cell(
                text, self._fluid_units, cell_type='fluid')
            for value, unit in pairs:
                entries.append(attach_timepoint(
                    {'name': name_dict, 'value': value, 'unit': unit}))
        except TableException as err:
            # Entries collected before the failure are still returned.
            self._validation_errors.append(err.get_message())
        return entries

    if table_utils.is_number(text):
        reason = '%s is missing a unit' % text
        self._validation_errors.append(
            'Measurement table has invalid reagent/media value: %s' % reason)
        return []

    # Name-valued cell.
    for name in table_utils.extract_name_value(text):
        entries.append(attach_timepoint({'name': name_dict, 'value': name}))
    return entries
def parse_content_item(self, cell, fluid_units=None, timepoint_units=None):
    """Parse a table cell into a list of content dictionaries.

    The cell text is tokenized and classified; the result depends on the
    detected cell type:

    * ``NAME_VALUE_UNIT_TIMEPOINT``: one dict with ``name``, ``value``,
      ``unit`` and ``timepoints``;
    * ``NAME_VALUE_UNIT``: one dict with ``name``, ``value`` and ``unit``;
    * ``NAME``: one dict with only ``name`` per extracted name.

    Args:
        cell: Object exposing ``get_text()`` and ``get_text_with_url()``.
        fluid_units: Fluid units forwarded to the unit processing helper.
            Defaults to a fresh empty dict.
        timepoint_units: Timepoint units forwarded to the unit and
            timepoint processing helpers. Defaults to a fresh empty dict.

    Returns:
        A list of content dictionaries.

    Raises:
        TableException: If the cell yields no tokens or its type is not
            one of the three handled above.
    """
    # Use None sentinels instead of ``{}`` defaults: a mutable default is
    # shared by every call, so a helper mutating it would leak state.
    fluid_units = {} if fluid_units is None else fluid_units
    timepoint_units = {} if timepoint_units is None else timepoint_units

    text = cell.get_text()
    tokens = self._cell_tokenizer.tokenize(text, keep_skip=False)
    if len(tokens) < 1:
        raise TableException('Invalid value: %s does not contain a name' % text)

    cell_type = self._get_token_type(self._cell_parser.parse(tokens))

    if cell_type == 'NAME_VALUE_UNIT_TIMEPOINT':
        label, value, unit, timepoint_value, timepoint_unit = (
            self._get_name_values_unit_timepoint(tokens))
        return [{
            'name': self.process_name_with_uri(
                label, cell.get_text_with_url()),
            'value': value,
            'unit': self.process_content_item_unit(
                unit, fluid_units, timepoint_units),
            'timepoints': self.process_timepoint(
                timepoint_value, timepoint_unit, timepoint_units),
        }]

    if cell_type == 'NAME_VALUE_UNIT':
        label, value, unit = self._get_name_values_unit(tokens)
        return [{
            'name': self.process_name_with_uri(
                label, cell.get_text_with_url()),
            'value': value,
            'unit': self.process_content_item_unit(
                unit, fluid_units, timepoint_units),
        }]

    if cell_type == 'NAME':
        # A NAME cell may list several comma-separated names.
        return [
            {'name': self.process_name_with_uri(
                label, cell.get_text_with_url())}
            for label in table_utils.extract_name_value(text)
        ]

    raise TableException('Unable to parse %s' % text)
def _process_file_type(self, cell):
    """Return the file-type names listed in ``cell`` as a new list.

    Args:
        cell: Object exposing ``get_text()``.

    Returns:
        A list of the values produced by
        ``table_utils.extract_name_value`` for the cell text.
    """
    names = table_utils.extract_name_value(cell.get_text())
    return list(names)
def test_cell_with_unicode_characters(self):
    """A leading vertical-tab control character is not part of the name."""
    cell_str = '\x0bApp'
    # assertTrue(first, second) uses ``second`` only as the failure message,
    # so the previous assertTrue('App', result) could never fail. Compare
    # the extracted names explicitly instead.
    self.assertListEqual(['App'], tu.extract_name_value(cell_str))
def test_cell_with_trailing_whitespace(self):
    """Whitespace after the last name does not end up in any extracted name."""
    cell_str = 'Yeast1_, Yeast2_, Yeast3_ '
    exp_res = ['Yeast1_', 'Yeast2_', 'Yeast3_']
    # Compare the whole list: a per-item membership loop passes vacuously
    # on an empty result and cannot notice a missing name.
    self.assertListEqual(exp_res, tu.extract_name_value(cell_str))
def test_cell_values_with_named_and_numerical_spacing(self):
    """A name mixing words, numbers and spaces is kept as one value."""
    cell_str = 'B. subtilis 168 PmtlA-comKS'
    # assertEquals is a deprecated alias removed in Python 3.12, and looping
    # over the result passes vacuously when nothing is extracted. Assert
    # that exactly one name, equal to the whole cell, is returned.
    self.assertListEqual([cell_str], tu.extract_name_value(cell_str))
def test_cell_values_with_named_spacing(self):
    """A name containing spaces, dots and parentheses is kept as one value."""
    cell_str = 'Yeast_Extract_Peptone_Adenine_Dextrose (a.k.a. YPAD Media)'
    # assertEquals is a deprecated alias removed in Python 3.12, and looping
    # over the result passes vacuously when nothing is extracted. Assert
    # that exactly one name, equal to the whole cell, is returned.
    self.assertListEqual([cell_str], tu.extract_name_value(cell_str))
def test_cell_values_with_one_name(self):
    """A cell holding a single name yields a one-element list."""
    extracted = tu.extract_name_value('CSV')
    self.assertListEqual(['CSV'], extracted)
def test_cell_values_without_underscore(self):
    """Two plain comma-separated names are returned in order."""
    extracted = tu.extract_name_value('CSV, FCS')
    self.assertListEqual(['CSV', 'FCS'], extracted)
def test_cell_values_with_name_containing_underscore_numbers(self):
    """Names ending in an underscore followed by digits stay intact."""
    extracted = tu.extract_name_value('AND_00, NAND_00')
    self.assertListEqual(['AND_00', 'NAND_00'], extracted)