def test_header_exception_labeled_header_not_allowed(
        assay_file_exception_labeled_header_not_allowed):
    """A labeled "Factor Value" header in the header check must be rejected."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_labeled_header_not_allowed)
    expected = 'Header "Factor Value" not allowed in assay.'
    assert str(excinfo.value) == expected
def test_header_exception_term_source_ref_stop_iteration(
        assay_file_exception_term_source_ref_stop_iteration):
    """A trailing "Term Source REF" with no following column must raise."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_term_source_ref_stop_iteration)
    expected = 'Expected one more column on seeing "Term Source REF"'
    assert str(excinfo.value) == expected
def test_header_exception_term_source_ref_next_column(
        assay_file_exception_term_source_ref_next_column):
    """"Term Source REF" not followed by "Term Accession Number" must raise."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_term_source_ref_next_column)
    expected = 'Expected column "Term Accession Number" after seeing "Term Source REF"'
    assert str(excinfo.value) == expected
def test_header_exception_labeled_header_format(
        assay_file_exception_labeled_header_format):
    """A labeled header missing its bracketed label must raise a parse error."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_labeled_header_format)
    expected = "Problem parsing labeled header CharacteristicsWithoutBrackets"
    assert str(excinfo.value) == expected
def test_header_exception_duplicated_header(
        assay_file_exception_duplicated_header):
    """Duplicated column types in the assay header must raise on ``read()``."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_duplicated_header).read()
    expected = ("Found duplicated column types in header of study S1 assay A1: "
                "Characteristics[Organism]")
    assert str(excinfo.value) == expected
def test_parsing_exception_invalid_column_type(assay_file_exception_invalid_column_type):
    """An ontology annotation under an unsupported column type must raise on read."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream(
            "S1", "A1", assay_file_exception_invalid_column_type).read()
    # NOTE(review): "occured" is misspelled in the library's message; the
    # assertion must match that message verbatim, so the typo stays.
    expected = (
        "Invalid column type occured \"Parameter Value\" not in ('Material Type', "
        "'Characteristics', 'Comment', 'Factor Value', 'Label', 'Term Source REF', 'Unit')"
    )
    assert str(excinfo.value) == expected
def get(self, project_uuid=None) -> IsaData:
    """Fetch and parse the ISA-tab documents for the given project.

    Downloads the raw TSV payloads via ``self.get_raw`` and parses them
    into an ``IsaData`` bundle (investigation, studies, assays).

    :raises UnsupportedIsaTabFeatureException: if more than one study is found.
    """
    raw = self.get_raw(project_uuid)
    inv_path = raw["investigation"]["path"]
    investigation = InvestigationReader.from_stream(
        input_file=io.StringIO(raw["investigation"]["tsv"]),
        filename=inv_path,
    ).read()
    studies = {}
    for path, details in raw["studies"].items():
        studies[path] = StudyReader.from_stream(
            study_id=path,
            input_file=io.StringIO(details["tsv"]),
            filename=path,
        ).read()
    # Only single-study investigations are supported.
    if len(studies) > 1:  # pragma: nocover
        raise UnsupportedIsaTabFeatureException(
            "More than one study found!")
    study = next(iter(studies.values()))
    assays = {}
    for path, details in raw["assays"].items():
        assays[path] = AssayReader.from_stream(
            study_id=study.file,
            assay_id=path,
            input_file=io.StringIO(details["tsv"]),
            filename=path,
        ).read()
    return IsaData(investigation, inv_path, studies, assays)
def run_warnings_caught(args):
    """Parse and validate an investigation together with its studies and assays."""
    # Parse the investigation file, then release the handle we were given.
    investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
    args.input_investigation_file.close()
    InvestigationValidator(investigation).validate()
    # Study/assay paths are resolved relative to the investigation file.
    base_dir = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
    studies = {}
    assays = {}
    for s_idx, study_info in enumerate(investigation.studies):
        if study_info.info.path:
            with open(os.path.join(base_dir, study_info.info.path), "rt") as stream:
                studies[s_idx] = StudyReader.from_stream(
                    "S{}".format(s_idx + 1), stream).read()
        if study_info.assays:
            assays[s_idx] = {}
            for a_idx, assay_info in enumerate(study_info.assays):
                if assay_info.path:
                    with open(os.path.join(base_dir, assay_info.path), "rt") as stream:
                        assays[s_idx][a_idx] = AssayReader.from_stream(
                            "S{}".format(s_idx + 1), "A{}".format(a_idx + 1), stream
                        ).read()
    # Validate everything that was read above.
    for s_idx, study_info in enumerate(investigation.studies):
        if not study_info.info.path:
            continue
        StudyValidator(investigation, study_info, studies[s_idx]).validate()
        for a_idx, assay_info in enumerate(study_info.assays):
            if assay_info.path:
                AssayValidator(
                    investigation, study_info, assay_info, assays[s_idx][a_idx]
                ).validate()
def load_investigation(i_path: typing.Union[str, Path]) -> IsaData:
    """Load an investigation together with its study and assay files.

    Study and assay files are expected to be located next to the
    investigation file.
    """
    investigation_path = Path(i_path)
    with investigation_path.open("rt") as i_file:
        investigation = InvestigationReader.from_stream(
            input_file=i_file, filename=investigation_path.name
        ).read()
    studies = {}
    assays = {}
    for study in investigation.studies:
        study_key = study.info.path.name
        with (investigation_path.parent / study.info.path).open() as s_file:
            studies[study_key] = StudyReader.from_stream(
                study_id=study_key, input_file=s_file
            ).read()
        for assay in study.assays:
            assay_key = assay.path.name
            with (investigation_path.parent / assay.path).open() as a_file:
                assays[assay_key] = AssayReader.from_stream(
                    study_id=studies[study_key].file.name,
                    assay_id=assay_key,
                    input_file=a_file,
                ).read()
    return IsaData(investigation, str(investigation_path), studies, assays)
def test_assay_reader_minimal_assay_iostring2(minimal_investigation_file, minimal_assay_file):
    """Read the minimal assay from ``StringIO`` objects without file names."""
    # Investigation parsing itself is covered by other tests.
    investigation = InvestigationReader.from_stream(
        io.StringIO(minimal_investigation_file.read())).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()
    assert len(record) == 2
    # Build the assay reader from a StringIO with no filename indicated.
    reader = AssayReader.from_stream(
        "S1", "A1", io.StringIO(minimal_assay_file.read()))
    assert len(reader.header) == 5
    # Read and validate the assay.
    assay = reader.read()
    AssayValidator(
        investigation,
        investigation.studies[0],
        investigation.studies[0].assays[0],
        assay,
    ).validate()
    # Without a filename the assay carries the "<no file>" placeholder path.
    assert str(assay.file) == os.path.normpath("<no file>")
    assert len(assay.header) == 5
    assert len(assay.materials) == 3
    assert len(assay.processes) == 1
    assert len(assay.arcs) == 3
def run(args):
    """Write a Graphviz DOT rendering of the investigation's study/assay graphs."""
    with open(args.investigation_file, "rt") as inputf:
        investigation = InvestigationReader.from_stream(inputf).read()
    base_dir = os.path.dirname(args.investigation_file)
    out = args.output_file
    print("digraph investigation {", file=out)
    print(' rankdir = "LR";', file=out)
    for s, study_info in enumerate(investigation.studies):
        with open(os.path.join(base_dir, study_info.info.path), "rt") as inputf:
            study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        # One cluster per study.
        print(" /* study {} */".format(study_info.info.path), file=out)
        print(" subgraph clusterStudy{} {{".format(s), file=out)
        print(' label = "Study: {}"'.format(study_info.info.path), file=out)
        print_dot(study, out)
        print(" }", file=out)
        # One cluster per assay of the study.
        for a, assay_info in enumerate(study_info.assays):
            with open(os.path.join(base_dir, assay_info.path), "rt") as inputf:
                assay = AssayReader.from_stream(
                    "S{}".format(s + 1), "A{}".format(a + 1), inputf).read()
            print(" /* assay {} */".format(assay_info.path), file=out)
            print(" subgraph clusterAssayS{}A{} {{".format(s, a), file=out)
            print(' label = "Assay: {}"'.format(assay_info.path), file=out)
            print_dot(assay, out)
            print(" }", file=out)
    print("}", file=out)
def _parse_write_assert_assay(investigation_file, tmp_path, quote=None, normalize=False, skip=None):
    """Round-trip every assay of an investigation and compare input to output.

    Each assay is read, validated, written back out, and the (line-sorted)
    input and output files are asserted to be identical.  With ``normalize``
    the written file is read and written once more before comparing, so the
    comparison is between two writer outputs.  ``skip`` is a collection of
    assay path strings to leave out; ``quote`` is forwarded to the writer.
    """
    # Load investigation
    investigation = InvestigationReader.from_stream(investigation_file).read()
    InvestigationValidator(investigation).validate()
    directory = os.path.normpath(os.path.dirname(investigation_file.name))
    # Iterate assays
    for s, study_info in enumerate(investigation.studies):
        for a, assay_info in enumerate(study_info.assays):
            if skip and str(assay_info.path) in skip:
                continue
            # Load assay
            path_in = os.path.join(directory, assay_info.path)
            with open(path_in, "rt") as inputf:
                assay = AssayReader.from_stream("S{}".format(s + 1), "A{}".format(a + 1), inputf).read()
            AssayValidator(investigation, study_info, assay_info, assay).validate()
            # Write assay to temporary file
            path_out = tmp_path / assay_info.path
            with open(path_out, "wt", newline="") as file:
                AssayWriter.from_stream(assay, file, quote=quote).write()
            if normalize:
                # Read and write assay again, comparing writer output with
                # writer output instead of original input.
                path_in = path_out
                with open(path_out, "rt") as inputf:
                    assay = AssayReader.from_stream("S{}".format(s + 1), "A{}".format(a + 1), inputf).read()
                AssayValidator(investigation, study_info, assay_info, assay).validate()
                path_out = tmp_path / (assay_info.path.name + "_b")
                with open(path_out, "wt", newline="") as file:
                    AssayWriter.from_stream(assay, file, quote=quote).write()
            # Sort and compare input and output (row order is not significant)
            path_in_s = tmp_path / (assay_info.path.name + ".in.sorted")
            path_out_s = tmp_path / (assay_info.path.name + ".out.sorted")
            assert filecmp.cmp(sort_file(path_in, path_in_s), sort_file(path_out, path_out_s), shallow=False)
def parse_isatab(self):
    """Parse per-sample sequencing info from the ISA-tab assay table.

    Walks each "Sample Name" material forward along the assay arcs and,
    for library-construction and sequencing processes, extracts layout,
    strandedness, instrument and read length into ``self.sample_info``
    (a dict keyed by sample name).
    """
    logger.info("Parsing ISA-tab...")
    logger.info("Read assay file: %s", self.args.isa_assay.name)
    # read assay
    assay = AssayReader.from_stream("S1", "A1", self.args.isa_assay).read()
    # extract relevant fields
    # ``dummy`` stands in for missing processes so ``.protocol_ref`` is safe.
    dummy = Bunch(type="", protocol_ref="")
    sample_info = {}
    # Map arc tail -> head; assumes at most one outgoing arc per node —
    # TODO confirm (a branching graph would silently drop alternatives).
    arc_map = {a.tail: a.head for a in assay.arcs}
    for m in assay.materials.values():
        if m.type == "Sample Name":
            sample_name = m.name
            if sample_name not in sample_info:
                sample_info[sample_name] = {}
            key = m.unique_name
            # breakpoint()
            # Follow the chain of arcs from the sample through its processes.
            # NOTE(review): this regex requires "-" or "_" before "seq"
            # ("RNAseq" will not match); a sibling implementation in this
            # file uses "[-_]?[Ss]eq" — confirm which is intended.
            while key in arc_map:
                key = arc_map[key]
                if re.match(
                    "Library construction [a-z]*RNA[-_][Ss]eq",
                    assay.processes.get(key, dummy).protocol_ref,
                ):
                    for p in assay.processes[key].parameter_values:
                        if p.name == "Library layout":
                            sample_info[sample_name]["paired"] = (
                                True if p.value == "PAIRED" else False)
                        elif p.name == "Library strand-specificity":
                            sample_info[sample_name][
                                "stranded"] = p.value.lower()
                elif re.match(
                    "Nucleic acid sequencing [a-z]*RNA[-_][Ss]eq",
                    assay.processes.get(key, dummy).protocol_ref,
                ):
                    # NOTE(review): ",".join(p.value) implies p.value is an
                    # iterable of strings here — confirm against the model.
                    for p in assay.processes[key].parameter_values:
                        if p.name == "Instrument model":
                            sample_info[sample_name][
                                "instrument"] = ",".join(p.value)
                        elif (p.name == "Platform"
                              and "instrument" not in sample_info[sample_name]):
                            # "Platform" is only a fallback if no explicit
                            # instrument model was seen.
                            sample_info[sample_name][
                                "instrument"] = ",".join(p.value)
                        elif p.name == "Target read length":
                            sample_info[sample_name][
                                "read_length"] = p.value
    logger.info("Samples in ISA assay:\n%s", ", ".join(sample_info))
    logger.debug(sample_info)
    self.sample_info = sample_info
def parse_isatab(self):
    """Parse per-sample sequencing info from the ISA-tab assay table.

    Walks each "Sample Name" material forward along the assay arcs and
    delegates field extraction to the library-construction and sequencing
    helper methods.  Populates ``self.sample_info`` (dict keyed by sample
    name).
    """
    logger.info("Parsing ISA-tab...")
    logger.info("Read assay file: %s", self.args.isa_assay.name)
    # read assay
    assay = AssayReader.from_stream("S1", "A1", self.args.isa_assay).read()
    # extract relevant fields
    # ``dummy`` stands in for missing processes so ``.protocol_ref`` is safe.
    dummy = Bunch(type="", protocol_ref="")
    sample_info = {}
    # Map arc tail -> head; assumes at most one outgoing arc per node —
    # TODO confirm (a branching graph would silently drop alternatives).
    arc_map = {a.tail: a.head for a in assay.arcs}
    for m in assay.materials.values():
        if m.type == "Sample Name":
            sample_name = m.name
            if sample_name not in sample_info:
                sample_info[sample_name] = {}
            key = m.unique_name
            # breakpoint()
            # Follow the chain of arcs from the sample through its processes;
            # "[-_]?" also matches protocol names like "...RNAseq".
            while key in arc_map:
                key = arc_map[key]
                if re.match(
                    "Library construction [a-z]*RNA[-_]?[Ss]eq",
                    assay.processes.get(key, dummy).protocol_ref,
                ):
                    self._parse_isatab_library_construction(
                        assay, key, sample_info, sample_name)
                elif re.match(
                    "Nucleic acid sequencing [a-z]*RNA[-_]?[Ss]eq",
                    assay.processes.get(key, dummy).protocol_ref,
                ):
                    self._parse_isatab_sequencing(assay, key, sample_info,
                                                  sample_name)
    logger.info("Samples in ISA assay:\n%s", ", ".join(sample_info))
    logger.debug(sample_info)
    self.sample_info = sample_info
def test_assay_reader_minimal_assay(minimal_investigation_file, minimal_assay_file):
    """Use ``AssayReader`` to read in minimal assay file.

    Using the ``AssayReader`` instead of the ``AssayRowReader`` gives us
    ``Assay`` objects instead of just the row-wise nodes.
    """
    # Load investigation (tested elsewhere)
    investigation = InvestigationReader.from_stream(
        minimal_investigation_file).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()
    # Check warnings
    assert 1 == len(record)
    # Create new row reader and check read headers
    reader = AssayReader.from_stream("S1", "A1", minimal_assay_file)
    assert 5 == len(reader.header)
    # Read and validate assay
    assay = reader.read()
    AssayValidator(investigation, investigation.studies[0],
                   investigation.studies[0].assays[0], assay).validate()
    # Check results: overall graph sizes
    assert os.path.normpath(str(assay.file)).endswith(
        os.path.normpath("data/i_minimal/a_minimal.txt"))
    assert 5 == len(assay.header)
    assert 3 == len(assay.materials)
    assert 1 == len(assay.processes)
    assert 3 == len(assay.arcs)
    # Sample material node
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-N1",
        "0815-N1",
        None,
        (),
        (),
        (),
        None,
        [table_headers.SAMPLE_NAME],
    )
    assert expected == assay.materials["S1-sample-0815-N1"]
    # Raw data file nodes (R1/R2 reads)
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL4",
        "0815-N1-DNA1-WES1_L???_???_R1.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL4"]
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL5",
        "0815-N1-DNA1-WES1_L???_???_R2.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL5"]
    # The single sequencing process
    expected = models.Process(
        "nucleic acid sequencing",
        "S1-A1-0815-N1-DNA1-WES1-3",
        "0815-N1-DNA1-WES1",
        "Assay Name",
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF, table_headers.ASSAY_NAME],
    )
    assert expected == assay.processes["S1-A1-0815-N1-DNA1-WES1-3"]
    # Arcs: sample -> sequencing -> R1 -> R2
    expected = (
        models.Arc("S1-sample-0815-N1", "S1-A1-0815-N1-DNA1-WES1-3"),
        models.Arc("S1-A1-0815-N1-DNA1-WES1-3",
                   "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL4"),
        models.Arc(
            "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL4",
            "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL5",
        ),
    )
    assert expected == assay.arcs
def test_assay_reader_gelelect(gelelect_investigation_file, gelelect_assay_file):
    """Use ``AssayReader`` to read in small assay file."""
    # The whole load/validate sequence emits warnings; collect them all.
    with pytest.warns(IsaWarning) as record:
        # Load investigation
        investigation = InvestigationReader.from_stream(
            gelelect_investigation_file).read()
        InvestigationValidator(investigation).validate()
        # Create new row reader and check read headers
        reader = AssayReader.from_stream("S1", "A1", gelelect_assay_file)
        assert 22 == len(reader.header)
        # Read assay
        assay = reader.read()
        AssayValidator(investigation, investigation.studies[0],
                       investigation.studies[0].assays[0], assay).validate()
    # Check warnings
    assert 4 == len(record)
    # Check results: overall graph sizes
    assert os.path.normpath(str(assay.file)).endswith(
        os.path.normpath(
            "data/test_gelelect/a_study01_protein_expression_profiling_gel_electrophoresis.txt"
        ))
    assert 22 == len(assay.header)
    assert 9 == len(assay.materials)
    assert 10 == len(assay.processes)
    assert 18 == len(assay.arcs)
    # Image file material
    expected = models.Material(
        "Image File",
        "S1-A1-Image01.jpeg-COL19",
        "Image01.jpeg",
        None,
        (),
        (),
        (),
        None,
        [table_headers.IMAGE_FILE],
    )
    assert expected == assay.materials["S1-A1-Image01.jpeg-COL19"]
    # Scan process
    expected = models.Process(
        "data collection",
        "S1-A1-Scan02-18",
        "Scan02",
        "Scan Name",
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF, table_headers.SCAN_NAME],
    )
    assert expected == assay.processes["S1-A1-Scan02-18"]
    # Shared header layout of the two electrophoresis processes below
    header_electrophoresis = [
        table_headers.PROTOCOL_REF,
        table_headers.GEL_ELECTROPHORESIS_ASSAY_NAME,
        table_headers.FIRST_DIMENSION,
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
        table_headers.SECOND_DIMENSION,
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
    ]
    # Named electrophoresis process with empty ontology term refs
    expected = models.Process(
        "electrophoresis",
        "S1-A1-Assay01-10",
        "Assay01",
        "Gel Electrophoresis Assay Name",
        None,
        None,
        (),
        (),
        None,
        models.OntologyTermRef("", "", ""),
        models.OntologyTermRef("", "", ""),
        header_electrophoresis,
    )
    assert expected == assay.processes["S1-A1-Assay01-10"]
    # Unnamed electrophoresis process with free-text dimensions
    expected = models.Process(
        "electrophoresis",
        "S1-A1-electrophoresis-9-2",
        "",
        "Gel Electrophoresis Assay Name",
        None,
        None,
        (),
        (),
        None,
        models.OntologyTermRef("AssayX", None, None),
        models.OntologyTermRef("AssayY", None, None),
        header_electrophoresis,
    )
    assert expected == assay.processes["S1-A1-electrophoresis-9-2"]
def test_assay_reader_small2_assay(small2_investigation_file, small2_assay_file):
    """Use ``AssayReader`` to read in small assay file."""
    # Load investigation (tested elsewhere)
    investigation = InvestigationReader.from_stream(
        small2_investigation_file).read()
    InvestigationValidator(investigation).validate()
    # Create new row reader and check read headers
    reader = AssayReader.from_stream("S1", "A1", small2_assay_file)
    assert 14 == len(reader.header)
    # Read assay
    assay = reader.read()
    AssayValidator(investigation, investigation.studies[0],
                   investigation.studies[0].assays[0], assay).validate()
    # Check results: overall graph sizes
    assert os.path.normpath(str(assay.file)).endswith(
        os.path.normpath("data/i_small2/a_small2.txt"))
    assert 14 == len(assay.header)
    assert 25 == len(assay.materials)
    assert 41 == len(assay.processes)
    assert 74 == len(assay.arcs)
    # Comments
    expected = models.Comment(name="Replicate", value="B")
    assert assay.materials["S1-A1-0815-T1-Pro1-B-115-COL5"].comments[
        0] == expected
    # Expected arcs (compared order-insensitively below)
    expected = (
        models.Arc("S1-sample-0815-N1", "S1-A1-extraction-2-1"),
        models.Arc("S1-sample-0815-T1", "S1-A1-extraction-2-2"),
        models.Arc("S1-A1-extraction-2-1", "S1-A1-0815-N1-Pro1-COL3"),
        models.Arc("S1-A1-extraction-2-2", "S1-A1-0815-T1-Pro1-COL3"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-1"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-2"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-3"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-4"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-5"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-6"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-7"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-8"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-9"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-10"),
        models.Arc("S1-A1-0815-N1-Pro1-COL3", "S1-A1-labeling-4-11"),
        models.Arc("S1-A1-0815-T1-Pro1-COL3", "S1-A1-labeling-4-12"),
        models.Arc("S1-A1-labeling-4-1", "S1-A1-0815-N1-Pro1-A-114-COL5"),
        models.Arc("S1-A1-labeling-4-2", "S1-A1-0815-T1-Pro1-A-115-COL5"),
        models.Arc("S1-A1-labeling-4-3", "S1-A1-0815-N1-Pro1-B-114-COL5"),
        models.Arc("S1-A1-labeling-4-4", "S1-A1-0815-T1-Pro1-B-115-COL5"),
        models.Arc("S1-A1-labeling-4-5", "S1-A1-0815-N1-Pro1-C-114-COL5"),
        models.Arc("S1-A1-labeling-4-6", "S1-A1-0815-T1-Pro1-C-115-COL5"),
        models.Arc("S1-A1-labeling-4-7", "S1-A1-0815-N1-Pro1-D-114-COL5"),
        models.Arc("S1-A1-labeling-4-8", "S1-A1-0815-T1-Pro1-D-115-COL5"),
        models.Arc("S1-A1-labeling-4-9", "S1-A1-0815-N1-Pro1-E-114-COL5"),
        models.Arc("S1-A1-labeling-4-10", "S1-A1-0815-T1-Pro1-E-115-COL5"),
        models.Arc("S1-A1-labeling-4-11", "S1-A1-0815-N1-Pro1-F-114-COL5"),
        models.Arc("S1-A1-labeling-4-12", "S1-A1-0815-T1-Pro1-F-115-COL5"),
        models.Arc("S1-A1-0815-N1-Pro1-A-114-COL5", "S1-A1-chromatography-8-1"),
        models.Arc("S1-A1-0815-T1-Pro1-A-115-COL5", "S1-A1-chromatography-8-2"),
        models.Arc("S1-A1-0815-N1-Pro1-B-114-COL5", "S1-A1-chromatography-8-3"),
        models.Arc("S1-A1-0815-T1-Pro1-B-115-COL5", "S1-A1-chromatography-8-4"),
        models.Arc("S1-A1-0815-N1-Pro1-C-114-COL5", "S1-A1-chromatography-8-5"),
        models.Arc("S1-A1-0815-T1-Pro1-C-115-COL5", "S1-A1-chromatography-8-6"),
        models.Arc("S1-A1-0815-N1-Pro1-D-114-COL5", "S1-A1-chromatography-8-7"),
        models.Arc("S1-A1-0815-T1-Pro1-D-115-COL5", "S1-A1-chromatography-8-8"),
        models.Arc("S1-A1-0815-N1-Pro1-E-114-COL5", "S1-A1-chromatography-8-9"),
        models.Arc("S1-A1-0815-T1-Pro1-E-115-COL5", "S1-A1-chromatography-8-10"),
        models.Arc("S1-A1-0815-N1-Pro1-F-114-COL5", "S1-A1-chromatography-8-11"),
        models.Arc("S1-A1-0815-T1-Pro1-F-115-COL5", "S1-A1-chromatography-8-12"),
        models.Arc("S1-A1-chromatography-8-1", "S1-A1-poolA-10"),
        models.Arc("S1-A1-chromatography-8-2", "S1-A1-poolA-10"),
        models.Arc("S1-A1-chromatography-8-3", "S1-A1-mass spectrometry-9-3"),
        models.Arc("S1-A1-chromatography-8-4", "S1-A1-mass spectrometry-9-4"),
        models.Arc("S1-A1-chromatography-8-5", "S1-A1-poolC-10"),
        models.Arc("S1-A1-chromatography-8-6", "S1-A1-poolC-10"),
        models.Arc("S1-A1-chromatography-8-7", "S1-A1-mass spectrometry-9-7"),
        models.Arc("S1-A1-chromatography-8-8", "S1-A1-mass spectrometry-9-8"),
        models.Arc("S1-A1-chromatography-8-9", "S1-A1-poolE-10"),
        models.Arc("S1-A1-chromatography-8-10", "S1-A1-poolE-10"),
        models.Arc("S1-A1-chromatography-8-11", "S1-A1-poolF-10"),
        models.Arc("S1-A1-chromatography-8-12", "S1-A1-poolF-10"),
        models.Arc("S1-A1-poolA-10", "S1-A1-poolA.raw-COL11"),
        models.Arc("S1-A1-mass spectrometry-9-3", "S1-A1-poolB.raw-COL11"),
        models.Arc("S1-A1-mass spectrometry-9-4", "S1-A1-poolB.raw-COL11"),
        models.Arc("S1-A1-poolC-10", "S1-A1-Empty Raw Spectral Data File-11-5"),
        models.Arc("S1-A1-mass spectrometry-9-7",
                   "S1-A1-Empty Raw Spectral Data File-11-7"),
        models.Arc("S1-A1-mass spectrometry-9-8",
                   "S1-A1-Empty Raw Spectral Data File-11-8"),
        models.Arc("S1-A1-poolE-10", "S1-A1-poolE.raw-COL11"),
        models.Arc("S1-A1-poolF-10", "S1-A1-Empty Raw Spectral Data File-11-11"),
        models.Arc("S1-A1-poolA.raw-COL11", "S1-A1-data transformation-12-1"),
        models.Arc("S1-A1-poolB.raw-COL11", "S1-A1-data transformation-12-3"),
        models.Arc("S1-A1-Empty Raw Spectral Data File-11-5",
                   "S1-A1-data transformation-12-5"),
        models.Arc("S1-A1-Empty Raw Spectral Data File-11-7",
                   "S1-A1-data transformation-12-7"),
        models.Arc("S1-A1-Empty Raw Spectral Data File-11-8",
                   "S1-A1-data transformation-12-8"),
        models.Arc("S1-A1-poolE.raw-COL11", "S1-A1-data transformation-12-9"),
        models.Arc("S1-A1-Empty Raw Spectral Data File-11-11",
                   "S1-A1-data analysis-13"),
        models.Arc("S1-A1-data transformation-12-1", "S1-A1-results.csv-COL14"),
        models.Arc("S1-A1-data transformation-12-3", "S1-A1-results.csv-COL14"),
        models.Arc("S1-A1-data transformation-12-5", "S1-A1-results.csv-COL14"),
        models.Arc("S1-A1-data transformation-12-7", "S1-A1-results.csv-COL14"),
        models.Arc("S1-A1-data transformation-12-8", "S1-A1-results.csv-COL14"),
        models.Arc("S1-A1-data transformation-12-9",
                   "S1-A1-Empty Derived Data File-14-9"),
        models.Arc("S1-A1-data analysis-13", "S1-A1-results.csv-COL14"),
    )
    # Arc ordering is not guaranteed, hence the sorted comparison.
    assert sorted(expected) == sorted(assay.arcs)
def test_assay_reader_small_assay(small_investigation_file, small_assay_file):
    """Use ``AssayReader`` to read in small assay file."""
    # Load investigation (tested elsewhere)
    investigation = InvestigationReader.from_stream(
        small_investigation_file).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()
    # Check warnings
    assert 1 == len(record)
    # Create new row reader and check read headers
    reader = AssayReader.from_stream("S1", "A1", small_assay_file)
    assert 9 == len(reader.header)
    # Read assay (reading itself emits a warning here)
    with pytest.warns(IsaWarning) as record:
        assay = reader.read()
        AssayValidator(investigation, investigation.studies[0],
                       investigation.studies[0].assays[0], assay).validate()
    # Check warnings
    assert 1 == len(record)
    # Check results: overall graph sizes
    assert os.path.normpath(str(assay.file)).endswith(
        os.path.normpath("data/i_small/a_small.txt"))
    assert 9 == len(assay.header)
    assert 9 == len(assay.materials)
    assert 5 == len(assay.processes)
    assert 13 == len(assay.arcs)
    # Sample materials (normal and tumor)
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-N1",
        "0815-N1",
        None,
        (),
        (),
        (),
        None,
        [table_headers.SAMPLE_NAME],
    )
    assert expected == assay.materials["S1-sample-0815-N1"]
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-T1",
        "0815-T1",
        None,
        (),
        (),
        (),
        None,
        [table_headers.SAMPLE_NAME],
    )
    assert expected == assay.materials["S1-sample-0815-T1"]
    # Raw data file materials (R1/R2 for each sample)
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL6",
        "0815-N1-DNA1-WES1_L???_???_R1.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL6"]
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
        "0815-N1-DNA1-WES1_L???_???_R2.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL7"]
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-T1-DNA1-WES1_L???_???_R1.fastq.gz-COL6",
        "0815-T1-DNA1-WES1_L???_???_R1.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-T1-DNA1-WES1_L???_???_R1.fastq.gz-COL6"]
    expected = models.Material(
        "Raw Data File",
        "S1-A1-0815-T1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
        "0815-T1-DNA1-WES1_L???_???_R2.fastq.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.RAW_DATA_FILE],
    )
    assert expected == assay.materials[
        "S1-A1-0815-T1-DNA1-WES1_L???_???_R2.fastq.gz-COL7"]
    # Derived data file (shared somatic VCF)
    expected = models.Material(
        "Derived Data File",
        "S1-A1-0815-somatic.vcf.gz-COL9",
        "0815-somatic.vcf.gz",
        None,
        (),
        (),
        (),
        None,
        [table_headers.DERIVED_DATA_FILE],
    )
    assert expected == assay.materials["S1-A1-0815-somatic.vcf.gz-COL9"]
    # Library preparation processes (one per sample)
    expected = models.Process(
        "library preparation",
        "S1-A1-library preparation-2-1",
        None,
        None,
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF],
    )
    assert expected == assay.processes["S1-A1-library preparation-2-1"]
    expected = models.Process(
        "library preparation",
        "S1-A1-library preparation-2-2",
        None,
        None,
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF],
    )
    assert expected == assay.processes["S1-A1-library preparation-2-2"]
    # Sequencing processes (one per sample)
    expected = models.Process(
        "nucleic acid sequencing",
        "S1-A1-0815-N1-DNA1-WES1-5",
        "0815-N1-DNA1-WES1",
        "Assay Name",
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF, table_headers.ASSAY_NAME],
    )
    assert expected == assay.processes["S1-A1-0815-N1-DNA1-WES1-5"]
    expected = models.Process(
        "nucleic acid sequencing",
        "S1-A1-0815-T1-DNA1-WES1-5",
        "0815-T1-DNA1-WES1",
        "Assay Name",
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF, table_headers.ASSAY_NAME],
    )
    assert expected == assay.processes["S1-A1-0815-T1-DNA1-WES1-5"]
    # Arcs: both sample branches converge on the somatic variant calling step
    expected = (
        models.Arc("S1-sample-0815-N1", "S1-A1-library preparation-2-1"),
        models.Arc("S1-A1-library preparation-2-1", "S1-A1-0815-N1-DNA1-COL3"),
        models.Arc("S1-A1-0815-N1-DNA1-COL3", "S1-A1-0815-N1-DNA1-WES1-5"),
        models.Arc("S1-A1-0815-N1-DNA1-WES1-5",
                   "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL6"),
        models.Arc(
            "S1-A1-0815-N1-DNA1-WES1_L???_???_R1.fastq.gz-COL6",
            "S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
        ),
        models.Arc("S1-A1-0815-N1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
                   "S1-A1-somatic variant calling-1-8"),
        models.Arc("S1-A1-somatic variant calling-1-8",
                   "S1-A1-0815-somatic.vcf.gz-COL9"),
        models.Arc("S1-sample-0815-T1", "S1-A1-library preparation-2-2"),
        models.Arc("S1-A1-library preparation-2-2", "S1-A1-0815-T1-DNA1-COL3"),
        models.Arc("S1-A1-0815-T1-DNA1-COL3", "S1-A1-0815-T1-DNA1-WES1-5"),
        models.Arc("S1-A1-0815-T1-DNA1-WES1-5",
                   "S1-A1-0815-T1-DNA1-WES1_L???_???_R1.fastq.gz-COL6"),
        models.Arc(
            "S1-A1-0815-T1-DNA1-WES1_L???_???_R1.fastq.gz-COL6",
            "S1-A1-0815-T1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
        ),
        models.Arc("S1-A1-0815-T1-DNA1-WES1_L???_???_R2.fastq.gz-COL7",
                   "S1-A1-somatic variant calling-1-8"),
    )
    assert expected == assay.arcs
def test_header_exception_unknown_header(assay_file_exception_unknown_header):
    """An unrecognized column header must raise a parse error."""
    with pytest.raises(ParseIsatabException) as excinfo:
        AssayReader.from_stream("S1", "A1", assay_file_exception_unknown_header)
    expected = 'Header "Test Name" unknown, processing unclear'
    assert str(excinfo.value) == expected