def get(self, project_uuid=None) -> IsaData:
    # Fetch the raw ISA-tab payload (TSV text plus paths, wrapped in a dict).
    raw_data = self.get_raw(project_uuid)
    # Parse the investigation from the in-memory TSV.
    investigation = InvestigationReader.from_stream(
        input_file=io.StringIO(raw_data["investigation"]["tsv"]),
        filename=raw_data["investigation"]["path"],
    ).read()
    # Parse all studies, keyed by their file path.
    studies = {
        path: StudyReader.from_stream(
            study_id=path, input_file=io.StringIO(details["tsv"]), filename=path
        ).read()
        for path, details in raw_data["studies"].items()
    }
    if len(studies) > 1:  # pragma: nocover
        raise UnsupportedIsaTabFeatureException("More than one study found!")
    study = list(studies.values())[0]
    # Parse all assays of the single study, keyed by their file path.
    assays = {
        path: AssayReader.from_stream(
            study_id=study.file,
            assay_id=path,
            input_file=io.StringIO(details["tsv"]),
            filename=path,
        ).read()
        for path, details in raw_data["assays"].items()
    }
    return IsaData(investigation, raw_data["investigation"]["path"], studies, assays)
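
# A minimal sketch of the ``raw_data`` payload shape that ``get`` consumes
# above. The keys ("investigation", "studies", "assays", each with "path" and
# "tsv" entries) follow directly from the code; the concrete values below are
# hypothetical and only illustrate the structure ``get_raw`` is assumed to
# return.
_example_raw_data = {
    "investigation": {"path": "i_example.txt", "tsv": "ONTOLOGY SOURCE REFERENCE\n..."},
    "studies": {"s_example.txt": {"tsv": "Source Name\tSample Name\n..."}},
    "assays": {"a_example.txt": {"tsv": "Sample Name\tRaw Data File\n..."}},
}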
def run_warnings_caught(args):
    # Read investigation
    investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
    args.input_investigation_file.close()
    # Validate investigation
    InvestigationValidator(investigation).validate()
    # Read studies and assays
    path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
    studies = {}
    assays = {}
    for s, study_info in enumerate(investigation.studies):
        if study_info.info.path:
            with open(os.path.join(path_in, study_info.info.path), "rt") as inputf:
                studies[s] = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        if study_info.assays:
            assays[s] = {}
            for a, assay_info in enumerate(study_info.assays):
                if assay_info.path:
                    with open(os.path.join(path_in, assay_info.path), "rt") as inputf:
                        assays[s][a] = AssayReader.from_stream(
                            "S{}".format(s + 1), "A{}".format(a + 1), inputf
                        ).read()
    # Validate studies and assays
    for s, study_info in enumerate(investigation.studies):
        if study_info.info.path:
            StudyValidator(investigation, study_info, studies[s]).validate()
            for a, assay_info in enumerate(study_info.assays):
                if assay_info.path:
                    AssayValidator(investigation, study_info, assay_info, assays[s][a]).validate()
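
# The name ``run_warnings_caught`` suggests the caller collects ISA warnings
# around it. A minimal sketch of such a wrapper, assuming the surrounding CLI
# re-emits collected warnings after parsing; this wrapper is hypothetical and
# not part of the original snippet.
import warnings


def run_with_warnings_reported(args):
    with warnings.catch_warnings(record=True) as records:
        warnings.simplefilter("always")
        run_warnings_caught(args)
    for record in records:  # re-emit what was collected during parsing
        warnings.showwarning(record.message, record.category, record.filename, record.lineno)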
def load_investigation(i_path: typing.Union[str, Path]) -> IsaData:
    """Load investigation information from investigation files.

    Study and assay files are expected to be next to the investigation file.
    """
    i_path = Path(i_path)
    with i_path.open("rt") as i_file:
        investigation = InvestigationReader.from_stream(
            input_file=i_file, filename=i_path.name
        ).read()
    studies = {}
    assays = {}
    for study in investigation.studies:
        with (i_path.parent / study.info.path).open() as s_file:
            studies[study.info.path.name] = StudyReader.from_stream(
                study_id=study.info.path.name, input_file=s_file
            ).read()
        for assay in study.assays:
            with (i_path.parent / assay.path).open() as a_file:
                assays[assay.path.name] = AssayReader.from_stream(
                    study_id=studies[study.info.path.name].file.name,
                    assay_id=assay.path.name,
                    input_file=a_file,
                ).read()
    return IsaData(investigation, str(i_path), studies, assays)
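
# A short usage sketch for ``load_investigation``; the file name is
# hypothetical, and we assume ``IsaData`` exposes the parsed studies and
# assays under ``studies``/``assays`` attributes keyed by file name, matching
# the constructor call above.
isa_data = load_investigation("i_minimal.txt")
for s_name in isa_data.studies:
    print("parsed study:", s_name)
for a_name in isa_data.assays:
    print("parsed assay:", a_name)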
def test_study_reader_minimal_study_iostring2(minimal_investigation_file, minimal_study_file):
    # Load investigation (tested elsewhere)
    stringio = io.StringIO(minimal_investigation_file.read())
    investigation = InvestigationReader.from_stream(stringio).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()
    # Check warnings
    assert 2 == len(record)
    # Create new study reader and read from StringIO with no filename indicated
    stringio = io.StringIO(minimal_study_file.read())
    reader = StudyReader.from_stream("S1", stringio)
    assert 3 == len(reader.header)
    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()
    # Check results
    assert str(study.file) == "<no file>"
    assert 3 == len(study.header)
    assert 2 == len(study.materials)
    assert 1 == len(study.processes)
    assert 2 == len(study.arcs)
def run(args):
    with open(args.investigation_file, "rt") as inputf:
        investigation = InvestigationReader.from_stream(inputf).read()
    path = os.path.dirname(args.investigation_file)
    print("digraph investigation {", file=args.output_file)
    print('  rankdir = "LR";', file=args.output_file)
    for s, study_info in enumerate(investigation.studies):
        with open(os.path.join(path, study_info.info.path), "rt") as inputf:
            study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        print("  /* study {} */".format(study_info.info.path), file=args.output_file)
        print("  subgraph clusterStudy{} {{".format(s), file=args.output_file)
        print('    label = "Study: {}"'.format(study_info.info.path), file=args.output_file)
        print_dot(study, args.output_file)
        print("  }", file=args.output_file)
        for a, assay_info in enumerate(study_info.assays):
            with open(os.path.join(path, assay_info.path), "rt") as inputf:
                assay = AssayReader.from_stream(
                    "S{}".format(s + 1), "A{}".format(a + 1), inputf
                ).read()
            print("  /* assay {} */".format(assay_info.path), file=args.output_file)
            print("  subgraph clusterAssayS{}A{} {{".format(s, a), file=args.output_file)
            print('    label = "Assay: {}"'.format(assay_info.path), file=args.output_file)
            print_dot(assay, args.output_file)
            print("  }", file=args.output_file)
    print("}", file=args.output_file)
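
# A hedged invocation sketch for the dot exporter above: the args object only
# needs ``investigation_file`` (a path string) and ``output_file`` (an open
# text stream). The file name is hypothetical, and ``print_dot`` is assumed to
# be defined elsewhere in the same module.
import argparse
import sys

run(argparse.Namespace(investigation_file="i_minimal.txt", output_file=sys.stdout))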
def _parse_write_assert(investigation_file, tmp_path, quote=None):
    # Load investigation
    investigation = InvestigationReader.from_stream(investigation_file).read()
    InvestigationValidator(investigation).validate()
    directory = os.path.normpath(os.path.dirname(investigation_file.name))
    # Iterate studies
    for s, study_info in enumerate(investigation.studies):
        # Load study
        path_in = os.path.join(directory, study_info.info.path)
        with open(path_in, "rt") as inputf:
            study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        StudyValidator(investigation, study_info, study).validate()
        # Write study to temporary file
        path_out = tmp_path / study_info.info.path
        with open(path_out, "wt", newline="") as file:
            StudyWriter.from_stream(study, file, quote=quote).write()
        assert filecmp.cmp(path_in, path_out, shallow=False)
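
# A minimal sketch of how ``_parse_write_assert`` would be driven from a
# pytest test: parse a study, write it back out, and assert the round trip is
# byte-identical. The fixture name is hypothetical; ``tmp_path`` is pytest's
# built-in temporary directory fixture.
def test_study_round_trip(minimal_investigation_file, tmp_path):
    _parse_write_assert(minimal_investigation_file, tmp_path)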
def execute_sync_case_list_job(job):
    """Synchronise cases within a project with the upstream SODAR site."""
    job.mark_start()
    timeline = get_backend_api("timeline_backend")
    if timeline:
        tl_event = timeline.add_event(
            project=job.project,
            app_name="variants",
            user=job.bg_job.user,
            event_name="project_sync_upstream",
            description="synchronising with upstream SODAR",
            status_type="INIT",
        )
    try:
        sources = [s for s in RemoteSite.objects.all() if s.mode == "SOURCE"]
        if len(sources) != 1:
            raise RuntimeError(
                "Expected exactly one remote source site but there were %d" % len(sources)
            )
        else:
            source = sources[0]
        project = CaseAwareProject.objects.get(pk=job.project.pk)
        r = requests.get(
            URL_TEMPLATE
            % {"url": source.url, "project_uuid": project.sodar_uuid, "secret": source.secret}
        )

        def get_field(fields, key):
            """Helper for easily obtaining value from an ISA-tab field."""
            return ";".join(fields.get(key, ()))

        # Mapping from sex in ISA-tab to sex in PLINK PED.
        map_sex = {"male": 1, "female": 2}
        # Mapping from disease state in ISA-tab to affected status in PLINK PED.
        map_affected = {"affected": 2, "unaffected": 1}
        # Parse investigation and all studies from ISA-tab (wrapped in JSON).
        upstream_pedigree = {}
        isa_json = r.json()
        # investigation = InvestigationReader.from_stream(
        #     io.StringIO(isa_json["investigation"]["tsv"]),
        #     filename=isa_json["investigation"]["path"],
        # ).read()
        for s_path, s_data in isa_json["studies"].items():
            study = StudyReader.from_stream(
                s_path, io.StringIO(s_data["tsv"]), filename=s_path
            ).read()
            job.add_log_entry(study)
            # Compress study arcs (map source to sample); easy because the study
            # graph has a depth of only one.
            arc_map = {arc.tail: arc for arc in study.arcs}
            for arc in list(arc_map.values()):  # NB: copy intentionally
                if arc.head in arc_map:
                    arc_map[arc.tail] = arc_map[arc.head]
            job.add_log_entry(arc_map)
            # Actually parse out individuals.
            source_samples = {}
            for arc in study.arcs:
                if arc.tail in study.materials and study.materials[arc.tail].type == "Source Name":
                    source_samples.setdefault(arc.tail, []).append(
                        study.materials[arc_map[arc.tail].head]
                    )
            for material in study.materials.values():
                if material.type == "Source Name":
                    if len(source_samples[material.unique_name]) > 1:
                        job.add_log_entry(
                            "WARNING: more than one sample for source %s" % material.name,
                            log_level=LOG_LEVEL_WARNING,
                        )
                    fields = {c.name: c.value for c in material.characteristics}
                    job.add_log_entry("fields = %s" % fields)
                    member = PedigreeMember(
                        family=get_field(fields, "Family"),
                        name=material.name,
                        father=get_field(fields, "Father"),
                        mother=get_field(fields, "Mother"),
                        sex=map_sex.get(get_field(fields, "Sex"), 0),
                        affected=map_affected.get(get_field(fields, "Disease status"), 0),
                        sample_name=source_samples[material.unique_name][0].name,
                    )
                    job.add_log_entry("new member: %s" % member)
                    upstream_pedigree[material.name] = member
        compare_to_upstream(project, upstream_pedigree, job)
    except Exception as e:
        job.mark_error("%s: %s" % (type(e).__name__, e))
        if timeline:
            tl_event.set_status("FAILED", "syncing with upstream SODAR failed")
        raise
    else:
        job.mark_success()
        if timeline:
            tl_event.set_status("OK", "syncing with upstream SODAR successful")
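
# To illustrate the arc compression used above: with arcs source -> collection
# and collection -> sample, one pass rewires the source's map entry so that its
# head is the sample. A self-contained toy version; ``Arc`` here is a simple
# stand-in for altamisa's arc model, and the names are made up.
import collections

Arc = collections.namedtuple("Arc", "tail head")
arcs = [Arc("source-0815", "collection-1"), Arc("collection-1", "sample-0815-N1")]
arc_map = {arc.tail: arc for arc in arcs}
for arc in list(arc_map.values()):  # copy, since entries are rewritten in place
    if arc.head in arc_map:
        arc_map[arc.tail] = arc_map[arc.head]
assert arc_map["source-0815"].head == "sample-0815-N1"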
def test_study_reader_minimal_study(minimal_investigation_file, minimal_study_file):
    """Use ``StudyReader`` to read in minimal study file.

    Using the ``StudyReader`` instead of the ``StudyRowReader`` gives us
    ``Study`` objects instead of just the row-wise nodes.
    """
    # Load investigation (tested elsewhere)
    investigation = InvestigationReader.from_stream(minimal_investigation_file).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()
    # Check warnings
    assert 2 == len(record)
    # Create new row reader and check read headers
    reader = StudyReader.from_stream("S1", minimal_study_file)
    assert 3 == len(reader.header)
    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()
    # Check results
    assert os.path.normpath(str(study.file)).endswith(
        os.path.normpath("data/i_minimal/s_minimal.txt")
    )
    assert 3 == len(study.header)
    assert 2 == len(study.materials)
    assert 1 == len(study.processes)
    assert 2 == len(study.arcs)
    expected = models.Material(
        "Source Name", "S1-source-0815", "0815", None, (), (), (), None,
        [table_headers.SOURCE_NAME],
    )
    assert expected == study.materials["S1-source-0815"]
    expected = models.Material(
        "Sample Name", "S1-sample-0815-N1", "0815-N1", None, (), (), (), None,
        [table_headers.SAMPLE_NAME],
    )
    assert expected == study.materials["S1-sample-0815-N1"]
    expected = models.Process(
        "sample collection", "S1-sample collection-2-1", None, None, None, None,
        (), (), None, None, None, [table_headers.PROTOCOL_REF],
    )
    assert expected == study.processes["S1-sample collection-2-1"]
    expected = (
        models.Arc("S1-source-0815", "S1-sample collection-2-1"),
        models.Arc("S1-sample collection-2-1", "S1-sample-0815-N1"),
    )
    assert expected == study.arcs
def test_study_reader_small_study(small_investigation_file, small_study_file):
    """Use ``StudyReader`` to read in small study file."""
    # Load investigation (tested elsewhere)
    with pytest.warns(IsaWarning) as record:
        investigation = InvestigationReader.from_stream(small_investigation_file).read()
        InvestigationValidator(investigation).validate()
    # Check warnings
    assert 2 == len(record)
    # Create new row reader and check read headers
    reader = StudyReader.from_stream("S1", small_study_file)
    assert 13 == len(reader.header)
    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()
    # Check results
    assert os.path.normpath(str(study.file)).endswith(
        os.path.normpath("data/i_small/s_small.txt")
    )
    assert 13 == len(study.header)
    assert 9 == len(study.materials)
    assert 5 == len(study.processes)
    assert 10 == len(study.arcs)
    headers_source = [
        table_headers.SOURCE_NAME,
        table_headers.CHARACTERISTICS + "[organism]",
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
        table_headers.CHARACTERISTICS + "[age]",
        table_headers.UNIT,
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
    ]
    headers_collection = [
        table_headers.PROTOCOL_REF,
        table_headers.PARAMETER_VALUE + "[instrument]",
        table_headers.PERFORMER,
        table_headers.DATE,
    ]
    headers_sample = [
        table_headers.SAMPLE_NAME,
        table_headers.CHARACTERISTICS + "[status]",
        table_headers.FACTOR_VALUE + "[treatment]",
    ]
    unit = models.OntologyTermRef(
        name="day",
        accession="http://purl.obolibrary.org/obo/UO_0000033",
        ontology_name="UO",
    )
    characteristics1 = (
        models.Characteristics(
            name="organism",
            value=[
                models.OntologyTermRef(
                    name="Mus musculus",
                    accession="http://purl.bioontology.org/ontology/NCBITAXON/10090",
                    ontology_name="NCBITAXON",
                )
            ],
            unit=None,
        ),
        models.Characteristics(name="age", value=["90"], unit=unit),
    )
    characteristics2 = (
        models.Characteristics(
            name="organism", value=[models.OntologyTermRef("Mus musculus", "", "")], unit=None
        ),
        models.Characteristics(name="age", value=[""], unit=unit),
    )
    characteristics3 = (
        models.Characteristics(
            name="organism", value=[models.OntologyTermRef(None, None, None)], unit=None
        ),
        models.Characteristics(name="age", value=["150"], unit=unit),
    )
    expected = models.Material(
        "Source Name", "S1-source-0815", "0815", None,
        characteristics1, (), (), None, headers_source,
    )
    assert expected == study.materials["S1-source-0815"]
    expected = models.Material(
        "Source Name", "S1-source-0816", "0816", None,
        characteristics2, (), (), None, headers_source,
    )
    assert expected == study.materials["S1-source-0816"]
    expected = models.Material(
        "Source Name", "S1-source-0817", "0817", None,
        characteristics3, (), (), None, headers_source,
    )
    assert expected == study.materials["S1-source-0817"]
    expected = models.Material(
        "Sample Name", "S1-sample-0815-N1", "0815-N1", None,
        (models.Characteristics("status", ["0"], None),),
        (),
        (models.FactorValue("treatment", "yes", None),),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0815-N1"]
    expected = models.Material(
        "Sample Name", "S1-sample-0815-T1", "0815-T1", None,
        (models.Characteristics("status", ["2"], None),),
        (),
        (models.FactorValue("treatment", "", None),),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0815-T1"]
    expected = models.Material(
        "Sample Name", "S1-sample-0816-T1", "0816-T1", None,
        (models.Characteristics("status", ["1"], None),),
        (),
        (models.FactorValue("treatment", "yes", None),),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0816-T1"]
    expected = models.Material(
        "Sample Name", "S1-Empty Sample Name-13-5", "", None,
        (models.Characteristics("status", [""], None),),
        (),
        (models.FactorValue("treatment", "", None),),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-Empty Sample Name-13-5"]
    expected = models.Process(
        "sample collection", "S1-sample collection-9-2", None, None,
        date(2018, 2, 2), "John Doe",
        (models.ParameterValue("instrument", ["scalpel"], None),),
        (), None, None, None, headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-2"]
    expected = models.Process(
        "sample collection", "S1-sample collection-9-3", None, None,
        date(2018, 2, 2), "John Doe",
        (models.ParameterValue("instrument", ["scalpel type A", "scalpel type B"], None),),
        (), None, None, None, headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-3"]
    expected = models.Process(
        "sample collection", "S1-sample collection-9-4", None, None,
        date(2018, 2, 2), "John Doe",
        (models.ParameterValue("instrument", ["scalpel"], None),),
        (), None, None, None, headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-4"]
    expected = (
        models.Arc("S1-source-0814", "S1-sample collection-9-1"),
        models.Arc("S1-sample collection-9-1", "S1-sample-0814-N1"),
        models.Arc("S1-source-0815", "S1-sample collection-9-2"),
        models.Arc("S1-sample collection-9-2", "S1-sample-0815-N1"),
        models.Arc("S1-source-0815", "S1-sample collection-9-3"),
        models.Arc("S1-sample collection-9-3", "S1-sample-0815-T1"),
        models.Arc("S1-source-0816", "S1-sample collection-9-4"),
        models.Arc("S1-sample collection-9-4", "S1-sample-0816-T1"),
        models.Arc("S1-source-0817", "S1-sample collection-9-5"),
        models.Arc("S1-sample collection-9-5", "S1-Empty Sample Name-13-5"),
    )
    assert expected == study.arcs
def fetch_remote_pedigree(source, project, add_log_entry=_nolog):
    """Fetch pedigree (dict of ``PedigreeMember``) from remote site ``source``."""
    r = requests.get(
        URL_TEMPLATE
        % {"url": source.url, "project_uuid": project.sodar_uuid, "secret": source.secret}
    )
    # Mapping from sex in ISA-tab to sex in PLINK PED.
    map_sex = {"male": 1, "female": 2}
    # Mapping from disease state in ISA-tab to affected status in PLINK PED.
    map_affected = {"affected": 2, "unaffected": 1}
    # Parse investigation and all studies from ISA-tab (wrapped in JSON).
    remote_pedigree = {}
    isa_json = r.json()
    for s_path, s_data in isa_json["studies"].items():
        study = StudyReader.from_stream(
            s_path, io.StringIO(s_data["tsv"]), filename=s_path
        ).read()
        add_log_entry(study)
        # Compress study arcs (map source to sample); easy because the study
        # graph has a depth of only one.
        arc_map = {arc.tail: arc for arc in study.arcs}
        for arc in list(arc_map.values()):  # NB: copy intentionally
            if arc.head in arc_map:
                arc_map[arc.tail] = arc_map[arc.head]
        add_log_entry(arc_map)
        # Actually parse out individuals.
        source_samples = {}
        for arc in study.arcs:
            if arc.tail in study.materials and study.materials[arc.tail].type == "Source Name":
                source_samples.setdefault(arc.tail, []).append(
                    study.materials[arc_map[arc.tail].head]
                )
        for material in study.materials.values():
            if material.type == "Source Name":
                if len(source_samples[material.unique_name]) > 1:
                    add_log_entry(
                        "WARNING: more than one sample for source %s" % material.name,
                        log_level=LOG_LEVEL_WARNING,
                    )
                fields = {c.name: c.value for c in material.characteristics}
                add_log_entry("fields = %s" % fields)
                member = PedigreeMember(
                    family=_isa_helper_get_field(fields, "Family"),
                    name=material.name,
                    father=_isa_helper_get_field(fields, "Father"),
                    mother=_isa_helper_get_field(fields, "Mother"),
                    sex=map_sex.get(_isa_helper_get_field(fields, "Sex"), 0),
                    affected=map_affected.get(_isa_helper_get_field(fields, "Disease status"), 0),
                    sample_name=source_samples[material.unique_name][0].name,
                    hpo_terms=_isa_helper_get_term_field(fields, "HPO terms"),
                    orphanet_diseases=_isa_helper_get_term_field(fields, "Orphanet disease"),
                    omim_diseases=_isa_helper_get_term_field(fields, "OMIM disease"),
                )
                add_log_entry("new member: %s" % member)
                remote_pedigree[material.name] = member
    return remote_pedigree
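
# The ``_isa_helper_*`` functions referenced above are not shown in this
# snippet. A minimal sketch: ``_isa_helper_get_field`` is modeled on the inline
# ``get_field`` helper from ``execute_sync_case_list_job`` earlier, while the
# shape of ``_isa_helper_get_term_field`` is an assumption about how ontology
# term references (which carry a ``name`` attribute in altamisa) would be
# collected.
def _isa_helper_get_field(fields, key):
    """Join the plain string values of an ISA-tab characteristics field."""
    return ";".join(fields.get(key, ()))


def _isa_helper_get_term_field(fields, key):
    """Collect term names from a characteristics field (assumed value shape)."""
    return [getattr(term, "name", term) for term in fields.get(key, ())]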