Example #1
def get(self, project_uuid=None) -> IsaData:
    raw_data = self.get_raw(project_uuid)
    investigation = InvestigationReader.from_stream(
        input_file=io.StringIO(raw_data["investigation"]["tsv"]),
        filename=raw_data["investigation"]["path"],
    ).read()
    studies = {
        path: StudyReader.from_stream(
            study_id=path,
            input_file=io.StringIO(details["tsv"]),
            filename=path,
        ).read()
        for path, details in raw_data["studies"].items()
    }
    if len(studies) > 1:  # pragma: nocover
        raise UnsupportedIsaTabFeatureException("More than one study found!")
    study = list(studies.values())[0]
    assays = {
        path: AssayReader.from_stream(
            study_id=study.file,
            assay_id=path,
            input_file=io.StringIO(details["tsv"]),
            filename=path,
        ).read()
        for path, details in raw_data["assays"].items()
    }
    return IsaData(investigation, raw_data["investigation"]["path"], studies, assays)
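
All examples on this page use the ISA-tab readers and validators from the altamisa library. A minimal import block the snippets assume is sketched below (import paths follow altamisa's public API, but check them against your installed version; names such as IsaData and UnsupportedIsaTabFeatureException are project-level definitions, not part of altamisa):

import io
import os

from altamisa.isatab import (
    AssayReader,
    AssayValidator,
    InvestigationReader,
    InvestigationValidator,
    StudyReader,
    StudyValidator,
)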
Example #2
def run_warnings_caught(args):
    # Read investigation
    investigation = InvestigationReader.from_stream(args.input_investigation_file).read()
    args.input_investigation_file.close()

    # Validate investigation
    InvestigationValidator(investigation).validate()

    # Read studies and assays
    path_in = os.path.normpath(os.path.dirname(args.input_investigation_file.name))
    studies = {}
    assays = {}
    for s, study_info in enumerate(investigation.studies):
        if study_info.info.path:
            with open(os.path.join(path_in, study_info.info.path), "rt") as inputf:
                studies[s] = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        if study_info.assays:
            assays[s] = {}
        for a, assay_info in enumerate(study_info.assays):
            if assay_info.path:
                with open(os.path.join(path_in, assay_info.path), "rt") as inputf:
                    assays[s][a] = AssayReader.from_stream(
                        "S{}".format(s + 1), "A{}".format(a + 1), inputf
                    ).read()

    # Validate studies and assays
    for s, study_info in enumerate(investigation.studies):
        if study_info.info.path:
            StudyValidator(investigation, study_info, studies[s]).validate()
        for a, assay_info in enumerate(study_info.assays):
            if assay_info.path:
                AssayValidator(investigation, study_info, assay_info, assays[s][a]).validate()
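
A hedged sketch of how args could be wired up for this entry point; the argument name is inferred from the attributes used above, and the input path is a placeholder:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("input_investigation_file", type=argparse.FileType("rt"))
run_warnings_caught(parser.parse_args(["data/i_minimal/i_minimal.txt"]))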
Example #3
def load_investigation(i_path: typing.Union[str, Path]) -> IsaData:
    """Load investigation information from investigation files.

    Study and assay files are expected to be next to the investigation file.
    """
    i_path = Path(i_path)
    with i_path.open("rt") as i_file:
        investigation = InvestigationReader.from_stream(
            input_file=i_file, filename=i_path.name).read()

    studies = {}
    assays = {}
    for study in investigation.studies:
        with (i_path.parent / study.info.path).open() as s_file:
            studies[study.info.path.name] = StudyReader.from_stream(
                study_id=study.info.path.name, input_file=s_file).read()
            for assay in study.assays:
                with (i_path.parent / assay.path).open() as a_file:
                    assays[assay.path.name] = AssayReader.from_stream(
                        study_id=studies[study.info.path.name].file.name,
                        assay_id=assay.path.name,
                        input_file=a_file,
                    ).read()

    return IsaData(investigation, str(i_path), studies, assays)
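
A possible invocation, assuming the usual ISA-tab layout with study and assay files next to the investigation file (the path is a placeholder, and IsaData is assumed to expose its first field as .investigation):

isa_data = load_investigation("data/i_minimal/i_minimal.txt")
print(isa_data.investigation.studies[0].info.path)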
Example #4
def test_study_reader_minimal_study_iostring2(minimal_investigation_file,
                                              minimal_study_file):
    # Load investigation (tested elsewhere)
    stringio = io.StringIO(minimal_investigation_file.read())
    investigation = InvestigationReader.from_stream(stringio).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()

    # Check warnings
    assert 2 == len(record)

    # Create new study reader and read from StringIO with no filename indicated
    stringio = io.StringIO(minimal_study_file.read())
    reader = StudyReader.from_stream("S1", stringio)
    assert 3 == len(reader.header)

    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()

    # Check results
    assert str(study.file) == "<no file>"
    assert 3 == len(study.header)
    assert 2 == len(study.materials)
    assert 1 == len(study.processes)
    assert 2 == len(study.arcs)
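
The test relies on pytest fixtures that yield open handles to the test data. A minimal sketch of what such a fixture could look like, assuming the paths used elsewhere on this page (the fixture in the actual test suite may differ):

import pytest

@pytest.fixture
def minimal_investigation_file():
    with open("data/i_minimal/i_minimal.txt", "rt") as f:
        yield f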
Example #5
def run(args):
    with open(args.investigation_file, "rt") as inputf:
        investigation = InvestigationReader.from_stream(inputf).read()

    path = os.path.dirname(args.investigation_file)

    print("digraph investigation {", file=args.output_file)
    print('  rankdir = "LR";', file=args.output_file)

    for s, study_info in enumerate(investigation.studies):
        with open(os.path.join(path, study_info.info.path), "rt") as inputf:
            study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        print("  /* study {} */".format(study_info.info.path),
              file=args.output_file)
        print("  subgraph clusterStudy{} {{".format(s), file=args.output_file)
        print('    label = "Study: {}"'.format(study_info.info.path),
              file=args.output_file)
        print_dot(study, args.output_file)
        print("  }", file=args.output_file)

        for a, assay_info in enumerate(study_info.assays):
            with open(os.path.join(path, assay_info.path), "rt") as inputf:
                assay = AssayReader.from_stream("S{}".format(s + 1),
                                                "A{}".format(a + 1),
                                                inputf).read()
            print("  /* assay {} */".format(assay_info.path),
                  file=args.output_file)
            print("  subgraph clusterAssayS{}A{} {{".format(s, a),
                  file=args.output_file)
            print('    label = "Assay: {}"'.format(assay_info.path),
                  file=args.output_file)
            print_dot(assay, args.output_file)
            print("  }", file=args.output_file)

    print("}", file=args.output_file)
Example #6
def _parse_write_assert(investigation_file, tmp_path, quote=None):
    # Load investigation
    investigation = InvestigationReader.from_stream(investigation_file).read()
    InvestigationValidator(investigation).validate()
    directory = os.path.normpath(os.path.dirname(investigation_file.name))
    # Iterate studies
    for s, study_info in enumerate(investigation.studies):
        # Load study
        path_in = os.path.join(directory, study_info.info.path)
        with open(path_in, "rt") as inputf:
            study = StudyReader.from_stream("S{}".format(s + 1), inputf).read()
        StudyValidator(investigation, study_info, study).validate()
        # Write study to temporary file
        path_out = tmp_path / study_info.info.path
        with open(path_out, "wt", newline="") as file:
            StudyWriter.from_stream(study, file, quote=quote).write()
        assert filecmp.cmp(path_in, path_out, shallow=False)
Example #7
def execute_sync_case_list_job(job):
    """Synchronise cases within a project with the upstream SODAR site."""
    job.mark_start()
    timeline = get_backend_api("timeline_backend")
    if timeline:
        tl_event = timeline.add_event(
            project=job.project,
            app_name="variants",
            user=job.bg_job.user,
            event_name="project_sync_upstream",
            description="sychronising with upstream SODAR",
            status_type="INIT",
        )
    try:
        sources = [s for s in RemoteSite.objects.all() if s.mode == "SOURCE"]
        if len(sources) != 1:
            raise RuntimeError(
                "Expected exactly one remote source site but there were %d" % len(sources)
            )
        else:
            source = sources[0]
        project = CaseAwareProject.objects.get(pk=job.project.pk)
        r = requests.get(
            URL_TEMPLATE
            % {"url": source.url, "project_uuid": project.sodar_uuid, "secret": source.secret}
        )

        def get_field(fields, key):
            """Helper for easily obtaining value from an ISA-tab field."""
            return ";".join(fields.get(key, ()))

        # Mapping from sex in ISA-tab to sex in PLINK PED.
        map_sex = {"male": 1, "female": 2}
        # Mapping from disease state in ISA-tab to affected status in PLINK PED.
        map_affected = {"affected": 2, "unaffected": 1}

        # Parse investigation and all studies from ISA-tab (wrapped in JSON).
        upstream_pedigree = {}
        isa_json = r.json()
        # investigation = InvestigationReader.from_stream(
        #     io.StringIO(isa_json["investigation"]["tsv"]),
        #     filename=isa_json["investigation"]["path"],
        # ).read()
        for s_path, s_data in isa_json["studies"].items():
            study = StudyReader.from_stream(
                s_path, io.StringIO(s_data["tsv"]), filename=s_path
            ).read()
            job.add_log_entry(study)
            # Compress study arcs (map source to sample); easy because the study graph is only one process deep.
            arc_map = {arc.tail: arc for arc in study.arcs}
            for arc in list(arc_map.values()):  # NB: copy intentionally
                if arc.head in arc_map:
                    arc_map[arc.tail] = arc_map[arc.head]
            job.add_log_entry(arc_map)
            # Actually parse out individuals.
            source_samples = {}
            for arc in study.arcs:
                if arc.tail in study.materials and study.materials[arc.tail].type == "Source Name":
                    source_samples.setdefault(arc.tail, []).append(
                        study.materials[arc_map[arc.tail].head]
                    )
            for material in study.materials.values():
                if material.type == "Source Name":
                    if len(source_samples[material.unique_name]) > 1:
                        job.add_log_entry(
                            "WARNING: more than one sample for source %s" % material.name,
                            log_level=LOG_LEVEL_WARNING,
                        )
                    fields = {c.name: c.value for c in material.characteristics}
                    job.add_log_entry("fields = %s" % fields)
                    member = PedigreeMember(
                        family=get_field(fields, "Family"),
                        name=material.name,
                        father=get_field(fields, "Father"),
                        mother=get_field(fields, "Mother"),
                        sex=map_sex.get(get_field(fields, "Sex"), 0),
                        affected=map_affected.get(get_field(fields, "Disease status"), 0),
                        sample_name=source_samples[material.unique_name][0].name,
                    )
                    job.add_log_entry("new member: %s" % member)
                    upstream_pedigree[material.name] = member

        compare_to_upstream(project, upstream_pedigree, job)
    except Exception as e:
        job.mark_error("%s: %s" % (type(e).__name__, e))
        if timeline:
            tl_event.set_status("FAILED", "syncing with upstream SODAR failed")
        raise
    else:
        job.mark_success()
        if timeline:
            tl_event.set_status("OK", "syncing with upstream SODAR successful")
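
The "compress study arcs" step above collapses source -> process -> sample chains so that each source maps directly to its sample. A standalone toy demonstration of the same trick, with altamisa's Arc mimicked by a namedtuple (all names hypothetical):

from collections import namedtuple

Arc = namedtuple("Arc", "tail head")
arcs = [Arc("source-1", "collection-1"), Arc("collection-1", "sample-1")]
arc_map = {arc.tail: arc for arc in arcs}
for arc in list(arc_map.values()):  # copy: the dict is mutated in the loop
    if arc.head in arc_map:
        arc_map[arc.tail] = arc_map[arc.head]
assert arc_map["source-1"].head == "sample-1"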
Example #8
def test_study_reader_minimal_study(minimal_investigation_file,
                                    minimal_study_file):
    """Use ``StudyReader`` to read in minimal study file.

    Using the ``StudyReader`` instead of the ``StudyRowReader`` gives us
    ``Study`` objects instead of just the row-wise nodes.
    """
    # Load investigation (tested elsewhere)
    investigation = InvestigationReader.from_stream(
        minimal_investigation_file).read()
    with pytest.warns(IsaWarning) as record:
        InvestigationValidator(investigation).validate()

    # Check warnings
    assert 2 == len(record)

    # Create new row reader and check read headers
    reader = StudyReader.from_stream("S1", minimal_study_file)
    assert 3 == len(reader.header)

    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()

    # Check results
    assert os.path.normpath(str(study.file)).endswith(
        os.path.normpath("data/i_minimal/s_minimal.txt"))
    assert 3 == len(study.header)
    assert 2 == len(study.materials)
    assert 1 == len(study.processes)
    assert 2 == len(study.arcs)

    expected = models.Material("Source Name", "S1-source-0815", "0815", None,
                               (), (), (), None, [table_headers.SOURCE_NAME])
    assert expected == study.materials["S1-source-0815"]
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-N1",
        "0815-N1",
        None,
        (),
        (),
        (),
        None,
        [table_headers.SAMPLE_NAME],
    )
    assert expected == study.materials["S1-sample-0815-N1"]

    expected = models.Process(
        "sample collection",
        "S1-sample collection-2-1",
        None,
        None,
        None,
        None,
        (),
        (),
        None,
        None,
        None,
        [table_headers.PROTOCOL_REF],
    )
    assert expected == study.processes["S1-sample collection-2-1"]

    expected = (
        models.Arc("S1-source-0815", "S1-sample collection-2-1"),
        models.Arc("S1-sample collection-2-1", "S1-sample-0815-N1"),
    )
    assert expected == study.arcs
Example #9
def test_study_reader_small_study(small_investigation_file, small_study_file):
    """Use ``StudyReader`` to read in small study file."""
    # Load investigation (tested elsewhere)
    with pytest.warns(IsaWarning) as record:
        investigation = InvestigationReader.from_stream(
            small_investigation_file).read()
        InvestigationValidator(investigation).validate()

    # Check warnings
    assert 2 == len(record)

    # Create new row reader and check read headers
    reader = StudyReader.from_stream("S1", small_study_file)
    assert 13 == len(reader.header)

    # Read study
    study = reader.read()
    StudyValidator(investigation, investigation.studies[0], study).validate()

    # Check results
    assert os.path.normpath(str(study.file)).endswith(
        os.path.normpath("data/i_small/s_small.txt"))
    assert 13 == len(study.header)
    assert 9 == len(study.materials)
    assert 5 == len(study.processes)
    assert 10 == len(study.arcs)

    headers_source = [
        table_headers.SOURCE_NAME,
        table_headers.CHARACTERISTICS + "[organism]",
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
        table_headers.CHARACTERISTICS + "[age]",
        table_headers.UNIT,
        table_headers.TERM_SOURCE_REF,
        table_headers.TERM_ACCESSION_NUMBER,
    ]
    headers_collection = [
        table_headers.PROTOCOL_REF,
        table_headers.PARAMETER_VALUE + "[instrument]",
        table_headers.PERFORMER,
        table_headers.DATE,
    ]
    headers_sample = [
        table_headers.SAMPLE_NAME,
        table_headers.CHARACTERISTICS + "[status]",
        table_headers.FACTOR_VALUE + "[treatment]",
    ]

    unit = models.OntologyTermRef(
        name="day",
        accession="http://purl.obolibrary.org/obo/UO_0000033",
        ontology_name="UO")

    characteristics1 = (
        models.Characteristics(
            name="organism",
            value=[
                models.OntologyTermRef(
                    name="Mus musculus",
                    accession="http://purl.bioontology.org/ontology/"
                    "NCBITAXON/10090",
                    ontology_name="NCBITAXON",
                )
            ],
            unit=None,
        ),
        models.Characteristics(name="age", value=["90"], unit=unit),
    )
    characteristics2 = (
        models.Characteristics(
            name="organism",
            value=[models.OntologyTermRef("Mus musculus", "", "")],
            unit=None),
        models.Characteristics(name="age", value=[""], unit=unit),
    )
    characteristics3 = (
        models.Characteristics(
            name="organism",
            value=[models.OntologyTermRef(None, None, None)],
            unit=None),
        models.Characteristics(name="age", value=["150"], unit=unit),
    )

    expected = models.Material(
        "Source Name",
        "S1-source-0815",
        "0815",
        None,
        characteristics1,
        (),
        (),
        None,
        headers_source,
    )
    assert expected == study.materials["S1-source-0815"]
    expected = models.Material(
        "Source Name",
        "S1-source-0816",
        "0816",
        None,
        characteristics2,
        (),
        (),
        None,
        headers_source,
    )
    assert expected == study.materials["S1-source-0816"]
    expected = models.Material(
        "Source Name",
        "S1-source-0817",
        "0817",
        None,
        characteristics3,
        (),
        (),
        None,
        headers_source,
    )
    assert expected == study.materials["S1-source-0817"]
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-N1",
        "0815-N1",
        None,
        (models.Characteristics("status", ["0"], None), ),
        (),
        (models.FactorValue("treatment", "yes", None), ),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0815-N1"]
    expected = models.Material(
        "Sample Name",
        "S1-sample-0815-T1",
        "0815-T1",
        None,
        (models.Characteristics("status", ["2"], None), ),
        (),
        (models.FactorValue("treatment", "", None), ),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0815-T1"]
    expected = models.Material(
        "Sample Name",
        "S1-sample-0816-T1",
        "0816-T1",
        None,
        (models.Characteristics("status", ["1"], None), ),
        (),
        (models.FactorValue("treatment", "yes", None), ),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-sample-0816-T1"]
    expected = models.Material(
        "Sample Name",
        "S1-Empty Sample Name-13-5",
        "",
        None,
        (models.Characteristics("status", [""], None), ),
        (),
        (models.FactorValue("treatment", "", None), ),
        None,
        headers_sample,
    )
    assert expected == study.materials["S1-Empty Sample Name-13-5"]

    expected = models.Process(
        "sample collection",
        "S1-sample collection-9-2",
        None,
        None,
        date(2018, 2, 2),
        "John Doe",
        (models.ParameterValue("instrument", ["scalpel"], None), ),
        (),
        None,
        None,
        None,
        headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-2"]
    expected = models.Process(
        "sample collection",
        "S1-sample collection-9-3",
        None,
        None,
        date(2018, 2, 2),
        "John Doe",
        (models.ParameterValue("instrument",
                               ["scalpel type A", "scalpel type B"], None), ),
        (),
        None,
        None,
        None,
        headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-3"]
    expected = models.Process(
        "sample collection",
        "S1-sample collection-9-4",
        None,
        None,
        date(2018, 2, 2),
        "John Doe",
        (models.ParameterValue("instrument", ["scalpel"], None), ),
        (),
        None,
        None,
        None,
        headers_collection,
    )
    assert expected == study.processes["S1-sample collection-9-4"]

    expected = (
        models.Arc("S1-source-0814", "S1-sample collection-9-1"),
        models.Arc("S1-sample collection-9-1", "S1-sample-0814-N1"),
        models.Arc("S1-source-0815", "S1-sample collection-9-2"),
        models.Arc("S1-sample collection-9-2", "S1-sample-0815-N1"),
        models.Arc("S1-source-0815", "S1-sample collection-9-3"),
        models.Arc("S1-sample collection-9-3", "S1-sample-0815-T1"),
        models.Arc("S1-source-0816", "S1-sample collection-9-4"),
        models.Arc("S1-sample collection-9-4", "S1-sample-0816-T1"),
        models.Arc("S1-source-0817", "S1-sample collection-9-5"),
        models.Arc("S1-sample collection-9-5", "S1-Empty Sample Name-13-5"),
    )
    assert expected == study.arcs
Example #10
def fetch_remote_pedigree(source, project, add_log_entry=_nolog):
    """Fetch pedigree (dict of ``PedigreeMember``) from remote site ``source``."""
    r = requests.get(
        URL_TEMPLATE % {
            "url": source.url,
            "project_uuid": project.sodar_uuid,
            "secret": source.secret
        })

    # Mapping from sex in ISA-tab to sex in PLINK PED.
    map_sex = {"male": 1, "female": 2}
    # Mapping from disease state in ISA-tab to affected status in PLINK PED.
    map_affected = {"affected": 2, "unaffected": 1}

    # Parse investigation and all studies from ISA-tab (wrapped in JSON).
    remote_pedigree = {}
    isa_json = r.json()

    for s_path, s_data in isa_json["studies"].items():
        study = StudyReader.from_stream(s_path,
                                        io.StringIO(s_data["tsv"]),
                                        filename=s_path).read()
        add_log_entry(study)
        # Compress study arcs (map source to sample); easy because the study graph is only one process deep.
        arc_map = {arc.tail: arc for arc in study.arcs}
        for arc in list(arc_map.values()):  # NB: copy intentionally
            if arc.head in arc_map:
                arc_map[arc.tail] = arc_map[arc.head]
        add_log_entry(arc_map)
        # Actually parse out individuals.
        source_samples = {}
        for arc in study.arcs:
            if arc.tail in study.materials and study.materials[arc.tail].type == "Source Name":
                source_samples.setdefault(arc.tail, []).append(
                    study.materials[arc_map[arc.tail].head])
        for material in study.materials.values():
            if material.type == "Source Name":
                if len(source_samples[material.unique_name]) > 1:
                    add_log_entry(
                        "WARNING: more than one sample for source %s" %
                        material.name,
                        log_level=LOG_LEVEL_WARNING,
                    )
                fields = {c.name: c.value for c in material.characteristics}
                add_log_entry("fields = %s" % fields)
                member = PedigreeMember(
                    family=_isa_helper_get_field(fields, "Family"),
                    name=material.name,
                    father=_isa_helper_get_field(fields, "Father"),
                    mother=_isa_helper_get_field(fields, "Mother"),
                    sex=map_sex.get(_isa_helper_get_field(fields, "Sex"), 0),
                    affected=map_affected.get(
                        _isa_helper_get_field(fields, "Disease status"), 0),
                    sample_name=source_samples[material.unique_name][0].name,
                    hpo_terms=_isa_helper_get_term_field(fields, "HPO terms"),
                    orphanet_diseases=_isa_helper_get_term_field(
                        fields, "Orphanet disease"),
                    omim_diseases=_isa_helper_get_term_field(
                        fields, "OMIM disease"),
                )
                add_log_entry("new member: %s" % member)
                remote_pedigree[material.name] = member
    return remote_pedigree
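
A hedged invocation sketch: fetch_remote_pedigree only needs objects exposing the attributes used above, so duck-typed stand-ins suffice, though an actual run requires a reachable SODAR endpoint and URL_TEMPLATE, PedigreeMember, and the _isa_helper_* functions to be defined in the same module (all values below are placeholders):

from types import SimpleNamespace

source = SimpleNamespace(url="https://sodar.example.com", secret="s3cr3t")
project = SimpleNamespace(sodar_uuid="00000000-0000-0000-0000-000000000000")
pedigree = fetch_remote_pedigree(source, project, add_log_entry=print)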