Beispiel #1
0
def to_downloadable(queries):
    """Expand GEO series accessions into SRA study accessions.

    Queries that start with ``GSE`` are looked up with
    ``pysradb.SRAweb().gse_to_srp`` and replaced by every entry of the
    ``study_accession`` column of the result. All other queries are passed
    through unchanged. The relative order of the queries is preserved.

    Args:
        queries: iterable of accession strings.

    Returns:
        A new list with the expanded / passed-through accessions.
    """
    resolved = []
    for query in queries:
        # anything that is not a GEO series is kept as-is
        if not query.startswith('GSE'):
            resolved.append(query)
            continue

        lookup = pysradb.SRAweb().gse_to_srp(query)
        resolved.extend(list(lookup.study_accession))
    return resolved
Beispiel #2
0
def samples2metadata_sra(samples: List[str], logger) -> dict:
    """
    Collect the SRA metadata that is required to continue a seq2science run.

    For the given (public) samples:
      - GEO samples (names starting with GSM) are first converted to their SRA
        experiment accession (SRX number),
      - all runs that belong to each sample are looked up,
      - all runs of a sample must share one layout (PAIRED or SINGLE), if not, crash,
      - the ENA fastq ftp urls of every run are collected; if any url of a sample is
        missing, the sample as a whole is treated as not available on ENA.

    Args:
        samples: names of the samples to look up.
        logger: object with an ``error`` method, used to report a failed SRA query.

    Returns:
        dict(
            "GSM1234": {"runs": ["SRR1234", "SRR4321"],
                        "layout": "PAIRED",
                        "ena_fastq_ftp": {"SRR1234": [...], "SRR4321": [...]}},

            "SRR5678": {"runs": ["SRR5678"],
                        "layout": "SINGLE",
                        "ena_fastq_ftp": None},
            ...
        )

    Raises:
        AssertionError: when a sample has no runs, has mixed or unknown layouts, or
            when one or more samples come from a sequencing platform that is not
            supported.

    Note:
        When querying the SRA fails, the problem is logged and the whole process is
        terminated with ``os._exit(1)``.
    """
    # start with an empty dictionary per sample which we fill out later
    sampledict = {sample: dict() for sample in samples}

    db_sra = pysradb.SRAweb()

    sra_error = (
        "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
        "are overloaded or slow. Please try again in a bit...\n"
        "Another possible option is that you try to access samples that do not exist or are protected, and "
        "seq2science does not support downloading those..\n\n")

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name
    sample2clean = dict()
    if geo_samples:
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except Exception:
            # `except Exception` (not a bare except) so that ctrl-c / SystemExit are
            # not misreported as an SRA outage
            logger.error(sra_error)
            os._exit(1)  # noqa

        sample2clean = dict(
            zip(df_geo.experiment_alias, df_geo.experiment_accession))

    # now add the already SRA compliant names with a reference to itself
    geo_lookup = set(geo_samples)
    sample2clean.update(
        {sample: sample
         for sample in samples if sample not in geo_lookup})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()),
                                     detailed=True)
    except Exception:
        logger.error(sra_error)
        os._exit(1)  # noqa

    # keep track of not-supported samples
    not_supported_formats = ["ABI_SOLID"]
    not_supported_samples = []

    for sample, clean in sample2clean.items():
        # the rows of the metadata table selected for this sample
        sample_rows = df_sra.loc[_sample_to_idxs(df_sra, clean)]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        sampledict[sample]["runs"] = runs

        # check if sample is from a supported format; register the sample only once,
        # even when several of its runs are unsupported, so the final message does not
        # repeat the sample name
        models = sample_rows.instrument_model_desc.tolist()
        if any(model in not_supported_formats for model in models):
            not_supported_samples.append(sample)

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(
            layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"
                             ], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url(s) per run
        ena_urls = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                ena_urls[run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                ena_urls[run] = (run_rows.ena_fastq_ftp_1.tolist() +
                                 run_rows.ena_fastq_ftp_2.tolist())

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any(any(pd.isna(urls)) for urls in ena_urls.values()):
            ena_urls = None
        sampledict[sample]["ena_fastq_ftp"] = ena_urls

    # now report single message for all sample(s) that are from a sequencing platform that is not supported
    assert len(not_supported_samples) == 0, (
        f'Sample(s) {", ".join(not_supported_samples)} are not supported by seq2science. Samples that are one of '
        f'these formats; [{", ".join(not_supported_formats)}] are not supported.'
    )

    return sampledict
Beispiel #3
0
def samples2metadata(samples: List[str], config: dict) -> dict:
    """
    Get the required info to continue a seq2science run from a list of samples.

    - If a sample already exists locally, we only want to know if it is paired-end or single-end.
    - If a sample does not exist locally
      - find its corresponding SRX number and all runs that belong to it,
      - check if they all have the same layout, if not, crash
      - see if we can download the runs from ena

    Args:
        samples: names of the samples.
        config: configuration that is passed as keyword arguments to ``expand`` to build
            the local fastq paths; ``fastq_dir``, ``fqsuffix``, ``fqext1`` and ``fqext2``
            are read for the error message when a sample can not be found.

    Returns:
        dict(
            "local_sample": {"layout": "SINGLE"},

            "GSM1234": {"runs": ["SRR1234", "SRR4321"],
                        "layout": "PAIRED",
                        "ena_fastq_http": {"SRR1234": [...], "SRR4321": [...]},
                        "ena_fastq_ftp": {"SRR1234": [...], "SRR4321": [...]}},

            "SRR5678": {"runs": ["SRR5678"],
                        "layout": "SINGLE",
                        "ena_fastq_http": None,
                        "ena_fastq_ftp": {"SRR5678": [...]}},
            ...
        )
        ``ena_fastq_http`` / ``ena_fastq_ftp`` are set to None as a whole when any url of
        any run of that sample is reported as "N/A".

    Raises:
        ValueError: when a sample is not found locally and its name does not start with
            GSM, SRX, SRR, ERR or DRR.
        AssertionError: when a public sample has no runs, or has mixed or unknown layouts.
    """
    # start with empty dictionary which we fill out later
    sampledict = {sample: dict() for sample in samples}

    # fill out the sampledict for the local samples, and store the public samples for later
    public_samples = []
    for sample in samples:
        if os.path.exists(
                expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz',
                       **config)[0]):
            sampledict[sample]["layout"] = "SINGLE"
        elif all(
                os.path.exists(path) for path in expand(
                    f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz', **
                    config)):
            sampledict[sample]["layout"] = "PAIRED"
        elif sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            public_samples.append(sample)
        else:
            raise ValueError(
                f"\nsample {sample} was not found..\n"
                f"We checked for SE file:\n"
                f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
                f"and for PE files:\n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
                f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")

    if not public_samples:
        return sampledict

    # only continue with public samples
    db = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [
        sample for sample in public_samples if sample.startswith("GSM")
    ]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name;
    # only query GEO when there actually is something to convert
    if geo_samples:
        df_geo = db.gsm_to_srx(geo_samples)
        sample2clean = dict(
            zip(df_geo.experiment_alias, df_geo.experiment_accession))
    else:
        sample2clean = dict()

    # now add the already SRA compliant names with a reference to itself
    geo_lookup = set(geo_samples)
    sample2clean.update({
        sample: sample
        for sample in public_samples if sample not in geo_lookup
    })

    # check our samples on sra
    df_sra = db.sra_metadata(list(sample2clean.values()), detailed=True)

    for sample, clean in sample2clean.items():
        # the rows of the metadata table selected for this sample
        sample_rows = df_sra.loc[_sample_to_idxs(df_sra, clean)]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        sampledict[sample]["runs"] = runs

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(
            layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"
                             ], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url(s) per run
        http_urls = dict()
        ftp_urls = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                http_urls[run] = run_rows.ena_fastq_http.tolist()
                ftp_urls[run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                http_urls[run] = (run_rows.ena_fastq_http_1.tolist() +
                                  run_rows.ena_fastq_http_2.tolist())
                ftp_urls[run] = (run_rows.ena_fastq_ftp_1.tolist() +
                                 run_rows.ena_fastq_ftp_2.tolist())

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any("N/A" in urls for urls in http_urls.values()):
            http_urls = None
        if any("N/A" in urls for urls in ftp_urls.values()):
            ftp_urls = None
        sampledict[sample]["ena_fastq_http"] = http_urls
        sampledict[sample]["ena_fastq_ftp"] = ftp_urls

    return sampledict
Beispiel #4
0
]

DEFAULT_GROUP_EVAL = [
    # Both the outer and the inner literal are raw strings, so the regex escape
    # ``\d`` reaches the regex engine without relying on Python's deprecated
    # handling of unknown escape sequences (once at import, once inside eval()).
    # NOTE(review): ``[\d+]`` is a character class that matches ONE digit or a
    # literal "+"; ``\d+`` (one or more digits) may have been intended. Left
    # unchanged because it alters the default grouping - confirm.
    r'srr["d_lib"].str.replace(r"[_-](R|rep)_?[\d+]$", "", regex=True)',
]

DEFAULT_SRR_FILTER = []

TAB_CHAR = '    '

args = parse_args(sys.argv[1:])

# When the first accession is an existing file it is read as a tab-separated
# metadata table; otherwise all accessions are queried online.
if pathlib.Path(args.accessions[0]).is_file():
    srr = pd.read_csv(args.accessions[0], sep='\t').convert_dtypes()
else:
    db = pysradb.SRAweb()
    queries = to_downloadable(args.accessions)

    srr = pd.concat([db.sra_metadata(q, detailed=True) for q in queries])

# SECURITY NOTE: the expressions below are passed to eval(). They come from the
# command line (or from the defaults above), so they can execute arbitrary
# Python code; only run this script with expressions you trust.

# every library expression overwrites the 'd_lib' column, so expressions can
# build on the result of the previous one
for library_eval in (args.library_eval or DEFAULT_LIBRARY_EVAL):
    srr['d_lib'] = eval(library_eval, {'srr': srr})

# same for the 'd_group' column
for group_eval in (args.group_eval or DEFAULT_GROUP_EVAL):
    srr['d_group'] = eval(group_eval, {'srr': srr})

# every filter expression is used to index the table, keeping only the selected rows
for srr_filter in (args.filter_srr or DEFAULT_SRR_FILTER):
    srr = srr[eval(srr_filter, {'srr': srr})]

srr = srr.sort_values(['d_lib', 'run_accession'])
Beispiel #5
0
def samples2metadata_sra(samples: List[str]) -> dict:
    """
    Collect the SRA metadata that is required to continue a seq2science run.

    For the given (public) samples:
      - GEO samples (names starting with GSM) are first converted to their SRA
        experiment accession (SRX number),
      - all runs that belong to each sample are looked up,
      - all runs of a sample must share one layout (PAIRED or SINGLE), if not, crash,
      - the ENA fastq ftp urls of every run are collected; if any url of a sample is
        reported as "N/A", the sample as a whole is treated as not available on ENA.

    Args:
        samples: names of the samples to look up.

    Returns:
        dict(
            "GSM1234": {"runs": ["SRR1234", "SRR4321"],
                        "layout": "PAIRED",
                        "ena_fastq_ftp": {"SRR1234": [...], "SRR4321": [...]}},

            "SRR5678": {"runs": ["SRR5678"],
                        "layout": "SINGLE",
                        "ena_fastq_ftp": None},
            ...
        )

    Raises:
        AssertionError: when a sample has no runs, or has mixed or unknown layouts.
        SystemExit: (exit code 1) when querying the SRA fails; a message is printed first.
    """
    # start with empty dictionary which we fill out later
    sampledict = {sample: dict() for sample in samples}

    db_sra = pysradb.SRAweb()

    sra_error = (
        "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
        "are overloaded or slow. Please try again in a bit...")

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name;
    # only query GEO when there actually is something to convert
    sample2clean = dict()
    if geo_samples:
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except Exception:
            # `except Exception` (not a bare except) so that ctrl-c / SystemExit are
            # not misreported as an SRA outage
            print(sra_error)
            sys.exit(1)

        sample2clean = dict(zip(df_geo.experiment_alias, df_geo.experiment_accession))

    # now add the already SRA compliant names with a reference to itself
    geo_lookup = set(geo_samples)
    sample2clean.update({sample: sample for sample in samples if sample not in geo_lookup})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()), detailed=True)
    except Exception:
        print(sra_error)
        sys.exit(1)

    for sample, clean in sample2clean.items():
        # the rows of the metadata table selected for this sample
        sample_rows = df_sra.loc[_sample_to_idxs(df_sra, clean)]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        sampledict[sample]["runs"] = runs

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url(s) per run
        ena_urls = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                ena_urls[run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                ena_urls[run] = run_rows.ena_fastq_ftp_1.tolist() + run_rows.ena_fastq_ftp_2.tolist()

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any("N/A" in urls for urls in ena_urls.values()):
            ena_urls = None
        sampledict[sample]["ena_fastq_ftp"] = ena_urls

    return sampledict