def to_downloadable(queries):
    """
    Convert a list of accession queries into accessions that can be looked up directly.

    Every query that starts with 'GSE' is replaced by the study accession(s)
    that ``pysradb.SRAweb().gse_to_srp`` returns for it; every other query is
    passed through unchanged. The order of the input is preserved.

    Args:
        queries: iterable of accession strings.

    Returns:
        A new list of accession strings; ``queries`` itself is not modified.
    """
    out_queries = []
    # the SRA client is created lazily and only once, instead of once per GSE query
    db = None
    for q in queries:
        if q.startswith('GSE'):
            if db is None:
                db = pysradb.SRAweb()
            out_queries += list(db.gse_to_srp(q).study_accession)
        else:
            out_queries.append(q)
    return out_queries
def samples2metadata_sra(samples: List[str], logger) -> dict:
    """
    Get the required info to continue a seq2science run from a list of public samples.

    For every sample:
      - find its corresponding SRX number (GSM numbers are converted first) and all runs that belong to it,
      - check if all runs have the same layout, if not, crash,
      - check that the sample is not from an unsupported sequencing platform,
      - see if we can download the runs from ena.

    Args:
        samples: sample names (GSM numbers or SRA compliant accessions).
        logger: object with an ``error`` method, used to report SRA query failures.

    Returns:
        dict(
            "GSM1234": {"layout": "PAIRED",
                        "runs": ["SRR1234", "SRR4321"],
                        "ena_fastq_ftp": {...}},
            "SRR5678": {"layout": "SINGLE",
                        "runs": ["SRR5678"],
                        "ena_fastq_ftp": None},
            ...
        )
        An empty ``samples`` list gives an empty dict without querying the SRA.

    Raises:
        AssertionError: if a sample has no runs, has mixed or unclear layouts, or is
            from a not-supported sequencing platform.

    If the SRA can not be queried, the error is logged and the process is terminated
    with ``os._exit(1)``.
    """
    # start with empty dictionary which we fill out later
    sampledict = {sample: dict() for sample in samples}

    # nothing to look up, so do not bother the SRA at all
    if not samples:
        return sampledict

    sra_error = (
        "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
        "are overloaded or slow. Please try again in a bit...\n"
        "Another possible option is that you try to access samples that do not exist or are protected, and "
        "seq2science does not support downloading those..\n\n"
    )

    db_sra = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name
    if geo_samples:
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except Exception:
            # only real query failures end up here; ctrl-c and SystemExit are not swallowed
            logger.error(sra_error)
            os._exit(1)  # noqa
        sample2clean = dict(zip(df_geo.experiment_alias, df_geo.experiment_accession))
    else:
        sample2clean = dict()

    # now add the already SRA compliant names with a reference to itself
    sample2clean.update({sample: sample for sample in samples if not sample.startswith("GSM")})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()), detailed=True)
    except Exception:
        logger.error(sra_error)
        os._exit(1)  # noqa

    # keep track of not-supported samples
    not_supported_formats = ["ABI_SOLID"]
    not_supported_samples = []

    for sample, clean in sample2clean.items():
        # table rows that belong to this sample
        idxs = _sample_to_idxs(df_sra, clean)
        sample_rows = df_sra.loc[idxs]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1, f"no runs were found for sample {sample}"
        sampledict[sample]["runs"] = runs

        # check if sample is from a supported format; report every sample only once,
        # no matter how many of its runs are from a not-supported platform
        if any(model in not_supported_formats for model in sample_rows.instrument_model_desc.tolist()):
            not_supported_samples.append(sample)

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url
        ena_urls = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                ena_urls[run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                ena_urls[run] = run_rows.ena_fastq_ftp_1.tolist() + run_rows.ena_fastq_ftp_2.tolist()

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any(any(pd.isna(urls)) for urls in ena_urls.values()):
            ena_urls = None
        sampledict[sample]["ena_fastq_ftp"] = ena_urls

    # now report single message for all sample(s) that are from a sequencing platform that is not supported
    assert len(not_supported_samples) == 0, (
        f'Sample(s) {", ".join(not_supported_samples)} are not supported by seq2science. Samples that are one of '
        f'these formats; [{", ".join(not_supported_formats)}] are not supported.'
    )

    return sampledict
def samples2metadata(samples: List[str], config: dict) -> dict:
    """
    Get the required info to continue a seq2science run from a list of samples.

    - If a sample already exists locally, we only want to know if it is paired-end or single-end.
    - If a sample does not exist locally
      - find its corresponding SRX number and all runs that belong to it,
      - check if they all have the same layout, if not, crash
      - see if we can download the runs from ena

    Args:
        samples: sample names; either names of local fastq files or public accessions
            (starting with GSM, SRX, SRR, ERR or DRR).
        config: configuration dict; it is passed to ``expand`` to build the local fastq
            paths, and ``fastq_dir``, ``fqsuffix``, ``fqext1`` and ``fqext2`` are read for
            the error message.

    Returns:
        dict(
            "GSM1234": {"layout": "PAIRED",
                        "runs": ["SRR1234", "SRR4321"],
                        "ena_fastq_http": {"SRR1234": [...], "SRR4321": [...]},
                        "ena_fastq_ftp": {"SRR1234": [...], "SRR4321": [...]}},
            "SRR5678": {"layout": "SINGLE",
                        "runs": ["SRR5678"],
                        "ena_fastq_http": None,
                        "ena_fastq_ftp": {"SRR5678": [...]}},
            "local_sample": {"layout": "SINGLE"},
            ...
        )

    Raises:
        ValueError: if a sample is not found locally and does not look like a public accession.
        AssertionError: if a public sample has no runs, or has mixed or unclear layouts.
    """
    # start with empty dictionary which we fill out later
    sampledict = {sample: dict() for sample in samples}

    # fill out the sampledict for the local samples, and store the public samples for later
    public_samples = []
    for sample in samples:
        if os.path.exists(expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz', **config)[0]):
            sampledict[sample]["layout"] = "SINGLE"
        elif all(os.path.exists(path) for path in expand(f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz', **config)):
            sampledict[sample]["layout"] = "PAIRED"
        elif sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            public_samples.append(sample)
        else:
            raise ValueError(
                f"\nsample {sample} was not found..\n"
                f"We checked for SE file:\n"
                f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
                f"and for PE files:\n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
                f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")

    if len(public_samples) == 0:
        return sampledict

    # only continue with public samples
    db = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in public_samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name;
    # GEO is only queried when there actually is a GSM number to convert
    sample2clean = dict()
    if geo_samples:
        df_geo = db.gsm_to_srx(geo_samples)
        sample2clean.update(zip(df_geo.experiment_alias, df_geo.experiment_accession))

    # now add the already SRA compliant names with a reference to itself
    sample2clean.update({sample: sample for sample in public_samples if sample not in geo_samples})

    # check our samples on sra
    df = db.sra_metadata(list(sample2clean.values()), detailed=True)

    for sample, clean in sample2clean.items():
        # table rows that belong to this sample
        idxs = _sample_to_idxs(df, clean)
        sample_rows = df.loc[idxs]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        sampledict[sample]["runs"] = runs

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url
        sampledict[sample]["ena_fastq_http"] = dict()
        sampledict[sample]["ena_fastq_ftp"] = dict()
        for run in runs:
            run_rows = df[df.run_accession == run]
            if layout[0] == "SINGLE":
                sampledict[sample]["ena_fastq_http"][run] = run_rows.ena_fastq_http.tolist()
                sampledict[sample]["ena_fastq_ftp"][run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                sampledict[sample]["ena_fastq_http"][run] = (
                    run_rows.ena_fastq_http_1.tolist() + run_rows.ena_fastq_http_2.tolist()
                )
                sampledict[sample]["ena_fastq_ftp"][run] = (
                    run_rows.ena_fastq_ftp_1.tolist() + run_rows.ena_fastq_ftp_2.tolist()
                )

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any("N/A" in urls for urls in sampledict[sample]["ena_fastq_http"].values()):
            sampledict[sample]["ena_fastq_http"] = None
        if any("N/A" in urls for urls in sampledict[sample]["ena_fastq_ftp"].values()):
            sampledict[sample]["ena_fastq_ftp"] = None

    return sampledict
]  # closes a list literal opened before this excerpt (presumably DEFAULT_LIBRARY_EVAL, which is used below)

# Default expression(s) used to fill the `d_group` column when `args.group_eval` is empty.
# Each string is passed to eval() with the metadata table available under the name `srr`.
# NOTE(review): `[\d+]` is a character class (a single digit or a literal '+'), not "one or more
# digits" -- confirm that this is intended. `\d` inside a non-raw string literal also triggers an
# invalid-escape warning on recent Python versions.
DEFAULT_GROUP_EVAL = [
    'srr["d_lib"].str.replace("[_-](R|rep)_?[\d+]$", "", regex=True)',
]
# Default row filters: none.
DEFAULT_SRR_FILTER = []
# NOTE(review): the name suggests a tab character, but the literal shown here is a single space -- confirm.
TAB_CHAR = ' '

# `parse_args` is not defined in this excerpt.
args = parse_args(sys.argv[1:])

# If the first accession is the path of an existing file, read it as a tab-separated metadata
# table; otherwise query the SRA for every accession (after to_downloadable() has converted
# GSE queries) and stack the resulting tables.
if pathlib.Path(args.accessions[0]).is_file():
    srr = pd.read_csv(args.accessions[0], sep='\t').convert_dtypes()
else:
    db = pysradb.SRAweb()
    queries = to_downloadable(args.accessions)
    srr = pd.concat([db.sra_metadata(q, detailed=True) for q in queries])

# NOTE(review): the three loops below eval() strings that can come from `args`; that executes
# arbitrary code, so these arguments must only ever come from a trusted user.

# Every expression overwrites `d_lib`; a later expression sees the column written by the previous one.
for library_eval in (args.library_eval if args.library_eval else DEFAULT_LIBRARY_EVAL):
    srr['d_lib'] = eval(library_eval, {'srr': srr})

# Same for `d_group`.
for group_eval in (args.group_eval if args.group_eval else DEFAULT_GROUP_EVAL):
    srr['d_group'] = eval(group_eval, {'srr': srr})

# Every filter expression is used to index `srr`; filters are applied one after the other.
for srr_filter in (args.filter_srr if args.filter_srr else DEFAULT_SRR_FILTER):
    srr = srr[eval(srr_filter, {'srr': srr})]

srr = srr.sort_values(['d_lib', 'run_accession'])
def samples2metadata_sra(samples: List[str]) -> dict:
    """
    Get the required info to continue a seq2science run from a list of public samples.

    For every sample:
      - find its corresponding SRX number (GSM numbers are converted first) and all runs that belong to it,
      - check if all runs have the same layout, if not, crash,
      - see if we can download the runs from ena.

    Args:
        samples: sample names (GSM numbers or SRA compliant accessions).

    Returns:
        dict(
            "GSM1234": {"layout": "PAIRED",
                        "runs": ["SRR1234", "SRR4321"],
                        "ena_fastq_ftp": {...}},
            "SRR5678": {"layout": "SINGLE",
                        "runs": ["SRR5678"],
                        "ena_fastq_ftp": None},
            ...
        )
        An empty ``samples`` list gives an empty dict without querying the SRA.

    Raises:
        AssertionError: if a sample has no runs, or has mixed or unclear layouts.
        SystemExit: (exit code 1) if the SRA can not be queried; a message is printed first.
    """
    # start with empty dictionary which we fill out later
    sampledict = {sample: dict() for sample in samples}

    # nothing to look up, so do not bother the SRA at all
    if not samples:
        return sampledict

    sra_error = (
        "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
        "are overloaded or slow. Please try again in a bit..."
    )

    db_sra = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name;
    # GEO is only queried when there actually is a GSM number to convert
    sample2clean = dict()
    if geo_samples:
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except Exception:
            # only real query failures end up here; ctrl-c and SystemExit are not swallowed
            print(sra_error)
            sys.exit(1)
        sample2clean.update(zip(df_geo.experiment_alias, df_geo.experiment_accession))

    # now add the already SRA compliant names with a reference to itself
    sample2clean.update({sample: sample for sample in samples if not sample.startswith("GSM")})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()), detailed=True)
    except Exception:
        print(sra_error)
        sys.exit(1)

    for sample, clean in sample2clean.items():
        # table rows that belong to this sample
        idxs = _sample_to_idxs(df_sra, clean)
        sample_rows = df_sra.loc[idxs]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        sampledict[sample]["runs"] = runs

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"], f"sample {sample} is an unclear layout, bad!"
        sampledict[sample]["layout"] = layout[0]

        # get the ena url
        sampledict[sample]["ena_fastq_ftp"] = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                sampledict[sample]["ena_fastq_ftp"][run] = run_rows.ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                sampledict[sample]["ena_fastq_ftp"][run] = (
                    run_rows.ena_fastq_ftp_1.tolist() + run_rows.ena_fastq_ftp_2.tolist()
                )

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any("N/A" in urls for urls in sampledict[sample]["ena_fastq_ftp"].values()):
            sampledict[sample]["ena_fastq_ftp"] = None

    return sampledict