def generate_s3paths_using_api(phsId, ngc_token, aws_region):
    db = SRAweb()
    print(f'Getting Metadata for phs {phsId}. Please be patient....')
    df = db.sra_metadata(phsId, detailed=True)
    print(f'Data Set for phs {phsId} received...')
    print(f'Retrieving files for each SRR...')
    generate_s3paths_common_function(df, 'api')

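# Usage sketch (not from the original source): the phs accession, token, and
# region below are placeholders, and generate_s3paths_common_function is
# assumed to be defined elsewhere in the same script.
#
#   generate_s3paths_using_api('phs000000', ngc_token='<NGC_TOKEN>', aws_region='us-east-1')
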
import os

from pysradb.sraweb import SRAweb


class Download_fq_from_sra:
    def __init__(self, sra_id):
        self.db = SRAweb()
        self.sra_id = sra_id

    def get_metadata(self):
        # Fetch the detailed run-level metadata table for this SRA project.
        df = self.db.sra_metadata(self.sra_id, detailed=True)
        return df

    def download_fq_file(self):
        print(os.getcwd())
        os.system('mkdir {}'.format(self.sra_id))
        metadata = self.get_metadata()
        print(metadata)
        os.chdir(self.sra_id)
        # Dump a FASTQ file for every run accession in the project.
        for run_acc in metadata.loc[:, "run_accession"]:
            print(run_acc)
            return_value = os.system(
                "fasterq-dump {} -p -t $HOME/temp_files".format(str(run_acc)))
            print(return_value)
        # self.build_collections(metadata)

    def build_collections(self, df):
        # Group the downloaded FASTQ files into one directory per source_name.
        all_types = set(list(df.loc[:, "source_name"]))
        for collec in all_types:
            os.system("mkdir {}_{}".format(collec, self.sra_id))
            for i, curr_collec in enumerate(df.loc[:, "source_name"]):
                if collec == curr_collec:
                    curr_run = str(df.loc[i, "run_accession"])
                    os.system('mv {}* {}_{}/'.format(curr_run, collec, self.sra_id))

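# Usage sketch (not from the original source): the SRP accession below is a
# placeholder, and fasterq-dump from sra-tools must be on PATH.
#
#   downloader = Download_fq_from_sra('SRP000000')
#   downloader.download_fq_file()
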
def sraweb_connection():
    db = SRAweb()
    time.sleep(2)
    return db

                    help='The .tsv file path that will store the metadata for the given SRA Project ID.')
args = parser.parse_args()

#
# Get the metadata
#
if args.sra_db is not None:
    db = SRAdb(args.sra_db.name)
    print(f"Using local SRA SQLite database to query...")
else:
    print(f"Using NCBI's esearch and esummary interface to query...")
    db = SRAweb()

metadata = db.sra_metadata(args.sra_project_id,
                           detailed=True,
                           expand_sample_attributes=True,
                           sample_attribute=True)

# Drop any None columns.
# pysradb does not pin its dependency versions;
# pandas 0.25.3 generates an additional None column compared to pandas 0.25.0.
# Bug in 0.25.3?
metadata = metadata[metadata.columns.dropna()]
metadata = pd.concat([
    metadata,
    metadata["experiment_title"].str.extract(
        r'^(.*): (.*); (.*); (.*)$', expand=True).rename(
            columns={

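# A hypothetical sketch of the parser setup this fragment implies (not from
# the original source): args.sra_db is read as an open file (its .name
# attribute is used above) and args.sra_project_id holds the accession; the
# actual argument definitions may differ.
#
#   import argparse
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('sra_project_id',
#                       help='The SRA Project ID (e.g. an SRP accession).')
#   parser.add_argument('--sra-db', type=argparse.FileType('r'),
#                       help='Optional path to a local SRAmetadb.sqlite file.')
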
def sraweb_connection():
    db = SRAweb()
    return db

from pysradb.sraweb import SRAweb

db = SRAweb()
df = (db.sra_metadata(snakemake.config["srp_id"], detailed=True)
      .filter(["run_accession", "sra_url_alt3"])
      .rename(columns={
          "run_accession": "srr_id",
          "sra_url_alt3": "url"
      }))
df.to_csv(snakemake.output[0], index=False)

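# A hypothetical Snakefile rule that could drive the script above (not from
# the original source: the rule name, script path, and output file are
# assumptions; srp_id would be set in the workflow config):
#
#   rule fetch_srr_urls:
#       output:
#           "results/srr_urls.csv"
#       script:
#           "scripts/fetch_srr_urls.py"
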
def __init__(self, sra_id):
    self.db = SRAweb()
    self.sra_id = sra_id

def get_srp_table(srp, assembly, re_ribo_analysis_dir):
    # sradb = SRAdb("/data2/SRAmetadb.sqlite")
    sradb = SRAweb()
    column_order = [
        "study_accession",
        "experiment_title",
        "experiment_accession",
        "run_accession",
        # "taxon_id",
        "library_selection",
        "library_layout",
        "library_strategy",
        "library_source",
        # "library_name",
        # "adapter_spec",
        # "bases",
        # "spots",
        # "avg_read_length",
        "pass1_adapter",
        "pass1_total_reads_processed",
        "pass1_reads_with_adapters",
        "pass2_adapter",
        "pass2_total_reads_processed",
        "pass2_reads_with_adapters",
        "mapping_total_reads_input",
        "uniquely_mapped",
        "uniquely_mapped_percent",
        "ribotricer_orfs",
    ]
    filepath = os.path.join(re_ribo_analysis_dir, assembly, srp)
    if os.path.exists(filepath):
        try:
            srp_df = sradb.sra_metadata(
                srp.split("_")[0],
                detailed=True)  # , expand_sample_attributes=True)
        except Exception:
            if "Kadosh" in filepath and "Kadosh_30C_37C" not in filepath:
                srp_df = pd.read_csv(
                    "/data2/Kadosh_design_files/{}.tsv".format(srp), sep="\t")
            else:
                srp_df = create_df_from_dir(filepath)
                # return pd.DataFrame()
        if "library_layout" in srp_df.columns:
            srp_df.library_layout = srp_df.library_layout.fillna("SINGLE")
        else:
            srp_df["library_layout"] = "SINGLE"
        srp_df = srp_df[srp_df.library_layout.str.contains("SINGLE")]
        # Placeholder columns, filled in below from trimming and mapping logs.
        srp_df["pass1_reads_with_adapters"] = None
        srp_df["pass1_total_reads_processed"] = None
        srp_df["pass1_adapter"] = None
        srp_df["pass2_adapter"] = None
        srp_df["pass2_total_reads_processed"] = None
        srp_df["pass2_reads_with_adapters"] = None
        srp_df["mapping_total_reads_input"] = None
        srp_df["uniquely_mapped"] = None
        srp_df["uniquely_mapped_percent"] = None
        srp_df["ribotricer_orfs"] = None
        srp_df["ribotricer_metagene_5p"] = None
        srp_df["ribotricer_metagene_3p"] = None
        srp_df["ribotricer_metagene_plot"] = None
        srp_df["ribotricer_protocol"] = None
        srp_df["ribotricer_bam_summary"] = None
        # srp_df["summarized_orfs"] = None
        # srp_df["summarized_phase_scores"] = None
        srpdir = os.path.join(re_ribo_analysis_dir, assembly, srp)
        starlogsdir = os.path.join(srpdir, "starlogs")
        srp_srx_grouped = srp_df.groupby("experiment_accession")
        preprocess_step1_dir = os.path.join(srpdir, "preprocessed_step1")
        preprocess_step2_dir = os.path.join(srpdir, "preprocessed")
        for srx, srx_group in srp_srx_grouped:
            ribotricer_output = check_ribotricer_output_exists(
                srp, srx, assembly)
            (
                ribotricer_metagene_5p,
                ribotricer_metagene_3p,
            ) = check_ribotricer_metagene_exists(srp, srx, assembly)
            ribotricer_bam_summary = check_ribotricer_bam_summary_exists(
                srp, srx, assembly)
            ribotricer_protocol = check_ribotricer_protocol_exists(
                srp, srx, assembly)
            ribotricer_metagene_plot = check_ribotricer_metagene_plot_exists(
                srp, srx, assembly)
            # summarized_orfs = check_summarized_orfs_exists(srp, assembly)
            # summarized_phase_score = check_summarized_orfs_exists(srp, assembly)
            srrs = srx_group["run_accession"].tolist()
            if ribotricer_output:
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_orfs"] = ribotricer_output
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_metagene_5p"] = ribotricer_metagene_5p
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_metagene_3p"] = ribotricer_metagene_3p
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_bam_summary"] = ribotricer_bam_summary
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_protocol"] = ribotricer_protocol
                srp_df.loc[srp_df.experiment_accession == srx,
                           "ribotricer_metagene_plot"] = ribotricer_metagene_plot
            # srp_df.loc[srp_df.experiment_accession == srx,
            #            "summarized_orfs"] = summarized_orfs
            # srp_df.loc[srp_df.experiment_accession == srx,
            #            "summarized_phase_scores"] = summarized_phase_score
            # starlogs_df = summary_starlogs_over_runs(starlogsdir, srrs)
            for srr in srrs:
                starlogs_df = None
                if os.path.isfile(
                        os.path.join(starlogsdir, srr + "Log.final.out")):
                    starlogs_df = parse_star_logs(
                        os.path.join(starlogsdir, srr + "Log.final.out"))
                # Preprocessed_step1 adapter info
                step1_txt = os.path.join(preprocess_step1_dir,
                                         srr + ".fastq.gz_trimming_report.txt")
                step2_txt = os.path.join(
                    preprocess_step2_dir,
                    srr + "_trimmed.fq.gz_trimming_report.txt")
                step1_cutadapt_json = None
                step2_cutadapt_json = None
                if os.path.isfile(step1_txt):
                    step1_cutadapt_json = cutadapt_to_json(step1_txt)
                if os.path.isfile(step2_txt):
                    step2_cutadapt_json = cutadapt_to_json(step2_txt)
                if step1_cutadapt_json:
                    adapters = step1_cutadapt_json["adapters"]
                    if len(step1_cutadapt_json["adapters"]) == 0:
                        srp_df.loc[srp_df.run_accession == srr,
                                   "pass1_adapter"] = "Empty?"
                    elif isinstance(adapters, str):
                        srp_df.loc[
                            srp_df.run_accession == srr,
                            "pass1_adapter"] = step1_cutadapt_json["adapters"]
                    else:
                        srp_df.loc[
                            srp_df.run_accession == srr,
                            "pass1_adapter"] = step1_cutadapt_json["adapters"][
                                "{} - {}".format(srr, "Adapter 1")]
                    trim_info1 = step1_cutadapt_json["trim_info"][srr]
                    srp_df.loc[srp_df.run_accession == srr,
                               "pass1_total_reads_processed"] = trim_info1[
                                   "r_processed"]
                    srp_df.loc[srp_df.run_accession == srr,
                               "pass1_reads_with_adapters"] = trim_info1[
                                   "r_with_adapters"]
                if step2_cutadapt_json:
                    adapters = step2_cutadapt_json["adapters"]
                    if len(step2_cutadapt_json["adapters"]) == 0:
                        srp_df.loc[srp_df.run_accession == srr,
                                   "pass2_adapter"] = "Empty?"
                    elif isinstance(adapters, str):
                        srp_df.loc[
                            srp_df.run_accession == srr,
                            "pass2_adapter"] = step2_cutadapt_json["adapters"]
                    else:
                        srp_df.loc[
                            srp_df.run_accession == srr,
                            "pass2_adapter"] = step2_cutadapt_json["adapters"][
                                "{} - {}".format(srr + "_trimmed", "Adapter 1")]
                    trim_info2 = step2_cutadapt_json["trim_info"][
                        srr + "_trimmed"]
                    srp_df.loc[srp_df.run_accession == srr,
                               "pass2_reads_with_adapters"] = trim_info2[
                                   "r_with_adapters"]
                    srp_df.loc[srp_df.run_accession == srr,
                               "pass2_total_reads_processed"] = trim_info2[
                                   "r_processed"]
                if starlogs_df:
                    srp_df.loc[srp_df.run_accession == srr,
                               "mapping_total_reads_input"] = starlogs_df[
                                   "total_reads"]
                    srp_df.loc[srp_df.run_accession == srr,
                               "uniquely_mapped"] = starlogs_df[
                                   "uniquely_mapped"]
                    srp_df.loc[srp_df.run_accession == srr,
                               "uniquely_mapped_percent"] = starlogs_df[
                                   "uniquely_mapped_percent"]
        cols = [
            "bases",
            "spots",
            "pass1_reads_with_adapters",
            "pass2_reads_with_adapters",
            "pass2_total_reads_processed",
            "pass1_total_reads_processed",
            "uniquely_mapped",
            "mapping_total_reads_input",
        ]
        # Humanize large counts where the column exists and is numeric.
        for col in cols:
            try:
                srp_df[col] = srp_df[col].apply(lambda z: millify(z))
            except Exception:
                pass
        sradb.close()
        return order_dataframe(srp_df, column_order)

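# Usage sketch (not from the original source): the accession, assembly, and
# analysis directory below are placeholders; get_srp_table expects the
# re-ribo-analysis directory layout referenced above.
#
#   table = get_srp_table('SRP000000', 'hg38', '/path/to/re-ribo-analysis')
#   table.to_csv('SRP000000_summary.tsv', sep='\t', index=False)
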
    root_dir, assembly_build, srp)


def get_fragment_lengths(file_path):
    try:
        return pd.read_csv(file_path, sep="\t").fragment_length.tolist()
    except Exception:
        # Handle header-less, three-column files.
        df = pd.read_csv(file_path, header=None, sep="\t")
        df.columns = ["fragment_length", "offset_5p", "profile"]
        return df.fragment_length.tolist()


# In[21]:

db = SRAweb()  # SRAdb("/data2/SRAmetadb.sqlite")
all_projects = []
for species, sample_list in __ASSEMBLY_WISE_SRP__.items():
    mkdir_p("/data2/re-ribo-analysis-metadata/{}".format(species))
    for srp in sample_list:
        basedir = os.path.dirname(
            os.path.dirname(__SRP_TO_ROOT_DIR_MAP__[srp][species]))
        if not os.listdir(__SRP_TO_ROOT_DIR_MAP__[srp][species]):
            continue
        print(srp, basedir)
        df = get_srp_table(srp, species, basedir)
        project_filepath = "{}/{}/{}".format(basedir, species, srp)
        metadata_filepath = "/data2/re-ribo-analysis-metadata/{}/{}.tsv".format(
            species, srp)
        df_subset = df[df.ribotricer_metagene_5p == df.