Ejemplo n.º 1
0
def generate_s3paths_using_api(phsId, ngc_token, aws_region):
    """Fetch detailed SRA metadata for ``phsId`` and pass it to the shared path generator.

    Args:
        phsId: Accession forwarded unchanged to ``SRAweb.sra_metadata``.
        ngc_token: Part of the signature; not read by this function.
        aws_region: Part of the signature; not read by this function.

    The metadata table is handed to ``generate_s3paths_common_function``
    together with the literal mode string ``'api'``. Nothing is returned.
    """
    client = SRAweb()
    print('Getting Metadata for phs {}.Please be patient....'.format(phsId))
    metadata = client.sra_metadata(phsId, detailed=True)
    print('Data Set for phs {} received...'.format(phsId))
    print('Retrieving files for each SRR...')
    generate_s3paths_common_function(metadata, 'api')
Ejemplo n.º 2
0
class Download_fq_from_sra:
    """Download FASTQ files for every run listed under one SRA accession.

    The accession is looked up through ``SRAweb``; each ``run_accession`` in
    the returned metadata is fetched with the external ``fasterq-dump`` tool.
    """

    def __init__(self, sra_id):
        """Create the ``SRAweb`` client and remember the accession to download."""
        self.db = SRAweb()
        self.sra_id = sra_id

    def get_metadata(self):
        """Return the detailed metadata table for ``self.sra_id``."""
        return self.db.sra_metadata(self.sra_id, detailed=True)

    def download_fq_file(self):
        """Run ``fasterq-dump`` for every run accession of ``self.sra_id``.

        Side effects: creates a directory named after the accession inside
        the current working directory, changes into it (and does not change
        back), and prints progress plus the status returned by ``os.system``
        for each run.
        """
        import shlex

        print(os.getcwd())
        # exist_ok makes re-running a partially finished download harmless.
        os.makedirs(self.sra_id, exist_ok=True)
        metadata = self.get_metadata()
        print(metadata)
        os.chdir(self.sra_id)
        for run_acc in metadata.loc[:, "run_accession"]:
            print(run_acc)
            # The shell is still needed to expand $HOME, so quote the only
            # interpolated value instead of trusting the metadata content.
            command = "fasterq-dump {} -p -t $HOME/temp_files".format(
                shlex.quote(str(run_acc)))
            return_value = os.system(command)
            print(return_value)
        #self.build_collections(metadata)

    def build_collections(self, df):
        """Move each run's files into a ``<source_name>_<sra_id>`` directory.

        Args:
            df: Table with ``source_name`` and ``run_accession`` columns.

        Files are matched in the current working directory by the prefix
        ``<run_accession>*``. Rows are paired by position, so the table's
        index labels do not matter.
        """
        import glob
        import shutil

        rows = zip(df.loc[:, "source_name"], df.loc[:, "run_accession"])
        for source_name, run_acc in rows:
            target_dir = "{}_{}".format(source_name, self.sra_id)
            os.makedirs(target_dir, exist_ok=True)
            for path in glob.glob(glob.escape(str(run_acc)) + "*"):
                if path == target_dir:
                    # Never try to move a collection directory into itself.
                    continue
                # An explicit destination file name keeps `mv`-like
                # overwrite semantics for already-collected files.
                shutil.move(
                    path, os.path.join(target_dir, os.path.basename(path)))
Ejemplo n.º 3
0
def sraweb_connection():
    """Create an ``SRAweb`` client, pause for two seconds, and return it."""
    connection = SRAweb()
    # The pause happens after construction and before the client is handed back.
    time.sleep(2)
    return connection
Ejemplo n.º 4
0
    help=
    'The .tsv file path that will stored the metadata for the given SRA Project ID.'
)

args = parser.parse_args()

# ---------------------------------------------------------------------------
# Fetch the project metadata: from the local SQLite file when one was given
# on the command line, otherwise through the web interface.
# ---------------------------------------------------------------------------

if args.sra_db is None:
    print("Using NCBi's esearch and esummary interface to query...")
    db = SRAweb()
else:
    db = SRAdb(args.sra_db.name)
    print("Using local SRA SQLite database to query...")

metadata = db.sra_metadata(
    args.sra_project_id,
    detailed=True,
    expand_sample_attributes=True,
    sample_attribute=True,
)

# Keep only the columns whose label is not missing.
# pysradb does not pin its dependency versions, and pandas 0.25.3 was seen to
# produce an extra None-labelled column that pandas 0.25.0 did not
# (possibly a pandas 0.25.3 bug).
labelled_columns = metadata.columns.dropna()
metadata = metadata[labelled_columns]

metadata = pd.concat([
    metadata, metadata["experiment_title"].str.extract(
        r'^(.*): (.*); (.*); (.*)$', expand=True).rename(
            columns={
Ejemplo n.º 5
0
def sraweb_connection():
    """Return a newly constructed ``SRAweb`` client."""
    return SRAweb()
Ejemplo n.º 6
0
from pysradb.sraweb import SRAweb

# Query the detailed metadata of the SRA project named in the workflow config.
db = SRAweb()
run_table = db.sra_metadata(snakemake.config["srp_id"], detailed=True)

# Keep the run accession and the alternative download URL, then rename them
# to the column names written to the output file.
selected_columns = ["run_accession", "sra_url_alt3"]
output_names = {"run_accession": "srr_id", "sra_url_alt3": "url"}
df = run_table.filter(selected_columns).rename(columns=output_names)

df.to_csv(snakemake.output[0], index=False)
Ejemplo n.º 7
0
 def __init__(self, sra_id):
     """Remember ``sra_id`` and attach a fresh ``SRAweb`` instance as ``self.db``."""
     self.sra_id = sra_id
     self.db = SRAweb()
Ejemplo n.º 8
0
def _apply_cutadapt_report(srp_df, run_mask, report, sample_key, column_prefix):
    """Copy adapter and read counts from one parsed cutadapt report into ``srp_df``.

    Args:
        srp_df: Table updated in place.
        run_mask: Boolean row selector for the run the report belongs to.
        report: Parsed report exposing ``"adapters"`` and ``"trim_info"``.
        sample_key: Key under which the report stores this run's entries.
        column_prefix: ``"pass1"`` or ``"pass2"``; selects the target columns.
    """
    adapters = report["adapters"]
    adapter_column = column_prefix + "_adapter"
    if len(adapters) == 0:
        srp_df.loc[run_mask, adapter_column] = "Empty?"
    elif isinstance(adapters, str):
        srp_df.loc[run_mask, adapter_column] = adapters
    else:
        # Per-sample mapping: read counts are only recorded in this case.
        srp_df.loc[run_mask, adapter_column] = adapters[
            "{} - {}".format(sample_key, "Adapter 1")]
        trim_info = report["trim_info"][sample_key]
        srp_df.loc[run_mask, column_prefix +
                   "_total_reads_processed"] = trim_info["r_processed"]
        srp_df.loc[run_mask, column_prefix +
                   "_reads_with_adapters"] = trim_info["r_with_adapters"]


def get_srp_table(srp, assembly, re_ribo_analysis_dir):
    """Build the per-run summary table for one processed SRA project.

    Args:
        srp: Project directory name; the part before the first ``_`` is the
            accession queried through ``SRAweb``.
        assembly: Assembly directory name under ``re_ribo_analysis_dir``.
        re_ribo_analysis_dir: Root directory of the analysis outputs.

    Returns:
        The result of ``order_dataframe(srp_df, column_order)``, or ``None``
        when ``<re_ribo_analysis_dir>/<assembly>/<srp>`` does not exist.

    Only rows whose ``library_layout`` contains ``"SINGLE"`` are kept
    (a missing layout is treated as ``"SINGLE"``). If the metadata query
    fails, the table is loaded from a Kadosh design file or rebuilt from the
    project directory instead.
    """
    column_order = [
        "study_accession",
        "experiment_title",
        "experiment_accession",
        "run_accession",
        #"taxon_id",
        "library_selection",
        "library_layout",
        "library_strategy",
        "library_source",
        #"library_name",
        #"adapter_spec",
        #"bases",
        #"spots",
        #"avg_read_length",
        "pass1_adapter",
        "pass1_total_reads_processed",
        "pass1_reads_with_adapters",
        "pass2_adapter",
        "pass2_total_reads_processed",
        "pass2_reads_with_adapters",
        "mapping_total_reads_input",
        "uniquely_mapped",
        "uniquely_mapped_percent",
        "ribotricer_orfs",
    ]
    filepath = os.path.join(re_ribo_analysis_dir, assembly, srp)
    if not os.path.exists(filepath):
        # Nothing was processed for this project; callers receive None.
        return None

    sradb = SRAweb()
    try:
        srp_df = sradb.sra_metadata(srp.split("_")[0], detailed=True)
    except Exception:
        # Best-effort fallback when the accession cannot be queried.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        if "Kadosh" in filepath and "Kadosh_30C_37C" not in filepath:
            srp_df = pd.read_csv(
                "/data2/Kadosh_design_files/{}.tsv".format(srp), sep="\t")
        else:
            srp_df = create_df_from_dir(filepath)
    finally:
        # The client is only needed for the query above; always release it.
        sradb.close()

    if "library_layout" in srp_df.columns:
        srp_df["library_layout"] = srp_df["library_layout"].fillna("SINGLE")
    else:
        srp_df["library_layout"] = "SINGLE"
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered selection of the queried table.
    srp_df = srp_df[srp_df["library_layout"].str.contains("SINGLE")].copy()

    # Placeholder columns, created in this order and filled in below.
    for placeholder in (
            "pass1_reads_with_adapters",
            "pass1_total_reads_processed",
            "pass1_adapter",
            "pass2_adapter",
            "pass2_total_reads_processed",
            "pass2_reads_with_adapters",
            "mapping_total_reads_input",
            "uniquely_mapped",
            "uniquely_mapped_percent",
            "ribotricer_orfs",
            "ribotricer_metagene_5p",
            "ribotricer_metagene_3p",
            "ribotricer_metagene_plot",
            "ribotricer_protocol",
            "ribotricer_bam_summary",
    ):
        srp_df[placeholder] = None

    starlogsdir = os.path.join(filepath, "starlogs")
    preprocess_step1_dir = os.path.join(filepath, "preprocessed_step1")
    preprocess_step2_dir = os.path.join(filepath, "preprocessed")

    for srx, srx_group in srp_df.groupby("experiment_accession"):
        ribotricer_output = check_ribotricer_output_exists(
            srp, srx, assembly)
        (
            ribotricer_metagene_5p,
            ribotricer_metagene_3p,
        ) = check_ribotricer_metagene_exists(srp, srx, assembly)
        ribotricer_bam_summary = check_ribotricer_bam_summary_exists(
            srp, srx, assembly)
        ribotricer_protocol = check_ribotricer_protocol_exists(
            srp, srx, assembly)
        ribotricer_metagene_plot = check_ribotricer_metagene_plot_exists(
            srp, srx, assembly)

        # Experiment-level values are written to every run of the experiment.
        srx_mask = srp_df["experiment_accession"] == srx
        if ribotricer_output:
            srp_df.loc[srx_mask, "ribotricer_orfs"] = ribotricer_output
        srp_df.loc[srx_mask, "ribotricer_metagene_5p"] = ribotricer_metagene_5p
        srp_df.loc[srx_mask, "ribotricer_metagene_3p"] = ribotricer_metagene_3p
        srp_df.loc[srx_mask, "ribotricer_bam_summary"] = ribotricer_bam_summary
        srp_df.loc[srx_mask, "ribotricer_protocol"] = ribotricer_protocol
        srp_df.loc[srx_mask,
                   "ribotricer_metagene_plot"] = ribotricer_metagene_plot

        for srr in srx_group["run_accession"].tolist():
            run_mask = srp_df["run_accession"] == srr

            star_log = os.path.join(starlogsdir, srr + "Log.final.out")
            starlogs_df = None
            if os.path.isfile(star_log):
                starlogs_df = parse_star_logs(star_log)

            step1_txt = os.path.join(preprocess_step1_dir,
                                     srr + ".fastq.gz_trimming_report.txt")
            step2_txt = os.path.join(
                preprocess_step2_dir,
                srr + "_trimmed.fq.gz_trimming_report.txt")
            step1_cutadapt_json = None
            step2_cutadapt_json = None
            if os.path.isfile(step1_txt):
                step1_cutadapt_json = cutadapt_to_json(step1_txt)
            if os.path.isfile(step2_txt):
                step2_cutadapt_json = cutadapt_to_json(step2_txt)

            if step1_cutadapt_json:
                _apply_cutadapt_report(srp_df, run_mask, step1_cutadapt_json,
                                       srr, "pass1")
            if step2_cutadapt_json:
                # Second-pass reports key their entries by "<run>_trimmed".
                _apply_cutadapt_report(srp_df, run_mask, step2_cutadapt_json,
                                       srr + "_trimmed", "pass2")

            # NOTE(review): the truthiness test assumes parse_star_logs
            # returns a dict-like object (a DataFrame would raise here) --
            # confirm against its definition.
            if starlogs_df:
                srp_df.loc[run_mask, "mapping_total_reads_input"] = starlogs_df[
                    "total_reads"]
                srp_df.loc[run_mask,
                           "uniquely_mapped"] = starlogs_df["uniquely_mapped"]
                srp_df.loc[run_mask, "uniquely_mapped_percent"] = starlogs_df[
                    "uniquely_mapped_percent"]

    cols = [
        "bases",
        "spots",
        "pass1_reads_with_adapters",
        "pass2_reads_with_adapters",
        "pass2_total_reads_processed",
        "pass1_total_reads_processed",
        "uniquely_mapped",
        "mapping_total_reads_input",
    ]
    for col in cols:
        try:
            srp_df[col] = srp_df[col].apply(millify)
        except Exception:
            # Formatting is cosmetic: leave the column as-is when it is
            # missing or holds values millify cannot handle.
            continue
    return order_dataframe(srp_df, column_order)
Ejemplo n.º 9
0
                root_dir, assembly_build, srp)


def get_fragment_lengths(file_path):
    """Return the ``fragment_length`` column of a tab-separated file as a list.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        The values of the ``fragment_length`` column, in file order.

    Two layouts are supported: a file with a header row naming a
    ``fragment_length`` column, and a headerless three-column file whose
    columns are taken to be ``fragment_length``, ``offset_5p`` and
    ``profile``. Read errors (missing file, empty file, ...) propagate to
    the caller.
    """
    df = pd.read_csv(file_path, sep="\t")
    if "fragment_length" not in df.columns:
        # No usable header: re-read treating the first line as data and
        # label the three columns explicitly.
        df = pd.read_csv(file_path, header=None, sep="\t")
        df.columns = ["fragment_length", "offset_5p", "profile"]
    return df["fragment_length"].tolist()


# In[21]:

db = SRAweb()  #SRAdb("/data2/SRAmetadb.sqlite")
all_projects = []

for species, sample_list in __ASSEMBLY_WISE_SRP__.items():
    mkdir_p("/data2/re-ribo-analysis-metadata/{}".format(species))
    for srp in sample_list:
        basedir = os.path.dirname(
            os.path.dirname(__SRP_TO_ROOT_DIR_MAP__[srp][species]))
        if not os.listdir(__SRP_TO_ROOT_DIR_MAP__[srp][species]):
            continue
        print(srp, basedir)
        df = get_srp_table(srp, species, basedir)
        project_filepath = "{}/{}/{}".format(basedir, species, srp)
        metadata_filepath = "/data2/re-ribo-analysis-metadata/{}/{}.tsv".format(
            species, srp)
        df_subset = df[df.ribotricer_metagene_5p == df.