Ejemplo n.º 1
0
def make_org_db(org_name_file, data_path, exp_path):
    """
    function to collect details of each organism

    @args org_name_file: text file containing organisms name
    @type org_name_file: str 
        
        ovis_aries      SRR645881       ensembl 78
        capra_hircus      SRR645885       ensembl 78

    @args data_path: data file storage path 
    @type data_path: str 
    @args exp_path: experiment related file path  
    @type exp_path: str 
    """

    org_fasta_file = dict( A_carolinensis = '%s/A_carolinensis/STARgenome/Anolis_carolinensis.AnoCar2.0.69.stable_genome.fa' % data_path,
    M_mulatta = "%s/M_mulatta/STARgenome/ensembl_release-69.fa" % data_path,
    O_cuniculus = "%s/O_cuniculus/STARgenome/ensembl_release-69.fa" % data_path,
    M_gallopavo = "%s/M_gallopavo/STARgenome/ensembl_release-69.fa" % data_path, 
    B_anthracis = '%s/B_anthracis/ensembl_release-21/Bacillus_anthracis_str_a0193.GCA_000181915.1.21.dna.toplevel.fa' % data_path,
    C_familiaris = '%s/C_familiaris/ensembl_release-69/Canis_familiaris.CanFam3.1.69.dna.toplevel.fa' % data_path,
    D_melanogaster = '%s/D_melanogaster/ensembl_release-69/Drosophila_melanogaster.BDGP5.69.dna.toplevel.fa' % data_path,
    E_caballus = '%s/E_caballus/ensembl_release-69/Equus_caballus.EquCab2.69_stable.fa.bz2' % data_path,
    M_domestica = '%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.dna.toplevel.fa' % data_path,
    O_sativa = '%s/O_sativa/phytozome_v9.0/Osativa_204.fa' % data_path,
    A_gambiae = '%s/A_gambiae/ensembl_release-20/Anopheles_gambiae.AgamP3.20.dna.fa' % data_path,
    B_rapa = '%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa' % data_path,
    C_japonica = '%s/C_japonica/STARgenome/Caenorhabditis_japonica.C_japonica-7.0.1.22.dna_sm.stable.fa' % data_path,
    G_gallus = '%s/G_gallus/ensembl_release-69/Gallus_gallus.WASHUC2.69_stable.fa.bz2' % data_path,
    M_musculus = '%s/M_musculus/ensembl_release-69/Mus_musculus.GRCm38.69_stable.fa.bz2' % data_path,  
    V_vinifera = '%s/V_vinifera/STARgenome/Vvinifera_145.fa' % data_path,
    A_mellifera = '%s/A_mellifera/ensembl_release-20/apiMel3_ucsc_chrom.fasta' % data_path,
    B_taurus = '%s/B_taurus/STARgenome/ensembl_release-69.fa' % data_path,
    C_rubella = '%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz' % data_path,
    D_rerio = '%s/D_rerio/ensembl_release-69/Danio_rerio.Zv9.69.dna.toplevel.fa.bz2' % data_path,
    G_max = '%s/G_max/phytozome_v9.0/Gmax_189_filter.fa' % data_path,
    M_truncatula = '%s/M_truncatula/STARgenome/Mtruncatula_198.fa' % data_path,
    P_pacificus = '%s/P_pacificus/STARgenome/Pristionchus_pacificus.P_pacificus-5.0.22.dna_sm.stable.fa' % data_path,
    S_scrofa = '%s/S_scrofa/ensembl_release-69/Sus_scrofa.Sscrofa10.2.69.dna_filter.fasta' % data_path,
    X_tropicalis = '%s/X_tropicalis/ensembl_release-69/Xenopus_tropicalis.JGI_4.2.69.dna_stable.fasta.bz2' % data_path,
    C_sativus = '%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa' % data_path,
    D_simulans = '%s/D_simulans/ensembl_release-22/Drosophila_simulans.WUGSC1.22.dna_sm.dna.fa' % data_path,
    H_sapiens = '%s/H_sapiens/hg19_bowtie/hg19.fa' % data_path,
    O_anatinus = '%s/O_anatinus/ensembl_release-69/Ornithorhynchus_anatinus.OANA5.69-filtered_dna.fa' % data_path,
    P_troglodytes = '%s/P_troglodytes/STARgenome/Pan_troglodytes.CHIMP2.1.4.69_stable.fa' % data_path,
    S_tuberosum = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa' % data_path,
    Z_mays = '%s/Z_mays/phytozome_v9.0/Zmays_181.fa' % data_path,
    A_thaliana = '%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas' % data_path,
    C_elegans = '%s/C_elegans/ensembl_release-69/Caenorhabditis_elegans.WBcel215.69.dna.toplevel.fa' % data_path,
    D_discoideum = '%s/D_discoideum/STARgenome/Dictyostelium_discoideum.dictybase.01.21.dna.toplevel.fa' % data_path,
    D_yakuba = '%s/D_yakuba/STARgenome/Drosophila_yakuba.dyak_r1.3_FB2008_07.22.dna_sm_stable.fasta' % data_path,
    O_latipes = '%s/O_latipes/STARgenome/ensembl_release-74.fa' % data_path,
    R_norvegicus = '%s/R_norvegicus/ensembl_release-69/Rattus_norvegicus.RGSC3.4.69.dna.toplevel.fa.bz2' % data_path,
    C_briggsae = '%s/C_briggsae/STARgenome/Caenorhabditis_briggsae.CB4.22.dna_sm_stable.fasta' % data_path,
    C_brenneri = '%s/C_brenneri/STARgenome/Caenorhabditis_brenneri.C_brenneri-6.0.1b.22.dna_sm_stable.fa' % data_path,
    C_remanei = '%s/C_remanei/STARgenome/Caenorhabditis_remanei.C_remanei-15.0.1.22.dna_sm_stable.fa' % data_path,
    D_pseudoobscura = '%s/D_pseudoobscura/STARgenome/Drosophila_pseudoobscura.HGSC2.22.dna_sm_stable.fasta' % data_path,
    T_pseudonana = '%s/T_pseudonana/Thaps3/Thaps3_chromosomes_assembly_chromosomes_repeatmasked.fasta' % data_path,
    T_nigroviridis = '%s/T_nigroviridis/ensembl_release-69/Tetraodon_nigroviridis.TETRAODON8.69.dna.toplevel.fa' % data_path
    )

    org_gtf_file = dict( A_carolinensis = '%s/A_carolinensis/STARgenome/Anolis_carolinensis.AnoCar2.0.69.stable.gtf' % data_path,
    M_mulatta = "%s/M_mulatta/STARgenome/ensembl_release-69.gtf" % data_path,
    O_cuniculus = "%s/O_cuniculus/STARgenome/ensembl_release-69.gtf" % data_path,
    M_gallopavo = "%s/M_gallopavo/STARgenome/ensembl_release-69.gtf" % data_path, 
    B_anthracis = '%s/B_anthracis/ensembl_release-21/Bacillus_anthracis' % data_path,
    C_familiaris = '%s/C_familiaris/ensembl_release-69/Canis_familiaris.CanFam3.1.69.gtf' % data_path,
    D_melanogaster = '%s/D_melanogaster/ensembl_release-69/Drosophila_melanogaster.BDGP5.69.gtf' % data_path,
    E_caballus = '%s/E_caballus/ensembl_release-69/Equus_caballus.EquCab2.69.gtf' % data_path,
    M_domestica = '%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.gtf' % data_path,
    O_sativa = '%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3' % data_path,
    A_gambiae = '%s/A_gambiae/ensembl_release-20/Anopheles_gambiae.AgamP3.20.gtf' % data_path,
    B_rapa = '%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3' % data_path,
    C_japonica = '%s/C_japonica/ensembl_release-22/Caenorhabditis_japonica.C_japonica-7.0.1.22.gff3' % data_path,
    G_gallus = '%s/G_gallus/ensembl_release-69/Gallus_gallus.WASHUC2.69.gtf' % data_path,
    M_musculus = '%s/M_musculus/ensembl_release-69/Mus_musculus.GRCm38.69.gtf' % data_path,  
    V_vinifera = '%s/V_vinifera/STARgenome/Vvinifera_145_gene.gff3' % data_path,
    A_mellifera = '%s/A_mellifera/ensembl_release-20/apiMel3_ucsc_ensembl_genes.gtf' % data_path,
    B_taurus = '%s/ensembl_release-69/Bos_taurus.UMD3.1.69.gtf' % data_path,
    C_rubella = '%s/C_rubella/phytozome_v9.0/Crubella_183.gff3' % data_path,
    D_rerio = '%s/D_rerio/ensembl_release-69/Danio_rerio.Zv9.69.gtf' % data_path,
    G_max = '%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3' % data_path,
    M_truncatula = '%s/M_truncatula/' % data_path,
    P_pacificus = '%s/P_pacificus/ensembl_release-22/Pristionchus_pacificus.P_pacificus-5.0.22.gtf' % data_path,
    S_scrofa = '%s/S_scrofa/ensembl_release-69/Sus_scrofa.Sscrofa10.2.69.gtf' % data_path,
    X_tropicalis = '%s/X_tropicalis/STARgenome/JGIv4-1.gtf' % data_path,
    C_sativus = '%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3' % data_path,
    D_simulans = '%s/D_simulans/ensembl_release-22/Drosophila_simulans.WUGSC1.22.gff3' % data_path,
    H_sapiens = '%s/H_sapiens/ensembl_release-69/Homo_sapiens.GRCh37.69.gtf' % data_path,
    O_anatinus = '%s/O_anatinus/ensembl_release-69/' % data_path,
    P_troglodytes = '%s/P_troglodytes/ensembl_release-69/Pan_troglodytes.CHIMP2.1.4.69.gtf' % data_path,
    S_tuberosum = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3' % data_path,
    Z_mays = '%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3' % data_path,
    A_thaliana = '%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff' % data_path,
    C_elegans = '%s/C_elegans/ensembl_release-69/Caenorhabditis_elegans.WBcel215.69.gtf' % data_path,
    D_discoideum = '%s/D_discoideum/' % data_path,
    D_yakuba = '%s/D_yakuba/ensembl_release-22/Drosophila_yakuba.dyak_r1.3_FB2008_07.22.gff3' % data_path,
    O_latipes = '%s/O_latipes/STARgenome/Oryzias_latipes.MEDAKA1.74.gtf' % data_path,
    R_norvegicus = '%s/R_norvegicus/ensembl_release-69/Rattus_norvegicus.RGSC3.4.69.gtf' % data_path,
    C_briggsae = '%s/C_briggsae/ensembl_release-22/Caenorhabditis_briggsae.CB4.22.gff3' % data_path,
    C_brenneri = '%s/C_brenneri/ensembl_release-22/Caenorhabditis_brenneri.C_brenneri-6.0.1b.22.gff3' % data_path,
    C_remanei = '%s/C_remanei/ensembl_release-22/Caenorhabditis_remanei.C_remanei-15.0.1.22.gff3' % data_path,
    D_pseudoobscura = '%s/D_pseudoobscura/ensembl_release-22/Drosophila_pseudoobscura.HGSC2.22.gff3' % data_path,
    T_pseudonana = '%s/T_pseudonana/Thaps3/Thaps3_chromosomes_geneModels_FilteredModels2.gff' % data_path,
    T_nigroviridis = '%s/T_nigroviridis/ensembl_release-69/Tetraodon_nigroviridis.TETRAODON8.69.gtf' % data_path
    )

    org_db = defaultdict()

    ## get the organisms name and details on the experiment
    with open(org_name_file, "rU") as fh:
        for name in fh:
            name = name.strip('\n\r').split('\t')
            #print name 
            
            ## name shortening 
            token = name[0].split("_") 
            genus, species = token[0], token[-1]
            short_name = "%s_%s" % (genus[0].upper(), species.lower())  
             
            ## adding details 
            org_db[short_name] = dict(name = name[0])  
            org_db[short_name]['short_name'] = short_name

            ## sequencing reads files 
            sra_files = [] 
            if os.path.isdir("%s/%s/source_data" % (exp_path, short_name)):
                for sra_file in os.listdir("%s/%s/source_data" % (exp_path, short_name)):
                    file_prefix, ext = os.path.splitext(sra_file)
                    if ext == ".sra":
                        continue 
                    sra_files.append(sra_file) 
            else:
                # new organism, creating sub directories  
                for sub_dir in ['source_data', 'read_mapping', 'signal_labels', 'trans_pred']:
                    try:
                        os.makedirs("%s/%s/%s" % (exp_path, short_name, sub_dir))
                    except OSError:
                        print "Skipping creation of %s/%s/%s because it exists already." % (exp_path, short_name, sub_dir) 
                
            org_db[short_name]['fastq_path'] = "%s/%s/source_data" % (exp_path, short_name)
            org_db[short_name]['fastq'] = sra_files

            org_db[short_name]['star_wd'] = "%s/%s/read_mapping" % (exp_path, short_name)
            org_db[short_name]['trsk_wd'] = "%s/%s/trans_pred" % (exp_path, short_name)
            org_db[short_name]['labels_wd'] = "%s/%s/signal_labels" % (exp_path, short_name)

            org_db[short_name]['bam'] = "%s/%s/read_mapping/unique_map.bam" % (exp_path, short_name)
            org_db[short_name]['pred_gff'] = "%s/%s/trans_pred/ss_filter_predgenes.gff" % (exp_path, short_name)

            ## check for the genome sequence file 
            if short_name in org_fasta_file:
                org_db[short_name]['fasta'] = org_fasta_file[short_name]
            else:
                if not os.path.isdir("%s" % data_path):
                    os.makedirs("%s" % data_path) 
                else:
                    print "Skipping creation of %s because it exists already." % data_path

                org_db[short_name]['fasta'] = "%s" % data_path

            ## check for the genome index file 
            if short_name in star_index_file:
                org_db[short_name]['index'] = star_index_file[short_name]
            else:
                if not os.path.isdir("%s" % data_path):
                    os.makedirs("%s" % data_path)
                else:    
                    print "Skipping creation of %s because it exists already." % data_path

                org_db[short_name]['index'] = "%s/%s/STARgenome/" % (data_path, short_name) 

            ## TODO remove the dependency of gio from TSKM 
            if short_name in org_gio_file:
                org_db[short_name]['gio'] = org_gio_file[short_name]
            else:
                org_db[short_name]['gio'] = "%s" % data_path

            ## check the genome annotation 
            if short_name in org_gtf_file:
                org_db[short_name]['gtf'] = org_gtf_file[short_name]
                ## get the gtf feature lengths 
                if os.path.isfile(org_gtf_file[short_name]):
                    from fetch_remote_data import prepare_data as pd

                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name]) 
                    org_db[short_name]['max_intron'] = feat_len_db['max_intron']
                    org_db[short_name]['max_exon'] = feat_len_db['max_exon']
            else:
                org_db[short_name]['gtf'] = data_path
                org_db[short_name]['max_intron'] = None
                org_db[short_name]['max_exon'] = None
            
            ## SRA/ENA run id 
            try:
                org_db[short_name]['sra_run_id'] = name[1]
            except:
                org_db[short_name]['sra_run_id'] = None 
                print "SRA run_id missing"

            ## genome annotation release number
            try:
                version = name[2].split(' ')
                org_db[short_name]['release_db'] = version[0]
                org_db[short_name]['release_num'] = version[-1]
                
                #sub_genome_folder = '%s/%s/%s_release_%s' % (data_path, short_name, version[0], version[-1]) 
                ## updating the fasta/gtf/index file location 
                #try:
                #    os.makedirs(sub_genome_folder)
                #    org_db[short_name]['fasta'] = sub_genome_folder
                #    org_db[short_name]['gtf'] = sub_genome_folder
                #    star_index_folder = '%s/STARgenome' % sub_genome_folder
                #    try:
                #        os.makedirs(star_index_folder)
                #        org_db[short_name]['index'] = star_index_folder
                #    except OSError:
                #        print "skipping creation of %s star index dir" % star_index_folder

                #except OSError:
                #    print "skipping creation of %s genome version dir" % sub_genome_folder

            except:
                org_db[short_name]['release_db'] = None
                org_db[short_name]['release_num'] = None 
                print "Genome annotation release database and number are missing"
                
    fh.close() 
    
    return org_db
Ejemplo n.º 2
0
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    FIXME descriptions 
    @args config_file: yaml file contain the information for the experiment
    @type config_file: str 
    """

    ## parsing the config file 
    config_map = yaml.safe_load(open(config_file, "rU"))

    data_path = config_map['genome_data_path']['dir']
    exp_path = config_map['experiment_data_path']['dir']

    org_fasta_file = dict( 
    A_carolinensis  = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_mulatta       = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas" % data_path,
    O_cuniculus     = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
    M_gallopavo     = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path, 
    B_anthracis     = '%s/B_anthracis/' % data_path,
    C_familiaris    = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas' % data_path,
    D_melanogaster  = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas' % data_path,
    E_caballus      = '%s/E_caballus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_domestica     = '%s/M_domestica/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_sativa        = '%s/O_sativa/phytozome_v9.0/Osativa_204.fa' % data_path,
    A_gambiae       = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.fas' % data_path,
    B_rapa          = '%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa' % data_path,
    G_gallus        = '%s/G_gallus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_musculus      = '%s/M_musculus/ensembl_release_79/ensembl_release_79.fas' % data_path,  
    V_vinifera      = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2' % data_path,
    A_mellifera     = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.fas' % data_path,
    B_taurus        = '%s/B_taurus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_rubella       = '%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz' % data_path,
    D_rerio         = '%s/D_rerio/ensembl_release_79/ensembl_release_79.fas' % data_path,
    G_max           = '%s/G_max/phytozome_v9.0/Gmax_189_filter.fa' % data_path,
    M_truncatula    = '%s/M_truncatula/STARgenome/Mtruncatula_198.fa' % data_path,
    P_pacificus     = '%s/P_pacificus/' % data_path,
    S_scrofa        = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas' % data_path,
    X_tropicalis    = '%s/X_tropicalis/JGIv4-1/JGIv4-1.fa' % data_path,
    C_sativus       = '%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa' % data_path,
    D_simulans      = '%s/D_simulans/ensembl_release_28/ensembl_release_28.fas' % data_path,
    H_sapiens       = '%s/H_sapiens/ensembl_release_79/STARgenome/hg19_chrOnly.fa' % data_path,
    N_vitripennis   = '%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa' % data_path,
    P_troglodytes   = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_tuberosum     = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa' % data_path,
    Z_mays          = '%s/Z_mays/phytozome_v9.0/Zmays_181.fa' % data_path,
    A_thaliana      = '%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas' % data_path,
    O_aries         = '%s/O_aries/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_jacchus       = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_elegans       = '%s/C_elegans/ensembl_release-69/Caenorhabditis_elegans.WBcel215.69.dna.toplevel.fa' % data_path,
    O_latipes       = '%s/O_latipes/ensembl_release_79/ensembl_release_79.fas' % data_path,
    R_norvegicus    = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    G_gorilla       = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.fas' % data_path,
    P_paniscus      = '%s/P_paniscus/eva_mpg_de/eva_mpg_de.fas' % data_path,
    C_porcellus     = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_anatinus      = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_niloticus     = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    L_chalumnae     = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.fas' % data_path,
    H_glaber        = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.fas' % data_path,
    M_eugenii       = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_briggsae      = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_japonica      = '%s/C_japonica/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_remanei       = '%s/C_remanei/ensembl_release_28/ensembl_release_28.fas' % data_path,
    P_marinus       = '%s/P_marinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_brenneri      = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_intestinalis  = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_cerevisiae    = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_pombe         = '%s/S_pombe/ensembl_release_28/ensembl_release_28.fas' % data_path,
    A_aegypti       = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_hircus        = '%s/C_hircus/ncbi_genome/ncbi_genome.fas' % data_path,
    F_catus         = '%s/F_catus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    T_nigroviridis  = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas' % data_path
    )
    #C_elegans       = '%s/C_elegans/ensembl_release_79/ensembl_release_79.fas' % data_path,

    org_gtf_file = dict( 
    A_carolinensis  = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_mulatta       = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    O_cuniculus     = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    M_gallopavo     = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path, 
    B_anthracis     = '%s/B_anthracis/ensembl_release-21/Bacillus_anthracis' % data_path,
    C_familiaris    = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_melanogaster  = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    E_caballus      = '%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_domestica     = '%s/M_domestica/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_sativa        = '%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3' % data_path,
    A_gambiae       = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    B_rapa          = '%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3' % data_path,
    G_gallus        = '%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_musculus      = '%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf' % data_path,  
    V_vinifera      = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2' % data_path,
    A_mellifera     = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    B_taurus        = '%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_jacchus       = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_rubella       = '%s/C_rubella/phytozome_v9.0/Crubella_183.gff3' % data_path,
    D_rerio         = '%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    G_max           = '%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3' % data_path,
    N_vitripennis   = '%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf' % data_path,
    O_aries         = '%s/O_aries/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_truncatula    = '%s/M_truncatula/' % data_path,
    P_pacificus     = '%s/P_pacificus/ensembl_release-22/.gtf' % data_path,
    S_scrofa        = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    X_tropicalis    = '%s/X_tropicalis/JGIv4-1/JGIv4-1.gff' % data_path,
    C_sativus       = '%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3' % data_path,
    D_simulans      = '%s/D_simulans/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    H_sapiens       = '%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    P_troglodytes   = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_tuberosum     = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3' % data_path,
    Z_mays          = '%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3' % data_path,
    A_thaliana      = '%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff' % data_path,
    C_elegans       = '%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_discoideum    = '%s/D_discoideum/' % data_path,
    D_yakuba        = '%s/D_yakuba/ensembl_release-22/.gff3' % data_path,
    O_latipes       = '%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    R_norvegicus    = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_pseudoobscura = '%s/D_pseudoobscura/ensembl_release-22/.gff3' % data_path,
    T_pseudonana    = '%s/T_pseudonana/Thaps3/.gff' % data_path,
    G_gorilla       = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_porcellus     = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_anatinus      = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_niloticus     = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    L_chalumnae     = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_briggsae      = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    M_eugenii       = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_remanei       = '%s/C_remanei/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_brenneri      = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_intestinalis  = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_pombe         = '%s/S_pombe/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    P_marinus       = '%s/P_marinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_cerevisiae    = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_japonica      = '%s/C_japonica/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    F_catus         = '%s/F_catus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    A_aegypti       = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_hircus        = '%s/C_hircus/ncbi_genome/ncbi_genome.gff' % data_path,
    T_nigroviridis  = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf' % data_path
    )

    ## TODO algorithms details 
    #A_aegypti = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    #F_catus = '%s/F_catus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    #C_hircus = '%s/C_hircus/ncbi_genome/ncbi_genome.gff' % data_path,
    #H_glaber = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.gtf' % data_path,

    ## experiment details  
    org_db = defaultdict()
    
    for ent in config_map['experiment']:
        species_name = ent['organism_name'] 
        sra_run_id = ent['sra_run_id']
        genome_build_version = ent['genome_build_version']
        db_server = ent['release_db']

        ## mapping to short names       arabidopsis_thaliana --> A_thaliana
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name = short_name)  
        org_db[short_name]['long_name'] = species_name
        org_db[short_name]['sra_run_id'] = sra_run_id
        org_db[short_name]['genome_release_db'] = genome_build_version
        ## the broad path to the experiment 
        org_db[short_name]['genome_dir'] = data_path
        org_db[short_name]['experiment_dir'] = exp_path

        build_release = genome_build_version.split("_")
        org_db[short_name]['release_db'] = db_server ## ensembl_metazoa, phytozome
        org_db[short_name]['release_nb'] = build_release[-1] ## build number 

        sra_files = [] ## sequencing reads files 
        if os.path.isdir("%s/%s/source_data" % (exp_path, short_name)):
            for sra_file in os.listdir("%s/%s/source_data" % (exp_path, short_name)):
                file_prefx, ext = os.path.splitext(sra_file)
                if ext == ".sra": ## skipping the original .sra binary file 
                    continue
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file) 
        else:
            print "warning: missing sequencing read trace file %s/%s/source_data" % (exp_path, short_name) 
                
        org_db[short_name]['fastq_path'] = "%s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]['fastq'] = sra_files

        ## read mapping, read assembly and label generation working folders 
        for sub_dir in ['read_mapping', 'signal_labels', 'trans_pred', 'source_data']:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)

            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    exit("error: cannot create the directory %s." % work_path)

        org_db[short_name]['read_map_dir'] = "%s/%s/read_mapping" % (exp_path, short_name)
        org_db[short_name]['read_assembly_dir'] = "%s/%s/trans_pred" % (exp_path, short_name)
        org_db[short_name]['labels_dir'] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length
        readlength = 0 
        if opt_action in ["2", "3"]: ## perform this action only for selected options 
            if sra_files:
                fqfile = os.path.join(org_db[short_name]['fastq_path'], sra_files[0])
                print 'using sequencing read file %s to determine readLength' % fqfile
                fh = helper.open_file(fqfile)
                for rec in SeqIO.parse(fh, "fastq"):
                    readlength = len(rec.seq)
                    break
                fh.close() 
        org_db[short_name]['read_length'] = readlength

        ## check for the genome sequence file 
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org_db[short_name]['fasta'] = org_fasta_file[short_name]
            else:
                org_db[short_name]['fasta'] = None
        else:
            print "warning: missing genome sequence file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version)
            org_db[short_name]['fasta'] = None

        if not os.path.isdir("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)):
            try:
                os.makedirs("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version))
            except OSError:
                exit("error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version))

        org_db[short_name]['genome_index_dir'] = "%s/%s/%s/STARgenome/" % (data_path, short_name, genome_build_version) 

        ## check the genome annotation 
        if short_name in org_gtf_file:
            if os.path.isfile(org_gtf_file[short_name]):
                org_db[short_name]['gtf'] = org_gtf_file[short_name]

                if opt_action in ["2", "3", "4", "c"]: ## perform this action only for selected options 
                    ## get the gtf feature lengths 
                    from fetch_remote_data import prepare_data as pd
                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name]) 
                    org_db[short_name]['max_intron_len'] = feat_len_db['max_intron']
                    org_db[short_name]['max_exon_len'] = feat_len_db['max_exon']
            else:
                exit("error: the provided gtf file %s is not available to read. Please check!" % org_gtf_file[short_name])
        else:
            print("warning: missing annotation file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version))
            org_db[short_name]['gtf'] = None
            org_db[short_name]['max_intron_len'] = None
            org_db[short_name]['max_exon_len'] = None
        
        print("fetched details for %s" % short_name) 
            
    return org_db
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    FIXME descriptions 
    @args config_file: yaml file contain the information for the experiment
    @type config_file: str 
    """

    ## parsing the config file
    config_map = yaml.safe_load(open(config_file, "rU"))

    data_path = config_map["genome_data_path"]["dir"]
    exp_path = config_map["experiment_data_path"]["dir"]

    org_fasta_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis_str_a0193.GCA_000181915.1.21.dna.toplevel.fa"
        % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.dna.toplevel.fa" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204.fa" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_rm.fasta" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa" % data_path,
        C_japonica="%s/C_japonica/STARgenome/Caenorhabditis_japonica.C_japonica-7.0.1.22.dna_sm.stable.fa" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel3_sm.fasta" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.fas" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_filter.fa" % data_path,
        M_truncatula="%s/M_truncatula/STARgenome/Mtruncatula_198.fa" % data_path,
        P_pacificus="%s/P_pacificus/STARgenome/Pristionchus_pacificus.P_pacificus-5.0.22.dna_sm.stable.fa" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.fa" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        H_sapiens="%s/H_sapiens/hg19_bowtie2/hg19.fa" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/Ornithorhynchus_anatinus.OANA5.69-filtered_dna.fa" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181.fa" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.fas" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.fas" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas" % data_path,
    )

    org_gtf_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis" % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.gtf" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_ucsc.gtf" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3" % data_path,
        C_japonica="%s/C_japonica/ensembl_release-22/Caenorhabditis_japonica.C_japonica-7.0.1.22.gff3" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel2_ucsc.gtf" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.gff3" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_truncatula="%s/M_truncatula/" % data_path,
        P_pacificus="%s/P_pacificus/ensembl_release-22/Pristionchus_pacificus.P_pacificus-5.0.22.gtf" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.gff" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        H_sapiens="%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        D_discoideum="%s/D_discoideum/" % data_path,
        D_yakuba="%s/D_yakuba/ensembl_release-22/Drosophila_yakuba.dyak_r1.3_FB2008_07.22.gff3" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        C_brenneri="%s/C_brenneri/ensembl_release-22/Caenorhabditis_brenneri.C_brenneri-6.0.1b.22.gff3" % data_path,
        C_remanei="%s/C_remanei/ensembl_release-22/Caenorhabditis_remanei.C_remanei-15.0.1.22.gff3" % data_path,
        D_pseudoobscura="%s/D_pseudoobscura/ensembl_release-22/Drosophila_pseudoobscura.HGSC2.22.gff3" % data_path,
        T_pseudonana="%s/T_pseudonana/Thaps3/Thaps3_chromosomes_geneModels_FilteredModels2.gff" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    )

    ## TODO algorithms details

    ## experiment details
    org_db = defaultdict()

    for ent in config_map["experiment"]:
        species_name = ent["organism_name"]
        sra_run_id = ent["sra_run_id"]
        genome_build_version = ent["genome_build_version"]
        db_server = ent["release_db"]

        ## mapping to short names       arabidopsis_thaliana --> A_thaliana
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]["long_name"] = species_name
        org_db[short_name]["sra_run_id"] = sra_run_id
        org_db[short_name]["genome_release_db"] = genome_build_version
        ## the broad path to the experiment
        org_db[short_name]["genome_dir"] = data_path
        org_db[short_name]["experiment_dir"] = exp_path

        build_release = genome_build_version.split("_")
        org_db[short_name]["release_db"] = db_server  ## ensembl_metazoa, phytozome
        org_db[short_name]["release_nb"] = build_release[-1]  ## build number

        sra_files = []  ## sequencing reads files
        if os.path.isdir("%s/%s/source_data" % (exp_path, short_name)):
            for sra_file in os.listdir("%s/%s/source_data" % (exp_path, short_name)):
                file_prefx, ext = os.path.splitext(sra_file)
                if ext == ".sra":  ## skipping the original .sra binary file
                    continue
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file)
        else:
            print "warning: missing sequencing read trace file %s/%s/source_data" % (exp_path, short_name)

        org_db[short_name]["fastq_path"] = "%s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]["fastq"] = sra_files

        ## read mapping, read assembly and label generation working folders
        for sub_dir in ["read_mapping", "signal_labels", "trans_pred"]:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)

            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    print "error: cannot create the directory %s." % work_path
                    sys.exit(0)

        org_db[short_name]["read_map_dir"] = "%s/%s/read_mapping" % (exp_path, short_name)
        org_db[short_name]["read_assembly_dir"] = "%s/%s/trans_pred" % (exp_path, short_name)
        org_db[short_name]["labels_dir"] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length
        readlength = 0
        if opt_action in ["2", "3"]:  ## perform this action only for selected options
            if sra_files:
                fqfile = os.path.join(org_db[short_name]["fastq_path"], sra_files[0])
                print "using sequencing read file %s to determine readLength" % fqfile
                fh = helper.open_file(fqfile)
                for rec in SeqIO.parse(fh, "fastq"):
                    readlength = len(rec.seq)
                    break
                fh.close()
        org_db[short_name]["read_length"] = readlength

        ## check for the genome sequence file
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org_db[short_name]["fasta"] = org_fasta_file[short_name]
            else:
                org_db[short_name]["fasta"] = None
        else:
            print "warning: missing genome sequence file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            )
            org_db[short_name]["fasta"] = None

        if not os.path.isdir("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)):
            try:
                os.makedirs("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version))
            except OSError:
                print "error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version)
                sys.exit(0)

        org_db[short_name]["genome_index_dir"] = "%s/%s/%s/STARgenome/" % (data_path, short_name, genome_build_version)

        ## check the genome annotation
        if short_name in org_gtf_file:

            if os.path.isfile(org_gtf_file[short_name]):
                org_db[short_name]["gtf"] = org_gtf_file[short_name]

                if opt_action in ["2", "3", "4", "c"]:  ## perform this action only for selected options
                    ## get the gtf feature lengths
                    from fetch_remote_data import prepare_data as pd

                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name])
                    org_db[short_name]["max_intron_len"] = feat_len_db["max_intron"]
                    org_db[short_name]["max_exon_len"] = feat_len_db["max_exon"]
            else:
                print "error: the provided gtf file %s is not available to read. Please check!" % org_gtf_file[
                    short_name
                ]
                sys.exit(-1)
        else:
            print "warning: missing annotation file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            )
            org_db[short_name]["gtf"] = None
            org_db[short_name]["max_intron_len"] = None
            org_db[short_name]["max_exon_len"] = None

        print "fetched details for %s" % short_name

    return org_db