Esempio n. 1
0
def fasta_seq_length(fa_name):
    """
    general information about contigs lengths in a FASTA file

    Prints one "<record id> <length>" line per FASTA record and then a
    summary: number of entries, the longest contig, the shortest contig and
    the floor of the average contig length. Nothing is returned.

    @args fa_name: FASTA file, handed to gfftools helper.open_file
    """
    from operator import itemgetter
    from gfftools import helper

    seq_info = dict()
    fah = helper.open_file(fa_name)
    try:
        for rec in SeqIO.parse(fah, "fasta"):
            seq_info[rec.id] = len(rec.seq)
            print("%s %d" % (rec.id, seq_info[rec.id]))
    finally:
        # release the handle even when the parser fails half way through
        fah.close()

    print("")
    print("Number of FASTA entries:  %d" % len(seq_info))
    if not seq_info:
        # no records: there is no longest/shortest contig and the average
        # would divide by zero, so stop after the count
        print("")
        return

    by_length = itemgetter(1)
    # max()/min() return the first extreme item, the same record the former
    # stable sorts put in front
    print("Long contig length (bp):  %s %d" % max(seq_info.items(), key=by_length))
    print("Short contig length (bp):  %s %d" % min(seq_info.items(), key=by_length))
    # floor division: integer average regardless of the interpreter version
    print("Average length of FASTA contig (bp):  %d" % (sum(seq_info.values()) // len(seq_info)))
    print("")
Esempio n. 2
0
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence

    The annotation is parsed with GFFParser.Parse, entries whose first
    transcript has no CDS exons are dropped, and for every FASTA record the
    CDS exons of the genes located on it are joined, reverse-complemented on
    the minus strand, translated and written as FASTA records.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """

    if filecmp.cmp(gtf_file, fas_file):
        # sys.exit instead of the interactive-only ``exit`` helper
        sys.exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    ## deleting the empty cds lines
    ## TSkim annotation expects only single transcript from a region
    cds_idx = [idp for idp, feat in enumerate(anno_db) if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    try:
        out_seq_fh = open(out_seq_fname, "w")
        try:
            for rec in SeqIO.parse(fasFH, "fasta"):
                # FIXME need an efficient way to translate multiple gene
                # every gene is compared against every chromosome record
                for feature in anno_db:
                    if rec.id != feature['chr']:
                        continue

                    ## iterate over cds_exons, single transcript by TSkim
                    cds_seq = ''
                    for ex in feature['cds_exons'][0]:
                        # exon coordinates are used as 1-based inclusive
                        cds_seq += rec.seq[ex[0]-1:ex[1]]

                    if feature['strand'] == '-':
                        cds_seq = cds_seq.reverse_complement()

                    ## fasta output
                    if cds_seq:
                        prt_seq = SeqRecord(cds_seq.translate(), id=feature['name'], description='protein sequence')
                        out_seq_fh.write(prt_seq.format("fasta"))
        finally:
            # flush and close the output even when parsing/translation fails
            out_seq_fh.close()
    finally:
        fasFH.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    # NOTE(review): this is the number of genes with CDS exons, genes on
    # contigs missing from the FASTA file are counted but never written
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
Esempio n. 3
0
def fasta_seq_length(fa_name):
    """
    general information about contigs lengths in a FASTA file

    Prints one "<record id> <length>" line per FASTA record and then a
    summary: number of entries, the longest contig, the shortest contig and
    the floor of the average contig length. Nothing is returned.

    @args fa_name: FASTA file, handed to gfftools helper.open_file
    """
    from operator import itemgetter
    from gfftools import helper

    seq_info = dict()
    fah = helper.open_file(fa_name)
    try:
        for rec in SeqIO.parse(fah, "fasta"):
            seq_info[rec.id] = len(rec.seq)
            print("%s %d" % (rec.id, seq_info[rec.id]))
    finally:
        # release the handle even when the parser fails half way through
        fah.close()

    print("")
    print("Number of FASTA entries:  %d" % len(seq_info))
    if not seq_info:
        # no records: there is no longest/shortest contig and the average
        # would divide by zero, so stop after the count
        print("")
        return

    by_length = itemgetter(1)
    # max()/min() return the first extreme item, the same record the former
    # stable sorts put in front
    print("Long contig length (bp):  %s %d" % max(seq_info.items(), key=by_length))
    print("Short contig length (bp):  %s %d" % min(seq_info.items(), key=by_length))
    # floor division: integer average regardless of the interpreter version
    print("Average length of FASTA contig (bp):  %d" % (sum(seq_info.values()) // len(seq_info)))
    print("")
def check_splice_site_consensus(fas_file, splice_region):
    """
    splice site consensus check

    Walks over the FASTA records whose id is a key of splice_region and, for
    every (gene key -> regions) pair stored there, counts how many regions
    are flanked by the AG acceptor / GT donor dinucleotides (read on the
    gene's strand, the last element of the gene key). A gene is kept when it
    has a single region, or when both counts exceed half the number of
    regions.

    @args fas_file: genome sequence in fasta file
    @args splice_region: record id -> list of dicts {gene key: regions}

    returns a dict keyed by (record id, gene key[0], gene key[1], gene key[2])
    """

    def flank(sequence, begin, minus_strand):
        # two bases starting at `begin`, read on the requested strand
        pair = sequence[begin:begin + 2]
        if minus_strand:
            pair = pair.reverse_complement()
        return str(pair).upper()

    sys.stdout.write( "splice site sequence consensus check started...\n")
    get_gene_models = defaultdict()
    splice_site_con = 0

    fas_fh = helper.open_file(fas_file)
    for fas_rec in SeqIO.parse(fas_fh, "fasta"):
        if fas_rec.id not in splice_region:
            continue

        for details in splice_region[fas_rec.id]:
            for genes, regions in details.items():
                ## single exon long transcripts no checks
                keep_model = len(regions) == 1

                if not keep_model:
                    acc_cons_cnt = 0
                    don_cons_cnt = 0
                    strand = genes[-1]
                    on_minus = strand == '-'

                    if strand == '+' or on_minus:
                        for region in regions:
                            if region[0]:
                                ## the two bases right before the region start:
                                ## acceptor on '+', donor on '-'
                                left = flank(fas_rec.seq, int(region[0]) - 3, on_minus)
                                if on_minus:
                                    if left == "GT":
                                        don_cons_cnt += 1
                                elif left == "AG":
                                    acc_cons_cnt += 1

                            if region[1]:
                                ## the two bases right after the region end:
                                ## donor on '+', acceptor on '-'
                                right = flank(fas_rec.seq, int(region[1]), on_minus)
                                if on_minus:
                                    if right == "AG":
                                        acc_cons_cnt += 1
                                elif right == "GT":
                                    don_cons_cnt += 1

                    ## check for half of the consensus sites (integer half)
                    half = len(regions) // 2
                    keep_model = acc_cons_cnt > half and don_cons_cnt > half

                if keep_model:
                    get_gene_models[(fas_rec.id, genes[0], genes[1], genes[2])] = 1
                else:
                    splice_site_con += 1
    fas_fh.close()

    sys.stdout.write( "...considering %d best transcripts\n" % len(get_gene_models))
    sys.stdout.write( "discarding transcripts...\n")
    sys.stdout.write( "\t%d splice-site consensus sequence missing\n" % splice_site_con)

    return get_gene_models
Esempio n. 5
0
def read_genome_file(fas_file):
    """
    read genome file in fasta and return the list of chromosomes/contigs 

    NOTE(review): only the file-opening step is present in this body; the
    contig listing announced above is not implemented here.

    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 
    """
    
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 6
0
def read_genome_file(fas_file):
    """
    read genome file in fasta and return the list of chromosomes/contigs 

    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 

    returns a list with contig_names and length
    NOTE(review): the body shown here only opens the file; the code that
    builds the returned list is not part of it.
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 7
0
def clean_genome_file(chr_names, fas_file, fas_out):
    """
    make a stable genome file with valid contigs 

    NOTE(review): the body shown here only opens fas_file; chr_names and
    fas_out are not used yet.

    @args chr_names: different contig names with a valid genome sequence 
    @type chr_names: dict
    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 
    @args fas_out: new genome sequence file in fasta format 
    @type fas_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 8
0
def clean_anno_file(chr_names, gtf_file, gtf_out):
    """
    make stable annotation file with valid contig name 

    NOTE(review): the body shown here only opens gtf_file; chr_names and
    gtf_out are not used yet.

    @args chr_names: different contig names with a valid genome sequence 
    @type chr_names: dict
    @args gtf_file: input annotation file (presumably gtf/gff, going by the name)
    @type gtf_file: str 
    @args gtf_out: new annotation file to be written 
    @type gtf_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(gtf_file)
    except Exception as open_failure:
        # report both the reason and the offending file name
        report = 'error %s in reading file %s' % (open_failure, gtf_file)
        stop_err(report)
Esempio n. 9
0
def genome_file_rec_extract(chr_pattn, fas_file, fas_out):
    """
    get all contigs based on a matching string in the record identifier

    NOTE(review): the body shown here only opens fas_file; chr_pattn and
    fas_out are not used yet.

    @args chr_pattn: pattern to be searched in contig names 
    @type chr_pattn: str
    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 
    @args fas_out: new genome sequence file in fasta format 
    @type fas_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 10
0
def clean_anno_file(chr_names, gtf_file, gtf_out):
    """
    make stable annotation file with valid contig name 

    NOTE(review): the body shown here only opens gtf_file; chr_names and
    gtf_out are not used yet.

    @args chr_names: different contig names with a valid genome sequence 
    @type chr_names: dict
    @args gtf_file: genome annotation in gtf/gff form 
    @type gtf_file: str 
    @args gtf_out: new genome annotation in gtf/gff form 
    @type gtf_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(gtf_file)
    except Exception as open_failure:
        # report both the reason and the offending file name
        report = 'error %s in reading file %s' % (open_failure, gtf_file)
        stop_err(report)
Esempio n. 11
0
def clean_genome_file(chr_names, fas_file, fas_out):
    """
    make a stable genome file with valid contigs 

    NOTE(review): the body shown here only opens fas_file; chr_names and
    fas_out are not used yet.

    @args chr_names: different contig names with a valid genome sequence 
    @type chr_names: dict
    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 
    @args fas_out: new genome sequence file in fasta format 
    @type fas_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 12
0
def genome_file_rec_extract(chr_pattn, fas_file, fas_out):
    """
    get all contigs based on a matching string in the record identifier

    NOTE(review): the body shown here only opens fas_file; chr_pattn and
    fas_out are not used yet.

    @args chr_pattn: pattern to be searched in contig names 
    @type chr_pattn: str
    @args fas_file: genome sequence in fasta file  
    @type fas_file: str 
    @args fas_out: new genome sequence file in fasta format 
    @type fas_out: str 
    """

    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        # and would hide the real reason the file could not be opened
        stop_err('error in reading file %s' % errmsg)
Esempio n. 13
0
def __main__():
    """
    Command line entry: read the input file name and the output file name
    from sys.argv, collect the valid chromosome identifiers from STDIN (one
    per line) and open the input file.

    Prints the module docstring and exits with status -1 when an argument is
    missing.
    """

    try:
        fname = sys.argv[1]
        fa_out = sys.argv[2]
    except IndexError:
        # only a missing argument is expected here; anything else should
        # surface instead of being swallowed by a bare except
        print(__doc__)
        sys.exit(-1)

    # get the valid chromosome identifier from user as STDIN
    chrom = dict()
    for line in sys.stdin:  # not named `chr`: that would shadow the builtin
        chrom[line.strip()] = 1

    # get the filehandler from input file
    try:
        fh = helper.open_file(fname)
    except Exception as errmsg:
        # %-format the exception: 'text' + exception object raises TypeError
        stop_err('error in reading file %s' % errmsg)
Esempio n. 14
0
def true_ss_seq_fetch(fnam, Label, boundary):
    """
    true splice signals 

    For every FASTA record whose id is a key of Label, each labelled feature
    location is checked at its first and second coordinate: a window of
    2*boundary bases is cut around the coordinate and the two bases in the
    middle of the window are compared with the expected dinucleotide
    (AG/GT on '+', AC/CT on '-', i.e. the forward-strand spelling).
    The strand is the last element of the location.

    @args fnam: genome sequence in fasta file
    @args Label: record id -> list of dicts {feature id: location}
    @args boundary: number of bases taken on each side of a site

    returns the totals (consensus donors, consensus acceptors,
    non-consensus donors, non-consensus acceptors) summed over both strands
    """

    def middle_matches(sequence, site, shift, motif):
        # empty windows count as a miss
        window = sequence[(site - boundary) + shift:(site + boundary) + shift]
        if not window:
            return False
        return str(window[boundary-1:boundary+1]).upper() == motif

    ## per strand: (signal, index into the location, window shift, motif)
    plus_plan = (('acc', 0, -2, 'AG'), ('don', 1, 1, 'GT'))
    minus_plan = (('don', 0, -2, 'AC'), ('acc', 1, 1, 'CT'))

    ## tally[(signal, strand)] = [consensus count, non-consensus count]
    tally = dict()
    for signal in ('don', 'acc'):
        for strand_name in ('pl', 'mi'):
            tally[(signal, strand_name)] = [0, 0]

    foh = helper.open_file(fnam)

    for rec in SeqIO.parse(foh, "fasta"):
        if rec.id not in Label:
            continue

        for Lfeat in Label[rec.id]:
            for loc in Lfeat.values():
                if loc[-1] == '+':
                    strand_name, plan = 'pl', plus_plan
                elif loc[-1] == '-':
                    strand_name, plan = 'mi', minus_plan
                else:
                    continue

                for signal, coord, shift, motif in plan:
                    hit = middle_matches(rec.seq, int(loc[coord]), shift, motif)
                    tally[(signal, strand_name)][0 if hit else 1] += 1

    don_cnt_pl, don_in_pl = tally[('don', 'pl')]
    don_cnt_mi, don_in_mi = tally[('don', 'mi')]
    acc_cnt_pl, acc_in_pl = tally[('acc', 'pl')]
    acc_cnt_mi, acc_in_mi = tally[('acc', 'mi')]

    print("")
    print('%d and %d Consensus DONOR sites on positive and negative strand' % (don_cnt_pl, don_cnt_mi))
    print('%d and %d Non-Consensus DONOR sites on  positive and negative strand' % (don_in_pl, don_in_mi))
    print("")
    print('%d and %d Consensus ACCEPTOR sites on positive and negative strand' % (acc_cnt_pl, acc_cnt_mi))
    print('%d and %d Non-Consensus ACCEPTOR sites on positive and negative strand' % (acc_in_pl, acc_in_mi))
    print("")
    foh.close()
    return don_cnt_pl+don_cnt_mi, acc_cnt_pl+acc_cnt_mi, don_in_pl+don_in_mi, acc_in_pl+acc_in_mi
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    Reads the yaml configuration, and for every entry under "experiment"
    builds a dictionary with the organism names, SRA run id, genome release,
    working directories (created when missing), sequencing read files found
    under <experiment dir>/<short name>/source_data, the read length, and
    the genome sequence / annotation files registered for the organism.

    The process exits with status -1 when a working directory cannot be
    created or when a registered annotation file does not exist.

    @args config_file: yaml file contain the information for the experiment;
        the keys read are genome_data_path.dir, experiment_data_path.dir and
        experiment (a list of entries with organism_name, sra_run_id,
        genome_build_version and release_db)
    @type config_file: str 
    @args opt_action: selected action; "2" and "3" trigger the read length
        detection, "2", "3", "4" and "c" trigger the annotation feature
        length lookup (max_intron_len / max_exon_len)
    @type opt_action: str

    returns a dict keyed by the organism short name (e.g. A_thaliana)
    """

    ## parsing the config file
    ## plain "r" instead of "rU": the U flag is gone from recent Python
    ## versions, and the handle is now closed once the yaml is loaded
    with open(config_file, "r") as config_fh:
        config_map = yaml.safe_load(config_fh)

    data_path = config_map["genome_data_path"]["dir"]
    exp_path = config_map["experiment_data_path"]["dir"]

    org_fasta_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis_str_a0193.GCA_000181915.1.21.dna.toplevel.fa"
        % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.dna.toplevel.fa" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204.fa" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_rm.fasta" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa" % data_path,
        C_japonica="%s/C_japonica/STARgenome/Caenorhabditis_japonica.C_japonica-7.0.1.22.dna_sm.stable.fa" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel3_sm.fasta" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.fas" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_filter.fa" % data_path,
        M_truncatula="%s/M_truncatula/STARgenome/Mtruncatula_198.fa" % data_path,
        P_pacificus="%s/P_pacificus/STARgenome/Pristionchus_pacificus.P_pacificus-5.0.22.dna_sm.stable.fa" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.fa" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        H_sapiens="%s/H_sapiens/hg19_bowtie2/hg19.fa" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/Ornithorhynchus_anatinus.OANA5.69-filtered_dna.fa" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181.fa" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.fas" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.fas" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas" % data_path,
    )

    org_gtf_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis" % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.gtf" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_ucsc.gtf" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3" % data_path,
        C_japonica="%s/C_japonica/ensembl_release-22/Caenorhabditis_japonica.C_japonica-7.0.1.22.gff3" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel2_ucsc.gtf" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.gff3" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_truncatula="%s/M_truncatula/" % data_path,
        P_pacificus="%s/P_pacificus/ensembl_release-22/Pristionchus_pacificus.P_pacificus-5.0.22.gtf" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.gff" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        H_sapiens="%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        D_discoideum="%s/D_discoideum/" % data_path,
        D_yakuba="%s/D_yakuba/ensembl_release-22/Drosophila_yakuba.dyak_r1.3_FB2008_07.22.gff3" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        C_brenneri="%s/C_brenneri/ensembl_release-22/Caenorhabditis_brenneri.C_brenneri-6.0.1b.22.gff3" % data_path,
        C_remanei="%s/C_remanei/ensembl_release-22/Caenorhabditis_remanei.C_remanei-15.0.1.22.gff3" % data_path,
        D_pseudoobscura="%s/D_pseudoobscura/ensembl_release-22/Drosophila_pseudoobscura.HGSC2.22.gff3" % data_path,
        T_pseudonana="%s/T_pseudonana/Thaps3/Thaps3_chromosomes_geneModels_FilteredModels2.gff" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    )

    ## TODO algorithms details

    ## experiment details
    org_db = defaultdict()

    for ent in config_map["experiment"]:
        species_name = ent["organism_name"]
        sra_run_id = ent["sra_run_id"]
        genome_build_version = ent["genome_build_version"]
        db_server = ent["release_db"]

        ## mapping to short names       arabidopsis_thaliana --> A_thaliana
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org = org_db[short_name] = dict(short_name=short_name)
        org["long_name"] = species_name
        org["sra_run_id"] = sra_run_id
        org["genome_release_db"] = genome_build_version
        ## the broad path to the experiment
        org["genome_dir"] = data_path
        org["experiment_dir"] = exp_path

        build_release = genome_build_version.split("_")
        org["release_db"] = db_server  ## ensembl_metazoa, phytozome
        org["release_nb"] = build_release[-1]  ## build number

        source_data_dir = "%s/%s/source_data" % (exp_path, short_name)
        sra_files = []  ## sequencing reads files
        if os.path.isdir(source_data_dir):
            for sra_file in os.listdir(source_data_dir):
                file_prefx, ext = os.path.splitext(sra_file)
                if ext == ".sra":  ## skipping the original .sra binary file
                    continue
                ## the run id is used as a regular expression here
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file)
        else:
            print("warning: missing sequencing read trace file %s" % source_data_dir)

        org["fastq_path"] = source_data_dir
        org["fastq"] = sra_files

        ## read mapping, read assembly and label generation working folders
        for sub_dir in ["read_mapping", "signal_labels", "trans_pred"]:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)

            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    print("error: cannot create the directory %s." % work_path)
                    ## non-zero status: a failure must not look like success
                    sys.exit(-1)

        org["read_map_dir"] = "%s/%s/read_mapping" % (exp_path, short_name)
        org["read_assembly_dir"] = "%s/%s/trans_pred" % (exp_path, short_name)
        org["labels_dir"] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length
        readlength = 0
        if opt_action in ["2", "3"]:  ## perform this action only for selected options
            if sra_files:
                fqfile = os.path.join(source_data_dir, sra_files[0])
                print("using sequencing read file %s to determine readLength" % fqfile)
                fh = helper.open_file(fqfile)
                try:
                    ## the first record is enough to get the read length
                    for rec in SeqIO.parse(fh, "fastq"):
                        readlength = len(rec.seq)
                        break
                finally:
                    fh.close()
        org["read_length"] = readlength

        ## check for the genome sequence file
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org["fasta"] = org_fasta_file[short_name]
            else:
                org["fasta"] = None
        else:
            print("warning: missing genome sequence file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            ))
            org["fasta"] = None

        star_index_dir = "%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)
        if not os.path.isdir(star_index_dir):
            try:
                os.makedirs(star_index_dir)
            except OSError:
                print("error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version))
                ## non-zero status: a failure must not look like success
                sys.exit(-1)

        org["genome_index_dir"] = "%s/" % star_index_dir

        ## check the genome annotation
        if short_name in org_gtf_file:

            if os.path.isfile(org_gtf_file[short_name]):
                org["gtf"] = org_gtf_file[short_name]

                if opt_action in ["2", "3", "4", "c"]:  ## perform this action only for selected options
                    ## get the gtf feature lengths
                    from fetch_remote_data import prepare_data as pd

                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name])
                    org["max_intron_len"] = feat_len_db["max_intron"]
                    org["max_exon_len"] = feat_len_db["max_exon"]
            else:
                print("error: the provided gtf file %s is not available to read. Please check!" % org_gtf_file[
                    short_name
                ])
                sys.exit(-1)
        else:
            print("warning: missing annotation file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            ))
            org["gtf"] = None
            org["max_intron_len"] = None
            org["max_exon_len"] = None

        print("fetched details for %s" % short_name)

    return org_db
Esempio n. 16
0
        print '\t'.join(rec)


def fasta_reader(fname):
    """
    reading a FASTA file 

    Locates every run of consecutive "N" characters (upper case only) in the
    records of the file.

    @args fname: FASTA file, passed straight to SeqIO.parse

    returns a defaultdict(list): record id -> list of (first, last) tuples,
    the 0-based inclusive index range of each run; records without any "N"
    get no entry
    """
    # function-scope import so the block does not rely on the file header
    import re

    n_run = re.compile("N+")
    regions_removed = collections.defaultdict(list)
    for rec in SeqIO.parse(fname, "fasta"):
        #rec.id = 'chr'+rec.id
        # one regex scan over the plain string replaces the per-base index
        # list and the groupby over consecutive indices
        for hit in n_run.finditer(str(rec.seq)):
            regions_removed[rec.id].append((hit.start(), hit.end() - 1))
    return regions_removed


# command line: <fasta file> <prediction file>
try:
    ffa=sys.argv[1]
    fbz=sys.argv[2]
except IndexError:
    # only a missing argument is expected; do not swallow anything else
    print(__doc__)
    sys.exit(-1)

# "N" stretches per record of the FASTA file
dis_cod = fasta_reader(ffa)

bzh = helper.open_file(fbz)
try:
    pred_score(bzh, dis_cod)
finally:
    # the handle used to stay open until interpreter exit
    bzh.close()
Esempio n. 17
0
def __main__():
    """
    Draw a random 1 percent sub-sample from a paired-end FASTQ library.

    The folder given as first command line argument is searched for the
    left (_1.fastq) and right (_2.fastq) read files. The left file is
    counted, then scanned again and each read is accepted with probability
    sub_count/read_cnt until sub_count reads are collected; the right file
    is filtered down to the mates of the accepted reads. Both selections are
    written bz2-compressed next to the input as
    "<percentage>_percentage_<input name>.bz2".

    Prints the module docstring and exits with status -1 when the folder
    argument is missing; exits with an error message when one of the two
    read files cannot be found.
    """

    try:
        fastq_path = sys.argv[1]
    except IndexError:
        # only a missing argument is expected; do not swallow anything else
        print(__doc__)
        sys.exit(-1)

    fastq_1_file = None
    fastq_2_file = None

    # TODO expecting the left and right reads in the base folder with .fastq ending. Make it is common general form

    ## get the files from base path
    ## the dot is escaped so that only a literal ".fastq" matches
    for filename in os.listdir(fastq_path):
        if re.search(r'_1\.fastq', filename):
            fastq_1_file = filename
        if re.search(r'_2\.fastq', filename):
            fastq_2_file = filename

    print("%s %s" % (fastq_1_file, fastq_2_file))
    print("")

    if fastq_1_file is None or fastq_2_file is None:
        ## stop here instead of trying to open "<folder>/None" further down
        sys.exit("error: could not find both _1.fastq and _2.fastq read files in %s" % fastq_path)

    left_path = '%s/%s' % (fastq_path, fastq_1_file)
    right_path = '%s/%s' % (fastq_path, fastq_2_file)

    ## count the number of reads and calculate the sub sample
    fqh = helper.open_file(left_path)
    try:
        read_cnt = 0
        for rec in SeqIO.parse(fqh, 'fastq'):
            read_cnt += 1
    finally:
        fqh.close()

    print('%d Number of reads in FASTQ' % read_cnt)
    print("")

    ## what percentage sub-sample
    percentage = 1
    sub_count = int(round((percentage * read_cnt) / 100.0))
    assert sub_count <= read_cnt, ' %d (sub-sample count) should be less than total read count %d' % (
        sub_count, read_cnt)

    print("%d Sub sample count" % sub_count)
    print("")

    ## an empty input file would divide by zero: accept everything then
    if read_cnt:
        accept_prob = (1.0 * sub_count) / read_cnt
    else:
        accept_prob = 1

    print(accept_prob)

    sub_fastq_1_file = "%d_percentage_%s.bz2" % (percentage, fastq_1_file)
    sub_fastq_2_file = "%d_percentage_%s.bz2" % (percentage, fastq_2_file)

    ## writing out sub sample files
    try:
        subFile_1 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_1_file), 'wb')
        subFile_2 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_2_file), 'wb')
    except Exception as error:
        sys.exit(error)

    total_cnt = 0
    sample_cnt = 0
    left_reads = set()  ## identifiers of the accepted left reads

    try:
        fqh = helper.open_file(left_path)
        try:
            for rec in SeqIO.parse(fqh, 'fastq'):
                rnb = random.random()
                total_cnt += 1

                if rnb <= accept_prob:
                    ## @UNC15-SN850_63:4:1101:1103:2151/1 @UNC15-SN850_63:4:1101:1103:2151/2
                    read_id = rec.id.split('/')
                    if len(read_id) > 1:
                        left_reads.add(read_id[0])
                    else:
                        ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC  @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 2:N:0:GGNTAC
                        left_reads.add(rec.id)

                    sample_cnt += 1
                    subFile_1.write(rec.format("fastq"))

                ## stop scanning once the requested number of reads is reached
                if sub_count == sample_cnt:
                    break
        finally:
            fqh.close()
            subFile_1.close()

        fqh = helper.open_file(right_path)
        try:
            for rec in SeqIO.parse(fqh, 'fastq'):
                read_id = rec.id.split('/')

                ## @UNC15-SN850_63:4:1101:1103:2151/1  -> key without the /1 suffix
                ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC -> full id
                mate_key = read_id[0] if len(read_id) > 1 else rec.id
                if mate_key in left_reads:
                    subFile_2.write(rec.format("fastq"))
        finally:
            fqh.close()
    finally:
        ## always finalize the right-hand output, also after a failure above
        subFile_2.close()

    print("%s/%s" % (fastq_path, sub_fastq_1_file))
    print("%s/%s" % (fastq_path, sub_fastq_2_file))
    print("")

    print('%d Number of reads scanned' % total_cnt)
    print('%d Number of reads in' % sample_cnt)
    print("")
Esempio n. 18
0
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    One argument list is assembled from the organism details and executed
    without a shell. The interpreter is terminated (sys.exit) when the STAR
    binary is missing, the annotation file holds no feature line, STAR
    cannot be started or STAR returns a non-zero exit status.

    @args org_db: a python dictionary with all details about a single organism;
        keys read here: genome_index_dir, gtf, fastq_path, fastq,
        read_map_dir, short_name and (only when gtf is set) max_intron_len
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
        kept for interface compatibility, the value is not passed on to STAR
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    try:
        ## the output of the probing call is discarded; piping it without
        ## reading can block the child once the pipe buffer is full
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf 
        feature_found = False
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')
                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence 
                if '\t' not in rec:
                    continue
                parts = rec.split('\t')
                assert len(parts) >= 8, rec
                ## only the first feature line is inspected
                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s" % gtf_db)

    ## library type 
    read_files = ["%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])]
    if read_type == 'PE':
        read_files.append("%s/%s" % (org_db['fastq_path'], org_db['fastq'][1]))

    ## getting the command to uncompress the read file; `-d` is required,
    ## `gzip -c` alone compresses instead of decompressing
    zip_type = {".gz": ["gzip", "-d", "-c"], ".bz2": ["bzip2", "-d", "-c"]}
    ext = os.path.splitext(org_db['fastq'][0])[1]
    uncompress_cmd = zip_type.get(ext)

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    make_star_run = ["STAR", "--genomeDir", genome_dir, "--readFilesIn"] + read_files
    if uncompress_cmd:
        ## plain text read files are handed over without --readFilesCommand
        make_star_run += ["--readFilesCommand"] + uncompress_cmd
    make_star_run += [
        "--outFileNamePrefix", out_prefix,
        "--runThreadN", "%d" % num_cpus,
        "--outFilterMultimapScoreRange", "2",
        "--outFilterMultimapNmax", "30",
        "--outFilterMismatchNmax", "4",
    ]

    ## according to the file type 
    if gtf_db is not None:
        ## genomic feature information 
        make_star_run += [
            "--alignIntronMax", "%d" % org_db['max_intron_len'],
            "--sjdbGTFfile", gtf_db,
        ]
        if ftype:
            ## presumably attribute_tags flags GFF3 style attributes here - verify
            make_star_run += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            make_star_run += ["--sjdbGTFfeatureExon", "exon"]

    make_star_run += [
        "--sjdbScore", "1",
        "--sjdbOverhang", "5",
        "--outSAMstrandField", "intronMotif",
        "--outFilterIntronMotifs", "RemoveNoncanonical",
        "--outSAMtype", "BAM", "Unsorted",
        "--genomeLoad", "LoadAndRemove",
    ]

    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(make_star_run))
    try:
        returncode = subprocess.call(make_star_run)
    except OSError as err:
        sys.exit("Error running STAR.\n%s" % str(err))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
Esempio n. 19
0
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence 

    Genes without CDS exons are dropped, the remaining genes are grouped by
    chromosome and every FASTA record is visited once; for each gene on the
    record the CDS exons of its first transcript are joined, reverse
    complemented on the minus strand, translated and written in fasta format.
    The interpreter is terminated (sys.exit) when both input files have the
    same content.

    @args gtf_file: genome annotation file 
    @type gtf_file: str 
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format 
    @type out_seq_fname: str
    """

    if filecmp.cmp(gtf_file, fas_file):
        sys.exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    ## deleting the empty cds lines
    ## TSkim annotation expects only single transcript from a region
    empty_cds_idx = [idp for idp, feat in enumerate(anno_db)
                     if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, empty_cds_idx)
    genes_with_cds = len(anno_db)

    ## group the genes by chromosome once, so that the gene list is not
    ## scanned again for every sequence record; annotation order is kept
    genes_by_chrom = dict()
    for feature in anno_db:
        genes_by_chrom.setdefault(feature['chr'], []).append(feature)

    fasFH = helper.open_file(fas_file)
    try:
        with open(out_seq_fname, "w") as out_seq_fh:
            for rec in SeqIO.parse(fasFH, "fasta"):
                for feature in genes_by_chrom.get(rec.id, ()):
                    ## iterate over cds_exons, single transcript by TSkim
                    cds_seq = ''
                    for ex in feature['cds_exons'][0]:
                        ## exon coordinates are used as 1-based inclusive
                        cds_seq += rec.seq[ex[0] - 1:ex[1]]

                    if feature['strand'] == '-':
                        cds_seq = cds_seq.reverse_complement()

                    ## fasta output
                    if cds_seq:
                        prt_seq = SeqRecord(cds_seq.translate(),
                                            id=feature['name'],
                                            description='protein sequence')
                        out_seq_fh.write(prt_seq.format("fasta"))
    finally:
        ## the sequence handle is released even when parsing or writing fails
        fasFH.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
Esempio n. 20
0
def create_star_genome_index(fasta_file,
                             out_dir,
                             genome_anno=None,
                             num_workers=1,
                             onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    The output directory is created when missing, otherwise its content is
    removed, and STAR is then run in genomeGenerate mode without a shell.
    The interpreter is terminated (exit) with a message when the STAR binary
    is missing, an input file carries a compressed file extension, the
    annotation holds no feature line, the directory cannot be created or
    STAR fails.

    @args fasta_file: reference genome sequence file .fasta format 
    @type fasta_file: str 
    @args out_dir: genome index binary file storage place  
    @type out_dir: str 
    @args genome_anno: genome annotation file (optional) 
    @type genome_anno: str 
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int 
    @args onematelength: One Mate Length (default value=100) 
    @type onematelength: int 
    """

    try:
        ## the output of the probing call is discarded; piping it without
        ## reading can block the child once the pipe buffer is full
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    compressed_ext = (".bz2", ".gz", ".lzma")

    ## checking for the compressed form of the file extension
    if os.path.splitext(fasta_file)[1] in compressed_ext:
        exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s."
            % fasta_file)

    cli_cmd = [
        "STAR",
        "--runMode", "genomeGenerate",
        "--genomeDir", out_dir,
        "--genomeFastaFiles", fasta_file,
        "--runThreadN", "%d" % num_workers,
    ]

    if genome_anno:
        if os.path.splitext(genome_anno)[1] in compressed_ext:
            exit(
                "error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s."
                % genome_anno)

        ## check for the file type
        ftype = None
        feature_found = False
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue
                parts = rec.split('\t')
                assert len(parts) >= 8, rec

                ## only the first feature line is inspected
                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            exit("error: no feature line found in the annotation file %s." %
                 genome_anno)

        ## according to the file type
        cli_cmd += ["--sjdbGTFfile", genome_anno]
        if ftype:
            ## presumably attribute_tags flags GFF3 style attributes here - verify
            cli_cmd += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            cli_cmd += ["--sjdbGTFfeatureExon", "exon"]
        cli_cmd += ["--sjdbOverhang", "%d" % onematelength]

    ## create downloadpath if doesnot exists
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            exit("error: cannot create the directory %s." % out_dir)
    else:  ## if present any other old index files clean up the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                ## best effort clean up, a left over file does not stop the run
                print(e)

    ## the assembled command was never executed before: the folder was
    ## cleaned and the function returned without building any index
    print("running STAR program as: %s" % " ".join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        exit("error: cannot run STAR to generate the genome index.\n%s" % e)

    if returncode != 0:
        exit("error: STAR genome index generation failed, exit status %d." %
             returncode)

    print("STAR genome index stored at %s" % out_dir)
Esempio n. 21
0
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    One argument list is assembled from the organism details and executed
    without a shell. The interpreter is terminated (sys.exit) when the STAR
    binary is missing, the annotation file holds no feature line, STAR
    cannot be started or STAR returns a non-zero exit status.

    @args org_db: a python dictionary with all details about a single organism;
        keys read here: genome_index_dir, gtf, fastq_path, fastq,
        read_map_dir, short_name, mate_length and (only when gtf is set)
        max_intron_len
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
        kept for interface compatibility, the value is not passed on to STAR
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    try:
        ## the output of the probing call is discarded; piping it without
        ## reading can block the child once the pipe buffer is full
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf 
        feature_found = False
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')
                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence 
                if '\t' not in rec:
                    continue
                parts = rec.split('\t')
                assert len(parts) >= 8, rec
                ## only the first feature line is inspected
                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s" % gtf_db)

    ## library type 
    read_files = ["%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])]
    if read_type == 'PE':
        read_files.append("%s/%s" % (org_db['fastq_path'], org_db['fastq'][1]))

    ## getting the command to uncompress the read file; `-d` is required,
    ## `gzip -c` alone compresses instead of decompressing
    zip_type = {".gz": ["gzip", "-d", "-c"], ".bz2": ["bzip2", "-d", "-c"]}
    ext = os.path.splitext(org_db['fastq'][0])[1]
    uncompress_cmd = zip_type.get(ext)

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ##sjdbOverhang 
    mate_len = org_db['mate_length']

    make_star_run = ["STAR", "--genomeDir", genome_dir, "--readFilesIn"] + read_files
    if uncompress_cmd:
        ## plain text read files are handed over without --readFilesCommand
        make_star_run += ["--readFilesCommand"] + uncompress_cmd
    make_star_run += [
        "--outFileNamePrefix", out_prefix,
        "--runThreadN", "%d" % num_cpus,
        "--outFilterMultimapScoreRange", "2",
        "--outFilterMultimapNmax", "30",
        "--outFilterMismatchNmax", "3",
    ]

    ## according to the file type 
    if gtf_db is not None:
        ## genomic feature information 
        make_star_run += [
            "--alignIntronMax", "%d" % org_db['max_intron_len'],
            "--sjdbGTFfile", gtf_db,
        ]
        if ftype:
            ## presumably attribute_tags flags GFF3 style attributes here - verify
            make_star_run += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            make_star_run += ["--sjdbGTFfeatureExon", "exon"]

    make_star_run += [
        "--sjdbScore", "1",
        "--sjdbOverhang", "%d" % mate_len,
        "--outSAMstrandField", "intronMotif",
        "--outFilterIntronMotifs", "RemoveNoncanonical",
        "--outSAMtype", "BAM", "Unsorted",
        "--genomeLoad", "LoadAndRemove",
    ]

    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(make_star_run))
    try:
        returncode = subprocess.call(make_star_run)
    except OSError as err:
        sys.exit("Error running STAR.\n%s" % str(err))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
Esempio n. 22
0
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    The output directory is created when missing, otherwise its content is
    removed, and STAR is then run in genomeGenerate mode without a shell.
    The interpreter is terminated (sys.exit) with a message when the STAR
    binary is missing, an input file carries a compressed file extension,
    the annotation holds no feature line, the directory cannot be created
    or STAR fails.

    @args fasta_file: reference genome sequence file .fasta format 
    @type fasta_file: str 
    @args out_dir: genome index binary file storage place  
    @type out_dir: str 
    @args genome_anno: genome annotation file (optional) 
    @type genome_anno: str 
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int 
    @args onematelength: One Mate Length (default value=100) 
    @type onematelength: int 
    """

    try:
        ## the output of the probing call is discarded; piping it without
        ## reading can block the child once the pipe buffer is full
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    ## STAR works with uncompressed files in this step
    compressed_ext = (".bz2", ".gz", ".lzma")
    if os.path.splitext(fasta_file)[1] in compressed_ext:
        sys.exit("error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s." % fasta_file)

    cli_cmd = [
        "STAR",
        "--runMode", "genomeGenerate",
        "--genomeDir", out_dir,
        "--genomeFastaFiles", fasta_file,
        "--runThreadN", "%d" % num_workers,
    ]

    if genome_anno:
        if os.path.splitext(genome_anno)[1] in compressed_ext:
            sys.exit("error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s." % genome_anno)

        ## check for the file type  
        ftype = None
        feature_found = False
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence 
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                assert len(parts) >= 8, rec

                ## only the first feature line is inspected
                ftype, tags = GFFParser.attribute_tags(parts[-1])
                feature_found = True
                break
        finally:
            gff_hand.close()

        if not feature_found:
            sys.exit("error: no feature line found in the annotation file %s." % genome_anno)

        ## according to the file type 
        cli_cmd += ["--sjdbGTFfile", genome_anno]
        if ftype:
            ## presumably attribute_tags flags GFF3 style attributes here - verify
            cli_cmd += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            cli_cmd += ["--sjdbGTFfeatureExon", "exon"]
        cli_cmd += ["--sjdbOverhang", "%d" % onematelength]

    ## create downloadpath if doesnot exists 
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            ## a failure has to end with a non-zero exit status
            sys.exit("error: cannot create the directory %s." % out_dir)
    else:## if present any other old index files clean up the folder 
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                ## best effort clean up, a left over file does not stop the run
                print(e)

    ## the assembled command was never executed before: the folder was
    ## cleaned and the function returned without building any index
    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        sys.exit("error: cannot run STAR to generate the genome index.\n%s" % e)

    if returncode != 0:
        sys.exit("error: STAR genome index generation failed, exit status %d." % returncode)

    sys.stdout.write("STAR genome index stored at %s\n" % out_dir)
Esempio n. 23
0
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    For every entry under `experiment` in the config file one dictionary is
    filled with the names, the sequencing read files, the working folders
    (created when missing), the read length, the genome sequence file, the
    STAR genome index folder and the genome annotation details.

    @args config_file: yaml file contain the information for the experiment
    @type config_file: str 
    @args opt_action: option code of the requested action; the read length
        is determined for "2" and "3", the annotation feature lengths for
        "2", "3", "4" and "c"
    @type opt_action: str 
    @return org_db: organism short name (e.g. A_thaliana) to details dictionary
    @rtype org_db: defaultdict
    """

    ## parsing the config file 
    with open(config_file, "r") as config_fh:
        config_map = yaml.safe_load(config_fh)

    data_path = config_map['genome_data_path']['dir']
    exp_path = config_map['experiment_data_path']['dir']

    org_fasta_file = dict( 
    A_carolinensis  = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_mulatta       = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas" % data_path,
    O_cuniculus     = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
    M_gallopavo     = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path, 
    B_anthracis     = '%s/B_anthracis/' % data_path,
    C_familiaris    = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas' % data_path,
    D_melanogaster  = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas' % data_path,
    E_caballus      = '%s/E_caballus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_domestica     = '%s/M_domestica/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_sativa        = '%s/O_sativa/phytozome_v9.0/Osativa_204.fa' % data_path,
    A_gambiae       = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.fas' % data_path,
    B_rapa          = '%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa' % data_path,
    G_gallus        = '%s/G_gallus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    M_musculus      = '%s/M_musculus/ensembl_release_79/ensembl_release_79.fas' % data_path,  
    V_vinifera      = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2' % data_path,
    A_mellifera     = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.fas' % data_path,
    B_taurus        = '%s/B_taurus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_rubella       = '%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz' % data_path,
    D_rerio         = '%s/D_rerio/ensembl_release_79/ensembl_release_79.fas' % data_path,
    G_max           = '%s/G_max/phytozome_v9.0/Gmax_189_filter.fa' % data_path,
    M_truncatula    = '%s/M_truncatula/STARgenome/Mtruncatula_198.fa' % data_path,
    P_pacificus     = '%s/P_pacificus/' % data_path,
    S_scrofa        = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas' % data_path,
    X_tropicalis    = '%s/X_tropicalis/JGIv4-1/JGIv4-1.fa' % data_path,
    C_sativus       = '%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa' % data_path,
    D_simulans      = '%s/D_simulans/ensembl_release_28/ensembl_release_28.fas' % data_path,
    H_sapiens       = '%s/H_sapiens/ensembl_release_79/STARgenome/hg19_chrOnly.fa' % data_path,
    N_vitripennis   = '%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa' % data_path,
    P_troglodytes   = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_tuberosum     = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa' % data_path,
    Z_mays          = '%s/Z_mays/phytozome_v9.0/Zmays_181.fa' % data_path,
    A_thaliana      = '%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas' % data_path,
    O_aries         = '%s/O_aries/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_jacchus       = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_elegans       = '%s/C_elegans/ensembl_release-69/Caenorhabditis_elegans.WBcel215.69.dna.toplevel.fa' % data_path,
    O_latipes       = '%s/O_latipes/ensembl_release_79/ensembl_release_79.fas' % data_path,
    R_norvegicus    = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    G_gorilla       = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.fas' % data_path,
    P_paniscus      = '%s/P_paniscus/eva_mpg_de/eva_mpg_de.fas' % data_path,
    C_porcellus     = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_anatinus      = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.fas' % data_path,
    O_niloticus     = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    L_chalumnae     = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.fas' % data_path,
    H_glaber        = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.fas' % data_path,
    M_eugenii       = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_briggsae      = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_japonica      = '%s/C_japonica/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_remanei       = '%s/C_remanei/ensembl_release_28/ensembl_release_28.fas' % data_path,
    P_marinus       = '%s/P_marinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    C_brenneri      = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_intestinalis  = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_cerevisiae    = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.fas' % data_path,
    S_pombe         = '%s/S_pombe/ensembl_release_28/ensembl_release_28.fas' % data_path,
    A_aegypti       = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.fas' % data_path,
    C_hircus        = '%s/C_hircus/ncbi_genome/ncbi_genome.fas' % data_path,
    F_catus         = '%s/F_catus/ensembl_release_79/ensembl_release_79.fas' % data_path,
    T_nigroviridis  = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas' % data_path
    )
    #C_elegans       = '%s/C_elegans/ensembl_release_79/ensembl_release_79.fas' % data_path,

    org_gtf_file = dict( 
    A_carolinensis  = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_mulatta       = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    O_cuniculus     = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    M_gallopavo     = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path, 
    B_anthracis     = '%s/B_anthracis/ensembl_release-21/Bacillus_anthracis' % data_path,
    C_familiaris    = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_melanogaster  = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    E_caballus      = '%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_domestica     = '%s/M_domestica/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_sativa        = '%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3' % data_path,
    A_gambiae       = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    B_rapa          = '%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3' % data_path,
    G_gallus        = '%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_musculus      = '%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf' % data_path,  
    V_vinifera      = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2' % data_path,
    A_mellifera     = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    B_taurus        = '%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_jacchus       = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_rubella       = '%s/C_rubella/phytozome_v9.0/Crubella_183.gff3' % data_path,
    D_rerio         = '%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    G_max           = '%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3' % data_path,
    N_vitripennis   = '%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf' % data_path,
    O_aries         = '%s/O_aries/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    M_truncatula    = '%s/M_truncatula/' % data_path,
    P_pacificus     = '%s/P_pacificus/ensembl_release-22/.gtf' % data_path,
    S_scrofa        = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    X_tropicalis    = '%s/X_tropicalis/JGIv4-1/JGIv4-1.gff' % data_path,
    C_sativus       = '%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3' % data_path,
    D_simulans      = '%s/D_simulans/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    H_sapiens       = '%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    P_troglodytes   = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_tuberosum     = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3' % data_path,
    Z_mays          = '%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3' % data_path,
    A_thaliana      = '%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff' % data_path,
    C_elegans       = '%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_discoideum    = '%s/D_discoideum/' % data_path,
    D_yakuba        = '%s/D_yakuba/ensembl_release-22/.gff3' % data_path,
    O_latipes       = '%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    R_norvegicus    = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    D_pseudoobscura = '%s/D_pseudoobscura/ensembl_release-22/.gff3' % data_path,
    T_pseudonana    = '%s/T_pseudonana/Thaps3/.gff' % data_path,
    G_gorilla       = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_porcellus     = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_anatinus      = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    O_niloticus     = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    L_chalumnae     = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_briggsae      = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    M_eugenii       = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_remanei       = '%s/C_remanei/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_brenneri      = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_intestinalis  = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_pombe         = '%s/S_pombe/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    P_marinus       = '%s/P_marinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    S_cerevisiae    = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    C_japonica      = '%s/C_japonica/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    F_catus         = '%s/F_catus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    A_aegypti       = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    C_hircus        = '%s/C_hircus/ncbi_genome/ncbi_genome.gff' % data_path,
    T_nigroviridis  = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf' % data_path
    )

    ## TODO algorithms details 
    #H_glaber = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.gtf' % data_path,

    ## experiment details  
    org_db = defaultdict()

    for ent in config_map['experiment']:
        species_name = ent['organism_name']
        sra_run_id = ent['sra_run_id']
        genome_build_version = ent['genome_build_version']
        db_server = ent['release_db']

        ## mapping to short names       arabidopsis_thaliana --> A_thaliana
        ## split only at the first underscore, the species part may hold more
        name_parts = species_name.strip().split("_", 1)
        if len(name_parts) != 2:
            raise ValueError("organism_name %r is expected in the form genus_species" % species_name)
        genus, species = name_parts
        short_name = "%s_%s" % (genus[0].upper(), species)

        org = dict(short_name = short_name)
        org_db[short_name] = org
        org['long_name'] = species_name
        org['sra_run_id'] = sra_run_id
        org['genome_release_db'] = genome_build_version
        ## the broad path to the experiment 
        org['genome_dir'] = data_path
        org['experiment_dir'] = exp_path

        build_release = genome_build_version.split("_")
        org['release_db'] = db_server ## ensembl_metazoa, phytozome
        org['release_nb'] = build_release[-1] ## build number 

        ## sequencing reads files 
        source_dir = "%s/%s/source_data" % (exp_path, short_name)
        sra_files = []
        if os.path.isdir(source_dir):
            for sra_file in os.listdir(source_dir):
                if os.path.splitext(sra_file)[1] == ".sra": ## skipping the original .sra binary file 
                    continue
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file)
            ## os.listdir gives no order guarantee; sorting keeps the first
            ## and second entry stable between runs
            sra_files.sort()
        else:
            print("warning: missing sequencing read trace file %s" % source_dir)

        org['fastq_path'] = source_dir
        org['fastq'] = sra_files

        ## read mapping, read assembly and label generation working folders 
        for sub_dir in ['read_mapping', 'signal_labels', 'trans_pred', 'source_data']:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)

            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    exit("error: cannot create the directory %s." % work_path)

        org['read_map_dir'] = "%s/%s/read_mapping" % (exp_path, short_name)
        org['read_assembly_dir'] = "%s/%s/trans_pred" % (exp_path, short_name)
        org['labels_dir'] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length from the first read
        readlength = 0
        if opt_action in ["2", "3"] and sra_files: ## perform this action only for selected options 
            fqfile = os.path.join(org['fastq_path'], sra_files[0])
            print('using sequencing read file %s to determine readLength' % fqfile)
            fh = helper.open_file(fqfile)
            try:
                for rec in SeqIO.parse(fh, "fastq"):
                    readlength = len(rec.seq)
                    break
            finally:
                fh.close()
        org['read_length'] = readlength

        ## check for the genome sequence file 
        org['fasta'] = None
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org['fasta'] = org_fasta_file[short_name]
            else:
                print("warning: the genome sequence file %s is not available to read" % org_fasta_file[short_name])
        else:
            print("warning: missing genome sequence file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version))

        genome_index_dir = "%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)
        if not os.path.isdir(genome_index_dir):
            try:
                os.makedirs(genome_index_dir)
            except OSError:
                exit("error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version))

        org['genome_index_dir'] = "%s/" % genome_index_dir

        ## check the genome annotation; the length keys are always present,
        ## they stay None as long as the annotation is not scanned
        org['gtf'] = None
        org['max_intron_len'] = None
        org['max_exon_len'] = None

        if short_name in org_gtf_file:
            gtf_file = org_gtf_file[short_name]
            if not os.path.isfile(gtf_file):
                exit("error: the provided gtf file %s is not available to read. Please check!" % gtf_file)

            org['gtf'] = gtf_file

            if opt_action in ["2", "3", "4", "c"]: ## perform this action only for selected options 
                ## get the gtf feature lengths 
                from fetch_remote_data import prepare_data as pd
                feat_len_db = pd.make_anno_db(gtf_file)
                org['max_intron_len'] = feat_len_db['max_intron']
                org['max_exon_len'] = feat_len_db['max_exon']
        else:
            print("warning: missing annotation file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version))

        print("fetched details for %s" % short_name)

    return org_db
Esempio n. 24
0
def __main__():
    """
    sub-sample a fixed percentage of read pairs from paired-end FASTQ files

    The folder given as the first command line argument is scanned for a
    left read file (name containing "_1.fastq") and a right read file (name
    containing "_2.fastq"). Left reads are accepted at random until the
    requested sub-sample count is reached; the right reads carrying the same
    identifier are then collected. Both sub-samples are written as bz2
    compressed FASTQ files into the same folder.

    The module usage text is printed and the program exits with -1 when the
    folder argument is missing; the program exits with an error message when
    one of the two read files cannot be found or the output files cannot be
    created.
    """

    try:
        fastq_path = sys.argv[1]
    except IndexError:
        print(__doc__)
        sys.exit(-1)

    fastq_1_file = None
    fastq_2_file = None

    # TODO expecting the left and right reads in the base folder with a
    # .fastq ending; make the file discovery more general
    # NOTE(review): the sub-sample files written below also match these
    # patterns, so a second run on the same folder may pick them up - confirm

    ## get the files from base path; the dot is escaped so that only a
    ## literal "_1.fastq" / "_2.fastq" is accepted
    for filename in os.listdir(fastq_path):
        if re.search(r'_1\.fastq', filename):
            fastq_1_file = filename
        if re.search(r'_2\.fastq', filename):
            fastq_2_file = filename

    print("%s %s" % (fastq_1_file, fastq_2_file))
    print("")

    ## stop here instead of trying to open a file called "None"
    if fastq_1_file is None or fastq_2_file is None:
        sys.exit("error: cannot find both _1.fastq and _2.fastq files under %s" % fastq_path)

    ## count the number of reads and calculate the sub sample
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
    try:
        read_cnt = sum(1 for _ in SeqIO.parse(fqh, 'fastq'))
    finally:
        fqh.close()

    print('%d Number of reads in FASTQ' % read_cnt)
    print("")

    ## what percentage sub-sample
    percentage = 1
    sub_count = int(round((percentage*read_cnt)/100.0))
    assert sub_count <= read_cnt, ' %d (sub-sample count) should be less than total read count %d'  % (sub_count, read_cnt)

    print("%d Sub sample count" % sub_count)
    print("")

    ## an empty FASTQ file would otherwise end in a division by zero
    if read_cnt:
        accept_prob = (1.0*sub_count)/read_cnt
    else:
        accept_prob = 1

    print(accept_prob)

    sub_fastq_1_file = "%d_percentage_%s.bz2" % (percentage, fastq_1_file)
    sub_fastq_2_file = "%d_percentage_%s.bz2" % (percentage, fastq_2_file)

    ## writing out sub sample files
    try:
        subFile_1 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_1_file), 'wb')
        subFile_2 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_2_file), 'wb')
    except (IOError, OSError) as error:
        sys.exit(error)

    total_cnt = 0
    sample_cnt = 0
    left_reads = set()

    try:
        fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
        try:
            for rec in SeqIO.parse(fqh, 'fastq'):
                rnb = random.random()
                total_cnt += 1

                if rnb <= accept_prob:
                    ## both identifier styles reduce to the text before the
                    ## first '/', which is the whole id when no '/' exists
                    ## @UNC15-SN850_63:4:1101:1103:2151/1 @UNC15-SN850_63:4:1101:1103:2151/2
                    ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC  @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 2:N:0:GGNTAC
                    left_reads.add(rec.id.split('/')[0])

                    sample_cnt += 1
                    subFile_1.write(rec.format("fastq"))

                ## enough reads collected, no need to scan the rest
                if sub_count == sample_cnt:
                    break
        finally:
            fqh.close()

        ## collect the mates of the selected left reads
        fqh = helper.open_file('%s/%s' % (fastq_path, fastq_2_file))
        try:
            for rec in SeqIO.parse(fqh, 'fastq'):
                if rec.id.split('/')[0] in left_reads:
                    subFile_2.write(rec.format("fastq"))
        finally:
            fqh.close()
    finally:
        ## make sure the compressed streams are flushed even on failure
        subFile_1.close()
        subFile_2.close()

    print("%s/%s" % (fastq_path, sub_fastq_1_file))
    print("%s/%s" % (fastq_path, sub_fastq_2_file))
    print("")

    print('%d Number of reads scanned' % total_cnt)
    print('%d Number of reads in' % sample_cnt)
    print("")
def check_splice_site_consensus(fas_file, splice_region):
    """
    keep the transcripts whose splice sites carry the AG/GT consensus

    @args fas_file: genome sequence file in fasta format
    @type fas_file: str
    @args splice_region: per sequence id, a list of dictionaries; each key is
        indexed at [0], [1], [2] and [-1] (the last element is compared with
        '+' / '-'), each value is a list of regions whose items [0] and [1]
        are used as positions on the sequence
    @type splice_region: dict

    Returns a dictionary keyed by (sequence id, key[0], key[1], key[2]) with
    the value 1 for every accepted transcript. A transcript with a single
    region is accepted without any check; otherwise both the acceptor and
    the donor hit counts have to exceed half the number of regions.
    """

    sys.stdout.write("splice site sequence consensus check started...\n")

    accepted_models = defaultdict()
    rejected_cnt = 0

    fasta_handle = helper.open_file(fas_file)
    for contig in SeqIO.parse(fasta_handle, "fasta"):
        ## nothing to check on this sequence
        if contig.id not in splice_region:
            continue

        for transcripts in splice_region[contig.id]:
            for gene_key, boundaries in transcripts.items():

                if len(boundaries) == 1:
                    ## single exon long transcripts no checks
                    keep_model = True
                else:
                    acceptor_hits = 0
                    donor_hits = 0

                    for bound in boundaries:
                        strand = gene_key[-1]
                        if strand == '+':
                            minus_strand = False
                        elif strand == '-':
                            minus_strand = True
                        else:
                            ## any other strand value is not examined
                            continue

                        ## two bases just before the left position:
                        ## acceptor on '+', donor on '-'
                        if bound[0]:
                            left_pos = int(bound[0])
                            left_site = contig.seq[left_pos - 3:left_pos - 1]
                            if minus_strand:
                                left_site = left_site.reverse_complement()
                                if str(left_site).upper() == "GT":
                                    donor_hits += 1
                            elif str(left_site).upper() == "AG":
                                acceptor_hits += 1

                        ## two bases just after the right position:
                        ## donor on '+', acceptor on '-'
                        if bound[1]:
                            right_pos = int(bound[1])
                            right_site = contig.seq[right_pos:right_pos + 2]
                            if minus_strand:
                                right_site = right_site.reverse_complement()
                                if str(right_site).upper() == "AG":
                                    acceptor_hits += 1
                            elif str(right_site).upper() == "GT":
                                donor_hits += 1

                    ## check for half of the consensus sites
                    half = len(boundaries) / 2
                    keep_model = acceptor_hits > half and donor_hits > half

                if keep_model:
                    accepted_models[(contig.id, gene_key[0], gene_key[1],
                                     gene_key[2])] = 1
                else:
                    rejected_cnt += 1
    fasta_handle.close()

    sys.stdout.write("...considering %d best transcripts\n" %
                     len(accepted_models))
    sys.stdout.write("discarding transcripts...\n")
    sys.stdout.write("\t%d splice-site consensus sequence missing\n" %
                     rejected_cnt)

    return accepted_models