def fasta_seq_length(fa_name): """ general information about contigs lengths in a FASTA file """ from operator import itemgetter from gfftools import helper seq_info = dict() fah = helper.open_file(fa_name) for rec in SeqIO.parse(fah, "fasta"): seq_info[rec.id] = len(rec.seq) print rec.id, len(rec.seq) fah.close() print print 'Number of FASTA entries: ', len(seq_info) for long_one in sorted(seq_info.items(), key=itemgetter(1), reverse=True): print 'Long contig length (bp): ', long_one[0], long_one[1] break for short_one in sorted(seq_info.items(), key=itemgetter(1)): print 'Short contig length (bp): ', short_one[0], short_one[1] break flength = 0 for ele in sorted(seq_info.items(), key=itemgetter(1)): flength += ele[1] print 'Average length of FASTA contig (bp): ', (flength / len(seq_info)) print
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """
    # guard against the two arguments pointing at identical content
    if filecmp.cmp(gtf_file, fas_file):
        exit("Are the two files exactly the same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    # drop records without any CDS exons; TSkim annotation expects only a
    # single transcript from a region
    cds_idx = [idp for idp, feat in enumerate(anno_db) if not feat['cds_exons'][0].any()]
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    out_seq_fh = open(out_seq_fname, "w")
    for rec in SeqIO.parse(fasFH, "fasta"):
        # scan every remaining gene model on this chromosome
        for feature in anno_db:  # unused enumerate() index removed
            if rec.id == feature['chr']:
                ## stitch coding sequence from cds_exons
                ## (single transcript by TSkim)
                cds_seq = ''
                for ex in feature['cds_exons'][0]:
                    cds_seq += rec.seq[ex[0]-1:ex[1]]  # GFF coords are 1-based inclusive
                if feature['strand'] == '-':
                    cds_seq = cds_seq.reverse_complement()
                ## fasta output
                if cds_seq:
                    prt_seq = SeqRecord(cds_seq.translate(),
                                        id=feature['name'],
                                        description='protein sequence')
                    out_seq_fh.write(prt_seq.format("fasta"))
        # FIXME need an efficient way to translate multiple gene
        # iterate over chromosome
    fasFH.close()
    out_seq_fh.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
def fasta_seq_length(fa_name): """ general information about contigs lengths in a FASTA file """ from operator import itemgetter from gfftools import helper seq_info = dict() fah = helper.open_file(fa_name) for rec in SeqIO.parse(fah, "fasta"): seq_info[rec.id] = len(rec.seq) print rec.id, len(rec.seq) fah.close() print print 'Number of FASTA entries: ', len(seq_info) for long_one in sorted(seq_info.items(), key=itemgetter(1), reverse=True): print 'Long contig length (bp): ', long_one[0], long_one[1] break for short_one in sorted(seq_info.items(), key=itemgetter(1)): print 'Short contig length (bp): ', short_one[0], short_one[1] break flength = 0 for ele in sorted(seq_info.items(), key=itemgetter(1)): flength += ele[1] print 'Average length of FASTA contig (bp): ', (flength/len(seq_info)) print
def check_splice_site_consensus(fas_file, splice_region):
    """
    splice site consensus check

    Scans the genome FASTA and, for each candidate transcript listed in
    splice_region, counts how many acceptor sites match "AG" and how many
    donor sites match "GT" (reverse-complemented on the minus strand).
    A transcript is kept when MORE than half of its sites pass on BOTH the
    acceptor and donor side; single-exon transcripts are kept unchecked.

    @args fas_file: genome sequence in FASTA format
    @args splice_region: contig id -> list of {gene-key: [region, ...]}
        where a region holds (acceptor_pos, donor_pos) and the gene-key
        ends with the strand -- layout inferred from the loops below;
        TODO confirm against the caller
    returns dict keyed by (contig, gene[0], gene[1], gene[2]) of kept models
    """
    sys.stdout.write("splice site sequence consensus check started...\n")
    get_gene_models = defaultdict()  # transcripts surviving the check
    splice_site_con = 0              # transcripts discarded by the check
    fas_fh = helper.open_file(fas_file)
    for fas_rec in SeqIO.parse(fas_fh, "fasta"):
        if fas_rec.id in splice_region:
            for details in splice_region[fas_rec.id]:
                for genes, regions in details.items():
                    acc_cons_cnt = 0
                    don_cons_cnt = 0
                    if len(regions) == 1: ## single exon long transcripts no checks
                        get_gene_models[(fas_rec.id, genes[0], genes[1], genes[2])] = 1
                        continue
                    for region in regions:
                        if genes[-1] == '+':
                            #if not numpy.isnan(region[0]):## acceptor splice site
                            if region[0]: ## acceptor splice site
                                # two bases immediately upstream of the exon start
                                acc_seq = fas_rec.seq[int(region[0])-3:int(region[0])-1]
                                if str(acc_seq).upper() == "AG":
                                    acc_cons_cnt += 1
                            if region[1]:
                                # two bases immediately downstream of the exon end
                                don_seq = fas_rec.seq[int(region[1]):int(region[1])+2]
                                if str(don_seq).upper() == "GT":
                                    don_cons_cnt += 1
                        elif genes[-1] == '-':
                            # on the minus strand the roles of the two
                            # coordinates swap and the motif is read on the
                            # reverse complement
                            if region[0]: ## donor splice site
                                don_seq = fas_rec.seq[int(region[0])-3:int(region[0])-1]
                                don_seq = don_seq.reverse_complement()
                                if str(don_seq).upper() == "GT":
                                    don_cons_cnt += 1
                            if region[1]:
                                acc_seq = fas_rec.seq[int(region[1]):int(region[1])+2]
                                acc_seq = acc_seq.reverse_complement()
                                if str(acc_seq).upper() == "AG":
                                    acc_cons_cnt += 1
                    ## check for half of the consensus sites
                    if acc_cons_cnt > (len(regions)/2) and don_cons_cnt > (len(regions)/2):
                        get_gene_models[(fas_rec.id, genes[0], genes[1], genes[2])] = 1
                    else:
                        splice_site_con += 1
    fas_fh.close()
    sys.stdout.write("...considering %d best transcripts\n" % len(get_gene_models))
    sys.stdout.write("discarding transcripts...\n")
    sys.stdout.write("\t%d splice-site consensus sequence missing\n" % splice_site_con)
    return get_gene_models
def read_genome_file(fas_file):
    """
    read genome file in fasta and return the list of chromosomes/contigs

    @args fas_file: genome sequence in fasta file
    @type fas_file: str
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: the original built the message with '...' + errmsg, which
        # raises TypeError (str + Exception) instead of reporting the error
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk -- the docstring
    # promises a contig list but no further code is visible here
def read_genome_file(fas_file):
    """
    read genome file in fasta and return the list of chromosomes/contigs

    @args fas_file: genome sequence in fasta file
    @type fas_file: str

    returns a list with contig_names and length
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: '...' + errmsg raised TypeError (str + Exception); use
        # %-formatting so the real error is reported
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk -- no return is
    # visible despite the documented return value
def clean_genome_file(chr_names, fas_file, fas_out):
    """
    make a stable genome file with valid contigs

    @args chr_names: different contig names with a valid genome sequence
    @type chr_names: dict
    @args fas_file: genome sequence in fasta file
    @type fas_file: str
    @args fas_out: new genome sequence file in fasta format
    @type fas_out: str
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: '...' + errmsg raised TypeError (str + Exception); use
        # %-formatting so the real error is reported
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk -- nothing is
    # written to fas_out in the visible code
def clean_anno_file(chr_names, gtf_file, gtf_out):
    """
    make stable annotation file with valid contig name

    @args chr_names: different contig names with a valid genome sequence
    @type chr_names: dict
    @args gtf_file: genome sequence in fasta file
    @type gtf_file: str
    @args gtf_out: new genome sequence file in fasta format
    @type gtf_out: str
    """
    ## open the annotation through the compression-aware helper
    try:
        anno_fh = helper.open_file(gtf_file)
    except Exception as errmsg:
        stop_err('error %s in reading file %s' % (errmsg, gtf_file))
def genome_file_rec_extract(chr_pattn, fas_file, fas_out):
    """
    get all contings based on a matiching string in the record identifier

    @args chr_pattn: pattern to be searched in contig names
    @type chr_pattn: str
    @args fas_file: genome sequence in fasta file
    @type fas_file: str
    @args fas_out: new genome sequence file in fasta format
    @type fas_out: str
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: '...' + errmsg raised TypeError (str + Exception); use
        # %-formatting so the real error is reported
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk -- no pattern
    # matching or output writing is visible
def clean_anno_file(chr_names, gtf_file, gtf_out):
    """
    make stable annotation file with valid contig name

    @args chr_names: different contig names with a valid genome sequence
    @type chr_names: dict
    @args gtf_file: genome annotation in gtf/gff form
    @type gtf_file: str
    @args gtf_out: new genome annotation in gtf/gff form
    @type gtf_out: str
    """
    # acquire a readable handle on the annotation file, bailing out with a
    # descriptive message if it cannot be opened
    try:
        gtf_fh = helper.open_file(gtf_file)
    except Exception as errmsg:
        stop_err('error %s in reading file %s' % (errmsg, gtf_file))
def clean_genome_file(chr_names, fas_file, fas_out):
    """
    make a stable genome file with valid contigs

    @args chr_names: different contig names with a valid genome sequence
    @type chr_names: dict
    @args fas_file: genome sequence in fasta file
    @type fas_file: str
    @args fas_out: new genome sequence file in fasta format
    @type fas_out: str
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: '...' + errmsg raised TypeError (str + Exception); use
        # %-formatting so the real error is reported
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk
def genome_file_rec_extract(chr_pattn, fas_file, fas_out):
    """
    get all contings based on a matiching string in the record identifier

    @args chr_pattn: pattern to be searched in contig names
    @type chr_pattn: str
    @args fas_file: genome sequence in fasta file
    @type fas_file: str
    @args fas_out: new genome sequence file in fasta format
    @type fas_out: str
    """
    # get the filehandler from input file
    try:
        fh = helper.open_file(fas_file)
    except Exception as errmsg:
        # bug fix: '...' + errmsg raised TypeError (str + Exception); use
        # %-formatting so the real error is reported
        stop_err('error in reading file %s' % errmsg)
    # NOTE(review): body appears truncated in this chunk
def __main__(): try: fname = sys.argv[1] fa_out = sys.argv[2] except: print __doc__ sys.exit(-1) # get the valid chromosome identifier from user as STDIN chrom = dict() for chr in sys.stdin: chr = chr.strip() chrom[chr] = 1 # get the filehandler from input file try: fh = helper.open_file(fname) except Exception, errmsg: stop_err('error in reading file '+ errmsg)
def true_ss_seq_fetch(fnam, Label, boundary):
    """
    true splice signals

    Counts consensus vs non-consensus acceptor/donor splice sites per strand
    by slicing a window of +/- `boundary` bases around each labelled
    position and testing the two central bases against the expected motif
    (AG/GT on '+'; AC/CT read directly on '-', i.e. the reverse-complement
    view of AG/GT).

    @args fnam: genome FASTA file name
    @args Label: contig id -> list of {feature-id: loc}, where loc holds
        (start, end, ..., strand) -- inferred from the indexing below
    @args boundary: half-width of the motif window (int)
    returns (donor_consensus, acceptor_consensus,
             donor_nonconsensus, acceptor_nonconsensus) summed over strands
    """
    foh = helper.open_file(fnam)
    # per-strand counters: consensus (cnt) and non-consensus (in)
    don_cnt_pl = don_cnt_mi = acc_cnt_pl = acc_cnt_mi = 0
    don_in_pl = don_in_mi = acc_in_pl = acc_in_mi = 0
    for rec in SeqIO.parse(foh, "fasta"):
        if rec.id in Label:
            for Lfeat in Label[rec.id]:
                for fid, loc in Lfeat.items():
                    acc_ind = don_ind = 0  # non-consensus indicator flags
                    if loc[-1] == '+':
                        # acceptor window around the feature start
                        acc_mot_seq = rec.seq[(int(loc[0])-boundary)-2:(int(loc[0])+boundary)-2]
                        if not acc_mot_seq:
                            acc_ind = 1
                        if str(acc_mot_seq[boundary-1:boundary+1]).upper() != 'AG':
                            acc_ind = 1
                        if acc_ind:
                            acc_in_pl += 1
                        else:
                            acc_cnt_pl += 1
                        # donor window around the feature end
                        don_mot_seq = rec.seq[(int(loc[1])-boundary)+1:(int(loc[1])+boundary)+1]
                        if not don_mot_seq:
                            don_ind = 1
                        if str(don_mot_seq[boundary-1:boundary+1]).upper() != 'GT':
                            don_ind = 1
                        if don_ind:
                            don_in_pl += 1
                        else:
                            don_cnt_pl += 1
                    elif loc[-1] == '-':
                        # minus strand: roles swap; motifs are checked as the
                        # forward-strand complements AC/CT
                        don_mot_seq = rec.seq[(int(loc[0])-boundary)-2:(int(loc[0])+boundary)-2]
                        if not don_mot_seq:
                            don_ind = 1
                        if str(don_mot_seq[boundary-1:boundary+1]).upper() != 'AC':
                            don_ind = 1
                        if don_ind:
                            don_in_mi += 1
                        else:
                            don_cnt_mi += 1
                        acc_mot_seq = rec.seq[(int(loc[1])-boundary)+1:(int(loc[1])+boundary)+1]
                        if not acc_mot_seq:
                            acc_ind = 1
                        if str(acc_mot_seq[boundary-1:boundary+1]).upper() != 'CT':
                            acc_ind = 1
                        if acc_ind:
                            acc_in_mi += 1
                        else:
                            acc_cnt_mi += 1
    print
    print '%d and %d Consensus DONOR sites on positive and negative strand' % (don_cnt_pl, don_cnt_mi)
    print '%d and %d Non-Consensus DONOR sites on positive and negative strand' % (don_in_pl, don_in_mi)
    print
    print '%d and %d Consensus ACCEPTOR sites on positive and negative strand' % (acc_cnt_pl, acc_cnt_mi)
    print '%d and %d Non-Consensus ACCEPTOR sites on positive and negative strand' % (acc_in_pl, acc_in_mi)
    print
    foh.close()
    return don_cnt_pl+don_cnt_mi, acc_cnt_pl+acc_cnt_mi, don_in_pl+don_in_mi, acc_in_pl+acc_in_mi
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    Builds an org_db dictionary keyed by short organism name
    (e.g. arabidopsis_thaliana -> A_thaliana) with paths to genome,
    annotation, reads and working directories, driven by a YAML config.

    @args config_file: yaml file contain the information for the experiment
    @type config_file: str
    @args opt_action: pipeline action selector; "2"/"3" trigger read-length
        detection, "2"/"3"/"4"/"c" trigger gtf feature-length extraction
    @type opt_action: str
    """
    ## parsing the config file
    config_map = yaml.safe_load(open(config_file, "rU"))
    data_path = config_map["genome_data_path"]["dir"]
    exp_path = config_map["experiment_data_path"]["dir"]

    # hard-coded lookup: short organism name -> genome FASTA path
    org_fasta_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis_str_a0193.GCA_000181915.1.21.dna.toplevel.fa" % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.dna.toplevel.fa" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204.fa" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_rm.fasta" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa" % data_path,
        C_japonica="%s/C_japonica/STARgenome/Caenorhabditis_japonica.C_japonica-7.0.1.22.dna_sm.stable.fa" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel3_sm.fasta" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.fas" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_filter.fa" % data_path,
        M_truncatula="%s/M_truncatula/STARgenome/Mtruncatula_198.fa" % data_path,
        P_pacificus="%s/P_pacificus/STARgenome/Pristionchus_pacificus.P_pacificus-5.0.22.dna_sm.stable.fa" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.fa" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        H_sapiens="%s/H_sapiens/hg19_bowtie2/hg19.fa" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/Ornithorhynchus_anatinus.OANA5.69-filtered_dna.fa" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181.fa" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.fas" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.fas" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.fas.bz2" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas" % data_path,
    )

    # hard-coded lookup: short organism name -> annotation (gtf/gff) path
    org_gtf_file = dict(
        A_carolinensis="%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_mulatta="%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_cuniculus="%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_gallopavo="%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        B_anthracis="%s/B_anthracis/ensembl_release-21/Bacillus_anthracis" % data_path,
        C_familiaris="%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        D_melanogaster="%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        E_caballus="%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        M_domestica="%s/M_domestica/ensembl_release-69/Monodelphis_domestica.BROADO5.69.gtf" % data_path,
        O_sativa="%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3" % data_path,
        A_gambiae="%s/A_gambiae/anoGam1_ucsc/anoGam1_ucsc.gtf" % data_path,
        B_rapa="%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3" % data_path,
        C_japonica="%s/C_japonica/ensembl_release-22/Caenorhabditis_japonica.C_japonica-7.0.1.22.gff3" % data_path,
        G_gallus="%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_musculus="%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        V_vinifera="%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2" % data_path,
        A_mellifera="%s/A_mellifera/apiMel3_ucsc/apiMel2_ucsc.gtf" % data_path,
        B_taurus="%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        C_jacchus="%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_rubella="%s/C_rubella/phytozome_v9.0/Crubella_183.gff3" % data_path,
        D_rerio="%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        G_max="%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3" % data_path,
        N_vitripennis="%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf" % data_path,
        O_aries="%s/O_aries/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_truncatula="%s/M_truncatula/" % data_path,
        P_pacificus="%s/P_pacificus/ensembl_release-22/Pristionchus_pacificus.P_pacificus-5.0.22.gtf" % data_path,
        S_scrofa="%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        X_tropicalis="%s/X_tropicalis/JGIv4-1/JGIv4-1.gff" % data_path,
        C_sativus="%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3" % data_path,
        D_simulans="%s/D_simulans/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        H_sapiens="%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        O_anatinus="%s/O_anatinus/ensembl_release-69/" % data_path,
        P_troglodytes="%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        S_tuberosum="%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3" % data_path,
        Z_mays="%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3" % data_path,
        A_thaliana="%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff" % data_path,
        C_elegans="%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        D_discoideum="%s/D_discoideum/" % data_path,
        D_yakuba="%s/D_yakuba/ensembl_release-22/Drosophila_yakuba.dyak_r1.3_FB2008_07.22.gff3" % data_path,
        O_latipes="%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        R_norvegicus="%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        C_briggsae="%s/C_briggsae/ensembl_release_22/ensembl_release_22.gff.bz2" % data_path,
        C_brenneri="%s/C_brenneri/ensembl_release-22/Caenorhabditis_brenneri.C_brenneri-6.0.1b.22.gff3" % data_path,
        C_remanei="%s/C_remanei/ensembl_release-22/Caenorhabditis_remanei.C_remanei-15.0.1.22.gff3" % data_path,
        D_pseudoobscura="%s/D_pseudoobscura/ensembl_release-22/Drosophila_pseudoobscura.HGSC2.22.gff3" % data_path,
        T_pseudonana="%s/T_pseudonana/Thaps3/Thaps3_chromosomes_geneModels_FilteredModels2.gff" % data_path,
        T_nigroviridis="%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf" % data_path,
    )

    ## TODO algorithms details
    ## experiment details
    org_db = defaultdict()
    for ent in config_map["experiment"]:
        species_name = ent["organism_name"]
        sra_run_id = ent["sra_run_id"]
        genome_build_version = ent["genome_build_version"]
        db_server = ent["release_db"]

        ## mapping to short names arabidopsis_thaliana --> A_thaliana
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name=short_name)
        org_db[short_name]["long_name"] = species_name
        org_db[short_name]["sra_run_id"] = sra_run_id
        org_db[short_name]["genome_release_db"] = genome_build_version
        ## the broad path to the experiment
        org_db[short_name]["genome_dir"] = data_path
        org_db[short_name]["experiment_dir"] = exp_path
        build_release = genome_build_version.split("_")
        org_db[short_name]["release_db"] = db_server  ## ensembl_metazoa, phytozome
        org_db[short_name]["release_nb"] = build_release[-1]  ## build number

        sra_files = []  ## sequencing reads files
        if os.path.isdir("%s/%s/source_data" % (exp_path, short_name)):
            for sra_file in os.listdir("%s/%s/source_data" % (exp_path, short_name)):
                file_prefx, ext = os.path.splitext(sra_file)
                if ext == ".sra":  ## skipping the original .sra binary file
                    continue
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file)
        else:
            print "warning: missing sequencing read trace file %s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]["fastq_path"] = "%s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]["fastq"] = sra_files

        ## read mapping, read assembly and label generation working folders
        for sub_dir in ["read_mapping", "signal_labels", "trans_pred"]:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)
            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    print "error: cannot create the directory %s." % work_path
                    sys.exit(0)
        org_db[short_name]["read_map_dir"] = "%s/%s/read_mapping" % (exp_path, short_name)
        org_db[short_name]["read_assembly_dir"] = "%s/%s/trans_pred" % (exp_path, short_name)
        org_db[short_name]["labels_dir"] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length
        readlength = 0
        if opt_action in ["2", "3"]:  ## perform this action only for selected options
            if sra_files:
                # peek at the first record of the first fastq to get the length
                fqfile = os.path.join(org_db[short_name]["fastq_path"], sra_files[0])
                print "using sequencing read file %s to determine readLength" % fqfile
                fh = helper.open_file(fqfile)
                for rec in SeqIO.parse(fh, "fastq"):
                    readlength = len(rec.seq)
                    break
                fh.close()
        org_db[short_name]["read_length"] = readlength

        ## check for the genome sequence file
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org_db[short_name]["fasta"] = org_fasta_file[short_name]
            else:
                org_db[short_name]["fasta"] = None
        else:
            print "warning: missing genome sequence file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            )
            org_db[short_name]["fasta"] = None

        # ensure the STAR genome index directory exists
        if not os.path.isdir("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)):
            try:
                os.makedirs("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version))
            except OSError:
                print "error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version)
                sys.exit(0)
        org_db[short_name]["genome_index_dir"] = "%s/%s/%s/STARgenome/" % (data_path, short_name, genome_build_version)

        ## check the genome annotation
        if short_name in org_gtf_file:
            if os.path.isfile(org_gtf_file[short_name]):
                org_db[short_name]["gtf"] = org_gtf_file[short_name]
                if opt_action in ["2", "3", "4", "c"]:  ## perform this action only for selected options
                    ## get the gtf feature lengths
                    from fetch_remote_data import prepare_data as pd
                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name])
                    org_db[short_name]["max_intron_len"] = feat_len_db["max_intron"]
                    org_db[short_name]["max_exon_len"] = feat_len_db["max_exon"]
            else:
                print "error: the provided gtf file %s is not available to read. Please check!" % org_gtf_file[
                    short_name
                ]
                sys.exit(-1)
        else:
            print "warning: missing annotation file for %s under %s/%s/%s" % (
                short_name,
                data_path,
                short_name,
                genome_build_version,
            )
            org_db[short_name]["gtf"] = None
            org_db[short_name]["max_intron_len"] = None
            org_db[short_name]["max_exon_len"] = None
        print "fetched details for %s" % short_name
    return org_db
# NOTE(review): trailing statement of a function defined in an earlier chunk
# (presumably pred_score); original indentation is not visible here
print '\t'.join(rec)


def fasta_reader(fname):
    """
    reading a FASTA file

    Collects runs of consecutive "N" bases per record; returns a dict of
    record id -> list of (run_start_index, run_end_index) tuples
    (0-based positions into the sequence).
    """
    regions_removed = collections.defaultdict(list)
    for rec in SeqIO.parse(fname, "fasta"):
        #rec.id = 'chr'+rec.id
        Nindex = [item for item in range(len(rec.seq)) if rec.seq[item]=="N"] ##index of the desired nucleotide
        # groupby on (index - value) collapses consecutive positions into one
        # group; Python 2-only tuple-parameter lambda
        for xn, xp in itertools.groupby(enumerate(Nindex), lambda (i,x):i-x): ##
            cod_range = map(itemgetter(1), xp)
            regions_removed[rec.id].append((cod_range[0], cod_range[-1]))
    return regions_removed


# script entry: FASTA file and compressed prediction file from argv
try:
    ffa=sys.argv[1]
    fbz=sys.argv[2]
except:
    print __doc__
    sys.exit(-1)

dis_cod = fasta_reader(ffa)
bzh = helper.open_file(fbz)
pred_score(bzh, dis_cod)
def __main__():
    """
    Sub-sample a paired-end FASTQ data set: draw ~1%% of the read pairs
    from <fastq_path>/*_1.fastq and the matching mates from *_2.fastq,
    writing bz2-compressed output files next to the input.
    """
    try:
        fastq_path = sys.argv[1]
    except:
        print __doc__
        sys.exit(-1)

    fastq_1_file = None
    fastq_2_file = None
    # TODO expecting the left and right reads in the base folder with .fastq ending. Make it is common general form
    ## get the files from base path
    for filename in os.listdir(fastq_path):
        if re.search(r'_1.fastq', filename):
            fastq_1_file = filename
        if re.search(r'_2.fastq', filename):
            fastq_2_file = filename
    print fastq_1_file, fastq_2_file
    print

    ## count the number of reads and calculate the sub sample
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
    read_cnt = 0
    for rec in SeqIO.parse(fqh, 'fastq'):
        read_cnt += 1
    fqh.close()
    print '%d Number of reads in FASTQ' % read_cnt
    print

    ## what percentage sub-sample
    percentage = 1
    sub_count = int(round((percentage * read_cnt) / 100.0))
    assert sub_count <= read_cnt, ' %d (sub-sample count) should be less than total read count %d' % (
        sub_count, read_cnt)
    print "%d Sub sample count" % sub_count
    print

    # acceptance probability for Bernoulli sampling of each read
    try:
        accept_prob = (1.0 * sub_count) / read_cnt
    except:
        accept_prob = 1
    print accept_prob

    sub_fastq_1_file = "%d_percentage_%s.bz2" % (percentage, fastq_1_file)
    sub_fastq_2_file = "%d_percentage_%s.bz2" % (percentage, fastq_2_file)

    ## writing out sub sample files
    try:
        subFile_1 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_1_file), 'wb')
        subFile_2 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_2_file), 'wb')
    except Exception as error:
        sys.exit(error)

    total_cnt = 0
    sample_cnt = 0
    left_reads = dict()  # ids of sampled left reads, for mate matching
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
    for rec in SeqIO.parse(fqh, 'fastq'):
        rnb = random.random()
        total_cnt += 1
        if rnb <= accept_prob:
            ## @UNC15-SN850_63:4:1101:1103:2151/1 @UNC15-SN850_63:4:1101:1103:2151/2
            read_id = rec.id.split('/')
            if len(read_id) > 1:
                left_reads[read_id[0]] = 0
            else:
                ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 2:N:0:GGNTAC
                left_reads[rec.id] = 0
            sample_cnt += 1
            subFile_1.write(rec.format("fastq"))
        if sub_count == sample_cnt:
            break
    fqh.close()
    subFile_1.close()

    # second pass over the right-mate file: keep only mates of sampled reads
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_2_file))
    for rec in SeqIO.parse(fqh, 'fastq'):
        read_id = rec.id.split('/')
        if len(read_id) > 1:
            ## @UNC15-SN850_63:4:1101:1103:2151/1
            if read_id[0] in left_reads:
                subFile_2.write(rec.format("fastq"))
        else:
            ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC
            if rec.id in left_reads:
                subFile_2.write(rec.format("fastq"))
    fqh.close()
    subFile_2.close()

    print "%s/%s" % (fastq_path, sub_fastq_1_file)
    print "%s/%s" % (fastq_path, sub_fastq_2_file)
    print
    print '%d Number of reads scanned' % total_cnt
    print '%d Number of reads in' % sample_cnt
    print
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 10000)
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    # sanity check: STAR must be callable before anything else happens
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    if gtf_db != None:
        ## check for the annotation file type gff or gtf
        # inspect the first real feature line; ftype is used below to pick
        # the matching --sjdbGTF* options
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0],
                                     org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file
    zip_type = {".gz" : "gzip -c", ".bz2" : "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_lenth_intron = org_db['max_intron_len']

    ## according to the file type
    if gtf_db == None:
        # no annotation available: run without splice-junction database
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 4 \
            --sjdbScore 1 \
            --sjdbOverhang 5 \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus)
    elif ftype:
        # GFF3-style annotation: exon parents are tagged with "Parent"
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 4 \
            --alignIntronMax %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbScore 1 \
            --sjdbOverhang 5 \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus, max_lenth_intron, gtf_db)
    else:
        # GTF-style annotation: select "exon" feature lines
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 4 \
            --alignIntronMax %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbScore 1 \
            --sjdbOverhang 5 \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus, max_lenth_intron, gtf_db)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True)
        returncode = process.wait()
        if returncode !=0:
            raise Exception, "Exit status return code = %i" % returncode
        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception, e:
        sys.exit("Error running STAR.\n%s" % str( e ))
def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence

    Parses the annotation, drops gene models without CDS exons, stitches the
    coding sequence per gene from the genome FASTA and writes the translated
    protein records to out_seq_fname in FASTA format.

    @args gtf_file: genome annotation file
    @type gtf_file: str
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format
    @type out_seq_fname: str
    """
    # refuse to run if both arguments point at identical file content
    if filecmp.cmp(gtf_file, fas_file):
        exit("Do the two files are exactly same? Please check that!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    cds_idx = []  # deleting the empty cds lines
    for idp, feat in enumerate(anno_db):
        if not feat['cds_exons'][0].any():  # TSkim annotation expects only single transcript from a region
            cds_idx.append(idp)
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    out_seq_fh = open(out_seq_fname, "w")
    for rec in SeqIO.parse(fasFH, "fasta"):
        for idx, feature in enumerate(anno_db):
            if rec.id == feature['chr']:
                ## iterate over cds_exons
                cds_seq = ''
                for ex in feature['cds_exons'][0]:  ## single transcript by TSkim
                    # annotation coordinates are 1-based inclusive
                    cds_seq += rec.seq[ex[0]-1:ex[1]]
                if feature['strand'] == '-':
                    cds_seq = cds_seq.reverse_complement()
                ##
                #sys.stdout.write(str(cds_seq.translate()) + "\n")
                ## fasta output
                if cds_seq:
                    prt_seq = SeqRecord(cds_seq.translate(),
                                        id=feature['name'],
                                        description='protein sequence')
                    out_seq_fh.write(prt_seq.format("fasta"))
        # FIXME need an efficient way to translate multiple gene
        # iterate over chromosome
    fasFH.close()
    out_seq_fh.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    @args fasta_file: reference genome sequence file .fasta format
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int

    NOTE(review): `cli_cmd` is constructed but never executed in this function
    body — presumably a later (or duplicate) definition runs it; confirm.
    NOTE(review): a second definition of this same function name appears later
    in the file and will shadow this one at import time — confirm which is live.
    """
    ## STAR must be resolvable on $PATH before anything else is attempted
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    file_prefx, ext = os.path.splitext(fasta_file)
    if ext in [".bz2", ".gz", ".lzma"]: ## checking for the compressed form of the file extension
        exit("error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s." % fasta_file)

    if not genome_anno:
        ## plain index: genome sequence only
        cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d' % (out_dir, fasta_file, num_workers)
    else:
        file_prefx, ext = os.path.splitext(genome_anno)
        if ext in [".bz2", ".gz", ".lzma"]:
            exit("error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s." % genome_anno)

        ## check for the file type: peek at the first feature row of the
        ## annotation to decide gtf vs gff attribute style
        gff_hand = helper.open_file(genome_anno)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

        ## according to the file type
        if ftype:
            cli_cmd = 'STAR \
                --runMode genomeGenerate \
                --genomeDir %s \
                --genomeFastaFiles %s \
                --runThreadN %d \
                --sjdbGTFfile %s \
                --sjdbGTFtagExonParentTranscript Parent \
                --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)
        else:
            cli_cmd = 'STAR \
                --runMode genomeGenerate \
                --genomeDir %s \
                --genomeFastaFiles %s \
                --runThreadN %d \
                --sjdbGTFfile %s \
                --sjdbGTFfeatureExon exon \
                --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)

    ## create downloadpath if doesnot exists
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present any other old index files clean up the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:  # fixed: `except Exception, e` is Python-2-only syntax
                print(e)
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 10000)
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    ## STAR must be resolvable on $PATH before the command line is built
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir'] ## genome indices and annotation file
    gtf_db = org_db['gtf']

    ## fixed: `ftype` was unbound (NameError) when the annotation file yields
    ## no feature row; None now routes such a case to the gff command branch
    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf from the first feature row
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0], org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file
    ## NOTE(review): an uncompressed read file raises KeyError on zip_type[ext]
    ## below — presumably reads are always .gz/.bz2 here; confirm.
    zip_type = {".gz" : "gzip -c", ".bz2" : "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_intron_length = org_db['max_intron_len']
    mate_len = org_db['mate_length'] ##sjdbOverhang

    ## according to the file type
    if gtf_db is None:
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 3 \
            --sjdbScore 1 \
            --sjdbOverhang %d \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus, mate_len)
    elif ftype:
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 3 \
            --alignIntronMax %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbScore 1 \
            --sjdbOverhang %d \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus, max_intron_length, gtf_db, mate_len)
    else:
        make_star_run = "STAR \
            --genomeDir %s \
            --readFilesIn %s \
            --readFilesCommand %s \
            --outFileNamePrefix %s \
            --runThreadN %d \
            --outFilterMultimapScoreRange 2 \
            --outFilterMultimapNmax 30 \
            --outFilterMismatchNmax 3 \
            --alignIntronMax %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbScore 1 \
            --sjdbOverhang %d \
            --outSAMstrandField intronMotif \
            --outFilterIntronMotifs RemoveNoncanonical \
            --outSAMtype BAM Unsorted \
            --genomeLoad LoadAndRemove" % (genome_dir, read_file, zip_type[ext], out_prefix, num_cpus, max_intron_length, gtf_db, mate_len)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True)
        returncode = process.wait()
        if returncode != 0:
            ## fixed: `raise Exception, "..."` is Python-2-only syntax
            raise Exception("Exit status return code = %i" % returncode)
        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception as e:  # fixed: `except Exception, e` is Python-2-only syntax
        sys.exit("Error running STAR.\n%s" % str(e))
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    TODO check whether the fasta and gtf files are uncompressed
    star works with uncompressed files in this step.

    @args fasta_file: reference genome sequence file .fasta format
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int

    NOTE(review): this re-defines a function of the same name declared earlier
    in the file and shadows it at import time — confirm which copy is intended.
    """
    ## STAR must be resolvable on $PATH before anything else is attempted
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    if not genome_anno:
        ## plain index: genome sequence only
        cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d' % (out_dir, fasta_file, num_workers)
    else:
        ## check for the file type: peek at the first feature row of the
        ## annotation to decide gtf vs gff attribute style
        gff_hand = helper.open_file(genome_anno)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty line fasta identifier and commented line
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

        ## according to the file type
        if ftype:
            cli_cmd = 'STAR \
                --runMode genomeGenerate \
                --genomeDir %s \
                --genomeFastaFiles %s \
                --runThreadN %d \
                --sjdbGTFfile %s \
                --sjdbGTFtagExonParentTranscript Parent \
                --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)
        else:
            cli_cmd = 'STAR \
                --runMode genomeGenerate \
                --genomeDir %s \
                --genomeFastaFiles %s \
                --runThreadN %d \
                --sjdbGTFfile %s \
                --sjdbGTFfeatureExon exon \
                --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)

    ## create downloadpath if doesnot exists
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            ## fixed: bare `print "..."` statement is Python-2-only syntax
            print("error: cannot create the directory %s." % out_dir)
            sys.exit(0)
    else:
        ## if present any other old index files clean up the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:  # fixed: `except Exception, e` is Python-2-only syntax
                print(e)
def experiment_db(config_file, opt_action):
    """
    function to collect details of each organism

    FIXME descriptions

    @args config_file: yaml file contain the information for the experiment
    @type config_file: str
    @args opt_action: pipeline action selector; values "2"/"3" additionally
        probe the read length, "2"/"3"/"4"/"c" additionally scan the gtf for
        feature lengths (string codes; semantics defined by the caller)
    @type opt_action: str
    """
    ## parsing the config file
    config_map = yaml.safe_load(open(config_file, "rU"))
    data_path = config_map['genome_data_path']['dir']
    exp_path = config_map['experiment_data_path']['dir']

    ## hard-coded lookup: organism short name -> genome sequence file path
    org_fasta_file = dict(
        A_carolinensis = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.fas' % data_path,
        M_mulatta = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.fas" % data_path,
        O_cuniculus = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.fas" % data_path,
        M_gallopavo = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.fas.bz2" % data_path,
        B_anthracis = '%s/B_anthracis/' % data_path,
        C_familiaris = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.fas' % data_path,
        D_melanogaster = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.fas' % data_path,
        E_caballus = '%s/E_caballus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        M_domestica = '%s/M_domestica/ensembl_release_79/ensembl_release_79.fas' % data_path,
        O_sativa = '%s/O_sativa/phytozome_v9.0/Osativa_204.fa' % data_path,
        A_gambiae = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.fas' % data_path,
        B_rapa = '%s/B_rapa/phytozome_v9.0/Brapa_197_stable.fa' % data_path,
        G_gallus = '%s/G_gallus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        M_musculus = '%s/M_musculus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        V_vinifera = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.fas.bz2' % data_path,
        A_mellifera = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.fas' % data_path,
        B_taurus = '%s/B_taurus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        C_rubella = '%s/C_rubella/phytozome_v9.0/Crubella_183.fa.gz' % data_path,
        D_rerio = '%s/D_rerio/ensembl_release_79/ensembl_release_79.fas' % data_path,
        G_max = '%s/G_max/phytozome_v9.0/Gmax_189_filter.fa' % data_path,
        M_truncatula = '%s/M_truncatula/STARgenome/Mtruncatula_198.fa' % data_path,
        P_pacificus = '%s/P_pacificus/' % data_path,
        S_scrofa = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.fas' % data_path,
        X_tropicalis = '%s/X_tropicalis/JGIv4-1/JGIv4-1.fa' % data_path,
        C_sativus = '%s/C_sativus/phytozome_v9.0/Csativus_122_filtered.fa' % data_path,
        D_simulans = '%s/D_simulans/ensembl_release_28/ensembl_release_28.fas' % data_path,
        H_sapiens = '%s/H_sapiens/ensembl_release_79/STARgenome/hg19_chrOnly.fa' % data_path,
        N_vitripennis = '%s/N_vitripennis/ensembl_release_22/N_vitripennis_dna_sm.fa' % data_path,
        P_troglodytes = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.fas' % data_path,
        S_tuberosum = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206.fa' % data_path,
        Z_mays = '%s/Z_mays/phytozome_v9.0/Zmays_181.fa' % data_path,
        A_thaliana = '%s/A_thaliana/arabidopsis_tair10/sequences/TAIR9_chr_all.fas' % data_path,
        O_aries = '%s/O_aries/ensembl_release_79/ensembl_release_79.fas' % data_path,
        C_jacchus = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        C_elegans = '%s/C_elegans/ensembl_release-69/Caenorhabditis_elegans.WBcel215.69.dna.toplevel.fa' % data_path,
        O_latipes = '%s/O_latipes/ensembl_release_79/ensembl_release_79.fas' % data_path,
        R_norvegicus = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        G_gorilla = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.fas' % data_path,
        P_paniscus = '%s/P_paniscus/eva_mpg_de/eva_mpg_de.fas' % data_path,
        C_porcellus = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        O_anatinus = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.fas' % data_path,
        O_niloticus = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        L_chalumnae = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.fas' % data_path,
        H_glaber = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.fas' % data_path,
        M_eugenii = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.fas' % data_path,
        C_briggsae = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.fas' % data_path,
        C_japonica = '%s/C_japonica/ensembl_release_28/ensembl_release_28.fas' % data_path,
        C_remanei = '%s/C_remanei/ensembl_release_28/ensembl_release_28.fas' % data_path,
        P_marinus = '%s/P_marinus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        C_brenneri = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.fas' % data_path,
        C_intestinalis = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.fas' % data_path,
        S_cerevisiae = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.fas' % data_path,
        S_pombe = '%s/S_pombe/ensembl_release_28/ensembl_release_28.fas' % data_path,
        A_aegypti = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.fas' % data_path,
        C_hircus = '%s/C_hircus/ncbi_genome/ncbi_genome.fas' % data_path,
        F_catus = '%s/F_catus/ensembl_release_79/ensembl_release_79.fas' % data_path,
        T_nigroviridis = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.fas' % data_path
        )
    #C_elegans = '%s/C_elegans/ensembl_release_79/ensembl_release_79.fas' % data_path,

    ## hard-coded lookup: organism short name -> annotation (gtf/gff) file path
    org_gtf_file = dict(
        A_carolinensis = '%s/A_carolinensis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        M_mulatta = "%s/M_mulatta/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        O_cuniculus = "%s/O_cuniculus/ensembl_release_79/ensembl_release_79.gtf" % data_path,
        M_gallopavo = "%s/M_gallopavo/ensembl_release_79/ensembl_release_79.gtf.bz2" % data_path,
        B_anthracis = '%s/B_anthracis/ensembl_release-21/Bacillus_anthracis' % data_path,
        C_familiaris = '%s/C_familiaris/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        D_melanogaster = '%s/D_melanogaster/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        E_caballus = '%s/E_caballus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        M_domestica = '%s/M_domestica/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        O_sativa = '%s/O_sativa/phytozome_v9.0/Osativa_204_gene.gff3' % data_path,
        A_gambiae = '%s/A_gambiae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        B_rapa = '%s/B_rapa/phytozome_v9.0/Brapa_197_gene.gff3' % data_path,
        G_gallus = '%s/G_gallus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        M_musculus = '%s/M_musculus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        V_vinifera = '%s/V_vinifera/phytozome_v9.0/phytozome_v9.0.gff.bz2' % data_path,
        A_mellifera = '%s/A_mellifera/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        B_taurus = '%s/B_taurus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_jacchus = '%s/C_jacchus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_rubella = '%s/C_rubella/phytozome_v9.0/Crubella_183.gff3' % data_path,
        D_rerio = '%s/D_rerio/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        G_max = '%s/G_max/phytozome_v9.0/Gmax_189_gene.gff3' % data_path,
        N_vitripennis = '%s/N_vitripennis/ensembl_release_22/N_vitripennis.gtf' % data_path,
        O_aries = '%s/O_aries/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        M_truncatula = '%s/M_truncatula/' % data_path,
        P_pacificus = '%s/P_pacificus/ensembl_release-22/.gtf' % data_path,
        S_scrofa = '%s/S_scrofa/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        X_tropicalis = '%s/X_tropicalis/JGIv4-1/JGIv4-1.gff' % data_path,
        C_sativus = '%s/C_sativus/phytozome_v9.0/Csativus_122_gene.gff3' % data_path,
        D_simulans = '%s/D_simulans/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        H_sapiens = '%s/H_sapiens/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        P_troglodytes = '%s/P_troglodytes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        S_tuberosum = '%s/S_tuberosum/phytozome_v9.0/Stuberosum_206_gene.gff3' % data_path,
        Z_mays = '%s/Z_mays/phytozome_v9.0/Zmays_181_gene.gff3' % data_path,
        A_thaliana = '%s/A_thaliana/arabidopsis_tair10/annotations/TAIR10_GFF3_genes.gff' % data_path,
        C_elegans = '%s/C_elegans/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        D_discoideum = '%s/D_discoideum/' % data_path,
        D_yakuba = '%s/D_yakuba/ensembl_release-22/.gff3' % data_path,
        O_latipes = '%s/O_latipes/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        R_norvegicus = '%s/R_norvegicus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        D_pseudoobscura = '%s/D_pseudoobscura/ensembl_release-22/.gff3' % data_path,
        T_pseudonana = '%s/T_pseudonana/Thaps3/.gff' % data_path,
        G_gorilla = '%s/G_gorilla/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_porcellus = '%s/C_porcellus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        O_anatinus = '%s/O_anatinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        A_platyrhynchos = '%s/A_platyrhynchos/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        O_niloticus = '%s/O_niloticus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        L_chalumnae = '%s/L_chalumnae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_briggsae = '%s/C_briggsae/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        M_eugenii = '%s/M_eugenii/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_remanei = '%s/C_remanei/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        C_brenneri = '%s/C_brenneri/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        C_intestinalis = '%s/C_intestinalis/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        S_pombe = '%s/S_pombe/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        P_marinus = '%s/P_marinus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        S_cerevisiae = '%s/S_cerevisiae/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        C_japonica = '%s/C_japonica/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        F_catus = '%s/F_catus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
        A_aegypti = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.gtf' % data_path,
        C_hircus = '%s/C_hircus/ncbi_genome/ncbi_genome.gff' % data_path,
        T_nigroviridis = '%s/T_nigroviridis/ensembl_release_79/ensembl_release_79.gtf' % data_path
        )
    ## TODO algorithms details
    #A_aegypti = '%s/A_aegypti/ensembl_release_28/ensembl_release_28.gtf' % data_path,
    #F_catus = '%s/F_catus/ensembl_release_79/ensembl_release_79.gtf' % data_path,
    #C_hircus = '%s/C_hircus/ncbi_genome/ncbi_genome.gff' % data_path,
    #H_glaber = '%s/H_glaber/naked_mole_rat_db/naked_mole_rat_db.gtf' % data_path,

    ## experiment details: build one record per organism in the config file
    org_db = defaultdict()
    for ent in config_map['experiment']:
        species_name = ent['organism_name']
        sra_run_id = ent['sra_run_id']
        genome_build_version = ent['genome_build_version']
        db_server = ent['release_db']

        ## mapping to short names arabidopsis_thaliana --> A_thaliana
        genus, species = species_name.strip().split("_")
        short_name = "%s_%s" % (genus[0].upper(), species)

        org_db[short_name] = dict(short_name = short_name)
        org_db[short_name]['long_name'] = species_name
        org_db[short_name]['sra_run_id'] = sra_run_id
        org_db[short_name]['genome_release_db'] = genome_build_version
        ## the broad path to the experiment
        org_db[short_name]['genome_dir'] = data_path
        org_db[short_name]['experiment_dir'] = exp_path
        build_release = genome_build_version.split("_")
        org_db[short_name]['release_db'] = db_server ## ensembl_metazoa, phytozome
        org_db[short_name]['release_nb'] = build_release[-1] ## build number

        ## sequencing reads files: collect files matching the sra run id
        sra_files = []
        if os.path.isdir("%s/%s/source_data" % (exp_path, short_name)):
            for sra_file in os.listdir("%s/%s/source_data" % (exp_path, short_name)):
                file_prefx, ext = os.path.splitext(sra_file)
                if ext == ".sra": ## skipping the original .sra binary file
                    continue
                if re.search(sra_run_id, sra_file):
                    sra_files.append(sra_file)
        else:
            print "warning: missing sequencing read trace file %s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]['fastq_path'] = "%s/%s/source_data" % (exp_path, short_name)
        org_db[short_name]['fastq'] = sra_files

        ## read mapping, read assembly and label generation working folders
        for sub_dir in ['read_mapping', 'signal_labels', 'trans_pred', 'source_data']:
            work_path = "%s/%s/%s" % (exp_path, short_name, sub_dir)
            if not os.path.isdir(work_path):
                try:
                    os.makedirs(work_path)
                except OSError:
                    exit("error: cannot create the directory %s." % work_path)
        org_db[short_name]['read_map_dir'] = "%s/%s/read_mapping" % (exp_path, short_name)
        org_db[short_name]['read_assembly_dir'] = "%s/%s/trans_pred" % (exp_path, short_name)
        org_db[short_name]['labels_dir'] = "%s/%s/signal_labels" % (exp_path, short_name)

        ## calculate the sequence read length (from the first fastq record)
        readlength = 0
        if opt_action in ["2", "3"]: ## perform this action only for selected options
            if sra_files:
                fqfile = os.path.join(org_db[short_name]['fastq_path'], sra_files[0])
                print 'using sequencing read file %s to determine readLength' % fqfile
                fh = helper.open_file(fqfile)
                for rec in SeqIO.parse(fh, "fastq"):
                    readlength = len(rec.seq)
                    break
                fh.close()
        org_db[short_name]['read_length'] = readlength

        ## check for the genome sequence file
        if short_name in org_fasta_file:
            if os.path.isfile(org_fasta_file[short_name]):
                org_db[short_name]['fasta'] = org_fasta_file[short_name]
            else:
                org_db[short_name]['fasta'] = None
        else:
            print "warning: missing genome sequence file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version)
            org_db[short_name]['fasta'] = None

        ## make sure the STAR genome index folder exists
        if not os.path.isdir("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version)):
            try:
                os.makedirs("%s/%s/%s/STARgenome" % (data_path, short_name, genome_build_version))
            except OSError:
                exit("error: cannot create the directory %s/%s/%s" % (data_path, short_name, genome_build_version))
        org_db[short_name]['genome_index_dir'] = "%s/%s/%s/STARgenome/" % (data_path, short_name, genome_build_version)

        ## check the genome annotation
        if short_name in org_gtf_file:
            if os.path.isfile(org_gtf_file[short_name]):
                org_db[short_name]['gtf'] = org_gtf_file[short_name]
                if opt_action in ["2", "3", "4", "c"]: ## perform this action only for selected options
                    ## get the gtf feature lengths
                    from fetch_remote_data import prepare_data as pd
                    feat_len_db = pd.make_anno_db(org_gtf_file[short_name])
                    org_db[short_name]['max_intron_len'] = feat_len_db['max_intron']
                    org_db[short_name]['max_exon_len'] = feat_len_db['max_exon']
            else:
                exit("error: the provided gtf file %s is not available to read. Please check!" % org_gtf_file[short_name])
        else:
            print("warning: missing annotation file for %s under %s/%s/%s" % (short_name, data_path, short_name, genome_build_version))
            org_db[short_name]['gtf'] = None
            org_db[short_name]['max_intron_len'] = None
            org_db[short_name]['max_exon_len'] = None
        print("fetched details for %s" % short_name)
    return org_db
def __main__():
    """
    Randomly sub-sample a fixed percentage of read pairs from a pair of
    FASTQ files (left/right mates) found in the given folder, writing the
    sampled pairs to bz2-compressed output files in the same folder.
    """
    try:
        fastq_path = sys.argv[1]
    except:
        print __doc__
        sys.exit(-1)

    fastq_1_file = None
    fastq_2_file = None
    # TODO expecting the left and right reads in the base folder with .fastq ending. Make it is common general form
    ## get the files from base path
    for filename in os.listdir(fastq_path):
        if re.search(r'_1.fastq', filename):
            fastq_1_file = filename
        if re.search(r'_2.fastq', filename):
            fastq_2_file = filename
    # NOTE(review): if no matching file is found the names stay None and the
    # path joins below produce ".../None" — confirm inputs always exist
    print fastq_1_file, fastq_2_file
    print

    ## count the number of reads and calculate the sub sample
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
    read_cnt = 0
    for rec in SeqIO.parse(fqh, 'fastq'):
        read_cnt +=1
    fqh.close()
    print '%d Number of reads in FASTQ' % read_cnt
    print

    ## what percentage sub-sample
    percentage = 1
    sub_count = int(round((percentage*read_cnt)/100.0))
    assert sub_count <= read_cnt, ' %d (sub-sample count) should be less than total read count %d' % (sub_count, read_cnt)
    print "%d Sub sample count" % sub_count
    print

    ## per-read acceptance probability; except guards division by zero
    try:
        accept_prob = (1.0*sub_count)/read_cnt
    except:
        accept_prob = 1
    print accept_prob

    sub_fastq_1_file = "%d_percentage_%s.bz2" % (percentage, fastq_1_file)
    sub_fastq_2_file = "%d_percentage_%s.bz2" % (percentage, fastq_2_file)
    ## writing out sub sample files
    try:
        subFile_1 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_1_file), 'wb')
        subFile_2 = bz2.BZ2File("%s/%s" % (fastq_path, sub_fastq_2_file), 'wb')
    except Exception as error:
        sys.exit(error)

    total_cnt = 0
    sample_cnt = 0
    left_reads = dict()  # ids of accepted left mates; used to pick matching right mates
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_1_file))
    for rec in SeqIO.parse(fqh, 'fastq'):
        rnb = random.random()
        total_cnt += 1
        if rnb <= accept_prob:
            ## @UNC15-SN850_63:4:1101:1103:2151/1 @UNC15-SN850_63:4:1101:1103:2151/2
            read_id = rec.id.split('/')
            if len(read_id) > 1:
                left_reads[read_id[0]] = 0
            else:
                ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 2:N:0:GGNTAC
                left_reads[rec.id] = 0
            sample_cnt += 1
            subFile_1.write(rec.format("fastq"))
        ## stop once the requested number of reads has been sampled
        if sub_count == sample_cnt:
            break
    fqh.close()
    subFile_1.close()

    ## second pass over the right mates: keep only those whose id was sampled above
    fqh = helper.open_file('%s/%s' % (fastq_path, fastq_2_file))
    for rec in SeqIO.parse(fqh, 'fastq'):
        read_id = rec.id.split('/')
        if len(read_id) > 1:
            ## @UNC15-SN850_63:4:1101:1103:2151/1
            if read_id[0] in left_reads:
                subFile_2.write(rec.format("fastq"))
        else:
            ## @UNC11-SN627:294:C236MACXX:5:1101:1430:2218 1:N:0:GGNTAC
            if rec.id in left_reads:
                subFile_2.write(rec.format("fastq"))
    fqh.close()
    subFile_2.close()

    print "%s/%s" % (fastq_path, sub_fastq_1_file)
    print "%s/%s" % (fastq_path, sub_fastq_2_file)
    print
    print '%d Number of reads scanned' % total_cnt
    print '%d Number of reads in' % sample_cnt
    print
def check_splice_site_consensus(fas_file, splice_region):
    """
    splice site consensus check

    Scans the genome sequence and keeps only transcripts whose intron
    boundaries mostly carry canonical AG (acceptor) / GT (donor) dinucleotides.

    @args fas_file: genome sequence file in fasta format
    @type fas_file: str
    @args splice_region: mapping chromosome id -> list of {gene-key: regions};
        presumably gene-key is a tuple whose last element is the strand and
        regions are (acceptor, donor) coordinate pairs — TODO confirm with caller
    @type splice_region: dict
    """
    sys.stdout.write("splice site sequence consensus check started...\n")
    get_gene_models = defaultdict()  # accepted transcripts, keyed by (chrom, gene[0], gene[1], gene[2])
    splice_site_con = 0  # count of transcripts discarded for weak consensus
    fas_fh = helper.open_file(fas_file)
    for fas_rec in SeqIO.parse(fas_fh, "fasta"):
        if fas_rec.id in splice_region:
            for details in splice_region[fas_rec.id]:
                for genes, regions in details.items():
                    acc_cons_cnt = 0
                    don_cons_cnt = 0
                    if len(regions) == 1: ## single exon long transcripts no checks
                        get_gene_models[(fas_rec.id, genes[0], genes[1], genes[2])] = 1
                        continue
                    for region in regions:
                        if genes[-1] == '+':
                            #if not numpy.isnan(region[0]):## acceptor splice site
                            if region[0]: ## acceptor splice site
                                # two bases immediately upstream of the boundary (1-based coords)
                                acc_seq = fas_rec.seq[int(region[0])-3:int(region[0])-1]
                                if str(acc_seq).upper() == "AG":
                                    acc_cons_cnt += 1
                            if region[1]:
                                # two bases immediately downstream of the boundary
                                don_seq = fas_rec.seq[int(region[1]):int(region[1])+2]
                                if str(don_seq).upper() == "GT":
                                    don_cons_cnt += 1
                        elif genes[-1] == '-':
                            # reverse strand: roles of the two coordinates swap and
                            # the extracted dinucleotides are reverse-complemented
                            if region[0]: ## donor splice site
                                don_seq = fas_rec.seq[int(region[0])-3:int(region[0])-1]
                                don_seq = don_seq.reverse_complement()
                                if str(don_seq).upper() == "GT":
                                    don_cons_cnt += 1
                            if region[1]:
                                acc_seq = fas_rec.seq[int(region[1]):int(region[1])+2]
                                acc_seq = acc_seq.reverse_complement()
                                if str(acc_seq).upper() == "AG":
                                    acc_cons_cnt += 1
                    ## check for half of the consensus sites
                    # NOTE(review): `/` here is floor division under Python 2;
                    # under Python 3 it becomes float division, shifting the
                    # threshold for odd region counts — confirm intended runtime
                    if acc_cons_cnt > (len(regions)/2) and don_cons_cnt > (len(regions)/2):
                        get_gene_models[(fas_rec.id, genes[0], genes[1], genes[2])] = 1
                    else:
                        splice_site_con += 1
    fas_fh.close()
    sys.stdout.write("...considering %d best transcripts\n" % len(get_gene_models))
    sys.stdout.write("discarding transcripts...\n")
    sys.stdout.write("\t%d splice-site consensus sequence missing\n" % splice_site_con)
    return get_gene_models