def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den, path_gff):
    '''Count read ends per genomic position from a SAM file and save the tracks.

    Every aligned read whose length lies in [minlength, maxlength] adds one
    count at a single position. With ``startp`` the 0-based SAM position:
      * flag '0'  (plus strand):  startp + length - 1 + length_shift
      * flag '16' (minus strand): startp - length_shift
    (the aligner reverse complements minus-strand reads, so the SAM position
    is their 3'-end).

    Two kinds of tracks are built:
      * density_plus / density_minus: {sequence id: [count per position]}
      * density_*_sizesep: {read length: [count per position]}, which makes
        it easy to select a size range later. These are keyed by length
        only, so they are sized after the last GFF record and pool the reads
        of all records.

    The per-sequence tracks are scaled to counts per million reads that
    passed the length filter and are then passed to ribo_util.writebin,
    ribo_util.makePickle and ribo_util.countstowig. The size-separated
    tracks are pickled as raw counts.

    Args:
        fname: sample name, used in the binary output file names.
        chr_sam: path of the SAM file to read.
        minlength, maxlength: inclusive read-length window.
        path_wig: prefix for the wig output ("_plus" / "_minus" is appended).
        path_den: prefix for the pickled tracks; binary output goes to
            path_den + "binary/".
        path_gff: GFF file whose records define the track lengths.

    Returns:
        0 if either length bound is negative, otherwise None.
    '''
    # Validate before touching any file so a bad call leaves nothing open.
    if minlength < 0 or maxlength < 0:
        print("Error. Length input not valid.")
        return 0

    density_plus = {}
    density_minus = {}
    density_plus_sizesep = {}
    density_minus_sizesep = {}
    size_range = range(minlength, maxlength + 1)

    # Tracks carry 20 extra positions so shifted 3'-ends near the end of the
    # sequence still have a slot.
    for sequence in GFF.parse(path_gff):
        track_length = len(sequence) + 20
        density_plus[sequence.id] = [0] * track_length
        density_minus[sequence.id] = [0] * track_length
        for length in size_range:
            density_plus_sizesep[length] = [0] * track_length
            density_minus_sizesep[length] = [0] * track_length

    mapped_reads = 0
    unknown_chromosomes = set()

    with open(chr_sam) as f_samfile:
        # SAM is tab-delimited and is not a quoted format: quality strings
        # may contain '"', which csv's default quoting would misparse.
        samfile = csv.reader(f_samfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for read in samfile:
            # Skip blank/truncated rows and header lines.
            if len(read) < 10 or read[0].startswith('@'):
                continue
            flag = read[1]
            if flag == '4':  # unaligned read
                continue

            chrom = read[2]
            startp = int(read[3]) - 1  # SAM positions are 1-based
            length = len(read[9])

            # NOTE(review): reads shorter than 23 nt are shifted by
            # (24 - length), so a 22-nt read moves by 2 and a 23-nt read by
            # 0 -- confirm the 23/24 mismatch is intended.
            length_shift = 24 - length if length < 23 else 0

            if chrom not in density_plus:
                # Report each unknown reference once and skip its reads
                # instead of failing later with a KeyError.
                if chrom not in unknown_chromosomes:
                    unknown_chromosomes.add(chrom)
                    print("Error: Bowtie index and GFF do not match")
                continue

            if length < minlength or length > maxlength:
                continue
            mapped_reads += 1

            if flag == '16':
                start = startp - length_shift
                strand_density = density_minus
                strand_sizesep = density_minus_sizesep
            elif flag == '0':
                start = startp + length - 1 + length_shift
                strand_density = density_plus
                strand_sizesep = density_plus_sizesep
            else:
                # Other flags count towards the total but are not placed.
                continue

            track = strand_density[chrom]
            if not 0 <= start < len(track):
                # A negative index would silently wrap to the end of the
                # track; an overshoot would raise IndexError.
                continue
            track[start] += 1
            strand_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    # Scale every sequence's track; with no mapped reads all counts are zero
    # and any non-zero denominator yields 0.0.
    denominator = float(mapped_reads) if mapped_reads else 1.0
    for density in (density_plus, density_minus):
        for seq_id in density:
            density[seq_id] = [float(count) * 1000000 / denominator
                               for count in density[seq_id]]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")

    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
def SD_affinity_genome(paths_in):
    '''Score every genomic position by the Shine-Dalgarno affinity of an octamer.

    The affinity table is read from the CSV at paths_in['SD_affinity']
    (columns ``Octamer`` and ``SD_affinity``). Only the first record of the
    GFF at paths_in['path_gff'] is used. For each position p:
      * plus value:  affinity of the transcribed octamer genome[p-8:p]
      * minus value: affinity of the transcribed reverse complement of
        genome[p:p+8]
    Positions closer than 8 nt to either end use the placeholder octamer
    'AAAAAAAA'.

    The output is shaped like a size-separated density dict: every length
    in 10..45 maps to the same affinity list. Both dicts are pickled under
    inpath + 'density/density/SD1/'.

    Raises:
        KeyError: if an octamer (including the placeholder) is missing from
            the affinity table.
    '''
    # load octamers
    affinity_table = pd.read_csv(paths_in['SD_affinity'])
    affinity_by_octamer = pd.Series(affinity_table.SD_affinity.values,
                                    index=affinity_table.Octamer).to_dict()

    genome_record = next(GFF.parse(paths_in['path_gff']))
    sequence = genome_record.seq
    genome_size = len(sequence)

    # Convert each strand to an RNA string once. Plain strings slice much
    # faster than Seq objects and are safe dictionary keys regardless of how
    # the installed Biopython hashes Seq.
    rna_plus = str(sequence.transcribe())
    rna_minus = str(sequence.reverse_complement().transcribe())

    placeholder = 'AAAAAAAA'
    progress_marks = frozenset((100000, 500000, 1000000, 2000000))

    affinity_plus = []
    affinity_minus = []
    for position in range(genome_size):
        if position < 8 or genome_size - position < 8:
            motif = placeholder
            motif_rc = placeholder
        else:
            motif = rna_plus[position - 8:position]
            # The reverse complement of genome[p:p+8] occupies
            # [n-p-8, n-p) on the reverse-complemented genome.
            rc_end = genome_size - position
            motif_rc = rna_minus[rc_end - 8:rc_end]

        affinity_plus.append(affinity_by_octamer[motif])
        affinity_minus.append(affinity_by_octamer[motif_rc])

        if position in progress_marks:
            print(str(position))

    # Every length shares the same list object, as the values do not depend
    # on read length.
    density_plus_sizesep = {}
    density_minus_sizesep = {}
    for length in range(10, 46):
        density_plus_sizesep[length] = affinity_plus
        density_minus_sizesep[length] = affinity_minus

    # NOTE(review): `inpath` is not defined in this function; it must exist
    # as a module-level name -- confirm, or take it from paths_in.
    path_den = inpath + 'density/density/SD1/'
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")

    return
def run_filter_UMI(inputs, paths_in, paths_out):
    '''Strip UMI bases from both ends of every read and tally the UMIs seen.

    For each sample name in inputs['files'] the FASTQ file
    paths_out['path_filter'] + fname + '_UMI-trimmed.fastq' is read in
    4-line records. The first RT_UMI and last linker_UMI characters are
    removed from the sequence and quality lines and the record is written to
    paths_out['path_filter'] + fname + '-trimmed.fastq'. The removed bases
    (5' part followed by 3' part) form the read's UMI.

    A dict {'UMI': [...], 'count': [...], 'unique': ['yes'|'no', ...]} with
    one entry per distinct UMI, in order of first appearance, is pickled via
    ribo_util.makePickle.

    Args:
        inputs: settings dict. Reads 'files', 'run_filter_UMI' (nothing is
            filtered unless it equals 'yes'), 'linker_UMI' (3' UMI length)
            and, optionally, 'RT_UMI' (5' UMI length; defaults to the
            'linker_UMI' value).
        paths_in: unused.
        paths_out: dict providing 'path_filter' and 'path_log'.
    '''
    print("\n\tStarted UMI removal at " + str(datetime.now()))

    files = inputs['files']
    run = inputs['run_filter_UMI']
    linker_UMI = inputs['linker_UMI']
    # Fall back to the linker length when no separate 5' length is given.
    RT_UMI = inputs.get('RT_UMI', linker_UMI)

    for fname in files:
        if run != 'yes':
            print(fname + " will not be filtered for a UMI")
            continue

        file_in = paths_out['path_filter'] + fname + '_UMI-trimmed.fastq'
        file_out = paths_out['path_filter'] + fname + '-trimmed.fastq'
        file_log = paths_out['path_log'] + fname + '_filter'

        umi = []          # distinct UMIs in order of first appearance
        n_umi = []        # reads observed per UMI
        umi_unique = []   # 'yes' while a UMI has been seen exactly once
        umi_row = {}      # UMI -> index into the three lists above

        with open(file_in, 'r') as fastq_in, open(file_out, 'w') as fastq_out:
            record = []
            for line in fastq_in:
                record.append(line.rstrip('\r\n'))
                if len(record) < 4:
                    continue
                identifier, sequence, q_identifier, phred = record
                record = []

                # The UMI must come from the untrimmed read.
                umi_seq = (sequence[:RT_UMI] +
                           sequence[max(len(sequence) - linker_UMI, 0):])

                # Clamp the end so reads shorter than both UMIs trim to an
                # empty string instead of slicing with a negative index.
                seq_end = max(len(sequence) - linker_UMI, RT_UMI)
                phred_end = max(len(phred) - linker_UMI, RT_UMI)
                fastq_out.write(identifier + "\n" +
                                sequence[RT_UMI:seq_end] + "\n" +
                                q_identifier + "\n" +
                                phred[RT_UMI:phred_end] + "\n")

                row = umi_row.get(umi_seq)
                if row is None:
                    umi_row[umi_seq] = len(umi)
                    umi.append(umi_seq)
                    n_umi.append(1)
                    umi_unique.append('yes')
                else:
                    n_umi[row] += 1
                    umi_unique[row] = 'no'

        UMI = {}
        UMI['UMI'] = umi
        UMI['count'] = n_umi
        UMI['unique'] = umi_unique

        # file_log already contains fname; the doubled name is kept so the
        # output location does not change.
        ribo_util.makePickle(UMI, file_log + fname + '_UMI',
                             protocol=pickle.HIGHEST_PROTOCOL)

    print("\tFinished UMI removal at " + str(datetime.now()))
    print("\tCOMPLETED UMI REMOVAL")
def _flanked_gene_sequence(genome, start, end, minus_strand, flank=50):
    '''Return genome[start:end] plus `flank` nt on each side, read 5'->3'.

    Where the window runs past either end of the genome the missing bases
    are filled with 'N', so the gene always starts `flank` characters into
    the result and ends `flank` characters before its end.
    '''
    left = max(start - flank, 0)
    right = min(end + flank, len(genome))
    left_pad = 'N' * (left - (start - flank))
    right_pad = 'N' * ((end + flank) - right)
    window = genome[left:right]
    if minus_strand:
        # Reverse complementing swaps the flanks (and their padding).
        return right_pad + str(window.reverse_complement()) + left_pad
    return left_pad + str(window) + right_pad


def GFF_to_dict(paths_in, gff_settings):
    '''Parse a GFF into a column-oriented dict and save it as pickle and CSV.

    gff_settings keys:
      - path_out         = output path, or 0 to use paths_in['path_gff_dict']
      - feat_of_interest = sub-feature type to keep when removing genes
                           (protein_coding, tRNA, rRNA, etc)
      - name_qual        = qualifier for alias/gene name (Name, gene_id)
      - name_qual_alt    = alternative qualifier, if none, set as 'none'
      - remove_genes     = 'yes' to keep only non-pseudo features whose
                           first sub-feature has type feat_of_interest
    These values must correspond to values in the GFF.

    Features without sub-features, or without either name qualifier, are
    skipped. For every remaining feature the dict records chromosome id,
    alias, strand ('+' when the feature strand is 1, '-' otherwise), start
    and stop codon positions, the gene sequence with 50 nt of flank on each
    side (read 5'->3', N-padded at genome edges), the start and stop codons,
    a type label and the G/C/A/T values returned by GC_of_CDS.

    The dict is pickled to the output path, shown with display() as a
    DataFrame and written to <output path>.csv.
    '''
    path_out = gff_settings['path_out']
    feat_of_interest = gff_settings['feat_of_interest']
    name_qual = gff_settings['name_qual']
    name_qual_alt = gff_settings['name_qual_alt']
    remove_genes = gff_settings['remove_genes']

    # Output path can be defined, or use 0 to set as the annotation file
    # for the main pipeline.
    path_gff_dict = paths_in['path_gff_dict'] if path_out == 0 else path_out

    # Data arrays: will be used as columns for the pandas DataFrame.
    chromosome = []
    aliaslist = []
    startlist = []
    stoplist = []
    seqlist = []
    typelist = []
    strandlist = []
    startcodon = []
    stopcodon = []
    G_content = []
    C_content = []
    A_content = []
    T_content = []

    # Parse GFF using BCBio; every record in the file is processed.
    for record in GFF.parse(paths_in['path_gff']):
        genome = record.seq
        chromosome_id = record.id

        for feature in record.features:
            if not feature.sub_features:
                continue

            is_pseudo = 'pseudo' in feature.qualifiers
            sub_type = feature.sub_features[0].type

            if remove_genes == 'yes':
                # Skip annotations of other types and pseudogenes.
                if sub_type != feat_of_interest or is_pseudo:
                    continue
                # Kept features are always labelled 'CDS', whatever
                # feat_of_interest is.
                feature_type = 'CDS'
            else:
                # Add feature type, noting pseudogenes.
                feature_type = 'pseudo' if is_pseudo else sub_type

            # Get feature name; unnamed features are skipped.
            if name_qual in feature.qualifiers:
                alias = feature.qualifiers[name_qual][0]
            elif name_qual_alt in feature.qualifiers:
                alias = feature.qualifiers[name_qual_alt][0]
            else:
                continue

            start = int(feature.location.start)
            end = int(feature.location.end)

            # Each strand is treated differently, + strand == 1. For the
            # minus strand, 'end' is the start codon side and 'start' the
            # stop codon side.
            if feature.strand == 1:
                strand_val = '+'
                startcodon_pos = start
                stopcodon_pos = end - 1
            else:
                strand_val = '-'
                startcodon_pos = end - 1
                stopcodon_pos = start

            # Gene sequence + 50 nt from each end: makes it easier to
            # analyze start and stop sequence context without using the
            # whole genome sequence.
            sequence = _flanked_gene_sequence(genome, start, end,
                                              strand_val == '-')
            start_codon = sequence[50:53]
            stop_codon = sequence[-53:-50]

            # Sequence from start to stop for GC analysis.
            G, C, A, T = GC_of_CDS(sequence[50:-50])

            chromosome.append(chromosome_id)
            typelist.append(feature_type)
            aliaslist.append(alias)
            seqlist.append(sequence)
            strandlist.append(strand_val)
            startlist.append(startcodon_pos)
            stoplist.append(stopcodon_pos)
            startcodon.append(start_codon)
            stopcodon.append(stop_codon)
            G_content.append(G)
            C_content.append(C)
            A_content.append(A)
            T_content.append(T)

    gff_dict = {}
    gff_dict['Chromosome'] = chromosome
    gff_dict['Alias'] = aliaslist
    gff_dict['Strand'] = strandlist
    gff_dict['Start'] = startlist
    gff_dict['Stop'] = stoplist
    gff_dict['Sequence'] = seqlist
    gff_dict['Start_Codon'] = startcodon
    gff_dict['Stop_Codon'] = stopcodon
    gff_dict['Type'] = typelist
    gff_dict['G_content'] = G_content
    gff_dict['C_content'] = C_content
    gff_dict['A_content'] = A_content
    gff_dict['T_content'] = T_content

    # Pickle dict for use later.
    ribo_util.makePickle(gff_dict, path_gff_dict)

    # Show the DataFrame to check, and save as .csv for use later.
    gff_df = pd.DataFrame(gff_dict)
    display(gff_df)
    gff_df.to_csv(path_gff_dict + '.csv')

    return
def run_avggene_transcriptend(fname, settings, plus, minus, gff, path_start, path_stop):
    '''Accumulate average read density around gene starts and transcript ends.

    Only genes listed by get_transcript_end() are used; each gene's stop
    position is moved downstream by the distance reported for it. For every
    such gene two windows of length_in_ORF + length_out_ORF positions are
    read from the size-separated density of its strand, one beginning
    length_out_ORF upstream of the start and one beginning length_in_ORF
    upstream of the shifted stop, and summed position by position.

    Genes are skipped when:
      * next_gene > 0 and ribo_util.nextgene reports a neighbour closer than
        next_gene (the neighbour is excluded as well),
      * the gene is shorter than length_in_ORF + 20,
      * equal_weight == 'y' and ribo_util.get_genecounts sums to fewer than
        10 reads (otherwise the gene is normalised by that sum),
      * the start window origin or the far end of the stop window lies
        outside the density track,
      * the strand is neither '+' nor '-'.

    Args:
        fname: sample name, used in the summary message.
        settings: dict with 'next_gene', 'equal_weight', 'length_in_ORF',
            'length_out_ORF', 'minlength', 'maxlength' and, optionally,
            'path_transcript_ends' (CSV handed to get_transcript_end).
        plus, minus: size-separated densities {read length: [per position]}.
        gff: annotation dict with 'Alias', 'Strand', 'Start', 'Stop' lists.
        path_start, path_stop: prefixes for the pickled results
            ('_all_end' = {position: sum over lengths},
             '_HM_end'  = {length: [per position]}).
    '''
    next_gene = settings['next_gene']
    equal_weight = settings['equal_weight']
    length_in_ORF = settings['length_in_ORF']
    length_out_ORF = settings['length_out_ORF']
    minlength = settings['minlength']
    maxlength = settings['maxlength']

    density_plus = plus
    density_minus = minus
    gff_dict = gff

    window_length = length_in_ORF + length_out_ORF
    positionindex = range(0, window_length)
    lengthindex = range(minlength, maxlength + 1)

    # The historical location stays the default so existing callers keep
    # working; settings may point to a different file.
    transcript_ends_path = settings.get(
        'path_transcript_ends',
        '/Volumes/HDD/data/gff/BS/rho-independent_ends.csv')
    lastgene, dist_from_lastgene = get_transcript_end(transcript_ends_path)

    # alias -> distance; the first listing of an alias wins.
    end_distance = {}
    for gene, distance in zip(lastgene, dist_from_lastgene):
        end_distance.setdefault(gene, distance)

    # datastructure:
    # for heatmap - each dict has read counts at each position separated by
    #               length keys
    # for avggene summary - dictionary with position as keys
    averagegene_start = {length: [0] * window_length for length in lengthindex}
    averagegene_stop = {length: [0] * window_length for length in lengthindex}
    start_all = {position: 0 for position in positionindex}
    stop_all = {position: 0 for position in positionindex}

    genes_too_close = set()
    genes_below_thresh = []

    # All tracks are assumed to share the length of the first one.
    genomelength = len(density_plus[next(iter(density_plus))]) if density_plus else 0

    for alias, start, stop, strand in zip(gff_dict['Alias'], gff_dict['Start'],
                                          gff_dict['Stop'], gff_dict['Strand']):
        if alias not in end_distance:
            continue

        # `direction` is +1 when downstream means increasing coordinates.
        if strand == '+':
            direction = 1
            density_dict = density_plus
        elif strand == '-':
            direction = -1
            density_dict = density_minus
        else:
            continue

        genelength = abs(start - stop) + 1

        # define plot start and stop window for genes in + or - strand
        stop = stop + direction * end_distance[alias]
        start_window = start - direction * length_out_ORF
        stop_window = stop - direction * length_in_ORF
        stop_window_max = stop + direction * length_out_ORF

        if next_gene > 0:
            neighbour = ribo_util.nextgene(alias, gff_dict)
            if neighbour['distance'] < next_gene:
                # exclude genes that are too close
                genes_too_close.add(alias)
                genes_too_close.add(neighbour['alias'])
                continue
            if alias in genes_too_close:
                continue

        if genelength < length_in_ORF + 20:
            # exclude genes that are too small
            continue

        if equal_weight == 'y':
            gene_reads, length_reads = ribo_util.get_genecounts(
                start_window, stop_window, strand,
                density_plus, density_minus, minlength, maxlength)
            gene_reads = sum(gene_reads)
            if gene_reads < 10:
                genes_below_thresh.append(alias)
                continue
        else:
            gene_reads = 1
        gene_reads = float(gene_reads)

        if not 0 <= start_window < genomelength:
            continue
        if not 0 <= stop_window_max < genomelength:
            continue

        for length in lengthindex:
            track = density_dict[length]
            start_row = averagegene_start[length]
            stop_row = averagegene_stop[length]
            for position in positionindex:
                offset = direction * position

                start_density = float(track[start_window + offset]) / gene_reads
                start_row[position] += start_density
                start_all[position] += start_density

                stop_density = float(track[stop_window + offset]) / gene_reads
                stop_row[position] += stop_density
                stop_all[position] += stop_density

    print('\t' + str(len(genes_too_close)) + ' genes excluded from ' + fname +
          ' (too close to a neighbour); ' + str(len(genes_below_thresh)) +
          ' genes below the read threshold')

    ribo_util.makePickle(start_all, path_start + '_all_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(averagegene_start, path_start + '_HM_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(stop_all, path_stop + '_all_end', protocol=pickle.HIGHEST_PROTOCOL)
    ribo_util.makePickle(averagegene_stop, path_stop + '_HM_end', protocol=pickle.HIGHEST_PROTOCOL)