def main(): parser = argparse.ArgumentParser() parser.add_argument('input_gpd', help="GENEPRED input or - for STDIN") args = parser.parse_args() inf = sys.stdin if args.input_gpd != '-': inf = open(args.input_gpd) seen = set() ls = RangeBasics.Loci() ls.verbose = True ls.use_direction = False for line in inf: if line[0] == '#': continue gpd = GenePredEntry(line) if gpd.value('name') in seen: sys.stderr.write( "ERROR: need uniquely named genepred entry names\n" + name + "\n") sys.exit() seen.add(gpd.value('name')) r = gpd.locus_range.copy() r.direction = None r.set_payload(gpd.value('name')) l = RangeBasics.Locus() l.add_member(r) ls.add_locus(l) ls.update_loci() z = 0 for locus in ls.loci: z += 1 for member in locus.members: print str(z) + "\t" + member.get_payload()
def get_loci(transcripts_genepred):
  """Cluster genepred transcripts into overlapping loci.

  Returns [locus2name, name2locus]: locus2name maps a 1-based locus number
  to the set of transcript names it contains; name2locus maps each
  transcript name back to its locus number.
  """
  loci = Loci()
  loci.verbose = True
  # Seed one single-member locus per transcript; update_loci() merges them.
  with open(transcripts_genepred) as handle:
    for raw in handle:
      if raw[0] == '#':
        continue
      entry = GenePredEntry(raw.rstrip())
      span = Bed(entry.value('chrom'), entry.value('txStart'), entry.value('txEnd'))
      span.set_payload(entry.value('name'))
      seed = Locus()
      seed.add_member(span)
      loci.add_locus(seed)
  sys.stderr.write("Organizing genepred data into overlapping loci\n")
  sys.stderr.write("Started with " + str(len(loci.loci)) + " loci\n")
  loci.update_loci()
  sys.stderr.write("Ended with " + str(len(loci.loci)) + " loci\n")
  locus2name = {}
  name2locus = {}
  locus_number = 0
  for locus in loci.loci:
    locus_number += 1
    for member in locus.members:
      tx = member.get_payload()
      locus2name.setdefault(locus_number, set()).add(tx)
      name2locus[tx] = locus_number
  return [locus2name, name2locus]
def process_locus(locus, args):
  """Compute per-base read depth over one locus of SAM alignments and emit
  BED-like depth runs via output_depth().

  locus: list of SAM entries, all on the same reference sequence.
  args:  needs .min_intron (gap-smoothing threshold) and .min_depth
         (minimum depth to report).
  """
  depth = {}
  s2psl = SAMtoPSLconversionFactory()
  unique = {}  # "start\tend" exon string -> count of alignments containing it
  # All members of a locus share a reference name; take it from the first.
  chr = locus[0].value('rname')
  for sam in locus:
    # SAM -> PSL -> genepred, then smooth away introns shorter than min_intron.
    p = PSL(s2psl.convert_line(sam.get_line()))
    g = GenePredEntry(p.get_genepred_line())
    g = g.get_smoothed(args.min_intron)
    for i in range(0,g.get_exon_count()):
      rng = str(g.value('exonStarts')[i])+"\t"+str(g.value('exonEnds')[i])
      if rng not in unique: unique[rng] = 0
      unique[rng]+=1
  # Expand each distinct exon interval into per-base depth, weighted by how
  # many alignments produced that exact interval.
  for bstr in unique:
    [start,end] = bstr.split("\t")
    for i in range(int(start),int(end)):
      if i not in depth: depth[i] = 0
      depth[i] += unique[bstr] # add the number of these to the depth
  #now we can print the depth
  # Run-length encode the depth profile: emit a row each time the depth
  # changes, covering [prevstart, lasti+1) at the previous depth.
  prevdepth = 0
  prevstart = None
  lasti = None
  for i in sorted(depth.keys()):
    if depth[i] < args.min_depth: continue
    # NOTE(review): positions skipped by min_depth (or absent from `depth`)
    # do not reset the run, so two separated regions with the same depth are
    # merged into one reported interval — confirm this is intended.
    if depth[i] != prevdepth:
      #output what we have so far if we have something
      if prevstart:
        output_depth(chr+"\t"+str(prevstart)+"\t"+str(lasti+1)+"\t"+str(prevdepth),args)
      prevstart = i
      prevdepth = depth[i]
    lasti = i
  # Flush the final open run, if any.
  if prevstart:
    output_depth(chr+"\t"+str(prevstart)+"\t"+str(lasti+1)+"\t"+str(prevdepth),args)
def main(): parser = argparse.ArgumentParser() parser.add_argument('input_gpd',help="GENEPRED input or - for STDIN") args = parser.parse_args() inf = sys.stdin if args.input_gpd != '-': inf = open(args.input_gpd) seen = set() ls = RangeBasics.Loci() ls.verbose = True ls.use_direction = False for line in inf: if line[0] == '#': continue gpd = GenePredEntry(line) if gpd.value('name') in seen: sys.stderr.write("ERROR: need uniquely named genepred entry names\n"+name+"\n") sys.exit() seen.add(gpd.value('name')) r = gpd.locus_range.copy() r.direction = None r.set_payload(gpd.value('name')) l = RangeBasics.Locus() l.add_member(r) ls.add_locus(l) ls.update_loci() z = 0 for locus in ls.loci: z += 1 for member in locus.members: print str(z) + "\t" + member.get_payload()
def get_loci(transcripts_genepred):
  """Build overlapping-locus clusters from a genepred file.

  Returns a two-element list [locus2name, name2locus]; the first maps a
  1-based locus index to a set of transcript names, the second inverts
  that mapping per transcript.
  """
  loci = Loci()
  loci.verbose = True
  with open(transcripts_genepred) as source:
    for record in source:
      if record[0] == '#':
        continue
      gpd = GenePredEntry(record.rstrip())
      tx_range = Bed(gpd.value('chrom'), gpd.value('txStart'), gpd.value('txEnd'))
      tx_range.set_payload(gpd.value('name'))
      wrapper = Locus()
      wrapper.add_member(tx_range)
      loci.add_locus(wrapper)
  sys.stderr.write("Organizing genepred data into overlapping loci\n")
  sys.stderr.write("Started with "+str(len(loci.loci))+" loci\n")
  # Collapse the one-transcript loci wherever their ranges overlap.
  loci.update_loci()
  sys.stderr.write("Ended with "+str(len(loci.loci))+" loci\n")
  locus2name = {}
  name2locus = {}
  counter = 0
  for merged in loci.loci:
    counter += 1
    for member in merged.members:
      transcript = member.get_payload()
      if counter not in locus2name:
        locus2name[counter] = set()
      locus2name[counter].add(transcript)
      name2locus[transcript] = counter
  return [locus2name, name2locus]
def main(): parser = argparse.ArgumentParser(description="Filter a genepred by transcript length") parser.add_argument('input',help="Input '-' for STDOUT") parser.add_argument('--min_length',type=int,help="Minimum transcript length") parser.add_argument('--max_length',type=int,help="Maximum transcript length") parser.add_argument('--names',help="filter on a name list") parser.add_argument('--gene_names',help="filter on a gene name list") parser.add_argument('-v','--invert',action='store_true',help='Invert search result') args = parser.parse_args() name_list = set() gene_name_list = set() if args.names: with open(args.names) as inf: for line in inf: f = line.rstrip().split("\t") name_list.add(f[0]) if args.gene_names: with open(args.gene_names) as inf: for line in inf: f = line.rstrip().split("\t") gene_name_list.add(f[0]) inf = sys.stdin if args.input != '-': inf = open(args.input) for line in inf: if re.match('^#',line): continue is_good = True g = GPD(line.rstrip()) tot = g.length() if args.min_length: if tot < args.min_length: is_good = False if args.max_length: if tot > args.max_length: is_good = False if args.names: if g.value('name') not in name_list: is_good = False if args.gene_names: if g.value('gene_name') not in args.gene_name_list: is_good = False # If we are still here we can print if not args.invert: if is_good: print line.rstrip() else: if not is_good: print line.rstrip()
def main():
  """Report alignability of every genepred entry.

  For each transcript, slides a window of --fragment_size over its spliced
  sequence, streams the fragments as FASTA into a `hisat` alignment process,
  and pipes hisat's SAM output into genepred_counts_to_mappability.py, which
  produces the final report.
  """
  parser = argparse.ArgumentParser(description="For every genepred entry report its alignability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r','--reference',required=True,help="Reference fasta")
  parser.add_argument('-k','--fragment_size',default=100,type=int,help="Fragment size to try to align")
  parser.add_argument('-x','--hisat_index',required=True,help="HISAT index base name")
  parser.add_argument('--threads',type=int,default=cpu_count(),help="number of threads")
  parser.add_argument('--type',choices=['mean','median'],default='mean',help="How to bring together overlapping reads")
  parser.add_argument('--perbase',action='store_true')
  parser.add_argument('--output','-o',help="output file or leave unset for STDOUT")
  args = parser.parse_args()
  # Rebind args.input from a path string to an open file handle.
  if args.input=='-': args.input=sys.stdin
  elif re.search('\.gz$',args.input): args.input = gzip.open(args.input)
  else: args.input = open(args.input)
  # The downstream summarizer lives next to this script.
  udir = os.path.dirname(os.path.realpath(__file__))
  cmd2 = udir+'/genepred_counts_to_mappability.py -'
  cmd2 += ' --threads '+str(args.threads)
  cmd2 += ' -k '+str(args.fragment_size)
  if args.perbase: cmd2 += ' --perbase'
  if args.output: cmd2 += ' --output '+args.output
  if args.type: cmd2 += ' --type '+args.type
  # Pipeline: this process -> (FASTA) -> hisat (p1) -> (SAM) -> summarizer (p2).
  p2 = Popen(cmd2.split(),stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  cmd1 = 'hisat -x '+args.hisat_index+' -U - -f --reorder -p '+str(args.threads)
  # NOTE(review): `null` is not defined in this function — presumably a
  # module-level handle to os.devnull used to silence hisat's stderr; confirm.
  p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin,stderr=null)
  #p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin)
  line_number = 0
  for line in args.input:
    line_number +=1
    gpd = GPD(line.rstrip())
    #print gpd.entry['name']
    #print gpd.length()
    # Transcripts shorter than one fragment cannot produce any window.
    if gpd.length() < args.fragment_size: continue
    seq = gpd.get_sequence(ref)
    for i in range(0,len(seq)-args.fragment_size+1):
      # Encode provenance (name, gene, line, length, offset) into the read
      # name so the summarizer can attribute each alignment.
      info = gpd.value('name')+"\t"+gpd.value('gene_name')+"\t"+str(line_number)+"\t"+str(len(seq))+"\t"+str(i)
      einfo = encode_name(info)
      p1.stdin.write('>'+einfo+"\n")
      p1.stdin.write(seq[i:i+args.fragment_size]+"\n")
  # Close stdin and wait for both pipeline stages to drain.
  p1.communicate()
  p2.communicate()
def main(): parser = argparse.ArgumentParser() parser.add_argument('reference_genome') parser.add_argument('transcripts_genepred') parser.add_argument('--out_gpd', help="fusion genepred", required=True) parser.add_argument('--out_fasta', help="fusion fasta", required=True) parser.add_argument( '--fusion_count', type=int, default=1000, help="Create this many fusions, max is number of genes/2.") args = parser.parse_args() ref = read_fasta_into_hash(args.reference_genome) of_gpd = open(args.out_gpd, 'w') of_fasta = open(args.out_fasta, 'w') genes = {} with open(args.transcripts_genepred) as inf: for line in inf: gpd = GPD(line.rstrip()) if gpd.value('exonCount') <= 1: continue if gpd.value('gene_name') not in genes: genes[gpd.value('gene_name')] = [] genes[gpd.value('gene_name')].append(gpd) gene_names = genes.keys() fusion_count = args.fusion_count shuffle(gene_names) pairs = [] while True: if len(pairs) == fusion_count: break if len(gene_names) < 2: break pair = [gene_names[0], gene_names[1]] pairs.append(pair) gene_names.pop(0) gene_names.pop(0) for pair in pairs: [gpds, ars] = get_random_gpds_from_pair(pair, genes, ref) print ars.name of_fasta.write(ars.get_fasta()) for gpd in gpds: of_gpd.write(gpd + "\n") of_gpd.close() of_fasta.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('reference_genome') parser.add_argument('transcripts_genepred') parser.add_argument('--out_gpd',help="fusion genepred",required=True) parser.add_argument('--out_fasta',help="fusion fasta",required=True) parser.add_argument('--fusion_count',type=int,default=1000,help="Create this many fusions, max is number of genes/2.") args = parser.parse_args() ref = read_fasta_into_hash(args.reference_genome) of_gpd = open(args.out_gpd,'w') of_fasta = open(args.out_fasta,'w') genes = {} with open(args.transcripts_genepred) as inf: for line in inf: gpd = GPD(line.rstrip()) if gpd.value('exonCount') <= 1: continue if gpd.value('gene_name') not in genes: genes[gpd.value('gene_name')] = [] genes[gpd.value('gene_name')].append(gpd) gene_names = genes.keys() fusion_count = args.fusion_count shuffle(gene_names) pairs = [] while True: if len(pairs) == fusion_count: break if len(gene_names) < 2: break pair = [gene_names[0],gene_names[1]] pairs.append(pair) gene_names.pop(0) gene_names.pop(0) for pair in pairs: [gpds,ars] = get_random_gpds_from_pair(pair,genes,ref) print ars.name of_fasta.write(ars.get_fasta()) for gpd in gpds: of_gpd.write(gpd+"\n") of_gpd.close() of_fasta.close()
def load_from_inputs(args):
  """Build a biallelic transcriptome emitter from a reference genome,
  phased VCF, and transcript genepred.

  args.inputs is positional: [0]=reference fasta, [1]=phased VCF,
  [2]=transcripts genepred. Also reads args.threads, args.locus_by_gene_name,
  the expression options (isoform/cufflinks/uniform), args.seed, and the
  ASE options. Returns a configured RandomBiallelicTranscriptomeEmitter.
  """
  #Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  # alleles[chrom][pos] -> phased genotype for that SNP
  alleles = {}
  #with open(args.phased_VCF) as inf:
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue  # only phased genotypes are usable
      if vcf.value('chrom') not in alleles:
        alleles[vcf.value('chrom')] = {}
      if vcf.value('pos') in alleles[vcf.value('chrom')]:
        # Duplicate positions are warned about but the later one wins.
        sys.stderr.write("WARNING: seeing the same position twice.\n" + line.rstrip() + "\n")
      alleles[vcf.value('chrom')][vcf.value('pos')] = g # set our left and right
  sys.stderr.write("Reading in the reference genome\n")
  #ref = read_fasta_into_hash(args.reference_genome)
  ref = read_fasta_into_hash(args.inputs[0])
  # res1/res2 hold per-chromosome results for haplotype 0 and 1; each element
  # supports .get() (either a Queue pre-loaded with the answer, or a Pool
  # AsyncResult), so both code paths are drained the same way below.
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intesnive to so don't go with all possible threads
  if args.threads > 1:
    p = Pool(processes=max(1, int(args.threads / 4)))
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      # No SNPs on this chromosome: both haplotypes are the reference as-is.
      r1q = Queue()
      r1q.put([0, chrom, ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0, chrom, ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome, args=(alleles[chrom], ref[chrom], 0, chrom)))
      res2.append(p.apply_async(adjust_reference_genome, args=(alleles[chrom], ref[chrom], 1, chrom)))
    else:
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()
  # now we can fill reference 1 with all our new sequences
  # Each result is [change_count, chrom, sequence].
  ref1 = {}
  c1 = 0
  for i in range(0, len(res1)):
    res = res1[i].get()
    c1 += res[0]
    ref1[res[1]] = res[2]
  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for i in range(0, len(res2)):
    res = res2[i].get()
    c2 += res[0]
    ref2[res[1]] = res[2]
  sys.stderr.write("Made " + str(c1) + "|" + str(c2) + " changes to the reference\n")
  # Now ref1 and ref2 have are the diploid sources of the transcriptome
  gpdnames = {}  # transcript name -> gene name
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  #with open(args.transcripts_genepred) as inf:
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0] == '#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we dont' really need the references anymore
  # Empty our big memory things
  txn1.ref_hash = None
  txn2.ref_hash = None
  for chrom in ref1.keys(): del ref1[chrom]
  for chrom in ref2.keys(): del ref2[chrom]
  for chrom in ref.keys(): del ref[chrom]
  if not args.locus_by_gene_name:
    #[locus2name,name2locus] = get_loci(args.transcripts_genepred)
    [locus2name, name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}  # gene name -> locus number
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m += 1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    # NOTE(review): placed inside the else-branch since get_loci() prints its
    # own "Ended with ... loci" message — confirm against the original layout.
    sys.stderr.write("Ended with " + str(len(locus2name.keys())) + " loci\n")
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()  # skip header
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0], float(f[1]))
        txn2.add_expression(f[0], float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()  # skip header
      for line in inf:
        cuffz += 1
        sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        # Column 9 of cufflinks isoform tracking is FPKM; batch-update after.
        txn1.add_expression_no_update(f[0], float(f[9]))
        txn2.add_expression_no_update(f[0], float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {}  # locus number -> shared random rho
  if args.seed:
    random.seed(args.seed)
  for z in locus2name:
    randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Lets set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    # `or == 0` keeps an explicit rho of 0.0 from being treated as unset.
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  #Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
def main():
  """Compare short-read coverage across exonic, intronic and intergenic space.

  Builds exon/intron/intergenic windows from a genepred file, then streams
  per-region read depth (via the sam_to_bed_depth.py helper on the BAM) and
  accumulates per-window depth lists for reporting via display().
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer',default=10000,type=int)
  parser.add_argument('--window_size',default=10000,type=int)
  parser.add_argument('--bin_size',default=1000,type=int)
  parser.add_argument('--use_off_regions',action='store_true',help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons',action='store_true')
  args = parser.parse_args()
  chr_beds = {}    # chrom -> one Bed spanning everything seen on that chrom
  gene_beds = []   # whole-gene ranges
  exon_beds = []   # individual exon ranges
  sys.stderr.write("Reading genepred file\n")
  asum = 0  # total transcript length, for the average
  atot = 0  # transcript count
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      # Grow the per-chromosome envelope to cover this gene.
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0,g.get_exon_count()):
        erng = Bed(g.value('chrom'),g.value('exonStarts')[i],g.value('exonEnds')[i])
        exon_beds.append(erng)
  avglen = float(asum)/float(atot)
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds,already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  # Pad genes by the intergenic buffer so near-gene windows don't count as
  # intergenic; clip to chromosome envelopes.
  padded_gene_beds = pad_ranges(gene_beds,args.intergenic_buffer,chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds,already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds,padded_gene_beds,already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds,already_sorted=True)
  intergenic_beds = window_break(intergenic_beds,args.window_size)
  #for i in intergenic_beds: print i.get_range_string()
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  # Introns = gene span minus exons, broken into fixed windows.
  intron_beds = subtract_ranges(gene_beds,exon_beds,already_sorted=True)
  intron_beds = merge_ranges(intron_beds,already_sorted=True)
  intron_beds = window_break(intron_beds,args.window_size)
  sys.stderr.write("Going through short reads\n")
  cmd = "sam_to_bed_depth.py "+args.bam_input
  p = Popen(cmd.split(),stdout=PIPE)
  for x in intron_beds: x.set_payload([]) # payloads are read depths
  for x in intergenic_beds: x.set_payload([]) # payloads are read depths
  for x in exon_beds: x.set_payload([]) # payloads are read depths
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons:
    exon_beds = []  # skip the exon sweep entirely unless requested
  section_count = 0
  # Stream depth records (chrom, start, end, depth), assumed sorted in the
  # same order as the window lists; each list is consumed front-to-back.
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0],int(f[1]),int(f[2]))
    if section_count %100==0: sys.stderr.write(curr.get_range_string()+" \r")
    pseudoreadcount += depth
    if len(exon_beds) > 0:
      # NOTE(review): the index-0 access is evaluated before the emptiness
      # check in this and the two loops below; if curr passes the last
      # window the pop can empty the list and the next cmp would IndexError.
      while curr.cmp(exon_beds[0]) > 0 and len(exon_beds) > 0: # we've passed the region
        v = exon_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        av = average(v)
        exondepth.append(av)
        #print str(av)+" exonic "+v.get_range_string()
      c = curr.cmp(exon_beds[0])
      if c == 0: # overlaps with intron
        # Record the depth once per overlapped base of the window.
        size = curr.overlap_size(exon_beds[0])
        for i in range(0,size): exon_beds[0].get_payload().append(depth)
    if len(intron_beds) > 0:
      while curr.cmp(intron_beds[0]) > 0 and len(intron_beds) > 0: # we've passed the region
        v = intron_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        av = average(v)
        introndepth.append(av)
        #print str(av)+" intronic "+v.get_range_string()
      c = curr.cmp(intron_beds[0])
      if c == 0: # overlaps with intron
        size = curr.overlap_size(intron_beds[0])
        for i in range(0,size): intron_beds[0].get_payload().append(depth)
    if len(intergenic_beds) > 0:
      while curr.cmp(intergenic_beds[0]) > 0 and len(intergenic_beds) > 0: # we've passed the region
        v = intergenic_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        av = average(v)
        intergenicdepth.append(av)
        # Progress report each time an intergenic window is finalized.
        display(curr,introndepth,intergenicdepth,pseudoreadcount,avglen)
        #print str(av)+" intergenic "+v.get_range_string()
      c = curr.cmp(intergenic_beds[0])
      if c == 0: # overlaps with intron
        size = curr.overlap_size(intergenic_beds[0])
        for i in range(0,size): intergenic_beds[0].get_payload().append(depth)
      #if c > 0: # we passed the intron
      #  v = intergenic_beds.pop(0)
      #  av = average(v)
      #  intergenicdepth.append(av)
      #  print str(av)+" intergenic "+v.get_range_string()
  if args.use_off_regions:
    # Flush windows never reached by the stream. NOTE(review): leftover
    # exon windows are appended to introndepth (not exondepth), and here
    # average() is applied to the payload list while the loops above apply
    # it to the Bed itself — both look suspicious; confirm intent.
    for x in exon_beds: introndepth.append(average(x.get_payload()))
    for x in intron_beds: introndepth.append(average(x.get_payload()))
    for x in intergenic_beds: intergenicdepth.append(average(x.get_payload()))
  p.communicate()
def main():
  """Rename redundant gene and transcript entries in a GenePred file.

  Three passes: (1) drop all-but-one entry sharing an identical exon
  structure (keeping the one from the largest gene family), (2) uniquify
  duplicated transcript names as name[i/n], (3) split same-named genes
  that sit at distinct loci into gene[i/n]. Report files are written next
  to --output when it is given.
  """
  parser = argparse.ArgumentParser(description="Rename gene and transcript elements of GenePred file that are redundant. Please specify an output if you would like report files generated for the filters.")
  parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
  parser.add_argument('-o', '--output', help="OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated")
  parser.add_argument('--minimum_locus_distance', type=int, default=500000, help="Genes with the same name will be renamed if this far apart")
  parser.add_argument('--keep_positional_duplicates', action='store_true', help="By default we remove one of the duplicate entries")
  parser.add_argument('--keep_transcript_names', action='store_true', help="By default we rename duplicated transcript names")
  parser.add_argument('--keep_gene_names', action='store_true', help="By default we rename genes located at different loci.")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  of = sys.stdout
  if args.output:
    of = open(args.output, 'w')
  txdef = {}   # exact exon-structure key -> entries sharing it
  gfams = {}   # gene name -> transcript names in that gene family
  for line in inf:
    if line[0] == '#':
      continue
    g = GenePredEntry(line)
    # Positional identity key: chrom, all exon bounds, and strand.
    loc = g.value('chrom') + ':' + ','.join([str(x) for x in g.value('exonStarts')]) + '-' + ','.join([str(x) for x in g.value('exonEnds')]) + '/' + g.value('strand')
    if loc not in txdef:
      txdef[loc] = []
    txdef[loc].append(g)
    if g.value('gene_name') not in gfams:
      gfams[g.value('gene_name')] = []
    gfams[g.value('gene_name')].append(g.value('name'))
  # now we have cataloged all transcripts by unique locations
  omissions = []
  keepers = []
  for loc in sorted(txdef.keys()):
    if args.keep_positional_duplicates: # We don't want to ommit anything here
      for g in txdef[loc]:
        keepers.append(g)
      continue #basically skipping this part by populating keepers
    num = len(txdef[loc])
    if num > 1:
      sys.stderr.write("Found " + str(num) + " entries at location\n")
      sys.stderr.write(loc + "\n")
      sys.stderr.write("They are:\n")
      # Keep the duplicate belonging to the biggest gene family.
      largest = 0
      keepgene = None
      keepindex = -1
      i = 0
      for e in txdef[loc]:
        famsize = len(gfams[e.value('gene_name')])
        sys.stderr.write("  " + e.value('gene_name') + "\t" + e.value('name') + "\t" + str(famsize) + "\n")
        if famsize > largest:
          keepgene = e
          largest = famsize
          keepindex = i
        i += 1
      for j in range(0, len(txdef[loc])):
        if j != keepindex:
          omissions.append(txdef[loc][j])
        else:
          keepers.append(txdef[loc][j])
      sys.stderr.write("  Biggest gene family is " + keepgene.value('gene_name') + " with " + str(largest) + " transcripts\n")
      sys.stderr.write("  so keep that one.\n")
    else:
      keepers.append(txdef[loc][0])
  sys.stderr.write("Omitting " + str(len(omissions)) + " entries for redundant positions\n")
  if args.output and not args.keep_positional_duplicates:
    of1 = open(args.output + '.positional_duplicate_omissions', 'w')
    for g in omissions:
      of1.write(g.get_line() + "\n")
    of1.close()
  # Now the keepers contains transcripts with unique locations
  # Lets provide unique names to remaining transcripts
  tnames = {}    # transcript name -> entries carrying it
  renametx = {}  # new transcript name -> original name
  for g in keepers:
    tx = g.value('name')
    if tx not in tnames:
      tnames[tx] = []
    tnames[tx].append(g)
  for name in tnames:
    if args.keep_transcript_names:
      continue # We don't want to rename them
    nsize = len(tnames[name])
    if nsize > 1:
      sys.stderr.write("Name: " + name + " has a family of size " + str(nsize) + "\n")
      for i in range(0, len(tnames[name])):
        newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
        renametx[newname] = name
        tnames[name][i].entry['name'] = newname
  sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
  if args.output and not args.keep_transcript_names:
    of1 = open(args.output + '.renamed_transcripts', 'w')
    for name in sorted(renametx.keys()):
      of1.write(name + "\t" + renametx[name] + "\n")
    of1.close()
  #now we need to arrange into gene families
  gnames = {}  # gene name -> entries
  for name in tnames:
    for g in tnames[name]:
      gene = g.value('gene_name')
      if gene not in gnames:
        gnames[gene] = []
      gnames[gene].append(g)
  renamegene = {}  # new gene name -> original gene name
  finished = []
  for gene in gnames:
    if args.keep_gene_names:
      for g in gnames[gene]:
        finished.append(g)
      continue # We don't want to rename genes
    if len(gnames[gene]) == 1:
      finished.append(gnames[gene][0])
      continue
    # Now we need to make sure these genes are really on the same locus.
    loci = Loci()
    loci.set_minimum_distance(args.minimum_locus_distance)
    for g in gnames[gene]:
      r = g.locus_range.copy()
      r.set_payload(g)
      loc = Locus()
      loc.add_member(r)
      loci.add_locus(loc)
    loci.update_loci()
    lcount = len(loci.loci)
    if lcount == 1:
      for g in gnames[gene]:
        finished.append(g)
      continue
    # need to rename some genes
    for i in range(0, lcount):
      newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
      rstr = loci.loci[i].range.get_range_string()
      renamegene[newname] = gene
      sys.stderr.write(newname + "\t" + rstr + "\n")
      for m in loci.loci[i].members:
        m.get_payload().entry['gene_name'] = newname
        finished.append(m.get_payload())
  sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
  # Bug fix: this report was gated on `not args.keep_transcript_names`
  # (copy-paste from the transcript section); the gene-rename report must
  # follow the gene flag, matching how renamegene was populated above.
  if args.output and not args.keep_gene_names:
    of1 = open(args.output + '.renamed_genes', 'w')
    for name in sorted(renamegene.keys()):
      of1.write(name + "\t" + renamegene[name] + "\n")
    of1.close()
  #Now lets resort by genes
  bygene = {}
  for g in finished:
    gene = g.value('gene_name')
    if gene not in bygene:
      bygene[gene] = []
    bygene[gene].append(g)
  for gene in sorted(bygene.keys()):
    for g in bygene[gene]:
      of.write(g.get_line() + "\n")
  of.close()
  inf.close()
def load_from_inputs(args):
  """Construct a RandomBiallelicTranscriptomeEmitter from the three inputs.

  args.inputs = [reference fasta, phased VCF, transcripts genepred].
  Applies phased SNPs to produce two haplotype references, builds one
  Transcriptome per haplotype, assigns loci, loads expression, and draws
  an ASE rho per transcript according to the ASE_* options.
  """
  #Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  # alleles[chrom][pos] -> phased genotype
  alleles = {}
  #with open(args.phased_VCF) as inf:
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue  # unphased sites are skipped
      if vcf.value('chrom') not in alleles:
        alleles[vcf.value('chrom')] = {}
      if vcf.value('pos') in alleles[vcf.value('chrom')]:
        # Later duplicates overwrite earlier ones after this warning.
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      alleles[vcf.value('chrom')][vcf.value('pos')] = g # set our left and right
  sys.stderr.write("Reading in the reference genome\n")
  #ref = read_fasta_into_hash(args.reference_genome)
  ref = read_fasta_into_hash(args.inputs[0])
  # Per-chromosome results for haplotypes 0/1; both Queue and AsyncResult
  # expose .get(), so the drain loops below are branch-agnostic.
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intesnive to so don't go with all possible threads
  if args.threads > 1:
    p = Pool(processes=max(1,int(args.threads/4)))
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      # No SNPs here: both haplotypes keep the reference sequence unchanged.
      r1q = Queue()
      r1q.put([0,chrom,ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0,chrom,ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom)))
      res2.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom)))
    else:
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()
  # now we can fill reference 1 with all our new sequences
  # Results are [change_count, chrom, adjusted_sequence].
  ref1 = {}
  c1 = 0
  for i in range(0,len(res1)):
    res = res1[i].get()
    c1 += res[0]
    ref1[res[1]]=res[2]
  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for i in range(0,len(res2)):
    res = res2[i].get()
    c2 += res[0]
    ref2[res[1]]=res[2]
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")
  # Now ref1 and ref2 have are the diploid sources of the transcriptome
  gpdnames = {}  # transcript name -> gene name
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  #with open(args.transcripts_genepred) as inf:
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0]=='#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we dont' really need the references anymore
  # Empty our big memory things
  txn1.ref_hash = None
  txn2.ref_hash = None
  for chrom in ref1.keys(): del ref1[chrom]
  for chrom in ref2.keys(): del ref2[chrom]
  for chrom in ref.keys(): del ref[chrom]
  if not args.locus_by_gene_name:
    #[locus2name,name2locus] = get_loci(args.transcripts_genepred)
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}  # gene name -> assigned locus number
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m+=1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    # NOTE(review): kept inside the else-branch because get_loci() already
    # reports its own locus count — confirm against the original layout.
    sys.stderr.write("Ended with "+str(len(locus2name.keys()))+" loci\n")
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()  # header row is discarded
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()  # header row is discarded
      for line in inf:
        cuffz +=1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        # Column index 9 holds the expression value; update once at the end.
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {}  # locus number -> shared random rho
  if args.seed:
    random.seed(args.seed)
  for z in locus2name:
    randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Lets set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    # The `== 0` clause lets an explicit rho of 0.0 take effect.
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  #Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
def main():
  """Rename redundant gene/transcript elements of a GenePred file.

  Three passes over the input:
    1. Entries with an identical exon structure (chrom:starts-ends/strand)
       are de-duplicated unless --keep_positional_duplicates; the survivor is
       the one whose gene has the largest transcript family.
    2. Duplicated transcript names are renamed name[i/n] unless
       --keep_transcript_names.
    3. Genes whose transcripts fall into multiple loci (further apart than
       --minimum_locus_distance) are renamed gene[i/n] unless
       --keep_gene_names.
  When --output is set, report files (<output>.positional_duplicate_omissions,
  .renamed_transcripts, .renamed_genes) are written for the enabled filters.
  """
  parser = argparse.ArgumentParser(description="Rename gene and transcript elements of GenePred file that are redundant. Please specify an output if you would like report files generated for the filters.")
  parser.add_argument('input',help="GENEPREDFILE or '-' for STDIN")
  parser.add_argument('-o','--output',help="OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated")
  parser.add_argument('--minimum_locus_distance',type=int,default=500000,help="Genes with the same name will be renamed if this far apart")
  parser.add_argument('--keep_positional_duplicates',action='store_true',help="By default we remove one of the duplicate entries")
  parser.add_argument('--keep_transcript_names',action='store_true',help="By default we rename duplicated transcript names")
  parser.add_argument('--keep_gene_names',action='store_true',help="By default we rename genes located at different loci.")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input != '-': inf = open(args.input)
  of = sys.stdout
  if args.output: of = open(args.output,'w')
  # txdef: unique exon-structure string -> list of GenePredEntry at that exact location
  # gfams: gene_name -> list of transcript names (family size is used to break ties)
  txdef = {}
  gfams = {}
  for line in inf:
    if line[0] == '#': continue
    g = GenePredEntry(line)
    loc = g.value('chrom') + ':' +','.join([str(x) for x in g.value('exonStarts')]) + '-' + ','.join([str(x) for x in g.value('exonEnds')])+'/'+g.value('strand')
    if loc not in txdef: txdef[loc] = []
    txdef[loc].append(g)
    if g.value('gene_name') not in gfams: gfams[g.value('gene_name')] = []
    gfams[g.value('gene_name')].append(g.value('name'))
  # now we have cataloged all transcripts by unique locations
  omissions = []
  keepers = []
  for loc in sorted(txdef.keys()):
    if args.keep_positional_duplicates: # We don't want to ommit anything here
      for g in txdef[loc]: keepers.append(g)
      continue #basically skipping this part by populating keepers
    num = len(txdef[loc])
    if num > 1:
      sys.stderr.write("Found "+str(num)+" entries at location\n")
      sys.stderr.write(loc +"\n")
      sys.stderr.write("They are:\n")
      # keep the entry whose gene has the most transcripts overall
      largest = 0
      keepgene = None
      keepindex = -1
      i = 0
      for e in txdef[loc]:
        famsize = len(gfams[e.value('gene_name')])
        sys.stderr.write(" "+e.value('gene_name')+"\t"+e.value('name')+"\t"+str(famsize)+"\n")
        if famsize > largest:
          keepgene = e
          largest = famsize
          keepindex = i
        i+=1
      for j in range(0,len(txdef[loc])):
        if j != keepindex: omissions.append(txdef[loc][j])
        else: keepers.append(txdef[loc][j])
      sys.stderr.write(" Biggest gene family is "+keepgene.value('gene_name')+" with "+str(largest)+" transcripts\n")
      sys.stderr.write(" so keep that one.\n")
    else:
      keepers.append(txdef[loc][0])
  sys.stderr.write("Omitting "+str(len(omissions))+" entries for redundant positions\n")
  if args.output and not args.keep_positional_duplicates:
    of1 = open(args.output+'.positional_duplicate_omissions','w')
    for g in omissions: of1.write(g.get_line()+"\n")
    of1.close()
  # Now the keepers contains transcripts with unique locations
  # Lets provide unique names to remaining transcripts
  tnames = {}
  renametx = {}  # new name -> original name
  for g in keepers:
    tx = g.value('name')
    if tx not in tnames: tnames[tx] = []
    tnames[tx].append(g)
  for name in tnames:
    if args.keep_transcript_names: continue # We don't want to rename them
    nsize = len(tnames[name])
    if nsize > 1:
      sys.stderr.write("Name: "+name+" has a family of size "+str(nsize)+"\n")
      for i in range(0,len(tnames[name])):
        newname = name+'['+str(i+1)+'/'+str(nsize)+']'
        renametx[newname] = name
        tnames[name][i].entry['name'] = newname
  sys.stderr.write("Renamed: "+str(len(renametx))+" transcripts\n")
  if args.output and not args.keep_transcript_names:
    of1 = open(args.output+'.renamed_transcripts','w')
    for name in sorted(renametx.keys()):
      of1.write(name+"\t"+renametx[name]+"\n")
    of1.close()
  #now we need to arrange into gene families
  gnames = {}
  for name in tnames:
    for g in tnames[name]:
      gene = g.value('gene_name')
      if gene not in gnames: gnames[gene] = []
      gnames[gene].append(g)
  renamegene = {}  # new gene name -> original gene name
  finished = []
  for gene in gnames:
    if args.keep_gene_names:
      for g in gnames[gene]: finished.append(g)
      continue # We don't want to rename genes
    if len(gnames[gene])==1:
      finished.append(gnames[gene][0])
      continue
    # Now we need to make sure these genes are really on the same locus.
    loci = Loci()
    loci.set_minimum_distance(args.minimum_locus_distance)
    for g in gnames[gene]:
      r = g.locus_range.copy()
      r.set_payload(g)
      loc = Locus()
      loc.add_member(r)
      loci.add_locus(loc)
    loci.update_loci()
    lcount = len(loci.loci)
    if lcount == 1:
      for g in gnames[gene]: finished.append(g)
      continue
    # need to rename some genes
    for i in range(0,lcount):
      newname = gene+'['+str(i+1)+'/'+str(lcount)+']'
      rstr = loci.loci[i].range.get_range_string()
      renamegene[newname] = gene
      sys.stderr.write(newname+"\t"+rstr+"\n")
      for m in loci.loci[i].members:
        m.get_payload().entry['gene_name'] = newname
        finished.append(m.get_payload())
  sys.stderr.write("Renamed: "+str(len(renamegene))+" genes\n")
  # BUGFIX: the gene-rename report was gated on keep_transcript_names;
  # it must follow the gene-rename flag so the report matches what happened.
  if args.output and not args.keep_gene_names:
    of1 = open(args.output+'.renamed_genes','w')
    for name in sorted(renamegene.keys()):
      of1.write(name+"\t"+renamegene[name]+"\n")
    of1.close()
  #Now lets resort by genes
  bygene = {}
  for g in finished:
    gene = g.value('gene_name')
    if gene not in bygene: bygene[gene] = []
    bygene[gene].append(g)
  for gene in sorted(bygene.keys()):
    for g in bygene[gene]:
      of.write(g.get_line()+"\n")
  of.close()
  inf.close()
def main():
  """Estimate read-depth in exonic, intronic, and intergenic windows.

  Builds gene/exon ranges from a GenePred file, derives intergenic windows
  (genes padded by --intergenic_buffer, subtracted from chromosome extents)
  and intron windows, then streams per-base depth from sam_to_bed_depth.py
  over the BAM and accumulates average depth per window.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer', default=10000, type=int)
  parser.add_argument('--window_size', default=10000, type=int)
  parser.add_argument('--bin_size', default=1000, type=int)
  parser.add_argument(
      '--use_off_regions',
      action='store_true',
      help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons', action='store_true')
  args = parser.parse_args()
  chr_beds = {}   # chrom -> Bed covering the full observed extent of that chrom
  gene_beds = []
  exon_beds = []
  sys.stderr.write("Reading genepred file\n")
  asum = 0  # total transcript length, for average length
  atot = 0  # transcript count
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0, g.get_exon_count()):
        erng = Bed(g.value('chrom'),
                   g.value('exonStarts')[i],
                   g.value('exonEnds')[i])
        exon_beds.append(erng)
  avglen = float(asum) / float(atot)
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds, already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges([chr_beds[x] for x in chr_beds.keys()])
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds, padded_gene_beds,
                                    already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
  intergenic_beds = window_break(intergenic_beds, args.window_size)
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
  intron_beds = merge_ranges(intron_beds, already_sorted=True)
  intron_beds = window_break(intron_beds, args.window_size)
  sys.stderr.write("Going through short reads\n")
  cmd = "sam_to_bed_depth.py " + args.bam_input
  p = Popen(cmd.split(), stdout=PIPE)
  for x in intron_beds:
    x.set_payload([])  # payloads are read depths
  for x in intergenic_beds:
    x.set_payload([])  # payloads are read depths
  for x in exon_beds:
    x.set_payload([])  # payloads are read depths
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons:
    exon_beds = []  # skip exon accounting entirely unless requested
  section_count = 0
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0], int(f[1]), int(f[2]))
    if section_count % 100 == 0:
      sys.stderr.write(curr.get_range_string() + " \r")
    pseudoreadcount += depth
    # For each region class: flush windows curr has moved past, then add
    # depth to the window curr overlaps (if any).
    # BUGFIX (all three drains): the emptiness check must short-circuit
    # BEFORE indexing [0], otherwise draining the list raises IndexError;
    # likewise the cmp after the drain must be guarded.
    if len(exon_beds) > 0:
      while len(exon_beds) > 0 and curr.cmp(exon_beds[0]) > 0:
        # we've passed the region
        v = exon_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        # NOTE(review): here average() is fed the Bed itself, while the
        # final flush below feeds it the payload list -- confirm average()
        # accepts both.
        av = average(v)
        exondepth.append(av)
      if len(exon_beds) > 0:
        c = curr.cmp(exon_beds[0])
        if c == 0:  # overlaps with exon
          size = curr.overlap_size(exon_beds[0])
          for i in range(0, size):
            exon_beds[0].get_payload().append(depth)
    if len(intron_beds) > 0:
      while len(intron_beds) > 0 and curr.cmp(intron_beds[0]) > 0:
        # we've passed the region
        v = intron_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        av = average(v)
        introndepth.append(av)
      if len(intron_beds) > 0:
        c = curr.cmp(intron_beds[0])
        if c == 0:  # overlaps with intron
          size = curr.overlap_size(intron_beds[0])
          for i in range(0, size):
            intron_beds[0].get_payload().append(depth)
    if len(intergenic_beds) > 0:
      while len(intergenic_beds) > 0 and curr.cmp(intergenic_beds[0]) > 0:
        # we've passed the region
        v = intergenic_beds.pop(0)
        if len(v.get_payload()) == 0 and not args.use_off_regions: continue
        av = average(v)
        intergenicdepth.append(av)
        # progress/summary report emitted as each intergenic window completes
        display(curr, introndepth, intergenicdepth, pseudoreadcount, avglen)
      if len(intergenic_beds) > 0:
        c = curr.cmp(intergenic_beds[0])
        if c == 0:  # overlaps with intergenic window
          size = curr.overlap_size(intergenic_beds[0])
          for i in range(0, size):
            intergenic_beds[0].get_payload().append(depth)
  if args.use_off_regions:
    # flush regions never reached by the depth stream
    for x in exon_beds:
      # BUGFIX: exon averages were appended to introndepth
      exondepth.append(average(x.get_payload()))
    for x in intron_beds:
      introndepth.append(average(x.get_payload()))
    for x in intergenic_beds:
      intergenicdepth.append(average(x.get_payload()))
  p.communicate()
def main():
  """Report per-entry alignability of a GenePred file.

  Shreds each transcript sequence into overlapping fragments of
  --fragment_size, streams them as FASTA into HISAT, and pipes the
  alignments into genepred_counts_to_mappability.py which produces the
  final mappability report.
  """
  parser = argparse.ArgumentParser(
      description="For every genepred entry report its alignability",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input', help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r', '--reference', required=True,
                      help="Reference fasta")
  parser.add_argument('-k', '--fragment_size', default=100, type=int,
                      help="Fragment size to try to align")
  parser.add_argument('-x', '--hisat_index', required=True,
                      help="HISAT index base name")
  parser.add_argument('--threads', type=int, default=cpu_count(),
                      help="number of threads")
  parser.add_argument('--type', choices=['mean', 'median'], default='mean',
                      help="How to bring together overlapping reads")
  parser.add_argument('--perbase', action='store_true')
  parser.add_argument('--output', '-o',
                      help="output file or leave unset for STDOUT")
  args = parser.parse_args()
  if args.input == '-':
    args.input = sys.stdin
  elif re.search(r'\.gz$', args.input):  # BUGFIX: raw string; '\.' is an invalid escape
    args.input = gzip.open(args.input)
  else:
    args.input = open(args.input)
  # downstream consumer: counts -> mappability, reading from HISAT's stdout
  udir = os.path.dirname(os.path.realpath(__file__))
  cmd2 = udir + '/genepred_counts_to_mappability.py -'
  cmd2 += ' --threads ' + str(args.threads)
  cmd2 += ' -k ' + str(args.fragment_size)
  if args.perbase: cmd2 += ' --perbase'
  if args.output: cmd2 += ' --output ' + args.output
  if args.type: cmd2 += ' --type ' + args.type
  p2 = Popen(cmd2.split(), stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  cmd1 = 'hisat -x ' + args.hisat_index + ' -U - -f --reorder -p ' + str(
      args.threads)
  # NOTE(review): 'null' is not defined in this function; presumably a
  # module-level handle on os.devnull used to silence HISAT -- confirm.
  p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, stderr=null)
  line_number = 0
  for line in args.input:
    line_number += 1
    gpd = GPD(line.rstrip())
    # transcripts shorter than one fragment cannot produce any reads
    if gpd.length() < args.fragment_size: continue
    seq = gpd.get_sequence(ref)
    for i in range(0, len(seq) - args.fragment_size + 1):
      # read name encodes: transcript, gene, input line, seq length, offset
      info = gpd.value('name') + "\t" + gpd.value('gene_name') + "\t" + \
             str(line_number) + "\t" + str(len(seq)) + "\t" + str(i)
      einfo = encode_name(info)
      p1.stdin.write('>' + einfo + "\n")
      p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
  p1.communicate()
  p2.communicate()