#!/usr/bin/env python
import argparse, gzip, os, random, sys
from multiprocessing import Pool, Queue

# Package-local dependencies assumed to be provided by this project's modules:
# VCF, GenePredEntry, Transcriptome, SimulationBasics, read_fasta_into_hash,
# adjust_reference_genome, get_loci, default_illumina, default_pacbio_ccs95,
# default_pacbio_subreads, process_short_read_buffer, process_long_reads,
# do_short, do_long, and the greport/left_handle/right_handle/long_handle
# globals they share.


def load_from_inputs(args):
    # Read in the VCF file
    sys.stderr.write("Reading in the VCF file\n")
    alleles = {}
    #with open(args.phased_VCF) as inf:
    with open(args.inputs[1]) as inf:
        for line in inf:
            vcf = VCF(line)
            if not vcf.is_snp(): continue
            g = vcf.get_phased_genotype()
            if not g: continue
            if vcf.value('chrom') not in alleles:
                alleles[vcf.value('chrom')] = {}
            if vcf.value('pos') in alleles[vcf.value('chrom')]:
                sys.stderr.write("WARNING: seeing the same position twice.\n" + line.rstrip() + "\n")
            alleles[vcf.value('chrom')][vcf.value('pos')] = g  # set our left and right alleles

    sys.stderr.write("Reading in the reference genome\n")
    #ref = read_fasta_into_hash(args.reference_genome)
    ref = read_fasta_into_hash(args.inputs[0])
    res1 = []
    res2 = []
    p = None
    sys.stderr.write("Introducing VCF changes to reference sequences\n")
    # Pretty memory intensive, so don't use all available threads
    if args.threads > 1:
        p = Pool(processes=max(1, int(args.threads / 4)))
    for chrom in ref:
        # handle the case where there is no allele information; a Queue is used
        # as a stand-in for the AsyncResult interface so everything in
        # res1/res2 answers to .get()
        if chrom not in alleles:
            r1q = Queue()
            r1q.put([0, chrom, ref[chrom]])
            res1.append(r1q)
            r2q = Queue()
            r2q.put([0, chrom, ref[chrom]])
            res2.append(r2q)
        elif args.threads > 1:
            res1.append(p.apply_async(adjust_reference_genome, args=(alleles[chrom], ref[chrom], 0, chrom)))
            res2.append(p.apply_async(adjust_reference_genome, args=(alleles[chrom], ref[chrom], 1, chrom)))
        else:
            r1q = Queue()
            r1q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
            res1.append(r1q)
            r2q = Queue()
            r2q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
            res2.append(r2q)
    if args.threads > 1:
        p.close()
        p.join()

    # now we can fill reference 1 with all our new sequences
    ref1 = {}
    c1 = 0
    for i in range(0, len(res1)):
        res = res1[i].get()
        c1 += res[0]
        ref1[res[1]] = res[2]
    # now we can fill reference 2 with all our new sequences
    ref2 = {}
    c2 = 0
    for i in range(0, len(res2)):
        res = res2[i].get()
        c2 += res[0]
        ref2[res[1]] = res[2]
    sys.stderr.write("Made " + str(c1) + "|" + str(c2) + " changes to the reference\n")

    # Now ref1 and ref2 are the diploid sources of the transcriptome
    gpdnames = {}
    txn1 = Transcriptome()
    txn2 = Transcriptome()
    txn1.set_reference_genome_dictionary(ref1)
    txn2.set_reference_genome_dictionary(ref2)
    #with open(args.transcripts_genepred) as inf:
    with open(args.inputs[2]) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn1.add_genepred_line(line.rstrip())
            txn2.add_genepred_line(line.rstrip())
            gpd = GenePredEntry(line.rstrip())
            gpdnames[gpd.value('name')] = gpd.value('gene_name')
    # The transcriptomes are set, but we don't really need the references anymore.
    # Empty our big memory things
    txn1.ref_hash = None
    txn2.ref_hash = None
    ref1.clear()
    ref2.clear()
    ref.clear()

    if not args.locus_by_gene_name:
        #[locus2name, name2locus] = get_loci(args.transcripts_genepred)
        [locus2name, name2locus] = get_loci(args.inputs[2])
    else:  # set locus by gene name
        sys.stderr.write("Organizing loci by gene name\n")
        locus2name = {}
        name2locus = {}
        numname = {}
        m = 0
        for name in sorted(gpdnames):
            gene = gpdnames[name]
            if gene not in numname:
                m += 1
                numname[gene] = m
            num = numname[gene]
            if num not in locus2name:
                locus2name[num] = set()
            locus2name[num].add(name)
            name2locus[name] = num
        sys.stderr.write("Ended with " + str(len(locus2name.keys())) + " loci\n")

    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn1.add_expression(f[0], float(f[1]))
                txn2.add_expression(f[0], float(f[1]))
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        cuffz = 0
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                cuffz += 1
                sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
                f = line.rstrip().split("\t")
                txn1.add_expression_no_update(f[0], float(f[9]))
                txn2.add_expression_no_update(f[0], float(f[9]))
        txn1.update_expression()
        txn2.update_expression()
        sys.stderr.write("\n")
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    else:
        sys.stderr.write("WARNING: isoform expression not specified, using uniform expression model.\n")

    # Now we have the transcriptomes set
    rhos = {}  # The ASE of allele 1 (the left side)
    randos = {}
    if args.seed:
        random.seed(args.seed)
    for z in locus2name:
        randos[z] = random.random()
    sys.stderr.write("Setting rho for each transcript\n")
    # Let's set rho for ASE for each transcript
    for tname in sorted(txn1.transcripts):
        if args.ASE_identical is not None:  # covers an explicit 0 as well
            rhos[tname] = float(args.ASE_identical)
        elif args.ASE_isoform_random:
            rhos[tname] = random.random()
        else:  # we must be on locus random
            rhos[tname] = randos[name2locus[tname]]
    # Now our dataset is set up
    rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
    rbe.gene_names = gpdnames
    rbe.name2locus = name2locus
    rbe.set_transcriptome1_rho(rhos)
    return rbe
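
# load_from_inputs() unpacks each worker result as [change_count, chrom,
# sequence], one result per haplotype (0 = left, 1 = right).  The real
# adjust_reference_genome() ships with this package; the sketch below is a
# hypothetical stand-in that only illustrates that contract, assuming the
# phased genotype stored per position can be indexed by haplotype.
def _example_adjust_reference_genome(alleles, seq, haplotype, chrom):
    bases = list(seq)
    changes = 0
    for pos in alleles:
        new_base = alleles[pos][haplotype]  # assumed (left, right) allele pair
        if bases[pos - 1].upper() != new_base.upper():  # VCF positions are 1-based
            bases[pos - 1] = new_base
            changes += 1
    return [changes, chrom, ''.join(bases)]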
def main():
    parser = argparse.ArgumentParser(description="Create a simulated RNA-seq dataset")
    parser.add_argument('reference_genome', help="The reference genome.")
    parser.add_argument('transcripts_genepred', help="A genepred file describing the transcripts. Each transcript name must be unique.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--uniform_expression', action='store_true', help="Uniform distribution of transcript expression")
    group.add_argument('--isoform_expression', help="The transcript expression in TSV format: <Transcript name> tab <Expression>")
    group.add_argument('--cufflinks_isoform_expression', help="The expression of the isoforms, or - for a uniform distribution of transcript expression")
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--long_reads_only', action='store_true')
    group2.add_argument('--short_reads_only', action='store_true')
    group2.add_argument('--output', help="Directory name for output")
    parser.add_argument('--short_read_count', type=int, default=10000, help="INT number of short reads")
    parser.add_argument('--short_read_length', type=int, default=101, help="INT length of the short reads")
    parser.add_argument('--long_read_count', type=int, default=4000, help="INT default number of long reads")
    parser.add_argument('--no_errors', action='store_true')
    parser.add_argument('--threads', type=int, default=1)
    args = parser.parse_args()
    if args.output:
        args.output = args.output.rstrip('/')

    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    fq_prof_illumina = None
    if not args.no_errors:
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
        fq_prof_illumina = default_illumina()

    ref = read_fasta_into_hash(args.reference_genome)
    txn = Transcriptome()
    txn.set_reference_genome_dictionary(ref)
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn.add_genepred_line(line.rstrip())
    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[1]))
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[9]))
    sys.stderr.write("Loaded the transcriptome\n")
    # the sequences now live in the transcriptome; free the reference copies
    txn.ref_hash.clear()
    rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)

    # Now we have the transcriptome set, and our dataset is set up.
    # The two *_only modes stream a single FASTQ to stdout and return early.
    if args.short_reads_only:
        rbe.set_gaussian_fragmentation_default_hiseq()
        for zi in range(0, args.short_read_count):
            [name, seq] = rbe.emit_short_read(args.short_read_length)
            if args.no_errors:
                print "@SRSIM" + str(zi + 1)
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(seq)
                print "@SRSIM" + str(zi + 1)
                print l1perm['seq']
                print "+"
                print l1perm['qual']
        return
    if args.long_reads_only:
        rbe.set_gaussian_fragmentation_default_pacbio()
        for zi in range(0, args.long_read_count):
            [name, seq] = rbe.emit_long_read()
            g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(zi + 1) + '/ccs'
            if args.no_errors:
                print "@" + g
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(seq)
                print "@" + g
                print seqperm['seq']
                print "+"
                print seqperm['qual']
        return

    # Full run: an output directory is required from here on
    if not args.output:
        parser.error("--output is required unless --short_reads_only or --long_reads_only is used")
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    rbe.set_gaussian_fragmentation_default_hiseq()
    # Let's prepare to output now
    sys.stderr.write("Sequencing short reads\n")
    global left_handle
    global right_handle
    left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    buffer_size = 10000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.short_read_count):
        z = i + 1
        if z % 1000 == 0:
            sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_short_read_buffer(buffer[:], rbe, args, fq_prof_illumina)
                do_short(v)
            else:
                p.apply_async(process_short_read_buffer, args=(buffer[:], rbe, args, fq_prof_illumina), callback=do_short)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_short_read_buffer(buffer[:], rbe, args, fq_prof_illumina)
            do_short(v)
        else:
            p.apply_async(process_short_read_buffer, args=(buffer[:], rbe, args, fq_prof_illumina), callback=do_short)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    global greport
    of = open(args.output + "/SR_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing short reads\n")
    left_handle.close()
    right_handle.close()

    # Now let's create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    sys.stderr.write("Sequencing ccs long reads\n")
    global long_handle
    long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.long_read_count):
        z = i + 1
        if z % 100 == 0:
            sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs')
                do_long(v)
            else:
                p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs'), callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs')
            do_long(v)
        else:
            p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs'), callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    long_handle.close()
    of = open(args.output + "/LR_ccs_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing ccs long reads\n")

    sys.stderr.write("Sequencing long sub reads\n")
    long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    # continue the read numbering where the ccs reads left off
    for i in range(z, z + args.long_read_count):
        z = i + 1
        if z % 100 == 0:
            sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub')
                do_long(v)
            else:
                p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub'), callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub')
            do_long(v)
        else:
            p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub'), callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    long_handle.close()
    of = open(args.output + "/LR_sub_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing long sub reads\n")

    # Merge the three per-transcript reports into one combined report
    combo = {}
    for report in ["/SR_report.txt", "/LR_ccs_report.txt", "/LR_sub_report.txt"]:
        with open(args.output + report) as inf:
            for line in inf:
                f = line.rstrip().split("\t")
                [name, express, left] = f
                if name not in combo:
                    combo[name] = {}
                    combo[name]['express'] = express
                    combo[name]['left'] = 0
                combo[name]['left'] += int(left)
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(combo):
        of.write(name + "\t" + combo[name]['express'] + "\t" + str(combo[name]['left']) + "\n")
    of.close()
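
# Example invocations (the script filename is assumed; every flag below is
# defined in the argparse setup above).  A full run writes paired short reads,
# both long-read sets, and the per-transcript reports into the output
# directory:
#
#   python simulate_reads.py ref.fa transcripts.gpd --uniform_expression \
#       --output sim_out --threads 4
#
# while --short_reads_only (or --long_reads_only) streams one FASTQ to stdout:
#
#   python simulate_reads.py ref.fa transcripts.gpd --uniform_expression \
#       --short_reads_only > short_reads.fq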
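
# Assumed entry point: invoke main() when run as a script.
if __name__ == '__main__':
    main()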