def _collect_adjusted_reference(pending):
    """Resolve per-chromosome results into ``(change_count, sequences)``.

    ``pending`` is a list of ``(is_async, payload)`` pairs.  When ``is_async``
    is true the payload is a pool result whose ``get()`` yields the value;
    otherwise the payload is the value itself.  Each value is indexed as
    ``[number_of_changes, chromosome_name, sequence]``.

    The list is emptied before returning so that it does not keep the
    (potentially very large) sequences alive after the caller drops them.
    """
    total_changes = 0
    sequences = {}
    for is_async, payload in pending:
        res = payload.get() if is_async else payload
        total_changes += res[0]
        sequences[res[1]] = res[2]
    del pending[:]
    return total_changes, sequences


def load_from_inputs(args):
    """Build a biallelic transcriptome emitter from raw input files.

    Reads from ``args``:
      * ``inputs[0]`` -- reference genome FASTA
      * ``inputs[1]`` -- phased VCF (only SNPs with a phased genotype are used)
      * ``inputs[2]`` -- genepred file describing the transcripts
      * ``threads`` -- when > 1, chromosomes are adjusted in a process pool
      * ``locus_by_gene_name`` -- group transcripts into loci by gene name
        instead of calling ``get_loci``
      * ``isoform_expression`` / ``cufflinks_isoform_expression`` /
        ``uniform_expression`` -- source of transcript expression
      * ``seed`` -- seed for the ``random`` module
      * ``ASE_identical`` / ``ASE_isoform_random`` -- how rho is chosen

    Returns the ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` with
    ``gene_names``, ``name2locus`` and the transcriptome-1 rho values set.
    """
    # Read in the VCF file: alleles[chrom][pos] = phased genotype
    sys.stderr.write("Reading in the VCF file\n")
    alleles = {}
    with open(args.inputs[1]) as inf:
        for line in inf:
            vcf = VCF(line)
            if not vcf.is_snp():
                continue
            g = vcf.get_phased_genotype()
            if not g:
                continue
            chrom = vcf.value('chrom')
            pos = vcf.value('pos')
            positions = alleles.setdefault(chrom, {})
            if pos in positions:
                sys.stderr.write("WARNING: seeing the same position twice.\n" +
                                 line.rstrip() + "\n")
            positions[pos] = g  # set our left and right (last one wins)

    sys.stderr.write("Reading in the reference genome\n")
    ref = read_fasta_into_hash(args.inputs[0])
    sys.stderr.write("Introducing VCF changes to reference sequences\n")
    use_pool = args.threads > 1
    p = None
    # Pretty memory intensive so don't go with all possible threads
    if use_pool:
        p = Pool(processes=max(1, int(args.threads / 4)))
    pending1 = []  # results for allele 0, in reference order
    pending2 = []  # results for allele 1, in reference order
    for chrom in ref:
        if chrom not in alleles:
            # No allele information: both haplotypes are the plain reference
            pending1.append((False, [0, chrom, ref[chrom]]))
            pending2.append((False, [0, chrom, ref[chrom]]))
        elif use_pool:
            pending1.append((True, p.apply_async(
                adjust_reference_genome,
                args=(alleles[chrom], ref[chrom], 0, chrom))))
            pending2.append((True, p.apply_async(
                adjust_reference_genome,
                args=(alleles[chrom], ref[chrom], 1, chrom))))
        else:
            pending1.append((False, adjust_reference_genome(
                alleles[chrom], ref[chrom], 0, chrom)))
            pending2.append((False, adjust_reference_genome(
                alleles[chrom], ref[chrom], 1, chrom)))
    if use_pool:
        p.close()
        p.join()

    # Fill reference 1 and reference 2 with the new sequences
    c1, ref1 = _collect_adjusted_reference(pending1)
    c2, ref2 = _collect_adjusted_reference(pending2)
    sys.stderr.write("Made " + str(c1) + "|" + str(c2) +
                     " changes to the reference\n")

    # Now ref1 and ref2 are the diploid sources of the transcriptome
    gpdnames = {}  # transcript name -> gene name
    txn1 = Transcriptome()
    txn2 = Transcriptome()
    txn1.set_reference_genome_dictionary(ref1)
    txn2.set_reference_genome_dictionary(ref2)
    with open(args.inputs[2]) as inf:
        for line in inf:
            if line[0] == '#':
                continue
            entry = line.rstrip()
            txn1.add_genepred_line(entry)
            txn2.add_genepred_line(entry)
            gpd = GenePredEntry(entry)
            gpdnames[gpd.value('name')] = gpd.value('gene_name')

    # The transcriptomes are set and the references are no longer needed:
    # empty the big in-memory structures.  The keys are snapshotted (or the
    # dict cleared) so this also works where keys() is a live view.
    txn1.ref_hash = None
    txn2.ref_hash = None
    ref1.clear()
    ref2.clear()
    for chrom in list(ref.keys()):
        del ref[chrom]

    if not args.locus_by_gene_name:
        [locus2name, name2locus] = get_loci(args.inputs[2])
    else:  # set locus by gene name
        sys.stderr.write("Organizing loci by gene name\n")
        locus2name = {}
        name2locus = {}
        numname = {}  # gene name -> locus number, numbered in sorted order
        m = 0
        for name in sorted(gpdnames):
            gene = gpdnames[name]
            if gene not in numname:
                m += 1
                numname[gene] = m
            num = numname[gene]
            locus2name.setdefault(num, set()).add(name)
            name2locus[name] = num
        sys.stderr.write("Ended with " + str(len(locus2name.keys())) +
                         " loci\n")

    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn1.add_expression(f[0], float(f[1]))
                txn2.add_expression(f[0], float(f[1]))
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        cuffz = 0
        with open(args.cufflinks_isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                cuffz += 1
                sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
                f = line.rstrip().split("\t")
                # column 10 of the cufflinks table is used as the expression
                txn1.add_expression_no_update(f[0], float(f[9]))
                txn2.add_expression_no_update(f[0], float(f[9]))
        txn1.update_expression()
        txn2.update_expression()
        sys.stderr.write("\n")
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    else:
        sys.stderr.write(
            "Warning isoform expression not sepcified, using uniform expression model.\n"
        )

    # Now we have the transcriptomes set
    rhos = {}  # The ASE of allele 1 (the left side)
    randos = {}  # one random rho per locus
    # 'is not None' so that an explicit seed of 0 is honoured.
    # NOTE(review): assumes the seed option defaults to None -- confirm in
    # the argument parser.
    if args.seed is not None:
        random.seed(args.seed)
    # Always drawn, even if unused below, so the random stream is stable
    for z in locus2name:
        randos[z] = random.random()
    sys.stderr.write("Setting rho for each transcript\n")
    for tname in sorted(txn1.transcripts):
        if args.ASE_identical or args.ASE_identical == 0:
            rhos[tname] = float(args.ASE_identical)
        elif args.ASE_isoform_random:
            rhos[tname] = random.random()
        else:  # we must be on locus random
            rhos[tname] = randos[name2locus[tname]]

    # Now our dataset is set up
    rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
    rbe.gene_names = gpdnames
    rbe.name2locus = name2locus
    rbe.set_transcriptome1_rho(rhos)
    return rbe
コード例 #2
0
def _write_fastq_to_stdout(read_name, seq, qual):
  """Write one four-line FASTQ record to standard output."""
  sys.stdout.write("@%s\n%s\n+\n%s\n" % (read_name, seq, qual))


def _run_in_batches(read_numbers, batch_size, progress_every, threads,
                    worker, extra_args, callback):
  """Hand read numbers to ``worker`` in batches and pass results to ``callback``.

  With ``threads`` <= 1 each batch is processed inline; otherwise batches are
  submitted to a process pool of ``threads`` workers and ``callback`` is
  invoked by the pool for every finished batch.  ``worker`` is called as
  ``worker(batch, *extra_args)``.  A progress counter is written to stderr
  for every read number divisible by ``progress_every``.
  """
  pool = Pool(processes=threads) if threads > 1 else None

  def submit(batch):
    if pool is None:
      callback(worker(batch, *extra_args))
    else:
      pool.apply_async(worker, args=(batch,) + tuple(extra_args),
                       callback=callback)

  batch = []
  for z in read_numbers:
    if z % progress_every == 0: sys.stderr.write(str(z)+"\r")
    batch.append(z)
    if len(batch) >= batch_size:
      submit(batch)
      batch = []
  if batch:
    submit(batch)
  if pool is not None:
    pool.close()
    pool.join()


def _write_and_reset_report(path):
  """Dump the module-level ``greport`` to ``path`` as TSV, then reset it."""
  global greport
  with open(path,'w') as of:
    for name in greport:
      of.write("\t".join([str(x) for x in greport[name]])+"\n")
  greport = {}


def _combine_reports(paths, out_path):
  """Sum the third column of three-column TSV reports per name.

  The second column is taken from the first report in which a name appears.
  Rows are written to ``out_path`` sorted by name.
  """
  combo = {}
  for path in paths:
    with open(path) as inf:
      for line in inf:
        [name,express,left] = line.rstrip().split("\t")
        if name not in combo:
          combo[name] = {'express': express, 'left': 0}
        combo[name]['left'] += int(left)
  with open(out_path,'w') as of:
    for name in sorted(combo):
      of.write(name+"\t"+combo[name]['express']+"\t"+str(combo[name]['left'])+"\n")


def main():
  """Command-line entry point: simulate an RNA-seq dataset.

  With --short_reads_only or --long_reads_only, FASTQ records are written to
  standard output.  Otherwise --output names a directory that receives
  gzipped short reads (SR_1/SR_2), ccs and sub long reads, one report per
  read set, and a combined report.

  The read files themselves are written by the ``do_short`` / ``do_long``
  callbacks, which are defined elsewhere; they presumably write to the
  module-level handles set here and fill ``greport`` -- confirm.
  """
  global left_handle, right_handle, long_handle
  parser = argparse.ArgumentParser(description="Create a simulated RNA-seq dataset")
  parser.add_argument('reference_genome',help="The reference genome.")
  parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts.  Each transcript name must be unique.")
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('--uniform_expression',action='store_true',help="Uniform distribution of transcript expression")
  group.add_argument('--isoform_expression',help="The transcript expression in TSV format <Transcript name> tab <Expression>")
  group.add_argument('--cufflinks_isoform_expression',help="The expression of the isoforms or - for a uniform distribution of transcript expression")
  group2 = parser.add_mutually_exclusive_group()
  group2.add_argument('--long_reads_only',action='store_true')
  group2.add_argument('--short_reads_only',action='store_true')
  group2.add_argument('--output',help="Directory name for output")
  parser.add_argument('--short_read_count',type=int,default=10000,help="INT number of short reads")
  parser.add_argument('--short_read_length',type=int,default=101,help="INT length of the short reads")
  parser.add_argument('--long_read_count',type=int,default=4000,help="INT default number of long reads")
  parser.add_argument('--no_errors',action='store_true')
  parser.add_argument('--threads',type=int,default=1)
  args = parser.parse_args()
  # Fail fast: without one of these the run would load the whole genome and
  # then crash on os.path.exists(None).
  if not (args.short_reads_only or args.long_reads_only or args.output):
    parser.error("one of --output, --short_reads_only or --long_reads_only is required")
  if args.output:
    args.output = args.output.rstrip('/')

  # Error profiles are only needed when errors are being simulated
  fq_prof_pacbio_ccs95 = None
  fq_prof_pacbio_subreads = None
  fq_prof_illumina = None
  if not args.no_errors:
    fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
    fq_prof_pacbio_subreads = default_pacbio_subreads()
    fq_prof_illumina = default_illumina()

  ref = read_fasta_into_hash(args.reference_genome)
  txn = Transcriptome()
  txn.set_reference_genome_dictionary(ref)
  with open(args.transcripts_genepred) as inf:
    for line in inf:
      if line[0]=='#': continue
      txn.add_genepred_line(line.rstrip())
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      inf.readline()  # skip the header line
      for line in inf:
        f = line.rstrip().split("\t")
        txn.add_expression(f[0],float(f[1]))
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    with open(args.cufflinks_isoform_expression) as inf:
      inf.readline()  # skip the header line
      for line in inf:
        f = line.rstrip().split("\t")
        txn.add_expression(f[0],float(f[9]))
  sys.stderr.write("have transcriptome\n")
  # Drop the reference sequences; snapshot the keys so deleting while
  # looping is safe even where keys() is a live view.
  for n in list(txn.ref_hash.keys()): del txn.ref_hash[n]
  rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)

  # Now our dataset is set up
  if args.short_reads_only:
    rbe.set_gaussian_fragmentation_default_hiseq()
    for zi in range(0,args.short_read_count):
      [_name,seq] = rbe.emit_short_read(args.short_read_length)
      read_name = "SRSIM"+str(zi+1)
      if args.no_errors:
        _write_fastq_to_stdout(read_name, seq, 'I'*len(seq))
      else:
        l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(seq)
        _write_fastq_to_stdout(read_name, l1perm['seq'], l1perm['qual'])
    return
  if args.long_reads_only:
    rbe.set_gaussian_fragmentation_default_pacbio()
    for zi in range(0,args.long_read_count):
      [_name,seq] = rbe.emit_long_read()
      g = 'm150101_010101_11111_c111111111111111111_s1_p0/'+str(zi+1)+'/ccs'
      if args.no_errors:
        _write_fastq_to_stdout(g, seq, 'I'*len(seq))
      else:
        seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(seq)
        _write_fastq_to_stdout(g, seqperm['seq'], seqperm['qual'])
    return

  if not os.path.exists(args.output):
    os.makedirs(args.output)

  # Short reads
  rbe.set_gaussian_fragmentation_default_hiseq()
  sys.stderr.write("Sequencing short reads\n")
  left_handle = gzip.open(args.output+"/SR_1.fq.gz",'wb')
  right_handle = gzip.open(args.output+"/SR_2.fq.gz",'wb')
  _run_in_batches(range(1,args.short_read_count+1), 10000, 1000, args.threads,
                  process_short_read_buffer, (rbe,args,fq_prof_illumina),
                  do_short)
  _write_and_reset_report(args.output+"/SR_report.txt")
  sys.stderr.write("\nFinished sequencing short reads\n")
  left_handle.close()
  right_handle.close()

  # Long ccs reads, numbered 1..long_read_count
  rbe.set_gaussian_fragmentation_default_pacbio()
  sys.stderr.write("Sequencing ccs long reads\n")
  long_handle = gzip.open(args.output+"/LR_ccs.fq.gz",'wb')
  _run_in_batches(range(1,args.long_read_count+1), 1000, 100, args.threads,
                  process_long_reads, (rbe,args,fq_prof_pacbio_ccs95,'ccs'),
                  do_long)
  long_handle.close()
  _write_and_reset_report(args.output+"/LR_ccs_report.txt")
  sys.stderr.write("\nFinished sequencing ccs long reads\n")

  # Long sub reads; numbering continues after the last ccs read
  sys.stderr.write("Sequencing long sub reads\n")
  long_handle = gzip.open(args.output+"/LR_sub.fq.gz",'wb')
  last_ccs = max(args.long_read_count, 0)
  _run_in_batches(range(last_ccs+1,last_ccs+args.long_read_count+1), 1000, 100,
                  args.threads, process_long_reads,
                  (rbe,args,fq_prof_pacbio_subreads,'sub'), do_long)
  long_handle.close()
  _write_and_reset_report(args.output+"/LR_sub_report.txt")
  sys.stderr.write("\nFinished sequencing long sub reads\n")

  _combine_reports([args.output+"/SR_report.txt",
                    args.output+"/LR_ccs_report.txt",
                    args.output+"/LR_sub_report.txt"],
                   args.output+"/LR_SR_combo_report.txt")
コード例 #3
0
def _write_fastq_to_stdout(read_name, seq, qual):
    """Write one four-line FASTQ record to standard output."""
    sys.stdout.write("@%s\n%s\n+\n%s\n" % (read_name, seq, qual))


def _run_in_batches(read_numbers, batch_size, progress_every, threads,
                    worker, extra_args, callback):
    """Hand read numbers to ``worker`` in batches; pass results to ``callback``.

    With ``threads`` <= 1 each batch is processed inline; otherwise batches
    are submitted to a process pool of ``threads`` workers and ``callback``
    is invoked by the pool for every finished batch.  ``worker`` is called as
    ``worker(batch, *extra_args)``.  A progress counter is written to stderr
    for every read number divisible by ``progress_every``.
    """
    pool = Pool(processes=threads) if threads > 1 else None

    def submit(batch):
        if pool is None:
            callback(worker(batch, *extra_args))
        else:
            pool.apply_async(worker,
                             args=(batch,) + tuple(extra_args),
                             callback=callback)

    batch = []
    for z in read_numbers:
        if z % progress_every == 0:
            sys.stderr.write(str(z) + "\r")
        batch.append(z)
        if len(batch) >= batch_size:
            submit(batch)
            batch = []
    if batch:
        submit(batch)
    if pool is not None:
        pool.close()
        pool.join()


def _write_and_reset_report(path):
    """Dump the module-level ``greport`` to ``path`` as TSV, then reset it."""
    global greport
    with open(path, 'w') as of:
        for name in greport:
            of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    greport = {}


def _combine_reports(paths, out_path):
    """Sum the third column of three-column TSV reports per name.

    The second column is taken from the first report in which a name
    appears.  Rows are written to ``out_path`` sorted by name.
    """
    combo = {}
    for path in paths:
        with open(path) as inf:
            for line in inf:
                [name, express, left] = line.rstrip().split("\t")
                if name not in combo:
                    combo[name] = {'express': express, 'left': 0}
                combo[name]['left'] += int(left)
    with open(out_path, 'w') as of:
        for name in sorted(combo):
            of.write(name + "\t" + combo[name]['express'] + "\t" +
                     str(combo[name]['left']) + "\n")


def main():
    """Command-line entry point: simulate an RNA-seq dataset.

    With --short_reads_only or --long_reads_only, FASTQ records are written
    to standard output.  Otherwise --output names a directory that receives
    gzipped short reads (SR_1/SR_2), ccs and sub long reads, one report per
    read set, and a combined report.

    The read files themselves are written by the ``do_short`` / ``do_long``
    callbacks, which are defined elsewhere; they presumably write to the
    module-level handles set here and fill ``greport`` -- confirm.
    """
    global left_handle, right_handle, long_handle
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    parser.add_argument('reference_genome', help="The reference genome.")
    parser.add_argument(
        'transcripts_genepred',
        help=
        "A genepred file describing the transcripts.  Each transcript name must be unique."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--long_reads_only', action='store_true')
    group2.add_argument('--short_reads_only', action='store_true')
    group2.add_argument('--output', help="Directory name for output")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors', action='store_true')
    parser.add_argument('--threads', type=int, default=1)
    args = parser.parse_args()
    # Fail fast: without one of these the run would load the whole genome
    # and then crash on os.path.exists(None).
    if not (args.short_reads_only or args.long_reads_only or args.output):
        parser.error(
            "one of --output, --short_reads_only or --long_reads_only is required"
        )
    if args.output:
        args.output = args.output.rstrip('/')

    # Error profiles are only needed when errors are being simulated
    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    fq_prof_illumina = None
    if not args.no_errors:
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
        fq_prof_illumina = default_illumina()

    ref = read_fasta_into_hash(args.reference_genome)
    txn = Transcriptome()
    txn.set_reference_genome_dictionary(ref)
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#':
                continue
            txn.add_genepred_line(line.rstrip())
    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[1]))
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        with open(args.cufflinks_isoform_expression) as inf:
            inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[9]))
    sys.stderr.write("have transcriptome\n")
    # Drop the reference sequences; snapshot the keys so deleting while
    # looping is safe even where keys() is a live view.
    for n in list(txn.ref_hash.keys()):
        del txn.ref_hash[n]
    rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)

    # Now our dataset is set up
    if args.short_reads_only:
        rbe.set_gaussian_fragmentation_default_hiseq()
        for zi in range(0, args.short_read_count):
            [_name, seq] = rbe.emit_short_read(args.short_read_length)
            read_name = "SRSIM" + str(zi + 1)
            if args.no_errors:
                _write_fastq_to_stdout(read_name, seq, 'I' * len(seq))
            else:
                l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(
                    seq)
                _write_fastq_to_stdout(read_name, l1perm['seq'],
                                       l1perm['qual'])
        return
    if args.long_reads_only:
        rbe.set_gaussian_fragmentation_default_pacbio()
        for zi in range(0, args.long_read_count):
            [_name, seq] = rbe.emit_long_read()
            g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                zi + 1) + '/ccs'
            if args.no_errors:
                _write_fastq_to_stdout(g, seq, 'I' * len(seq))
            else:
                seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(
                    seq)
                _write_fastq_to_stdout(g, seqperm['seq'], seqperm['qual'])
        return

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    # Short reads
    rbe.set_gaussian_fragmentation_default_hiseq()
    sys.stderr.write("Sequencing short reads\n")
    left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    _run_in_batches(range(1, args.short_read_count + 1), 10000, 1000,
                    args.threads, process_short_read_buffer,
                    (rbe, args, fq_prof_illumina), do_short)
    _write_and_reset_report(args.output + "/SR_report.txt")
    sys.stderr.write("\nFinished sequencing short reads\n")
    left_handle.close()
    right_handle.close()

    # Long ccs reads, numbered 1..long_read_count
    rbe.set_gaussian_fragmentation_default_pacbio()
    sys.stderr.write("Sequencing ccs long reads\n")
    long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb')
    _run_in_batches(range(1, args.long_read_count + 1), 1000, 100,
                    args.threads, process_long_reads,
                    (rbe, args, fq_prof_pacbio_ccs95, 'ccs'), do_long)
    long_handle.close()
    _write_and_reset_report(args.output + "/LR_ccs_report.txt")
    sys.stderr.write("\nFinished sequencing ccs long reads\n")

    # Long sub reads; numbering continues after the last ccs read
    sys.stderr.write("Sequencing long sub reads\n")
    long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb')
    last_ccs = max(args.long_read_count, 0)
    _run_in_batches(
        range(last_ccs + 1, last_ccs + args.long_read_count + 1), 1000, 100,
        args.threads, process_long_reads,
        (rbe, args, fq_prof_pacbio_subreads, 'sub'), do_long)
    long_handle.close()
    _write_and_reset_report(args.output + "/LR_sub_report.txt")
    sys.stderr.write("\nFinished sequencing long sub reads\n")

    _combine_reports([
        args.output + "/SR_report.txt",
        args.output + "/LR_ccs_report.txt",
        args.output + "/LR_sub_report.txt",
    ], args.output + "/LR_SR_combo_report.txt")
コード例 #4
0
def _collect_adjusted_reference(pending):
  """Resolve per-chromosome results into ``(change_count, sequences)``.

  ``pending`` is a list of ``(is_async, payload)`` pairs.  When ``is_async``
  is true the payload is a pool result whose ``get()`` yields the value;
  otherwise the payload is the value itself.  Each value is indexed as
  ``[number_of_changes, chromosome_name, sequence]``.

  The list is emptied before returning so that it does not keep the
  (potentially very large) sequences alive after the caller drops them.
  """
  total_changes = 0
  sequences = {}
  for is_async, payload in pending:
    res = payload.get() if is_async else payload
    total_changes += res[0]
    sequences[res[1]] = res[2]
  del pending[:]
  return total_changes, sequences


def load_from_inputs(args):
  """Build a biallelic transcriptome emitter from raw input files.

  Reads from ``args``:
    * ``inputs[0]`` -- reference genome FASTA
    * ``inputs[1]`` -- phased VCF (only SNPs with a phased genotype are used)
    * ``inputs[2]`` -- genepred file describing the transcripts
    * ``threads`` -- when > 1, chromosomes are adjusted in a process pool
    * ``locus_by_gene_name`` -- group transcripts into loci by gene name
      instead of calling ``get_loci``
    * ``isoform_expression`` / ``cufflinks_isoform_expression`` /
      ``uniform_expression`` -- source of transcript expression
    * ``seed`` -- seed for the ``random`` module
    * ``ASE_identical`` / ``ASE_isoform_random`` -- how rho is chosen

  Returns the ``SimulationBasics.RandomBiallelicTranscriptomeEmitter`` with
  ``gene_names``, ``name2locus`` and the transcriptome-1 rho values set.
  """
  # Read in the VCF file: alleles[chrom][pos] = phased genotype
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      chrom = vcf.value('chrom')
      pos = vcf.value('pos')
      positions = alleles.setdefault(chrom, {})
      if pos in positions:
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      positions[pos] = g # set our left and right (last one wins)

  sys.stderr.write("Reading in the reference genome\n")
  ref = read_fasta_into_hash(args.inputs[0])
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  use_pool = args.threads > 1
  p = None
  # Pretty memory intensive so don't go with all possible threads
  if use_pool: p = Pool(processes=max(1,int(args.threads/4)))
  pending1 = [] # results for allele 0, in reference order
  pending2 = [] # results for allele 1, in reference order
  for chrom in ref:
    if chrom not in alleles:
      # No allele information: both haplotypes are the plain reference
      pending1.append((False, [0,chrom,ref[chrom]]))
      pending2.append((False, [0,chrom,ref[chrom]]))
    elif use_pool:
      pending1.append((True, p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom))))
      pending2.append((True, p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom))))
    else:
      pending1.append((False, adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom)))
      pending2.append((False, adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom)))
  if use_pool:
    p.close()
    p.join()

  # Fill reference 1 and reference 2 with the new sequences
  c1, ref1 = _collect_adjusted_reference(pending1)
  c2, ref2 = _collect_adjusted_reference(pending2)
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")

  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {} # transcript name -> gene name
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0]=='#': continue
      entry = line.rstrip()
      txn1.add_genepred_line(entry)
      txn2.add_genepred_line(entry)
      gpd = GenePredEntry(entry)
      gpdnames[gpd.value('name')] = gpd.value('gene_name')

  # The transcriptomes are set and the references are no longer needed:
  # empty the big in-memory structures.  The keys are snapshotted (or the
  # dict cleared) so this also works where keys() is a live view.
  txn1.ref_hash = None
  txn2.ref_hash = None
  ref1.clear()
  ref2.clear()
  for chrom in list(ref.keys()):  del ref[chrom]

  if not args.locus_by_gene_name:
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {} # gene name -> locus number, numbered in sorted order
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m+=1
        numname[gene] = m
      num = numname[gene]
      locus2name.setdefault(num, set()).add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with "+str(len(locus2name.keys()))+" loci\n")

  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      inf.readline() # skip the header line
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      inf.readline() # skip the header line
      for line in inf:
        cuffz +=1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        # column 10 of the cufflinks table is used as the expression
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")

  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {} # one random rho per locus
  # 'is not None' so that an explicit seed of 0 is honoured.
  # NOTE(review): assumes the seed option defaults to None -- confirm in
  # the argument parser.
  if args.seed is not None:
    random.seed(args.seed)
  # Always drawn, even if unused below, so the random stream is stable
  for z in locus2name: randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  for tname in sorted(txn1.transcripts):
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]

  # Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe