def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT minimum intron size")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING: skipping invalid PSL entry. This script fixes improper match, mismatch and gap counts; it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
      continue
    n = p.value('qName')
    while last_fasta and last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly?\n")
      sys.exit()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    print p.get_line()
    p.pretty_print(50)
  fhr.close()
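# Every example in this section leans on read_fasta_into_hash; a minimal sketch of
# what it plausibly does (name -> sequence dict, first whitespace token as the name;
# hypothetical, not the repo's exact implementation):
def read_fasta_into_hash(fasta_filename):
  seqs = {}
  name = None
  with open(fasta_filename) as inf:
    for line in inf:
      line = line.rstrip()
      if line.startswith('>'):
        name = line[1:].split()[0]
        seqs[name] = []
      elif name is not None:
        seqs[name].append(line)
  return dict((k, ''.join(v)) for k, v in seqs.items())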
def main():
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="GENEPRED FILENAME for the reference annotation")
  args = parser.parse_args()

  cpus = multiprocessing.cpu_count()

  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  #lets sort entries by chromosome
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    if e['chrom'] not in ref:
      ref[e['chrom']] = {}
    for i in range(1,len(e['exonStarts'])):
      if e['exonEnds'][i-1] not in ref[e['chrom']]:
        ref[e['chrom']][e['exonEnds'][i-1]] = {}
      if e['exonStarts'][i]+1 not in ref[e['chrom']][e['exonEnds'][i-1]]:
        ref[e['chrom']][e['exonEnds'][i-1]][e['exonStarts'][i]+1] = e['strand']
  #All junctions are stored as 1-based coordinates

  read_info = {}
  pf = GenericFileReader(args.psl)
  fcount_total = 0
  while True:
    line = pf.readline()
    if not line: break
    if re.match('^#',line): continue
    line = line.rstrip()
    pe = PSLBasics.line_to_entry(line)
    if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
      sys.stderr.write("WARNING invalid psl\n")
      continue
    genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
    ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
    refjuns = {}
    if pe['tName'] in ref: refjuns = ref[pe['tName']]
    new_ge = nudge(pe,ge,refjuns,args)
    if args.output_fake_psl:
      new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge,genome)
      print new_psl_line
    else:
      print GenePredBasics.entry_to_line(new_ge)
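# nudge() is not shown above; the core of any such routine is finding a reference
# junction within search_size of an observed one. A minimal sketch of that lookup,
# against the {donor_end: {acceptor_start: strand}} structure built above
# (snap_junction is hypothetical, not the repo's function):
def snap_junction(ref_junctions, donor, acceptor, search_size):
  best = None
  for d in range(donor-search_size, donor+search_size+1):
    if d not in ref_junctions: continue
    for a in ref_junctions[d]:
      if abs(a-acceptor) <= search_size:
        dist = abs(d-donor)+abs(a-acceptor)
        if best is None or dist < best[0]:
          best = (dist, d, a)
  return (best[1], best[2]) if best else (donor, acceptor)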
def main():
  parser = argparse.ArgumentParser(description="For every genepred entry report its alignability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r','--reference',required=True,help="Reference fasta")
  parser.add_argument('-k','--fragment_size',default=100,type=int,help="Fragment size to try to align")
  parser.add_argument('-x','--hisat_index',required=True,help="HISAT index base name")
  parser.add_argument('--threads',type=int,default=cpu_count(),help="number of threads")
  parser.add_argument('--type',choices=['mean','median'],default='mean',help="How to bring together overlapping reads")
  parser.add_argument('--perbase',action='store_true')
  parser.add_argument('--output','-o',help="output file or leave unset for STDOUT")
  args = parser.parse_args()
  
  if args.input=='-': args.input=sys.stdin
  elif re.search(r'\.gz$',args.input):
    args.input = gzip.open(args.input)
  else: args.input = open(args.input)

  udir = os.path.dirname(os.path.realpath(__file__))
  cmd2 = udir+'/genepred_counts_to_mappability.py -'
  cmd2 += ' --threads '+str(args.threads)
  cmd2 += ' -k '+str(args.fragment_size)
  if args.perbase: cmd2 += ' --perbase'
  if args.output: cmd2 += ' --output '+args.output
  if args.type: cmd2 += ' --type '+args.type
  p2 = Popen(cmd2.split(),stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  null = open(os.devnull,'w') # discard hisat's stderr so its progress output doesn't mix with ours
  cmd1 = 'hisat -x '+args.hisat_index+' -U - -f --reorder -p '+str(args.threads)
  p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin,stderr=null)
  #p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin)
  line_number = 0
  for line in args.input:
    line_number +=1
    gpd = GPD(line.rstrip())
    #print gpd.entry['name']
    #print gpd.length()
    if gpd.length() < args.fragment_size: continue
    seq = gpd.get_sequence(ref)
    for i in range(0,len(seq)-args.fragment_size+1):
      info = gpd.value('name')+"\t"+gpd.value('gene_name')+"\t"+str(line_number)+"\t"+str(len(seq))+"\t"+str(i)
      einfo = encode_name(info)
      p1.stdin.write('>'+einfo+"\n")
      p1.stdin.write(seq[i:i+args.fragment_size]+"\n")
  p1.communicate()
  p2.communicate()
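# encode_name() above must make a tab-delimited info string safe for a FASTA header.
# A workable stand-in (assumption: any reversible, whitespace-free encoding will do):
import base64
def encode_name(info):
  return base64.urlsafe_b64encode(info.encode('utf-8')).decode('ascii')
def decode_name(encoded):
  return base64.urlsafe_b64decode(encoded.encode('ascii')).decode('utf-8')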
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('ref_genome')
  parser.add_argument('phased_vcf')
  args = parser.parse_args()
  g = read_fasta_into_hash(args.ref_genome)
  gL = {}
  gR = {}
  for chr in g:
    gL[chr] = [x for x in g[chr].upper()]
    gR[chr] = [x for x in g[chr].upper()]
  #with open('1KG.biallelic.het.exonic/1KG.biallelic.het.exonic.vcf') as inf:
  z = 0
  with open(args.phased_vcf) as inf:
    for line in inf:
      if re.match('#',line): continue
      z += 1
      sys.stderr.write(str(z)+"\r")
      f = line.rstrip().split("\t")
      chr = f[0]
      [n1,n2] = [int(x) for x in f[9].split('|')]
      if int(f[1]) > len(gL[chr]): 
        sys.stderr.write("ERROR: variant position beyond end of "+chr+":\n"+line)
        sys.exit()
      if n1 == 0: 
        gL[chr][int(f[1])-1] = f[3]
      else: 
        gL[chr][int(f[1])-1] = f[4]
      if n2 == 0:
        gR[chr][int(f[1])-1] = f[3]
      else:
        gR[chr][int(f[1])-1] = f[4]
  sys.stderr.write("\nalmost done\n")
  ofL = open('L.fa','w')
  for chr in sorted(gL.keys()):
    ofL.write(">"+chr+"\n")
    ofL.write("".join(gL[chr])+"\n")
  ofL.close()
  ofR = open('R.fa','w')
  for chr in sorted(gR.keys()):
    ofR.write(">"+chr+"\n")
    ofR.write("".join(gR[chr])+"\n")
  ofR.close()
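# The split on '|' above assumes the sample column is a bare GT like "0|1". VCF
# sample columns often carry FORMAT subfields ("0|1:32:..."), so a slightly more
# defensive parse (a sketch, not part of the original) looks like:
def phased_genotype(sample_field):
  gt = sample_field.split(':')[0]
  if '|' not in gt or '.' in gt:
    return None  # unphased or missing genotype
  return [int(x) for x in gt.split('|')]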
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('reference_genome')
    parser.add_argument('transcripts_genepred')
    parser.add_argument('--out_gpd', help="fusion genepred", required=True)
    parser.add_argument('--out_fasta', help="fusion fasta", required=True)
    parser.add_argument(
        '--fusion_count',
        type=int,
        default=1000,
        help="Create this many fusions, max is number of genes/2.")
    args = parser.parse_args()
    ref = read_fasta_into_hash(args.reference_genome)
    of_gpd = open(args.out_gpd, 'w')
    of_fasta = open(args.out_fasta, 'w')
    genes = {}
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            gpd = GPD(line.rstrip())
            if gpd.value('exonCount') <= 1: continue
            if gpd.value('gene_name') not in genes:
                genes[gpd.value('gene_name')] = []
            genes[gpd.value('gene_name')].append(gpd)
    gene_names = genes.keys()
    fusion_count = args.fusion_count
    shuffle(gene_names)
    pairs = []
    while True:
        if len(pairs) == fusion_count: break
        if len(gene_names) < 2: break
        pair = [gene_names[0], gene_names[1]]
        pairs.append(pair)
        gene_names.pop(0)
        gene_names.pop(0)
    for pair in pairs:
        [gpds, ars] = get_random_gpds_from_pair(pair, genes, ref)
        print ars.name
        of_fasta.write(ars.get_fasta())
        for gpd in gpds:
            of_gpd.write(gpd + "\n")
    of_gpd.close()
    of_fasta.close()
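# The pop-two-at-a-time pairing loop above is just taking consecutive names from
# the shuffled list; an equivalent one-liner (same behavior, Python 2 style):
#   pairs = [list(t) for t in zip(gene_names[0::2], gene_names[1::2])][:fusion_count]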
def main():
  parser = argparse.ArgumentParser(description='Create artificial reference sequences from a genepred')
  parser.add_argument('gpd_file')
  parser.add_argument('reference_fasta')
  parser.add_argument('-o','--output',help="output file to write to or STDOUT if not set")
  args = parser.parse_args()
  of = sys.stdout
  if args.output: of = open(args.output,'w')
  f = read_fasta_into_hash(args.reference_fasta)
  with open(args.gpd_file) as inf:
    for line in inf:
      gpd = GenePredBasics.GenePredEntry()
      gpd.line_to_entry(line.rstrip())
      ars = ARS()
      beds = []
      for i in range(0,gpd.value('exonCount')):
        b = Bed(gpd.value('chrom'),gpd.value('exonStarts')[i],gpd.value('exonEnds')[i],gpd.value('strand'))
        beds.append(b)
      ars.set_bounds(beds)
      ars.set_name(gpd.value('name'))
      ars.set_sequence_from_original_reference_hash(f)
      of.write(ars.get_fasta())
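# Bed and ARS are assumed above; a minimal sketch of the Bed container's likely
# shape (hypothetical fields: chromosome, 0-based start, end, strand):
class Bed:
  def __init__(self, chrom, start, end, strand='+'):
    self.chrom = chrom
    self.start = int(start)   # 0-based, half-open, as in BED
    self.end = int(end)
    self.strand = strand
  def length(self):
    return self.end-self.start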
def main():
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    parser.add_argument('reference_genome', help="The reference genome.")
    parser.add_argument(
        'transcripts_genepred',
        help=
        "A genepred file describing the transcripts.  Each transcript name must be unique."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--long_reads_only', action='store_true')
    group2.add_argument('--short_reads_only', action='store_true')
    group2.add_argument('--output', help="Directory name for output")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors', action='store_true')
    parser.add_argument('--threads', type=int, default=1)
    args = parser.parse_args()
    if not (args.long_reads_only or args.short_reads_only) and not args.output:
        parser.error("--output DIR is required unless --long_reads_only or --short_reads_only is set")
    if args.output:
        args.output = args.output.rstrip('/')

    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    fq_prof_illumina = None
    if not args.no_errors:
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
        fq_prof_illumina = default_illumina()

    ref = read_fasta_into_hash(args.reference_genome)
    txn = Transcriptome()
    txn.set_reference_genome_dictionary(ref)
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn.add_genepred_line(line.rstrip())
    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[1]))
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[9]))
    sys.stderr.write("have transcriptome\n")
    for n in txn.ref_hash.keys():
        del txn.ref_hash[n]
    rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)
    # Now we have the transcriptomes set
    #Now our dataset is set up
    if args.short_reads_only:
        rbe.set_gaussian_fragmentation_default_hiseq()
        for zi in range(0, args.short_read_count):
            [name, seq] = rbe.emit_short_read(args.short_read_length)
            if args.no_errors:
                print "@SRSIM" + str(zi + 1)
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(
                    seq)
                print "@SRSIM" + str(zi + 1)
                print l1perm['seq']
                print "+"
                print l1perm['qual']
        return
    if args.long_reads_only:
        rbe.set_gaussian_fragmentation_default_pacbio()
        for zi in range(0, args.long_read_count):
            [name, seq] = rbe.emit_long_read()
            if args.no_errors:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                print "@" + g
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(
                    seq)
                print "@" + g
                print seqperm['seq']
                print "+"
                print seqperm['qual']
        return
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    rbe.set_gaussian_fragmentation_default_hiseq()
    # Let's prepare to output now
    sys.stderr.write("Sequencing short reads\n")
    global left_handle
    global right_handle
    left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    buffer_size = 10000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.short_read_count):
        z = i + 1
        if z % 1000 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_short_read_buffer(buffer[:], rbe, args,
                                              fq_prof_illumina)
                do_short(v)
            else:
                p.apply_async(process_short_read_buffer,
                              args=(buffer[:], rbe, args, fq_prof_illumina),
                              callback=do_short)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_short_read_buffer(buffer[:], rbe, args,
                                          fq_prof_illumina)
            do_short(v)
        else:
            p.apply_async(process_short_read_buffer,
                          args=(buffer[:], rbe, args, fq_prof_illumina),
                          callback=do_short)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    global greport
    of = open(args.output + "/SR_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}

    sys.stderr.write("\nFinished sequencing short reads\n")
    left_handle.close()
    right_handle.close()

    # Now let's create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    sys.stderr.write("Sequencing ccs long reads\n")
    global long_handle
    long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.long_read_count):
        z = i + 1
        if z % 100 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_ccs95, 'ccs')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                    'ccs'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                   'ccs')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                'ccs'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    long_handle.close()
    of = open(args.output + "/LR_ccs_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing ccs long reads\n")

    sys.stderr.write("Sequencing long sub reads\n")
    long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(z, z + args.long_read_count):
        z = i + 1
        if z % 100 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_subreads, 'sub')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args,
                                    fq_prof_pacbio_subreads, 'sub'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args,
                                   fq_prof_pacbio_subreads, 'sub')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_subreads,
                                'sub'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    long_handle.close()
    of = open(args.output + "/LR_sub_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing long sub reads\n")

    combo = {}
    with open(args.output + "/SR_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_ccs_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_sub_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(combo):
        of.write(name + "\t" + combo[name]['express'] + "\t" +
                 str(combo[name]['left']) + "\n")
    of.close()
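# The buffer/apply_async/callback pattern above keeps file writes safe: workers
# render FASTQ text in parallel, and the callback runs in the parent process, so
# writes to the shared handle never interleave. Self-contained sketch of the idea
# (work/writer/out_handle are illustrative names, not the repo's):
from multiprocessing import Pool

def work(chunk):
    return "".join("@read_%d\nACGT\n+\nIIII\n" % i for i in chunk)

def writer(text):
    out_handle.write(text)  # executed in the parent, one callback at a time

if __name__ == '__main__':
    out_handle = open('reads.fq', 'w')
    p = Pool(4)
    for start in range(0, 100, 10):
        p.apply_async(work, (list(range(start, start+10)),), callback=writer)
    p.close()
    p.join()
    out_handle.close()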
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STIDN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING: skipping invalid PSL entry. This script fixes improper match, mismatch and gap counts; it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
      continue
    n = p.value('qName')
    while last_fasta and last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly?\n")
      sys.exit()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats() # recomputes matches, misMatches, nCount and the gap fields from the sequences
    print p.get_line()
    #p.pretty_print(100)
  fhr.close()
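# correct_stats() above boils down to comparing the strand-corrected query slice
# to the target slice for every block; a condensed sketch of that per-block count:
def count_block_stats(qseq, rseq):
  matches = misMatches = nCount = 0
  for q, r in zip(qseq.upper(), rseq.upper()):
    if q == 'N': nCount += 1
    elif q == r: matches += 1
    else: misMatches += 1
  return matches, misMatches, nCount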
def main():
  parser = argparse.ArgumentParser(description="splice together partial alignments")
  group1 = parser.add_mutually_exclusive_group(required=True)
  group1.add_argument('--fastq_reads')
  group1.add_argument('--fasta_reads')
  parser.add_argument('--genome',help="FASTA reference genome",required=True)
  parser.add_argument('--genepred',help="Transcriptome genepred")
  parser.add_argument('--max_intron_size',type=int,default=100000,help="INT maximum intron size")
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT minimum intron size")
  parser.add_argument('--max_gap_size',type=int,default=10,help="INT gap size in query to join")
  parser.add_argument('--max_search_expand',type=int,default=10,help="INT max search space to expand search for junction")
  parser.add_argument('--direction_specific',action='store_true',help="The direction of the transcript is known and properly oriented already")
  parser.add_argument('--threads',type=int,default=0,help="INT number of threads to use default cpu_count")
  parser.add_argument('-o','--output',default='-',help="FILENAME output results to here rather than STDOUT which is default")
  parser.add_argument('input_alignment',help="FILENAME input .psl file or '-' for STDIN")
  args = parser.parse_args()

  # Read our reference genome
  sys.stderr.write("Reading reference\n")
  ref = read_fasta_into_hash(args.genome)

  # Make sure our reads are unique
  sys.stderr.write("Checking for unqiuely named reads\n")
  reads = check_for_uniquely_named_reads(args) # does a hard exit and error if there are any names repeated
  sys.stderr.write("Reads are uniquely named\n")
  
  # Set number of threads to use
  cpu_count = multiprocessing.cpu_count()
  if args.threads > 0:
    cpu_count = args.threads

  #Set reference splices (if any are available)
  reference_splices = {}
  if args.genepred:
    sys.stderr.write("Reading reference splices from genepred\n")
    reference_splices = get_reference_splices(args)

  sys.stderr.write("Reading alignments into loci\n")

  # Get locus division (first stage)
  # Each read (qName) is separated
  # Then each locus will be specific to a chromosome (tName)
  # Then by (strand), but keep in mind the strand is based on the read
  # Each locus should be specific to a direction but we don't necessarily
  # know the direction from the data we have thus far.
  inf = sys.stdin
  if args.input_alignment != '-': inf = open(args.input_alignment,'r')
  loci = {}
  for line in inf:
    line = line.rstrip()
    if re.match('^#',line): continue
    psl = PSLBasics.line_to_entry(line)
    if psl['qName'] not in loci:
      loci[psl['qName']] = {}
    if psl['tName'] not in loci[psl['qName']]:
      loci[psl['qName']][psl['tName']] = {}
    if psl['strand'] not in loci[psl['qName']][psl['tName']]:
      loci[psl['qName']][psl['tName']][psl['strand']] = {}
    if psl['tStarts'][0] not in loci[psl['qName']][psl['tName']][psl['strand']]:
      loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]] = []
    loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]].append(psl)

  sys.stderr.write("breaking loci by genomic distance\n")
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        #print qname + "\t" + chr + "\t" + strand
        starts = loci[qname][chr][strand].keys()
        current_set = []
        locus_sets = []
        last_end = -1*(args.max_intron_size+2)
        for start in sorted(starts):
          for e in loci[qname][chr][strand][start]:
            start = e['tStarts'][0]+1 # 1-based start of the alignment
            if start > last_end+args.max_intron_size:
              # we have the start of a new set
              if len(current_set) > 0: 
                locus_sets.append(current_set)
              current_set = []
            last_end = e['tStarts'][-1]+e['blockSizes'][-1]
            current_set.append(e)
        if len(current_set) > 0:
          locus_sets.append(current_set)
        loci[qname][chr][strand] = locus_sets # replace what was there with these ordered sets

  locus_total = 0
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        for locus_set in loci[qname][chr][strand]:
          locus_total+=1  

  sys.stderr.write("Work on each read in each locus with "+str(cpu_count)+" CPUs\n")
  p = multiprocessing.Pool(processes=cpu_count)
  locus_count = 0
  for qname in loci:
    for chr in loci[qname]:
      for strand in loci[qname][chr]:
        #print qname + "\t" + chr + "\t" + strand
        for locus_set in loci[qname][chr][strand]:
          locus_count += 1
          onum = len(locus_set)
          # send blank reference splices unless we have some
          rsplices = {}
          if chr in reference_splices: rsplices = reference_splices[chr]
          #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback)
          r1 = execute_locus(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count)
          do_locus_callback(r1)
          #nnum = len(new_locus_set)
          #print str(onum) + " to " + str(nnum)
          #for e in new_locus_set:
          #  print PSLBasics.entry_to_line(e)
  p.close()
  p.join() 
  sys.stderr.write("\nfinished\n")

  ofh = sys.stdout
  if not args.output == '-':
    ofh = open(args.output,'w')

  for line in combo_results:
    ofh.write(line)
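# check_for_uniquely_named_reads() is assumed above; a sketch matching its stated
# contract (hard exit on a repeated name, return reads keyed by name), written for
# the FASTA case only (hypothetical, not the repo's implementation):
import sys
def check_for_uniquely_named_reads_fasta(fasta_filename):
  reads = {}
  name = None
  with open(fasta_filename) as inf:
    for line in inf:
      line = line.rstrip()
      if line.startswith('>'):
        name = line[1:].split()[0]
        if name in reads:
          sys.stderr.write("ERROR: read name "+name+" is repeated\n")
          sys.exit(1)
        reads[name] = ''
      elif name is not None:
        reads[name] += line
  return reads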
def main():
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  cpus = multiprocessing.cpu_count()
  parser.add_argument('--ref_genome',required=True,help='(required) FASTA filename of reference genome')
  parser.add_argument('--bwa_index',required=True,help='(required) BWA Index')
  #parser.add_argument('--max_mismatches',type=int,default=2,help='INT maximum number of allowed mismatches')
  parser.add_argument('--min_read_size',type=int,default=30,help='INT minimum read size to consider')
  parser.add_argument('--test_size',type=int,default=5000,help='INT number of sequences to test')
  parser.add_argument('--min_test_size',type=int,default=500,help='INT disregard any parameter sets that do not produce at least this number of sequences prior to mapping.')
  parser.add_argument('--left_trim_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--right_trim_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--quality_number_range',help='start:end:increment, default is [qual_min]:[qual_max]:5')
  parser.add_argument('--quality_fail_count_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--mapped_mismatch_range',help='start:end:increment, default is 0:3:1')
  parser.add_argument('--ignore_mapped_mismatches',action='store_true')
  parser.add_argument('--ignore_quality',action='store_true')
  parser.add_argument('--threads',type=int,default=cpus,help='INT number of threads to run; defaults to cpu_count')
  parser.add_argument('--tempdir',default='/tmp/',help='Your preferred temporary directory')
  parser.add_argument('-o',help='FILENAME for output')
  parser.add_argument('fastq_file',help='FILENAME for fastq file (can be .gz)')
  args = parser.parse_args()
  maxcnt = args.test_size
  mincnt = args.min_test_size
  sys.stderr.write("Testing up to "+str(maxcnt)+" reads.\n")
  sys.stderr.write("Require parameters leave at least "+str(mincnt)+" reads.\n")
  #max_allowed_mismatches = args.max_mismatches
  #sys.stderr.write("Allowing up to "+str(max_allowed_mismatches)+" mismatches.\n")
  #max_end_mismatches = 2
  min_read_size = args.min_read_size
  sys.stderr.write("Requiring QC parameters produce a minimum read length of "+str(min_read_size)+"\n")
  man = multiprocessing.Manager()
  Q = man.Queue()
  ifile = args.bwa_index 
  sys.stderr.write("BWA index: "+ifile+"\n")
  refgenome = args.ref_genome
  sys.stderr.write("Ref Genome: "+refgenome+"\n")
  #ifile = '/Shared/Au/jason/Reference/UCSC/Human/hg19_GRCh37_feb2009/BWA_Index/genome.fa'
  #refgenome = '/Users/weirathe/jason/Reference/UCSC/Human/hg19_GRCh37_feb2009/Genome/genome.fa'
  #refgenome = 'test_ref.fa'
  if args.threads: cpus = args.threads
  sys.stderr.write("Using "+str(cpus)+" threads\n")
  sys.stderr.write("reading reference genome\n")
  g = read_fasta_into_hash(refgenome)
  gz = {}
  cQ = man.Queue()
  pc = multiprocessing.Pool(processes=cpus)
  cresults = []
  sys.stderr.write("compressing reference genome\n")
  for name in g:
    pc.apply_async(comp,[name,g[name],cQ,len(g)])
  pc.close()
  pc.join()
  sys.stderr.write("\n")
  while not cQ.empty():
    [name,zseq] = cQ.get()
    gz[name] = zseq

  sys.stderr.write("finished processing reference genome\n")

  #[entries,stats] = read_fastq('test3.fq',maxcnt)
  [entries,stats] = read_fastq(args.fastq_file,maxcnt)

  #tstart = '/tmp'
  tstart = args.tempdir.rstrip('/')
  tdir = tstart.rstrip('/')+'/'+'weirathe.'+str(randint(1,100000000))
  if not os.path.exists(tdir): os.makedirs(tdir)
  z = 0
  #max_l_trim = 20
  #max_r_trim = 20
  max_l_trim = stats['lenmax']
  max_r_trim = stats['lenmax']
  min_l_trim = 0
  min_r_trim = 0
  l_trim_iter = 5
  r_trim_iter = 5
  if args.left_trim_range:
    m = re.match('(\d+):(\d+):(\d+)',args.left_trim_range)
    if not m:
      sys.stderr.write("Error. malformed left trim range "+args.left_trim_range+"\n")
      return
    max_l_trim = int(m.group(2))
    min_l_trim = int(m.group(1))
    l_trim_iter = int(m.group(3))
  if args.right_trim_range:
    m = re.match('(\d+):(\d+):(\d+)',args.right_trim_range)
    if not m:
      sys.stderr.write("Error. malformed right trim range "+args.right_trim_range+"\n")
      return
    max_r_trim = int(m.group(2))
    min_r_trim = int(m.group(1))
    r_trim_iter = int(m.group(3))

  max_q_num = stats['qmax']
  max_q_fail = stats['lenmax']
  min_q_num = stats['qmin']
  min_q_fail = 0
  q_num_iter = 5
  q_fail_iter = 5

  if args.quality_number_range:
    m = re.match('(\d+):(\d+):(\d+)',args.quality_number_range)
    if not m:
      sys.stderr.write("Error. malformed quality number range "+args.quality_number_range+"\n")
      return
    max_q_num = int(m.group(2))
    min_q_num = int(m.group(1))
    q_num_iter = int(m.group(3))
  if args.quality_fail_count_range:
    m = re.match('(\d+):(\d+):(\d+)',args.quality_fail_count_range)
    if not m:
      sys.stderr.write("Error. malformed quality number range "+args.quality_fail_count_range+"\n")
      return
    max_q_fail = int(m.group(2))
    min_q_fail = int(m.group(1))
    q_fail_iter = int(m.group(3))

  if args.ignore_quality:
    max_q_fail = stats['lenmax']
    min_q_fail = stats['lenmax']
    q_fail_iter = 1
    max_q_num = stats['qmax']
    min_q_num = stats['qmax']
    q_num_iter = 1

  max_mismatch = 3
  min_mismatch = 0
  mismatch_iter = 1

  if args.mapped_mismatch_range:
    m = re.match('(\d+):(\d+):(\d+)',args.mapped_mismatch_range)
    if not m:
      sys.stderr.write("Error. malformed mapped mismatch tolerance range "+args.mapped_mismatch_range+"\n")
      return
    max_mismatch = int(m.group(2))
    min_mismatch = int(m.group(1))
    mismatch_iter = int(m.group(3))

  if args.ignore_mapped_mismatches:
    min_mismatch = stats['lenmax']
    max_mismatch = stats['lenmax']
    mismatch_iter = 1

  flist = []
  run_params = {}
  run_stats = {}
  sys.stderr.write("Left trim search space: "+str(min_l_trim)+":"+str(min([stats['lenmax'],max_l_trim]))+":"+str(l_trim_iter)+"\n")
  sys.stderr.write("Right trim search space: "+str(min_r_trim)+":"+str(min([stats['lenmax'],max_r_trim]))+":"+str(r_trim_iter)+"\n")
  sys.stderr.write("Quality number search space: "+str(max(min_q_num,stats['qmin']))+":"+str(min(max_q_num,stats['qmax']))+":"+str(q_num_iter)+"\n")
  sys.stderr.write("Quality fail count search space: "+str(min_q_fail)+":"+str(min(stats['lenmax'],max_q_fail))+":"+str(q_fail_iter)+"\n")
  sys.stderr.write("Max mapped mismatch search space: "+str(min_mismatch)+":"+str(min(stats['lenmax'],max_mismatch))+":"+str(mismatch_iter)+"\n")
  for l_cut in range(min_l_trim,min([stats['lenmax'],max_l_trim])+1,l_trim_iter):
   for r_cut in range(min_r_trim,min([stats['lenmax'],max_r_trim])+1,r_trim_iter):
    for q_floor in range(max(min_q_num,stats['qmin']),min(max_q_num,stats['qmax'])+1,q_num_iter):
     for failure_limit in range(min(min_q_fail,stats['lenmax']-l_cut-r_cut),min(stats['lenmax']-l_cut-r_cut,max_q_fail)+1,q_fail_iter):
      for max_allowed_mismatches in range(min_mismatch,max_mismatch+1,mismatch_iter):
       z += 1
       run_params[z] = {}
       run_params[z]['l_cut'] = l_cut
       run_params[z]['r_cut'] = r_cut
       run_params[z]['q_floor'] = q_floor
       run_params[z]['failure_limit'] = failure_limit
       run_params[z]['max_allowed_mismatches'] = max_allowed_mismatches
       run_stats[z] = {}
       run_stats[z]['after_qc_reads'] = 0
       run_stats[z]['after_qc_bases'] = 0
       of = open(tdir+'/'+str(z)+'.fq','w')
       k = 0
       scnt = 0
       for e in entries:
         seq = e['seq']
         seq = left_trim(seq,l_cut)
         seq = right_trim(seq,r_cut)
         qual = e['quality']
         qual = left_trim(qual,l_cut)
         qual = right_trim(qual,r_cut)
         if len(seq) < min_read_size: continue
         failure_count = 0
         for i in range(0,len(qual)):
           if seq[i].upper() == 'N': failure_count += 1
           elif ord(qual[i]) < q_floor: failure_count += 1
         if failure_count > failure_limit: continue
         k+=1
         scnt += 1
         run_stats[z]['after_qc_reads'] += 1
         run_stats[z]['after_qc_bases'] += len(seq)
         of.write("@s_"+str(k)+"\n")
         of.write(seq+"\n")
         of.write('+'+"\n")
         of.write(qual+"\n")
       of.close()
       if scnt < mincnt: #how many sequences were left after filtering, make sure we have enough to care
         os.remove(tdir+'/'+str(z)+'.fq')
       else:
         flist.append(z)
  sys.stderr.write("total of "+str(len(flist))+" params\n")
  p = multiprocessing.Pool(processes=cpus)
  results = []
  for z in flist:
    p.apply_async(check_parameters,(z,gz,ifile,tdir,run_params[z]['max_allowed_mismatches'],Q,len(flist)))
    #check_parameters(z,gz,ifile,tdir,max_end_mismatches,max_allowed_mismatches,Q)
    #print str(map_bases) + "\t" + str(map_reads)
  p.close()
  p.join()
  sys.stderr.write("\n")
  run_results = {}
  while True:
    if Q.empty(): break
    [z, reads, bases] = Q.get()
    #[z, reads, bases] = result
    run_results[z] = {}
    run_results[z]['after_mapped_reads'] = reads    
    run_results[z]['after_mapped_bases'] = bases    

  header = "left_cut_count\tright_cut_count\tmin_quality_value\tmax_quality_failure_count\tmax_mapped_mismatch_count\toriginal_read_count\toriginal_base_count\tpost_qc_read_count\tpost_qc_base_count\tmapped_reads\tmapped_bases"
  if args.o:
    of = open(args.o,'w')
    of.write(header+"\n")
  else:
    print header
  for z in sorted(run_results.keys()):
    ostring =  str(run_params[z]['l_cut']) + "\t" + str(run_params[z]['r_cut']) + "\t" + \
               str(run_params[z]['q_floor']) + "\t" + str(run_params[z]['failure_limit']) + "\t"
    ostring += str(run_params[z]['max_allowed_mismatches']) + "\t"
    ostring += str(stats['readcount']) + "\t" + str(stats['basecount']) + "\t"
    ostring += str(run_stats[z]['after_qc_reads']) + "\t" + str(run_stats[z]['after_qc_bases']) + "\t" 
    ostring += str(run_results[z]['after_mapped_reads']) + "\t" + str(run_results[z]['after_mapped_bases']) + "\t" 
    if args.o:
      of.write(ostring+"\n")
    else:
      print ostring
  if args.o:
    of.close()
  rmtree(tdir)
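# comp() above is assumed to compress one chromosome in a worker and report back
# through the shared queue; a plausible sketch (zlib is an assumption here, any
# compressor that round-trips a sequence string would do):
import sys, zlib
def comp(name, seq, q, total):
  q.put([name, zlib.compress(seq.encode('ascii'))])
  sys.stderr.write(".")  # one tick per finished chromosome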
def load_from_inputs(args):
  #Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  #with open(args.phased_VCF) as inf:
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      if vcf.value('chrom') not in alleles:
        alleles[vcf.value('chrom')] = {}
      if vcf.value('pos') in alleles[vcf.value('chrom')]:
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      alleles[vcf.value('chrom')][vcf.value('pos')] = g # set our left and right

  sys.stderr.write("Reading in the reference genome\n")
  #ref = read_fasta_into_hash(args.reference_genome)
  ref = read_fasta_into_hash(args.inputs[0])
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intensive, so don't go with all possible threads
  if args.threads > 1: p = Pool(processes=max(1,int(args.threads/4)))
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      r1q = Queue()
      r1q.put([0,chrom,ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0,chrom,ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom)))
      res2.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom)))
    else:
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()

  # now we can fill reference 1 with all our new sequences
  ref1 = {} 
  c1 = 0
  for i in range(0,len(res1)):
    res = res1[i].get()
    c1 += res[0]
    ref1[res[1]]=res[2]

  # now we can fill reference 2 with all our new sequences
  ref2 = {} 
  c2 = 0
  for i in range(0,len(res2)):
    res = res2[i].get()
    c2 += res[0]
    ref2[res[1]]=res[2]
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")

  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {}
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  #with open(args.transcripts_genepred) as inf:
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0]=='#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we don't really need the references anymore
  # Empty our big memory things
  txn1.ref_hash = None
  txn2.ref_hash = None
  for chrom in ref1.keys():  del ref1[chrom]
  for chrom in ref2.keys():  del ref2[chrom]
  for chrom in ref.keys():  del ref[chrom]

  if not args.locus_by_gene_name:
    #[locus2name,name2locus] = get_loci(args.transcripts_genepred)
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else: # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}
    m = 0
    for name in sorted(gpdnames): 
      gene = gpdnames[name]
      if gene not in numname:
        m+=1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with "+str(len(locus2name.keys()))+" loci\n")

  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()
      for line in inf:
        cuffz +=1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning isoform expression not sepcified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {} # The ASE of allele 1 (the left side)
  randos = {}
  if args.seed:
    random.seed(args.seed)
  for z in locus2name: randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Lets set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else: # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  #Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
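# Hedged illustration (hypothetical helper) of the rho assignment above:
# --ASE_identical gives every transcript the same allele-1 fraction,
# --ASE_isoform_random draws one per transcript, and otherwise all
# transcripts in a locus share a single random draw.
import random
def assign_rhos(tnames, name2locus, mode, fixed=0.5, seed=None):
  if seed is not None: random.seed(seed)
  locus_draws = {}
  rhos = {}
  for tname in sorted(tnames):
    if mode == 'identical':
      rhos[tname] = fixed
    elif mode == 'isoform_random':
      rhos[tname] = random.random()
    else:  # locus random: one shared draw per locus
      locus = name2locus[tname]
      if locus not in locus_draws: locus_draws[locus] = random.random()
      rhos[tname] = locus_draws[locus]
  return rhos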
Example #16
def main():
    parser = argparse.ArgumentParser(
        description="splice together partial alignments")
    group1 = parser.add_mutually_exclusive_group(required=True)
    group1.add_argument('--fastq_reads')
    group1.add_argument('--fasta_reads')
    parser.add_argument('--genome',
                        help="FASTA reference genome",
                        required=True)
    parser.add_argument('--genepred', help="Transcriptome genepred")
    parser.add_argument('--max_intron_size',
                        type=int,
                        default=100000,
                        help="INT maximum intron size")
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT minimum intron size")
    parser.add_argument('--max_gap_size',
                        type=int,
                        default=10,
                        help="INT gap size in query to join")
    parser.add_argument(
        '--max_search_expand',
        type=int,
        default=10,
        help="INT max search space to expand search for junction")
    parser.add_argument(
        '--direction_specific',
        action='store_true',
        help=
        "The direction of the transcript is known and properly oriented already"
    )
    parser.add_argument('--threads',
                        type=int,
                        default=0,
                        help="INT number of threads to use default cpu_count")
    parser.add_argument(
        '-o',
        '--output',
        default='-',
        help=
        "FILENAME output results to here rather than STDOUT which is default")
    parser.add_argument('input_alignment',
                        help="FILENAME input .psl file or '-' for STDIN")
    args = parser.parse_args()

    # Read our reference genome
    sys.stderr.write("Reading reference\n")
    ref = read_fasta_into_hash(args.genome)

    # Make sure our reads are unique
    sys.stderr.write("Checking for unqiuely named reads\n")
    reads = check_for_uniquely_named_reads(
        args)  # does a hard exit and error if there are any names repeated
    sys.stderr.write("Reads are uniquely named\n")

    # Set number of threads to use
    cpu_count = multiprocessing.cpu_count()
    if args.threads > 0:
        cpu_count = args.threads

    #Set reference splices (if any are available)
    reference_splices = {}
    if args.genepred:
        sys.stderr.write("Reading reference splices from genepred\n")
        reference_splices = get_reference_splices(args)

    sys.stderr.write("Reading alignments into loci\n")

    # Get locus division (first stage)
    # Each read (qName) is separated
    # Then each locus is specific to a chromosome (tName)
    # Then by (strand), but keep in mind the strand is based on the read
    # Each locus should be specific to a direction, but we don't necessarily
    # know the direction from the data we have thus far.
    inf = sys.stdin
    if args.input_alignment != '-': inf = open(args.input_alignment, 'r')
    loci = {}
    for line in inf:
        line = line.rstrip()
        if re.match('^#', line): continue
        psl = PSLBasics.line_to_entry(line)
        qn, tn, strand = psl['qName'], psl['tName'], psl['strand']
        start0 = psl['tStarts'][0]
        loci.setdefault(qn, {}).setdefault(tn, {}) \
            .setdefault(strand, {}).setdefault(start0, []).append(psl)

    sys.stderr.write("breaking loci by genomic distance\n")
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                #print qname + "\t" + chr + "\t" + strand
                starts = loci[qname][chr][strand].keys()
                current_set = []
                locus_sets = []
                last_end = -1 * (args.max_intron_size + 2)
                for start in sorted(starts):
                    for e in loci[qname][chr][strand][start]:
                        astart = e['tStarts'][0] + 1  # base-1 start of the alignment
                        if astart > last_end + args.max_intron_size:
                            # we have the start of a new set
                            if len(current_set) > 0:
                                locus_sets.append(current_set)
                            current_set = []
                        last_end = e['tStarts'][-1] + e['blockSizes'][-1]
                        current_set.append(e)
                if len(current_set) > 0:
                    locus_sets.append(current_set)
                loci[qname][chr][strand] = locus_sets  # replace what was there with these ordered sets

    locus_total = 0
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                for locus_set in loci[qname][chr][strand]:
                    locus_total += 1

    sys.stderr.write("Work on each read in each locus with " + str(cpu_count) +
                     " CPUs\n")
    p = multiprocessing.Pool(processes=cpu_count)
    locus_count = 0
    for qname in loci:
        for chr in loci[qname]:
            for strand in loci[qname][chr]:
                #print qname + "\t" + chr + "\t" + strand
                for locus_set in loci[qname][chr][strand]:
                    locus_count += 1
                    onum = len(locus_set)
                    # send blank reference splices unless we have some
                    rsplices = {}
                    if chr in reference_splices:
                        rsplices = reference_splices[chr]
                    #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback)
                    r1 = execute_locus(locus_set, args, rsplices, ref[chr],
                                       reads[qname], locus_total, locus_count)
                    do_locus_callback(r1)
                    #nnum = len(new_locus_set)
                    #print str(onum) + " to " + str(nnum)
                    #for e in new_locus_set:
                    #  print PSLBasics.entry_to_line(e)
    p.close()
    p.join()
    sys.stderr.write("\nfinished\n")

    ofh = sys.stdout
    if not args.output == '-':
        ofh = open(args.output, 'w')

    # combo_results is assumed to be a module-level list populated by do_locus_callback
    for line in combo_results:
        ofh.write(line)
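# A standalone sketch of the locus-splitting rule applied above: entries
# ordered by target start open a new set whenever the next start lies
# more than max_intron_size beyond the previous alignment's end
# (alignments are simplified here to (start, end) tuples, base-1).
def split_by_distance(entries, max_intron_size):
    locus_sets = []
    current_set = []
    last_end = -1 * (max_intron_size + 2)
    for (start, end) in entries:
        if start > last_end + max_intron_size:
            if len(current_set) > 0:
                locus_sets.append(current_set)
            current_set = []
        last_end = end
        current_set.append((start, end))
    if len(current_set) > 0:
        locus_sets.append(current_set)
    return locus_sets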
def main():
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    cpus = multiprocessing.cpu_count()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    #read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    #lets sort entries by chromosome
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1: continue
        if e['chrom'] not in ref:
            ref[e['chrom']] = {}
        for i in range(1, len(e['exonStarts'])):
            donor_end = e['exonEnds'][i - 1]
            acceptor_start = e['exonStarts'][i] + 1
            if donor_end not in ref[e['chrom']]:
                ref[e['chrom']][donor_end] = {}
            if acceptor_start not in ref[e['chrom']][donor_end]:
                ref[e['chrom']][donor_end][acceptor_start] = e['strand']
    #Stored all junctions as 1-based coordinates

    read_info = {}
    pf = GenericFileReader(args.psl)
    fcount_total = 0
    while True:
        line = pf.readline()
        if not line: break
        if re.match('^#', line): continue
        line = line.rstrip()
        pe = PSLBasics.line_to_entry(line)
        if len(pe['tStarts']) != len(pe['blockSizes']) or len(
                pe['qStarts']) != len(pe['blockSizes']):
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        refjuns = {}
        if pe['tName'] in ref: refjuns = ref[pe['tName']]
        new_ge = nudge(pe, ge, refjuns, args)
        if args.output_fake_psl:
            new_psl_line = GenePredBasics.entry_to_fake_psl_line(
                new_ge, genome)
            print new_psl_line
        else:
            print GenePredBasics.entry_to_line(new_ge)
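# Minimal sketch of the junction lookup the nudge step relies on:
# reference junctions were stored above as
# ref[chrom][donor_end][acceptor_start] = strand (1-based), so testing
# whether a junction is known is two dictionary probes.
def has_reference_junction(ref, chrom, donor_end, acceptor_start):
    return (chrom in ref and donor_end in ref[chrom]
            and acceptor_start in ref[chrom][donor_end])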
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--only_output_alternates',action='store_true',help='When selected, the original coordinates are not output, and only the alternates are output')
  parser.add_argument('--long_form', action='store_true',help="add an additional column to the beginning of the output indicating whether it is an original or alternate splice coordinate")
  parser.add_argument('GenomeFastaFile',nargs=1,help="FILENAME Fasta format file of the reference genome")
  parser.add_argument('SpliceSiteFile',nargs=1,help="FILENAME Splice Site file is in tsv format with <Left chrom> <Left coord (base-1)> <Left dir [+-]> <Right chrom> <Right coord (base-1)> <Right dir [+-]>\nWhere the coordinates indicate the base that is inside the exon proximal to the splice.  Direction indicates the transcription direction on the chromosome for that side of the splice.  For coordinates, 1-base means that the number 1 is the first base of the sequence (makes sense to do it that way, right? :P)")
  of = sys.stdout
  args = parser.parse_args()
  golds = []
  with open(args.SpliceSiteFile[0]) as inf:
    for line in inf:
      f = line.rstrip().split()
      t = {}
      t['l'] = {}
      t['r'] = {}
      t['l']['chr'] = f[0]
      t['l']['coord'] = int(f[1])
      t['l']['dir'] = f[2]
      t['r']['chr'] = f[3]
      t['r']['coord'] = int(f[4])
      t['r']['dir'] = f[5]
      golds.append(t)

  ref = read_fasta_into_hash(args.GenomeFastaFile[0])
  lens = {}
  for chr in ref:
    lens[chr] = len(ref[chr])
  for g in golds:
    l_chrom = g['l']['chr']
    r_chrom = g['r']['chr']
    l_start = g['l']['coord']
    r_start = g['r']['coord']
    l_dir = g['l']['dir']
    r_dir = g['r']['dir']
    # print the main case
    if not args.only_output_alternates:
      startstring = ''
      if args.long_form: startstring = "original\t"
      of.write(startstring+l_chrom + "\t" + str(l_start) + "\t" + l_dir + "\t" + r_chrom + "\t" + str(r_start) + "\t" + r_dir+"\n")
    #check upstream left
    l_base = l_start
    r_base = r_start
    while True:  # extend as long as the shifted splice stays equivalent
      left_bases = ''
      right_bases = ''
      if l_dir == '+':
        l_base -= 1
        if l_base < 1: break
        left_bases = str(ref[l_chrom][l_base])
      else:
        l_base += 1
        if l_base > lens[l_chrom]: break
        left_bases = rc(str(ref[l_chrom][l_base-2]))
      if r_dir == '+':
        r_base -= 1
        if r_base < 1: break
        right_bases = str(ref[r_chrom][r_base-1])
      else:
        r_base += 1
        if r_base > lens[r_chrom]: break
        right_bases = rc(str(ref[r_chrom][r_base-1]))
      if left_bases != right_bases: break
      startstring = ''
      if args.long_form: startstring = "alternate\t"
      of.write(startstring+l_chrom + "\t" + str(l_base) + "\t" + l_dir + "\t" + r_chrom + "\t" + str(r_base) + "\t" + r_dir+"\n")
    #check downstream left
    l_base = l_start
    r_base = r_start
    while True:  # extend as long as the shifted splice stays equivalent
      left_bases = ''
      right_bases = ''
      if l_dir == '+':
        l_base += 1
        if l_base > lens[l_chrom]: break
        left_bases = str(ref[l_chrom][l_base-1])
      else:
        l_base -= 1
        if l_base < 1: break
        left_bases = rc(str(ref[l_chrom][l_base-1]))
      if r_dir == '+':
        r_base += 1
        if r_base > lens[r_chrom]: break
        right_bases = str(ref[r_chrom][r_base-2])
      else:
        r_base -= 1
        if r_base < 1: break
        right_bases = rc(str(ref[r_chrom][r_base]))
      if left_bases != right_bases: break
      startstring = ''
      if args.long_form: startstring = "alternate\t"
      of.write(startstring+l_chrom + "\t" + str(l_base) + "\t" + l_dir + "\t" + r_chrom + "\t" + str(r_base) + "\t" + r_dir+"\n")
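# rc() above is assumed to be a reverse-complement helper defined
# elsewhere in this module; a minimal sketch consistent with that usage:
def rc(seq):
  comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
          'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}
  return ''.join([comp.get(b, 'N') for b in reversed(seq)])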
def load_from_inputs(args):
    #Read in the VCF file
    sys.stderr.write("Reading in the VCF file\n")
    alleles = {}
    #with open(args.phased_VCF) as inf:
    with open(args.inputs[1]) as inf:
        for line in inf:
            vcf = VCF(line)
            if not vcf.is_snp(): continue
            g = vcf.get_phased_genotype()
            if not g: continue
            if vcf.value('chrom') not in alleles:
                alleles[vcf.value('chrom')] = {}
            if vcf.value('pos') in alleles[vcf.value('chrom')]:
                sys.stderr.write("WARNING: seeing the same position twice.\n" +
                                 line.rstrip() + "\n")
            alleles[vcf.value('chrom')][vcf.value('pos')] = g  # set our left and right

    sys.stderr.write("Reading in the reference genome\n")
    #ref = read_fasta_into_hash(args.reference_genome)
    ref = read_fasta_into_hash(args.inputs[0])
    res1 = []
    res2 = []
    p = None
    sys.stderr.write("Introducing VCF changes to reference sequences\n")
    # Pretty memory intensive, so don't use all possible threads
    if args.threads > 1: p = Pool(processes=max(1, int(args.threads / 4)))
    for chrom in ref:
        # handle the case where there is no allele information
        if chrom not in alleles:
            r1q = Queue()
            r1q.put([0, chrom, ref[chrom]])
            res1.append(r1q)
            r2q = Queue()
            r2q.put([0, chrom, ref[chrom]])
            res2.append(r2q)
        elif args.threads > 1:
            res1.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 0, chrom)))
            res2.append(
                p.apply_async(adjust_reference_genome,
                              args=(alleles[chrom], ref[chrom], 1, chrom)))
        else:
            r1q = Queue()
            r1q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
            res1.append(r1q)
            r2q = Queue()
            r2q.put(
                adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
            res2.append(r2q)
    if args.threads > 1:
        p.close()
        p.join()

    # now we can fill reference 1 with all our new sequences
    ref1 = {}
    c1 = 0
    for i in range(0, len(res1)):
        res = res1[i].get()
        c1 += res[0]
        ref1[res[1]] = res[2]

    # now we can fill reference 2 with all our new sequences
    ref2 = {}
    c2 = 0
    for i in range(0, len(res2)):
        res = res2[i].get()
        c2 += res[0]
        ref2[res[1]] = res[2]
    sys.stderr.write("Made " + str(c1) + "|" + str(c2) +
                     " changes to the reference\n")

    # Now ref1 and ref2 are the diploid sources of the transcriptome
    gpdnames = {}
    txn1 = Transcriptome()
    txn2 = Transcriptome()
    txn1.set_reference_genome_dictionary(ref1)
    txn2.set_reference_genome_dictionary(ref2)
    #with open(args.transcripts_genepred) as inf:
    with open(args.inputs[2]) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn1.add_genepred_line(line.rstrip())
            txn2.add_genepred_line(line.rstrip())
            gpd = GenePredEntry(line.rstrip())
            gpdnames[gpd.value('name')] = gpd.value('gene_name')
    # The transcriptomes are set but we don't really need the references anymore
    # Empty our big memory things
    txn1.ref_hash = None
    txn2.ref_hash = None
    for chrom in ref1.keys():
        del ref1[chrom]
    for chrom in ref2.keys():
        del ref2[chrom]
    for chrom in ref.keys():
        del ref[chrom]

    if not args.locus_by_gene_name:
        #[locus2name,name2locus] = get_loci(args.transcripts_genepred)
        [locus2name, name2locus] = get_loci(args.inputs[2])
    else:  # set locus by gene name
        sys.stderr.write("Organizing loci by gene name\n")
        locus2name = {}
        name2locus = {}
        numname = {}
        m = 0
        for name in sorted(gpdnames):
            gene = gpdnames[name]
            if gene not in numname:
                m += 1
                numname[gene] = m
            num = numname[gene]
            if num not in locus2name:
                locus2name[num] = set()
            locus2name[num].add(name)
            name2locus[name] = num
        sys.stderr.write("Ended with " + str(len(locus2name.keys())) +
                         " loci\n")

    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()
            for line in inf:
                f = line.rstrip().split("\t")
                txn1.add_expression(f[0], float(f[1]))
                txn2.add_expression(f[0], float(f[1]))
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        cuffz = 0
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()
            for line in inf:
                cuffz += 1
                sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
                f = line.rstrip().split("\t")
                txn1.add_expression_no_update(f[0], float(f[9]))
                txn2.add_expression_no_update(f[0], float(f[9]))
        txn1.update_expression()
        txn2.update_expression()
        sys.stderr.write("\n")
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    else:
        sys.stderr.write(
            "Warning: isoform expression not specified, using uniform expression model.\n"
        )
    # Now we have the transcriptomes set
    rhos = {}  # The ASE of allele 1 (the left side)
    randos = {}
    if args.seed:
        random.seed(args.seed)
    for z in locus2name:
        randos[z] = random.random()
    sys.stderr.write("Setting rho for each transcript\n")
    # Lets set rho for ASE for each transcript
    for tname in sorted(txn1.transcripts):
        if args.ASE_identical or args.ASE_identical == 0:
            rhos[tname] = float(args.ASE_identical)
        elif args.ASE_isoform_random:
            rhos[tname] = random.random()
        else:  # we must be on locus random
            rhos[tname] = randos[name2locus[tname]]
    #Now our dataset is set up
    rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
    rbe.gene_names = gpdnames
    rbe.name2locus = name2locus
    rbe.set_transcriptome1_rho(rhos)
    return rbe
Example #20
def main():
    parser = argparse.ArgumentParser(
        description="For every genepred entry report its alignability",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Genepred can be gzipped or - for STDIN")
    parser.add_argument('-r',
                        '--reference',
                        required=True,
                        help="Reference fasta")
    parser.add_argument('-k',
                        '--fragment_size',
                        default=100,
                        type=int,
                        help="Fragment size to try to align")
    parser.add_argument('-x',
                        '--hisat_index',
                        required=True,
                        help="HISAT index base name")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="number of threads")
    parser.add_argument('--type',
                        choices=['mean', 'median'],
                        default='mean',
                        help="How to bring together overlapping reads")
    parser.add_argument('--perbase', action='store_true')
    parser.add_argument('--output',
                        '-o',
                        help="output file or leave unset for STDOUT")
    args = parser.parse_args()

    if args.input == '-': args.input = sys.stdin
    elif re.search('\.gz$', args.input):
        args.input = gzip.open(args.input)
    else:
        args.input = open(args.input)

    udir = os.path.dirname(os.path.realpath(__file__))
    cmd2 = udir + '/genepred_counts_to_mappability.py -'
    cmd2 += ' --threads ' + str(args.threads)
    cmd2 += ' -k ' + str(args.fragment_size)
    if args.perbase: cmd2 += ' --perbase'
    if args.output: cmd2 += ' --output ' + args.output
    if args.type: cmd2 += ' --type ' + args.type
    p2 = Popen(cmd2.split(), stdin=PIPE)
    ref = read_fasta_into_hash(args.reference)
    cmd1 = 'hisat -x ' + args.hisat_index + ' -U - -f --reorder -p ' + str(
        args.threads)
    # 'null' is assumed to be a module-level handle on os.devnull defined earlier
    p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, stderr=null)
    #p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin)
    line_number = 0
    for line in args.input:
        line_number += 1
        gpd = GPD(line.rstrip())
        #print gpd.entry['name']
        #print gpd.length()
        if gpd.length() < args.fragment_size: continue
        seq = gpd.get_sequence(ref)
        for i in range(0, len(seq) - args.fragment_size + 1):
            info = gpd.value('name') + "\t" + gpd.value(
                'gene_name') + "\t" + str(line_number) + "\t" + str(
                    len(seq)) + "\t" + str(i)
            einfo = encode_name(info)
            p1.stdin.write('>' + einfo + "\n")
            p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
    p1.communicate()
    p2.communicate()
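# encode_name() above is assumed to be defined elsewhere in this module;
# a plausible sketch is URL-safe base64, which keeps the tab-delimited
# metadata intact as a single FASTA header token (the downstream script
# would decode with the matching inverse).
import base64
def encode_name(info):
    return base64.urlsafe_b64encode(info)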