예제 #1
0
def main():
  """Filter genepred entries by transcript length and/or name lists.

  Reads genepred lines from the positional ``input`` file (or STDIN when it
  is '-'), skips lines starting with '#', and writes each surviving line to
  STDOUT. An entry is "good" when it passes every filter that was supplied:

    --min_length / --max_length : bounds on ``GPD.length()``
    --names                     : file whose first tab-separated column
                                  lists allowed ``name`` values
    --gene_names                : same, for ``gene_name`` values

  With -v/--invert only the entries that are NOT good are printed.
  """
  parser = argparse.ArgumentParser(description="Filter a genepred by transcript length")
  # '-' selects sys.stdin below, so the help text says STDIN (it used to say STDOUT).
  parser.add_argument('input',help="Input '-' for STDIN")
  parser.add_argument('--min_length',type=int,help="Minimum transcript length")
  parser.add_argument('--max_length',type=int,help="Maximum transcript length")
  parser.add_argument('--names',help="filter on a name list")
  parser.add_argument('--gene_names',help="filter on a gene name list")
  parser.add_argument('-v','--invert',action='store_true',help='Invert search result')
  args = parser.parse_args()

  def read_first_column(path):
    # Collect the first tab-separated field of every line of `path`.
    values = set()
    with open(path) as handle:
      for entry in handle:
        values.add(entry.rstrip().split("\t")[0])
    return values

  name_list = read_first_column(args.names) if args.names else set()
  gene_name_list = read_first_column(args.gene_names) if args.gene_names else set()

  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  try:
    for line in inf:
      if line.startswith('#'): continue
      g = GPD(line.rstrip())
      tot = g.length()
      is_good = True
      if args.min_length and tot < args.min_length:
        is_good = False
      if args.max_length and tot > args.max_length:
        is_good = False
      if args.names and g.value('name') not in name_list:
        is_good = False
      # Fixed: the set lives in the local gene_name_list; argparse never
      # defines args.gene_name_list, so the old lookup raised AttributeError.
      if args.gene_names and g.value('gene_name') not in gene_name_list:
        is_good = False
      # Print good entries normally, bad entries when inverted.
      if is_good != args.invert:
        sys.stdout.write(line.rstrip()+"\n")
  finally:
    # Only close handles we opened ourselves.
    if inf is not sys.stdin:
      inf.close()
def main():
  """Report alignability for every genepred entry.

  Starts two child processes and feeds them from this one:

    this script --FASTA--> hisat --stdout--> genepred_counts_to_mappability.py

  For each genepred entry whose ``length()`` is at least ``--fragment_size``,
  every ``fragment_size``-long window of its sequence is written to hisat as
  a FASTA record. The record name is produced by ``encode_name`` from the
  tab-joined entry name, gene name, input line number, sequence length and
  window offset.
  """
  parser = argparse.ArgumentParser(description="For every genepred entry report its alignability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input',help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r','--reference',required=True,help="Reference fasta")
  parser.add_argument('-k','--fragment_size',default=100,type=int,help="Fragment size to try to align")
  parser.add_argument('-x','--hisat_index',required=True,help="HISAT index base name")
  parser.add_argument('--threads',type=int,default=cpu_count(),help="number of threads")
  parser.add_argument('--type',choices=['mean','median'],default='mean',help="How to bring together overlapping reads")
  parser.add_argument('--perbase',action='store_true')
  parser.add_argument('--output','-o',help="output file or leave unset for STDOUT")
  args = parser.parse_args()

  if args.input=='-': args.input=sys.stdin
  elif args.input.endswith('.gz'):
    args.input = gzip.open(args.input)
  else: args.input = open(args.input)

  # Build argv lists directly instead of splitting a joined string, so that
  # paths or index names containing whitespace stay a single argument.
  udir = os.path.dirname(os.path.realpath(__file__))
  cmd2 = [udir+'/genepred_counts_to_mappability.py','-']
  cmd2 += ['--threads',str(args.threads)]
  cmd2 += ['-k',str(args.fragment_size)]
  if args.perbase: cmd2.append('--perbase')
  if args.output: cmd2 += ['--output',args.output]
  if args.type: cmd2 += ['--type',args.type]
  p2 = Popen(cmd2,stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  cmd1 = ['hisat','-x',args.hisat_index,'-U','-','-f','--reorder','-p',str(args.threads)]
  # NOTE(review): `null` is not defined in this function; presumably a
  # module-level writable handle used to discard hisat's stderr - confirm.
  p1 = Popen(cmd1,stdin=PIPE,stdout=p2.stdin,stderr=null)
  try:
    line_number = 0
    for line in args.input:
      line_number +=1
      gpd = GPD(line.rstrip())
      if gpd.length() < args.fragment_size: continue
      seq = gpd.get_sequence(ref)
      window_count = len(seq)-args.fragment_size+1
      if window_count <= 0: continue
      # Everything except the window offset is constant for this entry.
      prefix = gpd.value('name')+"\t"+gpd.value('gene_name')+"\t"+str(line_number)+"\t"+str(len(seq))+"\t"
      for i in range(0,window_count):
        einfo = encode_name(prefix+str(i))
        p1.stdin.write('>'+einfo+"\n")
        p1.stdin.write(seq[i:i+args.fragment_size]+"\n")
  finally:
    # Release the genepred handle if we opened it.
    if args.input is not sys.stdin:
      args.input.close()
  p1.communicate()
  p2.communicate()
예제 #3
0
def main():
  """Collect average read depth over exonic, intronic and intergenic windows.

  Steps, as performed below:
    1. Read the genepred file and build per-chromosome spans, gene ranges
       and exon ranges (all with direction cleared).
    2. Derive intergenic windows (chromosome span minus genes padded by
       --intergenic_buffer) and intronic windows (genes minus merged
       exons), each broken up by --window_size.
    3. Stream depth records from ``sam_to_bed_depth.py <bam_input>`` and
       sweep them across the three region lists, storing per-base depths in
       each region's payload and recording an average for every region once
       the stream has moved past it.

  Exon regions are only tracked when --get_exons is given. Regions with no
  depth recorded are skipped unless --use_off_regions is set.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('gpd_input')
  parser.add_argument('bam_input')
  parser.add_argument('--intergenic_buffer',default=10000,type=int)
  parser.add_argument('--window_size',default=10000,type=int)
  parser.add_argument('--bin_size',default=1000,type=int)
  parser.add_argument('--use_off_regions',action='store_true',help="Use a region even if there is no reads mapped to it.")
  parser.add_argument('--get_exons',action='store_true')
  args = parser.parse_args()
  chr_beds = {}
  gene_beds = []
  exon_beds = []
  sys.stderr.write("Reading genepred file\n")
  asum = 0
  atot = 0
  with open(args.gpd_input) as inf:
    for line in inf:
      g = GenePredEntry(line)
      asum += g.length()
      atot += 1
      grng = g.get_bed()
      grng.direction = None
      if grng.chr not in chr_beds:
        chr_beds[grng.chr] = grng.copy()
      chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
      gene_beds.append(grng)
      for i in range(0,g.get_exon_count()):
        erng = Bed(g.value('chrom'),g.value('exonStarts')[i],g.value('exonEnds')[i])
        exon_beds.append(erng)
  if atot == 0:
    # An empty genepred used to die with ZeroDivisionError on the next line.
    sys.stderr.write("ERROR: no genepred entries found in "+args.gpd_input+"\n")
    sys.exit(1)
  avglen = float(asum)/float(atot)
  sys.stderr.write("Sorting gene bed\n")
  gene_beds = sort_ranges(gene_beds)
  gene_beds = merge_ranges(gene_beds,already_sorted=True)
  sys.stderr.write("Sorting chromosome beds\n")
  chr_beds = sort_ranges(list(chr_beds.values()))
  sys.stderr.write("Sorting exon beds\n")
  exon_beds = sort_ranges(exon_beds)
  sys.stderr.write("Get padded genes\n")
  padded_gene_beds = pad_ranges(gene_beds,args.intergenic_buffer,chr_beds)
  padded_gene_beds = merge_ranges(padded_gene_beds,already_sorted=True)
  sys.stderr.write("Get intergenic regions\n")
  intergenic_beds = subtract_ranges(chr_beds,padded_gene_beds,already_sorted=True)
  intergenic_beds = merge_ranges(intergenic_beds,already_sorted=True)
  intergenic_beds = window_break(intergenic_beds,args.window_size)
  sys.stderr.write("Get merged exons\n")
  exon_beds = merge_ranges(exon_beds)
  sys.stderr.write("Get introns\n")
  intron_beds = subtract_ranges(gene_beds,exon_beds,already_sorted=True)
  intron_beds = merge_ranges(intron_beds,already_sorted=True)
  intron_beds = window_break(intron_beds,args.window_size)
  sys.stderr.write("Going through short reads\n")
  # argv list (not a split string) so a BAM path with spaces stays one argument.
  p = Popen(["sam_to_bed_depth.py",args.bam_input],stdout=PIPE)
  for x in intron_beds: x.set_payload([]) # payloads are read depths
  for x in intergenic_beds: x.set_payload([]) # payloads are read depths
  for x in exon_beds: x.set_payload([]) # payloads are read depths
  introndepth = []
  intergenicdepth = []
  exondepth = []
  pseudoreadcount = 0
  if not args.get_exons: exon_beds = []
  section_count = 0
  while True:
    section_count += 1
    line = p.stdout.readline()
    if not line: break
    f = line.split("\t")
    depth = int(f[3])
    curr = Bed(f[0],int(f[1]),int(f[2]))
    # Progress indicator, rewritten in place every 100 records.
    if section_count %100==0: sys.stderr.write(curr.get_range_string()+"          \r")
    pseudoreadcount += depth
    # In each sweep the emptiness test comes first: the old order indexed
    # element 0 of a list that had just been drained and raised IndexError.
    # NOTE(review): average() receives the Bed here but a payload list in
    # the --use_off_regions tail below; confirm which form it expects.
    while exon_beds and curr.cmp(exon_beds[0]) > 0: # we've passed the region
      v = exon_beds.pop(0)
      if len(v.get_payload()) == 0 and not args.use_off_regions: continue
      exondepth.append(average(v))
    if exon_beds and curr.cmp(exon_beds[0]) == 0: # overlaps with exon
      size = curr.overlap_size(exon_beds[0])
      for i in range(0,size): exon_beds[0].get_payload().append(depth)
    while intron_beds and curr.cmp(intron_beds[0]) > 0: # we've passed the region
      v = intron_beds.pop(0)
      if len(v.get_payload()) == 0 and not args.use_off_regions: continue
      introndepth.append(average(v))
    if intron_beds and curr.cmp(intron_beds[0]) == 0: # overlaps with intron
      size = curr.overlap_size(intron_beds[0])
      for i in range(0,size): intron_beds[0].get_payload().append(depth)
    while intergenic_beds and curr.cmp(intergenic_beds[0]) > 0: # we've passed the region
      v = intergenic_beds.pop(0)
      if len(v.get_payload()) == 0 and not args.use_off_regions: continue
      intergenicdepth.append(average(v))
      display(curr,introndepth,intergenicdepth,pseudoreadcount,avglen)
    if intergenic_beds and curr.cmp(intergenic_beds[0]) == 0: # overlaps with intergenic
      size = curr.overlap_size(intergenic_beds[0])
      for i in range(0,size): intergenic_beds[0].get_payload().append(depth)
  if args.use_off_regions:
    # Regions the depth stream never moved past still get counted.
    # Fixed: leftover exon regions now go to exondepth (were added to introndepth).
    for x in exon_beds: exondepth.append(average(x.get_payload()))
    for x in intron_beds: introndepth.append(average(x.get_payload()))
    for x in intergenic_beds: intergenicdepth.append(average(x.get_payload()))
  p.communicate()
def main():
    """Collect average read depth over exonic, intronic and intergenic windows.

    The genepred file is read to build chromosome spans, gene ranges and
    exon ranges. Intergenic windows are the chromosome spans minus genes
    padded by --intergenic_buffer; intronic windows are the genes minus
    the merged exons; both are broken up by --window_size. Depth records
    streamed from ``sam_to_bed_depth.py <bam_input>`` are then swept across
    the exon (only with --get_exons), intron and intergenic lists.

    Regions with no recorded depth are skipped unless --use_off_regions is
    given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('gpd_input')
    parser.add_argument('bam_input')
    parser.add_argument('--intergenic_buffer', default=10000, type=int)
    parser.add_argument('--window_size', default=10000, type=int)
    parser.add_argument('--bin_size', default=1000, type=int)
    parser.add_argument(
        '--use_off_regions',
        action='store_true',
        help="Use a region even if there is no reads mapped to it.")
    parser.add_argument('--get_exons', action='store_true')
    args = parser.parse_args()

    def sweep(curr, depth, beds, depths, after_record=None):
        # Retire every leading region in `beds` that `curr` has passed,
        # then add `depth` to the region `curr` overlaps, if any.
        # The emptiness check comes first in both tests; the previous
        # ordering indexed beds[0] on a drained list (IndexError).
        while beds and curr.cmp(beds[0]) > 0:
            finished = beds.pop(0)
            if not finished.get_payload() and not args.use_off_regions:
                continue
            # NOTE(review): average() gets the Bed here but a payload list
            # in the --use_off_regions tail of main; confirm which it expects.
            depths.append(average(finished))
            if after_record is not None:
                after_record()
        if beds and curr.cmp(beds[0]) == 0:
            overlap = curr.overlap_size(beds[0])
            beds[0].get_payload().extend([depth] * overlap)

    chr_beds = {}
    gene_beds = []
    exon_beds = []
    sys.stderr.write("Reading genepred file\n")
    asum = 0
    atot = 0
    with open(args.gpd_input) as inf:
        for line in inf:
            g = GenePredEntry(line)
            asum += g.length()
            atot += 1
            grng = g.get_bed()
            grng.direction = None
            if grng.chr not in chr_beds:
                chr_beds[grng.chr] = grng.copy()
            chr_beds[grng.chr] = chr_beds[grng.chr].merge(grng)
            gene_beds.append(grng)
            starts = g.value('exonStarts')
            ends = g.value('exonEnds')
            for i in range(0, g.get_exon_count()):
                exon_beds.append(Bed(g.value('chrom'), starts[i], ends[i]))
    if atot == 0:
        # Previously an empty genepred crashed with ZeroDivisionError below.
        sys.stderr.write("ERROR: no genepred entries found in " +
                         args.gpd_input + "\n")
        sys.exit(1)
    avglen = float(asum) / float(atot)
    sys.stderr.write("Sorting gene bed\n")
    gene_beds = sort_ranges(gene_beds)
    gene_beds = merge_ranges(gene_beds, already_sorted=True)
    sys.stderr.write("Sorting chromosome beds\n")
    chr_beds = sort_ranges(list(chr_beds.values()))
    sys.stderr.write("Sorting exon beds\n")
    exon_beds = sort_ranges(exon_beds)
    sys.stderr.write("Get padded genes\n")
    padded_gene_beds = pad_ranges(gene_beds, args.intergenic_buffer, chr_beds)
    padded_gene_beds = merge_ranges(padded_gene_beds, already_sorted=True)
    sys.stderr.write("Get intergenic regions\n")
    intergenic_beds = subtract_ranges(chr_beds,
                                      padded_gene_beds,
                                      already_sorted=True)
    intergenic_beds = merge_ranges(intergenic_beds, already_sorted=True)
    intergenic_beds = window_break(intergenic_beds, args.window_size)
    sys.stderr.write("Get merged exons\n")
    exon_beds = merge_ranges(exon_beds)
    sys.stderr.write("Get introns\n")
    intron_beds = subtract_ranges(gene_beds, exon_beds, already_sorted=True)
    intron_beds = merge_ranges(intron_beds, already_sorted=True)
    intron_beds = window_break(intron_beds, args.window_size)
    sys.stderr.write("Going through short reads\n")
    # Pass argv as a list so a BAM path containing spaces is not split.
    p = Popen(["sam_to_bed_depth.py", args.bam_input], stdout=PIPE)
    for region_list in (intron_beds, intergenic_beds, exon_beds):
        for x in region_list:
            x.set_payload([])  # payloads are read depths
    introndepth = []
    intergenicdepth = []
    exondepth = []
    pseudoreadcount = 0
    if not args.get_exons:
        exon_beds = []
    section_count = 0
    while True:
        section_count += 1
        line = p.stdout.readline()
        if not line:
            break
        f = line.split("\t")
        depth = int(f[3])
        curr = Bed(f[0], int(f[1]), int(f[2]))
        # Progress indicator, rewritten in place every 100 records.
        if section_count % 100 == 0:
            sys.stderr.write(curr.get_range_string() + "          \r")
        pseudoreadcount += depth
        sweep(curr, depth, exon_beds, exondepth)
        sweep(curr, depth, intron_beds, introndepth)
        # display() is invoked after each intergenic region is recorded.
        sweep(curr, depth, intergenic_beds, intergenicdepth,
              after_record=lambda: display(curr, introndepth,
                                           intergenicdepth, pseudoreadcount,
                                           avglen))
    if args.use_off_regions:
        # Count the regions the depth stream never moved past.
        # Fixed: leftover exons belong in exondepth, not introndepth.
        for x in exon_beds:
            exondepth.append(average(x.get_payload()))
        for x in intron_beds:
            introndepth.append(average(x.get_payload()))
        for x in intergenic_beds:
            intergenicdepth.append(average(x.get_payload()))
    p.communicate()
예제 #5
0
def main():
    """Report alignability for every genepred entry.

    Two child processes are chained: FASTA fragments written here go to
    ``hisat``, and hisat's stdout is connected to the stdin of
    ``genepred_counts_to_mappability.py`` (located next to this script).

    Every genepred entry whose ``length()`` is at least ``--fragment_size``
    contributes one FASTA record per ``fragment_size``-long window of its
    sequence. Each record name is ``encode_name`` applied to the tab-joined
    entry name, gene name, input line number, sequence length and window
    offset.
    """
    parser = argparse.ArgumentParser(
        description="For every genepred entry report its alignability",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input', help="Genepred can be gzipped or - for STDIN")
    parser.add_argument('-r',
                        '--reference',
                        required=True,
                        help="Reference fasta")
    parser.add_argument('-k',
                        '--fragment_size',
                        default=100,
                        type=int,
                        help="Fragment size to try to align")
    parser.add_argument('-x',
                        '--hisat_index',
                        required=True,
                        help="HISAT index base name")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="number of threads")
    parser.add_argument('--type',
                        choices=['mean', 'median'],
                        default='mean',
                        help="How to bring together overlapping reads")
    parser.add_argument('--perbase', action='store_true')
    parser.add_argument('--output',
                        '-o',
                        help="output file or leave unset for STDOUT")
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    elif args.input.endswith('.gz'):
        args.input = gzip.open(args.input)
    else:
        args.input = open(args.input)

    # Commands are assembled as argv lists rather than split strings, so
    # that arguments containing whitespace (paths, index names) survive.
    udir = os.path.dirname(os.path.realpath(__file__))
    cmd2 = [
        udir + '/genepred_counts_to_mappability.py', '-',
        '--threads', str(args.threads),
        '-k', str(args.fragment_size),
    ]
    if args.perbase:
        cmd2.append('--perbase')
    if args.output:
        cmd2.extend(['--output', args.output])
    if args.type:
        cmd2.extend(['--type', args.type])
    p2 = Popen(cmd2, stdin=PIPE)
    ref = read_fasta_into_hash(args.reference)
    cmd1 = [
        'hisat', '-x', args.hisat_index, '-U', '-', '-f', '--reorder',
        '-p', str(args.threads),
    ]
    # NOTE(review): `null` is not defined inside this function; presumably
    # a module-level writable handle that discards hisat's stderr - confirm.
    p1 = Popen(cmd1, stdin=PIPE, stdout=p2.stdin, stderr=null)
    try:
        for line_number, line in enumerate(args.input, 1):
            gpd = GPD(line.rstrip())
            if gpd.length() < args.fragment_size:
                continue
            seq = gpd.get_sequence(ref)
            window_count = len(seq) - args.fragment_size + 1
            if window_count <= 0:
                continue
            # Only the window offset varies within one entry.
            prefix = "\t".join([
                gpd.value('name'),
                gpd.value('gene_name'),
                str(line_number),
                str(len(seq)),
            ]) + "\t"
            for i in range(0, window_count):
                einfo = encode_name(prefix + str(i))
                p1.stdin.write('>' + einfo + "\n")
                p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
    finally:
        # Close the genepred handle unless it is the process's own stdin.
        if args.input is not sys.stdin:
            args.input.close()
    p1.communicate()
    p2.communicate()