def main():
    """Stream a FASTA file and report running length statistics.

    Reads entries from a file or stdin and writes an updating
    "Reads    Mean    Stddev" status line to stderr every 1000 entries,
    plus a final summary.  Relies on FastaHandleReader, mean and stddev
    being imported elsewhere in this file.
    """
    parser = argparse.ArgumentParser(
        description=
        "Find mapping distance of paired end reads.  Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -"
    )
    parser.add_argument('input_fasta', help="FASTAFILE or - for stdin")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_fasta != '-':
        inf = open(args.input_fasta)
    fh = FastaHandleReader(inf)
    data = []
    sys.stderr.write("Reads    Mean    Stddev\n")
    while True:
        entry = fh.read_entry()
        if not entry: break
        data.append(len(entry['seq']))
        # stddev is undefined for fewer than two samples
        if len(data) < 2: continue
        if len(data) % 1000 == 0:
            sys.stderr.write(
                str(len(data)) + "    " + str(int(mean(data))) + "    " +
                str(int(stddev(data))) + "              \r")
    # Bug fix: the final report previously called mean()/stddev()
    # unconditionally, crashing on empty or single-entry input.
    if len(data) >= 2:
        sys.stderr.write(
            str(len(data)) + "    " + str(int(mean(data))) + "    " +
            str(int(stddev(data))) + "              \r")
    else:
        sys.stderr.write(str(len(data)) + "    -    -              \r")
    sys.stderr.write("\n")
# Code example #2
# 0
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STIDN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
    n = p.value('qName')
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
      sys.exit()
    while last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    print p.get_line()
    p.pretty_print(50)
  fhr.close()
# Code example #3
# 0
def main():
    """Rename FASTA/FASTQ/GPD records to synthetic PacBio ccs-style names.

    Writes renamed records to --output (default STDOUT) and optionally an
    old-name<TAB>new-name mapping to --output_table.  FastaHandleReader /
    FastqHandleReader are imported elsewhere in this file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="Use - for STDIN")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--fasta', action='store_true')
    group.add_argument('--fastq', action='store_true')
    group.add_argument('--gpd', action='store_true')
    parser.add_argument('--output_table', help='save conversion to file')
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    if args.input == '-': args.input = sys.stdin
    else: args.input = open(args.input)

    # Bug fix: the original paired the 'else' with the output_table test,
    # so using --output_table without -o left args.output as None and the
    # later args.output.write(...) crashed.
    if args.output: args.output = open(args.output, 'w')
    else: args.output = sys.stdout
    if args.output_table: args.output_table = open(args.output_table, 'w')

    if args.gpd:
        z = 0
        for line in args.input:
            f = line.rstrip().split("\t")
            z += 1
            name = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                z) + '/ccs'
            if args.output_table:
                args.output_table.write(f[0] + "\t" + name + "\n")
            # GPD columns 1 and 2 both carry the record name
            f[0] = name
            f[1] = name
            args.output.write("\t".join(f) + "\n")
        if args.output is not sys.stdout:  # don't close the shared STDOUT
            args.output.close()
        if args.output_table:
            args.output_table.close()
        return

    if args.fasta:
        args.input = FastaHandleReader(args.input)
    elif args.fastq:
        args.input = FastqHandleReader(args.input)
    z = 0
    while True:
        e = args.input.read_entry()
        if not e: break
        z += 1
        name = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
            z) + '/ccs'
        if args.fastq:
            args.output.write('@' + name + "\n" + e['seq'] + "\n" + '+' +
                              e['qual'] + "\n")
        elif args.fasta:
            args.output.write('>' + name + "\n" + e['seq'] + "\n")
        if args.output_table:
            args.output_table.write(e['name'] + "\t" + name + "\n")
    if args.output is not sys.stdout:  # don't close the shared STDOUT
        args.output.close()
    if args.output_table: args.output_table.close()
# Code example #4
# 0
def main():
  """Convert a genome to per-window mappability scores.

  Builds a four-stage subprocess pipeline (hisat ->
  hisat_to_mapping_count.py -> counts_to_mappability.py -> bed_tools.py)
  and streams every fragment_length-sized window of each reference
  sequence into it as FASTA.  FastaHandleReader, Popen, PIPE and
  cpu_count are imported elsewhere in this file.
  """
  parser = argparse.ArgumentParser(description="Convert a genome to its mappability",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  # NOTE(review): '-' is advertised for STDIN but open('-') below will fail — confirm.
  parser.add_argument('reference_genome',help="Use - for STDIN")
  parser.add_argument('-k','--fragment_length',type=int,default=36,help="length of fragment to check mappability")
  parser.add_argument('-x','--genome_index',required=True)
  parser.add_argument('--threads',type=int,default=cpu_count(),help="Thread count")
  parser.add_argument('-o','--output',help="set for output file otherwise will be STDOUT")
  parser.add_argument('--type',choices=['mean','median','geometric_mean'],default='mean',help="How to combine window results")
  args = parser.parse_args()

  if args.output:
    args.output = open(args.output,'w')
  else:
    args.output = sys.stdout

  
  
  # Wire the pipeline back-to-front so each stage's stdin pipe exists
  # before it is handed to the previous stage as stdout.
  udir = os.path.dirname(os.path.realpath(__file__))
  cmd4 = 'bed_tools.py - --merge --break_merge_on_feature'
  p4 = Popen(cmd4.split(),stdin=PIPE,stdout=args.output)
  cmd3 = udir+'/counts_to_mappability.py - --fragment_length '+str(args.fragment_length)
  cmd3 += ' --'+args.type
  p3 = Popen(cmd3.split(),stdin=PIPE,stdout=p4.stdin)
  cmd2 = 'hisat_to_mapping_count.py -'
  p2 = Popen(cmd2.split(),stdin=PIPE,stdout=p3.stdin)
  cmd1 = 'hisat -x '+args.genome_index+' -U - -f --reorder -p '+str(args.threads)
  p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin)
  inf = open(args.reference_genome)
  fhr = FastaHandleReader(inf)
  while True:
    e = fhr.read_entry()
    if not e: break
    # Emit every fragment_length-sized sliding window as a FASTA record
    # named "<seq>:<start>-<end>" (1-based inclusive coordinates).
    for i in range(0,len(e['seq'])-args.fragment_length):
      p1.stdin.write('>'+e['name']+':'+str(i+1)+'-'+str(i+args.fragment_length)+"\n")
      p1.stdin.write(e['seq'][i:i+args.fragment_length].upper()+"\n")
  # Drain the stages front-to-back; each communicate() closes that
  # stage's stdin in this process so the stage sees EOF and can finish.
  p1.communicate()
  p2.communicate()
  p3.communicate()
  p4.communicate()
  args.output.close()
# Code example #5
# 0
def main():
    parser = argparse.ArgumentParser(
        description="Correct the matches/mismatches and Ncount of a PSL file")
    parser.add_argument('input', help="PSLFILE or - for STIDN")
    parser.add_argument('reference', help="FASTAFILE reference genome")
    parser.add_argument('query', help="FASTAFILE query sequences")
    parser.add_argument('--minimum_intron_size',
                        type=int,
                        default=68,
                        help="INT")
    #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
    args = parser.parse_args()
    # Read in the reference genome
    sys.stderr.write("Reading in reference genome\n")
    g = read_fasta_into_hash(args.reference)
    sys.stderr.write("Finished reading " + str(len(g.keys())) +
                     " reference sequences\n")
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    fhr = FastaHandleReader(open(args.query))
    last_fasta = fhr.read_entry()
    if not last_fasta:
        sys.stderr.write("ERROR: No query sequences\n")
        sys.exit()
    for line in inf:
        p = PSLBasics.PSL(line)
        if not p.validate():
            sys.stderr.write(
                "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"
                + line.rstrip() + "\n")
        n = p.value('qName')
        if not last_fasta:
            sys.stderr.write(
                "ERROR: Ran out of query sequences too soon.  Are they sorted properly\n"
            )
            sys.exit()
        while last_fasta['name'] != n:
            last_fasta = fhr.read_entry()
        p.set_query(last_fasta['seq'])
        p.set_reference_dictionary(g)
        print p.get_line()
        p.pretty_print(50)
    fhr.close()
# Code example #6
# 0
def main():
    """Find candidate primer sequences in a FASTA input.

    Counts k-mers of --kmersize across the input, periodically pruning
    rare k-mers and k-mers with A/T-rich ends, stopping early once the
    second-most-common k-mer exceeds --end_criteria.  The top candidates
    are then collapsed via the reduceranks / combine_overlapping helpers
    (defined elsewhere in this file) and printed as "sequence<TAB>count".
    """
    parser = argparse.ArgumentParser(description="Find primers in a sequence")
    parser.add_argument('input', help="FASTA_FILE genome or - for STDIN")
    parser.add_argument('--AT_end_limit',
                        type=int,
                        default=4,
                        help='Maxmimum number of A/T to look for at the end')
    parser.add_argument(
        '--overlap_join',
        type=int,
        default=8,
        help='Join together matches with this much exact overlap')
    parser.add_argument('--end_criteria',
                        type=int,
                        default=5000,
                        help='Stop when you have seen a k-mer this many times')
    parser.add_argument('--total_candidates',
                        type=int,
                        default=100,
                        help='Look at this number of candidates')
    parser.add_argument('--kmersize',
                        type=int,
                        default=18,
                        help='Look at this number of candidates')
    args = parser.parse_args()

    if args.input == '-':
        args.input = sys.stdin
    else:
        args.input = open(args.input)

    #tx = read_fasta_into_hash(args.transcriptome_fasta)
    totals = {}
    total_length = 0
    lenlow = args.kmersize
    lenhigh = args.kmersize
    counts = {}  # k-mer -> occurrence count, populated by explode()
    z = 0
    reader = FastaHandleReader(args.input)
    while True:
        e = reader.read_entry()
        if not e: break
        z += 1
        sys.stderr.write(str(z) + "\r")
        seq = e['seq']
        # explode() presumably adds this sequence's k-mer counts into
        # counts — helper defined elsewhere in this file.
        explode(counts, seq, lenlow, lenhigh)
        longest = 0
        # Every 20 entries: prune and test the early-stop criterion.
        # NOTE(review): deleting from counts while iterating counts.keys()
        # relies on Python 2 keys() returning a list — confirm.
        if z % 20 == 0:
            for part in counts.keys():
                if counts[part] <= 3: del counts[part]  # drop rare k-mers
                else:
                    # edgeAT() presumably measures A/T run length at the
                    # k-mer ends — helper defined elsewhere in this file.
                    edgemax = edgeAT(part)
                    if edgemax > args.AT_end_limit: del counts[part]
            biggest = sorted(counts, key=counts.get, reverse=True)
            if len(biggest) > 1:
                # stop once the 2nd most common k-mer is frequent enough
                if counts[biggest[1]] > args.end_criteria:
                    break
    sys.stderr.write("\n")
    numuse = args.total_candidates
    rankedsets = {}  # rank index -> [kmer, count]
    z = 0
    #for seq in counts.keys():
    #  edgemax = edgeAT(seq)
    #  if edgemax > args.AT_end_limit: del counts[seq]
    # Keep only the numuse most frequent k-mers, ordered by count.
    for myset in [[x, counts[x]]
                  for x in sorted(counts, key=counts.get, reverse=True)
                  ][0:numuse]:
        rankedsets[z] = myset
        z += 1
    # Repeatedly prune highly similar candidates until the set stabilizes.
    lastsets = -1
    numsets = len(rankedsets.keys())
    while numsets != lastsets:
        sys.stderr.write(str(numsets) + "          \r")
        lastsets = numsets
        reduceranks(rankedsets)  # remove highly similar sets
        numsets = len(rankedsets.keys())
    sys.stderr.write("\n")
    # now we have our best candidates
    #for i in sorted(rankedsets.keys()):
    #  print rankedsets[i][0]+"\t"+str(rankedsets[i][1])

    # Repeatedly merge overlapping candidates until the set stabilizes.
    lastsets = 0
    numsets = len(rankedsets.keys())
    while numsets != lastsets:
        lastsets = numsets
        combine_overlapping(rankedsets, args)
        numsets = len(rankedsets.keys())
    for i in sorted(rankedsets.keys()):
        print rankedsets[i][0] + "\t" + str(rankedsets[i][1])
# Code example #7
# 0
def main():
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STIDN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
    n = p.value('qName')
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
      sys.exit()
    while last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    print p.get_line()
    continue
    f = last_fasta
    nCount = 0
    matches = 0
    misMatches = 0
    prev_qE = 0
    prev_tE = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    for i in range(p.value('blockCount')):
      blen = p.value('blockSizes')[i]
      qS = p.value('qStarts')[i] #query start
      qE = qS + blen             #query end
      tS = p.value('tStarts')[i] #target start
      tE = tS + blen             #target end
      #Work on gaps
      if prev_qE > 0 or prev_tE > 0: #if its not our first time through
        tgap = tS-prev_tE
        if tgap < args.minimum_intron_size and tgap > 0:
          tNumInsert += 1
          tBaseInsert += tgap
        qgap = qS-prev_qE
        if qgap > 0:
          qNumInsert += 1
          qBaseInsert += qgap
      query = f['seq']
      if p.value('strand') == '-':
        query = rc(f['seq'])
      qseq = query[qS:qE].upper()
      rseq = g[p.value('tName')][tS:tE].upper()
      #print qseq+"\n"+rseq+"\n"
      for j in range(0,blen):
        if qseq[j] == 'N':
          nCount += 1
        elif qseq[j] == rseq[j]:
          matches += 1
        else:
          misMatches += 1
      prev_qE = qE
      prev_tE = tE
    p.entry['matches'] = matches
    p.entry['misMatches'] = misMatches
    p.entry['nCount'] = nCount
    p.entry['qNumInsert'] = qNumInsert
    p.entry['qBaseInsert'] = qBaseInsert
    p.entry['tNumInsert'] = tNumInsert
    p.entry['tBaseInsert'] = tBaseInsert
    p.entry['qSize'] = len(query)
    p.entry['tSize'] = len(g[p.value('tName')]) 
    print p.get_line()
    #p.pretty_print(100)
  fhr.close()
# Code example #8
# 0
def main():
  """Find candidate primer sequences in a FASTA input.

  Counts k-mers of --kmersize across the input, periodically pruning
  rare k-mers and k-mers with A/T-rich ends, stopping early once the
  second-most-common k-mer exceeds --end_criteria.  The top candidates
  are then collapsed via the reduceranks / combine_overlapping helpers
  (defined elsewhere in this file) and printed as "sequence<TAB>count".
  """
  parser = argparse.ArgumentParser(description="Find primers in a sequence")
  parser.add_argument('input',help="FASTA_FILE genome or - for STDIN")
  parser.add_argument('--AT_end_limit',type=int,default=4,help='Maxmimum number of A/T to look for at the end')
  parser.add_argument('--overlap_join',type=int,default=8,help='Join together matches with this much exact overlap')
  parser.add_argument('--end_criteria',type=int,default=5000,help='Stop when you have seen a k-mer this many times')
  parser.add_argument('--total_candidates',type=int,default=100,help='Look at this number of candidates')
  parser.add_argument('--kmersize',type=int,default=18,help='Look at this number of candidates')
  args = parser.parse_args()

  if args.input == '-':
    args.input = sys.stdin
  else:
    args.input = open(args.input)

  #tx = read_fasta_into_hash(args.transcriptome_fasta)
  totals = {}
  total_length = 0
  lenlow = args.kmersize
  lenhigh = args.kmersize
  counts = {}  # k-mer -> occurrence count, populated by explode()
  z = 0
  reader = FastaHandleReader(args.input)
  while True:
    e = reader.read_entry()
    if not e: break
    z+=1
    sys.stderr.write(str(z)+"\r")
    seq = e['seq']
    # explode() presumably adds this sequence's k-mer counts into
    # counts — helper defined elsewhere in this file.
    explode(counts,seq,lenlow,lenhigh)  
    longest = 0
    # Every 20 entries: prune and test the early-stop criterion.
    # NOTE(review): deleting from counts while iterating counts.keys()
    # relies on Python 2 keys() returning a list — confirm.
    if z %20 == 0: 
      for part in counts.keys():
        if counts[part] <= 3: del counts[part]  # drop rare k-mers
        else:
          # edgeAT() presumably measures A/T run length at the k-mer
          # ends — helper defined elsewhere in this file.
          edgemax = edgeAT(part)
          if edgemax > args.AT_end_limit: del counts[part]
      biggest =  sorted(counts, key=counts.get,reverse=True)
      if len(biggest) > 1:
        # stop once the 2nd most common k-mer is frequent enough
        if counts[biggest[1]] > args.end_criteria:
          break
  sys.stderr.write("\n")
  numuse = args.total_candidates
  rankedsets= {}  # rank index -> [kmer, count]
  z = 0
  #for seq in counts.keys():
  #  edgemax = edgeAT(seq)
  #  if edgemax > args.AT_end_limit: del counts[seq]
  # Keep only the numuse most frequent k-mers, ordered by count.
  for myset in [[x,counts[x]] for x in sorted(counts, key=counts.get,reverse=True)][0:numuse]:
    rankedsets[z] = myset
    z +=1
  # Repeatedly prune highly similar candidates until the set stabilizes.
  lastsets = -1
  numsets = len(rankedsets.keys())
  while numsets != lastsets:
    sys.stderr.write(str(numsets)+"          \r")
    lastsets = numsets
    reduceranks(rankedsets) # remove highly similar sets
    numsets = len(rankedsets.keys())
  sys.stderr.write("\n")
  # now we have our best candidates
  #for i in sorted(rankedsets.keys()):
  #  print rankedsets[i][0]+"\t"+str(rankedsets[i][1])

  # Repeatedly merge overlapping candidates until the set stabilizes.
  lastsets = 0
  numsets = len(rankedsets.keys())
  while numsets != lastsets:
    lastsets = numsets
    combine_overlapping(rankedsets,args)
    numsets = len(rankedsets.keys())
  for i in sorted(rankedsets.keys()):
    print rankedsets[i][0]+"\t"+str(rankedsets[i][1])