def main():
    """Write FASTA and/or BED records for the region matched by each locus.

    Matches and per-locus probe counts come from ``get_matches``.  A locus
    with a single match is used as-is unless it is listed in the dupefile.
    A locus with several matches must pass ``run_checks`` and its matches
    must span no more than ``probes[k] * 140`` positions; the region then
    runs from the start of the first match (sorted by start) to the end of
    the last one.  Accepted regions are widened by ``args.flank`` on both
    sides before being written to ``args.fasta`` and/or ``args.bed``.
    """
    args = get_args()
    # Fall back to an empty set so the membership test below stays valid when
    # no dupefile is supplied (``k in None`` raises TypeError).
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    matches, probes = get_matches(args.lastz, args.splitchar, args.components, args.fish)
    # Upper bound, per probe, on the distance the matches of one locus may
    # span - this still gives a little wiggle room because probes are ~2X tiled.
    max_span_per_probe = 140
    genome = None
    tb = None
    if args.fasta:
        # 2bit is a binary format, so open it in binary mode.  The handle is
        # kept open for the whole loop (the reader may fetch sequence lazily)
        # and released in the ``finally`` clause.
        genome = open(args.genome, 'rb')
        tb = bx.seq.twobit.TwoBitFile(genome)
    try:
        count = 0
        for k, v in matches.items():
            # (chromo, strand, start, end) once the locus has been accepted
            region = None
            if len(v) > 1:
                if run_checks(k, v, probes):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    if end - start > probes[k] * max_span_per_probe:
                        if args.verbose:
                            print("range longer than expected")
                    else:
                        region = (v[0][0], v[0][1], start, end)
            elif k in dupes:
                print("{0} is in dupefile".format(k))
            else:
                region = v[0]
            if region is not None:
                chromo, strand, start, end = region
                if args.fasta:
                    # Slice out region + flank.  A failure here now surfaces
                    # as a normal traceback instead of dropping into pdb and
                    # then continuing with an undefined or stale slice.
                    # NOTE(review): start - flank can go below zero for
                    # matches near the start of a sequence - confirm how the
                    # 2bit reader treats that.
                    slc = tb[chromo][start - args.flank:end + args.flank]
                    # strip Ns from both ends
                    slc = slc.strip('N')
                    # reverse any strands where necessary
                    if strand != '+':
                        slc = transform.DNA_reverse_complement(slc)
                    if len(slc) != 0:
                        args.fasta.write(">Node_{0}_length_{1}_cov_100\n{2}\n".format(
                            count, len(slc), '\n'.join(textwrap.wrap(slc))))
                if args.bed:
                    args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                        chromo, start - args.flank, end + args.flank, k, strand))
            # the counter advances for every locus, accepted or not, so FASTA
            # node numbers identify the locus position in the iteration
            count += 1
    finally:
        if genome is not None:
            genome.close()
def main():
    """Write FASTA and/or BED records for the region matched by each locus.

    Matches and per-locus probe counts come from ``get_matches``.  A locus
    with a single match is used as-is unless it is listed in the dupefile.
    A locus with several matches must pass ``run_checks`` and its matches
    must span no more than ``probes[k] * 140`` positions; the region then
    runs from the start of the first match (sorted by start) to the end of
    the last one.  Accepted regions are widened by ``args.flank`` on both
    sides before being written to ``args.fasta`` and/or ``args.bed``.
    """
    args = get_args()
    # Fall back to an empty set so the membership test below stays valid when
    # no dupefile is supplied (``k in None`` raises TypeError).
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    matches, probes = get_matches(args.lastz, args.splitchar, args.components, args.fish)
    # Upper bound, per probe, on the distance the matches of one locus may
    # span - this still gives a little wiggle room because probes are ~2X tiled.
    max_span_per_probe = 140
    genome = None
    tb = None
    if args.fasta:
        # 2bit is a binary format, so open it in binary mode.  The handle is
        # kept open for the whole loop (the reader may fetch sequence lazily)
        # and released in the ``finally`` clause.
        genome = open(args.genome, 'rb')
        tb = bx.seq.twobit.TwoBitFile(genome)
    try:
        count = 0
        for k, v in matches.items():
            # (chromo, strand, start, end) once the locus has been accepted
            region = None
            if len(v) > 1:
                if run_checks(k, v, probes):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    if end - start > probes[k] * max_span_per_probe:
                        if args.verbose:
                            print("range longer than expected")
                    else:
                        region = (v[0][0], v[0][1], start, end)
            elif k in dupes:
                print("{0} is in dupefile".format(k))
            else:
                region = v[0]
            if region is not None:
                chromo, strand, start, end = region
                if args.fasta:
                    # Slice out region + flank.  A failure here now surfaces
                    # as a normal traceback instead of dropping into pdb and
                    # then continuing with an undefined or stale slice.
                    # NOTE(review): start - flank can go below zero for
                    # matches near the start of a sequence - confirm how the
                    # 2bit reader treats that.
                    slc = tb[chromo][start - args.flank:end + args.flank]
                    # strip Ns from both ends
                    slc = slc.strip('N')
                    # reverse any strands where necessary
                    if strand != '+':
                        slc = transform.DNA_reverse_complement(slc)
                    if len(slc) != 0:
                        args.fasta.write(">Node_{0}_length_{1}_cov_100\n{2}\n".format(
                            count, len(slc), '\n'.join(textwrap.wrap(slc))))
                if args.bed:
                    args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                        chromo, start - args.flank, end + args.flank, k, strand))
            # the counter advances for every locus, accepted or not, so FASTA
            # node numbers identify the locus position in the iteration
            count += 1
    finally:
        if genome is not None:
            genome.close()
def main():
    """Store, per lastz file, the loci whose matches pass the filters.

    Every ``*.lastz`` file under ``args.lastz`` is parsed with
    ``get_matches``.  A locus with several matches is kept only when
    ``run_checks`` accepts it and the matches span at most
    ``probes[k] * 120`` positions; a locus with a single match is kept
    unless a dupefile was given and lists it.  Kept loci are handed to
    ``store_lastz_results_in_db`` and a per-file tally is printed.
    """
    args = get_args()

    def _critter_name(path):
        # text after the last '-' in the file name, minus its extension
        # (this prob. needs to be more robust)
        return os.path.splitext(os.path.basename(path).split('-')[-1])[0]

    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    critters = [_critter_name(path) for path in files]
    organisms = [name.replace('-', "_") for name in critters]
    conn, cursor = create_match_database(args.db, organisms, uces)
    dupes = get_dupes(args.dupefile) if args.dupefile else None
    for lastz_file, critter in zip(files, critters):
        matches, probes = get_matches(lastz_file, args.splitchar, args.components)
        count = 0
        for locus, hits in matches.iteritems():
            if len(hits) > 1:
                if not run_checks(locus, hits, probes, args.verbose):
                    continue
                # order the hits by match position
                by_start = sorted(hits, key=itemgetter(2))
                span = by_start[-1][3] - by_start[0][2]
                # the span must stay below N(probes) * probe_length, which
                # leaves a little wiggle room because probes are ~2X tiled
                if span > (probes[locus] * 120):
                    if args.verbose:
                        print("range longer than expected")
                    continue
            elif args.dupefile and locus in dupes:
                if args.verbose:
                    print("{0} is in dupefile".format(locus))
                continue
            store_lastz_results_in_db(cursor, critter, locus)
            count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    cursor.close()
    conn.close()
def main():
    """Store, per lastz file, the loci whose matches pass the filters.

    Every ``*.lastz`` file under ``args.lastz`` is parsed with
    ``get_matches``.  A locus with several matches is kept only when
    ``run_checks`` accepts it and the matches span at most
    ``probes[k] * 120`` positions; a locus with a single match is kept
    unless a dupefile was given and lists it.  Kept loci are handed to
    ``store_lastz_results_in_db`` and a per-file tally is printed.
    """
    args = get_args()

    def _critter_name(path):
        # text after the last '-' in the file name, minus its extension
        # (this prob. needs to be more robust)
        return os.path.splitext(os.path.basename(path).split('-')[-1])[0]

    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    critters = [_critter_name(path) for path in files]
    organisms = [name.replace('-', "_") for name in critters]
    conn, cursor = create_match_database(args.db, organisms, uces)
    dupes = get_dupes(args.dupefile) if args.dupefile else None
    for lastz_file, critter in zip(files, critters):
        matches, probes = get_matches(lastz_file, args.splitchar, args.components)
        count = 0
        for locus, hits in matches.iteritems():
            if len(hits) > 1:
                if not run_checks(locus, hits, probes, args.verbose):
                    continue
                # order the hits by match position
                by_start = sorted(hits, key=itemgetter(2))
                span = by_start[-1][3] - by_start[0][2]
                # the span must stay below N(probes) * probe_length, which
                # leaves a little wiggle room because probes are ~2X tiled
                if span > (probes[locus] * 120):
                    if args.verbose:
                        print("range longer than expected")
                    continue
            elif args.dupefile and locus in dupes:
                if args.verbose:
                    print("{0} is in dupefile".format(locus))
                continue
            store_lastz_results_in_db(cursor, critter, locus)
            count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    cursor.close()
    conn.close()
def main():
    """Write one CSV row per usable SNP/UCE intersection.

    Rows are read from ``args.dbsnp`` as seven comma-separated fields
    (rows starting with ``UCE`` are skipped).  For each SNP a row
    ``rsid,pos,maf,1000g`` is written to ``args.output``, where ``pos`` is
    the SNP start relative to the midpoint of the UCE interval.  A SNP is
    skipped when its end minus start exceeds 1, when it has already been
    written, or when its UCE is listed in the dupefile.
    """
    args = get_args()
    # Fall back to an empty set so the membership test below stays valid when
    # no dupefile is supplied (``uce in None`` raises TypeError).
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    # get dbSNP data
    all_snps = get_xml_data(args.xml)
    used = set()
    # iterate over intersections
    args.output.write('rsid,pos,maf,1000g\n')
    for row in args.dbsnp:
        if row.startswith('UCE'):
            continue
        uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
        start, end, snps, snpe = [int(x) for x in (start, end, snps, snpe)]
        if snpe - snps > 1 or snp in used or uce in dupes:
            continue
        # get relative position; floor division matches what integer ``/``
        # did under Python 2 for these int coordinates
        middle = (start + end) // 2
        rel_snp_pos = snps - middle
        # lookup data for this snp; strip('rs') removes leading/trailing
        # 'r'/'s' characters (e.g. 'rs123' -> '123')
        record = all_snps[snp.strip('rs')]
        thousandg = bool(record.val_1000G) and record.val_1000G.lower() == 'true'
        # a missing/empty frequency is reported as 0.0
        freq = float(record.freq_freq) if record.freq_freq else 0.0
        args.output.write("{0},{1},{2},{3}\n".format(snp, rel_snp_pos, freq, thousandg))
        # make sure we skip any duplicates
        used.add(snp)
def main():
    """Summarise SNP positions and frequencies relative to UCE midpoints.

    Rows of ``args.dbsnp`` (seven comma-separated fields; rows starting with
    ``UCE`` are skipped) are placed into one array per UCE, indexed by the
    SNP start relative to the UCE midpoint and centred on the longest UCE
    interval in the file.  Only the first occurrence of each SNP is
    considered; it is stored when its end minus start is at most 1, its UCE
    is not in the dupefile, its ``val_1000G`` equals ``'true'`` and it has a
    frequency.  ``args.output`` receives ``pos,avg,ci,datatype`` rows for a
    25-wide running average of per-position SNP counts (``running``), the
    per-position mean frequency with a 1.96 * standard-error interval
    (``mean_hetero``) and a second running series (``running_hetero``).
    When ``args.output2`` is set, the retained intersections are listed there.
    """
    args = get_args()
    # Fall back to an empty set so the membership tests below stay valid when
    # no dupefile is supplied (``uce not in None`` raises TypeError).
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    used = set()
    # longest (end - start) interval in the file; sizes every per-UCE array
    lengths = []
    with open(args.dbsnp, 'rU') as handle:
        for row in handle:
            if not row.startswith('UCE'):
                fields = row.strip('\n').split(',')
                lengths.append(int(fields[3]) - int(fields[2]))
    mx = max(lengths)
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle; floor division matches what integer ``/`` did under
    # Python 2 for these int values
    overall_middle = mx // 2
    # per-UCE arrays of frequencies, indexed by position in the longest array
    d = {}
    if args.output2:
        args.output2.write(
            'UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n')
    # iterate over intersections
    with open(args.dbsnp, 'rU') as handle:
        for row in handle:
            if row.startswith('UCE'):
                continue
            uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
            start, end, snps, snpe = [int(x) for x in (start, end, snps, snpe)]
            # get middle of this UCE
            middle = (start + end) // 2
            if snp in used:
                continue
            # only the first occurrence of a SNP is ever considered
            used.add(snp)
            if snpe - snps > 1 or uce in dupes:
                continue
            # strip('rs') removes leading/trailing 'r'/'s' characters
            record = all_snps[snp.strip('rs')]
            if record.val_1000G == 'true' and record.freq_freq is not None:
                if uce not in d:
                    d[uce] = numpy.zeros(mx + 1)
                rel_snp_pos = snps - middle
                d[uce][overall_middle + rel_snp_pos] = record.freq_freq
            if args.output2:
                args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                    uce, chromo, start, end, snp, snps, snpe,
                    record.val_1000G, record.freq_freq))
    # one row per UCE, one column per position
    stack = numpy.array(list(d.values()))
    # compute the running average of the number of UCEs with a SNP at each
    # position
    win = 25
    weightings = numpy.repeat(1.0, win) / win
    data = (stack > 0).sum(axis=0)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, running[base]))
    # also output the average heterozygosity of 1000 Genome validated, hetero SNPs.
    for base in range(len(stack[0])):
        pos = base - overall_middle
        column = stack[:, base]
        # zero entries mean "no SNP stored here" and are left out
        nonzero = column[column != 0]
        avg = numpy.mean(nonzero)
        ci = 1.96 * (numpy.std(nonzero, ddof=1) / numpy.sqrt(len(nonzero)))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    # NOTE(review): averaging over axis=1 yields one value per UCE row, yet
    # the loop below indexes the smoothed result by base position and runs
    # over every column rather than over len(running) - confirm the intended
    # axis and loop bound.  Behaviour is left unchanged here.
    data = numpy.mean(stack, axis=1)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    for base in range(len(stack[0])):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))
def main():
    """Print whatever ``get_dupes`` returns for the ``lastz`` argument."""
    options = get_args()
    duplicates = get_dupes(options.lastz)
    print(duplicates)
def main():
    """Print whatever ``get_dupes`` returns for the ``lastz`` argument."""
    options = get_args()
    duplicates = get_dupes(options.lastz)
    print(duplicates)
def main():
    """Summarise SNP positions and frequencies relative to UCE midpoints.

    Rows of ``args.dbsnp`` (seven comma-separated fields; rows starting with
    ``UCE`` are skipped) are placed into one array per UCE, indexed by the
    SNP start relative to the UCE midpoint and centred on the longest UCE
    interval in the file.  Only the first occurrence of each SNP is
    considered; it is stored when its end minus start is at most 1, its UCE
    is not in the dupefile, its ``val_1000G`` equals ``'true'`` and it has a
    frequency.  ``args.output`` receives ``pos,avg,ci,datatype`` rows for a
    25-wide running average of per-position SNP counts (``running``), the
    per-position mean frequency with a 1.96 * standard-error interval
    (``mean_hetero``) and a second running series (``running_hetero``).
    When ``args.output2`` is set, the retained intersections are listed there.
    """
    args = get_args()
    # Fall back to an empty set so the membership tests below stay valid when
    # no dupefile is supplied (``uce not in None`` raises TypeError).
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    used = set()
    # longest (end - start) interval in the file; sizes every per-UCE array
    lengths = []
    with open(args.dbsnp, 'rU') as handle:
        for row in handle:
            if not row.startswith('UCE'):
                fields = row.strip('\n').split(',')
                lengths.append(int(fields[3]) - int(fields[2]))
    mx = max(lengths)
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle; floor division matches what integer ``/`` did under
    # Python 2 for these int values
    overall_middle = mx // 2
    # per-UCE arrays of frequencies, indexed by position in the longest array
    d = {}
    if args.output2:
        args.output2.write(
            'UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n')
    # iterate over intersections
    with open(args.dbsnp, 'rU') as handle:
        for row in handle:
            if row.startswith('UCE'):
                continue
            uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
            start, end, snps, snpe = [int(x) for x in (start, end, snps, snpe)]
            # get middle of this UCE
            middle = (start + end) // 2
            if snp in used:
                continue
            # only the first occurrence of a SNP is ever considered
            used.add(snp)
            if snpe - snps > 1 or uce in dupes:
                continue
            # strip('rs') removes leading/trailing 'r'/'s' characters
            record = all_snps[snp.strip('rs')]
            if record.val_1000G == 'true' and record.freq_freq is not None:
                if uce not in d:
                    d[uce] = numpy.zeros(mx + 1)
                rel_snp_pos = snps - middle
                d[uce][overall_middle + rel_snp_pos] = record.freq_freq
            if args.output2:
                args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                    uce, chromo, start, end, snp, snps, snpe,
                    record.val_1000G, record.freq_freq))
    # one row per UCE, one column per position
    stack = numpy.array(list(d.values()))
    # compute the running average of the number of UCEs with a SNP at each
    # position
    win = 25
    weightings = numpy.repeat(1.0, win) / win
    data = (stack > 0).sum(axis=0)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base in range(len(running)):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, running[base]))
    # also output the average heterozygosity of 1000 Genome validated, hetero SNPs.
    for base in range(len(stack[0])):
        pos = base - overall_middle
        column = stack[:, base]
        # zero entries mean "no SNP stored here" and are left out
        nonzero = column[column != 0]
        avg = numpy.mean(nonzero)
        ci = 1.96 * (numpy.std(nonzero, ddof=1) / numpy.sqrt(len(nonzero)))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    # NOTE(review): averaging over axis=1 yields one value per UCE row, yet
    # the loop below indexes the smoothed result by base position and runs
    # over every column rather than over len(running) - confirm the intended
    # axis and loop bound.  Behaviour is left unchanged here.
    data = numpy.mean(stack, axis=1)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    for base in range(len(stack[0])):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))