# NOTE(review): get_matches is defined a second time later in this file with
# the same logic; the later definition replaces this one at import time.
def get_matches(lastz_file, splitchar, components, fish=False):
    """Group lastz hits by locus and record the highest probe number seen.

    Args:
        lastz_file: path handed to ``lastz.Reader`` (read with
            ``long_format=True``).
        splitchar: forwarded to ``get_name`` as ``splitchar`` when naming the
            hit target (``name1``).
        components: forwarded to ``get_name`` as ``items`` when naming the
            hit target (``name1``).
        fish: when true, the locus name is taken with
            ``get_name(name2, "_", 1)`` and the probe number is read from the
            second ``|``-separated field of ``name2``; otherwise the locus
            name is taken with ``get_name(name2, "|", 1)`` and the probe
            number is the integer after the last ``:`` in ``name2``.

    Returns:
        A ``(matches, probes)`` tuple of defaultdicts keyed by locus name.
        ``matches[locus]`` is a list with one
        ``[target_name, strand2, zstart1, end1]`` row per hit and
        ``probes[locus]`` is the largest probe number seen for that locus.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    for lz in lastz.Reader(lastz_file, long_format=True):
        # skip silly hg19 mhc haplotypes
        if "hap" in lz.name1:
            # Two spaces on purpose: this reproduces the output of the old
            # statement form `print "Skipping: ", name` on Python 2 and 3.
            print("Skipping:  %s" % (lz.name1,))
            continue
        if fish:
            uce_name = get_name(lz.name2, "_", 1)
            # add 1 because fish probe indexing starts @ 0
            probe_number = int(lz.name2.split('|')[1].split('_')[1]) + 1
        else:
            uce_name = get_name(lz.name2, "|", 1)
            probe_number = int(lz.name2.split(':')[-1])
        # Keep a running maximum of the probe number per locus.
        probes[uce_name] = max(probes[uce_name], probe_number)
        matches[uce_name].append([
            get_name(lz.name1, splitchar=splitchar, items=components),
            lz.strand2,
            lz.zstart1,
            lz.end1,
        ])
    return matches, probes
def _organism_from_path(path):
    """Return the organism label encoded in a lastz output file path.

    The label is whatever follows the last ``-`` in the file's base name,
    with the file extension removed.  Because the text is taken after the
    last dash it can never contain a ``-`` itself.
    """
    return os.path.splitext(os.path.basename(path).split('-')[-1])[0]


# NOTE(review): main is defined a second time later in this file with the
# same logic; the later definition replaces this one at import time.
def main():
    """Load lastz matches for every ``*.lastz`` file into the match database.

    For each file in ``args.lastz`` the hits are grouped per locus with
    ``get_matches``.  A locus is stored via ``store_lastz_results_in_db``
    unless it is skipped:

    * a locus with more than one hit is skipped when ``run_checks`` returns a
      false value, or when the span from the smallest hit start to the end of
      the hit with the largest start exceeds ``probes[locus] * 120``;
    * a locus with a single hit is skipped when a dupefile was given and the
      locus is listed in it.

    NOTE(review): loci with more than one hit are never looked up in the
    dupefile -- confirm this is intended.

    The database is committed once after all files are processed; the cursor
    and connection are closed even if processing raises.
    """
    # Upper bound on the span covered by one probe, in target coordinates.
    probe_length = 120

    args = get_args()
    uces = {
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    }
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [_organism_from_path(f) for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = _organism_from_path(f)
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.items():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        start, end = v_sort[0][2], v_sort[-1][3]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if end - start > probes[k] * probe_length:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif args.dupefile and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # Release the cursor and connection even when a file fails to load;
        # nothing is committed in that case, matching the previous behavior.
        c.close()
        conn.close()
# NOTE(review): an earlier definition of get_matches with the same logic
# appears above in this file; this later one is the definition in effect.
def get_matches(lastz_file, splitchar, components, fish=False):
    """Group lastz hits by locus and record the highest probe number seen.

    Args:
        lastz_file: path handed to ``lastz.Reader`` (read with
            ``long_format=True``).
        splitchar: forwarded to ``get_name`` as ``splitchar`` when naming the
            hit target (``name1``).
        components: forwarded to ``get_name`` as ``items`` when naming the
            hit target (``name1``).
        fish: when true, the locus name is taken with
            ``get_name(name2, "_", 1)`` and the probe number is read from the
            second ``|``-separated field of ``name2``; otherwise the locus
            name is taken with ``get_name(name2, "|", 1)`` and the probe
            number is the integer after the last ``:`` in ``name2``.

    Returns:
        A ``(matches, probes)`` tuple of defaultdicts keyed by locus name.
        ``matches[locus]`` is a list with one
        ``[target_name, strand2, zstart1, end1]`` row per hit and
        ``probes[locus]`` is the largest probe number seen for that locus.
    """
    matches = defaultdict(list)
    probes = defaultdict(int)
    for lz in lastz.Reader(lastz_file, long_format=True):
        # skip silly hg19 mhc haplotypes
        if "hap" in lz.name1:
            # Two spaces on purpose: this reproduces the output of the old
            # statement form `print "Skipping: ", name` on Python 2 and 3.
            print("Skipping:  %s" % (lz.name1,))
            continue
        if fish:
            uce_name = get_name(lz.name2, "_", 1)
            # add 1 because fish probe indexing starts @ 0
            probe_number = int(lz.name2.split('|')[1].split('_')[1]) + 1
        else:
            uce_name = get_name(lz.name2, "|", 1)
            probe_number = int(lz.name2.split(':')[-1])
        # Keep a running maximum of the probe number per locus.
        probes[uce_name] = max(probes[uce_name], probe_number)
        matches[uce_name].append([
            get_name(lz.name1, splitchar=splitchar, items=components),
            lz.strand2,
            lz.zstart1,
            lz.end1,
        ])
    return matches, probes
def _organism_from_path(path):
    """Return the organism label encoded in a lastz output file path.

    The label is whatever follows the last ``-`` in the file's base name,
    with the file extension removed.  Because the text is taken after the
    last dash it can never contain a ``-`` itself.
    """
    return os.path.splitext(os.path.basename(path).split('-')[-1])[0]


# NOTE(review): an earlier definition of main with the same logic appears
# above in this file; this later one is the definition in effect.
def main():
    """Load lastz matches for every ``*.lastz`` file into the match database.

    For each file in ``args.lastz`` the hits are grouped per locus with
    ``get_matches``.  A locus is stored via ``store_lastz_results_in_db``
    unless it is skipped:

    * a locus with more than one hit is skipped when ``run_checks`` returns a
      false value, or when the span from the smallest hit start to the end of
      the hit with the largest start exceeds ``probes[locus] * 120``;
    * a locus with a single hit is skipped when a dupefile was given and the
      locus is listed in it.

    NOTE(review): loci with more than one hit are never looked up in the
    dupefile -- confirm this is intended.

    The database is committed once after all files are processed; the cursor
    and connection are closed even if processing raises.
    """
    # Upper bound on the span covered by one probe, in target coordinates.
    probe_length = 120

    args = get_args()
    uces = {
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    }
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [_organism_from_path(f) for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = _organism_from_path(f)
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.items():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        start, end = v_sort[0][2], v_sort[-1][3]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if end - start > probes[k] * probe_length:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif args.dupefile and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # Release the cursor and connection even when a file fails to load;
        # nothing is committed in that case, matching the previous behavior.
        c.close()
        conn.close()