import pdb
import time

import lastz  # project-local lastz wrapper


def main():
    start_time = time.time()
    print('Started: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time)))
    # interface() is assumed to parse the command line (see the sketch below)
    options, arg = interface()
    # align the query sequences against the target at the requested
    # coverage and identity thresholds
    alignment = lastz.Align(options.target, options.query, options.coverage,
                            options.identity, options.output)
    lzstdout, lztstderr = alignment.run()
    if lztstderr:
        # drop into the debugger if lastz wrote anything to stderr
        pdb.set_trace()
    end_time = time.time()
    print('Ended: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time)))
    print('Time for execution: ', (end_time - start_time) / 60, 'minutes')
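# `interface()` is not defined in this snippet; the `options, arg` return
# shape suggests an optparse-style parser.  A minimal sketch, assuming the
# five option names used above -- the flags, types, and defaults here are
# illustrative, not the original script's:
import optparse


def interface():
    p = optparse.OptionParser(usage="usage: %prog [options]")
    p.add_option("--target", dest="target", help="target FASTA file")
    p.add_option("--query", dest="query", help="query FASTA file")
    p.add_option("--coverage", dest="coverage", type="float", default=83.0,
                 help="minimum percent coverage of the alignment")
    p.add_option("--identity", dest="identity", type="float", default=92.5,
                 help="minimum percent identity of the alignment")
    p.add_option("--output", dest="output", help="path for the lastz output")
    return p.parse_args()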
import copy
import glob
import os
import re
from collections import defaultdict

from Bio import SeqIO

import lastz  # project-local lastz wrapper

# Helpers assumed to be defined elsewhere in this script/package:
# new_get_probe_name, get_dupes, get_organism_names_from_fasta_files,
# create_probe_database, contig_count, get_contig_name,
# check_contigs_for_dupes, check_loci_for_dupes, get_kmer_value,
# store_lastz_results_in_db, pretty_log_output, and a module-level `log`.


def main(args):
    pre_regex = args.regex
    regex = re.compile("^(%s)(?:.*)" % pre_regex)
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        raise IOError(
            "The directory {} already exists. Please check and remove by hand."
            .format(args.output))
    exons = set(
        new_get_probe_name(seq.id, regex)
        for seq in SeqIO.parse(args.reference, 'fasta'))
    if args.dupefile:
        dupes = get_dupes(log, args.dupefile, regex)
    else:
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    for f in fasta_files:
        # mask ambiguous IUPAC bases as N on sequence (non-header) lines;
        # the character class must not contain commas, or commas in the
        # file would be masked as well
        replace_bad_fasta_chars = "sed -i -e '/>/! s=[KYRSMWBDHVkyrsmwbdhv]=N=g' %s" % f
        # BSD sed's -i -e leaves "*-e" backup copies behind; remove them
        remove_os_sed_copies = "rm %s/*-e " % args.contigs
        fasta_name = f.split('/')[-1]
        if not fasta_name.startswith('sample'):
            rename_samples = "mv %s %s/sample_%s" % (f, args.contigs, fasta_name)
            os.system(rename_samples)
        os.system(replace_bad_fasta_chars)
        os.system(remove_os_sed_copies)
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        log, os.path.join(args.output, 'probe.matches.sqlite'), organisms, exons)
    log.info("Processing contig data")
    # open a file for duplicate writing, if we're interested
    if args.keep_duplicates is not None:
        dupefile = open(args.keep_duplicates, 'w')
    else:
        dupefile = None
    log.info("{}".format("-" * 65))
    kmers = {}
    for contig in sorted(fasta_files):
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.reference, args.min_coverage,
                                args.min_identity, output)
        lzstdout, lztstderr = alignment.run()
        if lztstderr:
            raise EnvironmentError("lastz: {}".format(lztstderr))
        # parse the lastz results of the alignment
        matches = defaultdict(set)
        orientation = defaultdict(set)
        revmatches = defaultdict(set)
        probe_dupes = set()
        for lz in lastz.Reader(output):
            contig_name = get_contig_name(lz.name1, args)
            exon_name = new_get_probe_name(lz.name2, regex)
            if args.dupefile and exon_name in dupes:
                probe_dupes.add(exon_name)
            else:
                matches[contig_name].add(exon_name)
                orientation[exon_name].add(lz.strand2)
                revmatches[exon_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_exons = check_contigs_for_dupes(matches)
        exon_dupe_contigs, exon_dupe_exons = check_loci_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_exons.union(exon_dupe_contigs)
        # write out duplicates if requested
        if dupefile is not None:
            log.info("Writing duplicates file for {}".format(critter))
            if len(exon_dupe_exons) != 0:
                dupefile.write(
                    "[{} - probes hitting multiple contigs]\n".format(critter))
                for exon in exon_dupe_exons:
                    dupefile.write(
                        "{}:{}\n".format(exon, ', '.join(revmatches[exon])))
                dupefile.write("\n")
            if len(contigs_matching_mult_exons) != 0:
                dupefile.write(
                    "[{} - contigs hitting multiple probes]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    dupefile.write(
                        "{}:{}\n".format(dupe, ', '.join(matches[dupe])))
                dupefile.write("\n")
                dupefile.write("[{} - contig orientation]\n".format(critter))
                for dupe in contigs_matching_mult_exons:
                    for exon in list(matches[dupe]):
                        dupefile.write(
                            "{}:{}\n".format(exon, list(orientation[exon])[0]))
                dupefile.write("\n")
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        # sum the kmer counts of all target contigs that survived filtering;
        # the count is parsed from the contig header by get_kmer_value()
        for lz in lastz.Reader(output):
            for element in matches:
                if re.search(r"^(\d*)\s\d*\s\d*.*", lz[1]).groups()[0] == element:
                    kmers.setdefault(contig, [])
                    kmers[contig].append(get_kmer_value(lz.name1))
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_log_output(log, critter, matches, contigs, probe_dupes,
                          contigs_matching_mult_exons, exon_dupe_exons)
    with open(os.path.join(args.output, 'kmer_count.txt'), 'w') as kmerfile:
        for key in kmers:
            count = sum(int(element) for element in kmers[key])
            kmerfile.write(
                "%s : %d\n" % (os.path.basename(key).split('.')[0], count))
    if dupefile is not None:
        dupefile.close()
    log.info("{}".format("-" * 65))
    log.info("The LASTZ alignments are in {}".format(args.output))
    log.info("The exon match database is in {}".format(
        os.path.join(args.output, "probe.matches.sqlite")))
    text = "Completed"
    log.info(text.center(65, "="))
    # access the SQLite database and export a tab-separated text file
    sql_file = os.path.join(args.output, 'probe.matches.sqlite')
    tsf_out = os.path.join(args.output, 'match_table.txt')
    sql_cmd = "%s -header -nullvalue '.' -separator '\t' %s \"select * from matches;\" > %s" % (
        args.sqlite3, sql_file, tsf_out)
    os.system(sql_cmd)
    # create the config file for the extraction of the desired loci
    output_folder = args.output
    with open(os.path.join(output_folder, 'config'), 'w') as f:
        print('[Organisms]', file=f)
        for aln in glob.glob(os.path.join(output_folder, '*.lastz')):
            aln = os.path.basename(aln).replace('.lastz', '')
            print(aln, file=f)
        print('\n[Loci]', file=f)
        with open(os.path.join(output_folder, 'match_table.txt')) as match_table:
            for line in match_table.readlines()[1:]:
                print(line.split('\t')[0], file=f)
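# `get_kmer_value()` is assumed to pull a kmer/coverage count out of the
# contig header passed in as lz.name1.  A minimal sketch, assuming
# Velvet-style headers such as "NODE_5_length_1000_cov_12.3"; if the
# assemblies use a different naming scheme, the pattern must change:
import re


def get_kmer_value(header):
    match = re.search(r"cov_(\d+(?:\.\d+)?)", header)
    if match is None:
        return "0"
    # return the integer part as a string, since the caller sums the
    # collected values with int(element)
    return match.group(1).split('.')[0]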
import copy
import os
import re
from collections import defaultdict

from Bio import SeqIO

import fasta  # project-local FASTA reader (provides FastaReader)
import lastz  # project-local lastz wrapper


def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        ])
    else:
        uces = set([
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        ])
        regex = None
    if args.dupefile:
        print("\t Getting dupes")
        dupes = get_dupes(args.dupefile, regex, args.repl)
    else:
        # keep `dupes` defined even without a dupefile, matching the
        # behavior of the related scripts
        dupes = set()
    # this variant runs on a single contig file rather than a directory
    contig = args.contigs
    organisms = ["contigs"]
    conn, c = create_probe_database(uces)
    print("Processing:")
    critter = os.path.basename(contig).split('.')[0].replace('-', "_")
    contigs = contig_count(contig)
    # align the probes to the contigs
    alignment = lastz.Align(contig, args.query, args.coverage,
                            args.identity, args.align)
    lzstdout, lztstderr = alignment.run()
    # parse the lastz results of the alignment
    matches, orientation, revmatches = \
        defaultdict(set), defaultdict(set), defaultdict(set)
    probe_dupes = set()
    if not lztstderr:
        for lz in lastz.Reader(args.align):
            # get strandedness of match
            contig_name = get_name(lz.name1)
            uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
            if args.dupefile and uce_name in dupes:
                probe_dupes.add(uce_name)
            else:
                matches[contig_name].add(uce_name)
                orientation[uce_name].add(lz.strand2)
                revmatches[uce_name].add(contig_name)
    else:
        print("Error in lastz:")
        print("STDerr:")
        print(lztstderr)
        print("STDout:")
        print(lzstdout)
    # we need to check nodes for dupe matches to the same probes
    contigs_matching_mult_uces = check_contigs_for_dupes(matches)
    uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
    nodes_to_drop = contigs_matching_mult_uces
    nodes_to_drop_one_of = uces_matching_mult_contigs
    # remove dupe and/or dubious nodes/contigs; for contigs that share a
    # match set, keep only the first one observed
    match_copy = copy.deepcopy(matches)
    already_observed = list()
    for k in match_copy.keys():
        if k in nodes_to_drop:
            del matches[k]
        elif k in nodes_to_drop_one_of:
            if matches[k] in already_observed:
                del matches[k]
            else:
                already_observed.append(matches[k])
    store_lastz_results_in_db(c, matches, orientation, critter)
    conn.commit()
    pretty_print_output(critter, matches, contigs, probe_dupes,
                        contigs_matching_mult_uces, uces_matching_mult_contigs)
    # get all the UCE records from the db
    query = "SELECT uce, {0} FROM match_map WHERE {0} IS NOT NULL".format("contigs")
    c.execute(query)
    data = {row[1].split("(")[0]: row[0] for row in c.fetchall()}
    nodenames = set(data.keys())
    # make sure we don't lose any dupes
    assert len(data) == len(nodenames), "There were duplicate contigs."
    outp = open(args.output, 'w')
    print("Building UCE fasta:")
    for record in SeqIO.parse(open(contig), 'fasta'):
        name = '_'.join(record.id.split('_')[:2])
        if name.lower() in nodenames:
            record.id = "{0}|{1}".format(data[name.lower()], record.id)
            outp.write(record.format('fasta'))
    outp.close()
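# `get_name()` is called above both with defaults (contig headers) and with
# an explicit split character, field count, and optional regex substitution
# (probe headers).  A minimal sketch consistent with those call sites; the
# actual defaults and lowercasing behavior of the original helper are
# assumptions:
def get_name(header, splitchar="_", items=2, regex=None, repl=None):
    # keep the first `items` fields of the header, e.g. "node_5" from
    # ">NODE_5_length_1000_cov_12.3"
    name = splitchar.join(header.lstrip('>').split(splitchar)[:items]).lower()
    if regex is not None and repl is not None:
        name = regex.sub(repl, name)
    return name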
import copy
import glob
import os
import re
from collections import defaultdict

import fasta  # project-local FASTA reader (provides FastaReader)
import lastz  # project-local lastz wrapper


def main():
    args = get_args()
    if args.regex and args.repl is not None:
        # e.g. "s_[0-9]+$"
        regex = re.compile(args.regex)
        uces = set([
            get_name(read.identifier, "|", 1, regex=regex, repl=args.repl)
            for read in fasta.FastaReader(args.query)
        ])
    else:
        uces = set([
            get_name(read.identifier, "|", 1)
            for read in fasta.FastaReader(args.query)
        ])
        regex = None
    if args.dupefile:
        print("\t Getting dupes")
        dupes = get_dupes(args.dupefile, regex, args.repl)
    else:
        # keep `dupes` defined even without a dupefile, matching the
        # behavior of the related scripts
        dupes = set()
    fasta_files = glob.glob(os.path.join(args.contigs, '*.fa*'))
    organisms = get_organism_names_from_fasta_files(fasta_files)
    conn, c = create_probe_database(
        os.path.join(args.output, 'probe.matches.sqlite'), organisms, uces)
    print("Processing:")
    for contig in fasta_files:
        critter = os.path.basename(contig).split('.')[0].replace('-', "_")
        output = os.path.join(
            args.output,
            os.path.splitext(os.path.basename(contig))[0] + '.lastz')
        contigs = contig_count(contig)
        # align the probes to the contigs
        alignment = lastz.Align(contig, args.query, args.coverage,
                                args.identity, output)
        lzstdout, lztstderr = alignment.run()
        # parse the lastz results of the alignment
        matches, orientation, revmatches = \
            defaultdict(set), defaultdict(set), defaultdict(set)
        probe_dupes = set()
        if not lztstderr:
            for lz in lastz.Reader(output):
                # get strandedness of match
                contig_name = get_name(lz.name1)
                uce_name = get_name(lz.name2, "|", 1, regex=regex, repl=args.repl)
                if args.dupefile and uce_name in dupes:
                    probe_dupes.add(uce_name)
                else:
                    matches[contig_name].add(uce_name)
                    orientation[uce_name].add(lz.strand2)
                    revmatches[uce_name].add(contig_name)
        # we need to check nodes for dupe matches to the same probes
        contigs_matching_mult_uces = check_contigs_for_dupes(matches)
        uces_matching_mult_contigs = check_probes_for_dupes(revmatches)
        nodes_to_drop = contigs_matching_mult_uces.union(uces_matching_mult_contigs)
        # remove dupe and/or dubious nodes/contigs
        match_copy = copy.deepcopy(matches)
        for k in match_copy.keys():
            if k in nodes_to_drop:
                del matches[k]
        store_lastz_results_in_db(c, matches, orientation, critter)
        conn.commit()
        pretty_print_output(critter, matches, contigs, probe_dupes,
                            contigs_matching_mult_uces, uces_matching_mult_contigs)
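# Both duplicate checks above reduce to the same question: which keys of a
# {name: set(...)} mapping point at more than one value?  A minimal sketch
# of helpers with that behavior, on the assumption that this is all the
# originals do:
def check_contigs_for_dupes(matches):
    """return the names of contigs matching more than one probe/UCE"""
    return set(name for name, hits in matches.items() if len(hits) > 1)


def check_probes_for_dupes(revmatches):
    """return the names of probes/UCEs matching more than one contig"""
    return set(name for name, hits in revmatches.items() if len(hits) > 1)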