def main():
    """Write a table of UCE-matching contig counts and mean lengths per organism.

    Organisms are read from the ``args.group`` section of the config file;
    names that also appear in the ``Excludes`` section are dropped.  Names
    ending in ``*`` are skipped.  For each remaining organism one
    tab-separated row (organism, number of matching contigs, average contig
    length) is written to ``args.output``.

    An organism without any matching contig is reported with a count of 0 and
    an average of 0.0 rather than aborting the run with a ZeroDivisionError.
    """
    args = get_args()
    conn = sqlite3.connect(args.db)
    try:
        c = conn.cursor()
        config = ConfigParser.RawConfigParser(allow_no_value=True)
        config.read(args.config)
        organisms = get_names_from_config(config, args.group)
        excludes = get_names_from_config(config, 'Excludes')
        if excludes:
            # build the lookup once instead of scanning the list per organism
            excluded = set(excludes)
            organisms = [org for org in organisms if org not in excluded]
        args.output.write("org\tcontigs\tavg len\n")
        for org in organisms:
            # skip extended data, which are typically from genome-enabled
            # orgs, not capture data
            if org.endswith('*'):
                continue
            # get the uce-matching node names from the db
            matching_nodes = get_matching_node_names(c, org)
            # contig files are named after the organism with '_' replaced by '-'
            contig_file = os.path.join(
                args.fasta,
                "{0}.{1}".format(org.replace('_', '-'), 'contigs.fasta')
            )
            records = fasta.FastaReader(contig_file)
            # a contig is matched on the first two '_'-separated fields of
            # its identifier (leading '>' removed)
            contig_lens = [
                len(seq) for seq in records
                if '_'.join(seq.identifier.strip('>').split('_')[0:2]) in matching_nodes
            ]
            # guard the mean: no matching contigs means nothing to average
            if contig_lens:
                avg_len = float(sum(contig_lens)) / len(contig_lens)
            else:
                avg_len = 0.0
            # write the average contig length of contigs matching UCEs
            args.output.write("{0}\t{1}\t{2}\n".format(org, len(contig_lens), avg_len))
    finally:
        conn.close()
def main():
    """Extract the contigs matching UCE loci for every organism into one FASTA file.

    Organisms and loci are read from the ``Organisms`` and ``Loci`` sections
    of the config file.  For each organism the contig file is located with
    ``find_file`` and the contig-to-locus lookup is fetched with
    ``get_nodes_for_uces``.  Each lookup entry is indexed as ``[0]`` (used in
    the output name) and ``[1]`` (compared to ``"-"`` to decide on a reverse
    complement).  Matching contigs are renamed, oriented, cleaned and written
    to ``args.output``.

    When ``args.notstrict`` is set, it is used as a writable handle and the
    loci missing for each organism are recorded there.

    Raises:
        ValueError: an organism name ends in ``*`` but ``args.extend_dir`` is
            not set.  Previously this case silently reused the files and
            lookup of the preceding organism (or failed with a NameError on
            the first one).
        AssertionError: the loci written (plus those recorded as missing) do
            not equal the configured loci.
    """
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        # bind the path as a parameter so that quotes in the file name cannot
        # break (or alter) the statement
        c.execute("ATTACH DATABASE ? AS extended", (args.extend_db,))
    organisms = get_names_from_config(config, "Organisms")
    uces = get_names_from_config(config, "Loci")
    uce_fasta_out = fasta.FastaWriter(args.output)
    # NOTE(review): the character class also matches a literal ',' and,
    # because re.sub consumes consecutive matches, runs longer than 21 Ns are
    # removed too -- confirm whether that is intended.
    regex = re.compile("[N,n]{1,21}")
    try:
        for organism in organisms:
            print("Getting {0} reads...".format(organism))
            written = []
            # going to need to do something more generic w/ suffixes
            file_stem = organism.replace("_", "-")
            is_extended = organism.endswith("*")
            # keyword arguments mirror the original call for each mode
            if args.notstrict:
                lookup_kwargs = {"extend": is_extended, "notstrict": True}
            elif is_extended:
                lookup_kwargs = {"extend": True}
            else:
                lookup_kwargs = {}
            if not is_extended:
                reads = find_file(args.contigs, file_stem)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, **lookup_kwargs)
            elif args.extend_dir:
                # remove the asterisk
                reads = find_file(args.extend_dir, file_stem.rstrip("*"))
                node_dict, missing = get_nodes_for_uces(
                    c, organism.rstrip("*"), uces, **lookup_kwargs
                )
            else:
                raise ValueError(
                    "Organism {0} is marked as extended ('*') but no extend_dir "
                    "was provided".format(organism)
                )
            for read in fasta.FastaReader(reads):
                contig = get_name(read.identifier).lower()
                coverage = get_coverage(read.identifier)
                if contig not in node_dict:
                    continue
                locus = node_dict[contig][0]
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(
                    locus, organism.rstrip("*"), coverage
                )
                # deal with strandedness because aligners dont, which
                # is annoying
                if node_dict[contig][1] == "-":
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # replace any occurrences of <21 Ns in a given sequence with
                # blanks.  These should gap out during alignment.
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = regex.sub("", uce_seq.sequence)
                    print("\tReplaced < 20 ambiguous bases in {0}".format(
                        uce_seq.identifier.split(" ")[0]
                    ))
                # Remove leading/trailing lowercase bases from velvet
                # assemblies.  Lowercase bases indicate low coverage, and
                # these have been problematic in downstream alignments.
                uce_seq.sequence = re.sub("^[acgtn]+", "", uce_seq.sequence)
                uce_seq.sequence = re.sub("[acgtn]+$", "", uce_seq.sequence)
                uce_fasta_out.write(uce_seq)
                written.append(str(locus))
            if args.notstrict and missing:
                args.notstrict.write("[{0}]\n".format(organism))
                for locus_name in missing:
                    args.notstrict.write("{0}\n".format(locus_name))
                    written.append(locus_name)
            assert set(written) == set(uces), "UCE names do not match"
    finally:
        # release the output file and the database even when an organism fails
        uce_fasta_out.close()
        conn.close()
def main():
    """Extract the contigs matching UCE loci for every organism into one FASTA file.

    Organisms and loci are read from the ``Organisms`` and ``Loci`` sections
    of the config file.  For each organism the contig file is located with
    ``find_file`` and the contig-to-locus lookup is fetched with
    ``get_nodes_for_uces``.  Each lookup entry is indexed as ``[0]`` (used in
    the output name) and ``[1]`` (compared to ``'-'`` to decide on a reverse
    complement).  Matching contigs are renamed, oriented, stripped of N runs
    and written to ``args.output``.  The organism name is used in the
    identifier as configured, including any trailing ``*``.

    When ``args.notstrict`` is set, it is used as a writable handle and the
    loci missing for each organism are recorded there.

    Raises:
        ValueError: an organism name ends in ``*`` but ``args.extend_dir`` is
            not set.  Previously this case silently reused the files and
            lookup of the preceding organism (or failed with a NameError on
            the first one).
        AssertionError: the loci written (plus those recorded as missing) do
            not equal the configured loci.
    """
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        # bind the path as a parameter so that quotes in the file name cannot
        # break (or alter) the statement
        c.execute("ATTACH DATABASE ? AS extended", (args.extend_db,))
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    uce_fasta_out = fasta.FastaWriter(args.output)
    # NOTE(review): the character class also matches a literal ',' and,
    # because re.sub consumes consecutive matches, runs longer than 21 Ns are
    # removed too -- confirm whether that is intended.
    regex = re.compile("[N,n]{1,21}")
    try:
        for organism in organisms:
            print("Getting {0} reads...".format(organism))
            written = []
            # going to need to do something more generic w/ suffixes
            file_stem = organism.replace('_', '-')
            is_extended = organism.endswith('*')
            # keyword arguments mirror the original call for each mode
            if args.notstrict:
                lookup_kwargs = {'extend': is_extended, 'notstrict': True}
            elif is_extended:
                lookup_kwargs = {'extend': True}
            else:
                lookup_kwargs = {}
            if not is_extended:
                reads = find_file(args.contigs, file_stem)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, **lookup_kwargs)
            elif args.extend_dir:
                # remove the asterisk
                reads = find_file(args.extend_dir, file_stem.rstrip('*'))
                node_dict, missing = get_nodes_for_uces(
                    c, organism.rstrip('*'), uces, **lookup_kwargs
                )
            else:
                raise ValueError(
                    "Organism {0} is marked as extended ('*') but no extend_dir "
                    "was provided".format(organism)
                )
            for read in fasta.FastaReader(reads):
                contig = get_name(read.identifier).lower()
                coverage = get_coverage(read.identifier)
                if contig not in node_dict:
                    continue
                locus = node_dict[contig][0]
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(locus, organism, coverage)
                # deal with strandedness because aligners dont, which
                # is annoying
                if node_dict[contig][1] == '-':
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # replace any occurrences of <21 Ns
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = regex.sub("", uce_seq.sequence)
                    print("\tReplaced < 20 ambiguous bases in {0}".format(
                        uce_seq.identifier.split(' ')[0]
                    ))
                uce_fasta_out.write(uce_seq)
                written.append(str(locus))
            if args.notstrict and missing:
                args.notstrict.write("[{0}]\n".format(organism))
                for locus_name in missing:
                    args.notstrict.write("{0}\n".format(locus_name))
                    written.append(locus_name)
            assert set(written) == set(uces), "UCE names do not match"
    finally:
        # release the output file and the database even when an organism fails
        uce_fasta_out.close()
        conn.close()
def main():
    """Extract the contigs matching exon loci for every organism into one FASTA file.

    Organisms and loci are read from the ``Organisms`` and ``Loci`` sections
    of the config file.  Matching contigs are renamed to
    ``<locus>_<organism> |<locus>``, reverse-complemented when the lookup
    entry's second field is ``"-"``, passed through
    ``replace_and_remove_bases`` and written to ``args.output``.  Loci missing
    for an organism are recorded in a ``config_extended`` file placed next to
    ``args.output``.

    Organism names ending in ``*`` are skipped with a log message: this
    function has no lookup branch for them, and they would otherwise reuse
    the contig file and lookup of the preceding organism.

    Raises:
        AssertionError: the loci written (plus those recorded as missing) do
            not equal the configured loci.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    organisms = get_names_from_config(config, "Organisms")
    log.info(
        "There are {} taxa in the match-count-config file named {}".format(
            len(organisms), os.path.basename(args.config)
        )
    )
    exons = get_names_from_config(config, "Loci")
    log.info("There are {} exon loci in the matrix".format(len(exons)))
    regex = re.compile("[N,n]{1,21}")
    # dirname (unlike splitting on "/") keeps the root directory for an
    # output path such as "/out.fasta"
    temp_conf = os.path.join(os.path.dirname(args.output), "config_extended")
    try:
        with open(temp_conf, "w") as incomplete_outf:
            with open(args.output, "w") as exon_fasta_out:
                for organism in organisms:
                    text = "Getting exon loci for {0}".format(organism)
                    log.info(text.center(65, "-"))
                    if organism.endswith("*"):
                        log.info(
                            "Skipping {0}: extended ('*') organisms are not "
                            "supported here".format(organism)
                        )
                        continue
                    written = []
                    # going to need to do something more generic w/ suffixes
                    file_stem = organism.replace("_", "-")
                    reads = find_file(args.contigs, file_stem)
                    node_dict, missing = get_nodes_for_exons(
                        c, organism, exons, extend=False, notstrict=True
                    )
                    count = 0
                    log.info("There are {} exon loci for {}".format(len(node_dict), organism))
                    log.info("Parsing and renaming contigs for {}".format(organism))
                    # close the contig file as soon as it has been parsed
                    with open(reads, "rU") as handle:
                        for seq in SeqIO.parse(handle, "fasta"):
                            contig = get_contig_name(seq.id).lower()
                            if contig not in node_dict:
                                continue
                            locus = node_dict[contig][0]
                            seq.id = "{0}_{1} |{0}".format(locus, organism.rstrip("*"))
                            seq.name = ""
                            seq.description = ""
                            # deal with strandedness because aligners
                            # sometimes dont, which is annoying
                            if node_dict[contig][1] == "-":
                                seq.seq = seq.seq.reverse_complement()
                            # Per the original note: removes short runs of Ns
                            # (they should gap out during alignment) and
                            # leading/trailing lowercase (low coverage) bases;
                            # count tracks how many contigs were changed.
                            seq, count = replace_and_remove_bases(regex, seq, count)
                            exon_fasta_out.write(seq.format("fasta"))
                            written.append(str(locus))
                    if count > 0:
                        log.info(
                            "Replaced <20 ambiguous bases (N) in {} contigs for {}".format(
                                count, organism
                            )
                        )
                    if missing:
                        log.info("Writing missing locus information to {}".format(temp_conf))
                        incomplete_outf.write("[{0}]\n".format(organism))
                        for locus_name in missing:
                            incomplete_outf.write("{0}\n".format(locus_name))
                            written.append(locus_name)
                    assert set(written) == set(exons), "exon names do not match"
    finally:
        conn.close()
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))
def main(args):
    """Extract the contigs matching exon loci into one FASTA file, optionally keeping duplicates.

    Organisms and loci are read from the ``Organisms`` and ``Loci`` sections
    of ``args.config``.  When ``args.include_duplicates`` is not None it is
    read as a second config file, and for each organism the sections
    ``"<organism> - contigs hitting multiple probes"`` and
    ``"<organism> - contig orientation"`` are passed on to
    ``get_nodes_for_exons`` as dictionaries.  Matching contigs are renamed,
    oriented, cleaned by ``replace_and_remove_bases`` and written to
    ``args.output``.  Missing loci are recorded in a ``config_extended`` file
    placed next to ``args.output``.

    Organism names ending in ``*`` are skipped with a log message: this
    function has no lookup branch for them, and they would otherwise reuse
    the contig file and lookup of the preceding organism.

    Relies on a ``log`` object defined outside this function.

    Args:
        args: parsed options; the attributes read here are ``config``,
            ``locus_db``, ``include_duplicates``, ``output`` and ``contigs``.
    """
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.config)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    organisms = get_names_from_config(config, 'Organisms')
    log.info("There are {} taxa in the match-count-config file named {}".format(
        len(organisms),
        os.path.basename(args.config)
    ))
    exons = get_names_from_config(config, 'Loci')
    include_duplicates = args.include_duplicates is not None
    if include_duplicates:
        dupe_config = ConfigParser.RawConfigParser(allow_no_value=True)
        dupe_config.optionxform = str
        dupe_config.read(args.include_duplicates)
    log.info("There are {} exon loci in the matrix".format(len(exons)))
    regex = re.compile("[N,n]{1,21}")
    # dirname (unlike splitting on '/') keeps the root directory for an
    # output path such as "/out.fasta"
    temp_conf = os.path.join(os.path.dirname(args.output), 'config_extended')
    try:
        with open(temp_conf, 'w') as incomplete_outf:
            with open(args.output, 'w') as exon_fasta_out:
                for organism in organisms:
                    organism_dupe_dict = {}
                    organism_orientation_dict = {}
                    if include_duplicates:
                        # setdefault keeps the first value should a key repeat
                        for key, value in dupe_config.items(
                                '%s - contigs hitting multiple probes' % organism):
                            organism_dupe_dict.setdefault(key, value)
                        for key, value in dupe_config.items(
                                '%s - contig orientation' % organism):
                            organism_orientation_dict.setdefault(key, value)
                    text = "Getting exon loci for {0}".format(organism)
                    log.info(text.center(65, "-"))
                    if organism.endswith('*'):
                        log.info(
                            "Skipping {0}: extended ('*') organisms are not "
                            "supported here".format(organism)
                        )
                        continue
                    written = []
                    # going to need to do something more generic w/ suffixes
                    file_stem = organism.replace('_', '-')
                    reads = find_file(args.contigs, file_stem)
                    node_dict, missing = get_nodes_for_exons(
                        c, organism, exons, args,
                        organism_dupe_dict, organism_orientation_dict,
                        extend=False, notstrict=True
                    )
                    count = 0
                    log.info("There are {} exon loci for {}".format(len(node_dict), organism))
                    log.info("Parsing and renaming contigs for {}".format(organism))
                    # close the contig file as soon as it has been parsed
                    with open(reads, 'rU') as handle:
                        for seq in SeqIO.parse(handle, 'fasta'):
                            contig = get_contig_name(seq.id, args).lower()
                            if contig not in node_dict:
                                continue
                            locus = node_dict[contig][0]
                            seq.id = "{0}_{1} |{0}".format(locus, organism.rstrip('*'))
                            seq.name = ''
                            seq.description = ''
                            # deal with strandedness because aligners
                            # sometimes dont, which is annoying
                            if node_dict[contig][1] == '-':
                                seq.seq = seq.seq.reverse_complement()
                            # Per the original note: removes short runs of Ns
                            # (they should gap out during alignment) and
                            # leading/trailing lowercase (low coverage) bases;
                            # count tracks how many contigs were changed.
                            seq, count = replace_and_remove_bases(regex, seq, count)
                            exon_fasta_out.write(seq.format('fasta'))
                            written.append(str(locus))
                    if count > 0:
                        log.info(
                            "Replaced <20 ambiguous bases (N) in {} contigs for {}".format(
                                count, organism
                            )
                        )
                    if missing:
                        log.info("Writing missing locus information to {}".format(temp_conf))
                        incomplete_outf.write("[{0}]\n".format(organism))
                        for locus_name in missing:
                            incomplete_outf.write("{0}\n".format(locus_name))
                            written.append(locus_name)
                    # The written-vs-configured loci check is intentionally not
                    # performed here: it fails when duplicates are included.
    finally:
        conn.close()
    text = " Completed! "
    log.info(text.center(65, "="))
def main():
    """Extract the contigs matching UCE loci for every organism into one FASTA file.

    Organisms and loci are read from the ``Organisms`` and ``Loci`` sections
    of ``args.match_count_output``.  For each organism the contig file is
    located with ``find_file`` and the contig-to-locus lookup is fetched with
    ``get_nodes_for_uces``.  Matching contigs are renamed to
    ``<locus>_<organism> |<locus>``, reverse-complemented when the lookup
    entry's second field is ``'-'``, cleaned by ``replace_and_remove_bases``
    and written to ``args.output``.

    When ``args.incomplete_matrix`` is set, it is opened for writing and the
    loci missing for each organism are recorded there.

    Raises:
        ValueError: an organism name ends in ``*`` but
            ``args.extend_locus_contigs`` is not set.  Previously this case
            silently reused the files and lookup of the preceding organism
            (or failed with a NameError on the first one).
        AssertionError: the loci written (plus those recorded as missing) do
            not equal the configured loci.
    """
    args = get_args()
    # setup logging
    log, my_name = setup_logging(args)
    # parse the config file - allowing no values (e.g. no ":" in config file)
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.optionxform = str
    config.read(args.match_count_output)
    # connect to the database
    conn = sqlite3.connect(args.locus_db)
    c = conn.cursor()
    # attach to external database, if passed as option
    if args.extend_locus_db:
        log.info("Attaching extended database {}".format(
            os.path.basename(args.extend_locus_db)
        ))
        # bind the path as a parameter so that quotes in the file name cannot
        # break (or alter) the statement
        c.execute("ATTACH DATABASE ? AS extended", (args.extend_locus_db,))
    organisms = get_names_from_config(config, 'Organisms')
    log.info("There are {} taxa in the match-count-config file named {}".format(
        len(organisms),
        os.path.basename(args.match_count_output)
    ))
    uces = get_names_from_config(config, 'Loci')
    if not args.incomplete_matrix:
        log.info("There are {} shared UCE loci in a COMPLETE matrix".format(len(uces)))
    else:
        log.info("There are {} UCE loci in an INCOMPLETE matrix".format(len(uces)))
    regex = re.compile("[N,n]{1,21}")
    incomplete_outf = None
    if args.incomplete_matrix:
        incomplete_outf = open(args.incomplete_matrix, 'w')
    try:
        with open(args.output, 'w') as uce_fasta_out:
            for organism in organisms:
                text = "Getting UCE loci for {0}".format(organism)
                log.info(text.center(65, "-"))
                written = []
                # going to need to do something more generic w/ suffixes
                file_stem = organism.replace('_', '-')
                is_extended = organism.endswith('*')
                # keyword arguments mirror the original call for each mode
                if args.incomplete_matrix:
                    lookup_kwargs = {'extend': is_extended, 'notstrict': True}
                elif is_extended:
                    lookup_kwargs = {'extend': True}
                else:
                    lookup_kwargs = {}
                if not is_extended:
                    reads = find_file(args.contigs, file_stem)
                    node_dict, missing = get_nodes_for_uces(
                        c, organism, uces, **lookup_kwargs
                    )
                elif args.extend_locus_contigs:
                    # remove the asterisk
                    reads = find_file(args.extend_locus_contigs, file_stem.rstrip('*'))
                    node_dict, missing = get_nodes_for_uces(
                        c, organism.rstrip('*'), uces, **lookup_kwargs
                    )
                else:
                    raise ValueError(
                        "Organism {0} is marked as extended ('*') but no "
                        "extend_locus_contigs was provided".format(organism)
                    )
                count = 0
                log.info("There are {} UCE loci for {}".format(len(node_dict), organism))
                log.info("Parsing and renaming contigs for {}".format(organism))
                # close the contig file as soon as it has been parsed
                with open(reads, 'rU') as handle:
                    for seq in SeqIO.parse(handle, 'fasta'):
                        contig = get_contig_name(seq.id).lower()
                        if contig not in node_dict:
                            continue
                        locus = node_dict[contig][0]
                        seq.id = "{0}_{1} |{0}".format(locus, organism.rstrip('*'))
                        seq.name = ''
                        seq.description = ''
                        # deal with strandedness because aligners sometimes
                        # dont, which is annoying
                        if node_dict[contig][1] == '-':
                            seq.seq = seq.seq.reverse_complement()
                        # Per the original note: removes short runs of Ns
                        # (they should gap out during alignment) and
                        # leading/trailing lowercase (low coverage) bases;
                        # count tracks how many contigs were changed.
                        seq, count = replace_and_remove_bases(regex, seq, count)
                        uce_fasta_out.write(seq.format('fasta'))
                        written.append(str(locus))
                if count > 0:
                    log.info(
                        "Replaced <20 ambiguous bases (N) in {} contigs for {}".format(
                            count, organism
                        )
                    )
                if args.incomplete_matrix and missing:
                    log.info("Writing missing locus information to {}".format(
                        args.incomplete_matrix
                    ))
                    incomplete_outf.write("[{0}]\n".format(organism))
                    for locus_name in missing:
                        incomplete_outf.write("{0}\n".format(locus_name))
                        written.append(locus_name)
                assert set(written) == set(uces), "UCE names do not match"
    finally:
        # release the optional report file and the database even on failure
        if incomplete_outf is not None:
            incomplete_outf.close()
        conn.close()
    text = " Completed {} ".format(my_name)
    log.info(text.center(65, "="))