def main():
    """Entry point: prepare the query FASTA and run the UniRef90 search.

    Steps, in order:
      1. Read the arguments with ``get_args()``.
      2. Switch on the module-level ``g_force_search`` flag when
         ``args.force_search`` is set.
      3. Default ``args.out`` to ``args.fasta + ".annotated"``.
      4. When ``args.seqtype`` is ``"cds"``, translate the input into a
         ``<basename>.translated`` file under ``args.temp`` and search that
         file as ``"prot"`` instead.
      5. Run ``uniref_search`` against ``args.uniref90db``.

    The later stages (result parsing, UniRef50 search, transitive overrides,
    re-annotation) are currently disabled; they are kept below as comments.
    """
    global g_force_search

    args = get_args()

    # global config
    if args.force_search:
        g_force_search = True

    # set defaults
    if args.out is None:
        args.out = args.fasta + ".annotated"

    # translate fasta?
    query = args.fasta
    if args.seqtype == "cds":
        # the translated copy lives in the temp dir under the input's basename
        query = os.path.join(args.temp, os.path.split(args.fasta)[1]) + ".translated"
        say("Translating input fasta to:\n ", query)
        translate_fasta(args.fasta, query)
        args.seqtype = "prot"

    # perform uniref90 search
    uniref90hits = uniref_search(
        diamond=args.diamond,
        database=args.uniref90db,
        query=query,
        seqtype=args.seqtype,
        temp=args.temp,
        diamond_options=args.diamond_options,
    )

    # NOTE: everything from here to "done" is disabled in the current version.
    #
    # uniref90hits = query + ".uniref90.hits"
    # uniref90map = parse_results( uniref90hits )
    # # perform uniref50 search
    # uniref50hits = uniref_search(
    #     diamond=args.diamond,
    #     database=args.uniref50db,
    #     query=query,
    #     seqtype=args.seqtype,
    #     temp=args.temp,
    #     diamond_options=args.diamond_options,
    # )
    # uniref50hits = query + ".uniref50.hits"
    # uniref50map = parse_results( uniref50hits )
    # # override mappings?
    # overrides = {}
    # if args.transitive_map is not None:
    #     overrides = trans_mapping( uniref90map, args.transitive_map )
    # # reannotate the fasta
    # reannotate(
    #     query=args.fasta,
    #     out=args.out,
    #     uniref90map=uniref90map,
    #     uniref50map=uniref50map,
    #     overrides=overrides,
    # )
    # # done

    say("Finished successfully.")
def parse_results(results):
    """Build a query-id -> UniRef id mapping from a tab-separated hits file.

    The minimum percent identity is whatever number remains after removing
    "uniref" from ``get_mode(results)`` (presumably 90.0 for a "uniref90"
    file -- confirm against ``get_mode``). For each query id, the first row
    that meets both that identity cutoff and ``c_min_coverage`` is kept;
    later rows for an already-mapped query are ignored.

    Args:
        results: path to the hits file; validated with ``check_path``.

    Returns:
        dict mapping ``qseqid`` to the part of ``sseqid`` before the first "|".
    """
    say("Parsing results file:\n ", results)
    check_path(results)

    min_pident = float(get_mode(results).replace("uniref", ""))
    mapping = {}

    with open(results) as fh:
        for fields in csv.reader(fh, csv.excel_tab):
            hit = Hit(fields, config=c_output_format)
            # only the first qualifying hit per query is recorded
            if hit.qseqid in mapping:
                continue
            identity_ok = float(hit.pident) >= min_pident
            if identity_ok and float(hit.mcov) >= float(c_min_coverage):
                mapping[hit.qseqid] = hit.sseqid.split("|")[0]

    return mapping
def trans_mapping(uniref90map, p_trans_map):
    """Derive per-header UniRef50 overrides from a UniRef90 -> UniRef50 table.

    Args:
        uniref90map: dict mapping sequence header -> UniRef90 id.
        p_trans_map: path to a two-column, tab-separated file whose rows are
            ``(uniref90, uniref50)``; validated with ``check_path``. A row
            that does not have exactly two fields raises ``ValueError``.

    Returns:
        dict mapping header -> UniRef50 id for every header whose UniRef90 id
        (after stripping a trailing "-<digits>" suffix) appears in the table.
        When a UniRef90 id occurs on several rows, the last row wins.
    """
    say("Loading transitive mapping file:\n ", p_trans_map)
    check_path(p_trans_map)

    # invert header -> UniRef90 so each UniRef90 id lists the headers it covers
    headers_by_uniref90 = {}
    for header, code in uniref90map.items():
        # modified by yancong: drop a trailing "-<digits>" suffix before lookup
        stripped = re.sub("-[0-9]+$", "", code)
        if stripped not in headers_by_uniref90:
            headers_by_uniref90[stripped] = set()
        headers_by_uniref90[stripped].add(header)

    overrides = {}
    with open(p_trans_map) as fh:
        for uniref90, uniref50 in csv.reader(fh, csv.excel_tab):
            for header in headers_by_uniref90.get(uniref90, ()):
                overrides[header] = uniref50
    return overrides
def uniref_search(diamond=None, database=None, query=None, seqtype=None,
                  temp=None, diamond_options=None):
    """Run a diamond search of *query* against *database*; return the hits path.

    The hits file is written to ``<temp>/<basename(query)>.<mode>.hits`` where
    ``mode`` comes from ``get_mode(database)``. If that file already exists and
    ``g_force_search`` is not set, the search is skipped and the existing file
    is reused.

    Args:
        diamond: name or path of the diamond executable (checked with ``which``).
        database: path to the diamond database.
        query: path to the query FASTA.
        seqtype: ``"nuc"`` (runs ``blastx``) or ``"prot"`` (runs ``blastp``);
            any other value raises ``KeyError``.
        temp: directory used for diamond's ``--tmpdir`` and for the hits file.
        diamond_options: optional string appended verbatim to the command line.

    Returns:
        Path of the hits file.

    A non-zero exit status from the search is reported through ``die`` rather
    than being silently ignored.
    """
    if which(diamond) is None:
        die("<diamond> is not executable as: {}".format(diamond))
    for path in [database, query, temp]:
        check_path(path)

    binary = {"nuc": "blastx", "prot": "blastp"}[seqtype]
    mode = get_mode(database)

    # name the hits file after the query, but place it in the temp directory
    results = os.path.split(query)[1]
    results = os.path.join(temp, results)
    results = ".".join([results, mode, "hits"])

    command = [
        diamond,
        binary,
        "--db", database,
        "--query", query,
        "--outfmt", c_output_format,
        "--tmpdir", temp,
        "--out", results,
        #"--id", get_mode( results ).replace( "uniref", "" ),
        c_diamond_filters,
    ]
    command = " ".join([str(k) for k in command])
    if diamond_options is not None:
        command += " " + diamond_options

    if not os.path.exists(results) or g_force_search:
        say("Executing:\n ", command)
        status = os.system(command)
        # os.system returns a non-zero value when the command fails; without
        # this check a failed search would be treated as a success downstream
        if status != 0:
            die("<diamond> search failed with status {}: {}".format(
                status, command))
    else:
        say("Using existing results file:\n ", results)
    return results
def reannotate(query=None, out=None, uniref90map=None, uniref50map=None,
               overrides=None):
    """Copy the FASTA at *query* to *out*, appending UniRef codes to headers.

    Each header line (starting with ">") is rewritten as
    ``<original line>|<UniRef90 code>|<UniRef50 code>``. Non-header lines are
    copied after stripping surrounding whitespace; blank lines are dropped.
    Lookups use the first whitespace-delimited token of the header.

    The UniRef50 code is taken from *overrides* when the header is present
    there, otherwise from *uniref50map*. Headers without a match receive
    "UniRef90_unknown" / "UniRef50_unknown".

    Args:
        query: path to the input FASTA.
        out: path of the annotated FASTA to write.
        uniref90map: header -> UniRef90 code (``None`` is treated as empty).
        uniref50map: header -> UniRef50 code (``None`` is treated as empty).
        overrides: header -> UniRef50 code taking precedence over
            *uniref50map* (``None`` is treated as empty).

    Returns:
        None. A summary of the assignment counts is emitted through ``say``.
    """
    # the mapping parameters default to None; treat an omitted one as empty
    uniref90map = {} if uniref90map is None else uniref90map
    uniref50map = {} if uniref50map is None else uniref50map
    overrides = {} if overrides is None else overrides

    say("Writing new output file:\n ", out)
    ntot = nmap90 = ninf50 = nmap50 = 0

    # context managers close both handles even if a read or write fails
    with open(out, "w") as oh, open(query) as fh:
        for line in fh:
            line = line.strip()
            if line == "":
                continue
            if line[0] != ">":
                print(line, file=oh)
                continue
            # diamond breaks the header on whitespace
            tokens = line[1:].split()
            # a bare ">" has no identifier; use "" so it simply stays unmapped
            header = tokens[0] if tokens else ""
            ntot += 1
            uniref90code = "UniRef90_unknown"
            if header in uniref90map:
                uniref90code = uniref90map[header]
                nmap90 += 1
            uniref50code = "UniRef50_unknown"
            if header in overrides:
                uniref50code = overrides[header]
                ninf50 += 1
            elif header in uniref50map:
                uniref50code = uniref50map[header]
                nmap50 += 1
            print("|".join([line, uniref90code, uniref50code]), file=oh)

    def percent(count):
        # an input without any header lines would otherwise divide by zero
        return 100 * count / float(ntot) if ntot else 0.0

    # report
    say("Summary of annotations:")
    say(" Genes in input FASTA: {:,}".format(ntot))
    say(" UniRef90 codes assigned: {:,} ({:.1f}%)".format(
        nmap90, percent(nmap90)))
    say(" UniRef50 codes assigned: {:,} ({:.1f}%)".format(
        nmap50 + ninf50, percent(nmap50 + ninf50)))
    say(" UniRef50 codes inferred from UniRef90 codes: {:,} ({:.1f}%)".format(
        ninf50, percent(ninf50)))
    # done
    return None