def run_muscle_single(seq_name: str, seq: str, comparison_file: str) -> Dict[str, str]:
    """ Aligns a single query sequence against the sequences of a comparison
        file by running muscle in profile mode

        Arguments:
            seq_name: the name of the query
            seq: the sequence to align
            comparison_file: the path of the file containing comparison sequences

        Returns:
            a dictionary mapping sequence name (query or reference) to alignment
    """
    with NamedTemporaryFile(mode="w+") as temp_in, NamedTemporaryFile(mode="w+") as temp_out:
        # muscle works on files, so the query goes into a temporary fasta
        write_fasta([seq_name], [seq], temp_in.name)

        command = [
            get_config().executables.muscle,
            "-profile",
            "-quiet",
            "-in1", comparison_file,
            "-in2", temp_in.name,
            "-out", temp_out.name,
        ]
        result = execute(command)

        # the output has to be read before the temporary files are cleaned up
        if result.successful():
            return read_fasta(temp_out.name)

        raise RuntimeError(
            "muscle returned %d: %r while comparing query named %s"
            % (result.return_code, result.stderr.replace("\n", ""), seq_name))
def setUp(self):
    """ Builds a record holding rebuilt antiSMASH domains and p450 PFAM domains """
    self.record = secmet.Record()

    # except for Thioesterase, all domains were found in BN001301.1
    # TE domains were found in Y16952
    domain_inputs = [
        ("PKS_KS.input", "PKS_KS"),
        ("AT.input", "PKS_AT"),
        ("ACP.input", "ACP"),
        ("DH.input", "PKS_DH"),
        ("KR.input", "PKS_KR"),
        ("TE.input", "Thioesterase"),
        ("ER.input", "PKS_ER"),
    ]
    for filename, domain_type in domain_inputs:
        for domain in rebuild_domains(filename, domain_type):
            self.record.add_antismash_domain(domain)

    # these PFAMs found in BN001301.1 with clusterhmmer, one was excluded
    # to avoid a Biopython SearchIO bug
    dummy_location = secmet.feature.FeatureLocation(1, 100)
    p450_path = path.get_full_path(__file__, 'data', "p450.input")
    for name, translation in fasta.read_fasta(p450_path).items():
        pfam_domain = secmet.feature.PFAMDomain(dummy_location,
                                                protein_start=5,
                                                protein_end=10,
                                                description="test")
        pfam_domain.translation = translation
        pfam_domain.domain_id = "PFAM_p450_" + name
        pfam_domain.domain = "p450"
        self.record.add_pfam_domain(pfam_domain)
def sandpuma_test(adomain_file):
    """ Runs SANDPUMA over the sequences of the given fasta file, using the
        reference files from the 'data' directory beside the running script

        Arguments:
            adomain_file: path of the fasta file holding the query sequences
    """
    ## Query sequences
    test_fa = fasta.read_fasta(adomain_file)

    ## Reference data lives in 'data/' next to the script being run
    data_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + '/data/'

    def in_data(filename):
        return data_dir + filename

    ## Scalar parameters
    threads = 1
    wildcard = 'UNK'
    snn_thresh = 0.5
    max_depth = 40
    min_leaf_sup = 10
    masscutoff = 0.6

    ## Reference sequence sets
    knownfaa = in_data('fullset0_smiles.faa')
    knownasm = in_data('fullset0_smiles.stach.faa')
    seed_file = in_data('seed.afa')

    ## Decision tree inputs
    jackknife_data = in_data('sandpuma1_jackknife.tsv')
    nodemap_file = in_data('nodemap.tsv')
    traceback_file = in_data('traceback.tsv')

    ## Alignment, tree and pplacer reference package
    ref_aln = in_data('fullset0_smiles.afa')
    ## created with: fasttree -log fullset0_smiles.fasttree.log < fullset0_smiles.afa > fullset0_smiles.fasttree.nwk
    ref_tree = in_data('fullset0_smiles.fasttree.nwk')
    ## created with: taxit create --aln-fasta fullset0_smiles.afa --tree-stats fullset0_smiles.fasttree.log --tree-file fullset0_smiles.fasttree.nwk -P fullset0_smiles.fasttree.refpkg -l a_domain
    ref_pkg = in_data('fullset0_smiles.fasttree.refpkg')

    ## External tool data
    nrpspred2basedir = in_data('NRPSPredictor2')
    phmmdb = in_data('fullset20160624_cl_nrpsA.hmmdb')
    piddb = in_data('fullset0_smiles.dmnd')

    ## Actually test
    run_sandpuma(test_fa, threads, knownfaa, wildcard, snn_thresh, knownasm,
                 max_depth, min_leaf_sup, jackknife_data, ref_aln, ref_tree,
                 ref_pkg, masscutoff, seed_file, nodemap_file, traceback_file,
                 nrpspred2basedir, phmmdb, piddb)
def rebuild_domains(filename, domain_type):
    """ Creates one dummy domain of the given type for every sequence in a
        fasta file of the test data directory

        Arguments:
            filename: name of the fasta file within the 'data' directory
            domain_type: the domain type to set on every domain, also used
                         as the prefix of each domain's identifier

        Returns:
            a list of the created domains, in file order
    """
    source = path.get_full_path(__file__, 'data', filename)
    rebuilt = []
    for name, translation in fasta.read_fasta(source).items():
        dummy = DummyAntismashDomain(start=1, end=100,
                                     domain_id=domain_type + name)
        dummy.domain = domain_type
        dummy.translation = translation
        rebuilt.append(dummy)
    return rebuilt
def test_angstrom(self):
    """ Checks the 34 AA signature extracted from canned alignments """
    aligns_path = path.get_full_path(__file__, 'data', 'nrpspred_aligns.fasta')
    aligns = fasta.read_fasta(aligns_path)

    domain = DummyAntismashDomain(domain_id="query")
    # the domain's translation is the query alignment with the gaps removed
    domain.translation = aligns[domain.domain_id].replace("-", "")

    # muscle itself is not run, the canned alignments are returned instead
    with patch.object(subprocessing, "run_muscle_single", return_value=aligns):
        sig = nrps_predictor.get_34_aa_signature(domain)

    assert sig == "L--SFDASLFEMYLLTGGDRNMYGPTEATMCATW"
def rebuild_domains(filename, domain_type):
    """ Creates one AntismashDomain of the given type for every sequence in a
        fasta file of the test data directory

        Arguments:
            filename: name of the fasta file within the 'data' directory
            domain_type: the domain type to set on every domain, also used
                         as the prefix of each domain's identifier

        Returns:
            a list of the created domains, in file order
    """
    source = path.get_full_path(__file__, 'data', filename)
    sequences = fasta.read_fasta(source)
    # every domain shares the same placeholder location
    dummy_location = secmet.features.FeatureLocation(1, 100)

    rebuilt = []
    for name, translation in sequences.items():
        rebuilt_domain = secmet.features.AntismashDomain(dummy_location, tool="test")
        rebuilt_domain.domain = domain_type
        rebuilt_domain.domain_id = domain_type + name
        rebuilt_domain.translation = translation
        rebuilt.append(rebuilt_domain)
    return rebuilt
def trim_alignment(input_number: int, alignment_file: str) -> None:
    """ Trims an alignment to the region between the first and the last
        position at which the most common residue is not a gap and is shared
        by at least a third of all sequences. The trimmed sequences are
        written to 'trimmed_alignment<input_number>.fasta'.

        Arguments:
            input_number: the number used in the name of the output file
            alignment_file: the path of the fasta file with the alignment

        Returns:
            None
    """
    def find_first_aa_position(conservations: List[Dict[str, int]], sequence_count: int) -> int:
        """ Finds the first position of a shared amino acid """
        required = sequence_count / 3
        for position, conservation in enumerate(conservations):
            # most common residue, ties broken by the residue itself
            base, count = max(conservation.items(), key=lambda item: (item[1], item[0]))
            # best hits that are gaps never count as shared
            if base != "-" and count >= required:
                return position
        return 0  # can't be earlier than the start

    contents = fasta.read_fasta(alignment_file)
    seqs = list(contents.values())

    # check all sequences are the same length
    sequence_length = len(seqs[0])
    for name, seq in contents.items():
        assert sequence_length == len(seq), "%s has different sequence length" % name

    # stripping ( and ) because it breaks newick tree parsing
    # and keeping only the last two fields (id and description)
    names = []
    for name in contents:
        cleaned = name.replace("(", "_").replace(")", "_")
        names.append("|".join(cleaned.rsplit('|', 2)[-2:]))

    # store conservation of residues
    conservations = [defaultdict(int) for _ in range(sequence_length)]  # type: List[Dict[str, int]]
    for seq in seqs:
        for position, base in enumerate(seq):
            conservations[position][base] += 1

    # Find first and last amino acids shared, the last by searching backwards
    sequence_count = len(seqs)
    first_shared_amino = find_first_aa_position(conservations, sequence_count)
    from_the_end = find_first_aa_position(conservations[::-1], sequence_count)
    last_shared_amino = sequence_length - from_the_end

    # Shorten sequences to detected conserved regions
    trimmed = [seq[first_shared_amino:last_shared_amino] for seq in seqs]

    fasta.write_fasta(names, trimmed, "trimmed_alignment" + str(input_number) + ".fasta")
def generate_domains(self):
    """ Builds a PKS_KS dummy domain for every sequence of the PKS_KS test
        input, followed by a single PKS_KR dummy domain without a translation

        Returns:
            a list of the dummy domains, the PKS_KR domain last
    """
    input_path = path.get_full_path(__file__, 'data', 'PKS_KS.input')
    domains = []
    # never advanced, so every location starts at the same offset
    last_end = 0
    for translation in fasta.read_fasta(input_path).values():
        start = last_end + 10
        end = last_end + len(translation) * 3 + 16
        ks_domain = DummyAntismashDomain(location=FeatureLocation(start, end))
        ks_domain.translation = translation
        domains.append(ks_domain)
        ks_domain.domain = "PKS_KS"

    # the extra KR domain reuses the span of the last KS domain
    kr_location = FeatureLocation(last_end + 10,
                                  last_end + len(domains[-1].translation) * 3 + 16)
    kr_domain = DummyAntismashDomain(location=kr_location)
    domains.append(kr_domain)
    kr_domain.domain = "PKS_KR"
    return domains
def run_predicat(reference_aln: str, queryfa: Dict[str, str], wildcard: str,
                 ref_tree: str, ref_pkg: str, masscutoff: float,
                 snn_thresh: float) -> PredicatResults:
    """ pplacer and predicat substrate prediction

        Only the first entry of queryfa is used as the query.

        Arguments:
            reference_aln: filename for reference protein fasta, see sandpuma_multithreaded comments for requirements
            queryfa: seq id to seq dictionary
            wildcard: suffix str identifying query sequence (Default= 'UNK' which means headers end in '_UNK')
            ref_tree: reference tree (newick)
            ref_pkg: pplacer reference package
            masscutoff: cutoff value for pplacer masses
            snn_thresh: SNN threshold for confident prediction (default=0.5)

        Returns:
            PredicatResults
                monophyly -> substrate specificity (str)
                forced -> substrate specificity (str)
                nndist -> distance to nearest neighbor (float)
                nn_score -> nearest neighbor score (float)
                snn_score -> scaled nearest neighbor score (float)
    """
    query = next(iter(queryfa))

    ## Align query to a single known sequence
    ref = fasta.read_fasta(reference_aln)
    tname = next(iter(ref))  ## Grab any training sequence header
    to_align = {
        query: queryfa[query],
        tname: ref[tname].replace('-', ''),
    }
    aligned = subprocessing.run_mafft_predicat_trim(to_align)

    ## trim overhangs
    # The overhangs are the query columns facing the leading/trailing gaps of
    # the aligned reference. They are counted by stripping rather than with a
    # regex: a greedy '^.+(-*)$' lets '.+' swallow the trailing gaps, which
    # always gave a tail of 0 and left the right overhang untrimmed.
    aligned_ref = aligned[tname]
    head = len(aligned_ref) - len(aligned_ref.lstrip('-'))
    tail = len(aligned_ref) - len(aligned_ref.rstrip('-'))
    aligned_query = aligned[query]
    ## Removes head and tail then removes gaps
    trimmed = aligned_query[head:len(aligned_query) - tail].replace('-', '')
    trimmedfa = {query: trimmed}

    ## Align trimmed seq to reference
    all_aligned = subprocessing.run_muscle_profile_sandpuma(reference_aln, trimmedfa)

    ## Pplacer (NOTE: this is new to SANDPUMA as of antiSMASH5 and needs to be tested
    pplacer_tree = subprocessing.run_pplacer(ref_tree, reference_aln, ref_pkg, all_aligned)

    ## prediCAT
    return predicat(pplacer_tree, masscutoff, wildcard, snn_thresh)
def run_at_domain_analysis(domains: Dict[str, str]) -> ATSignatureResults:
    """ Analyses PKS signature of AT domains

        Arguments:
            domains: a dictionary mapping domain identifier (e.g. 'locus_AT2')
                     to domain sequence

        Returns:
            a dictionary mapping domain identifier to
                a list of ATResults ordered by decreasing score
    """
    at_positions = get_at_positions(startpos=7)

    # construct the query signatures, handling the domains in name order
    query_signatures = {}
    for name in sorted(domains):
        alignments = subprocessing.run_muscle_single(name, domains[name], _AT_DOMAINS_FILENAME)
        query_alignment = alignments[name]
        reference_alignment = alignments[_REF_SEQUENCE]
        query_signatures[name] = utils.extract_by_reference_positions(
            query_alignment, reference_alignment, at_positions)

    # load reference PKS signatures and score queries against them
    reference_signatures = fasta.read_fasta(_SIGNATURES_FILENAME)
    return score_signatures(query_signatures, reference_signatures)
def setUp(self):
    """ Loads the query sequences from the test data directory """
    data_file = path.get_full_path(__file__, 'data', 'SCO_genes.fasta')
    self.query_data = fasta.read_fasta(data_file)
def setUp(self):
    """ Loads canned alignments and mocks run_muscle_single to return them """
    aligns_path = path.get_full_path(__file__, 'data', 'nrpspred_aligns.fasta')
    self.aligns = fasta.read_fasta(aligns_path)
    mock("subprocessing.run_muscle_single", returns=self.aligns)
class TestMinowaAT(unittest.TestCase):
    """ Full-run check of the Minowa AT predictions for the SCO test sequences """
    # loaded once, when the class is defined
    query_data = fasta.read_fasta(path.get_full_path(__file__, "data", "SCO.fasta"))

    def setUp(self):
        build_config([])

    def tearDown(self):
        destroy_config()

    def test_full_run(self):
        # SCO6273_AT1, SCO6274_AT1 and SCO6274_AT2 have identical predictions
        shared_predictions = [
            ('Malonyl-CoA', 171.9), ('inactive', 73.8),
            ('Methoxymalonyl-CoA', 62.1), ('Methylmalonyl-CoA', 40.8),
            ('Propionyl-CoA', 29.3), ('Acetyl-CoA', 18.6),
            ('Isobutyryl-CoA', 15.6), ('2-Methylbutyryl-CoA', 14.1),
            ('Benzoyl-CoA', 9.6), ('trans-1,2-CPDA', 0.0),
            ('fatty_acid', 0.0), ('Ethylmalonyl-CoA', 0.0),
            ('CHC-CoA', 0.0), ('3-Methylbutyryl-CoA', 0.0),
        ]
        expected = {
            'SCO0126_AT1': [
                ('Malonyl-CoA', 81.1), ('Methoxymalonyl-CoA', 30.9),
                ('Methylmalonyl-CoA', 25.6), ('inactive', 23.2),
                ('Propionyl-CoA', 13.8), ('2-Methylbutyryl-CoA', 12.2),
                ('fatty_acid', 7.9), ('Isobutyryl-CoA', 1.8),
                ('CHC-CoA', 1.1), ('trans-1,2-CPDA', 0.0),
                ('Benzoyl-CoA', 0.0), ('Acetyl-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0), ('Ethylmalonyl-CoA', -3.2),
            ],
            'SCO0127_AT1': [
                ('Methoxymalonyl-CoA', 29.2), ('Methylmalonyl-CoA', 26.5),
                ('Malonyl-CoA', 22.1), ('Ethylmalonyl-CoA', 13.7),
                ('trans-1,2-CPDA', 0.0), ('inactive', 0.0),
                ('fatty_acid', 0.0), ('Isobutyryl-CoA', 0.0),
                ('CHC-CoA', 0.0), ('Benzoyl-CoA', 0.0),
                ('Acetyl-CoA', 0.0), ('3-Methylbutyryl-CoA', 0.0),
                ('Propionyl-CoA', -0.2), ('2-Methylbutyryl-CoA', -4.3),
            ],
            'SCO5892_AT1': [
                ('Malonyl-CoA', 151.7), ('inactive', 95.9),
                ('Methoxymalonyl-CoA', 74.7), ('Methylmalonyl-CoA', 70.4),
                ('Ethylmalonyl-CoA', 43.0), ('Propionyl-CoA', 35.7),
                ('Isobutyryl-CoA', 31.9), ('CHC-CoA', 27.7),
                ('2-Methylbutyryl-CoA', 26.1), ('Benzoyl-CoA', 25.0),
                ('Acetyl-CoA', 13.9), ('trans-1,2-CPDA', 13.7),
                ('3-Methylbutyryl-CoA', 12.5), ('fatty_acid', 9.7),
            ],
            'SCO6273_AT1': shared_predictions,
            'SCO6274_AT1': shared_predictions,
            'SCO6274_AT2': shared_predictions,
            'SCO6275_AT1': [
                ('Malonyl-CoA', 209.2), ('inactive', 103.5),
                ('Methoxymalonyl-CoA', 75.4), ('Methylmalonyl-CoA', 68.4),
                ('Isobutyryl-CoA', 37.8), ('2-Methylbutyryl-CoA', 31.3),
                ('Benzoyl-CoA', 30.9), ('Acetyl-CoA', 30.9),
                ('Propionyl-CoA', 29.8), ('Ethylmalonyl-CoA', 28.1),
                ('fatty_acid', 20.5), ('CHC-CoA', 16.6),
                ('3-Methylbutyryl-CoA', 15.4), ('trans-1,2-CPDA', 15.0),
            ],
            'SCO6275_AT2': [
                ('Malonyl-CoA', 203.5), ('inactive', 97.1),
                ('Methoxymalonyl-CoA', 72.9), ('Methylmalonyl-CoA', 61.7),
                ('Isobutyryl-CoA', 41.7), ('Propionyl-CoA', 30.9),
                ('Ethylmalonyl-CoA', 16.8), ('Acetyl-CoA', 16.8),
                ('2-Methylbutyryl-CoA', 14.2), ('Benzoyl-CoA', 13.3),
                ('3-Methylbutyryl-CoA', 9.0), ('fatty_acid', 8.4),
                ('CHC-CoA', 3.9), ('trans-1,2-CPDA', 0.0),
            ],
            'SCO6275_AT3': [
                ('Malonyl-CoA', 207.6), ('inactive', 105.9),
                ('Methoxymalonyl-CoA', 62.0), ('Methylmalonyl-CoA', 50.9),
                ('Propionyl-CoA', 30.8), ('Ethylmalonyl-CoA', 17.7),
                ('Isobutyryl-CoA', 16.7), ('2-Methylbutyryl-CoA', 15.7),
                ('Acetyl-CoA', 15.4), ('Benzoyl-CoA', 11.6),
                ('CHC-CoA', 9.5), ('trans-1,2-CPDA', 0.0),
                ('fatty_acid', 0.0), ('3-Methylbutyryl-CoA', 0.0),
            ],
            'SCO6827_AT1': [
                ('Methylmalonyl-CoA', 165.7), ('Ethylmalonyl-CoA', 150.9),
                ('Methoxymalonyl-CoA', 141.2), ('2-Methylbutyryl-CoA', 118.3),
                ('Malonyl-CoA', 106.6), ('trans-1,2-CPDA', 94.3),
                ('Benzoyl-CoA', 90.8), ('Isobutyryl-CoA', 90.1),
                ('Propionyl-CoA', 89.7), ('CHC-CoA', 65.8),
                ('Acetyl-CoA', 62.2), ('inactive', 45.4),
                ('3-Methylbutyryl-CoA', 43.8), ('fatty_acid', 23.7),
            ],
        }

        results = run_minowa_at(self.query_data)
        # every query must have a result and nothing else may be present
        assert len(results) == len(self.query_data)
        assert set(results) == set(self.query_data)

        predictions = {name: result.predictions for name, result in results.items()}
        assert predictions == expected
with open(input_filename, "w") as handle: for sig, domain in zip(signatures, a_domains): handle.write("%s\t%s\n" % (sig, domain.get_name())) # Run NRPSPredictor2 SVM commands = [ 'java', '-Ddatadir=%s' % data_dir, '-cp', classpath, 'org.roettig.NRPSpredictor2.NRPSpredictor2', '-i', input_filename, '-r', output_filename, '-s', '1', '-b', bacterial ] result = subprocessing.execute(commands) if not result.successful(): raise RuntimeError("NRPSPredictor2 failed: %s" % result.stderr) with open(output_filename) as handle: lines = handle.read().splitlines()[1:] # strip the header return read_output(lines) create_domain_fa = fasta.read_fasta( '/Users/robi0916/Documents/Wageningen_UR/github/sandpuma2_serina/flat/fullset20160624_cl.faa' ) domain_list = [] for i, domain in enumerate(create_domain_fa): domain_list.append(AntismashDomain(FeatureLocation( 1, 1, 1), tool="test")) # arbitrary feature location domain_list[i].domain_id = list(create_domain_fa.keys())[i] domain_list[i].translation = list(create_domain_fa.values())[i] run_nrpspredictor(domain_list)
def run_sandpuma(name2seq: Dict[str, str], threads: int, knownfaa: str, wildcard: str,
                 snn_thresh: float, knownasm: str, max_depth: int, min_leaf_sup: int,
                 jackknife_data: str, ref_aln: str, ref_tree: str, ref_pkg: str,
                 masscutoff: float, seed_file: str, nodemap_file: str, traceback_file: str,
                 nrpsdir: str, phmmdb: str, piddb: str):
    """ SANDPUMA parallelized pipeline

        Trains the decision tree from the jackknife data, loads the node map,
        the path accuracies and the active site motif fastas, then starts one
        process running sandpuma_multithreaded per group of query sequences.

        Arguments:
            name2seq: dictionary of seq names (str) to seqs (str)
            threads: number of threads
            knownfaa: filename for reference protein fasta; assumes each header ends in '_' followed by the <substrate specificity>
            wildcard: str to append to the end of each query sequence; should be different than all specificities (Default= 'UNK')
            snn_thresh: threshold for SNN score (Default= 0.5) NOTE: may need to be adjusted with new pplacer implementation
            knownasm: filename for reference active site motif protein fasta, similar header formatting as knownfaa
            max_depth: maximum depth for the sklearn decision tree; default= 40
            min_leaf_sup: minimum leaf support required within the decision tree; default= 10
            jackknife_data: filename for jackknife benchmarking results
            ref_aln: reference alignment (fasta) file
            ref_tree: reference tree (newick)
            ref_pkg: pplacer reference package
            masscutoff: cutoff value for pplacer masses
            seed_file: seed fasta file (single entry) used for stachelhaus code extraction
            nodemap_file: filename for map of decision tree outcomes
            traceback_file: jackknife results for all paths
            nrpsdir: dir for NRPSPredictor2
            phmmdb: pHMM database
            piddb: diamond db for PID

        Returns:
            None, the started processes are not joined by this function
    """
    ## Load jackknife data
    jk = {}
    allspec = {}
    with open(jackknife_data, "r") as handle:
        next(handle)  ## skip header
        for line in handle:
            fields = line.strip().split("\t")
            uname = fields[10]
            called_spec = 'no_call' if fields[7] == 'N' else fields[5]
            # a later line with the same name replaces the whole entry
            jk[uname] = {
                'true': fields[4],
                'pid': fields[3],
                'shuf': fields[0],
                'jk': fields[1],
                'query': fields[2],
                'bin': fields[11],
                'method': {fields[6]: called_spec},
            }
            allspec[fields[4]] = -1
            allspec[fields[5]] = -1

    ## Map specificities to integers
    # every value is still -1 here, so the stable sort keeps first-seen order
    i2s = sorted(allspec, key=allspec.get)
    for index, spec in enumerate(i2s):
        allspec[spec] = index

    ## Prepare features and labels
    allmethods = ['prediCAT', 'forced_prediCAT_snn50', 'svm', 'stach', 'phmm']
    features = []
    labels = []
    for uname in jk:
        calls = jk[uname]['method']
        # methods without a recorded call count as 'no_call'
        for method in allmethods:
            calls.setdefault(method, 'no_call')
        labels.append(allspec[jk[uname]['true']])
        feature_matrix = [jk[uname]['pid']]
        for method in allmethods:
            feature_matrix.extend(get_feature_matrix(calls[method], i2s))
        features.append(feature_matrix)

    ## Train the decision tree
    clf = tree.DecisionTreeClassifier(min_samples_leaf=min_leaf_sup, max_depth=max_depth)
    clf = clf.fit(features, labels)

    ## Load the nodemap for decision tree
    nodemap = {}
    with open(nodemap_file, "r") as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            fields = line.strip().split("\t")
            nodemap[int(fields[0])] = {
                'parent': int(fields[1]),
                'parent_call': fields[2],
                'decision': fields[3],
                'thresh': float(fields[4]),
            }
    nodemap = OrderedDict(sorted(nodemap.items(), key=lambda item: item[0]))

    def describe_step(parent_node: int, child_node: int) -> str:
        """ Formats the decision of a parent node and the call leading to the child """
        parent_info = nodemap[parent_node]
        return (parent_info['decision'] + '%' + str(parent_info['thresh'])
                + '-' + nodemap[child_node]['parent_call'])

    ## Define paths
    # each leaf is walked back up to the root (node 0), prepending every step
    paths = []
    for leaf in nodemap:
        if nodemap[leaf]['decision'] != 'LEAF_NODE':
            continue
        child = leaf
        parent = nodemap[child]['parent']
        traceback = describe_step(parent, child) + '&LEAF_NODE-' + str(leaf)
        while parent != 0:
            child = parent
            parent = nodemap[parent]['parent']
            traceback = describe_step(parent, child) + '&' + traceback
        paths.append(traceback)

    ## Load path accuracies
    pathacc = {}
    with open(traceback_file, "r") as handle:
        for line in handle:
            fields = line.strip().split("\t")
            # keyed by the trailing leaf identifier only; the replacement is
            # a raw string because "\g" is not a valid string escape
            leaf_id = re.sub(r"\S+&(LEAF_NODE-\d+)$", r"\g<1>", fields[2])
            pathacc[leaf_id] = {'pct': fields[0], 'n': fields[1]}

    ## Load ASM fastas
    stach_fa = fasta.read_fasta(knownasm)
    seed_fa = fasta.read_fasta(seed_file)

    ## Split groups
    groups = split_into_groups(name2seq, threads)
    for group in groups:
        toprocess = {name: seq for name, seq in name2seq.items()
                     if name in groups[group]}
        process = multiprocessing.Process(
            target=sandpuma_multithreaded,
            args=(group, toprocess, knownfaa, wildcard, snn_thresh, knownasm,
                  max_depth, min_leaf_sup, ref_aln, ref_tree, ref_pkg,
                  masscutoff, stach_fa, seed_fa, clf, i2s, paths, pathacc,
                  nrpsdir, phmmdb, piddb))
        process.start()
scores[str(match)] = {} for s in spec: scores[str(match)][s] = 1 ## Dereplicate and return spec predictions for i in range(0,10): m = str(9-i) if m in scores: seen = {} for s in scores[m]: if s.count('|') > 0: for ss in s.split('|'): seen[ss] = 1 else: seen[s] = 1 return('|'.join(sorted(seen)), m ) return('no_call','0') def main(queryfa, stachfa, seedfa): run_asm(queryfa, stachfa, seedfa) if __name__ == '__main__': if len(sys.argv) < 3: print("Not enough arguments") if len(sys.argv) == 3: main(read_fasta(sys.argv[1]), 'data/fullset0_smiles.stach.faa', read_fasta(sys.argv[3]))
def setUp(self):
    """ Loads canned alignments and mocks run_muscle_single to return them """
    aligns_path = path.get_full_path(nrps_pks.__file__, "test", "data",
                                     "nrpspred_aligns.fasta")
    self.aligns = read_fasta(aligns_path)
    mock("subprocessing.run_muscle_single", returns=self.aligns)