def run_minowa_predictor_pks_at(pksnames, pksseqs, options):
    # Predict PKS AT domain specificities with Minowa et al. method and
    # PKS code (NP searcher / ClustScan / own?)
    utils.writefasta(pksnames, pksseqs,
                     options.raw_predictions_outputfolder + os.sep
                     + "ctg" + str(options.record_idx) + "_pksseqs.fasta")
    # Run PKS signature analysis
    logging.info("Predicting PKS AT domain substrate specificities by "
                 "Yadav et al. PKS signature sequences")
    with TemporaryDirectory(change=True):
        PKS_analysis.run_pkssignature_analysis(
            path.join(options.raw_predictions_outputfolder,
                      "ctg" + str(options.record_idx) + "_pksseqs.fasta"),
            path.join(options.raw_predictions_outputfolder,
                      "ctg" + str(options.record_idx) + "_pkssignatures.txt"))

    # Minowa method: run Minowa_AT
    logging.info("Predicting PKS AT domain substrate specificities by "
                 "Minowa et al. method")
    with TemporaryDirectory(change=True):
        minowa_AT.run_minowa_at(
            options.raw_predictions_outputfolder + os.sep
            + "ctg" + str(options.record_idx) + "_pksseqs.fasta",
            options.raw_predictions_outputfolder + os.sep
            + "ctg" + str(options.record_idx) + "_minowa_pkspredoutput.txt")

def generate_image(cluster_number: int, smiles: str, structures_dir: str) -> bool:
    """ Constructs an image, if possible, of a cluster's product structure """
    filename = "genecluster%d" % cluster_number
    png = filename + ".png"
    smi = filename + ".smi"
    icon = filename + "_icon.png"
    with TemporaryDirectory(change=True):
        with open(smi, "w") as handler:
            handler.write(smiles)
        indigo = Indigo()
        query = indigo.loadMoleculeFromFile(smi)
        renderer = IndigoRenderer(indigo)
        # now that the renderer exists, so does the render-coloring option
        indigo.setOption("render-coloring", True)
        renderer.renderToFile(query, png)
        indigo.setOption("render-image-size", 200, 150)
        renderer.renderToFile(query, icon)
        # was it successful?
        dircontents = os.listdir(os.getcwd())
        # an exception should be raised by indigo, but just in case
        if png not in dircontents:
            return False
        # if so, move the files to the output dir
        for filename in [png, icon, smi]:
            shutil.copy(filename, structures_dir)
            os.remove(filename)
        return True

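# Hedged usage sketch for generate_image() above, not part of the original
# source: it renders ectoine's SMILES (the same string used in
# test_depict_ectoine further down) into PNG/icon files. The "structures"
# directory name is purely illustrative; an absolute path is used because
# generate_image changes the working directory internally.
import os

def example_generate_image():
    structures_dir = os.path.abspath("structures")  # hypothetical output dir
    os.makedirs(structures_dir, exist_ok=True)
    success = generate_image(0, "CC1=NCCC(N1)C(=O)O", structures_dir)
    if success:
        # generate_image copies genecluster0.png, genecluster0_icon.png and
        # genecluster0.smi into structures_dir before cleaning up its tempdir
        print(sorted(os.listdir(structures_dir)))
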
def test_classification_with_colon(self):
    # since SMCOG id and description are stored in a string separated by :,
    # ensure that descriptions containing : are properly handled
    # test gene is AQF52_5530 from CP013129.1
    translation = ("MDTHQREEDPVAARRDRTHYLYLAVIGAVLLGIAVGFLAPGVAVELKPLGTGFVN"
                   "LIKMMISPIIFCTIVLGVGSVRKAAKVGAVGGLALGYFLVMSTVALAIGLLVGNL"
                   "LEPGSGLHLTKEIAEAGAKQAEGGGESTPDFLLGIIPTTFVSAFTEGEVLQTLLV"
                   "ALLAGFALQAMGAAGEPVLRGIGHIQRLVFRILGMIMWVAPVGAFGAIAAVVGAT"
                   "GAAALKSLAVIMIGFYLTCGLFVFVVLGAVLRLVAGINIWTLLRYLGREFLLILS"
                   "TSSSESALPRLIAKMEHLGVSKPVVGITVPTGYSFNLDGTAIYLTMASLFVAEAM"
                   "GDPLSIGEQISLLVFMIIASKGAAGVTGAGLATLAGGLQSHRPELVDGVGLIVGI"
                   "DRFMSEARALTNFAGNAVATVLVGTWTKEIDKARVTEVLAGNIPFDEKTLVDDHA"
                   "PVPVPDQRAEGGEEKARAGV")
    cds = helpers.DummyCDS(0, len(translation))
    cds.translation = translation
    results = smcogs.classify.classify_genes([cds])
    assert results[cds.get_name()][0].hit_id == "SMCOG1212:sodium:dicarboxylate_symporter"
    record = helpers.DummyRecord(seq=translation)
    record.add_cds_feature(cds)
    record.add_cluster(helpers.DummyCluster(0, len(translation)))

    with TemporaryDirectory(change=True):
        results = smcogs.run_on_record(record, None, self.options)
        # if we don't handle multiple colons right, this line will crash
        results.add_to_record(record)
        gene_functions = cds.gene_functions.get_by_tool("smcogs")
        assert len(gene_functions) == 1
        assert str(gene_functions[0]).startswith(
            "transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
            " (Score: 416; E-value: 2.3e-126)")

def check_diamond_db_compatible(database_file: str) -> bool:
    """ Check if the given diamond database is compatible with the installed
        diamond version.

        Arguments:
            database_file: the path to the database file to check

        Returns:
            True if the database file is compatible, False otherwise
    """
    with TemporaryDirectory(change=True):
        dummy_fasta = "dummy.fa"
        dummy_db = "dummy.dmnd"
        with open(dummy_fasta, "w") as handle:
            handle.write(">test\nM\n")
        run_diamond_makedb(dummy_db, dummy_fasta)
        compatible_format = _extract_db_format(dummy_db)

    try:
        db_format = _extract_db_format(database_file)
    except ValueError:
        return False

    if db_format != compatible_format:
        logging.debug("Incompatible database format for %s. Expected %s but found %s.",
                      database_file, compatible_format, db_format)
        return False
    return True

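# Hedged usage sketch, not from the original source: guard an expensive search
# with the compatibility check above, rebuilding the database when the on-disk
# format doesn't match the installed diamond version. run_diamond_makedb is
# the same helper used inside check_diamond_db_compatible; ensure_database_usable
# and its paths are illustrative names only.
import logging

def ensure_database_usable(database_file: str, source_fasta: str) -> None:
    if not check_diamond_db_compatible(database_file):
        logging.info("Rebuilding incompatible diamond database %s", database_file)
        run_diamond_makedb(database_file, source_fasta)
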
def run_diamond(subcommand: str, opts: Optional[List[str]] = None) -> RunResult:
    """ Run a diamond subcommand, possibly with further options.

        Arguments:
            subcommand: the diamond subcommand to run
            opts: a list of additional argument strings to pass to diamond

        Returns:
            RunResult of running diamond
    """
    config = get_config()
    with TemporaryDirectory() as temp_dir:
        params = [
            config.cb_diamond_executable,
            subcommand,
            "--threads", str(config.cpus),
            "--tmpdir", temp_dir,
        ]
        if opts:
            params.extend(opts)
        result = execute(params)
        if not result.successful():
            raise RuntimeError("diamond failed to run: %s -> %s" % (
                subcommand, result.stderr[-100:]))
    return result
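# Hedged usage sketch for the subcommand-style run_diamond() above, not from
# the original source: building a database, roughly what a run_diamond_makedb
# helper might look like. "--db" and "--in" are standard diamond makedb flags;
# the wrapper itself supplies "--threads" and "--tmpdir".
def example_run_diamond_makedb(database: str, fasta_file: str) -> "RunResult":
    return run_diamond("makedb", opts=["--db", database, "--in", fasta_file])
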
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])
        err = execute(prodigal).stderr
        if err.find('Error') > -1:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        for line in open(result_file, 'r'):
            # skip header lines, genes start with '>'
            if not line.startswith('>'):
                continue
            name, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
            try:
                start = int(start_chunk)
                end = int(end_chunk)
                if prodigal_strand == "+":
                    strand = 1
                else:
                    strand = -1
            except ValueError:
                logging.error('Malformatted prodigal output line %r', line.rstrip())
                continue
            if start > end:
                strand = -1
                start, end = end, start
            loc = FeatureLocation(start - 1, end, strand=strand)
            translation = record.get_aa_translation_from_location(loc)
            feature = CDSFeature(loc,
                                 locus_tag='ctg%s_%s' % (record.record_index, name),
                                 translation=translation,
                                 translation_table=record.transl_table)
            record.add_cds_feature(feature)
            found += 1
        logging.debug("prodigal found %d CDS features", found)

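# Worked example of the parsing step above, not from the original source:
# prodigal's simple coordinate ("sco") output contains comment lines starting
# with '#' and gene lines like the sample below, which the loop splits on '_'
# into id, start, end and strand.
sample_line = ">1_337_2799_+\n"
gene_id, start_chunk, end_chunk, strand_chunk = sample_line[1:].rstrip().split("_")
assert (gene_id, start_chunk, end_chunk, strand_chunk) == ("1", "337", "2799", "+")
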
def run_diamond(query_file: str, database_file: str, mode: str = "blastp",
                opts: Optional[List[str]] = None) -> str:
    """ Runs diamond, comparing the given query to the given database

        Arguments:
            query_file: the path of query sequence file
            database_file: the path of the database to compare to
            mode: the mode to use (defaults to blastp)
            opts: any extra options to pass to diamond

        Returns:
            the output from running diamond
    """
    with TemporaryDirectory() as temp_dir:
        command = [
            "diamond",
            mode,
            "--db", database_file,
            "--threads", str(get_config().cpus),
            "--query", query_file,
            "--tmpdir", temp_dir,
        ]
        if opts:
            command.extend(opts)
        result = execute(command)
        if not result.successful():
            raise RuntimeError("diamond failed to run: %s -> %s" % (
                command, result.stderr[-100:]))
    return result.stdout
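# Hedged usage sketch for the query/database form of run_diamond() above, not
# from the original source: a blastp comparison with tabular output.
# "--outfmt 6" and "--evalue" are standard diamond options; the paths and the
# function name are illustrative.
def example_blastp(query: str, database: str) -> str:
    return run_diamond(query, database, mode="blastp",
                       opts=["--outfmt", "6", "--evalue", "1e-05"])
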
def test_trees(self):
    with TemporaryDirectory(change=True):
        # add the classifications to work with
        genefunctions.smcogs.classify(self.record.id,
                                      self.record.get_cds_features(),
                                      self.options).add_to_record(self.record)
        results = smcog_trees.run_on_record(self.record, None, self.options)
        assert len(results.tree_images) == 7
        for image in results.tree_images.values():
            assert os.path.exists(os.path.join(results.relative_tree_path, image))

        # test the results function properly
        json = results.to_json()
        assert smcog_trees.SMCOGTreeResults.from_json(json, self.record).to_json() == json
        regenerated = smcog_trees.regenerate_previous_results(json, self.record, self.options)
        assert isinstance(regenerated, smcog_trees.SMCOGTreeResults), json
        assert regenerated.to_json() == json

        results.add_to_record(self.record)
        for cds in self.record.get_cds_features():
            if cds.gene_functions.get_by_tool("rule-based-clusters"):
                continue  # no sense checking, because we don't do anything with it
            if not cds.gene_functions.get_by_tool("smcogs"):
                continue
            assert cds.get_name() in results.tree_images
            assert len(cds.notes) == 1
            assert cds.gene_function != secmet.qualifiers.GeneFunction.OTHER

def perform_subclusterblast(options: ConfigType, record: Record,
                            clusters: Dict[str, ReferenceCluster],
                            proteins: Dict[str, Protein]) -> GeneralResults:
    """ Run BLAST on gene cluster proteins of each cluster, parse output and
        return result rankings for each cluster

        Arguments:
            options: antismash Config
            record: the Record to analyse
            clusters: a dictionary mapping reference cluster name to ReferenceCluster
            proteins: a dictionary mapping reference protein name to Protein

        Returns:
            a GeneralResults instance storing results for all clusters in the record
    """
    results = GeneralResults(record.id, search_type="subclusterblast")
    with TemporaryDirectory(change=True):
        allcoregenes = get_core_gene_ids(record)
        for region in record.get_regions():
            # prepare and run diamond
            write_fastas_with_all_genes([region], "input.fasta",
                                        partitions=options.cpus)
            run_clusterblast_processes(options)
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.output_dir, blastoutput,
                                         prefix="subclusterblast")
            # parse and score diamond results
            _, cluster_names_to_queries = blastparse(blastoutput, record,
                                                     min_seq_coverage=40,
                                                     min_perc_identity=45)
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)
            logging.debug("Cluster at %s has %d subclusterblast results",
                          region.location, len(ranking))
            # store results
            region_result = RegionResult(region, ranking, proteins, "subclusterblast")
            results.add_region_result(region_result, clusters, proteins)
    return results

def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record, options):
    calnames = []
    calseqs = []
    # Predict PKS CAL domain specificities with Minowa et al. method
    logging.info("Predicting CAL domain substrate specificities by Minowa et al. method")
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        domaindetails = domaindict[locus]
        nr = 0
        for tab in domaindetails:
            if tab[0] == "CAL_domain":
                nr += 1
                start = int(tab[1])
                end = int(tab[2])
                seq = str(utils.get_aa_sequence(feature))[start:end]
                name = locus + "_CAL" + str(nr)
                calnames.append(name)
                calseqs.append(seq)
    if len(calnames) > 0:
        utils.writefasta(calnames, calseqs,
                         options.raw_predictions_outputfolder + os.sep
                         + "ctg" + str(options.record_idx) + "_calseqs.fasta")
        with TemporaryDirectory(change=True):
            minowa_CAL.run_minowa_cal(
                options.raw_predictions_outputfolder + os.sep
                + "ctg" + str(options.record_idx) + "_calseqs.fasta",
                options.raw_predictions_outputfolder + os.sep
                + "ctg" + str(options.record_idx) + "_minowa_calpredoutput.txt")
    return calnames, calseqs

def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict, seq_record, options):
    # Predict PKS KR domain stereochemistry using pattern as published in ClustScan
    krnames = []
    krseqs = []
    logging.info("Predicting PKS KR activity and stereochemistry using KR "
                 "fingerprints from Starcevic et al.")
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        domaindetails = domaindict[locus]
        nr = 0
        for tab in domaindetails:
            if tab[0] == "PKS_KR":
                nr += 1
                start = int(tab[1])
                end = int(tab[2])
                seq = str(utils.get_aa_sequence(feature))[start:end]
                name = locus + "_KR" + str(nr)
                krnames.append(name)
                krseqs.append(seq)
    if len(krnames) > 0:
        utils.writefasta(krnames, krseqs,
                         options.raw_predictions_outputfolder + os.sep
                         + "ctg" + str(options.record_idx) + "_krseqs.fasta")
        with TemporaryDirectory(change=True):
            kr_analysis.run_kr_analysis(
                options.raw_predictions_outputfolder + os.sep
                + "ctg" + str(options.record_idx) + "_krseqs.fasta",
                options.raw_predictions_outputfolder + os.sep
                + "ctg" + str(options.record_idx) + "_krpredoutput.txt")
    return krnames, krseqs

def test_trees(self):
    with TemporaryDirectory(change=True):
        results = smcogs.run_on_record(self.record, None, self.options)
        assert len(results.tree_images) == 7
        for image in results.tree_images.values():
            assert os.path.exists(os.path.join(results.relative_tree_path, image))

        # test the results function properly
        json = results.to_json()
        assert smcogs.SMCOGResults.from_json(json, self.record).to_json() == json
        assert smcogs.regenerate_previous_results(json, self.record,
                                                  self.options).to_json() == json

        for cds in self.record.get_cluster(0).cds_children:
            hit = results.best_hits.get(cds.get_name())
            if hit:
                assert not cds.notes
            assert cds.gene_function in [secmet.feature.GeneFunction.OTHER,
                                         secmet.feature.GeneFunction.CORE]

        results.add_to_record(self.record)
        for cds in self.record.get_cluster(0).cds_children:
            if cds.sec_met:
                continue  # no sense checking, because we don't do anything with it
            hit = results.best_hits.get(cds.get_name())
            if not hit:
                assert cds.gene_function == secmet.feature.GeneFunction.OTHER
                continue
            assert cds.get_name() in results.tree_images
            assert len(cds.notes) == 1
            assert cds.gene_function != secmet.feature.GeneFunction.OTHER

def internal_homology_blast(record: secmet.Record) -> Dict[int, List[List[str]]]:
    """ Run BLAST on gene cluster proteins of each cluster on itself to find
        internal homologs, and store groups of homologs - including singles -
        in a dictionary as a list of lists accordingly

        Arguments:
            record: the Record to generate groups from

        Returns:
            a dictionary mapping cluster_number to
                a list containing distinct groups represented by lists of query ids
    """
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster...")
        internalhomologygroups = {}
        for cluster in record.get_clusters():
            cluster_number = cluster.get_cluster_number()
            iquerycluster_names, iqueryclusterseqs = create_blast_inputs(cluster)
            query_filename = "internal_input.fasta"
            fasta.write_fasta(iquerycluster_names, iqueryclusterseqs, query_filename)
            blastoutput = run_internal_blastsearch(query_filename)
            queries, _ = blastparse(blastoutput, record, min_seq_coverage=25,
                                    min_perc_identity=30)
            groups = find_internal_orthologous_groups(queries, iquerycluster_names)
            internalhomologygroups[cluster_number] = groups
    return internalhomologygroups

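# Illustrative sketch, not from the original source, of the mapping returned
# by internal_homology_blast(): cluster numbers map to homolog groups, where
# each group is a list of query ids and singles appear as one-element lists.
# The query id strings here are invented for illustration.
example_internal_homology_groups = {
    1: [["ctg1_gene1", "ctg1_gene4"],  # two internal homologs grouped together
        ["ctg1_gene2"]],               # a single gene with no internal homolog
}
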
def test_record_to_json_and_back(self):
    filename = get_path_to_nisin_genbank()
    records = list(seqio.parse(open(filename), "genbank"))
    records = [Record.from_biopython(rec, taxon="bacteria") for rec in records]
    rec_results = [{}, {}, {}]
    results = serialiser.AntismashResults(filename, records, rec_results, "dummy")
    json_handle = StringIO()
    results.write_to_file(json_handle)
    json_handle.seek(0)
    new_results = serialiser.AntismashResults.from_file(json_handle, taxon="bacteria")
    assert results.to_json() == new_results.to_json()
    # check no records were lost
    assert len(new_results.records) == len(results.records)
    # check that the contents of the records are the same
    # by converting to biopython and writing to genbanks
    original = self.create_data_stream(results.records)
    new = self.create_data_stream(new_results.records)
    oldvalue = original.getvalue()
    newvalue = new.getvalue()
    with TemporaryDirectory(change=True):
        open("old.json", "w").write(oldvalue)
        open("new.json", "w").write(newvalue)
        for oldline, newline in zip(oldvalue.split('\n'), newvalue.split('\n')):
            assert oldline == newline

def generate_trees(smcogs_dir: str, genes_within_clusters: List[CDSFeature],
                   nrpspks_genes: List[CDSFeature]) -> Dict[str, str]:
    """ smCOG phylogenetic tree construction """
    pks_nrps_cds_names = set(feature.get_name() for feature in nrpspks_genes)
    logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                 "with smCOG members")
    cds_features = []
    for cds in genes_within_clusters:
        cds_name = cds.get_name()
        if cds_name in pks_nrps_cds_names:
            continue
        if not cds.gene_functions.get_by_tool("smcogs"):
            continue
        cds_features.append(cds)

    with TemporaryDirectory(change=True):
        args = []
        for index, cds in enumerate(cds_features):
            smcog = cds.gene_functions.get_by_tool("smcogs")[0].description.split(":")[0]
            args.append([cds, index, smcog, smcogs_dir])
        subprocessing.parallel_function(smcog_tree_analysis, args)

        files = glob.glob("*.png")
        tree_filenames = {}
        for filename in files:
            tag = filename.rsplit(".png", 1)[0]
            tree_filenames[tag] = filename
    return tree_filenames

def test_minimal(self):
    with TemporaryDirectory(change=True) as tempdir:
        self.options = build_config(["--minimal", "--output-dir", tempdir],
                                    isolated=True,
                                    modules=antismash.get_all_modules())
        with patch.object(nrps_pks, "run_on_record",
                          side_effect=RuntimeError("shouldn't run")):
            antismash.main.run_antismash(helpers.get_path_to_balhymicin_genbank(),
                                         self.options)

def test_depict_ectoine(self):
    with TemporaryDirectory(change=True) as temp:
        assert structure_drawer.generate_image(0, "CC1=NCCC(N1)C(=O)O", temp)
        assert os.path.exists("genecluster0.smi")
        assert os.path.exists("genecluster0.png")
        assert os.path.exists("genecluster0_icon.png")

def generate_trees(smcogs_dir: str, hmm_results: Dict[str, List[HSP]],
                   genes_within_clusters: List[CDSFeature],
                   nrpspks_genes: List[CDSFeature]) -> Dict[str, str]:
    """ smCOG phylogenetic tree construction """
    pks_nrps_gene_names = set(feature.get_name() for feature in nrpspks_genes)
    logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                 "with smCOG members")
    with TemporaryDirectory(change=True):
        cds_features = []
        for cds in genes_within_clusters:
            gene_id = cds.get_name()
            if gene_id not in pks_nrps_gene_names and hmm_results.get(gene_id):
                cds_features.append(cds)
        args = []
        for index, cds in enumerate(cds_features):
            smcog = hmm_results[cds.get_name()][0].hit_id.split(":")[0]
            args.append([cds, index, smcog, smcogs_dir])
        subprocessing.parallel_function(smcog_tree_analysis, args)

        files = glob.glob("*.png")
        tree_filenames = {}
        for filename in files:
            tag = filename.rsplit(".png", 1)[0]
            tree_filenames[tag] = filename
    return tree_filenames

def perform_docking_domain_analysis(options, clusterpksgenes, genecluster,
                                    seq_record, pksnrpsvars):
    feature_by_id = utils.get_feature_dict(seq_record)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis", stdout=True)
    startergene, endinggene = find_first_and_last_genes(clusterpksgenes,
                                                        pksnrpsvars.domainnamesdict)
    with TemporaryDirectory(change=True):
        dockinganalysis_dir = utils.get_full_path(__file__, "docking_analysis")
        ntermintresdict = extract_nterminus(dockinganalysis_dir, clusterpksgenes,
                                            seq_record, startergene, feature_by_id)
        ctermintresdict = extract_cterminus(dockinganalysis_dir, clusterpksgenes,
                                            seq_record, endinggene, feature_by_id)
        possible_orders = find_possible_orders(clusterpksgenes, startergene,
                                               endinggene)
        geneorders, possible_orders_scoredict = rank_biosynthetic_orders(
            ntermintresdict, ctermintresdict, startergene, endinggene,
            possible_orders)
        write_gene_orders_to_html(options, geneorders, possible_orders_scoredict,
                                  genecluster, startergene, endinggene)
    #log("Predicting PKS gene order by docking domain sequence " \
    #    "analysis succeeded.", stdout=True)
    # Write html outfile with docking domain analysis output
    pksnrpsvars.dockingdomainanalysis.append(genecluster)
    return geneorders[0]

def test__exit(self):
    "Test TemporaryDirectory __exit__() method"
    tdir = TemporaryDirectory()
    trace = """Called tempfile.mkdtemp('', 'tmp', None)
Called shutil.rmtree('/fake/tmp/dir')"""
    tdir.__exit__(None, None, None)
    assert_same_trace(self.tt, trace)

def test_minimal(self):
    with TemporaryDirectory(change=True) as tempdir:
        self.options = build_config(["--minimal", "--output-dir", tempdir],
                                    isolated=True,
                                    modules=antismash.get_all_modules())
        antismash.main.run_antismash(helpers.get_path_to_balhymicin_genbank(),
                                     self.options)
        # make sure it didn't run
        minimock.assert_same_trace(self.tracker, "")

def test_bad_partitions(self):
    with TemporaryDirectory(change=True):
        for i in [-10, -1, 0]:
            with self.assertRaisesRegex(ValueError, "Partitions must be greater than 0"):
                core.write_fastas_with_all_genes(self.clusters, "test", partitions=i)
        for i in ["str", None, 1.5]:
            with self.assertRaisesRegex(TypeError, "Partitions must be an int greater than 0"):
                core.write_fastas_with_all_genes(self.clusters, "test", partitions=i)

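# A minimal validation sketch consistent with test_bad_partitions above, not
# from the original source: the real write_fastas_with_all_genes does far
# more, but the test pins down these error types and messages. The helper
# name _validate_partitions is hypothetical.
def _validate_partitions(partitions) -> None:
    if not isinstance(partitions, int):
        raise TypeError("Partitions must be an int greater than 0")
    if partitions < 1:
        raise ValueError("Partitions must be greater than 0")
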
def test_single_partition(self):
    self.dummy_cluster.cds_children = [DummyCDS(1, 3)] * 3
    with TemporaryDirectory(change=True):
        files = core.write_fastas_with_all_genes(self.clusters, "test.fasta",
                                                 partitions=1)
        assert files == ["test.fasta"]
        assert os.path.exists("test.fasta")
        expected = "".join(">L{0}\nS{0}\n".format(i)
                           for i in range(len(self.clusters) * 3))
        assert open("test.fasta").read() == expected

def run_and_regenerate_results_for_module(input_file, module, options,
                                          expected_record_count=1, callback=None):
    """ Runs antismash end to end over the given file with the given options
        and returns the given module's regenerated results

        if callback is supplied, it will be called with the output directory path
        as an argument before the output directory is cleared
    """
    with TemporaryDirectory(change=True) as tempdir:
        orig_output = options.output_dir
        update_config({"output_dir": tempdir})
        json_filename = os.path.join(options.output_dir,
                                     os.path.basename(input_file).rsplit('.', 1)[0] + ".json")
        assert not os.path.exists(json_filename)
        try:
            antismash.main.run_antismash(input_file, options)
        except:
            update_config({"output_dir": orig_output})
            raise
        update_config({"output_dir": orig_output})
        results = serialiser.AntismashResults.from_file(json_filename, options.taxon)
        # remove things that were added by results, because otherwise the add
        # isn't tested by detection result regeneration
        # this should eventually include every feature and qualifier created by antismash
        for record in results.records:
            record.clear_antismash_domains()
            record.clear_cds_motifs()
        if callback:
            callback(tempdir)

    # not the responsibility of modules, but if it's wrong then everything is
    assert len(results.results) == expected_record_count
    assert len(results.records) == expected_record_count

    # ensure all detection stages add their relevant parts
    modules_to_regenerate = antismash.main.get_detection_modules()
    # don't try and regenerate twice
    if module not in modules_to_regenerate:
        modules_to_regenerate.append(module)

    if expected_record_count == 1:
        regenerated = antismash.main.regenerate_results_for_record(
            results.records[0], options, modules_to_regenerate, results.results[0])
        final = regenerated[module.__name__]
        assert isinstance(final, module_results.ModuleResults)
    else:
        regenerated = [antismash.main.regenerate_results_for_record(record, options,
                                                                    [module], res)
                       for record, res in zip(results.records, results.results)]
        final = [result[module.__name__] for result in regenerated]
        for res in final:
            assert isinstance(res, module_results.ModuleResults)
    return final

def test__enter(self):
    "Test TemporaryDirectory __enter__() method"
    expected = "/fake/tmp/dir"
    trace = """Called tempfile.mkdtemp('', 'tmp', None)"""
    tdir = TemporaryDirectory()
    d = tdir.__enter__()
    self.assertEqual(d, expected)
    self.assertEqual(self.cwd, '/old/cur/dir')
    assert_same_trace(self.tt, trace)

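# A minimal sketch, not from the original source, of a TemporaryDirectory
# helper consistent with the two tests above: mkdtemp is called in __init__,
# rmtree on __exit__, and change=True optionally switches the working
# directory for the duration. The real implementation may differ in details,
# so the class is named TemporaryDirectorySketch to avoid any confusion.
import os
import shutil
import tempfile

class TemporaryDirectorySketch:
    def __init__(self, suffix="", prefix="tmp", dir=None, change=False):
        # created eagerly, matching the mkdtemp call traced in test__exit
        self.tempdir = tempfile.mkdtemp(suffix, prefix, dir)
        self.change = change
        self.old_wd = None

    def __enter__(self):
        if self.change:
            self.old_wd = os.getcwd()
            os.chdir(self.tempdir)
        return self.tempdir

    def __exit__(self, exc_type, exc_value, traceback):
        if self.old_wd is not None:
            os.chdir(self.old_wd)
        shutil.rmtree(self.tempdir)
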
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    # Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            debug_path = (options.dbgclusterblast + os.sep + "subclusterblast"
                          + os.sep + "cluster" + str(clusternumber) + ".txt")
            if options.debug and os.path.exists(debug_path):
                logging.debug("Skipping SubClusterblast calculations, "
                              "using results from %s instead", debug_path)
            else:
                logging.info(" Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = \
                    create_blast_inputs(genecluster, seq_record)
                write_clusterblast_inputfiles(options, queryclusternames,
                                              queryclusterseqs)
                run_clusterblast_processes(options, searchtype="subclusters")
                blastoutput = read_clusterblast_output(options)
                write_raw_clusterblastoutput(options.full_outputfolder_path,
                                             blastoutput, searchtype="subclusters")
                logging.info(" Blast search finished. Parsing results...")
                minseqcoverage = 40
                minpercidentity = 45
                blastdict, querylist, hitclusters = parse_blast(
                    blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [utils.get_gene_acc(cds)
                                for cds in utils.get_secmet_cds_features(seq_record)]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = \
                    score_clusterblast_output(blastdict, querylist, hitclusters,
                                              clusters, allcoregenes)

                # store all clusterblast related data in a utils.Storage object
                # and serialize it
                subclusterblastStorage = utils.Storage()
                subclusterblastStorage.clusternumber = clusternumber
                subclusterblastStorage.queryclusterprots = queryclusterprots
                subclusterblastStorage.clusters = clusters
                subclusterblastStorage.hitclusterdata = hitclusterdata
                subclusterblastStorage.rankedclusters = rankedclusters
                subclusterblastStorage.rankedclustervalues = rankedclustervalues
                subclusterblastStorage.proteintags = proteintags
                subclusterblastStorage.proteinlocations = proteinlocations
                subclusterblastStorage.proteinannotations = proteinannotations
                subclusterblastStorage.proteinstrands = proteinstrands
                write_clusterblast_output(options, seq_record, subclusterblastStorage,
                                          searchtype="subclusters")

def perform_knownclusterblast(options: ConfigType, record: Record,
                              reference_clusters: Dict[str, ReferenceCluster],
                              proteins: Dict[str, Protein]) -> GeneralResults:
    """ Run BLAST on gene cluster proteins of each cluster, parse output and
        return result rankings for each cluster

        Only compares clusters to known clusters from the MIBiG database

        Arguments:
            options: antismash Config
            record: the Record to analyse
            reference_clusters: a dictionary mapping reference cluster name to ReferenceCluster
            proteins: a dictionary mapping reference protein name to Protein

        Returns:
            a GeneralResults instance storing results for all clusters in the record
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    results = GeneralResults(record.id, search_type="knownclusterblast")
    with TemporaryDirectory(change=True) as tempdir:
        write_fastas_with_all_genes(record.get_clusters(), "input.fasta")
        run_diamond("input.fasta", _get_datafile_path('knownclusterprots'),
                    tempdir, options)
        with open("input.out", 'r') as handle:
            blastoutput = handle.read()
        write_raw_clusterblastoutput(options.output_dir, blastoutput,
                                     prefix="knownclusterblast")
        clusters_by_number, _ = parse_all_clusters(blastoutput, record,
                                                   min_seq_coverage=40,
                                                   min_perc_identity=45)
        core_gene_accessions = get_core_gene_ids(record)
        for cluster in record.get_clusters():
            cluster_number = cluster.get_cluster_number()
            cluster_names_to_queries = clusters_by_number.get(cluster_number, {})
            ranking = score_clusterblast_output(reference_clusters,
                                                core_gene_accessions,
                                                cluster_names_to_queries)
            # store results
            cluster_result = ClusterResult(cluster, ranking, proteins,
                                           "knownclusterblast")
            results.add_cluster_result(cluster_result, reference_clusters, proteins)
            write_clusterblast_output(options, record, cluster_result, proteins,
                                      searchtype="knownclusterblast")
    results.mibig_entries = mibig_protein_homology(blastoutput, record,
                                                   reference_clusters)
    return results

def test_classifier(self):
    expected = open(path.get_full_path(__file__, "data", "nisin.txt")).readlines()
    with TemporaryDirectory(change=True):
        results = smcogs.run_on_record(self.record, None, self.options)
        contents = open("smcogs/smcogs.txt").readlines()
        assert contents == expected
        json = results.to_json()
        assert smcogs.SMCOGResults.from_json(json, self.record).to_json() == json

def test_single_file(self):
    self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
    with TemporaryDirectory(change=True):
        files = core.write_fastas_with_all_genes(self.regions, "test.fasta")
        assert files == ["test.fasta"]
        assert os.path.exists("test.fasta")
        expected = "".join(">L{0}\nS{0}\n".format(i)
                           for i in range(len(self.regions) * 3))
        assert open("test.fasta").read() == expected

def run_antismash(self, filename, expected):
    with TemporaryDirectory() as output_dir:
        update_config({"output_dir": output_dir})
        results = helpers.run_and_regenerate_results_for_module(filename, clusterblast,
                                                                self.options)
        update_config({"output_dir": ""})
        results, global_results = self.get_results(results)
        assert len(results.region_results) == 1
        cluster = results.region_results[0]
        assert len(cluster.ranking) == expected  # will change if database does
        self.check_svgs(global_results, expected, output_dir)
    return results