def setUp(self):
    """Create the config, HMM hits, CDS features, rules and record used by the tests."""
    self.config = Namespace()
    config.set_config(self.config)

    # Each gene gets its own fresh pair of hits, one per model. The second
    # positional argument is "GENE_1" for every gene, as in the fixture data.
    self.results_by_id = {}
    for gene_id in ("GENE_1", "GENE_2", "GENE_3", "GENE_4", "GENE_5"):
        self.results_by_id[gene_id] = [
            FakeHSP(model, "GENE_1", 0, 10, 50, 0)
            for model in ("modelA", "modelB")
        ]

    # (gene id, start, end) of every CDS; GENE_X is the only one without hits.
    cds_spans = [
        ("GENE_1", 0, 30),
        ("GENE_2", 30, 50),
        ("GENE_3", 70, 90),
        ("GENE_X", 95, 100),
        ("GENE_4", 120, 130),
        ("GENE_5", 130, 150),
    ]
    self.feature_by_id = {}
    for gene_id, start, end in cds_spans:
        self.feature_by_id[gene_id] = FakeFeature(
            "CDS", FeatureLocation(start, end), {"locus_tag": [gene_id]})

    self.rulesdict = {
        "MetaboliteA": ("modelA", 10, 5),
        "MetaboliteB": ("(modelA & modelB)", 10, 5),
        "MetaboliteC": ("cluster(modelA,modelB)", 10, 5),
        "MetaboliteD": ("minimum(3,[modelA,modelB], [modelA])", 20, 5),
        "Metabolite0": ("modelC", 1, 3),
        "Metabolite1": ("modelC", 1, 3),
    }

    # Same order as iterating the dict's keys and looking each one up.
    self.features = list(self.feature_by_id.values())
    self.record = FakeRecord(self.features)
def setUp(self):
    """Point the config at test_gff.gff and create two 2000-base all-"A" contigs."""
    self.config = Namespace()
    config.set_config(self.config)
    self.config.gff3 = utils.get_full_path(__file__, "test_gff.gff")
    self.config.single_entries = False

    self.sequences = []
    for contig_id in ("CONTIG_1", "CONTIG_2"):
        contig = FakeRecord(seq="A" * 2000)
        contig.id = contig_id
        self.sequences.append(contig)
def setUp(self):
    """Install a two-CPU config and mock subprocess.Popen with a traced fake process."""
    self.config = Namespace()
    self.config.cpus = 2
    config.set_config(self.config)

    # All mocks report to the same tracker so tests can inspect the calls.
    self.tt = TraceTracker()
    fake_process = Mock('proc', tracker=self.tt, returncode=0)
    fake_process.communicate = Mock(
        'proc.communicate', returns=('output', 'error'), tracker=self.tt)
    mock('subprocess.Popen', tracker=self.tt, returns=fake_process)
def setUp(self):
    """Install a two-CPU config, mock subprocess.Popen and create a temp directory."""
    self.config = Namespace()
    self.config.cpus = 2
    config.set_config(self.config)

    # All mocks report to the same tracker so tests can inspect the calls.
    self.tt = TraceTracker()
    fake_process = Mock('proc', tracker=self.tt, returncode=0)
    fake_process.communicate = Mock(
        'proc.communicate', returns=('output', 'error'), tracker=self.tt)
    mock('subprocess.Popen', tracker=self.tt, returns=fake_process)

    # Scratch directory for the tests; nothing in this method removes it.
    self.tmpdir = tempfile.mkdtemp(prefix="as_tests_util")
def test_set_config(self):
    """Test config.set_config()

    The module-level ``config._config`` must be None beforehand and hold the
    namespace that was passed in afterwards.
    """
    self.assertIsNone(config._config)
    namespace = Namespace(testing=True)
    config.set_config(namespace)
    self.assertEqual(namespace, config._config)
def tearDown(self):
    """Clear the configuration by handing None to set_config()."""
    set_config(None)
def setUp(self):
    """Install a configuration whose only setting is ``cpus = 1``."""
    from argparse import Namespace

    set_config(Namespace(cpus=1))
def main():
    """Run a mock antiSMASH detection over a list of genomes and dump statistics.

    ``sys.argv[1]`` names a text file holding one genome file path per line.
    Every genome is parsed, cluster detection is run on each of its records
    using the precomputed hits in ``options.hmm_results``, and the per-record
    statistics are written to ``result.js`` in the current directory as
    ``var result = <json>;``.

    Raises:
        IndexError: if no list file was given on the command line.
    """
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files
    files = _read_genome_paths(sys.argv[1])

    # mockup antismash run per file
    for index, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        # Single parenthesised argument: prints identically on Python 2 and 3.
        print("Processing %s... (%d/%d)" % (fpath, index, len(files)))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(options)  # TODO: get antiSMASH logging to work

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        # Drop contigs of 1000 bp or less before iterating. The original
        # rebound ``seq_records`` inside the loop, which never changed which
        # records were visited: short contigs were processed whenever at least
        # one long contig existed.
        if options.input_type == 'nucl':
            seq_records = [
                record for record in seq_records if len(record.seq) > 1000
            ]

        for seq_record in seq_records:
            stats = _process_record(seq_record, options, plugins)
            res_object[fpath][seq_record.id] = stats

    with open('result.js', 'w') as handle:
        handle.write('var result = %s;' % json.dumps(res_object, indent=4))


def _read_genome_paths(list_path):
    """Return the genome paths listed one per line in *list_path*.

    Newlines are removed, ``~`` is expanded and empty lines are skipped so a
    trailing blank line does not turn into an empty path.
    """
    with open(list_path, 'r') as handle:
        names = [line.replace("\n", "") for line in handle]
    return [path.expanduser(name) for name in names if name]


def _process_record(seq_record, options, plugins):
    """Run detection and analysis on one record and return its statistics dict."""
    utils.sort_features(seq_record)
    run_antismash.strip_record(seq_record)
    utils.fix_record_name_id(seq_record, options)

    # fetch results_by_id
    feature_by_id = utils.get_feature_dict(seq_record)
    results, results_by_id = _collect_hmm_results(seq_record, options)

    # temporarily remove short CDS without hits; they are re-added below
    short_cds_buffer = _detach_short_cds(seq_record, results_by_id)

    overlaps = utils.get_overlaps_table(seq_record)
    rulesdict = hmm_detection.create_rules_dict(options.enabled_cluster_types)

    # find total cdhit numbers in the chromosome
    total_cdhit = len(
        utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
    stats = {
        "total_clusters": 0,
        "total_genes": len(overlaps[0]),
        "total_cdhit": total_cdhit,
        "genes_with_hits": 0,
        "largest_cdhit": 0,
        "largest_domain_variations": 0,
        "per_hits": {},
        "cluster_types": {},
    }

    # filter overlap hits
    results, results_by_id = hmm_detection.filter_results(
        results, results_by_id, overlaps, feature_by_id)

    # count hits
    per_hits = stats["per_hits"]
    for hits in results_by_id.values():
        if hits:
            stats["genes_with_hits"] += 1
        for hsp in hits:
            domain_name = hsp.query_id.replace("plants/", "")
            per_hits[domain_name] = per_hits.get(domain_name, 0) + 1

    # do cluster finding algorithm
    typedict = hmm_detection.apply_cluster_rules(
        results_by_id, feature_by_id, options.enabled_cluster_types,
        rulesdict, overlaps)
    hmm_detection.fix_hybrid_clusters_typedict(typedict)
    nseqdict = hmm_detection.get_nseq()
    for cds, hits in results_by_id.items():
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            hmm_detection._update_sec_met_entry(feature, hits, typedict[cds],
                                                nseqdict)
    hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
    seq_record.features.extend(short_cds_buffer)

    clusters = utils.get_cluster_features(seq_record)
    stats["total_clusters"] += len(clusters)

    # do cluster specific and unspecific analysis
    if clusters:
        run_antismash.cluster_specific_analysis(plugins, seq_record, options)
    run_antismash.unspecific_analysis(seq_record, options)

    # Rearrange hybrid clusters name alphabetically
    hmm_detection.fix_hybrid_clusters(seq_record)

    _strip_subdir_prefixes(seq_record)
    _relabel_plant_types(seq_record)
    _summarise_clusters(seq_record, stats)
    return stats


def _collect_hmm_results(seq_record, options):
    """Look up the precomputed hits of every CDS in ``options.hmm_results``.

    Keys have the form "<record id>:<gene id>", with any ":" inside the record
    id replaced by "_". Returns ``(results, results_by_id)``: the flat list of
    hits and the hits grouped by gene id.
    """
    prefix = "%s:" % seq_record.id.replace(":", "_")
    results = []
    results_by_id = {}
    for feature in utils.get_cds_features(seq_record):
        gene_id = utils.get_gene_id(feature)
        key = prefix + gene_id
        if key in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[key]
            results.extend(results_by_id[gene_id])
    return results, results_by_id


def _detach_short_cds(seq_record, results_by_id, min_length_aa=100):
    """Remove hit-less CDS shorter than *min_length_aa* and return them.

    The feature list is partitioned first and replaced in one step. Removing
    entries while iterating the same list, as the original did, skips the
    element that follows every removal.
    """
    kept = []
    detached = []
    for feature in seq_record.features:
        if (feature.type == "CDS"
                and len(feature.qualifiers['translation'][0]) < min_length_aa
                and utils.get_gene_id(feature) not in results_by_id):
            detached.append(feature)
        else:
            kept.append(feature)
    # Slice assignment keeps the same list object on the record.
    seq_record.features[:] = kept
    return detached


def _basename_of_types(hybrid_name):
    """Drop everything up to the last "/" from each "-"-separated part."""
    return "-".join(part.split('/')[-1] for part in hybrid_name.split('-'))


def _strip_subdir_prefixes(seq_record):
    """Remove hmm_detection's subdir prefixes from cluster and CDS annotations."""
    for cluster in utils.get_cluster_features(seq_record):
        cluster.qualifiers['product'] = [
            _basename_of_types(prod) for prod in cluster.qualifiers['product']
        ]
    for cds in utils.get_cds_features(seq_record):
        if 'sec_met' not in cds.qualifiers:
            continue
        cleaned = []
        for row in cds.qualifiers['sec_met']:
            if row.startswith('Type: '):
                cleaned.append(
                    'Type: ' + _basename_of_types(row.split('Type: ')[-1]))
            elif row.startswith('Domains detected: '):
                domains = []
                for entry in row.split('Domains detected: ')[-1].split(';'):
                    pieces = entry.split(' (E-value')
                    domains.append(pieces[0].split('/')[-1] + ' (E-value' +
                                   pieces[-1])
                cleaned.append('Domains detected: ' + ";".join(domains))
            else:
                cleaned.append(row)
        cds.qualifiers['sec_met'] = cleaned


def _without_plant_label(hybrid_name):
    """Drop "plant" from a hybrid type name; a lone "plant" becomes "putative".

    Parts are de-duplicated through a set, so their order in the result is
    whatever the set yields.
    """
    names = list(set(hybrid_name.split('-')))
    if len(names) > 1 and "plant" in names:
        names.remove("plant")
    elif names == ["plant"]:
        names = ["putative"]
    return "-".join(names)


def _relabel_plant_types(seq_record):
    """Apply _without_plant_label to cluster products and CDS 'Type: ' rows."""
    for cluster in utils.get_cluster_features(seq_record):
        cluster.qualifiers['product'] = [
            _without_plant_label(prod)
            for prod in cluster.qualifiers['product']
        ]
    for cds in utils.get_cds_features(seq_record):
        if 'sec_met' not in cds.qualifiers:
            continue
        relabelled = []
        for row in cds.qualifiers['sec_met']:
            if row.startswith('Type: '):
                relabelled.append(
                    'Type: ' + _without_plant_label(row.split('Type: ')[-1]))
            else:
                relabelled.append(row)
        cds.qualifiers['sec_met'] = relabelled


def _summarise_clusters(seq_record, stats):
    """Fill the per-cluster counters of *stats* in place.

    Counts clusters per type and records the largest cdhit-table and domain
    counts. The "average_*" entries hold the median (``numpy.median``) of the
    per-cluster numbers, or 0 when the record has no clusters.
    """
    stats["average_cdhit"] = 0
    stats["average_domain_variations"] = 0
    cdhit_numbers = []
    domain_numbers = []
    cluster_types = stats["cluster_types"]
    for cluster in utils.get_cluster_features(seq_record):
        cluster_type = utils.get_cluster_type(cluster)
        cluster_types[cluster_type] = cluster_types.get(cluster_type, 0) + 1
        num_cdhit = len(utils.get_cluster_cdhit_table(cluster, seq_record))
        num_domain = len(utils.get_cluster_domains(cluster, seq_record))
        cdhit_numbers.append(num_cdhit)
        domain_numbers.append(num_domain)
        if num_cdhit > stats["largest_cdhit"]:
            stats["largest_cdhit"] = num_cdhit
        if num_domain > stats["largest_domain_variations"]:
            stats["largest_domain_variations"] = num_domain
    if cdhit_numbers:
        stats["average_cdhit"] = numpy.median(cdhit_numbers)
    if domain_numbers:
        stats["average_domain_variations"] = numpy.median(domain_numbers)
def main():
    """Retrieve antiSMASH entry from database

    Parses the command line, loads the configuration and plugins, fetches the
    requested records through ``get_records()``, attaches any extra data found
    by ``getExtradata()`` to ``options.extrarecord`` and finally writes and
    zips the results.

    Exits with status 0 after listing the plugins (``--list-plugins``) and with
    status 1 when retrieving the records raises an exception. A missing
    accession number is reported through ``parser.error()``.
    """
    # First load the output plugins so we can present appropriate options
    output_plugins = load_output_plugins()

    parser = argparse.ArgumentParser(
        description='Retrieve entry from database')
    parser.add_argument('seq_ids', metavar='seq_ids', nargs="*",
                        help="accession numbers of antiSMASH-DB entries")
    parser.add_argument('-d', '--debug', dest='debug', action='store_true',
                        default=False,
                        help="Print debugging information to stderr")
    parser.add_argument('--list-plugins', dest='list_plugins',
                        action='store_true', default=False,
                        help="List all available sec. met. detection modules")
    parser.add_argument('-v', '--verbose', dest='verbose',
                        action='store_true', default=False,
                        help="Print verbose status information to stderr")
    parser.add_argument('--logfile', dest='logfile',
                        default=argparse.SUPPRESS,
                        help="Also write logging output to a file")
    parser.add_argument('--statusfile', dest='statusfile',
                        default=argparse.SUPPRESS,
                        help="Write the current status to a file")

    group = parser.add_argument_group('Output options')
    for plugin in output_plugins:
        group.add_argument('--disable-%s' % plugin.name, dest=plugin.name,
                           action='store_false', default=argparse.SUPPRESS,
                           help="Disable %s" % plugin.short_description)

    group = parser.add_argument_group('Settings')
    group.add_argument('--outputfolder', dest='outputfoldername',
                       default=argparse.SUPPRESS,
                       help="Directory to write results to")
    group.add_argument('--dbnamespace', dest='dbnamespace',
                       help="Define BioSQL namespace to search")
    group.add_argument('--nclusters', dest='nclusters', default=10, type=int,
                       help="Number of clusters from ClusterBlast to display")
    group.add_argument('--seed', dest='seed', default=0, type=int,
                       help="Random number seed for ClusterBlast coloring")

    options = parser.parse_args()

    # Logging is useful for all the following code, so make sure that is set up
    # right after parsing the arguments.
    setup_logging(options)

    if options.nclusters > 50:
        logging.info("Number of clusters (%s) is too large. Reducing to 50.",
                     options.nclusters)
        options.nclusters = 50
    logging.debug("Number of clusters to show in clusterblast = %s",
                  options.nclusters)
    if options.seed != 0:
        random.seed(options.seed)

    # Load list of cluster types
    clustertypes = hmm_detection.get_supported_cluster_types()

    # Manually set some options that are required for working with the same
    # codebase as run_antismash.py
    options.input_type = "nucl"
    # Note: the clusterblast/subclusterblast options are automatically
    # activated if an aSstorage object with data is found
    options.clusterblast = None
    options.subclusterblast = None
    options.knownclusterblast = None
    options.smcogs = "TRUE"
    options.modeling = "none"
    options.enabled_cluster_types = ValidateClusterTypes(clustertypes)

    # Load configuration data from config file
    load_config(options)
    set_config(options)

    # Load and filter plugins
    utils.log_status("Loading detection plugins")
    plugins = load_detection_plugins()
    filter_plugins(plugins, options, clustertypes)
    filter_outputs(output_plugins, options)
    options.plugins = plugins

    if options.list_plugins:
        list_available_plugins(output_plugins)
        sys.exit(0)

    # NOTE(review): second call kept from the original; whether
    # filter_outputs() is idempotent cannot be told from here.
    filter_outputs(output_plugins, options)

    # Check prerequisites
    if not options.seq_ids:
        parser.error(
            "Please specify at least one antiSMASH-DB accession number")

    if 'outputfoldername' not in options:
        # This parser defines no ``sequences`` argument, so fall back to the
        # first accession number unless something else set the attribute.
        sequences = getattr(options, 'sequences', None)
        if sequences:
            options.outputfoldername = path.splitext(
                path.basename(sequences[0]))[0]
        else:
            options.outputfoldername = options.seq_ids[0]
    if not os.path.exists(options.outputfoldername):
        os.mkdir(options.outputfoldername)
    options.full_outputfolder_path = path.abspath(options.outputfoldername)

    known_namespaces = [
        options.BioSQLconfig.dbgenomenamespace,
        options.BioSQLconfig.dbclusternamespace,
    ]
    if options.dbnamespace not in known_namespaces:
        logging.warning(
            "DBnamespace %s not defined in default.cfg, switching to standard namespace %s.",
            options.dbnamespace, options.BioSQLconfig.dbgenomenamespace)
        options.dbnamespace = options.BioSQLconfig.dbgenomenamespace

    # Parse input sequence
    try:
        utils.log_status("retrieving record")
        seq_records = get_records(options)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that SystemExit and
        # KeyboardInterrupt are not reported as retrieval errors.
        logging.exception(
            "Uncaptured error when reading entries from antiSMASH-DB. This should not have happened :-("
        )
        sys.exit(1)

    # (extradata key, name used in the log message, option to set, value)
    extradata_switches = (
        ('ClusterBlastData', 'ClusterBlast', 'clusterblast', True),
        ('SubClusterBlastData', 'SubClusterBlast', 'subclusterblast', True),
        ('KnownClusterBlastData', 'KnownClusterBlast', 'knownclusterblast',
         True),
        ('MetabolicModel', 'Modeling', 'modeling', "db"),
    )

    options.extrarecord = {}
    for seq_record in seq_records:
        options.extrarecord[seq_record.id] = argparse.Namespace()
        logging.debug("DB retrieval: trying to find extra data for %s",
                      seq_record.id)
        extradata = getExtradata(options, seq_record.id)
        logging.debug("Keys of extradataHash: %s", ", ".join(extradata.keys()))
        options.extrarecord[seq_record.id].extradata = extradata

        # Stored data for an analysis switches the matching option on.
        for key, label, option_name, value in extradata_switches:
            if key in extradata:
                logging.debug("DB retrieval: Found extra data for %s", label)
                setattr(options, option_name, value)

    # Write results
    utils.log_status("Writing the output files")
    write_results(output_plugins, seq_records, options)
    zip_results(seq_records, options)