Example #1
 def setUp(self):
     self.config = Namespace()
     config.set_config(self.config)
     self.results_by_id = {
         "GENE_1": [
             FakeHSP("modelA", "GENE_1", 0, 10, 50, 0),
             FakeHSP("modelB", "GENE_1", 0, 10, 50, 0)
         ],
         "GENE_2": [
             FakeHSP("modelA", "GENE_2", 0, 10, 50, 0),
             FakeHSP("modelB", "GENE_2", 0, 10, 50, 0)
         ],
         "GENE_3": [
             FakeHSP("modelA", "GENE_3", 0, 10, 50, 0),
             FakeHSP("modelB", "GENE_3", 0, 10, 50, 0)
         ],
         "GENE_4": [
             FakeHSP("modelA", "GENE_4", 0, 10, 50, 0),
             FakeHSP("modelB", "GENE_4", 0, 10, 50, 0)
         ],
         "GENE_5": [
             FakeHSP("modelA", "GENE_5", 0, 10, 50, 0),
             FakeHSP("modelB", "GENE_5", 0, 10, 50, 0)
         ]
     }
     self.feature_by_id = {
         "GENE_1":
         FakeFeature("CDS", FeatureLocation(0, 30),
                     {"locus_tag": ["GENE_1"]}),
         "GENE_2":
         FakeFeature("CDS", FeatureLocation(30, 50),
                     {"locus_tag": ["GENE_2"]}),
         "GENE_3":
         FakeFeature("CDS", FeatureLocation(70, 90),
                     {"locus_tag": ["GENE_3"]}),
         "GENE_X":
         FakeFeature("CDS", FeatureLocation(95, 100),
                     {"locus_tag": ["GENE_X"]}),
         "GENE_4":
         FakeFeature("CDS", FeatureLocation(120, 130),
                     {"locus_tag": ["GENE_4"]}),
         "GENE_5":
         FakeFeature("CDS", FeatureLocation(130, 150),
                     {"locus_tag": ["GENE_5"]})
     }
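     # each value is assumed to be (detection rule expression, cluster cutoff,
     # cluster extension distance), matching what hmm_detection's rules dict
     # holds, with distances in the same units as the FeatureLocations above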
     self.rulesdict = {
         "MetaboliteA": ("modelA", 10, 5),
         "MetaboliteB": ("(modelA & modelB)", 10, 5),
         "MetaboliteC": ("cluster(modelA,modelB)", 10, 5),
         "MetaboliteD": ("minimum(3,[modelA,modelB], [modelA])", 20, 5),
         "Metabolite0": ("modelC", 1, 3),
         "Metabolite1": ("modelC", 1, 3)
     }
     self.features = list(self.feature_by_id.values())
     self.record = FakeRecord(self.features)
Example #2
 def setUp(self):
     self.config = Namespace()
     config.set_config(self.config)
     self.config.gff3 = utils.get_full_path(__file__, "test_gff.gff")
     self.config.single_entries = False
     contig1 = FakeRecord(seq="A" * 2000)
     contig1.id = "CONTIG_1"
     contig2 = FakeRecord(seq="A" * 2000)
     contig2.id = "CONTIG_2"
     self.sequences = [contig1, contig2]
Example #3
 def setUp(self):
     self.config = Namespace()
     self.config.cpus = 2
     config.set_config(self.config)
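     # minimock: record every call in a trace, and make subprocess.Popen
     # return a canned process whose communicate() yields ('output', 'error')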
     self.tt = TraceTracker()
     proc = Mock('proc', tracker=self.tt, returncode=0)
     proc.communicate = Mock('proc.communicate',
                             returns=('output', 'error'),
                             tracker=self.tt)
     mock('subprocess.Popen', tracker=self.tt, returns=proc)
Example #4
 def setUp(self):
     self.config = Namespace()
     self.config.cpus = 2
     config.set_config(self.config)
     self.tt = TraceTracker()
     proc = Mock('proc', tracker=self.tt, returncode=0)
     proc.communicate = Mock('proc.communicate',
                             returns=('output', 'error'),
                             tracker=self.tt)
     mock('subprocess.Popen', tracker=self.tt, returns=proc)
     self.tmpdir = tempfile.mkdtemp(prefix="as_tests_util")
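
Examples #3 and #4 stub out subprocess.Popen with the minimock library. A minimal companion sketch, assuming minimock's standard restore()/assert_same_trace() API (imports of minimock and shutil assumed); utils.execute() is a hypothetical code path that spawns a process via subprocess.Popen:

 def tearDown(self):
     minimock.restore()  # undo the subprocess.Popen patch
     shutil.rmtree(self.tmpdir, ignore_errors=True)

 def test_popen_is_traced(self):
     # any code that calls subprocess.Popen now receives the canned mock;
     # the expected trace is compared doctest-style, so '...' matches the
     # arguments actually passed
     utils.execute(["echo", "hello"])  # hypothetical helper using Popen
     minimock.assert_same_trace(self.tt,
                                "Called subprocess.Popen(...)\n"
                                "Called proc.communicate(...)")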
Example #5
 def test_set_config(self):
     "Test config.set_config()"
     c = Namespace(testing=True)
     self.assertIsNone(config._config)
     config.set_config(c)
     self.assertEqual(c, config._config)
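
For reference, the config module exercised by these tests behaves like a module-level singleton. A minimal sketch consistent with the calls above (set_config() and _config appear in the test; get_config() is an assumed accessor):

# minimal sketch of a module-level config singleton; only set_config and
# _config are confirmed by the test above, get_config is an assumption
_config = None

def set_config(namespace):
    """Store the given Namespace as the global configuration."""
    global _config
    _config = namespace

def get_config():
    """Return the stored global configuration, or None if unset."""
    return _config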
Example #6
 def tearDown(self):
     set_config(None)
Example #7
 def setUp(self):
     from argparse import Namespace
     conf = Namespace()
     conf.cpus = 1
     set_config(conf)
Example #8
def main():
    multiprocessing.freeze_support()
    res_object = {}
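    # res_object maps genome file path -> record id -> summary statistics;
    # it is rewritten to result.js after each genome is processed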

    # get genome files
    files = []
    with open(sys.argv[1], 'r') as handle:
        for line in handle:
            files.append(path.expanduser(line.rstrip("\n")))

    # mock up an antiSMASH run per file
    for i, fpath in enumerate(files, 1):
        res_object[fpath] = {}
        print "Processing %s... (%d/%d)" % (fpath, i, len(files))
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(options)  # TODO: get antiSMASH logging to work

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options,
                                     options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            # for nucleotide input, skip records too short to hold a cluster
            if options.input_type == 'nucl' and len(seq_record.seq) <= 1000:
                continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for feature in utils.get_cds_features(seq_record):
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
                    results.extend(results_by_id[gene_id])

            # temporarily remove short CDSs without hits so they do not
            # interfere with cluster detection; restored further below
            min_length_aa = 100
            short_cds_buffer = []
            for f in list(seq_record.features):  # copy, since we mutate the list
                if (f.type == "CDS"
                        and len(f.qualifiers['translation'][0]) < min_length_aa
                        and utils.get_gene_id(f) not in results_by_id):
                    short_cds_buffer.append(f)
                    seq_record.features.remove(f)

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(
                options.enabled_cluster_types)
            # find total cdhit numbers in the chromosome
            total_cdhit = len(
                utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            res_object[fpath][seq_record.id] = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            rec_stats = res_object[fpath][seq_record.id]
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    rec_stats["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    if domain_name not in rec_stats["per_hits"]:
                        rec_stats["per_hits"][domain_name] = 0
                    rec_stats["per_hits"][domain_name] += 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            res_object[fpath][seq_record.id]["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(
                    plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            # rearrange hybrid cluster names alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            # before writing output, strip hmm_detection's subdir prefixes from the cluster types
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [
                                (ct.split('/')[-1])
                                for ct in row.split('Type: ')[-1].split('-')
                            ]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split(
                                    'Domains detected: ')[-1].split(';'):
                                name = cluster_result.split(' (E-value')[0]
                                rest = cluster_result.split(' (E-value')[-1]
                                cluster_results.append(
                                    name.split('/')[-1] + ' (E-value' + rest)
                            temp_qual.append('Domains detected: ' +
                                             ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # for plants, remove the "plant" cluster type from hybrid types, and
            # replace a lone "plant" cluster type with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(
                                set(row.split('Type: ')[-1].split('-')))
                            if len(clustertypes) > 1 and "plant" in clustertypes:
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find the largest cdhit number & largest domain diversity in a cluster
            rec_stats["average_cdhit"] = 0
            rec_stats["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                if cluster_type not in rec_stats["cluster_types"]:
                    rec_stats["cluster_types"][cluster_type] = 0
                rec_stats["cluster_types"][cluster_type] += 1
                num_cdhit = len(
                    utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(
                    utils.get_cluster_domains(cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                rec_stats["largest_cdhit"] = max(
                    rec_stats["largest_cdhit"], num_cdhit)
                rec_stats["largest_domain_variations"] = max(
                    rec_stats["largest_domain_variations"], num_domain)
            # note: despite the key names, the "averages" are medians, which
            # are less sensitive to outlier clusters
            if cdhit_numbers:
                rec_stats["average_cdhit"] = numpy.median(cdhit_numbers)
            if domain_numbers:
                rec_stats["average_domain_variations"] = numpy.median(
                    domain_numbers)

        with open('result.js', 'w') as h:
            h.write('var result = %s;' % json.dumps(res_object, indent=4))
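
Usage sketch for the script above, inferred from sys.argv[1] and the final write: it takes a plain-text file listing one genome path per line and rewrites result.js with the accumulated statistics after each genome, e.g. (the script name here is hypothetical):

    python mockup_run.py genome_list.txt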
Example #9
def main():
    "Retrieve antiSMASH entry from database"

    # First load the output plugins so we can present appropriate options
    output_plugins = load_output_plugins()

    parser = argparse.ArgumentParser(
        description='Retrieve entry from database')
    parser.add_argument('seq_ids',
                        metavar='seq_ids',
                        nargs="*",
                        help="accession numbers of antiSMASH-DB entries")

    parser.add_argument('-d',
                        '--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help="Print debugging information to stderr")
    parser.add_argument('--list-plugins',
                        dest='list_plugins',
                        action='store_true',
                        default=False,
                        help="List all available sec. met. detection modules")
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        action='store_true',
                        default=False,
                        help="Print verbose status information to stderr")
    parser.add_argument('--logfile',
                        dest='logfile',
                        default=argparse.SUPPRESS,
                        help="Also write logging output to a file")
    parser.add_argument('--statusfile',
                        dest='statusfile',
                        default=argparse.SUPPRESS,
                        help="Write the current status to a file")

    group = parser.add_argument_group('Output options')
    for plugin in output_plugins:
        group.add_argument('--disable-%s' % plugin.name,
                           dest=plugin.name,
                           action='store_false',
                           default=argparse.SUPPRESS,
                           help="Disable %s" % plugin.short_description)

    group = parser.add_argument_group('Settings')
    group.add_argument('--outputfolder',
                       dest='outputfoldername',
                       default=argparse.SUPPRESS,
                       help="Directory to write results to")
    group.add_argument('--dbnamespace',
                       dest='dbnamespace',
                       help="Define BioSQL namespace to search")

    group.add_argument('--nclusters',
                       dest='nclusters',
                       default=10,
                       type=int,
                       help="Number of clusters from ClusterBlast to display")
    group.add_argument('--seed',
                       dest='seed',
                       default=0,
                       type=int,
                       help="Random number seed for ClusterBlast coloring")
    options = parser.parse_args()

    # Logging is useful for all the following code, so make sure that is set up
    # right after parsing the arguments.
    setup_logging(options)

    if options.nclusters > 50:
        logging.info("Number of clusters (%d) is too large. Reducing to 50.",
                     options.nclusters)
        options.nclusters = 50
    logging.debug("Number of clusters to show in clusterblast = %d",
                  options.nclusters)
    if options.seed != 0:
        random.seed(options.seed)

    # Load the list of supported cluster types
    clustertypes = hmm_detection.get_supported_cluster_types()

    # Manually set some options that are required for sharing the codebase with run_antismash.py
    options.input_type = "nucl"

    # Note: the clusterblast/subclusterblast options are automatically activated if an aSstorage object with data is found
    options.clusterblast = None
    options.subclusterblast = None
    options.knownclusterblast = None
    options.smcogs = "TRUE"
    options.modeling = "none"
    options.enabled_cluster_types = ValidateClusterTypes(clustertypes)

    #Load configuration data from config file
    load_config(options)
    set_config(options)

    #Load and filter plugins
    utils.log_status("Loading detection plugins")
    plugins = load_detection_plugins()
    filter_plugins(plugins, options, clustertypes)
    filter_outputs(output_plugins, options)

    options.plugins = plugins

    if options.list_plugins:
        list_available_plugins(output_plugins)
        sys.exit(0)

    #Check prerequisites
    if not options.seq_ids:
        parser.error(
            "Please specify at least one antiSMASH-DB accession number")

    if 'outputfoldername' not in options:
        options.outputfoldername = path.splitext(
            path.basename(options.seq_ids[0]))[0]
    if not os.path.exists(options.outputfoldername):
        os.mkdir(options.outputfoldername)
    options.full_outputfolder_path = path.abspath(options.outputfoldername)

    if options.dbnamespace not in [
            options.BioSQLconfig.dbgenomenamespace,
            options.BioSQLconfig.dbclusternamespace
    ]:
        logging.warning(
            "DBnamespace %s not defined in default.cfg, switching to standard namespace %s.",
            options.dbnamespace, options.BioSQLconfig.dbgenomenamespace)
        options.dbnamespace = options.BioSQLconfig.dbgenomenamespace

    # Parse input sequences
    try:
        utils.log_status("retrieving record")
        seq_records = get_records(options)
    except Exception:
        logging.exception(
            "Uncaptured error when reading entries from antiSMASH-DB. This should not have happened :-("
        )
        sys.exit(1)

    options.extrarecord = {}

    for seq_record in seq_records:
        options.extrarecord[seq_record.id] = argparse.Namespace()

        logging.debug("DB retrieval: trying to find extra data for %s" %
                      seq_record.id)
        extradataHash = getExtradata(options, seq_record.id)
        logging.debug("Keys of extradataHash: %s" %
                      ", ".join(extradataHash.keys()))
        options.extrarecord[seq_record.id].extradata = extradataHash

        if 'ClusterBlastData' in extradataHash:
            logging.debug("DB retrieval: Found extra data for ClusterBlast")
            options.clusterblast = True
        if 'SubClusterBlastData' in extradataHash:
            logging.debug("DB retrieval: Found extra data for SubClusterBlast")
            options.subclusterblast = True
        if 'KnownClusterBlastData' in extradataHash:
            logging.debug(
                "DB retrieval: Found extra data for KnownClusterBlast")
            options.knownclusterblast = True
        if 'MetabolicModel' in extradataHash:
            logging.debug("DB retrieval: Found extra data for Modeling")
            options.modeling = "db"

    #Write results
    utils.log_status("Writing the output files")
    write_results(output_plugins, seq_records, options)
    zip_results(seq_records, options)