Esempio n. 1
0
    def test_get_multifasta(self):
        """Test utils.get_multifasta"""
        # Expected output: one header line plus one sequence line per entry,
        # joined by newlines with no trailing newline.
        wanted_lines = [
            ">orf0001",
            "FAKESEQ",
            ">orf0003",
            "FAKESEQ",
            ">orf0006",
            "FAKESEQ",
        ]
        wanted = "\n".join(wanted_lines)

        produced = utils.get_multifasta(self.rec)
        self.assertMultiLineEqual(wanted, produced)
Esempio n. 2
0
def run(seq_record, options):
    """Run a whole-genome Pfam search for seq_record and annotate the hits.

    The multi-FASTA produced by utils.get_multifasta(seq_record) is passed to
    utils.run_hmmscan together with the 'Pfam-A.hmm' file located in
    options.pfamdir. If options carries no 'pfamdir' entry, it is set to the
    path returned by utils.get_full_path(__file__, ''). The results are
    handed to _annotate; nothing is returned.
    """
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    hits = utils.run_hmmscan(pfam_db, fasta)

    _annotate(seq_record, options, hits)
Esempio n. 3
0
def run(seq_record, options):
    """Run a whole-genome Pfam search for seq_record and annotate the hits.

    The multi-FASTA produced by utils.get_multifasta(seq_record) is searched
    against the 'Pfam-A.hmm' file located in options.pfamdir (set to
    utils.get_full_path(__file__, '') when options has no 'pfamdir' entry).

    When options.skip_cleanup is set, 'fullhmmer.txt' inside
    options.full_outputfolder_path is used as a results file: if it already
    exists it is parsed instead of running the search, otherwise the search
    is run with that path passed as results_file. The results are handed to
    _annotate; nothing is returned.
    """
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')

    if not options.skip_cleanup:
        # No results file is kept around: always run the search.
        hits = utils.run_hmmscan(pfam_db, fasta)
    else:
        saved_output = path.join(options.full_outputfolder_path, 'fullhmmer.txt')
        if path.exists(saved_output):
            # Reuse the output left behind by an earlier run.
            hits = list(SearchIO.parse(saved_output, 'hmmer3-text'))
        else:
            hits = utils.run_hmmscan(pfam_db, fasta, results_file=saved_output)

    _annotate(seq_record, options, hits)
Esempio n. 4
0
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Detect signature genes in seq_record and annotate clusters from them.

    Steps, in order:
      1. Run utils.run_hmmsearch with 'bgc_seeds.hmm' against the multi-FASTA
         of the record and keep every HSP whose bitscore exceeds the cutoff
         of its matching signature profile.
      2. Filter the kept HSPs (filter_results, the overlapping-gene filter
         when options.taxon == 'plants', filter_result_multiple).
      3. Apply the cluster rules, update each hit feature via
         _update_sec_met_entry and call find_clusters.
      4. Add additional NRPS/PKS genes, store detection details and, when
         options.all_orfs is set, call remove_irrelevant_allorfs.

    Parameters:
      seq_record: record to analyse; passed on to the helper functions.
      enabled_clustertypes: cluster types handed to create_rules_dict and
        apply_cluster_rules.
      options: object providing the ``taxon`` and ``all_orfs`` attributes.

    Returns nothing; the work is done by the helper calls on seq_record.
    """
    feature_by_id = utils.get_feature_dict(seq_record)
    full_fasta = utils.get_multifasta(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)

    # Signature profiles indexed by their name for lookup per HSP
    sig_by_name = {profile.name: profile for profile in _signature_profiles}
    results = []
    results_by_id = {}

    seeds_hmm = utils.get_full_path(__file__, 'bgc_seeds.hmm')
    runresults = utils.run_hmmsearch(seeds_hmm, full_fasta, use_tempfile=True)
    for runresult in runresults:
        # Accession with anything after the first '.' dropped
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # Prefer the query ID as lookup key, fall back to the accession
            sig_key = hsp.query_id if hsp.query_id in sig_by_name else acc
            if sig_key not in sig_by_name:
                logging.error(
                    'BUG: Failed to find signature for ID %s / ACC %s',
                    hsp.query_id, acc)
                continue
            sig = sig_by_name[sig_key]
            # Store result if it is above cut-off
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Get overlap tables (for overlap filtering etc)
    overlaps = utils.get_overlaps_table(seq_record)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id)

    # Filter results of overlapping genes (only for plants)
    if options.taxon == 'plants':
        results, results_by_id = filter_result_overlapping_genes(
            results, results_by_id, overlaps, feature_by_id)

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    # Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    # Save final results to seq_record
    for cds, cds_hsps in results_by_id.items():
        _update_sec_met_entry(feature_by_id[cds], cds_hsps, typedict[cds],
                              nseqdict)

    find_clusters(seq_record, rulesdict)

    # Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)
    # Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)
    # If all-orfs option on, remove irrelevant short orfs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)