Ejemplos de read_fasta en Python, ejemplos de antismash.common.fasta.read_fasta en Python

Ejemplo n.º 1

0

Mostrar archivo

def run_muscle_single(seq_name: str, seq: str,
                      comparison_file: str) -> Dict[str, str]:
    """ Aligns a single query sequence against a comparison file with muscle
        in profile mode and returns the resulting alignments

        The query is written to a temporary file, muscle is given the
        comparison file as its first input and the query as its second, and
        the output file is parsed before the temporary files are discarded.

        Arguments:
            seq_name: the name of the query
            seq: the sequence to align
            comparison_file: the path of the file containing comparison sequences

        Returns:
            a dictionary mapping sequence name (query or reference) to alignment

        Raises:
            RuntimeError: if the muscle run is not reported as successful
    """
    with NamedTemporaryFile(mode="w+") as query_file, \
            NamedTemporaryFile(mode="w+") as aligned_file:
        write_fasta([seq_name], [seq], query_file.name)
        # Run muscle and collect sequence positions from file
        command = [
            get_config().executables.muscle,
            "-profile",
            "-quiet",
            "-in1", comparison_file,
            "-in2", query_file.name,
            "-out", aligned_file.name,
        ]
        result = execute(command)
        if not result.successful():
            return_code = result.return_code
            # newlines are dropped so the error stays on a single line
            stderr = result.stderr.replace("\n", "")
            raise RuntimeError(
                "muscle returned %d: %r while comparing query named %s" %
                (return_code, stderr, seq_name))
        # must happen while the temporary output file still exists
        alignments = read_fasta(aligned_file.name)
    return alignments

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_analysis.py Proyecto: emzodls/antismash

 def setUp(self):
     """ Creates a record holding the domains rebuilt from each test input
         file, plus one PFAM domain for every sequence in p450.input
     """
     self.record = secmet.Record()
     # except for Thioesterase, all domains were found in BN001301.1
     # TE domains were found in Y16952
     domain_inputs = [
         ("PKS_KS.input", "PKS_KS"),
         ("AT.input", "PKS_AT"),
         ("ACP.input", "ACP"),
         ("DH.input", "PKS_DH"),
         ("KR.input", "PKS_KR"),
         ("TE.input", "Thioesterase"),
         ("ER.input", "PKS_ER"),
     ]
     for filename, domain_type in domain_inputs:
         for rebuilt in rebuild_domains(filename, domain_type):
             self.record.add_antismash_domain(rebuilt)
     # these PFAMs found in BN001301.1 with clusterhmmer, one was excluded
     # to avoid a Biopython SearchIO bug
     dummy_location = secmet.feature.FeatureLocation(1, 100)
     p450_path = path.get_full_path(__file__, 'data', "p450.input")
     p450_sequences = fasta.read_fasta(p450_path)
     for name, translation in p450_sequences.items():
         # every PFAM domain shares the same placeholder location
         pfam = secmet.feature.PFAMDomain(dummy_location, protein_start=5,
                                          protein_end=10, description="test")
         pfam.translation = translation
         pfam.domain_id = "PFAM_p450_" + name
         pfam.domain = "p450"
         self.record.add_pfam_domain(pfam)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: sandpuma.py Proyecto: eburgoswisc/antismash

def sandpuma_test(adomain_file):
    """ Runs SANDPUMA over the sequences of a FASTA file, using one thread and
        the reference files of the 'data' directory located next to the
        running script (as found through sys.argv[0])

        Arguments:
            adomain_file: the path of the FASTA file holding the query sequences
    """
    ## Set params
    test_fa = fasta.read_fasta(adomain_file)
    data_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + '/data/'

    def in_data_dir(filename):
        """ Prefixes a file name with the data directory """
        return data_dir + filename

    ## Actually test
    run_sandpuma(
        test_fa,
        1,  # threads
        in_data_dir('fullset0_smiles.faa'),  # knownfaa
        'UNK',  # wildcard
        0.5,  # snn_thresh
        in_data_dir('fullset0_smiles.stach.faa'),  # knownasm
        40,  # max_depth
        10,  # min_leaf_sup
        in_data_dir('sandpuma1_jackknife.tsv'),  # jackknife_data
        in_data_dir('fullset0_smiles.afa'),  # ref_aln
        ## created with: fasttree -log fullset0_smiles.fasttree.log < fullset0_smiles.afa > fullset0_smiles.fasttree.nwk
        in_data_dir('fullset0_smiles.fasttree.nwk'),  # ref_tree
        ## created with: taxit create --aln-fasta fullset0_smiles.afa --tree-stats fullset0_smiles.fasttree.log --tree-file fullset0_smiles.fasttree.nwk -P fullset0_smiles.fasttree.refpkg -l a_domain
        in_data_dir('fullset0_smiles.fasttree.refpkg'),  # ref_pkg
        0.6,  # masscutoff
        in_data_dir('seed.afa'),  # seed_file
        in_data_dir('nodemap.tsv'),  # nodemap_file
        in_data_dir('traceback.tsv'),  # traceback_file
        in_data_dir('NRPSPredictor2'),  # nrpspred2basedir
        in_data_dir('fullset20160624_cl_nrpsA.hmmdb'),  # phmmdb
        in_data_dir('fullset0_smiles.dmnd'),  # piddb
    )

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_analysis.py Proyecto: yexianingyue/antismash

def rebuild_domains(filename, domain_type):
    """ Creates a dummy domain of the given type for every sequence in a FASTA
        file of the test 'data' directory

        Arguments:
            filename: the name of the FASTA file within the 'data' directory
            domain_type: the type given to each domain, also used as the
                         prefix of each domain id

        Returns:
            a list of the created domains, in the iteration order of the
            parsed sequences
    """
    def build(name, translation):
        """ Creates a single placeholder domain for one sequence """
        rebuilt = DummyAntismashDomain(start=1, end=100,
                                       domain_id=domain_type + name)
        rebuilt.domain = domain_type
        rebuilt.translation = translation
        return rebuilt

    sequences = fasta.read_fasta(path.get_full_path(__file__, 'data', filename))
    return [build(name, translation)
            for name, translation in sequences.items()]

Ejemplo n.º 5

0

Mostrar archivo

Archivo: test_nrps_predictor.py Proyecto: stogqy/antismash

 def test_angstrom(self):
     """ Checks the 34 amino acid signature extracted for the 'query' entry
         of the stored alignments, with the muscle run patched out
     """
     alignment_path = path.get_full_path(__file__, 'data',
                                         'nrpspred_aligns.fasta')
     aligns = fasta.read_fasta(alignment_path)
     domain = DummyAntismashDomain(domain_id="query")
     # the translation is the stored alignment row with its gaps removed
     domain.translation = aligns[domain.domain_id].replace("-", "")
     # run_muscle_single hands back the stored alignments while patched
     muscle_patch = patch.object(subprocessing, "run_muscle_single",
                                 return_value=aligns)
     with muscle_patch:
         sig = nrps_predictor.get_34_aa_signature(domain)
     assert sig == "L--SFDASLFEMYLLTGGDRNMYGPTEATMCATW"

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_analysis.py Proyecto: zachcp/antismash

def rebuild_domains(filename, domain_type):
    """ Creates an AntismashDomain of the given type for every sequence in a
        FASTA file of the test 'data' directory

        Arguments:
            filename: the name of the FASTA file within the 'data' directory
            domain_type: the type given to each domain, also used as the
                         prefix of each domain id

        Returns:
            a list of the created domains, in the iteration order of the
            parsed sequences
    """
    sequences = fasta.read_fasta(path.get_full_path(__file__, 'data', filename))
    # all domains share one placeholder location
    placeholder = secmet.features.FeatureLocation(1, 100)

    def build(name, translation):
        """ Creates a single domain for one sequence """
        rebuilt = secmet.features.AntismashDomain(placeholder, tool="test")
        rebuilt.domain = domain_type
        rebuilt.domain_id = domain_type + name
        rebuilt.translation = translation
        return rebuilt

    return [build(name, translation)
            for name, translation in sequences.items()]

Ejemplo n.º 7

0

Mostrar archivo

def trim_alignment(input_number: int, alignment_file: str) -> None:
    """ Trims an alignment to the region between the first and the last column
        whose most common residue is not a gap and is shared by at least a
        third of all sequences, then writes the trimmed sequences to
        "trimmed_alignment<input_number>.fasta"

        Arguments:
            input_number: the number used within the name of the output file
            alignment_file: the path of the FASTA alignment to trim

        Returns:
            None
    """
    def find_first_aa_position(conservations: List[Dict[str, int]],
                               sequence_count: int) -> int:
        """ Finds the first position of a shared amino acid """
        required = sequence_count / 3
        for index, counts in enumerate(conservations):
            # most frequent residue, ties resolved by the greater character
            residue, occurrences = max(counts.items(),
                                       key=lambda pair: (pair[1], pair[0]))
            # columns where a gap is the best hit never qualify
            if residue != "-" and occurrences >= required:
                return index
        return 0  # can't be earlier than the start

    contents = fasta.read_fasta(alignment_file)
    seqs = list(contents.values())
    # check all sequences are the same length
    sequence_length = len(seqs[0])
    for name, seq in contents.items():
        assert sequence_length == len(seq), \
            "%s has different sequence length" % name

    # replacing ( and ) because they break newick tree parsing
    # and keeping only the last two fields (id and description)
    names = []
    for name in contents:
        cleaned = name.replace("(", "_").replace(")", "_")
        names.append("|".join(cleaned.rsplit('|', 2)[-2:]))

    # store conservation of residues, one counter per alignment column
    conservations = [defaultdict(int) for _ in range(sequence_length)
                     ]  # type: List[Dict[str, int]]
    for seq in seqs:
        for position, base in enumerate(seq):
            conservations[position][base] += 1

    # Find first and last amino acids shared
    sequence_count = len(seqs)
    first_shared_amino = find_first_aa_position(conservations, sequence_count)
    # searching the columns back to front gives the offset from the end
    offset_from_end = find_first_aa_position(conservations[::-1],
                                             sequence_count)
    last_shared_amino = sequence_length - offset_from_end

    # Shorten sequences to detected conserved regions
    trimmed = [seq[first_shared_amino:last_shared_amino] for seq in seqs]
    seed_fasta_name = "trimmed_alignment" + str(input_number) + ".fasta"
    fasta.write_fasta(names, trimmed, seed_fasta_name)

Ejemplo n.º 8

0

Mostrar archivo

    def generate_domains(self):
        """ Builds one dummy PKS_KS domain for each sequence in PKS_KS.input,
            followed by a single dummy PKS_KR domain that has no translation
            assigned here

            Returns:
                the list of created domains, with the PKS_KR domain last
        """
        ks_path = path.get_full_path(__file__, 'data', 'PKS_KS.input')
        inputs = fasta.read_fasta(ks_path)
        # NOTE(review): last_end is never advanced, so every location below is
        # computed relative to 0 - confirm whether that is intended
        last_end = 0
        domains = []
        for translation in inputs.values():
            start = last_end + 10
            end = last_end + len(translation) * 3 + 16
            ks_domain = DummyAntismashDomain(location=FeatureLocation(start, end))
            ks_domain.translation = translation
            domains.append(ks_domain)
            ks_domain.domain = "PKS_KS"

        # the trailing KR domain is sized by the final KS translation
        final_ks = domains[-1]
        kr_location = FeatureLocation(
            last_end + 10, last_end + len(final_ks.translation) * 3 + 16)
        kr_domain = DummyAntismashDomain(location=kr_location)
        domains.append(kr_domain)
        kr_domain.domain = "PKS_KR"
        return domains

Ejemplo n.º 9

0

Mostrar archivo

Archivo: sandpuma.py Proyecto: eburgoswisc/antismash

def run_predicat(reference_aln: str, queryfa: Dict[str, str], wildcard: str,
                 ref_tree: str, ref_pkg: str, masscutoff: float,
                 snn_thresh: float) -> PredicatResults:
    """ pplacer and predicat substrate prediction

    Only the first entry of queryfa is used. It is aligned against a single
    reference sequence, cut down to the columns spanned by that reference,
    aligned to the full reference alignment, placed on the reference tree
    and finally handed to prediCAT.

    Arguments:
        reference_aln: filename for reference protein fasta, see sandpuma_multithreaded comments for requirements
        queryfa: seq id to seq dictionary
        wildcard: suffix str identifying query sequence (Default= 'UNK' which means headers end in '_UNK')
        ref_tree: reference tree (newick)
        ref_pkg: pplacer reference package
        masscutoff: cutoff value for pplacer masses
        snn_thresh: SNN threshold for confident prediction (default=0.5)

    Returns:
        PredicatResults
            monophyly -> substrate specificity (str)
            forced -> substrate specificity (str)
            nndist -> distance to nearest neighbor (float)
            nn_score -> nearest neighbor score (float)
            snn_score -> scaled nearest neighbor score (float)
    """
    query = next(iter(queryfa))
    ## Align query to a single known sequence
    to_align = {}
    to_align[query] = queryfa[query]
    ref = fasta.read_fasta(reference_aln)
    tname = next(iter(ref))  ## Grab any training sequence header
    to_align[tname] = ref[tname].replace('-', '')
    aligned = subprocessing.run_mafft_predicat_trim(to_align)
    ## trim overhangs
    # The overhangs are the runs of gaps at either end of the reference row.
    # They are measured by stripping: a greedy pattern such as '^.+(-*)$'
    # lets '.+' swallow the trailing gaps, so it always reports a tail of 0.
    ref_row = aligned[tname]
    head = len(ref_row) - len(ref_row.lstrip('-'))
    tail = len(ref_row) - len(ref_row.rstrip('-'))
    query_row = aligned[query]
    ## Removes head and tail then removes gaps
    trimmed = query_row[head:len(query_row) - tail].replace('-', '')
    trimmedfa = {query: trimmed}
    ## Align trimmed seq to reference
    all_aligned = subprocessing.run_muscle_profile_sandpuma(
        reference_aln, trimmedfa)
    ## Pplacer (NOTE: this is new to SANDPUMA as of antiSMASH5 and needs to be tested)
    pplacer_tree = subprocessing.run_pplacer(ref_tree, reference_aln, ref_pkg,
                                             all_aligned)
    ## prediCAT
    return predicat(pplacer_tree, masscutoff, wildcard, snn_thresh)

Ejemplo n.º 10

0

Mostrar archivo

def run_at_domain_analysis(domains: Dict[str, str]) -> ATSignatureResults:
    """ Analyses PKS signature of AT domains

        Each domain is aligned against the AT domains file, its signature is
        extracted at the AT positions relative to the reference sequence, and
        all signatures are then scored against the reference signatures.

        Arguments:
            domains: a dictionary mapping domain identifier (e.g. 'locus_AT2')
                     to domain sequence

        Returns:
            a dictionary mapping domain identifier to
                a list of ATResults ordered by decreasing score
    """
    positions = get_at_positions(startpos=7)
    # construct the query signatures, in sorted order of domain identifier
    query_signatures = {}
    for domain_id, sequence in sorted(domains.items()):
        aligned = subprocessing.run_muscle_single(domain_id, sequence,
                                                  _AT_DOMAINS_FILENAME)
        query_row = aligned[domain_id]
        reference_row = aligned[_REF_SEQUENCE]
        query_signatures[domain_id] = utils.extract_by_reference_positions(
            query_row, reference_row, positions)
    # load reference PKS signatures and score queries against them
    reference_signatures = fasta.read_fasta(_SIGNATURES_FILENAME)
    return score_signatures(query_signatures, reference_signatures)

Ejemplo n.º 11

0

Mostrar archivo

 def setUp(self):
     """ Loads the query sequences from SCO_genes.fasta """
     query_path = path.get_full_path(__file__, 'data', 'SCO_genes.fasta')
     self.query_data = fasta.read_fasta(query_path)

Ejemplo n.º 12

0

Mostrar archivo

 def setUp(self):
     """ Loads the stored alignments and mocks
         subprocessing.run_muscle_single to return them
     """
     alignment_path = path.get_full_path(__file__, 'data',
                                         'nrpspred_aligns.fasta')
     self.aligns = fasta.read_fasta(alignment_path)
     mock("subprocessing.run_muscle_single", returns=self.aligns)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: integration_minowa.py Proyecto: mibig-secmet/antismash-mibig

class TestMinowaAT(unittest.TestCase):
    """ Integration test comparing the Minowa AT predictions for the
        sequences of SCO.fasta against stored expected values
    """
    # loaded once, when the class body is executed
    query_data = fasta.read_fasta(
        path.get_full_path(__file__, "data", "SCO.fasta"))

    def setUp(self):
        """ Builds a default config before each test """
        build_config([])

    def tearDown(self):
        """ Removes the config built in setUp """
        destroy_config()

    def test_full_run(self):
        """ Every query must have a result and every result's predictions
            must equal the stored (name, score) pairs, order included
        """
        results = run_minowa_at(self.query_data)
        assert len(results) == len(self.query_data)
        assert set(results) == set(self.query_data)
        predictions = {name: result.predictions
                       for name, result in results.items()}
        expected = {
            'SCO0126_AT1': [
                ('Malonyl-CoA', 81.1),
                ('Methoxymalonyl-CoA', 30.9),
                ('Methylmalonyl-CoA', 25.6),
                ('inactive', 23.2),
                ('Propionyl-CoA', 13.8),
                ('2-Methylbutyryl-CoA', 12.2),
                ('fatty_acid', 7.9),
                ('Isobutyryl-CoA', 1.8),
                ('CHC-CoA', 1.1),
                ('trans-1,2-CPDA', 0.0),
                ('Benzoyl-CoA', 0.0),
                ('Acetyl-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
                ('Ethylmalonyl-CoA', -3.2),
            ],
            'SCO0127_AT1': [
                ('Methoxymalonyl-CoA', 29.2),
                ('Methylmalonyl-CoA', 26.5),
                ('Malonyl-CoA', 22.1),
                ('Ethylmalonyl-CoA', 13.7),
                ('trans-1,2-CPDA', 0.0),
                ('inactive', 0.0),
                ('fatty_acid', 0.0),
                ('Isobutyryl-CoA', 0.0),
                ('CHC-CoA', 0.0),
                ('Benzoyl-CoA', 0.0),
                ('Acetyl-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
                ('Propionyl-CoA', -0.2),
                ('2-Methylbutyryl-CoA', -4.3),
            ],
            'SCO5892_AT1': [
                ('Malonyl-CoA', 151.7),
                ('inactive', 95.9),
                ('Methoxymalonyl-CoA', 74.7),
                ('Methylmalonyl-CoA', 70.4),
                ('Ethylmalonyl-CoA', 43.0),
                ('Propionyl-CoA', 35.7),
                ('Isobutyryl-CoA', 31.9),
                ('CHC-CoA', 27.7),
                ('2-Methylbutyryl-CoA', 26.1),
                ('Benzoyl-CoA', 25.0),
                ('Acetyl-CoA', 13.9),
                ('trans-1,2-CPDA', 13.7),
                ('3-Methylbutyryl-CoA', 12.5),
                ('fatty_acid', 9.7),
            ],
            'SCO6273_AT1': [
                ('Malonyl-CoA', 171.9),
                ('inactive', 73.8),
                ('Methoxymalonyl-CoA', 62.1),
                ('Methylmalonyl-CoA', 40.8),
                ('Propionyl-CoA', 29.3),
                ('Acetyl-CoA', 18.6),
                ('Isobutyryl-CoA', 15.6),
                ('2-Methylbutyryl-CoA', 14.1),
                ('Benzoyl-CoA', 9.6),
                ('trans-1,2-CPDA', 0.0),
                ('fatty_acid', 0.0),
                ('Ethylmalonyl-CoA', 0.0),
                ('CHC-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
            ],
            'SCO6274_AT1': [
                ('Malonyl-CoA', 171.9),
                ('inactive', 73.8),
                ('Methoxymalonyl-CoA', 62.1),
                ('Methylmalonyl-CoA', 40.8),
                ('Propionyl-CoA', 29.3),
                ('Acetyl-CoA', 18.6),
                ('Isobutyryl-CoA', 15.6),
                ('2-Methylbutyryl-CoA', 14.1),
                ('Benzoyl-CoA', 9.6),
                ('trans-1,2-CPDA', 0.0),
                ('fatty_acid', 0.0),
                ('Ethylmalonyl-CoA', 0.0),
                ('CHC-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
            ],
            'SCO6274_AT2': [
                ('Malonyl-CoA', 171.9),
                ('inactive', 73.8),
                ('Methoxymalonyl-CoA', 62.1),
                ('Methylmalonyl-CoA', 40.8),
                ('Propionyl-CoA', 29.3),
                ('Acetyl-CoA', 18.6),
                ('Isobutyryl-CoA', 15.6),
                ('2-Methylbutyryl-CoA', 14.1),
                ('Benzoyl-CoA', 9.6),
                ('trans-1,2-CPDA', 0.0),
                ('fatty_acid', 0.0),
                ('Ethylmalonyl-CoA', 0.0),
                ('CHC-CoA', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
            ],
            'SCO6275_AT1': [
                ('Malonyl-CoA', 209.2),
                ('inactive', 103.5),
                ('Methoxymalonyl-CoA', 75.4),
                ('Methylmalonyl-CoA', 68.4),
                ('Isobutyryl-CoA', 37.8),
                ('2-Methylbutyryl-CoA', 31.3),
                ('Benzoyl-CoA', 30.9),
                ('Acetyl-CoA', 30.9),
                ('Propionyl-CoA', 29.8),
                ('Ethylmalonyl-CoA', 28.1),
                ('fatty_acid', 20.5),
                ('CHC-CoA', 16.6),
                ('3-Methylbutyryl-CoA', 15.4),
                ('trans-1,2-CPDA', 15.0),
            ],
            'SCO6275_AT2': [
                ('Malonyl-CoA', 203.5),
                ('inactive', 97.1),
                ('Methoxymalonyl-CoA', 72.9),
                ('Methylmalonyl-CoA', 61.7),
                ('Isobutyryl-CoA', 41.7),
                ('Propionyl-CoA', 30.9),
                ('Ethylmalonyl-CoA', 16.8),
                ('Acetyl-CoA', 16.8),
                ('2-Methylbutyryl-CoA', 14.2),
                ('Benzoyl-CoA', 13.3),
                ('3-Methylbutyryl-CoA', 9.0),
                ('fatty_acid', 8.4),
                ('CHC-CoA', 3.9),
                ('trans-1,2-CPDA', 0.0),
            ],
            'SCO6275_AT3': [
                ('Malonyl-CoA', 207.6),
                ('inactive', 105.9),
                ('Methoxymalonyl-CoA', 62.0),
                ('Methylmalonyl-CoA', 50.9),
                ('Propionyl-CoA', 30.8),
                ('Ethylmalonyl-CoA', 17.7),
                ('Isobutyryl-CoA', 16.7),
                ('2-Methylbutyryl-CoA', 15.7),
                ('Acetyl-CoA', 15.4),
                ('Benzoyl-CoA', 11.6),
                ('CHC-CoA', 9.5),
                ('trans-1,2-CPDA', 0.0),
                ('fatty_acid', 0.0),
                ('3-Methylbutyryl-CoA', 0.0),
            ],
            'SCO6827_AT1': [
                ('Methylmalonyl-CoA', 165.7),
                ('Ethylmalonyl-CoA', 150.9),
                ('Methoxymalonyl-CoA', 141.2),
                ('2-Methylbutyryl-CoA', 118.3),
                ('Malonyl-CoA', 106.6),
                ('trans-1,2-CPDA', 94.3),
                ('Benzoyl-CoA', 90.8),
                ('Isobutyryl-CoA', 90.1),
                ('Propionyl-CoA', 89.7),
                ('CHC-CoA', 65.8),
                ('Acetyl-CoA', 62.2),
                ('inactive', 45.4),
                ('3-Methylbutyryl-CoA', 43.8),
                ('fatty_acid', 23.7),
            ],
        }
        assert predictions == expected

Ejemplo n.º 14

0

Mostrar archivo

        with open(input_filename, "w") as handle:
            for sig, domain in zip(signatures, a_domains):
                handle.write("%s\t%s\n" % (sig, domain.get_name()))
        # Run NRPSPredictor2 SVM
        commands = [
            'java',
            '-Ddatadir=%s' % data_dir, '-cp', classpath,
            'org.roettig.NRPSpredictor2.NRPSpredictor2', '-i', input_filename,
            '-r', output_filename, '-s', '1', '-b', bacterial
        ]
        result = subprocessing.execute(commands)
        if not result.successful():
            raise RuntimeError("NRPSPredictor2 failed: %s" % result.stderr)

        with open(output_filename) as handle:
            lines = handle.read().splitlines()[1:]  # strip the header

    return read_output(lines)


# Ad-hoc driver: wraps every sequence of the full set FASTA file in a
# placeholder AntismashDomain and runs NRPSPredictor2 over all of them.
create_domain_fa = fasta.read_fasta(
    '/Users/robi0916/Documents/Wageningen_UR/github/sandpuma2_serina/flat/fullset20160624_cl.faa'
)
domain_list = []
# names and sequences are walked together in a single pass instead of
# rebuilding the key and value lists for every entry
for domain_id, translation in create_domain_fa.items():
    domain = AntismashDomain(FeatureLocation(1, 1, 1),
                             tool="test")  # arbitrary feature location
    domain.domain_id = domain_id
    domain.translation = translation
    domain_list.append(domain)
run_nrpspredictor(domain_list)

Ejemplo n.º 15

0

Mostrar archivo

Archivo: sandpuma.py Proyecto: eburgoswisc/antismash

def run_sandpuma(name2seq: Dict[str, str], threads: int, knownfaa: str,
                 wildcard: str, snn_thresh: float, knownasm: str,
                 max_depth: int, min_leaf_sup: int, jackknife_data: str,
                 ref_aln: str, ref_tree: str, ref_pkg: str, masscutoff: float,
                 seed_file: str, nodemap_file: str, traceback_file: str,
                 nrpsdir: str, phmmdb: str, piddb: str):
    """ SANDPUMA parallelized pipeline

    Trains a decision tree on the jackknife data, rebuilds the decision paths
    from the node map, loads the path accuracies and the ASM fasta files, then
    starts one sandpuma_multithreaded process per group of query sequences.

    Arguments:
        name2seq: dictionary of seq names (str) to seqs (str)
        threads: number of threads
        knownfaa: filename for reference protein fasta; assumes each header ends in '_' followed by the <substrate specificity>
        wildcard: str to append to the end of each query sequence; should be different than all specificities (Default= 'UNK')
        snn_thresh: threshold for SNN score (Default= 0.5) NOTE: may need to be adjusted with new pplacer implementation
        knownasm: filename for reference active site motif protein fasta, similar header formatting as knownfaa
        max_depth: maximum depth for the sklearn decision tree; default= 40
        min_leaf_sup: minimum leaf support required within the decision tree; default= 10
        jackknife_data: filename for jackknife benchmarking results
        ref_aln: reference alignment (fasta) file
        ref_tree: reference tree (newick)
        ref_pkg: pplacer reference package
        masscutoff: cutoff value for pplacer masses
        seed_file: seed fasta file (single entry) used for stachelhaus code extraction
        nodemap_file: filename for map of decision tree outcomes
        traceback_file: jackknife results for all paths
        nrpsdir: dir for NRPSPredictor2
        phmmdb: pHMM database
        piddb: diamond db for PID

    Returns:
        None, the worker processes are started but not waited for here
    """
    ## Load jackknife data
    jk = {}
    allspec = {}
    with open(jackknife_data, "r") as j:
        next(j)  ## skip header
        for line in j:
            l = line.strip().split("\t")
            # NOTE(review): a later row with the same l[10] replaces the whole
            # entry, 'method' included, so only the method of the last row per
            # name is kept - confirm that this is intended
            jk[l[10]] = {
                'true': l[4],
                'pid': l[3],
                'shuf': l[0],
                'jk': l[1],
                'query': l[2],
                'bin': l[11]
            }
            called_spec = l[5]
            if l[7] == 'N':
                called_spec = 'no_call'
            jk[l[10]]['method'] = {}
            jk[l[10]]['method'][l[6]] = called_spec
            allspec[l[4]] = -1
            allspec[l[5]] = -1
    ## Map specificities to integers
    # every value is still -1 when sorting, so the stable sort keeps the
    # order in which the specificities were first seen
    i2s = sorted(allspec, key=allspec.get)
    for i, spec in enumerate(i2s):
        allspec[spec] = i
    ## Prepare features and labels
    allmethods = ['prediCAT', 'forced_prediCAT_snn50', 'svm', 'stach', 'phmm']
    features = []
    labels = []
    for uname in jk:
        methods = jk[uname]['method']
        # methods without a recorded call count as 'no_call'
        for m in allmethods:
            methods.setdefault(m, 'no_call')
        labels.append(allspec[jk[uname]['true']])
        feature_matrix = [jk[uname]['pid']]
        for m in allmethods:
            feature_matrix.extend(get_feature_matrix(methods[m], i2s))
        features.append(feature_matrix)
    ## Train the decision tree
    clf = tree.DecisionTreeClassifier(min_samples_leaf=min_leaf_sup,
                                      max_depth=max_depth)
    clf = clf.fit(features, labels)
    ## Load the nodemap for decision tree
    nodemap = {}
    with open(nodemap_file, "r") as nm:
        for line in nm:
            if line[0] == '#':
                continue
            l = line.strip().split("\t")
            nodemap[int(l[0])] = {
                'parent': int(l[1]),
                'parent_call': l[2],
                'decision': l[3],
                'thresh': float(l[4])
            }
    nodemap = OrderedDict(sorted(nodemap.items(), key=lambda t: t[0]))
    ## Define paths
    paths = []
    for n in nodemap:
        if nodemap[n]['decision'] == 'LEAF_NODE':
            p = nodemap[n]['parent']
            traceback = nodemap[p]['decision'] + '%' + str(
                nodemap[p]['thresh']
            ) + '-' + nodemap[n]['parent_call'] + '&LEAF_NODE-' + str(n)
            # walk up from the leaf's parent until node 0 is reached,
            # prepending each step to the path
            while p != 0:
                n = p
                p = nodemap[p]['parent']
                t = nodemap[p]['decision'] + '%' + str(
                    nodemap[p]['thresh']) + '-' + nodemap[n]['parent_call']
                traceback = t + '&' + traceback
            paths.append(traceback)
    ## Load path accuracies
    pathacc = {}
    with open(traceback_file, "r") as tb:
        for line in tb:
            l = line.strip().split("\t")
            # keep only the trailing LEAF_NODE-<n> part of the path; the
            # replacement is a raw string so that \g is not read as a
            # (invalid) string escape
            l[2] = re.sub(r"\S+&(LEAF_NODE-\d+)$", r"\g<1>", l[2])
            pathacc[l[2]] = {'pct': l[0], 'n': l[1]}
    ## Load ASM fastas
    stach_fa = fasta.read_fasta(knownasm)
    seed_fa = fasta.read_fasta(seed_file)
    ## Split groups
    groups = split_into_groups(name2seq, threads)
    for group in groups:
        toprocess = {}
        for name in name2seq:
            if name in groups[group]:
                toprocess[name] = name2seq[name]
        # NOTE(review): the processes are never joined and nothing is
        # collected from them here - confirm how results are gathered
        process = multiprocessing.Process(
            target=sandpuma_multithreaded,
            args=(group, toprocess, knownfaa, wildcard, snn_thresh, knownasm,
                  max_depth, min_leaf_sup, ref_aln, ref_tree, ref_pkg,
                  masscutoff, stach_fa, seed_fa, clf, i2s, paths, pathacc,
                  nrpsdir, phmmdb, piddb))
        process.start()

Ejemplo n.º 16

0

Mostrar archivo

Archivo: run_asm.py Proyecto: serina-robinson/nrps_predictor

            scores[str(match)] = {}
            for s in spec:
                scores[str(match)][s] = 1
    ## Dereplicate and return spec predictions
    for i in range(0,10):
        m = str(9-i)
        if m in scores:
            seen = {}
            for s in scores[m]:
                if s.count('|') > 0:
                    for ss in s.split('|'):
                        seen[ss] = 1
                else:
                    seen[s] = 1
            return('|'.join(sorted(seen)), m )
    return('no_call','0')


def main(queryfa, stachfa, seedfa):
    """ Passes the three inputs straight on to run_asm

        Arguments:
            queryfa: the query input handed to run_asm
            stachfa: the stachelhaus input handed to run_asm
            seedfa: the seed input handed to run_asm
    """
    run_asm(queryfa, stachfa, seedfa)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Not enough arguments")
    elif len(sys.argv) == 3:
        # sys.argv holds the script name and two paths, so the seed FASTA is
        # at index 2; index 3 does not exist when the length is 3
        main(read_fasta(sys.argv[1]), 'data/fullset0_smiles.stach.faa', read_fasta(sys.argv[2]))

Ejemplo n.º 17

0

Mostrar archivo

Archivo: test_extract_sig.py Proyecto: serina-robinson/nrps_predictor

 def setUp(self):
     """ Loads the stored alignments from the nrps_pks test data and mocks
         subprocessing.run_muscle_single to return them
     """
     alignment_path = path.get_full_path(nrps_pks.__file__, "test", "data",
                                         "nrpspred_aligns.fasta")
     self.aligns = read_fasta(alignment_path)
     mock("subprocessing.run_muscle_single", returns=self.aligns)