def match_seq_to_best_template_seq(tgt_seq: Sequence, templates: List[Tuple[str, Sequence]], n_top_scores: int = 20) -> List[Tuple[str, Sequence]]:
    """
    Score every template against the target sequence and keep the best ones.

    For each template, ``tgt_seq.sequence`` is expanded with that template's
    dash pattern (``import_dash_pattern``) and the result is compared to the
    template with ``simple_match``.  Every template whose score is at least
    the ``n_top_scores``-th highest *distinct* score is returned.

    Args:
        tgt_seq: the target; only its ``.sequence`` string is read here.
        templates: ``(pdb, sequence)`` pairs making up the template library.
        n_top_scores: number of distinct score values to keep, best first.
            When fewer distinct scores exist, every template is kept.

    Returns:
        The selected ``(pdb, sequence)`` pairs, in their original order.
        An empty list is returned when ``templates`` is empty.
    """
    print("\nMatching desired target sequence to our template library...")

    if not templates:
        # Nothing to score; let the caller iterate over an empty result.
        print("No template sequences available to match against.")
        return []

    # TODO: try real alignments here instead of only importing dash patterns.

    # Scores are keyed by the template's sequence string, so templates that
    # share a sequence string also share one score entry.
    seq_matching_to_pdb: Dict[str, int] = {}
    for _, template_seq in templates:
        # dashed_tgt is tgt_seq, but expanded based on template_seq's dash pattern
        dashed_tgt: str = import_dash_pattern(already_dashed_seq=template_seq,
            dest_seq_str=tgt_seq.sequence)
        seq_matching_to_pdb[template_seq.sequence] = simple_match(Sequence(dashed_tgt), template_seq, quiet=False)

    # We don't want only the best score, but the top few distinct scores.
    # Clamp the rank so a small library cannot index past the list start.
    distinct_scores: List[int] = sorted(set(seq_matching_to_pdb.values()))
    rank: int = max(1, min(n_top_scores, len(distinct_scores)))
    best_score: int = distinct_scores[-rank]
    new_seqs: List[Tuple[str, Sequence]] = [
        (p, s) for (p, s) in templates if seq_matching_to_pdb[s.sequence] >= best_score
    ]

    print("There are {n} sequences that match your template sequence to a score of {score}:".format(n=len(new_seqs), score=best_score))
    for ii, (p, q) in enumerate(new_seqs):
        print("\tSequence {serial} -- score {score}\n\t\tPDB: {pdb}\n\t\tSEQ: {seq}\n"
            .format(serial=ii, score=seq_matching_to_pdb[q.sequence], pdb=p, seq=q.sequence))

    return new_seqs
# Example #2
# 0
def get_seqs(fn: str) -> List[Tuple[str, Sequence]]:
    """
    Read (label, Sequence) pairs from a whitespace-delimited text file.

    Only lines containing the text "PDB_SEQ" are used.  On each such line
    the first whitespace-separated token becomes the label and the second
    token is wrapped in a Sequence.
    """
    with open(fn) as handle:
        all_lines = handle.readlines()

    pairs = []  # type: List[Tuple[str, Sequence]]
    for raw in all_lines:
        if "PDB_SEQ" not in raw:
            continue
        fields = raw.strip().split()
        pairs.append((fields[0], Sequence(fields[1])))
    return pairs
# Example #3
# 0
def get_seqs_mfa(fn: str) -> List[Tuple[str, Sequence]]:
    """
    Get sequences out of the tRNAdb mfa.

    Every second line of the file, starting with the second one, is stripped
    and wrapped in a Sequence (presumably the lines in between are the
    FASTA-style headers -- TODO confirm).  Each entry gets an empty label.
    """
    with open(fn) as handle:
        all_lines = handle.readlines()

    records = []  # type: List[Tuple[str, Sequence]]
    for index, raw in enumerate(all_lines):
        # Odd 0-based indices are the 2nd, 4th, ... lines of the file.
        if index % 2 == 1:
            records.append(('', Sequence(raw.strip())))
    return records
# Example #4
# 0
def test_simple_match_identity():
    """
    A sequence compared with an identical copy of itself scores as expected:
    10 for the two-character cases and 35 for the seven-character case.
    """
    expected_scores = (
        ('A-', 10),
        ('G-', 10),
        ('GGAA---', 35),
    )
    for seq_str, expected in expected_scores:
        # Two separate Sequence objects built from the same string.
        assert simple_match(Sequence(seq_str), Sequence(seq_str)) == expected
# Example #5
# 0
def modomics_from_pdb(pdb: str) -> Sequence:
    """
    Build a Sequence from the residues listed in a PDB file.

    Every line containing the text " C4'" contributes one entry: characters
    17-19 (0-based) of that line are passed through mod_from_tlc
    (presumably the residue's three-letter code -- TODO confirm) and the
    results are concatenated in file order.

    Doesn't pair with secstruct (yet): the secondary structure is '.' for
    every character of the sequence.  dssr, or a Rosetta-native SS
    determination, could replace that.
    """
    with open(pdb) as handle:
        pdb_lines = handle.readlines()

    modomics_seq = "".join(
        mod_from_tlc(line[17:20]) for line in pdb_lines if " C4'" in line
    )
    unpaired_secstruct = '.' * len(modomics_seq)
    return Sequence(modomics_seq, unpaired_secstruct)
def import_dash_pattern(already_dashed_seq: Sequence, dest_seq_str: str) -> str:
    """
    Transfer the dash pattern of already_dashed_seq onto dest_seq_str.

    Rather than blindly copying every dash (which would go wrong if there is
    an insertion), seqs_with_dashes proposes candidate dashed strings from
    the dash positions and the maximum length, and each candidate is scored
    against already_dashed_seq with simple_match.  The first candidate with
    the highest score wins.

    The winner is right-padded with dashes up to len(already_dashed_seq).
    With no candidates at all, the result is made of dashes only.
    """
    gap_positions: List[int] = dash_positions(already_dashed_seq.sequence)
    max_len = len(already_dashed_seq)
    candidates: List[str] = seqs_with_dashes(gap_positions, dest_seq_str, max_len)

    def score_of(candidate: str) -> int:
        return simple_match(Sequence(candidate), already_dashed_seq, quiet=True)

    # max() only replaces its pick on a strictly greater score, so ties go to
    # the earliest candidate; the default covers an empty candidate list.
    best_candidate: str = max(candidates, key=score_of, default="")

    # A non-positive pad count yields no padding.
    return best_candidate + '-' * (max_len - len(best_candidate))
def main(args):
    """
    Thread a target sequence onto the best-matching library templates.

    Steps, in order:
      1. If data/all_trna_structure_seqs.dat is missing under my_loc(),
         regenerate it from data/all_trna.mfa via align_template_library.
      2. Load the templates with get_seqs and read the target sequence from
         the first line of args.seq_file[0].
      3. Select templates with match_seq_to_best_template_seq and call
         remodel_new_sequence once for each of them.

    args must provide pdb, seq_file, nstruct, defer and aggressive
    (presumably an argparse namespace -- TODO confirm).
    """
    if args.pdb is not None:
        # A supplied PDB is accepted but not acted upon yet.
        pass

    import os
    library_present = os.path.exists(my_loc() + "/data/all_trna_structure_seqs.dat")
    if not library_present:
        print("Regenerating aligned template library")
        align_template_library(my_loc() + "/data/all_trna.mfa")

    templates = get_seqs(my_loc() + "/data/all_trna_structure_seqs.dat")

    # Fixed secondary structure given to every target sequence.
    target_secstruct = ".(((((((..((((...........)))).(((((.......)))))........................(((((.......))))))))))))...."
    tgt_seq = Sequence("", target_secstruct)
    with open(args.seq_file[0]) as seq_handle:
        tgt_seq.sequence = seq_handle.readlines()[0].strip()

    pdb_seq_list = match_seq_to_best_template_seq(tgt_seq, templates)

    # Maybe there are many returned! That's cool; do them all
    for template_pdb, template_seq in pdb_seq_list:
        if all(base in tgt_seq.sequence for base in 'agcu'):
            # annotated seq format, must translate first.
            tgt_seq = ann_to_mod(tgt_seq)
        remodel_new_sequence(template_seq, tgt_seq, template_pdb, args.nstruct, '', args.defer, args.aggressive)
def add_dash_recursive(template: Sequence, trial: str, dashes, current_best):
    """
    Exhaustively search for the dash placement that best aligns trial.

    Not in current use -- this is a very expensive function that
    is good for aligning very difficult sequences. At the moment we
    have been using a few manual tweaks after automated alignment
    and that has been good enough.

    Dash positions are added to ``dashes`` until there are exactly
    ``len(template) - len(trial)`` of them.  The characters of ``trial`` are
    then laid, in order, into the remaining positions and the resulting
    string is scored against ``template`` with ``simple_match``.

    Args:
        template: sequence to align against; ``len(template)`` is the
            length of every candidate string.
        trial: undashed string to be spread over the template's length.
        dashes: positions already fixed as dashes (not modified).
        current_best: ``(string, score)`` of the best candidate so far, or
            ``None``.

    Returns:
        The best ``(string, score)`` pair seen, or ``current_best``
        unchanged when no candidate beats it (or none can be built).
    """
    return _add_dash_from(template, trial, list(dashes), current_best, 0)


def _add_dash_from(template, trial, dashes, current_best, first_free):
    """Score every way of extending dashes with positions >= first_free."""
    template_len = len(template)

    if len(dashes) == template_len - len(trial):
        # We have enough dashes: fill every non-dash slot, including the
        # last one, with the next trial character.  Slots left over once
        # trial is used up stay as dashes.
        dash_set = set(dashes)
        remaining = iter(trial)
        complete_trial_string = "".join(
            '-' if pos in dash_set else next(remaining, '-')
            for pos in range(template_len)
        )
        score = simple_match(template, Sequence(complete_trial_string))
        if current_best is None or score > current_best[1]:
            current_best = (complete_trial_string, score)
        return current_best

    # Only add positions in increasing order, so each set of dash positions
    # is scored once instead of once per ordering.  Sets are still visited
    # in the same relative order, so ties resolve to the same candidate.
    for pos in range(first_free, template_len):
        if pos in dashes:
            continue
        current_best = _add_dash_from(template, trial, dashes + [pos], current_best, pos + 1)

    return current_best
# Example #9
# 0
def test_import_dash_pattern_dashes():
    """
    A destination that already contains a dash ('G-G') is expected to come
    back as 'G---G' for the template 'AA---'.  Does this make sense?
    """
    template = Sequence('AA---', '....')
    dashed = import_dash_pattern(template, 'G-G')
    assert dashed == 'G---G'
# Example #10
# 0
def test_import_dash_pattern_end():
    """Dashes at the end of the template end up at the end of the result."""
    template = Sequence('AA---', '....')
    dashed = import_dash_pattern(template, 'GG')
    assert dashed == 'GG---'
# Example #11
# 0
def test_import_dash_pattern_beginning():
    """Dashes at the start of the template end up at the start of the result."""
    template = Sequence('---AA', '....')
    dashed = import_dash_pattern(template, 'GG')
    assert dashed == '---GG'
# Example #12
# 0
def test_import_dash_pattern_middle():
    """Dashes in the middle of the template end up in the middle of the result."""
    template = Sequence('A--A', '....')
    dashed = import_dash_pattern(template, 'GG')
    assert dashed == 'G--G'
# Example #13
# 0
def test_simple_match_mod_U_eq():
    """'U-' compared with 'T-' is expected to score 8."""
    score = simple_match(Sequence('U-'), Sequence('T-'))
    assert score == 8
# Example #14
# 0
def test_simple_match_pyrimidine_eq():
    """'U-' compared with 'C-' (two pyrimidines) is expected to score 7."""
    score = simple_match(Sequence('U-'), Sequence('C-'))
    assert score == 7
# Example #15
# 0
def test_simple_match_purine_eq():
    """'A-' compared with 'G-' (two purines) is expected to score 7."""
    score = simple_match(Sequence('A-'), Sequence('G-'))
    assert score == 7
# Example #16
# 0
def test_simple_match_gap_penalty():
    """'A-' compared with '-A' (letter against dash at both positions) is expected to score -20."""
    score = simple_match(Sequence('A-'), Sequence('-A'))
    assert score == -20