Example #1
0
 def search_cm(target_seq: str,
               target_ident: str) -> Tuple[bool, Optional[str]]:
     """Search a single sequence with the covariance model at cm_purine_path.

     The sequence is written to a temporary FASTA file through
     ``infernal.generate_fasta``; that file is removed again whether or not
     the search succeeds.

     Args:
         target_seq: sequence to search.
         target_ident: identifier used as the FASTA record name.

     Returns:
         ``(True, e_value)`` with the 'E-value' entry of the first hit when
         the search returned at least one hit, otherwise ``(False, None)``.
     """
     fasta_handle = None
     try:
         fasta_handle = infernal.generate_fasta({target_ident: target_seq})
         hits = infernal.search_cm(cm_purine_path, fasta_handle.name, inc_e=10)
     finally:
         # Clean up the temporary FASTA file, if it was ever created.
         if fasta_handle is not None:
             if os.path.exists(fasta_handle.name):
                 os.remove(fasta_handle.name)
     # None and an empty hit list both mean "nothing found".
     if not hits:
         return False, None
     return True, hits[0]['E-value']
Example #2
0
def is_novel(sequence, cm_file):
    """Return True when searching ``sequence`` with ``cm_file`` gives no hits.

    The sequence is written to a temporary FASTA file in the current
    directory (80 characters per line) and searched with
    ``infernal.search_cm`` using the TBLOUT result type. The temporary file
    is always removed before returning.

    Args:
        sequence: iterable of characters (normally a string) to search.
        cm_file: path of the covariance model passed to the search.

    Returns:
        False if the search returned at least one result, True otherwise
        (including when the search returned None).
    """
    fasta_file = None
    try:
        # delete=False: the file has to outlive its handle so that the
        # search can open it by name.
        fasta_file = NTF(dir='.', mode='w', delete=False)
        # The with-block guarantees the handle is closed (and flushed) even
        # if a write fails, so the cleanup below never removes an open file.
        with fasta_file:
            fasta_file.write('> seq\n')
            # Split the sequence into chunks of 80 items, one chunk per line.
            for item in map(
                    ''.join,
                    itertools.zip_longest(*[iter(sequence)] * 80,
                                          fillvalue='')):
                fasta_file.write('{}\n'.format(item))
        cm_res = infernal.search_cm(cm_file,
                                    fasta_file.name,
                                    res_type=infernal.ResType.TBLOUT)
        # Any hit at all means the sequence is already known to the model.
        return not (cm_res is not None and len(cm_res) > 0)
    finally:
        if fasta_file is not None and os.path.exists(fasta_file.name):
            os.remove(fasta_file.name)
Example #3
0
def cm_search(sequence, seq_code, fasta_list=None):
    """Build a single-sequence covariance model and search FASTA databases.

    The model is written to ``<output_dir>/<seq_code>.cm``. Every hit is
    tagged with the database it came from under the 'file' key and is
    registered through ``add_search_run``.

    Args:
        sequence: sequence the covariance model is built from.
        seq_code: code used for the model file name and passed on to
            ``add_search_run``.
        fasta_list: FASTA database paths to search; when None the list
            comes from ``gather_fasta_dbs()``.

    Returns:
        List of all hits from all searched databases.
    """
    # get fasta files
    fasta_dbs = gather_fasta_dbs() if fasta_list is None else fasta_list
    # cm build \ calibrate
    cm_path = os.path.join(output_dir, "{}.cm".format(seq_code))
    infernal.generate_single_seq_cm(sequence, cm_path)
    # search fasta files
    results = []
    for fasta_file in fasta_dbs:
        single_fasta_res = infernal.search_cm(cm_path, fasta_file)
        if single_fasta_res is None:
            # A None result is treated as "no hits in this database" so one
            # failed search does not abort the remaining databases.
            continue
        for res in single_fasta_res:
            res['file'] = fasta_file
        results.extend(single_fasta_res)
    # Each hit is expected to carry its own 'sequence' entry.
    for res in results:
        add_search_run(sequence, res['sequence'], seq_code, 'cm', res['file'])
    # return results list
    return results
Example #4
0
def run_search(run_code: str, designed_object):
    """Build a covariance model for a design, search NT_PATH and log the hits.

    The model is written to ``<output_dir>/<run_code>.cm``. Each hit is
    folded with ``vienna.fold``, turned into a tree and aligned against the
    module-level ``target_tree``; hits passing the score filter are written
    to ``result_logger``, the rest are reported as warnings.

    Args:
        run_code: code identifying this run; used for the model file name
            and in every log line.
        designed_object: object providing ``sequence`` and ``structure``
            attributes for the model build.

    Returns:
        None. The function returns early (after logging an error) when the
        model cannot be built or the search returns None.
    """
    general_run_logger.info('Starting search {}'.format(run_code))
    cm_path = os.path.join(output_dir, '{}.cm'.format(run_code))
    if not infernal.generate_single_seq_cm(designed_object.sequence, cm_path,
                                           designed_object.structure):
        general_run_logger.error(
            'Failed to build covariance model. run code: {}\n{}\n{}'.format(
                run_code, designed_object.sequence, designed_object.structure))
        return
    results = infernal.search_cm(cm_path, NT_PATH)
    if results is None:
        general_run_logger.error('Search failed {} {}\n{}'.format(
            run_code, cm_path, designed_object.sequence))
        return
    general_run_logger.info('Finished search {}, {} results'.format(
        run_code, len(results)))
    for res_no, res in enumerate(results):
        # A failure on one hit is logged and must not stop the others.
        try:
            sequence = res.get('sequence')
            structure = vienna.fold(sequence)['MFE']
            res_tree = shapiro_tree_aligner.get_tree(structure, sequence)
            tree, score = shapiro_tree_aligner.align_trees(
                res_tree, target_tree)
            # NOTE(review): the meaning of the 300 / 100 / 30 thresholds is
            # not visible here - confirm against shapiro_tree_aligner.
            if score < 300 and score % 100 < 30:
                general_run_logger.info(
                    'Adding result {}, score {} sequence {}'.format(
                        run_code, score, sequence))
                # result_logger columns:
                # seq code, match no, sequence, structure, score, target id
                result_logger.info('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    run_code, res_no, sequence, structure, score,
                    res.get('identifier')))
            else:
                general_run_logger.warning(
                    'Score too low {} result no {}, score {}, sequence: {}'.
                    format(run_code, res_no, score, sequence))
        except Exception:
            # exc_info=True keeps the traceback of the swallowed exception.
            general_run_logger.critical(
                'Exception in search {}, res no {}, {}'.format(
                    run_code, res_no, res),
                exc_info=True)
Example #5
0
def recreate_cm(folder_path: str):
    """Rebuild covariance models for the designs listed in ``folder_path``.

    Reads two tab-separated files from ``folder_path``:

    * ``FINAL_summary`` - column 0 is the design code, column 5 the design
      sequence.
    * ``FINAL_all`` - column 0 is the design code, column 1 a result name
      and column 4 the value stored for that result (it is handed to
      ``infernal.generate_fasta``, so presumably the result sequence).

    For every design without an existing ``<code>.cm`` file a
    single-sequence model and alignment are created, then the model is
    repeatedly searched against the design's results; newly found targets
    are added to the alignment and the model is rebuilt, until the search
    finds as many hits as there are results or no progress is made.

    Any failing infernal step prints a message and terminates the process
    with ``exit(-1)``.

    Args:
        folder_path: directory holding the input files; the ``.cm`` and
            ``.sto`` outputs are written there as well.
    """
    # read inputs
    gather_designs = {}
    with open(os.path.join(folder_path, "FINAL_summary"), "r") as input_sum:
        input_sum.readline()  # first line is skipped (presumably a header)
        for line in input_sum:
            if line.strip() == '':
                continue
            parts = line.strip().split('\t')
            gather_designs[parts[0]] = parts[5]
    gather_results = {}
    with open(os.path.join(folder_path, "FINAL_all"), 'r') as input_all:
        input_all.readline()  # first line is skipped (presumably a header)
        for line in input_all:
            if line.strip() == '':
                continue
            parts = line.strip().split('\t')
            gather_results.setdefault(parts[0], {})[parts[1]] = parts[4]
    # start calculations
    # NOTE(review): the folder object is started but never shut down here;
    # check whether vienna.LiveRNAfold needs an explicit close.
    folder = vienna.LiveRNAfold()
    folder.start()
    for design_code, sequence in gather_designs.items():
        cm_path = os.path.join(folder_path, "{}.cm".format(design_code))
        sto_path = os.path.join(folder_path, "{}.sto".format(design_code))
        if os.path.exists(cm_path):
            continue
        # Fold only designs that still need a model.
        structure = folder.fold(sequence)['MFE']
        temp_cm_path = "{}_tmp".format(cm_path)
        temp_sto_path = "{}_tmp".format(sto_path)
        if not infernal.generate_single_seq_cm(sequence, cm_path, structure):
            print("Could not generate single cm for {}".format(design_code))
            exit(-1)
        if not infernal.align_sequences({'{}'.format(design_code): sequence},
                                        cm_path, sto_path):
            print("Could not generate single sto for {}".format(design_code))
            exit(-1)
        design_results = gather_results.get(design_code)
        if not design_results:
            # Nothing to extend the model with; keep the single-sequence cm.
            print("No results listed for {}".format(design_code))
            continue
        temp_fasta = infernal.generate_fasta(design_results)
        try:
            no_found = 0
            while no_found < len(design_results):
                results = infernal.search_cm(cm_path,
                                             temp_fasta.name,
                                             inc_e=10.0)
                if results is None:
                    print("Search failed for {}".format(design_code))
                    exit(-1)
                # Targets already in the alignment; sto_path is not modified
                # until the moves below, so this stays valid for the
                # error listings as well.
                sto_target = get_sto_targets(sto_path)
                sto_parts = {}
                for item in results:
                    if item['target name'] not in sto_target:
                        sto_parts[item['target name']] = item['sequence']
                if len(sto_parts) == 0:
                    print(
                        "ERROR: no new sequences found for {} maxed at {} sequences out of {} original\nListing: {}"
                        .format(design_code, len(sto_target),
                                len(design_results), [
                                    res for res in design_results.keys()
                                    if res not in sto_target
                                ]))
                    break
                if not infernal.align_sequences(sto_parts,
                                                cm_path,
                                                temp_sto_path,
                                                in_align_path=sto_path):
                    print("Could not generate sto for {}".format(design_code))
                    exit(-1)
                if filecmp.cmp(sto_path, temp_sto_path, shallow=False):
                    # The alignment did not change: further rounds would
                    # repeat the same search, so stop here.
                    print("ERROR: {} missing codes: {}".format(
                        design_code, [
                            res for res in design_results.keys()
                            if res not in sto_target
                        ]))
                    shutil.move(temp_sto_path, sto_path)
                    break
                shutil.move(temp_sto_path, sto_path)
                if not infernal.generate_cm(sto_path, temp_cm_path):
                    print("Could not generate cm for {}".format(design_code))
                    exit(-1)
                shutil.move(temp_cm_path, cm_path)
                no_found = len(results)
        finally:
            # Runs on normal completion, on exit(-1) and on any exception,
            # so the temporary FASTA file is never left behind.
            if os.path.exists(temp_fasta.name):
                os.remove(temp_fasta.name)
Example #6
0
def dive_single(group_id: str, single_design_group: DesignGroup, cm_dir: str, seq_db_path: str, target_tree,
                filter_align_score: float = 250, filter_evalue: float = 10.0, cpus: int = 12) -> \
        Tuple[DesignGroup, int, Dict[int, List[str]]]:
    """Repeatedly rebuild a covariance model and search until no new match appears.

    A working copy ``TEMP_<group_id>.cm`` of the base model is used. Each
    round aligns the group's sequences to it, rebuilds the model from that
    alignment, searches ``seq_db_path`` and keeps the hits that pass both
    filters. The loop ends after the first round that finds no match code
    missing from the previous round's matches. The final model and
    alignment are renamed with a ``FINAL_`` prefix.

    Args:
        group_id: name stem of the model / alignment files in ``cm_dir``.
        single_design_group: design group supplying the sequence,
            structure, identifier and initial matches.
        cm_dir: directory holding the model and alignment files.
        seq_db_path: sequence database passed to the search.
        target_tree: passed through to ``get_align_score``.
        filter_align_score: a hit is kept only if its alignment score is
            below this value.
        filter_evalue: a hit is kept only if its E-value is below this
            value.
        cpus: forwarded to the infernal calls.

    Returns:
        Tuple of the design group built in the last round, the number of
        rounds run, and a mapping of round number to the list of match
        codes held after that round (round 0 is the initial matches).

    Raises:
        RuntimeError: when a model build, alignment or search step reports
            failure.
    """
    count = 0
    items_in_round = {}
    found_new = True
    base_cm_name = '{}.cm'.format(group_id)
    base_cm_path = os.path.join(cm_dir, base_cm_name)
    if not os.path.exists(base_cm_path):
        if not infernal.generate_single_seq_cm(
                single_design_group.sequence,
                base_cm_path,
                structure=single_design_group.structure,
                cpus=cpus):
            raise RuntimeError(
                'Could not generate single cm for {}'.format(group_id))
    cm_name = 'TEMP_{}'.format(base_cm_name)
    cm_path = os.path.join(cm_dir, cm_name)
    shutil.copyfile(base_cm_path, cm_path)
    stockholm_file = os.path.join(cm_dir, '{}.sto'.format(group_id))
    design_group_identifies = {
        'sequence': single_design_group.sequence,
        'structure': single_design_group.structure
    }
    design_copy = copy(single_design_group)
    # Store a list snapshot: copy() is shallow, so a keys() view would keep
    # tracking the caller's matches dict.
    items_in_round[0] = list(design_copy.matches.keys())
    while found_new:
        count += 1
        found_new = False
        # rebuild cm (align to old, delete and create new)
        # NOTE(review): the alignment input is built from the ORIGINAL
        # group's matches every round, not from the previous round's
        # matches - confirm this is intended.
        full_list = {}
        for identifier, match in single_design_group.matches.items():
            full_list[identifier] = match.get('sequence')
        full_list[
            single_design_group.identifier] = single_design_group.sequence
        # Check before deleting the working model, so a failed alignment
        # does not destroy it.
        if not infernal.align_sequences(full_list, cm_path, stockholm_file):
            raise RuntimeError(
                'Could not align sequences for {} in round {}'.format(
                    group_id, count))
        os.remove(cm_path)
        if not infernal.generate_cm(stockholm_file, cm_path, cpus=cpus):
            raise RuntimeError(
                'Could not generate cm for {} in round {}'.format(
                    group_id, count))
        # search on cm
        search_res = infernal.search_cm(cm_path, seq_db_path, cpus=cpus)
        if search_res is None:
            raise RuntimeError('Search failed for {} in round {}'.format(
                group_id, count))
        # identify items (see different matches) and compare size of match group
        new_design_group = DesignGroup(single_design_group.identifier,
                                       design_group_identifies)
        for single_match in search_res:
            code = single_match.get('identifier')
            seq = single_match.get('sequence')
            align_score = get_align_score(code, seq, cm_path, target_tree)
            if float(single_match.get('E-value')
                     ) < filter_evalue and align_score < filter_align_score:
                old_res = design_copy.matches.get(code)
                if old_res is None:
                    # First time this code is seen: tag it with this round.
                    single_match['round'] = count
                    found_new = True
                else:
                    # NOTE(review): assumes earlier matches (including the
                    # initial ones) carry a 'round' entry - confirm.
                    single_match['round'] = old_res['round']
                new_design_group.add_match(code, single_match)
        design_copy = new_design_group
        items_in_round[count] = list(design_copy.matches.keys())
    # organize cm
    shutil.move(cm_path,
                os.path.join(cm_dir, 'FINAL_{}'.format(base_cm_name)))
    shutil.move(stockholm_file,
                os.path.join(cm_dir, 'FINAL_{}.sto'.format(group_id)))
    return design_copy, count, items_in_round