Example #1
0
def borodovsky_blosum_50_2():
    """Build the pair HMM fixture for the Borodovsky example sequences.

    Reads ``./files/simple_seqs/borodovsky.fasta``, wraps each of its first
    two records in its own single-sequence ``AlignmentProfile`` and passes
    both to ``align.load_params`` together with ``params.borodovsky_4_7``
    and ``sub_matrix.blosum62LatestProbs`` (``log_transform=False``).

    NOTE(review): the function name says "blosum_50" but the matrix passed
    is ``blosum62LatestProbs`` -- confirm which one is intended.

    Returns:
        The object returned by ``align.load_params``.
    """
    records = sequence.readFastaFile("./files/simple_seqs/borodovsky.fasta")

    # Profiles are built in record order: index 0 first, then index 1.
    profiles = [aln_profile.AlignmentProfile([records[idx]]) for idx in (0, 1)]

    return align.load_params(
        params.borodovsky_4_7,
        profiles,
        sub_matrix.blosum62LatestProbs,
        log_transform=False,
    )
Example #2
0
def durbin_blosum_50_2():
    """Build the pair HMM fixture for the two-sequence Durbin example.

    Reads ``./files/simple_seqs/durbin_2.fasta``, wraps each of its first
    two records in its own single-sequence ``AlignmentProfile`` and passes
    both to ``align.load_params`` together with ``params.basic_params`` and
    ``sub_matrix.blosum50`` (``log_transform=True``).

    Returns:
        The object returned by ``align.load_params``.
    """
    records = sequence.readFastaFile("./files/simple_seqs/durbin_2.fasta")

    # Profiles are built in record order: index 0 first, then index 1.
    profiles = [aln_profile.AlignmentProfile([records[idx]]) for idx in (0, 1)]

    return align.load_params(
        params.basic_params,
        profiles,
        sub_matrix.blosum50,
        log_transform=True,
    )
Example #3
0
def probabilities_blosum_62_2():
    """Build the pair HMM fixture for the ``simple_2`` sequences.

    Reads ``./files/simple_seqs/simple_2.fasta``, wraps each of its first
    two records in its own single-sequence ``AlignmentProfile`` and passes
    both to ``align.load_params`` together with ``params.basic_params`` and
    ``sub_matrix.blosum62LatestProbs`` (``log_transform=True``).

    Returns:
        The object returned by ``align.load_params``.
    """
    records = sequence.readFastaFile("./files/simple_seqs/simple_2.fasta")

    # Profiles are built in record order: index 0 first, then index 1.
    profiles = [aln_profile.AlignmentProfile([records[idx]]) for idx in (0, 1)]

    return align.load_params(
        params.basic_params,
        profiles,
        sub_matrix.blosum62LatestProbs,
        log_transform=True,
    )
Example #4
0
def two_col_62_2():
    """Build the pair HMM fixture for the ``2_col`` custom sequences.

    Reads ``./files/custom_seqs/2_col.fasta``, wraps each of its first two
    records in its own single-sequence ``AlignmentProfile`` and passes both
    to ``align.load_params`` together with ``params.basic_params`` and
    ``sub_matrix.blosum62`` (``log_transform=True``).

    Returns:
        The object returned by ``align.load_params``.
    """
    records = sequence.readFastaFile("./files/custom_seqs/2_col.fasta")

    # Profiles are built in record order: index 0 first, then index 1.
    profiles = [aln_profile.AlignmentProfile([records[idx]]) for idx in (0, 1)]

    return align.load_params(
        params.basic_params,
        profiles,
        sub_matrix.blosum62,
        log_transform=True,
    )
Example #5
0
def ox_104t17_1():
    """Build the pair HMM fixture for the ``ox_104t17_1`` sequences.

    Reads ``./files/qscore_corrections/ox_104t17_1.fasta``, wraps each of
    its first two records in its own single-sequence ``AlignmentProfile``
    and passes both to ``align.load_params`` together with
    ``params.qscore_params`` and ``sub_matrix.blosum62EstimatedWithX``
    (``log_transform=True``).

    Returns:
        The object returned by ``align.load_params``.
    """
    fasta_path = "./files/qscore_corrections/ox_104t17_1.fasta"
    records = sequence.readFastaFile(fasta_path)

    # Profiles are built in record order: index 0 first, then index 1.
    profiles = [aln_profile.AlignmentProfile([records[idx]]) for idx in (0, 1)]

    return align.load_params(
        params.qscore_params,
        profiles,
        sub_matrix.blosum62EstimatedWithX,
        log_transform=True,
    )
Example #6
0
def test_profile_with_two_sequences():
    """Index 1 of a profile over 'RTAG' and '-TA-' must count 'T' twice."""
    two_seq_profile = aln_profile.AlignmentProfile(['RTAG', '-TA-'])
    second_column = two_seq_profile.profile[1]

    assert second_column['T'] == 2
Example #7
0
def test_profile_with_one_sequence():
    """Index 0 of a profile over the single sequence 'RTAG' counts 'R' once."""
    single_seq_profile = aln_profile.AlignmentProfile(['RTAG'])
    first_column = single_seq_profile.profile[0]

    assert first_column['R'] == 1
Example #8
0
def align_seqs(inpath,
               outpath,
               aln_type,
               params=parameters.basic_params,
               subsmat=sub_matrix.blosum62EstimatedWithX_dict,
               log_transform=True):
    """Progressively align the sequences of a FASTA file and write the result.

    The sequences are read from ``inpath``. With exactly two sequences they
    are aligned directly; otherwise a guide tree is computed and its
    alignment order is followed. At each node the children are turned into
    profiles (if they are not already) and merged pairwise with a pair HMM
    built by ``load_params``. The profile produced by the last pairwise
    alignment is written to ``outpath`` as ``str(profile)`` and returned.

    Args:
        inpath: Path of the FASTA file to read.
        outpath: Path of the file the final alignment is written to.
        aln_type: One of ``'viterbi'``, ``'poviterbi'``, ``'mea'`` or
            ``'pomea'``. The ``po`` variants run the same routine with
            ``po=True``.
        params: Parameter set forwarded to ``load_params``.
        subsmat: Substitution matrix forwarded to ``load_params``.
        log_transform: Flag forwarded to ``load_params``.

    Returns:
        The profile produced by the last pairwise alignment.

    Raises:
        ValueError: If ``aln_type`` is not one of the supported values, or
            if no pairwise alignment was performed (nothing to write).
    """
    # aln_type -> (value for get_alignment's type_to_get, value for po=).
    aln_modes = {
        'viterbi': ('viterbi', False),
        'poviterbi': ('viterbi', True),
        'mea': ('mea', False),
        'pomea': ('mea', True),
    }

    # Fail early: previously an unknown aln_type only surfaced as an
    # UnboundLocalError after the guide tree and the HMM had been built.
    if aln_type not in aln_modes:
        raise ValueError(
            f"Unknown aln_type {aln_type!r}; expected one of "
            f"{sorted(aln_modes)}")
    algorithm, use_po = aln_modes[aln_type]

    print("params are")
    print(params)

    # Read sequences in
    seqs = sequence.readFastaFile(inpath, alphabet=Protein_Alphabet_wB_X_Z)

    print(len(seqs))

    if len(seqs) == 2:
        # Two sequences need no guide tree: a single node joins them.
        aln_order = [("N0", [seqs[0].name, seqs[1].name])]
    else:
        # Calculate guide tree
        guide_tree = gt.get_guide_tree(seqs, random=False)
        print(guide_tree.ascii_art())

        # Get the alignment order
        aln_order = gt.get_aln_order(guide_tree)

    print(aln_order)

    # Maps a name to either a raw sequence or, once a node has been
    # aligned, the profile built for that node.
    seq_dict = {x.name: x for x in seqs}

    # Predecessors start off blank
    predecessors = [{}, {}]

    aligned_profile = None

    # Create alignment in order from guide tree
    for node in aln_order:

        # Get the current node name and list of sequences under that node
        curr_node = node[0]
        curr_seqs = node[1]

        # List to store the aligned sequences in
        aligned = []

        # While the node has sequences underneath yet to be aligned
        while curr_seqs:

            # Get a sequence (taken from the end of the list)
            seq = curr_seqs.pop()

            # Make it into a profile if it isn't one already
            entry = seq_dict[seq]
            if isinstance(entry, aln_profile.AlignmentProfile):
                profile = entry
            else:
                profile = aln_profile.AlignmentProfile([entry])

            # Add sequence to the aligned list
            aligned.append(profile)

            # If we have two profiles it is time to align
            if len(aligned) > 1:

                pair_hmm = load_params(params, aligned, subsmat, log_transform,
                                       predecessors)

                if algorithm == 'viterbi':
                    pair_hmm.performViterbiAlignment(po=use_po)
                else:
                    pair_hmm.performMEAAlignment(po=use_po)

                aligned_profile = pair_hmm.get_alignment(
                    type_to_get=algorithm)

                # Replace the two inputs with their merged profile
                aligned = [aligned_profile]

        seq_dict[curr_node] = aligned[0]

    # Check before opening outpath so a failed run does not leave an empty
    # or truncated output file behind.
    if aligned_profile is None:
        raise ValueError(
            f"No pairwise alignment was performed for {inpath!r}")

    with open(outpath, 'w') as outfile:
        outfile.write(str(aligned_profile))

    return aligned_profile
Example #9
0
def _load_pickle_or_default(path, default):
    """Return the object pickled at ``path``, or ``default`` if it is absent."""
    if not os.path.exists(path):
        return default
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # written by this benchmark itself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` and close the file handle."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def run_qscore(name,
               aln_type,
               parameters,
               specific_files=None,
               save=False,
               outpath=""):
    """Align every file of a benchmark set and score it with ``qscore``.

    For each file in ``./bench1.0/<name>/in/`` (except ``.DS_Store``) the
    sequences are aligned with ``align.align_seqs``, the alignment is written
    to ``./qscore_alignments/<aln_type>_<name>/<file>.aln`` and the external
    ``qscore`` command compares it with ``./bench1.0/<name>/ref/<file>``.
    The first four scores of each run are appended to a CSV file in
    ``./qscore_alignments/``.

    Files containing any residue listed in ``aa_skip`` are not aligned and
    are reported as failures, as are files for which ``qscore`` returned no
    usable scores.

    ``log_transform`` and ``aa_skip`` are not parameters; they are read from
    the enclosing scope, which is not visible in this block.

    NOTE(review): earlier results are loaded from ``<outpath><name>.p``,
    ``<outpath><name>_best.p`` and ``<outpath>time.p`` but saved under
    ``<outpath><aln_type>_...``, so a saved run is never read back by a
    later one. Left unchanged -- confirm which naming is intended.

    Args:
        name: Name of the benchmark directory under ``./bench1.0/``.
        aln_type: Alignment type forwarded to ``align.align_seqs``.
        parameters: Mapping with the keys ``tau``, ``epsilon``, ``delta``,
            ``emissionX`` and ``emissionY``; forwarded to
            ``align.align_seqs``.
        specific_files: Optional collection of file names. When given and
            non-empty, only those files are aligned and scored.
        save: When true, pickle the per-file results, the best results and
            the timing information under ``outpath``.
        outpath: Directory prefix for the pickle files. A trailing slash is
            added when missing; an empty string is used as is.

    Returns:
        A ``defaultdict`` mapping each non-skipped file name to a dict of
        ``score name -> score value`` (values are strings).
    """
    base_dir = "./bench1.0/" + name

    in_dir = base_dir + "/in/"
    ref_dir = base_dir + "/ref/"
    out_dir = "./qscore_alignments/" + aln_type + "_" + name

    qscore_dict = defaultdict(dict)

    files = os.listdir(in_dir)

    start_time = timeit.default_timer()
    # Stays None until at least one file has actually been processed.
    total_seconds = None

    dt_string = datetime.now().strftime("%Y/%m/%d_%H:%M")

    # Add trailing slash to output directory if it isn't there. The empty
    # default is left alone (indexing it used to raise IndexError).
    if outpath and not outpath.endswith("/"):
        outpath = outpath + "/"

    param_name = f"t={parameters['tau']}e={parameters['epsilon']}d={parameters['delta']}x={parameters['emissionX']}y={parameters['emissionY']}"

    output_file = "./qscore_alignments/" + aln_type + "_" + name + param_name + ".csv"

    curr_dict = _load_pickle_or_default(outpath + name + ".p",
                                        {param_name: {}})
    best_dict = _load_pickle_or_default(outpath + name + "_best.p", {})
    time_dict = _load_pickle_or_default(outpath + "time.p", {})

    failures = []

    # Create the alignment directory (and with it ./qscore_alignments/)
    # before the CSV inside it is opened.
    os.makedirs(out_dir, exist_ok=True)

    # newline="" is what the csv module expects for files it writes to.
    with open(output_file, 'w', newline="") as output:

        writer = csv.writer(output,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Tool', 'Dataset', 'Name', 'Q', 'TC', 'M', 'C'])

        for file in files:

            if file == ".DS_Store":
                continue

            seqs = sequence.readFastaFile(in_dir + file,
                                          alphabet=Protein_Alphabet_wB_X_Z)

            # Report every offending sequence, but record the file once.
            failed = False
            for seq in seqs:
                if any(skip in seq.sequence for skip in aa_skip):
                    print("failed on " + seq.name)
                    failed = True

            if failed:
                failures.append(file)
                continue

            qscore_dict[file] = defaultdict(dict)

            if not specific_files or file in specific_files:

                curr_dict.setdefault(param_name, {})

                print(file)
                print(parameters)

                aln_path = out_dir + "/" + file + ".aln"

                aligned_profile = align.align_seqs(
                    in_dir + file,
                    aln_path,
                    aln_type=aln_type,
                    params=parameters,
                    subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                    log_transform=log_transform)

                process = subprocess.Popen(
                    "qscore -test %s -ref %s -cline -modeler" %
                    (aln_path, ref_dir + file),
                    stderr=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    shell=True)

                out = process.communicate()[0]
                errcode = process.returncode

                print('running')
                print(errcode)

                # The first two ';'-separated fields are not scores.
                scores = [
                    x.strip()
                    for x in out.decode('utf-8').split(";")[2:]
                ]

                print(file)

                print('\nScores be')
                print(scores)

                for score in scores:
                    parts = score.split("=")
                    if len(parts) < 2:
                        # Not a "name=value" field; nothing to record.
                        continue
                    qscore_dict[file][parts[0].strip()] = parts[1].strip()

                curr_dict[param_name][file] = (scores, aligned_profile)

                update_best_dict(best_dict, file, scores, param_name)

                # A CSV row needs four well-formed "name=value" scores.
                if len(scores) >= 4 and all("=" in s for s in scores[:4]):
                    writer.writerow([
                        aln_type + "_" + param_name + "_log=" +
                        str(log_transform), name, file,
                        scores[0].split("=")[1],
                        scores[1].split("=")[1],
                        scores[2].split("=")[1],
                        scores[3].split("=")[1]
                    ])
                else:
                    failures.append(file)

                total_seconds = timeit.default_timer() - start_time

                if save:
                    _dump_pickle(curr_dict,
                                 outpath + aln_type + "_" + name + ".p")
                    _dump_pickle(best_dict,
                                 outpath + aln_type + "_" + name + "_best.p")

            # Timing is only meaningful (and only persisted) once a file has
            # been processed and the caller asked for results to be saved.
            if save and total_seconds is not None:
                if name not in time_dict or total_seconds < time_dict[name][0]:
                    time_dict[name] = (total_seconds, dt_string)
                    print("New best time - " +
                          utilities.format_time(total_seconds))

                _dump_pickle(time_dict,
                             outpath + aln_type + "_" + "time.p")

    print('These files failed ')
    print(failures)
    return qscore_dict
Example #10
0
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.92,
    'emissionY': 0.2
}

# Parameter set handed to align.align_seqs below via ``params=``.
change_params = {
    'tau': 0.002,
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.5,
    'emissionY': 0.5
}

# Builds one single-sequence profile per member of every unordered pair of
# ``seqs`` (``seqs`` is defined outside this excerpt). While the Baum-Welch
# call below stays commented out, ``profiles`` is never used.
for seq_order in list(itertools.combinations(seqs, 2)):
    profiles = [aln_profile.AlignmentProfile([x]) for x in seq_order]

    # change_params = bw.runBaumWelch(change_params, profiles, "viterbi")

# Align with aln_type='mea' using the parameters above.
# NOTE(review): ``seq`` is defined outside this excerpt; presumably it is the
# input FASTA path -- confirm.
alignment = align.align_seqs(seq,
                             "../../tests/files/custom_seqs/3_col.aln",
                             aln_type='mea',
                             params=change_params,
                             subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                             log_transform=True)

po_alignment = align.align_seqs(seq,
                                "../../tests/files/custom_seqs/3_col.aln",
                                aln_type='pomea',
                                params=change_params,
                                subsmat=sub_matrix.blosum62EstimatedWithX_dict,