コード例 #1
0
def test_AlignmentArray():
    '''
    Check the array built by AlignmentArray from the test FASTA file.

    The sequences read from tests/test_fasta.fasta are passed to
    UtilityFunctions.AlignmentArray and the result is compared with the
    expected rows, one character per element.
    '''
    sequences = UtilityFunctions.FastaToDict("tests/test_fasta.fasta")
    observed = UtilityFunctions.AlignmentArray(sequences.values())
    # one row per sequence, split into single characters
    expected = np.array([list(seq) for seq in ("ACGTT-", "AACT", "CTAG")])
    assert np.array_equal(expected, observed)
コード例 #2
0
def test_reverseComplementAlignment():
    '''
    Check reverseComplementAlignment against hand-computed expectations.

    The test FASTA file is converted to an alignment array, reverse
    complemented and compared with the expected rows, one character per
    element.
    '''
    sequences = UtilityFunctions.FastaToDict(
        "tests/test_data/test_fasta.fasta")
    alignment = UtilityFunctions.AlignmentArray(sequences.values())
    # expected reverse complement of each input row
    expected = np.array([list(seq) for seq in ("-AACGT", "AGTT", "CTAG")])
    observed = UtilityFunctions.reverseComplementAlignment(alignment)
    assert np.array_equal(observed, expected)
コード例 #3
0
def alignWithRef(currentD, reference_dict, pD, outdir):
    '''
    Aligns each cluster in currentD to every reference sequence and, for
    each pair whose consensus sequences meet the alignment criteria, writes
    a FASTA file containing the combined alignment.

    Output files are written to
    <outdir>/final_clusters/consensus_<cluster>_to_<reference>_ali.fasta
    where <reference> is the reference name up to its first space.  The
    records are named, in order, "*consensus_<cluster>", the names stored
    for the cluster and finally the reference name.

    Parameters
    ----------
    currentD: dict
        Dictionary where each value is a dictionary with the keys
        'consensus', 'alignment' and 'names' describing one cluster
    reference_dict: str
        Passed directly to UtilityFunctions.FastaToDict to load the
        reference sequences - presumably a path to a FASTA file, despite
        the name (TODO confirm)
    pD: dict
        Parameter dictionary, passed through to the Alignment functions
    outdir: str
        Path to the output directory - the final_clusters subdirectory is
        not created here, so it is assumed to exist already

    Returns
    -------
    None
    '''
    if not currentD:
        # nothing to align - the reference file is deliberately not read
        return

    # The reference sequences are the same for every cluster, so they are
    # loaded once rather than once per cluster
    candidates = UtilityFunctions.FastaToDict(reference_dict)

    for i in currentD:
        query_seq = currentD[i]['consensus']
        query_ali = currentD[i]['alignment']
        query_names = currentD[i]['names']
        q_matrix, nt_inds = Consensus.makeAlignmentMatrix(query_ali)
        for target_nam, target_seq in candidates.items():
            # work on a copy so that each reference starts from the matrix
            # built from the unmodified cluster alignment
            qm = copy.copy(q_matrix)
            target_ali = UtilityFunctions.AlignmentArray([target_seq])
            # Align the query and target consensus sequences
            result = Alignment.SWalign(query_seq, target_seq,
                                       pD, useSub=True)

            # Check if the consensus sequences are a good match
            is_match = Alignment.alignmentMeetsCriteria(result, query_seq,
                                                        target_seq, pD)
            if not is_match[0]:
                continue

            result['alignment'] = is_match[1]
            # get the full alignment for the two consensus sequences
            result = Alignment.getAlignmentFull(result,
                                                query_seq,
                                                target_seq,
                                                pD)

            # add the reference to the cluster alignment and rebuild the
            # consensus from the expanded matrix
            ali, matrix = Consensus.expandAlignment(result,
                                                    query_ali,
                                                    target_ali,
                                                    qm,
                                                    nt_inds)
            cons = Consensus.collapseAlignment(matrix, nt_inds)
            names = query_names + [target_nam]

            # align the original cluster consensus to the new consensus
            # so that it can be added to the alignment as well
            result2 = Alignment.SWalign(query_seq, cons, pD, useSub=True)
            is_match_2 = Alignment.alignmentMeetsCriteria(result2,
                                                          query_seq,
                                                          cons,
                                                          pD)
            # NOTE(review): is_match_2[0] is not checked here, unlike the
            # first alignment - confirm this is intentional
            result2['alignment'] = is_match_2[1]
            result2 = Alignment.getAlignmentFull(result2,
                                                 query_seq,
                                                 cons,
                                                 pD)
            a2 = UtilityFunctions.AlignmentArray([query_seq])
            ali, matrix = Consensus.expandAlignment(result2,
                                                    a2,
                                                    ali,
                                                    qm,
                                                    nt_inds)
            names = ["*consensus_%s" % i] + names
            path = "%s/final_clusters/consensus_%s_to_%s_ali.fasta" % (
                outdir, i, target_nam.split(" ")[0])
            # the context manager closes the file even if a write fails
            with open(path, "w") as out:
                for j, nam in enumerate(names):
                    out.write(">%s\n%s\n" % (nam, "".join(ali[j])))
コード例 #4
0
def main():
    '''
    Command line entry point: aligns a consensus sequence to an original
    sequence and prints a summary of the better of the two orientations.

    The first sequence in the --consensus file is aligned, both as given
    and reverse complemented, to the sequence named --orig_name in the
    --original file.  The orientation with the higher optimal alignment
    score is kept (the reverse complement is used if the scores are equal).
    Three values are printed: the third value returned by
    Alignment.getAlignmentLocal (presumably the identity), the aligned
    fraction of the consensus sequence and the aligned fraction of the
    original sequence.
    '''
    parser = configargparse.ArgumentParser(
        description='''Compare two sequences and calculate the % identity''')

    # (flag, dest, help) for the string options
    input_options = (
        ("--consensus", "consensus", "path to consensus sequence fasta file"),
        ("--original", "original", "path to original input file"),
        ("--orig_name", "orig_name", "original sequence name"),
    )
    for flag, dest, help_text in input_options:
        parser.add(flag, dest=dest, type=str, help=help_text)

    # (flag, dest, help, default) for the integer scoring options
    scoring_options = (
        ("--gap_open", "alignment_gap_open", "gap opening penalty", 5),
        ("--gap_extend", "alignment_gap_extend", "gap extension penalty", 2),
        ("--match_score", "alignment_match_score",
         "score for each match in SW algorithm", 5),
        ("--mismatch_score", "alignment_mismatch_score",
         "penalty for each mismatch in SW algorithm", -4),
    )
    for flag, dest, help_text, default in scoring_options:
        parser.add(flag, dest=dest, type=int, help=help_text,
                   default=default)

    args = parser.parse_args()

    consensus_fasta = UtilityFunctions.FastaToDict(args.consensus)
    original_fasta = UtilityFunctions.FastaToDict(args.original)

    # only the first consensus sequence in the file is used
    forward_seq = list(consensus_fasta.values())[0].upper()
    reverse_seq = UtilityFunctions.reverseComplement(forward_seq)
    orig_seq = original_fasta[args.orig_name].upper()

    # the alignment functions take their settings as a dictionary keyed
    # by the same names used as argument destinations
    pD = {dest: getattr(args, dest) for _, dest, _, _ in scoring_options}

    forward_hit = Alignment.SWalign(forward_seq, orig_seq, pD)
    reverse_hit = Alignment.SWalign(reverse_seq, orig_seq, pD)

    forward_is_better = (forward_hit['optimal_alignment_score'] >
                         reverse_hit['optimal_alignment_score'])
    if forward_is_better:
        best_hit, cons_seq = forward_hit, forward_seq
    else:
        best_hit, cons_seq = reverse_hit, reverse_seq

    _, _, ident = Alignment.getAlignmentLocal(best_hit, cons_seq, orig_seq,
                                              pD)
    cons_length = len(cons_seq)
    orig_length = len(orig_seq)
    cons_span = best_hit['query_end'] - best_hit['query_start']
    cons_perc_aligned = cons_span / cons_length
    orig_span = best_hit['target_end'] - best_hit['target_start']
    orig_perc_aligned = orig_span / orig_length

    print(ident, cons_perc_aligned, orig_perc_aligned)
コード例 #5
0
def runCandidates(Z, fasta_dict, seqdict, candfile, pD, outdir, rround,
                  currentD=None):
    '''
    Builds one cluster per reference (candidate) sequence.

    When the user supplies reference sequences, the first round of
    clustering only keeps contigs which align to these references (using
    the same minimum criteria as normal clustering).  In the second round
    the fragments which matched the references are used to find further
    fragments.  After this, clustering of consensus sequences carries on
    as normal.

    Parameters
    ----------
    Z: list
        List of two item tuples - an integer and the sequence name - for
        every sequence in the main input file
    fasta_dict: pyfaidx.Fasta
        pyfaidx indexed Fasta object for the main input file of contigs
    seqdict: dict
        Dictionary with sequence IDs as keys and empty dictionaries as
        values.  These later hold CIAlign logs, but CIAlign is not run at
        this stage when candidate sequences are used
    candfile: str
        Path to the file of reference sequences.  In round 2 the sequence
        names in this file must have the form "*consensus_<n>" where <n>
        is a key of currentD
    pD: dict
        Dictionary containing the initial parameters set by the user
    outdir: str
        Path to the directory in which to save all output files
    rround: int
        Which round of clustering this is (1 or 2) - determines how each
        cluster is seeded and where the output is saved
    currentD: dict
        Results of the previous round of clustering, only read in round 2
        to expand the consensus sequences from round 1

    Returns
    -------
    D: dict
        Dictionary keyed by cluster number (starting from 1) where each
        value holds the 'consensus', 'alignment', 'names' and 'seqdict'
        of the cluster built in this round
    '''
    # NOTE(review): this copy is replaced by the first value returned from
    # buildCluster and is never read - confirm whether buildCluster was
    # meant to receive it instead of Z
    remaining = copy.copy(Z)
    references = UtilityFunctions.FastaToDict(candfile)
    clusters = {}

    # clusters are numbered from 1 in the order the references are read
    for cluster_id, (ref_name, ref_seq) in enumerate(references.items(), 1):
        # seed the cluster with the reference name and sequence
        seed = {'name': ref_name, 'consensus': ref_seq}
        if rround == 1:
            # round 1: the reference is a single plain sequence rather
            # than a consensus, so it forms its own one row alignment
            seed['alignment'] = UtilityFunctions.AlignmentArray([ref_seq])
            seed['seqdict'] = seqdict
            seed['names'] = [ref_name]
        elif rround == 2:
            # round 2: the reference is a consensus from the previous
            # round, so carry over everything stored for that cluster
            previous = currentD[int(ref_name.replace("*consensus_", ""))]
            seed['alignment'] = previous['alignment']
            seed['seqdict'] = previous['seqdict']
            seed['names'] = previous['names']

        matrix, nt_inds = Consensus.makeAlignmentMatrix(seed['alignment'])
        seed['matrix'] = matrix
        seed['nt_inds'] = nt_inds

        remaining, built = AlignmentSW.buildCluster(Z, fasta_dict, seed, pD,
                                                    cluster_id,
                                                    candidate=True,
                                                    skip=True)

        clusters[cluster_id] = {
            'consensus': built['current_consensus'],
            'alignment': built['current_alignment'],
            'names': built['current_names'],
            'seqdict': built['seqdict'],
        }
        Alignment.writeFastas(clusters[cluster_id], cluster_id, rround,
                              outdir, candidates=True,
                              reference=references)
        # NOTE(review): the merge runs after every cluster in round 2,
        # not once at the end - confirm this is intended
        if rround == 2:
            Alignment.mergeFastas(2, len(clusters), outdir)
    return clusters
コード例 #6
0
def main():
    '''
    Command line entry point: generates pseudo fragmented sequences from
    the sequences in a FASTA file and writes them to a new FASTA file.

    Each input sequence is split into fragments with chopSequence, then
    addSNPs and addIndels (first typ='insertion', then typ='deletion') are
    applied to every fragment.  Each fragment is then reverse complemented
    or not, chosen with np.random.choice([True, False]).  The ranges used
    at each step come from the --min_* / --max_* command line options,
    which can also be provided through the file given as --config.

    The output file is only written once every fragment has been
    generated.
    '''
    parser = configargparse.ArgumentParser(
        description='''Generate pseudo fragmented transcripts with different \
            degress of variation and insertions / deletions''')
    parser.add_argument("--infile",
                        dest="infile",
                        type=str,
                        help="path to input fasta file")
    parser.add_argument("--outfile",
                        dest="outfile",
                        type=str,
                        help="path to output fasta file")
    parser.add_argument("--config",
                        dest="config",
                        type=str,
                        help='path to config file',
                        is_config_file=True)

    # Every simulation setting is a --min_<name> / --max_<name> pair which
    # differs only in its name, type and help text:
    # (name, type, description used in the help text)
    ranged_options = (
        ("fragment_length", int, "fragment length"),
        ("n_fragments", int, "number of fragments"),
        ("diversity", float, "proportion of diversity to introduce"),
        ("n_insertions", int, "number of insertions to introduce"),
        ("insertion_size", int, "size insertion to introduce"),
        ("n_deletions", int, "number of deletions to introduce"),
        ("deletion_size", int, "size deletion to introduce"),
    )
    for name, value_type, description in ranged_options:
        for bound, word in (("min", "minimum"), ("max", "maximum")):
            dest = "%s_%s" % (bound, name)
            parser.add_argument("--%s" % dest,
                                dest=dest,
                                type=value_type,
                                help="%s %s" % (word, description))

    args = parser.parse_args()

    F = UtilityFunctions.FastaToDict(args.infile)
    Fnew = dict()

    for nam, seq in F.items():
        # nucleotide frequencies are calculated once per input sequence
        # and shared by all of its fragments
        freqs = getNucFreqs(seq)
        fragments = chopSequence(seq, nam, args.min_n_fragments,
                                 args.max_n_fragments,
                                 args.min_fragment_length,
                                 args.max_fragment_length)
        for fnam, fragment in fragments.items():
            fragment = addSNPs(fragment, freqs, args.min_diversity,
                               args.max_diversity)
            fragment = addIndels(fragment,
                                 freqs,
                                 args.min_n_insertions,
                                 args.max_n_insertions,
                                 args.min_insertion_size,
                                 args.max_insertion_size,
                                 typ='insertion')
            fragment = addIndels(fragment,
                                 freqs,
                                 args.min_n_deletions,
                                 args.max_n_deletions,
                                 args.min_deletion_size,
                                 args.max_deletion_size,
                                 typ='deletion')

            # randomly flip the orientation of the fragment
            if np.random.choice([True, False]):
                fragment = UtilityFunctions.reverseComplement(fragment)
            Fnew[fnam] = fragment

    # the context manager closes the file even if a write fails
    with open(args.outfile, "w") as out:
        for key, val in Fnew.items():
            out.write(">%s\n%s\n" % (key, val))
コード例 #7
0
def test_FastaToDict():
    '''
    Check that FastaToDict maps each name in the test FASTA file to its
    sequence.
    '''
    expected = {"Seq_1": "ACGTT-", "Seq_2": "AACT", "Seq_3": "CTAG"}
    parsed = UtilityFunctions.FastaToDict("tests/test_fasta.fasta")
    assert parsed == expected