def test_AlignmentArray():
    """Check AlignmentArray against the per-character rows of the fixture.

    The sequences read from ``tests/test_fasta.fasta`` are passed to
    ``UtilityFunctions.AlignmentArray`` and the result is compared with one
    list of single characters per sequence.
    """
    F = UtilityFunctions.FastaToDict("tests/test_fasta.fasta")
    ali = UtilityFunctions.AlignmentArray(F.values())
    # The expected rows have different lengths (6, 4, 4). Building a ragged
    # array needs an explicit dtype=object: NumPy >= 1.24 raises ValueError
    # otherwise, while older releases produced this same object array.
    comp = np.array(
        [
            ['A', 'C', 'G', 'T', 'T', "-"],
            ['A', 'A', 'C', 'T'],
            ['C', 'T', 'A', 'G'],
        ],
        dtype=object,
    )
    assert np.array_equal(comp, ali)
def test_reverseComplementAlignment():
    """Check reverseComplementAlignment on the fixture alignment.

    The fixture is converted with ``UtilityFunctions.AlignmentArray`` and then
    passed to ``UtilityFunctions.reverseComplementAlignment``; the result is
    compared with the expected per-character rows below.
    """
    # NOTE(review): confirm the fixture really lives under tests/test_data/.
    F = UtilityFunctions.FastaToDict("tests/test_data/test_fasta.fasta")
    ali = UtilityFunctions.AlignmentArray(F.values())
    # The expected rows have different lengths (6, 4, 4). Building a ragged
    # array needs an explicit dtype=object: NumPy >= 1.24 raises ValueError
    # otherwise, while older releases produced this same object array.
    rcomp = np.array(
        [
            ['-', 'A', 'A', 'C', 'G', 'T'],
            ['A', 'G', 'T', 'T'],
            ['C', 'T', 'A', 'G'],
        ],
        dtype=object,
    )
    assert np.array_equal(
        UtilityFunctions.reverseComplementAlignment(ali), rcomp)
def alignWithRef(currentD, reference_dict, pD, outdir):
    '''
    Align every cluster consensus to every reference sequence and write one
    FASTA alignment per accepted consensus / reference pair.

    For each pair, the consensus is aligned to the reference with
    Alignment.SWalign. If Alignment.alignmentMeetsCriteria accepts the
    result, the cluster alignment is expanded with the reference sequence,
    collapsed to a new consensus, aligned to the original consensus again
    and written to
    <outdir>/final_clusters/consensus_<cluster>_to_<reference>_ali.fasta
    where <reference> is the reference name up to its first space.

    Parameters
    ----------
    currentD: dict
        Maps a cluster identifier to a dictionary with (at least) the keys
        'consensus', 'alignment' and 'names'. If it is empty nothing is
        read or written.
    reference_dict: str
        Passed unchanged to UtilityFunctions.FastaToDict - presumably the
        path to the reference FASTA file (confirm against callers)
    pD: dict
        Parameter dictionary forwarded to the Alignment functions
    outdir: str
        Output directory; files are opened inside its "final_clusters"
        subdirectory, which is not created here

    Returns
    -------
    None
    '''
    if not currentD:
        # No clusters: as before, do not touch the reference file at all
        return

    # The references do not change between clusters, so parse them once
    # rather than once per cluster
    candidates = UtilityFunctions.FastaToDict(reference_dict)

    for i in currentD:
        query_seq = currentD[i]['consensus']
        query_ali = currentD[i]['alignment']
        query_names = currentD[i]['names']
        q_matrix, nt_inds = Consensus.makeAlignmentMatrix(query_ali)

        for target_nam, target_seq in candidates.items():
            # work on a copy so each reference starts from the same matrix
            qm = copy.copy(q_matrix)
            target_ali = UtilityFunctions.AlignmentArray([target_seq])

            # Align the query and target consensus sequences
            result = Alignment.SWalign(query_seq, target_seq, pD,
                                       useSub=True)

            # Check if the consensus sequences are a good match
            is_match = Alignment.alignmentMeetsCriteria(result, query_seq,
                                                        target_seq, pD)
            if not is_match[0]:
                continue

            result['alignment'] = is_match[1]
            # get the full alignment for the two consensus sequences
            result = Alignment.getAlignmentFull(result, query_seq,
                                                target_seq, pD)
            ali, matrix = Consensus.expandAlignment(result, query_ali,
                                                    target_ali, qm, nt_inds)
            cons = Consensus.collapseAlignment(matrix, nt_inds)
            names = query_names + [target_nam]

            # Align the original consensus to the expanded consensus so it
            # can be included in the output.
            # NOTE(review): the first element of is_match_2 is not checked
            # here, unlike is_match above - confirm this is intended.
            result2 = Alignment.SWalign(query_seq, cons, pD, useSub=True)
            is_match_2 = Alignment.alignmentMeetsCriteria(result2, query_seq,
                                                          cons, pD)
            result2['alignment'] = is_match_2[1]
            result2 = Alignment.getAlignmentFull(result2, query_seq, cons,
                                                 pD)
            a2 = UtilityFunctions.AlignmentArray([query_seq])
            ali, _ = Consensus.expandAlignment(result2, a2, ali, qm,
                                               nt_inds)
            names = ["*consensus_%s" % i] + names

            path = "%s/final_clusters/consensus_%s_to_%s_ali.fasta" % (
                outdir, i, target_nam.split(" ")[0])
            # the context manager closes the file even if a write fails
            with open(path, "w") as out:
                for j, nam in enumerate(names):
                    out.write(">%s\n%s\n" % (nam, "".join(ali[j])))
def main():
    """Align a consensus sequence to a named original sequence and report.

    The first sequence in the ``--consensus`` FASTA file is aligned, in both
    its forward and reverse-complement orientation, to the ``--orig_name``
    record of the ``--original`` FASTA file using ``Alignment.SWalign``.
    The forward orientation is kept only when its optimal alignment score is
    strictly higher; otherwise the reverse complement is used.

    Prints three values: the identity value returned by
    ``Alignment.getAlignmentLocal``, then the aligned span of the consensus
    and of the original, each divided by the respective sequence length.
    """
    parser = configargparse.ArgumentParser(
        description="Compare two sequences and calculate the % identity")

    # (flag, dest, help) for the string-valued options
    text_options = (
        ("--consensus", "consensus",
         "path to consensus sequence fasta file"),
        ("--original", "original", "path to original input file"),
        ("--orig_name", "orig_name", "original sequence name"),
    )
    # (flag, dest, help, default) for the integer scoring options
    score_options = (
        ("--gap_open", "alignment_gap_open", "gap opening penalty", 5),
        ("--gap_extend", "alignment_gap_extend",
         "gap extension penalty", 2),
        ("--match_score", "alignment_match_score",
         "score for each match in SW algorithm", 5),
        ("--mismatch_score", "alignment_mismatch_score",
         "penalty for each mismatch in SW algorithm", -4),
    )
    for flag, dest, help_text in text_options:
        parser.add(flag, dest=dest, type=str, help=help_text)
    for flag, dest, help_text, default in score_options:
        parser.add(flag, dest=dest, type=int, help=help_text,
                   default=default)
    args = parser.parse_args()

    consensus_records = UtilityFunctions.FastaToDict(args.consensus)
    original_records = UtilityFunctions.FastaToDict(args.original)

    # only the first consensus record is compared
    forward = list(consensus_records.values())[0].upper()
    reverse = UtilityFunctions.reverseComplement(forward)
    orig_seq = original_records[args.orig_name].upper()

    # scoring parameters, keyed by the names the Alignment module expects
    pD = {dest: getattr(args, dest) for _, dest, _, _ in score_options}

    forward_result = Alignment.SWalign(forward, orig_seq, pD)
    reverse_result = Alignment.SWalign(reverse, orig_seq, pD)

    # ties go to the reverse complement, as the comparison is strict
    forward_is_better = (forward_result['optimal_alignment_score'] >
                         reverse_result['optimal_alignment_score'])
    if forward_is_better:
        best, cons_seq = forward_result, forward
    else:
        best, cons_seq = reverse_result, reverse

    _, _, ident = Alignment.getAlignmentLocal(best, cons_seq, orig_seq, pD)

    cons_length = len(cons_seq)
    orig_length = len(orig_seq)
    cons_span = best['query_end'] - best['query_start']
    cons_perc_aligned = cons_span / cons_length
    orig_span = best['target_end'] - best['target_start']
    orig_perc_aligned = orig_span / orig_length
    print(ident, cons_perc_aligned, orig_perc_aligned)
def runCandidates(Z, fasta_dict, seqdict, candfile, pD, outdir, rround,
                  currentD=None):
    '''
    Allows the user to specify a set of reference sequences - in the first
    round of clustering, only contigs which align to these references
    (meeting the same minimum criteria as for normal clustering) are
    selected. In the second round, the fragments identified in the query
    file which matched the reference sequence are used to identify further
    fragments. From this point clustering of consensus sequences continues
    as normal.

    Parameters
    ----------
    Z: list
        List of two item tuples where the first element is an integer and
        the second the sequence name for all sequences in the main input
        file
    fasta_dict: pyfaidx.Fasta
        pyfaidx indexed Fasta object containing the main input file of
        contigs
    seqdict: dict
        A dictionary where keys are sequence IDs and values are empty
        dictionaries - these are used to store CIAlign logs for sequences
        later but it is not run at this stage when candidate sequences
        are used
    candfile: str
        Path to a file containing the reference sequences to match the
        contigs to in the first round of clustering
    pD: dict
        Dictionary containing the initial parameters set by the user
    outdir: str
        Path to directory in which to save all output files
    rround: int
        Round number - which round of clustering is this - used to
        determine where to save the output. Only rounds 1 and 2 are
        handled here.
    currentD: dict
        Dictionary containing the results of previous rounds of clustering
        used in this case to expand consensus sequences from round 1.
        Only read when rround is 2, where the sequence names in candfile
        are expected to look like "*consensus_<n>" with <n> a key of
        currentD.

    Returns
    -------
    D: dict
        Results of this round of clustering: one entry per reference
        sequence, keyed 1, 2, ... in the order the references are read,
        each with the keys 'consensus', 'alignment', 'names' and
        'seqdict'
    '''
    candidates = UtilityFunctions.FastaToDict(candfile)
    D = dict()

    # iterate through the reference sequences, numbering clusters from 1
    for k, (c_nam, c_seq) in enumerate(candidates.items(), start=1):
        # store candidate name and sequence in the results dictionary
        current = {'name': c_nam, 'consensus': c_seq}

        if rround == 1:
            # in the first round, none of the input sequences are
            # consensus sequences
            current['alignment'] = UtilityFunctions.AlignmentArray([c_seq])
            current['seqdict'] = seqdict
            current['names'] = [c_nam]
        elif rround == 2:
            # in the second round, expand the consensus sequences based
            # on the output of the previous round
            consn = int(c_nam.replace("*consensus_", ""))
            previous = currentD[consn]
            current['alignment'] = previous['alignment']
            current['seqdict'] = previous['seqdict']
            current['names'] = previous['names']

        current['matrix'], current['nt_inds'] = \
            Consensus.makeAlignmentMatrix(current['alignment'])

        # Every reference is clustered against the full list Z; the list
        # returned as the first element is not used.
        # NOTE(review): an unused copy of Z used to be kept alongside this
        # call - confirm that the returned list was never meant to be fed
        # into the next iteration.
        _, C = AlignmentSW.buildCluster(Z, fasta_dict, current, pD, k,
                                        candidate=True, skip=True)

        D[k] = {
            'consensus': C['current_consensus'],
            'alignment': C['current_alignment'],
            'names': C['current_names'],
            'seqdict': C['seqdict'],
        }
        Alignment.writeFastas(D[k], k, rround, outdir, candidates=True,
                              reference=candidates)

    if rround == 2:
        Alignment.mergeFastas(2, len(D), outdir)
    return D
def main():
    """Generate pseudo-fragmented, mutated transcripts from a FASTA file.

    Every sequence in ``--infile`` is cut into fragments with
    ``chopSequence``. Each fragment is then passed through ``addSNPs``,
    ``addIndels`` (insertions, then deletions) and, chosen at random with
    ``np.random.choice``, ``UtilityFunctions.reverseComplement``. All
    resulting fragments are written to ``--outfile`` in FASTA format.

    Options can be given on the command line or in the file named by
    ``--config``. None of the options declares a default, so an option that
    is not supplied reaches the helper functions as ``None``.
    """
    # The description is a single clean string: the original literal carried
    # a stray backslash from a line continuation and the typo "degress".
    parser = configargparse.ArgumentParser(
        description="Generate pseudo fragmented transcripts with different "
                    "degrees of variation and insertions / deletions")
    parser.add_argument("--infile", dest="infile", type=str,
                        help="path to input fasta file")
    parser.add_argument("--outfile", dest="outfile", type=str,
                        help="path to output fasta file")
    parser.add_argument("--config", dest="config", type=str,
                        help='path to config file', is_config_file=True)
    parser.add_argument("--min_fragment_length", dest="min_fragment_length",
                        type=int, help="minimum fragment length")
    parser.add_argument("--max_fragment_length", dest="max_fragment_length",
                        type=int, help="maximum fragment length")
    parser.add_argument("--min_n_fragments", dest="min_n_fragments",
                        type=int, help="minimum number of fragments")
    parser.add_argument("--max_n_fragments", dest="max_n_fragments",
                        type=int, help="maximum number of fragments")
    parser.add_argument('--min_diversity', dest='min_diversity', type=float,
                        help='minimum proportion of diversity to introduce')
    parser.add_argument('--max_diversity', dest='max_diversity', type=float,
                        help='maximum proportion of diversity to introduce')
    parser.add_argument("--min_n_insertions", dest='min_n_insertions',
                        type=int,
                        help="minimum number of insertions to introduce")
    parser.add_argument("--max_n_insertions", dest='max_n_insertions',
                        type=int,
                        help="maximum number of insertions to introduce")
    parser.add_argument("--min_insertion_size", dest="min_insertion_size",
                        type=int, help='minimum size insertion to introduce')
    parser.add_argument("--max_insertion_size", dest="max_insertion_size",
                        type=int, help='maximum size insertion to introduce')
    parser.add_argument("--min_n_deletions", dest='min_n_deletions',
                        type=int,
                        help="minimum number of deletions to introduce")
    parser.add_argument("--max_n_deletions", dest='max_n_deletions',
                        type=int,
                        help="maximum number of deletions to introduce")
    parser.add_argument("--min_deletion_size", dest="min_deletion_size",
                        type=int, help='minimum size deletion to introduce')
    parser.add_argument("--max_deletion_size", dest="max_deletion_size",
                        type=int, help='maximum size deletion to introduce')
    args = parser.parse_args()

    F = UtilityFunctions.FastaToDict(args.infile)
    Fnew = dict()
    for nam, seq in F.items():
        # nucleotide frequencies of the whole sequence, passed to the
        # helpers that introduce SNPs and indels
        freqs = getNucFreqs(seq)
        fragments = chopSequence(seq, nam,
                                 args.min_n_fragments,
                                 args.max_n_fragments,
                                 args.min_fragment_length,
                                 args.max_fragment_length)
        for fnam, fragment in fragments.items():
            fragment = addSNPs(fragment, freqs,
                               args.min_diversity, args.max_diversity)
            fragment = addIndels(fragment, freqs,
                                 args.min_n_insertions,
                                 args.max_n_insertions,
                                 args.min_insertion_size,
                                 args.max_insertion_size,
                                 typ='insertion')
            fragment = addIndels(fragment, freqs,
                                 args.min_n_deletions,
                                 args.max_n_deletions,
                                 args.min_deletion_size,
                                 args.max_deletion_size,
                                 typ='deletion')
            # reverse complement a randomly chosen subset of the fragments
            rc = np.random.choice([True, False])
            if rc:
                fragment = UtilityFunctions.reverseComplement(fragment)
            Fnew[fnam] = fragment

    # the context manager closes the file even if a write fails
    with open(args.outfile, "w") as out:
        for key, val in Fnew.items():
            out.write(">%s\n%s\n" % (key, val))
def test_FastaToDict():
    """FastaToDict should map each fixture record name to its sequence."""
    parsed = UtilityFunctions.FastaToDict("tests/test_fasta.fasta")
    expected = {
        "Seq_1": "ACGTT-",
        "Seq_2": "AACT",
        "Seq_3": "CTAG",
    }
    assert parsed == expected