def test_multiple_seqs():
    """Check k-mer frequencies computed from a list of sequences."""
    # each case: (sequences, k, include_missing, expected frequency dict)
    dict_cases = [
        (["A", "A"], 1, False, {1: {"A": 1.0}}),
        (["A", "T"], 1, False, {1: {"A": 0.5, "T": 0.5}}),
        (["AA", "TT"], 2, False, {2: {"AA": 0.5, "TT": 0.5}}),
        (["A", "T"], 1, True, {1: {"A": 0.5, "T": 0.5, "G": 0.0, "C": 0.0}}),
    ]
    for seqs, k, include_missing, expected in dict_cases:
        observed = k_mer_frequencies(seqs, k, include_missing=include_missing)
        assert observed == expected

    # the same two-sequence input, requested as a vector
    as_vector = k_mer_frequencies(["A", "T"],
                                  1,
                                  include_missing=True,
                                  vector=True)
    assert np.array_equal(as_vector, np.array([0.5, 0.0, 0.0, 0.5]))
def test_vectorization():
    """Check the vector form of the 1-mer frequencies."""
    # the expected vectors are laid out alphabetically: A, C, G, T
    cases = [
        ("A", [1.0, 0.0, 0.0, 0.0]),
        ("T", [0.0, 0.0, 0.0, 1.0]),
        ("G", [0.0, 0.0, 1.0, 0.0]),
        ("C", [0.0, 1.0, 0.0, 0.0]),
        ("AT", [0.5, 0.0, 0.0, 0.5]),
    ]
    for seq, expected in cases:
        observed = k_mer_frequencies(seq,
                                     1,
                                     include_missing=True,
                                     vector=True)
        assert np.array_equal(observed, np.array(expected))
def test_multiple_k():
    """Check that several values of k can be requested in a single call."""
    one_mers = [1.0, 0.0, 0.0, 0.0]  # k = 1
    two_mers = [1.0] + [0.0] * 15  # k = 2
    expected_vector = np.array(one_mers + two_mers)

    # both orderings of k are expected to give the same vector
    for ks in ([1, 2], [2, 1]):
        observed = k_mer_frequencies("AA",
                                     ks,
                                     include_missing=True,
                                     vector=True)
        assert np.array_equal(observed, expected_vector)

    expected_dict = {1: {"A": 1.0}, 2: {"AA": 1.0}}
    assert k_mer_frequencies("AA", [1, 2],
                             include_missing=False) == expected_dict
def aa(filepath, mode, trans_table, length, stop_codon, output, verbose):
    """Translate a FASTA sequence, or generate a new amino acid sequence.

    When ``mode`` is ``"seq"`` the single record in ``filepath`` is translated
    with ``trans_table`` and printed, and nothing else happens. For any other
    mode, every record in ``filepath`` is read (translated when possible),
    the 1-mer frequencies of the concatenated sequences are computed, and the
    result is handed to ``amino_acid_seq`` together with ``length``.

    Args:
        filepath: Path to the FASTA file to read.
        mode: ``"seq"`` for a direct translation; anything else selects the
            frequency-based path.
        trans_table: Translation table passed to ``Seq.translate``.
        length: Forwarded to ``amino_acid_seq``.
        stop_codon: If truthy, ``"*"`` is appended to the generated sequence.
        output: If truthy, path of a FASTA file to write the result to.
        verbose: If truthy, the generated sequence is printed.
    """
    # translate the DNA seq, if using exact AA seq
    if mode == "seq":
        print(SeqIO.read(filepath, "fasta").seq.translate(table=trans_table))
        return

    # if using frequency mode
    seqs = []
    with open(filepath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            try:
                aa_seq = str(record.seq.translate(table=trans_table))
            except Bio.Data.CodonTable.TranslationError:
                # could not be translated; use the record's sequence as-is
                aa_seq = str(record.seq)
            # str.replace returns a new string, so the result must be kept;
            # otherwise "*" would still be counted in the frequencies below
            aa_seq = aa_seq.replace("*", "")
            seqs.append(aa_seq)

    aa_seq = amino_acid_seq(length, k_mer_frequencies("".join(seqs), 1))

    if stop_codon:
        aa_seq += "*"

    if output:
        with open(output, "w+") as output_handle:
            SeqIO.write(
                SeqRecord(Seq(aa_seq),
                          id="Optimized by Freqgen",
                          description=""), output_handle, "fasta")

    if verbose:
        print(aa_seq)
def test_amino_acid():
    """Check 1-mer frequencies for an amino acid sequence."""
    # six distinct residues, so each one is expected at the same frequency
    expected = {1: dict.fromkeys("EILNQT", 0.16666666666666666)}
    observed = k_mer_frequencies("INQTEL", 1, include_missing=False)
    assert observed == expected
def featurize(filepath, k, codon_usage, output):
    """Compute k-mer and/or codon frequencies for a FASTA file as YAML.

    Args:
        filepath: Path to the FASTA file to read.
        k: Passed to ``k_mer_frequencies``; if falsy, no k-mer frequencies
            are computed.
        codon_usage: If truthy, codon frequencies of the concatenated
            sequences are added under the ``"codons"`` key.
        output: If truthy, path of the file the YAML is written to;
            otherwise the YAML is printed.

    Raises:
        ValueError: If ``codon_usage`` is set and any sequence has a length
            that is not divisible by 3.
    """
    # get the sequences as strs
    with open(filepath, "r") as handle:
        seqs = [str(record.seq) for record in SeqIO.parse(handle, "fasta")]

    result = k_mer_frequencies(seqs, k, include_missing=True) if k else {}

    # get the codon usage frequencies
    if codon_usage:
        if any(len(seq) % 3 != 0 for seq in seqs):
            raise ValueError(
                "Cannot calculate codons for sequence whose length is not divisible by 3"
            )
        result["codons"] = codon_frequencies("".join(seqs))

    if output:
        # a context manager makes sure the file is flushed and closed
        with open(output, "w+") as output_handle:
            yaml.dump(result, output_handle, default_flow_style=False)
        return

    print(yaml.dump(result, default_flow_style=False))
def test_dna():
    """Check 3-mer frequencies for a short DNA sequence."""
    # "GATGATGGC" has seven 3-mers; GAT and ATG each occur twice
    seen_twice = 0.2857142857142857
    seen_once = 0.14285714285714285
    expected = {
        "ATG": seen_twice,
        "GAT": seen_twice,
        "GGC": seen_once,
        "TGA": seen_once,
        "TGG": seen_once,
    }
    observed = k_mer_frequencies("GATGATGGC", 3, include_missing=False)
    assert observed == expected
def aa(filepath, mode, genetic_code, length, stop_codon, output, verbose):
    """Produce an amino acid sequence from a FASTA file.

    In ``"seq"`` mode the single record in ``filepath`` is translated with
    ``genetic_code`` and its ``"*"`` symbols are removed. In ``"freq"`` mode
    every record is read (translated when possible), the sequences are
    joined, ``"*"`` is removed, and the 1-mer frequencies are handed to
    ``amino_acid_seq`` together with ``length``.

    Args:
        filepath: Path to the FASTA file to read.
        mode: Either ``"seq"`` or ``"freq"``.
        genetic_code: Translation table passed to ``Seq.translate``.
        length: Forwarded to ``amino_acid_seq``; required in ``"freq"`` mode.
        stop_codon: If truthy, ``"*"`` is appended to the result.
        output: If truthy, path of a FASTA file to write the result to.
        verbose: If truthy, the result is printed even when ``output`` is set.

    Raises:
        ValueError: If ``mode`` is neither ``"seq"`` nor ``"freq"``.
    """
    # translate the DNA seq, if using exact AA seq
    if mode == "seq":
        try:
            aa_seq = SeqIO.read(filepath,
                                "fasta").seq.translate(table=genetic_code)
        except Bio.Data.CodonTable.TranslationError:
            print(
                "Sequence is not able to be translated! Is it already an amino acid sequence?"
            )
            return
        aa_seq = str(aa_seq).replace("*", "")

    elif mode == "freq":
        # ensure we know how long the new sequence should be
        if not length:
            print("Must provide length parameter using -l INTEGER")
            return

        # extract the sequences from the reference set
        seqs = []
        with open(filepath, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                try:
                    # for DNA sequences, translate them
                    aa_seq = str(record.seq.translate(table=genetic_code))
                except Bio.Data.CodonTable.TranslationError:
                    # for amino acid sequences, just get the string
                    aa_seq = str(record.seq)
                seqs.append(aa_seq)

        # make them into one big sequence
        seqs = "".join(seqs).replace("*", "")

        # generate a new sequence of the right length
        aa_seq = amino_acid_seq(length, k_mer_frequencies(seqs, 1)[1])

    else:
        # fail before touching the output file; otherwise aa_seq would be
        # unbound further down
        raise ValueError("Unknown mode: " + repr(mode))

    # add a stop codon, if requested
    if stop_codon:
        aa_seq += "*"

    # output to the file
    if output:
        with open(output, "w+") as output_handle:
            if not isinstance(aa_seq, Seq):
                aa_seq = Seq(aa_seq)
            SeqIO.write(
                SeqRecord(aa_seq,
                          id="Generated by Freqgen from " + str(filepath),
                          description=""), output_handle, "fasta")

    if verbose or not output:
        print(aa_seq)
def test_include_missing():
    """Check that k-mers absent from the sequence are reported as zero."""
    # start with every possible 2-mer at zero, then fill in the observed ones
    expected = {first + second: 0 for first in "ACGT" for second in "ACGT"}
    expected.update({
        "AT": 0.25,
        "GA": 0.25,
        "TG": 0.25,
        "GC": 0.125,
        "GG": 0.125,
    })
    observed = k_mer_frequencies("GATGATGGC", 2, include_missing=True)
    assert observed == {2: expected}
def test_k_values(s):
    """Check output sizes for k = 1..5 and that invalid k values are rejected."""
    assume(len(s) > 6)

    # make sure that the lengths are right
    for k in range(1, 6):
        n_k_mers = 4**k
        as_dict = k_mer_frequencies(s, k, include_missing=True)
        assert len(as_dict[k]) == n_k_mers
        as_vector = k_mer_frequencies(s, k, include_missing=True, vector=True)
        assert len(as_vector) == n_k_mers

    # ensure invalid values of k raise an error
    for invalid_k in (0, -1):
        with pytest.raises(ValueError):
            k_mer_frequencies(s, invalid_k)
def featurize(filepath, k, codon_usage, trans_table):
    """Print k-mer (and optionally codon) frequencies of a FASTA file as YAML.

    One YAML document fragment is printed per value in ``k``; when
    ``codon_usage`` is truthy, the codon frequencies of the concatenated
    sequences (computed with ``trans_table``) are printed last.
    """
    # get the sequences as strs
    with open(filepath, "r") as handle:
        seqs = [str(record.seq) for record in SeqIO.parse(handle, "fasta")]

    # get the k-mer frequencies, one value of k at a time
    for k_value in k:
        frequencies = {
            k_value: k_mer_frequencies(seqs, k_value, include_missing=True)
        }
        print(yaml.dump(frequencies, default_flow_style=False), end="")

    # get the codon usage frequencies
    if codon_usage:
        codon_table = {
            "codons": codon_frequencies("".join(seqs), trans_table)
        }
        print(yaml.dump(codon_table, default_flow_style=False), end="")
def test_k_values():
    """Check output sizes for k = 1..5 and that invalid k values are rejected."""
    dna = "GATGATGGC"

    # make sure that the lengths are right
    for k in range(1, 6):
        n_k_mers = 4**k
        as_dict = k_mer_frequencies(dna, k, include_missing=True)
        assert len(as_dict) == n_k_mers
        as_vector = k_mer_frequencies(dna,
                                      k,
                                      include_missing=True,
                                      vector=True)
        assert len(as_vector) == n_k_mers

    # ensure invalid values of k raise an error
    for invalid_k in (0, -1):
        with pytest.raises(ValueError):
            k_mer_frequencies("GATTACA", invalid_k)
def test_invalid_args():
    """Check that unsupported argument combinations raise ValueError."""
    invalid_calls = (
        # vector output requested without include_missing
        lambda: k_mer_frequencies("A", 1, include_missing=False, vector=True),
        # empty sequence
        lambda: k_mer_frequencies("", 1),
    )
    for invalid_call in invalid_calls:
        with pytest.raises(ValueError):
            invalid_call()
def test_codon_frequencies(s):
    """Check that ``codons=True`` agrees with ``codon_frequencies``."""
    via_k_mers = k_mer_frequencies(s, 1, codons=True)["codons"]
    direct = codon_frequencies(s)
    assert via_k_mers == direct
def visualize(original, target, optimized, title, width, height, output,
              show, genetic_code):
    """Plot target frequencies against those of an optimized sequence.

    The target frequencies are loaded from a YAML file whose non-string keys
    are taken as k values and whose optional ``"codons"`` key holds codon
    frequencies. The same features are computed for the optimized sequence
    (and the original one, if given) and everything is handed to
    ``_visualize``.

    Args:
        original: Path to a FASTA file with the original sequence, or a falsy
            value to skip it (the value is then forwarded unchanged).
        target: Path to the YAML file with the target frequencies.
        optimized: Path to a FASTA file with the optimized sequence.
        title: Forwarded to ``_visualize``.
        width: Forwarded to ``_visualize`` as ``plot_width``.
        height: Forwarded to ``_visualize`` as ``plot_height``.
        output: Forwarded to ``_visualize`` as ``filepath``.
        show: Forwarded to ``_visualize``.
        genetic_code: Accepted but not used by this function.
    """

    def _values_in_key_order(mapping):
        # values of a dict, ordered by their sorted keys
        return [mapping[key] for key in sorted(mapping)]

    # close the target file as soon as it has been parsed
    with open(target) as target_handle:
        target = yaml.safe_load(target_handle)
    has_codons = "codons" in target

    # create a list of the k_mers
    k = sorted(_k for _k in target if not isinstance(_k, str))
    k_mers = [
        "".join(k_mer) for _k in k for k_mer in product("ACGT", repeat=_k)
    ]
    if has_codons:
        # codon labels get a "*" suffix, as in the original labelling
        k_mers.extend(codon + "*" for codon in sorted(target["codons"]))

    # generate the target vector
    target_vector = []
    for _k in k:
        target_vector.extend(_values_in_key_order(target[_k]))
    if has_codons:
        target_vector.extend(_values_in_key_order(target["codons"]))

    # compute the same features for the optimized sequence
    seq = SeqIO.read(optimized, "fasta").seq
    optimized = list(k_mer_frequencies(seq, k, vector=True)) if k else []
    if has_codons:
        optimized.extend(_values_in_key_order(codon_frequencies(seq)))

    assert len(optimized) == len(target_vector) == len(k_mers)  # sanity check

    # if the original sequence is given, calculate its k_mer_frequencies
    if original:
        original_seq = SeqIO.read(original, "fasta").seq
        if k:
            original = list(k_mer_frequencies(original_seq, k, vector=True))
        else:
            original = []
        if has_codons:
            original.extend(
                _values_in_key_order(codon_frequencies(original_seq)))

    if max(k, default=0) >= 3 or has_codons:
        click.secho(
            "Displaying a large number of k-mers and/or codons. To view the results of each k-mer, use the zoom tool in the top right of the graph to zoom in or set the width of the graph manually using --width. \nSuggested width: "
            + str(35 * len(k_mers)),
            fg="yellow",
        )
        click.pause()

    _visualize(
        k_mers,
        target_vector,
        optimized,
        original_freqs=original,
        title=title,
        plot_height=height,
        plot_width=width,
        filepath=output,
        codons=has_codons,
        show=show,
    )