def design(inp, out, args):
    """Run a library designer over the sequences in a FASTA file.

    Args:
        inp: Input FASTA path, passed to ``ft.read_fasta_lists``.
        out: Output FASTA path, passed to ``ft.write_fasta``.
        args: Options object providing ``quiet``, ``gap_span``,
            ``window_size`` and ``step_size``.

    Returns:
        The number of uniquely named library entries written to ``out``.
    """
    names, sequences = ft.read_fasta_lists(inp)
    seqs = [Sequence(name=n, sequence=s) for n, s in zip(names, sequences)]
    if not args.quiet:
        print("Number of input sequences: ", len(seqs))

    # Both designer classes take the same constructor arguments; only the
    # class itself depends on the gap_span flag.
    designer_cls = (GapSpanningLibraryDesigner
                    if args.gap_span else LibraryDesigner)
    designer = designer_cls(window_size=args.window_size,
                            step_size=args.step_size)
    library = designer.design(seqs)
    if not args.quiet:
        print("Number of output Kmers: ", len(library))

    # Index by name (a later entry replaces an earlier one with the same
    # name) and write the entries in sorted-name order.
    by_name = {}
    for entry in library:
        by_name[entry.name] = entry.sequence
    ordered = sorted(by_name)
    ft.write_fasta(ordered, [by_name[key] for key in ordered], out)
    return len(ordered)
def main():
    """Write randomly downsampled copies of a FASTA file.

    For every requested dataset size and every replicate, a random selection
    of records from the input is written to
    ``<output>/<input basename>_n<size>-<replicate>.fasta``.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '-r', '--reps',
        help="Number of replicate datasets to generate for each dataset size.",
        default=1, type=int)

    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i', '--input',
        help="Fasta file containing the protein sequence to downsample.",
        required=True)
    reqArgs.add_argument(
        '-n', '--num',
        help="Size(s) of downsampled datasets. Can be a comma-delimited list of integers",
        required=True)
    reqArgs.add_argument(
        '-o', '--output',
        help="Directory name for output files. Will be created, if it doesn't already exist",
        required=True)

    args = arg_parser.parse_args()

    # Generate output directory; makedirs also creates missing parents.
    if os.path.isdir(args.output):
        print("Warning: %s already exists!" % (args.output))
    else:
        os.makedirs(args.output)

    # Read in fasta file to downsample
    names, seqs = ft.read_fasta_lists(args.input)

    # Extract file basename. splitext keeps the whole name when the input
    # has no extension (splitting on "." would leave an empty basename).
    bName = os.path.splitext(os.path.basename(args.input))[0]

    # Step through each dataset size
    sizes = [int(x) for x in args.num.split(",")]
    for s in sizes:
        for rep in range(args.reps):
            # NOTE(review): random.choices samples WITH replacement, so a
            # record can appear more than once in a dataset. Confirm this is
            # intended; random.sample would give distinct records.
            indexes = random.choices(range(len(names)), k=s)
            out_path = os.path.join(
                args.output, "%s_n%04d-%03d.fasta" % (bName, s, rep))
            ft.write_fasta([names[i] for i in indexes],
                           [seqs[i] for i in indexes],
                           out_path)
def gene_translator(genes_filename, output_filename):
    """Translate every record of a nucleotide FASTA file codon by codon.

    Each sequence is read in steps of three starting at offset 0, every
    complete triplet is looked up in the module-level ``genecode`` table, and
    the result is written with trailing ``'_'`` characters removed.

    Args:
        genes_filename: Path of the FASTA file to read.
        output_filename: Path of the FASTA file to write.
    """
    with open(genes_filename, 'r') as source:
        records = ft.fasta_list(source)
        # The output is opened only after the records have been read.
        with open(output_filename, 'w') as sink:
            for record in records:
                header, nucleotides = record[0], record[1]
                triplets = (nucleotides[pos:pos + 3]
                            for pos in range(0, len(nucleotides), 3))
                # An incomplete trailing codon is skipped.
                protein = ''.join(genecode[triplet]
                                  for triplet in triplets
                                  if len(triplet) == 3)
                ft.write_fasta(sink, header, protein.rstrip('_'))
def gene_translator_frame(genes_filename, output_filename):
    """Translate only the start-to-stop regions of each FASTA record.

    Each sequence is read in steps of three starting at offset 0.
    Translation is switched on by an ``ATG`` or ``GTG`` codon and switched
    off by ``TAA``, ``TAG`` or ``TGA``. The codon that opens a region is
    always translated as ``ATG``; the stop codon is not translated. Codons
    are looked up in the module-level ``genecode`` table.

    Args:
        genes_filename: Path of the FASTA file to read.
        output_filename: Path of the FASTA file to write.
    """
    start_codons = ('ATG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')

    with open(genes_filename, 'r') as f:
        list_seq = ft.fasta_list(f)
        with open(output_filename, 'w') as fw:
            for seq_tuple in list_seq:
                nucleotides = seq_tuple[1]
                # Reset for every record so that a region left open at the
                # end of one sequence does not carry over into the next one.
                write_switch = False
                residues = []
                for i in range(0, len(nucleotides), 3):
                    code = nucleotides[i:i + 3]
                    if code in start_codons:
                        if not write_switch:
                            # Whichever start codon opens the region is
                            # translated as ATG.
                            code = 'ATG'
                            write_switch = True
                    elif code in stop_codons:
                        write_switch = False
                    # An incomplete trailing codon is skipped.
                    if write_switch and len(code) == 3:
                        residues.append(genecode[code])
                ft.write_fasta(fw, seq_tuple[0], ''.join(residues))
def orf(infile, outfile):
    """Scan FASTA sequences for ATG-to-stop stretches in six frames.

    Frames 0-2 are read from the sequence itself at offsets 0, 1 and 2;
    frames 3-5 are read from the reversed result of ``cdna(v)`` at the same
    offsets. Every stretch found is written to ``outfile`` with a header
    built from the record header and the recorded start/stop positions.

    Args:
        infile: Path of the FASTA file to read.
        outfile: Path of the FASTA file to write.
    """
    def check_codon(index, codon):
        # Feed one codon into frame `index`. Reads `i` and `v` from the
        # enclosing scope and updates `seq[index]` / `semaphores[index]`.
        if codon == 'ATG' and not semaphores[index]:
            # Open a stretch: '*' separator followed by a '+position+' marker.
            start = ''
            # NOTE(review): frames 3-5 are fed from the reversed cdna(v), but
            # index 3 takes this `index < 4` branch — confirm that is intended.
            if index < 4:
                start = '+' + str(i + 1 + index) + '+'
            else:
                start = '+c' + str(len(v) - i - index + 3) + '+'
            seq[index] += '*' + start
            semaphores[index] = True
        elif codon in ['TAA', 'TAG', 'TGA'] and semaphores[index]:
            # Close the stretch: stop codon, '+position+' marker, '*' separator.
            stop = ''
            if index < 4:
                stop = '+' + str(i + 3 + index) + '+'
            else:
                stop = '+' + str(len(v) - i + 1 - index) + '+'
            seq[index] += codon + stop + '*'
            semaphores[index] = False
        # While a stretch is open (including the ATG that just opened it),
        # the codon is appended to the frame's buffer.
        if semaphores[index]:
            seq[index] += codon

    with open(outfile, 'w') as fw:
        with open(infile, 'r') as f:
            # One open/closed flag and one text buffer per frame.
            # NOTE(review): these are created once, before the record loop,
            # and are not reset between records — verify behaviour on
            # multi-record input.
            semaphores = [False for _ in range(6)]
            seq = ['' for _ in range(6)]
            for head, v in bio.fasta_list2(f):
                # `cdna` is defined elsewhere; presumably it returns the
                # complementary strand — TODO confirm.
                cdna_codon = cdna(v)[::-1]
                for i in range(0, len(v), 3):
                    check_codon(0, v[i:i + 3])
                    check_codon(1, v[i + 1:i + 4])
                    check_codon(2, v[i + 2:i + 5])
                    check_codon(3, cdna_codon[i:i + 3])
                    check_codon(4, cdna_codon[i + 1:i + 4])
                    check_codon(5, cdna_codon[i + 2:i + 5])
                # Split the buffers on '*'. A fragment ending in '+' was
                # closed by a stop codon; split on '+', its fields are
                # ['', start, sequence, stop, ''].
                for i in set("".join(seq).split('*')):
                    if len(i) > 6 and i[-1:] == '+':
                        i = i.split('+')
                        # Header: the record header without its last '|'
                        # field, followed by '|:<start>-<stop>'.
                        bio.write_fasta(
                            fw, "|".join(head.split('|')[:-1]) + '|:' + i[1] + '-' + i[3], i[2])
def main():
    """Copy a FASTA file, keeping only the first record of each sequence.

    Records are compared by sequence only; the name of the first record seen
    with a given sequence is the one that is kept, and input order is
    preserved.
    """
    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would put this sentence
    # in the usage line as the program name.
    argparser = argparse.ArgumentParser(
        description="Output a FASTA containing only unique sequences.")
    argparser.add_argument('-i', '--input', help="Name of input file.")
    argparser.add_argument(
        '-o', '--output',
        help="Name of output file. This file will contain the "
             "same sequences as the input file, but duplicates will not be included.")

    args = argparser.parse_args()

    in_names, in_seqs = fastatools.read_fasta_lists(args.input)

    out_names = []
    out_seqs = []
    seen_seqs = set()
    for name, seq in zip(in_names, in_seqs):
        if seq in seen_seqs:
            continue
        seen_seqs.add(seq)
        out_names.append(name)
        out_seqs.append(seq)

    fastatools.write_fasta(out_names, out_seqs, args.output)
def main():
    """Collapse records that share both name and sequence in FASTA files.

    For each input file, records are grouped by name. A name whose records
    all carry the same sequence is written once. A name with differing
    sequences is printed and all of its records are kept. The result is
    written next to the input file, with ``<prepend>_`` added in front of
    the file name.
    """
    # Imported locally so this entry point does not depend on the module
    # header for path handling.
    import os

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("fastas", help="Fasta files to dedup", nargs='+')
    parser.add_argument(
        "--prepend", default="dedup",
        help="String to add to the beginning of deduped files.")

    args = parser.parse_args()

    for each in args.fastas:
        # Read in fasta file
        names, seqs = ft.read_fasta_lists(each)

        # Convert to dictionary with keys = names, and values = lists of seqs
        fD = defaultdict(list)
        for n, s in zip(names, seqs):
            fD[n].append(s)

        newN = []
        newS = []
        for n, sL in fD.items():
            if len(set(sL)) == 1:
                newN.append(n)
                newS.append(sL[0])
            else:
                # Same name, different sequences: report it and keep them all.
                print(n)
                newN.extend([n] * len(sL))
                newS.extend(sL)

        # Put the prefix on the file name only. Prefixing the whole path
        # would turn "data/x.fasta" into "dedup_data/x.fasta".
        directory, filename = os.path.split(each)
        out_path = os.path.join(directory, "%s_%s" % (args.prepend, filename))
        ft.write_fasta(newN, newS, out_path)
def main():
    """Generate randomly mutated copies of protein sequences.

    For every divergence level and replicate, ``--num`` mutated copies of
    each input sequence are written to
    ``<output>_d<divergence>_r<replicate>.fasta``. Each copy differs from
    its input sequence at ``int(divergence * length)`` distinct positions.
    """
    arg_parser = argparse.ArgumentParser(
        description="Mutate input sequences to generate diverse datasets.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '-n', '--num',
        help="Number of mutated sequences to output per input sequence.",
        default=30, type=int)
    arg_parser.add_argument(
        '-r', '--reps',
        help="Number of replicate datasets to generate for each level of divergence.",
        default=1, type=int)

    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i', '--input',
        help="Fasta file containing the protein sequence(s) to mutate.",
        required=True)
    reqArgs.add_argument('-o', '--output',
                         help="Base name for output fasta files.",
                         required=True)
    reqArgs.add_argument(
        '-d', '--diverg',
        help="Level of divergence from the input sequence. Should be between 0 and 1. Can include multiple comma-delimited values.",
        required=True)

    args = arg_parser.parse_args()

    # Possible amino acids
    AAs = [
        'A', 'C', "D", "E", 'F', "G", "H", 'I', "K", 'L', 'M', 'N', "P", "Q",
        "R", "S", "T", 'V', 'W', 'Y'
    ]

    # Parse target divergences
    divergs = [float(d) for d in args.diverg.split(",")]

    # Read in input seqs
    iD = ft.read_fasta_dict_upper(args.input)

    for d in divergs:
        for r in range(1, args.reps + 1):
            outN = []
            outS = []
            for n, s in iD.items():
                # Clamp so out-of-range divergences cannot request more
                # positions than exist (or a negative number of them).
                muts = max(0, min(len(s), int(d * len(s))))
                for c in range(1, args.num + 1):
                    # Start every copy from the unmodified input so that the
                    # divergence is measured against the input instead of
                    # accumulating from one copy to the next.
                    residues = list(s)
                    # Sample without replacement: every chosen position is
                    # distinct, so exactly `muts` positions change.
                    for site in random.sample(range(len(s)), muts):
                        current = residues[site]
                        # Filtering also handles residues outside the
                        # 20-letter alphabet (e.g. 'X'), which list.remove
                        # would reject with ValueError.
                        residues[site] = random.choice(
                            [aa for aa in AAs if aa != current])
                    outS.append("".join(residues))
                    outN.append("%s_d%.3f_%03d" % (n, d, c))

            ft.write_fasta(outN, outS,
                           "%s_d%.3f_r%03d.fasta" % (args.output, d, r))
def design(inp, out, args):
    """Design peptides from a tiled representative plus greedy additions.

    Steps:
      1. Count every x-mer (free of ``args.exSet`` characters) across the
         target sequences.
      2. Pick the target whose x-mers have the highest summed count, tile it
         with ``LibraryDesigner`` and drop the x-mers those peptides contain.
      3. Repeatedly ask ``choosePep`` for another y-mer until the fraction of
         covered x-mers reaches ``args.target``.

    Args:
        inp: Input FASTA path, passed to ``ft.read_fasta_lists``.
        out: Output FASTA path, passed to ``ft.write_fasta``.
        args: Options object providing ``xMerSize``, ``yMerSize``,
            ``step_size``, ``exSet`` and ``target``.

    Returns:
        The total number of peptides written.
    """
    def is_allowed(kmer):
        # True when the k-mer contains none of the excluded characters.
        return not set(kmer).intersection(args.exSet)

    def drop_covered(peptide):
        # Forget every x-mer that `peptide` contains.
        for xmer in kt.kmerList(peptide, args.xMerSize):
            xmer_counts.pop(xmer, None)

    # Generate dict with xmer counts
    xmer_counts = {}
    target_names, target_seqs = ft.read_fasta_lists(inp)
    for target in target_seqs:
        for xmer in kt.kmerList(target, args.xMerSize):
            if is_allowed(xmer):
                xmer_counts[xmer] = xmer_counts.get(xmer, 0) + 1

    # Number of distinct x-mers before anything is covered.
    n_xmers = len(xmer_counts)

    # Score each target by the summed counts of its x-mers; the first target
    # with the strictly highest score becomes the representative.
    best_score, best_seq, best_name = 0, "", ""
    for idx, target in enumerate(target_seqs):
        score = sum(xmer_counts[xmer]
                    for xmer in kt.kmerList(target, args.xMerSize)
                    if xmer in xmer_counts)
        if score > best_score:
            best_score, best_seq, best_name = score, target, target_names[idx]

    # Tile the representative with a sliding window.
    representative = [Sequence(name=best_name, sequence=best_seq)]
    tiler = LibraryDesigner(window_size=args.yMerSize,
                            step_size=args.step_size)
    tiled_by_name = {}
    for peptide in tiler.design(representative):
        tiled_by_name[peptide.name] = peptide.sequence
    rep_names = sorted(tiled_by_name)
    rep_seqs = [tiled_by_name[key] for key in rep_names]

    # Remove xmers covered by the sliding window peptides
    for tiled_seq in rep_seqs:
        drop_covered(tiled_seq)

    # Collect all candidate y-mers; each is named after the target and
    # position it was last seen at.
    candidates = {}
    candidate_names = {}
    for idx, target in enumerate(target_seqs):
        for pos, ymer in enumerate(kt.kmerList(target, args.yMerSize)):
            if is_allowed(ymer):
                candidates[ymer] = 0
                candidate_names[ymer] = "%s_%04d" % (target_names[idx], pos)

    # Add peptides until the covered fraction reaches the target.
    picked_seqs = []
    picked_names = []
    while (1 - (len(xmer_counts) / n_xmers)) < args.target:
        pick = choosePep(candidates, xmer_counts, args)
        pick_name = candidate_names[pick]
        picked_seqs.append(pick)
        picked_names.append(pick_name)
        del candidates[pick]
        drop_covered(pick)

    # Write out peptides
    all_names = rep_names + picked_names
    all_seqs = rep_seqs + picked_seqs
    ft.write_fasta(all_names, all_seqs, out)
    return len(all_seqs)
def cdna_writer(input_file, output_file):
    """Write every (header, sequence) pair from ``cdna_list`` as FASTA.

    Args:
        input_file: Passed unchanged to ``cdna_list``.
        output_file: Path of the file to write.
    """
    with open(output_file, 'w') as out_handle:
        for record in cdna_list(input_file):
            header, sequence = record
            bio.write_fasta(out_handle, header, sequence)
def design(inp, out, args):
    """Greedily design peptides until enough target x-mers are covered.

    Every x-mer (free of ``args.exSet`` characters) in the target sequences
    is counted. X-mers already contained in pre-designed peptides
    (``args.pre``, a comma-delimited list of FASTA files) are treated as
    covered. ``choosePep`` is then called repeatedly to pick y-mers until the
    covered fraction of x-mers reaches ``args.target``.

    Args:
        inp: Input FASTA path, passed to ``ft.read_fasta_lists``.
        out: Output FASTA path, passed to ``ft.write_fasta``.
        args: Options object providing ``xMerSize``, ``yMerSize``, ``exSet``,
            ``pre`` and ``target``.

    Returns:
        The number of newly designed peptides written to ``out``.
    """
    # Generate dict with xmer counts
    xcD = defaultdict(int)
    tN, tS = ft.read_fasta_lists(inp)
    for s in tS:
        for x in kt.kmerList(s, args.xMerSize):
            if set(x).isdisjoint(args.exSet):
                xcD[x] += 1

    # Save count of total xmers in targets
    totalX = len(xcD)

    # If pre-designed peptides are provided, remove any contained xmers
    # from the xcD
    if args.pre:
        for each in args.pre.split(","):
            pN, pS = ft.read_fasta_lists(each)
            for s in pS:
                for x in kt.kmerList(s, args.xMerSize):
                    # pop with a default does not insert a missing key,
                    # even on a defaultdict.
                    xcD.pop(x, None)

    # Read in all yMers in targets
    ysD = {}
    yNameD = {}
    for i, s in enumerate(tS):
        for j, y in enumerate(kt.kmerList(s, args.yMerSize)):
            if set(y).isdisjoint(args.exSet):
                ysD[y] = 0
                yNameD[y] = "%s_%04d" % (tN[i], j)

    # Design peptides. When the targets contain no usable x-mers there is
    # nothing to cover, so the loop is skipped instead of dividing by zero.
    newPeps = []
    newNames = []
    while totalX and (1 - (len(xcD) / totalX)) < args.target:
        thisPep = choosePep(ysD, xcD, args)
        thisName = yNameD[thisPep]
        newPeps.append(thisPep)
        newNames.append(thisName)

        # Remove selected peptide from ysD
        del ysD[thisPep]

        # Remove covered xMers from xcD
        for eachX in kt.kmerList(thisPep, args.xMerSize):
            xcD.pop(eachX, None)

    # Write out peptides
    ft.write_fasta(newNames, newPeps, out)
    return len(newPeps)