def rm_degenerates(inputfile):
    """Trim non-standard nucleotides from both ends of every sequence.

    Each record in ``FastaList(inputfile).seq_list`` is split on newlines; the
    first field is used as the id and the second as the sequence.  The
    sequence is upper-cased and every leading and trailing character that is
    not one of A, C, G, T or U is removed.  The id and the trimmed sequence
    are then written, one per line, to the module-level ``tmpfile``.

    Records whose sequence is empty after trimming (an empty sequence, or one
    without any standard nucleotide) are skipped; previously they raised
    ``IndexError``.

    Args:
        inputfile: fasta file name passed on to ``FastaList``.

    Returns:
        ``tmpfile.name``.  ``tmpfile`` is rewound to position 0 before
        returning.
    """
    standard_nts = frozenset('ACGTU')
    fl1 = FastaList(inputfile)
    for item in fl1.seq_list:
        fields = item.split('\n')
        seq_id = fields[0]
        seq_seq = fields[1].upper()
        # Move both bounds inwards until they rest on a standard nucleotide.
        # The bounds checks stop the scan when nothing is left to trim.
        start = 0
        end = len(seq_seq)
        while start < end and seq_seq[start] not in standard_nts:
            start += 1
        while end > start and seq_seq[end - 1] not in standard_nts:
            end -= 1
        seq_seq = seq_seq[start:end]
        if not seq_seq:
            continue
        tmpfile.write(seq_id + '\n' + seq_seq + '\n')
    tmpfile.seek(0)
    return tmpfile.name
def rm_degenerates(inputfile):
    """Trim sequence ends and drop sequences with too many degenerate bases.

    Each record in ``FastaList(inputfile).seq_list`` is split on newlines; the
    first field is used as the id and the second as the sequence.  The
    sequence is upper-cased and every leading and trailing character that is
    not one of A, C, G, T or U is removed.  The fraction of remaining
    characters that are not A, C, G, T or U is then compared with ``ARGS.q``;
    the record is written to the module-level ``tmpfile`` only when that
    fraction is less than or equal to ``ARGS.q``.

    Records whose sequence is empty after trimming are skipped; previously
    they raised ``IndexError``.

    Args:
        inputfile: fasta file name passed on to ``FastaList``.

    Returns:
        ``tmpfile.name``.  ``tmpfile`` is rewound to position 0 before
        returning.
    """
    standard_nts = frozenset('ACGTU')
    fl1 = FastaList(inputfile)
    for item in fl1.seq_list:
        fields = item.split('\n')
        seq_id = fields[0]
        seq_seq = fields[1].upper()
        # Move both bounds inwards until they rest on a standard nucleotide.
        # The bounds checks stop the scan when nothing is left to trim.
        start = 0
        end = len(seq_seq)
        while start < end and seq_seq[start] not in standard_nts:
            start += 1
        while end > start and seq_seq[end - 1] not in standard_nts:
            end -= 1
        seq_seq = seq_seq[start:end]
        if not seq_seq:
            # Nothing left to write; this also avoids dividing by zero below.
            continue
        nr_standard = sum(seq_seq.count(nt) for nt in standard_nts)
        # Share of the trimmed sequence that is NOT a standard nucleotide.
        fraction_deg = 1 - nr_standard / len(seq_seq)
        if fraction_deg <= ARGS.q:
            tmpfile.write(seq_id + '\n' + seq_seq + '\n')
    tmpfile.seek(0)
    return tmpfile.name
if __name__ == "__main__":
    # Command line interface.
    PARSER = argparse.ArgumentParser(
        description='Finds target sequences from'
                    ' a target (-t) fasta file in a source fasta file (-s)')
    PARSER.add_argument('-p', default=1, type=int,
                        help='nr of processor threads')
    PARSER.add_argument('-t', required=True, type=str,
                        help='target fastafile')
    PARSER.add_argument('-s', required=True, type=str,
                        help='source fastafile')
    PARSER.add_argument('-l', default=150, type=int,
                        help='the length of the extracted seq')
    ARGS = PARSER.parse_args()

    # Source records followed by whatever FastaList.rev_comp() returns.
    FA_S = FastaList(ARGS.s)
    ALL_S = FA_S.seq_list + FA_S.rev_comp()

    FA_T = FastaList(ARGS.t)
    FA_CS = FastaList('aivcs.fa')
    # fa_t_div is indexed once per worker process below.
    fa_t_div = FA_T.divide(ARGS.p)

    # Second newline-separated field of every record in aivcs.fa, stripped.
    aivcs = [record.split('\n')[1].strip() for record in FA_CS.seq_list]

    FA_OUT = open('sources.fa', 'w')

    # Start ARGS.p workers that share one manager-backed result list.
    manager = Manager()
    res_lst = manager.list()
    processes = []
    for i in range(ARGS.p):
        p = Process(target=process_work, args=(res_lst, fa_t_div[i]))
        p.start()
        processes.append(p)
def main():
    """Sort the sequences of every input file into one fasta file per marker.

    Reads the module-level ``ARGS`` and ``primer_fa``.  A log file named after
    this script is written to ``ARGS.od``.  Every regular file in ``ARGS.id``
    ending in .fa, .fastq or .gz is loaded with ``FastaList`` and passed
    through ``reduce_fa``.  Primers are taken pairwise from ``primer_fa``
    (indices 0/1, 2/3, ...).  A record is appended to
    ``ARGS.od + <primer id without its last three characters> + '.fa'`` when
    it contains both strings of either primer test pair.

    When ``ARGS.f > 0`` the record count (parsed from the record name) is
    divided by the first count seen for that marker.  If the fraction is
    below ``ARGS.f`` the remaining primer pairs are not tried for this record;
    otherwise the fraction is added to the written header.

    The log file is now closed by a ``with`` block, also when an exception
    escapes.
    """
    log_name = ARGS.od + os.path.basename(__file__).replace('.py', '.log')
    with open(log_name, 'w') as logfile:
        logfile.write('Log for: {} at {}\nUser: {}\n\n'.format(
            os.path.basename(__file__),
            str(datetime.datetime.now()).split('.')[0],
            getpass.getuser()))
        logfile.write('Minimum sequence length = {}\n'.format(ARGS.m))
        logfile.write('Minimum nr of sequences = {}\n'.format(ARGS.c))
        logfile.write(
            'Minimum fraction of most abundant sequence = {}\n\n'.format(
                ARGS.f))

        # Loop over the files in the input directory (ARGS.id) but skip
        # anything that is not a regular file ending in .fa, .fastq or .gz.
        filelst = [name for name in os.listdir(ARGS.id)
                   if os.path.isfile(ARGS.id + name)
                   and name.endswith(('.fa', '.fastq', '.gz'))]
        nr_of_files = len(filelst)

        for file_nr, seqfile in enumerate(filelst, start=1):
            print('\rprocessing file {}/{}'.format(file_nr, nr_of_files),
                  end=" ")
            inp_seq = FastaList(ARGS.id + seqfile)
            init_seq = inp_seq.nr_seq  # Initial number of seqs in the file
            seq_fa = reduce_fa(inp_seq)
            logfile.write('{}: Read {} sequences. '.format(seqfile, init_seq))

            nr_seq_demult = 0
            primerlist = primer_fa.seq_list
            primerlist_rc = primer_fa.seq_list_revc()
            marker_maxcount = dict()
            fraction = 0
            if ARGS.f > 0:
                # Assumes three characters at the end of the primer id
                # indicating forward or reverse.  Initialize the dict
                # containing, for each primer pair, the count of its most
                # abundant sequence.
                # TODO: Try to remove dependence on specific primer names in
                # primer-file
                for primerid in primer_fa.id_list[::2]:
                    marker_maxcount[primerid[:-3]] = 0

            for seq in range(seq_fa.nr_seq):
                record = seq_fa.seq_list[seq]
                fields = record.split('\n')
                seqname = fields[0]
                seqcount = int(seqname.split(':')[1].split('_')[0])
                seqseq = fields[1]
                for primer in range(0, primer_fa.nr_seq, 2):
                    test_primers1 = [primerlist[primer + 1].split('\n')[1],
                                     primerlist_rc[primer].split('\n')[1]]
                    test_primers2 = [primerlist[primer].split('\n')[1],
                                     primerlist_rc[primer + 1].split('\n')[1]]
                    if not (all(x in record for x in test_primers1) or
                            all(x in record for x in test_primers2)):
                        continue
                    if ARGS.f > 0:
                        # Per the original author's note, seq_fa.seq_list is
                        # ordered by count, so the first record matched for a
                        # marker carries that marker's highest count.
                        marker = primerlist[primer].split('\n')[0][1:-3]
                        if marker_maxcount[marker] == 0:
                            marker_maxcount[marker] = seqcount
                        fraction = seqcount / marker_maxcount[marker]
                        # Filter on the fraction of the most abundant
                        # sequence for the marker.
                        if fraction < ARGS.f:
                            break
                    out_name = (ARGS.od + primer_fa.id_list[primer][:-3] +
                                '.fa')
                    with open(out_name, 'a') as fi:
                        if ARGS.f > 0:
                            fi.write('>{}_fraction:{}\n{}\n'.format(
                                seqname, round(fraction, 3), seqseq))
                        else:
                            fi.write('>' + record)
                    nr_seq_demult += 1

            # NOTE(review): kept from the original; nothing in this function
            # divides by seq_fa.nr_seq, so this looks like a leftover guard.
            if seq_fa.nr_seq == 0:
                seq_fa.nr_seq = 1
            logfile.write("Reduced to {} sequences.\n".format(nr_seq_demult))
'sequence-list', required=True) PARSER.add_argument('-m', type=int, help='minimum seq length', default=250, required=False) PARSER.add_argument('-c', type=int, help='minimum nr of seqs', default=1, required=False) PARSER.add_argument('-f', type=float, help='minimum fraction of most ' 'abundant seq', default=0, required=False) ARGS = PARSER.parse_args() # Some control of input file/directory names and parameter values if not os.path.isfile(ARGS.t): sys.exit('No PCR primer file. Exits.') if not ((ARGS.id.endswith(sep(os.name)) and ARGS.od.endswith(sep(os.name)))): sys.exit('Invalid directory name. Exits.') if os.path.isfile(ARGS.od[:-1]): print('{} is a file'.format(ARGS.od[:-1])) sys.exit('Exits') if os.path.isdir(ARGS.od): shutil.rmtree(ARGS.od) os.mkdir(ARGS.od) primer_fa = FastaList(ARGS.t) # Make fastaList of primerfile if not (0 <= ARGS.f <= 1): sys.exit('Fraction (-f) out of range. Exits.') if ARGS.m < 0: sys.exit('Minimum sequence length (-m) ut of range. Exits.') if ARGS.c < 1: sys.exit('Count (-c) ot of range. Exits.') main()
#!/usr/bin/python3
"""Print FastaList.rev_comp() of the records read from the -s argument."""
import argparse
from fasta import FastaList

PARSER = argparse.ArgumentParser(
    description='Reverse complement a DNA strand')
PARSER.add_argument('-s', required=True, type=str, help='oligonucleotide')
ARGS = PARSER.parse_args()

# Load the input and hand its own record list to rev_comp.
STRAND = FastaList(ARGS.s)
REV_COMP = STRAND.rev_comp(STRAND.seq_list)
print(REV_COMP)
# print('{:03} % completed'.format(int((100*counter/len(fasta_div)))), # end='\r', flush=True) return tmp_lst if __name__ == "__main__": PARSER = argparse.ArgumentParser(description='Finds target sequences from\ a target (-t) fasta file in a source fasta file (-s)') PARSER.add_argument('-p', type=int, help='nr of processor threads', default=1) PARSER.add_argument('-t', type=str, help='target fastafile', required=True) PARSER.add_argument('-s', type=str, help='source fastafile', required=True) PARSER.add_argument('-l', type=int, help='the length of the extracted seq', default=150) ARGS = PARSER.parse_args() FA_S = FastaList(ARGS.s) manager = Manager() ALL_S = manager.list([(fasta_s.split('\n')[0], fasta_s.split('\n')[1]) for fasta_s in FA_S.seq_list]) FA_T = FastaList(ARGS.t) fa_t_div = FA_T.divide(ARGS.p) nr_s_seq = len(ALL_S) nr_t_seq = len(FA_T.seq_list) print('searching {} subsequences in {} sequences'.format( nr_t_seq, nr_s_seq)) argument = [(x, y) for x in fa_t_div for y in [ALL_S]] aivcs = [] re_lst = [] try: with open('cs.js') as cs_file: aivcs = json.load(cs_file)
name_par = fa_hits[:-3].rpartition('/') fipa = open(name_par[0] + name_part[1] + 'nopar_' + name_par[2], 'w') fipa.write('No parameters, not contigs fasta file created by spades.') fipa.close() # Main PARSER = argparse.ArgumentParser(description='Split input fasta file based on\ existence of blast hits in input\ blast table file') PARSER.add_argument('-p', action='store_true', help='switch for parameter file\ output') PARSER.add_argument('-f', type=str, help='fastafile', required=True) PARSER.add_argument('-b', type=str, help='blastfile', required=True) ARGS = PARSER.parse_args() try: BL_IN = open(ARGS.b) except IOError: sys.exit('input file error') FA_LST = FastaList(ARGS.f) FA_IN = FA_LST.fa_file SPAFA_LST = SpaFaLst(FA_IN) BL_LST = BlastTbl(BL_IN) wr_files() FA_IN.close() if ARGS.f[-2:] == 'gz' and os.path.isfile(ARGS.f[:-3]): os.remove(ARGS.f[:-3]) BL_IN.close()
action='store_true', help='switch for removal of subseqs') PARSER.add_argument('--subsample', type=int, help='switch for removal of subseqs', required=False) ARGS = PARSER.parse_args() if ARGS.subsample: from random import sample org_seqs = list() unique_seqs = set() # Create a set of unique seqs without flanking Ns or polyA-tail and store # the originals seqs in the list org_seqs. The seqs must be at least of # MIN_LEN length and contain at most MAX_DEG number of deg nucs for item in FastaList(ARGS.i): org_seqs.append(item.split('\n')[1]) sequence = item.split('\n')[1].strip('N').rstrip('A') if len(sequence) >= MIN_LEN and cnt_nt_deg(sequence) <= MAX_DEG: unique_seqs.add(sequence) if ARGS.d: deg_summary(org_seqs) print('Nr of input sequences: {:38}'.format(len(org_seqs))) print('Nr of unique sequences (including subsequences): {:12}'.format( len(unique_seqs))) unique_seqs = list(unique_seqs) if ARGS.rm_subseqs: print('\n***Removing subsequences***') unique_seqs.sort(key=len, reverse=True) start = datetime.now() final_seqs = red_uniq_seq(unique_seqs)
help='output file directory', required=True) PARSER.add_argument('-m', type=str, help='muscle path', required=True) ARGS = PARSER.parse_args() # Some controls of input data if not ((ARGS.id.endswith(sep(os.name)) and ARGS.od.endswith(sep(os.name)))): sys.exit('Invalid directory name. Exits.') if os.path.isfile(ARGS.od[:-1]): print('{} is a file'.format(ARGS.od[:-1])) sys.exit('Exits') if os.path.isdir(ARGS.od): shutil.rmtree(ARGS.od) os.mkdir(ARGS.od) refs = FastaList(ARGS.r) for seqfile in os.listdir(ARGS.id): if 'log' in seqfile: continue print('Cleaning: {}'.format(seqfile)) input_name = ARGS.id + seqfile output_name = ARGS.od + seqfile fa_in = FastaList(input_name) # Remove primers if primers exit in seq. rmprimers return an empty list # if no primers are found if fa_in.rmprimers(ARGS.p): fa_in.wr_fasta_file(output_name, ARGS.p) fa_in = FastaList(output_name) for item in refs.seq_list: if seqfile.split('.')[0] in item.split('\n')[0]:
proc2.wait() if ARGS.l: fi.write(proc1.stderr.read()) fi.write(proc2.stderr.read()) if __name__ == "__main__": import argparse PARSER = argparse.ArgumentParser(description='TBD') PARSER.add_argument('-f', type=str, help='fasta filename', required=True) PARSER.add_argument('-o', type=str, help='output fasta filename', required=True) PARSER.add_argument('-m', type=str, help='minimun sequence length', default='200') PARSER.add_argument('-l', action='store_true', help='switch for log file output') ARGS = PARSER.parse_args() tmpfile = NamedTemporaryFile(mode='w+') rm_degenerates(ARGS.f) if ARGS.l: fi = open(ARGS.o.split('.')[0] + '.log', 'w') run_vsearch(rm_degenerates(ARGS.f)) fl2 = FastaList(ARGS.o) derep()
#!/usr/bin/python3
"""Write a fasta file through FastaList.wr_fasta_file and align it with muscle.

The input fasta file (-f) is loaded with FastaList and written to the output
file (-o) with the primer file (-p) passed as second argument.  The muscle
executable given with -m is then run on the output file; the alignment is
written next to it with the extension replaced by ``.afa``.
"""
import argparse
import os
import subprocess as sub

from fasta import FastaList

PARSER = argparse.ArgumentParser(description='Test of primerdelete')
PARSER.add_argument('-f', type=str, help='input fasta file', required=True)
PARSER.add_argument('-p', type=str, help='input primer fasta'
                    ' file', required=True)
PARSER.add_argument('-o', type=str, help='output file', required=True)
PARSER.add_argument('-m', type=str, help='muscle path', required=True)
ARGS = PARSER.parse_args()

fa = FastaList(ARGS.f)
fa.wr_fasta_file(ARGS.o, ARGS.p)

# splitext only removes the final extension, so dots elsewhere in the path
# (e.g. './out.fa' or 'run.1/out.fa') no longer truncate the name.
outfi = os.path.splitext(ARGS.o)[0] + '.afa'

# The command is passed as an argument list: without shell=True a single
# command string is taken as the program name on POSIX and cannot be started.
# A list also keeps paths containing spaces intact.
muscle = sub.Popen([ARGS.m, '-in', ARGS.o, '-out', outfi, '-quiet'])
muscle.wait()
'sequence') PARSER.add_argument('-id', type=str, help='Directory for input data', required=True) PARSER.add_argument('-od', type=str, help='Directory for output data', required=True) ARGS = PARSER.parse_args() # Some control of input file/directory names and parameter values if not ((ARGS.id.endswith(sep(os.name)) and ARGS.od.endswith(sep(os.name)))): sys.exit('Invalid directory name. Exits.') if os.path.isfile(ARGS.od[:-1]): print('{} is a file'.format(ARGS.od[:-1])) sys.exit('Exits') if os.path.isdir(ARGS.od): shutil.rmtree(ARGS.od) os.mkdir(ARGS.od) for seqfile in os.listdir(ARGS.id): infa = FastaList(ARGS.id + seqfile) ref = infa.seq_list[0] refid = infa.seq_list[0].split('\n')[0] refseq = infa.seq_list[0].split('\n')[1] for seq in infa.seq_list[1:2]: seqid = seq.split('\n')[0] seqseq = seq.split('\n')[1] for nt in range(0, len(seqseq)): if refseq[nt] != seqseq[nt]: print(seqseq[nt])
#!/usr/bin/python3 """Something""" import argparse from fasta import FastaList PARSER = argparse.ArgumentParser(description='Finds target sequences from a\ target (-t) fasta file in a source fasta file (-s)') PARSER.add_argument('-t', type=str, help='target fastafile', required=True) PARSER.add_argument('-s', type=str, help='source fastafile', required=True) PARSER.add_argument('-l', type=str, help='the length of the extracted seq', default=150) ARGS = PARSER.parse_args() FA_S = FastaList(ARGS.s) FA_T = FastaList(ARGS.t) FA_CS = FastaList('aivcs.fa') aivcs = [] for seq in FA_CS.seq_list: aivcs.append(seq.split('\n')[1].strip()) ALL_S = FA_S.seq_list + FA_S.rev_comp() FA_OUT = open('sources.fa', 'w') count = len(FA_T.seq_list) // 100 percent = 0 incr1 = 0 incr2 = 0 seq_found = 0 for fasta_t in FA_T.seq_list: incr1 += 1 if incr1 > incr2:
import argparse
import os

from fasta import FastaList

if __name__ == "__main__":
    # Cut the slice [--start:--end] out of the first record of the fasta file
    # given with -f and write it to
    # <input name>_<start>-<end><input extension>.
    PARSER = argparse.ArgumentParser(description='TBD')
    PARSER.add_argument('-f', type=str, help='input fasta file', required=True)
    PARSER.add_argument('--start', type=int, help='start cut', required=True)
    PARSER.add_argument('--end', type=int, help='end cut', required=True)
    ARGS = PARSER.parse_args()

    cut = ':' + str(ARGS.start) + '-' + str(ARGS.end)

    # Parse the input once and split the first record at its first newline,
    # so a header containing whitespace cannot be mistaken for the sequence.
    # NOTE(review): assumes records look like '<header>\n<sequence>', as the
    # other FastaList scripts do - confirm against fasta.py.
    header, _, body = FastaList(ARGS.f).seq_list[0].partition('\n')
    title = header.split()[0] + cut + '\n'
    seq = body.split()[0][ARGS.start:ARGS.end]

    # splitext keeps directories and inner dots intact and also copes with
    # file names that have no extension.
    root, ext = os.path.splitext(ARGS.f)
    outfile = root + cut.replace(':', '_') + ext

    with open(outfile, 'w') as f:
        f.write(title)
        f.write(seq + '\n')
#!/usr/local/miniconda3/bin/python
# Prints the first line of a record once for every character in the record's
# second line that is not an accepted nucleotide code.
import argparse as ap
from fasta import FastaList

# Upper-case codes that are accepted without a report.
ACCEPTED_NTS = frozenset('ACGTRWKYSNMVDHB')

PARSER = ap.ArgumentParser(description='Check fastafile')
PARSER.add_argument('-f', required=True, type=str, help='fasta file')
ARGS = PARSER.parse_args()

fl = FastaList(ARGS.f)
for seq in fl.seq_list:
    lines = seq.split('\n')
    for nt in lines[1]:
        if nt in ACCEPTED_NTS:
            continue
        print(lines[0])
#!/usr/bin/python3
# Loads the alignment given with -i, calls FastaList.rm_non_agct_columns() on
# it and writes the result to the file given with -o (default: out.fa).
import argparse
from fasta import FastaList

PARSER = argparse.ArgumentParser(
    description='Removes columns with degenerate nucleotides from alignment '
                'and writes a new alignment as output file')
PARSER.add_argument('-i', required=True, type=str,
                    help='fastq input filename')
PARSER.add_argument('-o', default='out.fa', type=str,
                    help='fastq output filename')
ARGS = PARSER.parse_args()

newseq_list = FastaList(ARGS.i)
newseq_list.rm_non_agct_columns()
newseq_list.wr_fasta_file(ARGS.o)