def seq_concat_id_fa(self, input_file_path, output_file_path):
    """Write every FASTA record as one ``id#sequence`` line.

    Args:
        input_file_path: path of the FASTA file to read.
        output_file_path: path of the text file to create or overwrite.
    """
    source = fa.SequenceSource(input_file_path)
    try:
        # The context manager closes the output even when reading fails
        # part-way through the input.
        with open(output_file_path, "w") as output:
            while source.next():
                output.write(source.id + "#" + source.seq + "\n")
    finally:
        source.close()
def start_fasta_single(infile):
    """
    Validate a single-dataset FASTA file.

    The sequence id is the part of the defline that precedes the first
    whitespace or '|'.  The id is searched with ``id_pattern`` and the
    sequence with ``sequence_pattern``; each hit appends a message to the
    module-level ``errors`` list.

    Returns ``errors`` when it holds more than one entry, otherwise
    ``notes`` plus an 'OK' message.
    """
    reader = fastalib.SequenceSource(infile)
    record_no = 1
    while reader.next():
        # Reported line numbers are approximate: two lines per record.
        approx_line = str(record_no * 2)
        # <space> and bar '|' are the ONLY two dividers allowed
        first_field = reader.id.split()[0]
        seq_id = first_field.split('|')[0]
        sequence = reader.seq
        if re.search(id_pattern, seq_id):
            errors.append('ERROR: SeqID (' + seq_id +
                          ') contains bad character(s) about line ' +
                          approx_line)
        if re.search(sequence_pattern, sequence):
            errors.append('ERROR: Sequence (id=' + seq_id +
                          ') contains bad character(s) on about line ' +
                          approx_line)
        record_no += 1
    if len(errors) > 1:
        return errors
    return notes + ['OK -- File Validates']
def unsplit_fa(self, input_file_path, output_file_path):
    """Copy a FASTA file, storing every record with ``split=False``.

    Args:
        input_file_path: path of the FASTA file to read.
        output_file_path: path of the FASTA file to write.
    """
    source = fa.SequenceSource(input_file_path)
    output = fa.FastaOutput(output_file_path)
    try:
        while source.next():
            output.store(source, split=False)
    finally:
        # Close both ends even if a record fails to be stored.
        output.close()
        source.close()
def cut_region(self, input_file_path):
    """Run ``self.get_region`` on every record of a FASTA file.

    ``self.total_seq`` is reset to zero and incremented once per record,
    before the record is handed to ``get_region`` together with the forward
    and distal primers taken from ``self.args``.
    """
    reader = fa.SequenceSource(input_file_path)
    self.total_seq = 0
    while reader.next():
        self.total_seq = self.total_seq + 1
        self.get_region(
            reader,
            self.args.forward_primer,
            self.args.distal_primer,
        )
def parse_input(self):
    """Read ``self.filename`` and feed each record to the parsers.

    Every sequence is upper-cased, ``self.number_of_sequences`` is
    incremented, the defline goes through ``self.parse_taxonomy`` and the
    resulting id is passed to ``self.parse_seq`` with the sequence.
    """
    print("self.compressed = ")
    print(self.compressed)
    reader = fa.SequenceSource(self.filename, self.compressed)
    while reader.next():
        upper_seq = reader.seq.upper()
        reader.seq = upper_seq
        self.number_of_sequences += 1
        taxon_id = self.parse_taxonomy(reader.id)
        self.parse_seq(taxon_id, upper_seq)
def read_file_and_collect_info(self, in_fa_gz_file_name):
    """Fill ``self.sequences`` with ``locus -> sequence`` from a FASTA file.

    The locus comes from ``self.parse_id`` applied to each defline; the
    call is wrapped in the ``utils`` benchmark helpers.  Both the locus and
    the sequence are stripped of surrounding whitespace.
    """
    print(in_fa_gz_file_name)
    reader = fa.SequenceSource(in_fa_gz_file_name)
    while reader.next():
        started = utils.benchmark_w_return_1("parse_id")
        locus = self.parse_id(reader.id)
        utils.benchmark_w_return_2(started, "parse_id")

        sequence = reader.seq.strip()
        self.sequences[locus.strip()] = sequence
def get_out_file_names(self):
    """Collect the output file name of every record in the input FASTA.

    Each defline is mapped through ``self.make_file_name`` and the result
    is added to ``self.out_file_names``.  Progress is written to stderr for
    the first record and every 100000th one.

    NOTE(review): ``inputfile`` is not a parameter here; it is resolved
    from an enclosing (module) scope.
    """
    print("get_out_file_names")
    seen = 0
    reader = fastalib.SequenceSource(inputfile)
    while reader.next():
        seen += 1
        if seen == 1 or seen % 100000 == 0:
            sys.stderr.write(
                '\r[demultiplex] Reading FASTA into memory: %s\n' % (seen))
            sys.stderr.flush()
        out_name = self.make_file_name(reader.id)
        self.out_file_names.add(out_name)
def go_single(args):
    """
    Read the input FASTA file, expand records by frequency and write the
    cleaned sequences.

    The defline is split on '|'.  The clean id is the first
    whitespace-delimited token of the first field.  When the last field
    starts with ``freq`` (e.g. ``frequency:23`` or ``freq=23``) the value
    after ':' or '=' is the copy count; otherwise the count is '1'.  If the
    count passes ``RepresentsInt`` the record is emitted that many times
    with ``_1``, ``_2``, ... appended to the id; if not, it is emitted once
    with the bare id.

    Records go to stdout when ``args.stdout`` is set, otherwise to
    ``args.outfile`` (which is created/truncated in either case).
    """
    #sys.path.append('/groups/vampsweb/'+args.site+'/seqinfobin/merens-illumina-utils/')
    import IlluminaUtils.lib.fastalib as fastalib

    infile = args.infile
    print(args.infile)
    unique = False  # should not unique until separated into datasets!!
    f = fastalib.SequenceSource(infile, unique=unique)

    with open(args.outfile, 'w') as fh:

        def emit(defline_id, seq):
            # Single place that decides between stdout and the output file.
            if args.stdout:
                print('>' + defline_id + '\n' + seq)
            else:
                fh.write('>' + defline_id + '\n' + seq + '\n')

        while f.next():
            defline_items = f.id.split('|')
            id_clean = defline_items[0].split()[0]
            freq = defline_items[-1]
            seq_clean = f.seq.upper().strip()

            # Reset for every record: a record without a frequency field
            # must not inherit the count of the record before it.
            cnt = '1'
            if freq[:4] == 'freq':
                # Accept "freq...:N" first, then "freq...=N".
                for separator in (':', '='):
                    parts = freq.split(separator)
                    if len(parts) > 1:
                        cnt = parts[1]
                        break

            if RepresentsInt(cnt):
                for i in range(1, int(cnt) + 1):
                    emit(id_clean + '_' + str(i), seq_clean)
            else:
                emit(id_clean, seq_clean)
def get_chimeric_ids(self, file_name):
    """Return the set of record ids found in the FASTA file ``file_name``."""
    print("Get ids from %s" % file_name)
    # todo: benchmark this loop against fa.ReadFasta(file_name).ids
    collected = set()
    reader = fa.SequenceSource(file_name, lazy_init=False)
    while next(reader):
        collected.add(reader.id)
    reader.close()
    return collected
def combine_w_gast_fa(self, input_file_path, output_file_path):
    """Append GAST taxonomy to every FASTA id and write the result.

    For each record, the lines of ``<input_file_path>.gast`` are searched
    with ``self.lines_that_contain``; the second tab-separated field of the
    first hit is appended to the id after a '|'.  Records are stored with
    ``split=False``.

    Args:
        input_file_path: FASTA file to read; its ``.gast`` companion is
            read from the same path plus ``.gast``.
        output_file_path: FASTA file to write.
    """
    output = fa.FastaOutput(output_file_path)
    fa_input = fa.SequenceSource(input_file_path)
    gast_file_name = input_file_path + ".gast"
    # Loaded on the first record only: the file used to be re-opened and
    # re-read for every sequence, and the handles were never closed.  Lazy
    # loading keeps the old behaviour of not touching the .gast file when
    # the FASTA input is empty.
    gast_file_content = None
    try:
        while fa_input.next():
            if gast_file_content is None:
                with open(gast_file_name, "r") as gast_file:
                    gast_file_content = gast_file.readlines()
            res = self.lines_that_contain(fa_input.id, gast_file_content)
            gast_taxonomy = res[0].split("\t")
            fa_input.id = fa_input.id + "|" + gast_taxonomy[1]
            output.store(fa_input, split=False)
    finally:
        output.close()
def demultiplex_input(self, inputfile):
    """Write every record of ``inputfile`` to its per-name output handle.

    The handle is looked up in ``self.out_files`` under the name produced
    by ``self.make_file_name``; id and sequence are written through
    ``self.write_id`` and ``self.write_seq``.  Progress goes to stderr for
    the first record and every 100000th one.
    """
    print("demultiplex_input")
    reader = fastalib.SequenceSource(inputfile)
    written = 0
    while reader.next():
        written += 1
        read_id = reader.id
        out_name = self.make_file_name(reader.id)
        out_handle = self.out_files[out_name]
        self.write_id(out_handle, read_id)
        self.write_seq(out_handle, reader.seq)
        if written == 1 or written % 100000 == 0:
            sys.stderr.write(
                '\r[demultiplex] Writing entries into files: %s\n' % (written))
            sys.stderr.flush()
def move_out_chimeric(self):
    """
    Write a chimera-free copy of every input FASTA file.

    For each entry of ``self.input_file_names`` the records whose id is not
    among the ids returned by ``self.get_chimeric_ids()`` are stored,
    without frequencies, in a file named after the input plus
    ``self.nonchimeric_suffix``.
    """
    chimeric_ids = self.get_chimeric_ids()
    for idx_key in self.input_file_names:
        file_name = self.input_file_names[idx_key]
        source_path = os.path.join(self.indir, file_name)

        # The input is opened with ReadFasta and closed again right away,
        # before the actual streaming pass.
        probe = fa.ReadFasta(source_path)
        probe.close()

        target_path = source_path + self.nonchimeric_suffix
        writer = fa.FastaOutput(target_path)
        reader = fa.SequenceSource(source_path, lazy_init=False)
        while reader.next():
            if reader.id in chimeric_ids:
                continue
            writer.store(reader, store_frequencies=False)
        writer.close()
def write_clean_abundance_file(self):
    """
    Writes the abundance file from the new names file and new unique file.
    These files have already had their ids checked from the deleted file.

    Lanes with no deleted ids are skipped.  For every other lane the
    ``.names`` file (``id<whitespace>comma,separated,ids``) and the
    ``.unique.fa`` file are loaded, ordered by ``mysort`` and appended to
    ``<lane>.newabund.fa`` as ``id;size=count`` records.  Finally the old
    abundance file is renamed to ``<lane>.abund_dirty.fa`` and the new one
    takes its place.
    """
    for lane_key in self.lane_keys:
        original_abundance_file = os.path.join(self.trim_dir,
                                               lane_key + ".abund.fa")
        new_abundance_file = os.path.join(self.trim_dir,
                                          lane_key + ".newabund.fa")
        new_names_file = os.path.join(self.trim_dir, lane_key + ".names")
        new_unique_file = os.path.join(self.trim_dir, lane_key + ".unique.fa")

        deleted_id_list = self.deleted_ids[lane_key]
        if len(deleted_id_list) == 0:
            continue

        names = {}
        # Context manager: the names file handle used to be left open.
        with open(new_names_file, "r") as newnames_fh:
            for line in newnames_fh:
                lst = line.strip().split()
                names[lst[0]] = lst[1].split(',')

        # Keyed by sequence, so a later record with the same sequence
        # replaces the id stored for an earlier one.
        uniques = {}
        fasta = fa.SequenceSource(new_unique_file)
        while fasta.next():
            uniques[fasta.seq] = fasta.id

        sorted_uniques = mysort(uniques, names)
        for item in sorted_uniques:
            read_id = item[0]
            count = item[1]
            seq = item[2]
            sfastaRead = read_id + ";size=" + str(count)
            abundfa = sfasta(sfastaRead, seq)
            abundfa.write(new_abundance_file, 'a')

        # rename to newuniques => uniques
        os.rename(
            original_abundance_file,
            os.path.join(self.trim_dir, lane_key + ".abund_dirty.fa"))
        os.rename(new_abundance_file, original_abundance_file)
def move_out_chimeric(self):
    """
    Write the records of ``self.chg_file`` that are not chimeric.

    Chimeric ids are the union of the ids read from the txt and db chimera
    files in ``self.dir_name``.  Surviving records are stored, without
    frequencies, in ``self.output_file_name`` in the same directory.
    """
    txt_path = os.path.join(self.dir_name, self.chimeric_file_name_txt)
    txt_ids = self.get_chimeric_ids(txt_path)
    db_path = os.path.join(self.dir_name, self.chimeric_file_name_db)
    db_ids = self.get_chimeric_ids(db_path)

    all_chimeric_ids = set(txt_ids).union(set(db_ids))
    print("len(all_chimeric_ids) = ")
    print(len(all_chimeric_ids))

    writer = fa.FastaOutput(
        os.path.join(self.dir_name, self.output_file_name))
    reader = fa.SequenceSource(
        os.path.join(self.dir_name, self.chg_file), lazy_init=False)
    while next(reader):
        if reader.id in all_chimeric_ids:
            continue
        writer.store(reader, store_frequencies=False)
    writer.close()
def write_clean_uniques_file(self):
    """
    Write out a new unique file with all the deleted ids removed,
    especially the chimeras which were detected after the original unique
    file was created.

    Lanes with no deleted ids are skipped without touching any file.  For
    the others, an id found in ``self.orphans[lane_key]`` is first replaced
    by the first entry stored for it; the record is kept unless its
    (possibly replaced) id is deleted.  The old file is renamed to
    ``<lane>.unique_dirty.fa`` and the new one takes its place.
    """
    for lane_key in self.lane_keys:
        deleted_id_list = self.deleted_ids[lane_key]
        # Decide before opening anything: the output used to be created
        # first, which left an unclosed, empty ".newunique.fa" behind for
        # every skipped lane.
        if len(deleted_id_list) == 0:
            continue
        # Set for O(1) membership tests in the record loop.
        deleted = set(deleted_id_list)

        new_unique_file_name = os.path.join(self.trim_dir,
                                            lane_key + ".newunique.fa")
        original_unique_file = os.path.join(self.trim_dir,
                                            lane_key + '.unique.fa')
        new_unique_file = fa.FastaOutput(new_unique_file_name)
        try:
            uniquesfasta = fa.SequenceSource(original_unique_file)
            while uniquesfasta.next():
                lane_orphans = self.orphans[lane_key]
                if uniquesfasta.id in lane_orphans:
                    uniquesfasta.id = lane_orphans[uniquesfasta.id][0]
                if uniquesfasta.id not in deleted:
                    new_unique_file.store(uniquesfasta)
        finally:
            new_unique_file.close()

        # rename to newuniques => uniques
        os.rename(
            original_unique_file,
            os.path.join(self.trim_dir, lane_key + ".unique_dirty.fa"))
        os.rename(new_unique_file_name, original_unique_file)
def write_clean_fasta_file(self):
    """
    Write a new fasta from the original trimmed fasta, leaving out the
    deleted ids.  The deleted ids contain the trimming deletions as well as
    the chimera deletions.

    Lanes with no deleted ids are skipped without touching any file.  For
    the others the old file is renamed to
    ``<lane>.trimmed_with_chimera.fa`` and the cleaned file takes its
    place.
    """
    sleep(2)
    for lane_key in self.lane_keys:
        logger.debug("write_clean_fasta_file working on lanekey: " + lane_key)

        deleted_id_list = self.deleted_ids[lane_key]
        # Decide before opening anything: reader and writer used to be
        # created first and were left open (plus an empty ".newtrimmed.fa")
        # for every skipped lane.
        if len(deleted_id_list) == 0:
            continue
        # Set for O(1) membership tests in the record loop.
        deleted = set(deleted_id_list)

        original_trimmed_file = os.path.join(self.trim_dir,
                                             lane_key + ".trimmed.fa")
        new_trimmed_file_name = os.path.join(self.trim_dir,
                                             lane_key + ".newtrimmed.fa")
        new_trimmed_file = fa.FastaOutput(new_trimmed_file_name)
        try:
            trimmedfasta = fa.SequenceSource(original_trimmed_file)
            logger.debug(
                "write_clean_fasta_file about to check trimmedfasta file")
            while trimmedfasta.next():
                if trimmedfasta.id not in deleted:
                    new_trimmed_file.store(trimmedfasta)
        finally:
            new_trimmed_file.close()

        # rename to newtrimmed => trimmed
        os.rename(
            original_trimmed_file,
            os.path.join(self.trim_dir, lane_key + ".trimmed_with_chimera.fa"))
        os.rename(new_trimmed_file_name, original_trimmed_file)
def start_fasta_multi(infile):
    """
    Check a multi-dataset FASTA file.

    Each defline is split on whitespace and must yield at least two fields:
    the first is taken as the dataset name, the second as the sequence id.
    A dataset name whose last '_'-separated part parses as an integer
    (ie: 10056.000009544_123294) is treated as "dataset_count" and the
    count is stripped from the name.

    Messages are appended to the module-level ``errors`` and ``notes``
    lists.  Returns ``errors`` if it holds more than one entry, otherwise
    ``notes`` plus an 'OK' message.

    NOTE(review): the sequences themselves are read but not checked in this
    function.
    NOTE(review): the original indentation of this function was lost; the
    nesting below is a reconstruction and should be confirmed.
    """
    f = fastalib.SequenceSource(infile)
    datasets_hash = {}  # used as a set of dataset names
    all_seq_count = 0
    # Whether the previous "counted-style" decision found a count suffix,
    # and how many times that answer changed.
    id_has_seq_count = False
    count_style_flip = 0
    while f.next():
        defline = f.id.split()
        if len(defline) > 1:
            ds_items = defline[0].split('_')
            if len(ds_items) > 1:
                try:
                    # ie: 10056.000009544_123294
                    this_seq_count = int(ds_items[-1])
                    # join in case there were multiple '_' instances
                    dataset = '_'.join(ds_items[:-1])
                    if id_has_seq_count == False:
                        count_style_flip += 1
                        id_has_seq_count = True
                except:
                    # ie: 10056.000009544 (last part is not an integer)
                    this_seq_count = 1
                    dataset = defline[0]
                    if id_has_seq_count == True:
                        count_style_flip += 1
                        id_has_seq_count = False
            else:
                # No '_' at all: the style flags are left untouched.
                this_seq_count = 1
                dataset = defline[0]
            datasets_hash[dataset] = 1
            id = defline[1]  # <space> and bar '|' are the ONLY two dividers
            seq = f.seq
        else:
            # A defline with a single field stops the scan.
            errors.append('ERROR: This file has the wrong format')
            break
        all_seq_count += 1
    if count_style_flip > 1:
        errors.append(
            'ERROR: id style varied from "no count" to "count" too many times')
    if all_seq_count == len(datasets_hash):
        errors.append(
            "ERROR: Looks like the number of datasets equals the number of sequences -- that can't be right. Maybe this is a single-dataset style fasta file?"
        )
    else:
        notes.append('Good: dataset count is: ' + str(len(datasets_hash)))
        notes.append('Good: sequence count is: ' + str(all_seq_count))
    if len(errors) > 1:
        return errors
    else:
        return notes + ['OK -- File Validates']
def write_seqfiles(args):
    """
    Split ``args.fafile`` into per-dataset ``seqfile.fa`` files under
    ``<args.project_dir>/analysis/gast/<dataset>/``.

    * ``args.upload_type == 'single'``: every record goes to
      ``args.dataset``; the id is the defline up to the first space or '|'.
    * otherwise: the defline (spaces treated as '|') must carry the dataset
      and the id.  Deflines containing both ``orig_bc`` and ``new_bc`` are
      treated as mobe/qiime style, ``>ds_id ...``; all others as
      ``>ds|id...``.  A defline that cannot be parsed prints a message and
      exits with status 1.

    Returns a dict with ``seq_count``, ``ds_count`` and ``datasets``
    (dataset name -> number of sequences).
    """
    outdir = args.project_dir
    datasets = {}
    files = {}
    stats = {}
    analysis_dir = os.path.join(outdir, 'analysis')
    gast_dir = os.path.join(analysis_dir, 'gast')

    if args.upload_type == 'single':
        ds = args.dataset
        datasets[ds] = 0
        ds_dir = os.path.join(gast_dir, ds)
        if not os.path.exists(ds_dir):
            # 0o777 is the same mode as the old 0777 literal, spelled so it
            # parses on Python 2.6+ and Python 3.
            os.makedirs(ds_dir, mode=0o777)
        file = os.path.join(ds_dir, 'seqfile.fa')
        fp = open(file, 'w')
        files[ds] = fp

    seq_count = 0
    f = fastalib.SequenceSource(args.fafile)
    try:
        while f.next():
            defline = f.id
            if args.upload_type == 'single':
                ds = args.dataset
                # should split on pipe and space
                id = defline.replace(' ', '|').split('|')[0]
                datasets[ds] += 1
                fp.write('>' + id + "\n" + f.seq + "\n")
            else:
                # Only the defline parsing is guarded: filesystem errors
                # further down are no longer reported as a format problem.
                try:
                    tmp = defline.replace(' ', '|').split('|')
                    if 'orig_bc' in defline and 'new_bc' in defline:
                        # mobe/qiime defline, e.g.
                        # '>10056.000010538_2 HWI-... orig_bc=... new_bc=...'
                        ds = tmp[0].split('_')[0]
                        id = tmp[0].split('_')[1]
                    else:
                        ds = tmp[0]
                        id = tmp[1]
                except IndexError:
                    print("Please check the multi-dataset format: ( defline='>" + defline + "' )")
                    sys.exit(1)

                datasets[ds] = datasets.get(ds, 0) + 1
                if ds not in files:
                    ds_dir = os.path.join(gast_dir, ds)
                    if not os.path.exists(ds_dir):
                        os.makedirs(ds_dir, mode=0o777)
                    files[ds] = open(os.path.join(ds_dir, 'seqfile.fa'), 'w')
                files[ds].write('>' + id + "\n" + f.seq + "\n")
            seq_count += 1
    finally:
        # Release every handle, also on the sys.exit() path above.
        f.close()
        for ds in files:
            files[ds].close()

    stats['seq_count'] = seq_count
    stats['ds_count'] = len(datasets)
    stats['datasets'] = datasets
    return stats
# Script setup: read the command line, then load the FASTA file (and the
# quality file, when one was given) into dictionaries via make_a_dict().
args = parse_arguments()
fa_path = args.fa_path
qual_path = args.qual_path
fq_path = args.fq_path
""" TODO: if no qual - use fake and do not process qual_path File "fasta_to_fastq.py", line 60, in <module> f_qual = fa.SequenceSource(qual_path) File "/bioware/python-2.7.12-201701011205/lib/python2.7/site-packages/illumina_utils-1.4.8-py2.7.egg/IlluminaUtils/lib/fastalib.py", line 84, in __init__ self.file_pointer = open(self.fasta_file_path) TypeError: coercing to Unicode: need string or buffer, NoneType found """
f_input = fa.SequenceSource(fa_path)
f_input_dict = make_a_dict(f_input)

# The quality file is optional; f_qual and f_qual_dict exist only when
# args.qual_path is truthy.
if args.qual_path:
    f_qual = fa.SequenceSource(qual_path)
    f_qual_dict = make_a_dict(f_qual)

# Debug output, disabled:
# print "f_input_dict"
# print f_input_dict
# print "f_qual_dict"
# print f_qual_dict


def convert_qual_scores(line):
    # NOTE(review): only the first statement of this function is visible
    # here; the remainder of its body appears to be cut off.
    # res = []
    arr = line.split(" ")
def go_multi(args):
    """
    Split a multi-dataset FASTA file into ``analysis/gast/<ds>/<outfile>``.

    Deflines are split on ``args.delim`` and look like
    ``>ds|id|frequency:23``.  A trailing ``_<int>`` is removed from the
    dataset name (M9Dkey217.141053_69 -> M9Dkey217.141053).  The id is the
    first whitespace-delimited token of the second field.  When the last
    field starts with ``freq`` the value after ':' or '=' is the copy
    count; otherwise the count is '1'.

    All records are collected in memory first.  Then, for every non-empty
    dataset name, any existing dataset directory is removed and recreated
    and the records are emitted: ``count`` copies with ``_1``, ``_2``, ...
    appended to the id when the count passes ``RepresentsInt``, else one
    copy with the bare id.  Records go to stdout when ``args.stdout`` is
    set, otherwise to the dataset file (which is created in either case).
    """
    import IlluminaUtils.lib.fastalib as fastalib

    infile = args.infile
    unique = False
    data = {}
    f = fastalib.SequenceSource(infile, unique=unique)
    while f.next():
        defline_items = f.id.split(args.delim)
        dataset = defline_items[0]
        # if ds like M9Dkey217.141053_69 the _69 must be removed from the end
        test_ds_parts = dataset.split('_')
        if RepresentsInt(test_ds_parts[-1]):
            dataset = '_'.join(test_ds_parts[:-1])
        # M01028:102:000000000-AK07B:1:1101:19698:4186 1:N:0:6
        read_id = defline_items[1].split()[0]
        freq = defline_items[-1]

        # Reset for every record: a record without a frequency field must
        # not inherit the count of the record before it.
        cnt = '1'
        if freq[:4] == 'freq':
            # Accept "freq...:N" first, then "freq...=N".
            for separator in (':', '='):
                parts = freq.split(separator)
                if len(parts) > 1:
                    cnt = parts[1]
                    break

        data.setdefault(dataset, []).append(
            {'id': read_id, 'seq': f.seq, 'cnt': cnt})

    analysis_dir = 'analysis'
    gast_dir = 'analysis/gast'
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)
    if not os.path.exists(gast_dir):
        os.makedirs(gast_dir)

    for ds in data:
        if ds == '':
            print('Empty ds name!!')
            continue

        ds_dir = os.path.join(gast_dir, ds)
        # Start from an empty dataset directory.
        if os.path.exists(ds_dir):
            shutil.rmtree(ds_dir)
        os.makedirs(ds_dir)

        outfile = os.path.join(ds_dir, args.outfile)
        with open(outfile, 'w') as fh:
            for record in data[ds]:
                cnt = record['cnt']
                read_id = record['id']
                seq = record['seq']
                if RepresentsInt(cnt):
                    out_ids = [read_id + '_' + str(m)
                               for m in range(1, int(cnt) + 1)]
                else:
                    out_ids = [read_id]
                for out_id in out_ids:
                    if args.stdout:
                        print('>' + out_id + '\n' + seq)
                    else:
                        fh.write('>' + out_id + '\n' + seq + '\n')
def sequences(self, key, tax_collector, read_id_lookup, file_collector):
    """
    Fill the vamps_sequences file.

    Args:
        key: sample key; used as the dataset name for VAMPS uploads and as
            the index into ``self.runobj.samples`` for illumina runs.
        tax_collector: taxonomy -> mapping with 'rank' and 'freq'.
        read_id_lookup: read id -> taxonomy.
        file_collector: mapping with the paths 'gast_concat_file' (read),
            'unique_file' (read, optional) and 'sequences_file' (written).

    Returns:
        dict mapping read id -> {'distance': ..., 'refhvr_ids': ...} built
        from the first three whitespace-separated columns of the
        gast_concat file.
    """
    logging.info("Writing to file: vamps_sequences_pipe")
    if self.runobj.vamps_user_upload or self.runobj.new_vamps_upload:
        project = self.runobj.project
        dataset = key
    elif self.runobj.platform == 'illumina':
        project = self.runobj.samples[key].project
        dataset = self.runobj.samples[key].dataset
    # NOTE(review): for any other platform (including '454') project and
    # dataset are never assigned, so the next line raises
    # UnboundLocalError -- unchanged from the original.
    project = project[0].capitalize() + project[1:]
    project_dataset = project + '--' + dataset

    # open gast_concat table to get the distances and the ferids
    refid_collector = {}
    # Context manager: this handle used to be left open.
    with open(file_collector['gast_concat_file'], 'r') as gast_concat:
        for line in gast_concat:
            items = line.strip().split()
            refid_collector[items[0]] = {
                'distance': items[1],
                'refhvr_ids': items[2],
            }

    with open(file_collector['sequences_file'], 'w') as fh:
        fh.write("\t".join([
            "HEADER", "project", "dataset", "taxonomy", "refhvr_ids", "rank",
            "seq_count", "frequency", "distance", "read_id", "project_dataset"
        ]) + "\n")

        # open uniques fa file; a missing or empty one yields header only
        unique_file = file_collector['unique_file']
        if os.path.exists(unique_file) and os.path.getsize(unique_file) > 0:
            f = fastalib.SequenceSource(unique_file)
            while f.next():
                defline_items = f.id.split('|')
                id = defline_items[0]
                # second defline field looks like "<name>:<count>"
                cnt = defline_items[1].split(':')[1]
                seq = f.seq

                if id in read_id_lookup:
                    tax = read_id_lookup[id]
                else:
                    tax = ''

                if tax in tax_collector:
                    rank = tax_collector[tax]['rank']
                    freq = tax_collector[tax]['freq']
                else:
                    rank = 'NA'
                    cnt = 0
                    freq = 0

                if id in refid_collector:
                    distance = refid_collector[id]['distance']
                    refhvr_ids = refid_collector[id]['refhvr_ids']
                else:
                    distance = '1.0'
                    refhvr_ids = '0'

                # An empty or zero count is reported as 1.
                if not cnt:
                    cnt = 1

                # The leading '' keeps the first column of data rows empty.
                datarow = [
                    '', seq, project, dataset, tax, refhvr_ids, rank,
                    str(cnt), str(freq), distance, id, project_dataset
                ]
                fh.write("\t".join(datarow) + "\n")

    return refid_collector
def visualize_sequence_length_distribution(fasta_file_path, dest, title, max_seq_len=None, xtickstep=None, ytickstep=None):
    """
    Plot how many sequences of a FASTA file have each length.

    Args:
        fasta_file_path: FASTA file to read.
        dest: output path without extension; ``.pdf`` is tried first and
            ``.png`` is the fallback.
        title: text placed above the plot.
        max_seq_len: right edge of the x axis; by default the longest
            sequence plus 1% (at least 10).
        xtickstep: distance between x ticks (default ``max_seq_len // 50``,
            at least 1).
        ytickstep: distance between y ticks (default a 20th of the highest
            bar, at least 1).

    NOTE(review): a sequence longer than an explicitly passed
    ``max_seq_len`` raises IndexError, and an input without sequences
    raises ValueError -- both unchanged from the original.
    """
    sequence_lengths = []

    fasta = u.SequenceSource(fasta_file_path)
    while next(fasta):
        if fasta.pos % 10000 == 0 or fasta.pos == 1:
            sys.stderr.write('\rReading: %s' %
                             (big_number_pretty_print(fasta.pos)))
            sys.stderr.flush()
        sequence_lengths.append(len(fasta.seq))
    sys.stderr.write('\n')

    if not max_seq_len:
        max_seq_len = max(sequence_lengths) + (int(
            max(sequence_lengths) / 100.0) or 10)

    seq_len_distribution = [0] * (max_seq_len + 1)
    for l in sequence_lengths:
        seq_len_distribution[l] += 1
    peak = max(seq_len_distribution)

    plt.figure(figsize=(16, 12))
    plt.rcParams.update({'axes.linewidth': 0.9})
    plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)

    gs = gridspec.GridSpec(10, 1)

    # Upper panel: the distribution itself.
    plt.subplot(gs[0:8])
    plt.grid(True)
    plt.subplots_adjust(left=0.05, bottom=0.03, top=0.95, right=0.98)

    plt.plot(seq_len_distribution, color='black', alpha=0.3)
    plt.fill_between(list(range(0, max_seq_len + 1)),
                     seq_len_distribution,
                     y2=0,
                     color='black',
                     alpha=0.15)
    plt.ylabel('number of sequences')
    plt.xlabel('sequence length')

    # Floor division: range() below needs integer steps, and "/" yields a
    # float on Python 3 (on Python 2 ints "//" and "/" agree).
    if xtickstep == None:
        xtickstep = (max_seq_len // 50) or 1
    if ytickstep == None:
        ytickstep = peak // 20 or 1

    plt.xticks(list(range(xtickstep, max_seq_len + 1, xtickstep)),
               rotation=90,
               size='xx-small')
    plt.yticks(list(range(0, peak + 1, ytickstep)), size='xx-small')
    # Positional (lower, upper) limits instead of the ymin/ymax and
    # xmin/xmax keywords.
    plt.ylim(0, peak + (peak / 20.0))
    plt.xlim(0, max_seq_len)
    plt.yticks(size='xx-small')

    plt.figtext(0.5,
                0.96,
                '%s' % (title),
                weight='black',
                size='xx-large',
                ha='center')

    # Lower panel: one line of summary statistics, no axes.
    plt.subplot(gs[9])
    # NOTE(review): 20 is an unusual value for a colour setting -- confirm.
    plt.rcParams.update({'axes.edgecolor': 20})
    plt.grid(False)
    plt.yticks([])
    plt.xticks([])
    plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'
             % (big_number_pretty_print(len(sequence_lengths)),
                numpy.mean(sequence_lengths), numpy.std(sequence_lengths),
                big_number_pretty_print(min(sequence_lengths)),
                big_number_pretty_print(max(sequence_lengths))),
             va='center', alpha=0.8, size='x-large')

    # Best effort: fall back to PNG when the PDF cannot be written, but do
    # not swallow KeyboardInterrupt/SystemExit.
    try:
        plt.savefig(dest + '.pdf')
    except Exception:
        plt.savefig(dest + '.png')
def get_datasets(args):
    """
    Copy every file of ``args.indir`` into its own GAST dataset directory.

    The dataset name is the file name minus its last three characters.  For
    each input file a ``seqfile.fa`` is written below
    ``/groups/vampsweb/<site>/tmp/<code>/analysis/gast/<dataset>/`` with the
    sequences upper-cased and stripped, and the same records are written to
    ``SEQFILE_CLEAN.FA`` with ``>dataset id`` deflines.

    Returns a dict mapping dataset name -> number of sequences written.

    NOTE(review): the original indentation of this function was lost; the
    extent of the ``for`` and ``while`` loops below is a reconstruction and
    should be confirmed.
    """
    print args
    # Make the bundled illumina-utils importable before importing from it.
    sys.path.append('/groups/vampsweb/' + args.site +
                    '/seqinfobin/merens-illumina-utils/')
    import IlluminaUtils.lib.fastalib as fastalib
    # errors here are between 240 - 249
    # dicts used as character sets
    seq_allowed = dict.fromkeys('AGCTUNRYMKSWHBVDagctunrymkswhbvd')
    readid_allowed = dict.fromkeys(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.:")
    bad_line = False
    dir = os.path.join('/groups/vampsweb/' + args.site + '/tmp/', args.code)
    gast_dir = os.path.join(dir, 'analysis/gast')
    datasets = {}
    out_file = os.path.join(dir, "SEQFILE_CLEAN.FA")
    for infile in os.listdir(args.indir):
        # dataset name = file name without its last three characters
        dataset = infile[:-3]
        datasets[dataset] = 0
        file_handles = {}
        new_dir = os.path.join(gast_dir, dataset)
        print new_dir
        os.makedirs(new_dir)
        # open new fa file
        fasta_file = os.path.join(new_dir, 'seqfile.fa')
        fh = open(fasta_file, 'w')
        # NOTE(review): fh is never closed in the visible code.
        file_handles[dataset] = fh
        file_path = os.path.join(args.indir, infile)
        # NOTE(review): with this layout the combined file is re-opened in
        # 'w' mode for every input file, so only the last one survives.
        out_fh = open(out_file, 'w')
        # if multiple datasets in fa file then must use raw
        # to be able to get ds and id from defline
        # BUT if single have to assume that id is first and should use single
        raw_id = False
        unique = False
        raw_id = True
        # should not unique until separated into datasets!!
        f = fastalib.SequenceSource(file_path, unique=unique)
        # defline could be separated by spaces or '|'
        # from uclust otu creation: >Cluster10108;size=1 breaks here
        counter = 0
        while f.next():
            id_clean = f.id
            if not all(x in seq_allowed for x in f.seq):
                bad_line = True
                msg = 'Sequence failed: ' + f.seq
                # NOTE(review): exits here with status 0 and without
                # printing msg; the "exit 241" branch below is therefore
                # never reached.
                sys.exit()
            else:
                seq_clean = f.seq.upper().strip()
            # write to fa file
            file_handles[dataset].write('>' + id_clean + '\n' + seq_clean + '\n')
            if not bad_line:
                out_fh.write('>' + dataset + ' ' + id_clean + "\n")
                out_fh.write(seq_clean + "\n")
            counter += 1
        datasets[dataset] = counter
        if bad_line:
            print msg
            sys.exit(241)
        if counter == 0:
            print "No sequences found! Remove any empty lines or comments at the top of your file and try again."
            sys.exit(242)
        out_fh.close()
    # counter holds the count of the last file processed
    sequence_count = counter
    print "sequence_count=" + str(sequence_count)
    print 'dir', dir
    print datasets
    return datasets