def main():
    """Split a FASTA/QUAL file pair into one FASTQ file per sample.

    The sequence and quality files named by the command-line options are
    read in lockstep.  Every record whose header is identical in both files
    is appended to ``<output_dir>/seqs_<SampleID>.fastq``, where the SampleID
    is the first whitespace-delimited token of the header with its last
    ``_``-separated field removed.  Headers that differ between the two
    files are printed to stdout and the record is skipped.

    Raises:
        StopIteration: if the quality file yields fewer records than the
            sequence file (propagated from ``next()``).
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    # get cmd-line options
    fasta_fp = opts.input_fasta_fp
    qual_fp = opts.input_qual_fp
    output_dir = opts.output_dir

    # create output dir
    create_dir(output_dir)

    # output file name (without extension) -> open file handle
    output_fps = {}

    try:
        # the input handles are closed as soon as parsing finishes or fails
        with open(fasta_fp, 'U') as fasta_f:
            with open(qual_fp, 'U') as qual_f:
                sequences = MinimalFastaParser(fasta_f)
                qual_sequences = MinimalFastaParser(qual_f)

                for seq_name, seq in sequences:
                    # advance the quality parser in step with the sequences
                    qual_seq_name, qual_seq = next(qual_sequences)

                    # verify headers from seq and qual match
                    if seq_name != qual_seq_name:
                        print(seq_name)
                        continue

                    # get the SampleID
                    samp_id = '_'.join(seq_name.split()[0].split('_')[:-1])
                    samp_filename = 'seqs_%s' % (str(samp_id))

                    # open each sample's output file on first use only
                    if samp_filename not in output_fps:
                        output_fps[samp_filename] = open(
                            join(output_dir, '%s.fastq' % samp_filename),
                            'w')

                    # write out the fastq format for seqs
                    output_fps[samp_filename].write(
                        '@%s\n%s\n+\n%s\n' % (seq_name, seq, qual_seq))
    finally:
        # close the output files even when parsing or writing failed
        for output_f in output_fps.values():
            output_f.close()
def _open_split_lib_qual(split_lib_dir):
    """Open the quality file of a split_libraries dir, preferring seqs.qual."""
    try:
        # for illumina
        return open(join(split_lib_dir, 'seqs.qual'), 'U')
    except IOError:
        # for 454: seqs.qual could not be opened, use the filtered file
        return open(join(split_lib_dir, 'seqs_filtered.qual'), 'U')


def _append_split_lib_records(split_lib_dir, output_seqs, output_fna,
                              samples, seq_count):
    """Append one split-library run to both outputs; return the next number."""
    with open(join(split_lib_dir, 'seqs.fna'), 'U') as seq_file:
        with _open_split_lib_qual(split_lib_dir) as qual_file:
            qual_sequences = MinimalFastaParser(qual_file)

            for seq_name, seq in MinimalFastaParser(seq_file):
                # advance the quality parser in step with the sequences
                qual_seq_name, qual_seq = next(qual_sequences)

                # headers must match; otherwise report and skip the record
                if seq_name != qual_seq_name:
                    print(seq_name)
                    continue

                full_seq_name_list = seq_name.split()
                seq_name_prefix = '_'.join(
                    full_seq_name_list[0].split('_')[:-1])

                # get per sample sequence counts
                samples[seq_name_prefix] = samples.get(seq_name_prefix, 0) + 1

                # update the sequence number since the original numbers may
                # clash across multiple split-lib runs, but retain the
                # barcode info that follows the first token
                updated_seq_name = '%s_%d %s' % (
                    seq_name_prefix, seq_count,
                    ' '.join(full_seq_name_list[1:]))

                # write the sequence out in FASTQ format
                output_seqs.write('@%s\n%s\n+\n%s\n' %
                                  (str(updated_seq_name), str(seq),
                                   str(qual_seq)))

                # write the sequence out in FASTA format
                output_fna.write('>%s\n%s\n' %
                                 (str(updated_seq_name), str(seq)))

                seq_count += 1

    return seq_count


def generate_full_split_lib_fastq(study, study_input_dir, zip_fname,
                                  files_to_remove, output_dir):
    """Generate the full split-library FASTQ and FASTA files for a study.

    Every entry of ``study_input_dir`` whose name starts with ``processed``
    contributes the records of its ``split_libraries/seqs.fna`` file, paired
    in order with ``split_libraries/seqs.qual`` (or ``seqs_filtered.qual``
    when the former cannot be opened).  Records are renumbered with a single
    counter shared across all folders and written gzip-compressed to
    ``study_<study>_split_library_seqs.fastq.gz`` and
    ``study_<study>_split_library_seqs.fna.gz`` in ``output_dir``.  Records
    whose sequence and quality headers differ are printed to stdout and
    skipped.

    Args:
        study: study identifier, used in the output file names.
        study_input_dir: directory holding the ``processed*`` folders.
        zip_fname: currently unused; kept for interface compatibility.
        files_to_remove: list that both output paths are appended to
            (modified in place).
        output_dir: directory the two output files are written to.

    Returns:
        Tuple ``(files_to_remove, biom_files, samples)`` where
        ``biom_files`` lists the existing, non-empty
        ``gg_97_otus/exact_uclust_ref_otu_table.biom`` paths of the
        processed folders and ``samples`` maps each sequence-name prefix to
        the number of records written for it.
    """
    # define sequence output files
    seq_fname = 'study_%s_split_library_seqs.fastq.gz' % (str(study))
    fna_fname = 'study_%s_split_library_seqs.fna.gz' % (str(study))
    output_seq_fp = join(output_dir, seq_fname)
    output_fna_fp = join(output_dir, fna_fname)

    # add to list of files to remove
    files_to_remove.append(output_seq_fp)
    files_to_remove.append(output_fna_fp)

    samples = {}
    biom_files = []
    # running sequence number shared by all processed folders
    seq_count = 0

    output_seqs = gzip.open(output_seq_fp, 'w')
    try:
        output_fna = gzip.open(output_fna_fp, 'w')
        try:
            for processed_folder in listdir(study_input_dir):
                # only folders whose name starts with "processed" are used
                if not processed_folder.startswith('processed'):
                    continue

                split_lib_dir = join(study_input_dir, processed_folder,
                                     'split_libraries')
                seq_count = _append_split_lib_records(
                    split_lib_dir, output_seqs, output_fna, samples,
                    seq_count)

                # get list of biom files
                gg_biom_fp = join(study_input_dir, processed_folder,
                                  'gg_97_otus',
                                  'exact_uclust_ref_otu_table.biom')
                if exists(gg_biom_fp) and getsize(gg_biom_fp) > 0:
                    biom_files.append(gg_biom_fp)
        finally:
            output_fna.close()
    finally:
        # close the outputs even when reading or writing failed
        output_seqs.close()

    return files_to_remove, biom_files, samples