def test_run_process_sff_gz_through_split_lib_FLX(self):
    """run_process_sff_through_split_lib runs without error: Convert to FLX"""
    # remove generated mapping file
    moved_mapping_file = join(self.wf_out,
                              split(self.fasting_mapping_fp)[-1])
    self.files_to_remove.append(moved_mapping_file)

    run_process_sff_through_split_lib(
        0, "Fasting_subset",
        sff_input_fp=self.sff_gz_fp,
        mapping_fp=self.fasting_mapping_fp,
        output_dir=self.wf_out,
        command_handler=call_commands_serially,
        params=self.params,
        qiime_config=self.qiime_config,
        convert_to_flx=True,
        write_to_all_fasta=False,
        status_update_callback=no_status_updates,
    )

    # get the file basename (strip both the .gz and .sff extensions)
    input_file_basename = splitext(splitext(split(self.sff_fp)[1])[0])[0]

    # get the split-library sequence fpath
    split_lib_seqs_fp = join(self.wf_out, "split_libraries", "seqs.fna")
    sff_fp = join(self.wf_out, "Fasting_subset_FLX.sff")
    sff_seqs_fp = join(self.wf_out, "Fasting_subset_FLX.fna")
    sff_qual_fp = join(self.wf_out, "Fasting_subset_FLX.qual")
    sff_flow_fp = join(self.wf_out, "Fasting_subset_FLX.txt")
    new_map_fp = join(self.wf_out, "Fasting_subset_mapping.txt")

    # define files to remove
    self.files_to_remove.append(sff_fp)
    self.files_to_remove.append(sff_seqs_fp)
    self.files_to_remove.append(sff_qual_fp)
    self.files_to_remove.append(sff_flow_fp)

    # get the head of each file
    split_lib_head = get_top_fastq_two_lines(open(split_lib_seqs_fp, "U"))
    raw_seq_head = get_top_fastq_two_lines(open(sff_seqs_fp, "U"))
    raw_qual_head = get_top_fastq_two_lines(open(sff_qual_fp, "U"))
    raw_flow_head = get_top_fastq_two_lines(open(sff_flow_fp, "U"))

    # check results
    self.assertEqual("".join(split_lib_head), exp_FLX_split_lib_head)
    self.assertEqual("".join(raw_seq_head), exp_FLX_raw_seq_head)
    self.assertEqual("".join(raw_qual_head), exp_FLX_raw_qual_head)
    self.assertEqual("".join(raw_flow_head), exp_Ti_raw_flow_head)

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.wf_out, "log*.txt"))[0]
    self.assertTrue(getsize(log_fp) > 0)
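# A minimal sketch of the helper the tests above and below rely on,
# assuming get_top_fastq_two_lines returns the first four lines of an
# open file (two header/sequence pairs for FASTA output, or one full
# FASTQ record, which is why indices 1 and 3 hold sequences); the
# project's real implementation may differ.
def get_top_fastq_two_lines_sketch(open_file):
    """Return the first four lines of an open sequence file."""
    return [open_file.readline() for i in range(4)]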
def test_run_process_fasta_through_split_lib(self):
    """run_process_fasta_through_split_lib runs without error"""
    self.files_to_remove.append(join(self.wf_out,
                                     "fasta_mapping_file.txt"))

    # process the sequence data
    run_process_fasta_through_split_lib(
        0, "Fasting_subset",
        input_fp=",".join(self.fasta_fps),
        mapping_fp=self.fasta_map_fp,
        output_dir=self.wf_out,
        command_handler=call_commands_serially,
        params=self.params,
        qiime_config=self.qiime_config,
        write_to_all_fasta=False,
        status_update_callback=no_status_updates,
    )

    # get the file basename
    input_file_basename = splitext(split(self.sff_fp)[1])[0]

    # get the split-library sequence fpath
    split_lib_seqs_fp = join(self.wf_out, "split_libraries", "seqs.fna")

    # get the head of the file (headers at indices 0 and 2,
    # sequences at indices 1 and 3)
    split_lib_head = get_top_fastq_two_lines(open(split_lib_seqs_fp, "U"))
    split_lib_seqs_only = [split_lib_head[1], split_lib_head[3]]

    # check results
    self.assertEqual("".join(split_lib_seqs_only),
                     exp_fasta_split_lib_seqs_only)

    # Check that the log file is created and has size > 0
    log_fp = glob(join(self.wf_out, "log*.txt"))[0]
    self.assertTrue(getsize(log_fp) > 0)
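# Minimal sketches of the two callbacks these tests pass into the
# workflow functions, assuming QIIME's usual semantics: no_status_updates
# is a no-op status callback, and call_commands_serially runs each
# (description, command) pair through the shell in order. The real
# signatures in qiime.workflow may differ; the flat command list here is
# an assumption of this sketch.
def no_status_updates_sketch(status_message=None):
    """Swallow status messages during tests."""
    pass


def call_commands_serially_sketch(commands, status_update_callback):
    """Run each command in order, reporting its description first."""
    for description, cmd in commands:
        status_update_callback(description)
        stdout, stderr, return_code = qiime_system_call(cmd)
        if return_code != 0:
            raise RuntimeError('Command failed: %s' % cmd)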
def submit_illumina_and_split_lib(data_access, fastq_files,
                                  metadata_study_id, input_dir):
    """
    Illumina Loading: This function takes the fastq filenames and, using
    that path, determines the location of the split-library and picked-OTU
    files. Once the file locations have been determined, it moves the files
    to the DB machine and loads them into the DB.
    """
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()

    # check if study exists
    study_id_exists = data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    alphabet += alphabet.lower()
    alphabet += "0123456789"
    random_fname = ''.join([choice(alphabet) for i in range(10)])
    tmp_filename = '_' + random_fname + '_' + strftime("%Y_%m_%d_%H_%M_%S")

    # get fastq filenames
    fastq_filenames = fastq_files.split(',')

    seq_run_id = 0
    analysis_id = 0
    split_lib_input_checksums = []

    ### by disabling constraints you can speed up loading as well, but
    ### shouldn't be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    # split the fastq filenames and determine filepaths
    for fastq_fname in fastq_filenames:
        input_fname, input_ext = splitext(split(fastq_fname)[-1])
        input_basename, input_ext = splitext(fastq_fname)

        # get analysis notes
        analysis_notes = split(input_basename)[0]

        # get md5 for raw fastq files
        fastq_md5 = safe_md5(open(fastq_fname)).hexdigest()
        print 'MD5 is: %s' % str(fastq_md5)

        # create an analysis row in analysis table
        if analysis_id == 0:
            analysis_id = data_access.createAnalysis(metadata_study_id)

        # check if fastq info already loaded
        fastq_exists = data_access.checkIfSFFExists(fastq_md5)
        print 'fastq in database? %s' % str(fastq_exists)

        # if fastq info not loaded, then insert into DB
        if not fastq_exists:
            if seq_run_id == 0:
                seq_run_id = data_access.createSequencingRun(True, 'ILLUMINA',
                                                             None, seq_run_id)

            # get sequence count; note this assumes '@' appears only on
            # header lines, so quality lines starting with '@' would be
            # overcounted
            if fastq_fname.endswith('.gz'):
                count_seqs_cmd = "zcat %s | grep ^@ | wc -l" % (fastq_fname)
            else:
                count_seqs_cmd = "grep ^@ %s | wc -l" % (fastq_fname)
            o, e, r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()

            # get header length (from the header line) and # of flows
            # (length of the sequence line)
            fastq_fname_open = open(fastq_fname)
            first_seq_fastq = get_top_fastq_two_lines(fastq_fname_open)
            header_length = len(first_seq_fastq[0])
            num_flows = len(first_seq_fastq[1])

            # insert fastq info
            valid = data_access.addSFFFileInfo(True, input_fname,
                                               seq_counts,
                                               header_length,
                                               None,
                                               num_flows,
                                               None,
                                               None,
                                               None,
                                               fastq_md5, seq_run_id)
        else:
            seq_run_id = data_access.getSeqRunIDUsingMD5(fastq_md5)

    print 'sequence_run_id is: %s' % str(seq_run_id)

    # get md5 for split-library input
    split_lib_input_md5sum = safe_md5(MD5Wrap(fastq_filenames)).hexdigest()
    print split_lib_input_md5sum

    print 'Finished loading the processed ILLUMINA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id

    # update analysis table with seq_run_id
    valid = data_access.updateAnalysisWithSeqRunID(True, analysis_id,
                                                   seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id, input_dir, seq_run_id, split_lib_input_md5sum
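# Minimal sketch of the MD5Wrap helper used above, assuming it exposes
# the concatenated contents of several files as a single iterable so
# safe_md5 can checksum the full split-library input in one pass; the
# real helper may differ. The commented-out invocation below is purely
# illustrative (paths and study id are hypothetical).
class MD5WrapSketch(object):
    """Iterate over the lines of several files as if they were one."""

    def __init__(self, filenames):
        self.filenames = filenames

    def __iter__(self):
        for fname in self.filenames:
            for line in open(fname):
                yield line


#analysis_id, input_dir, seq_run_id, md5 = submit_illumina_and_split_lib(
#    data_access,
#    '/path/to/s_1_sequences.fastq.gz,/path/to/s_2_sequences.fastq.gz',
#    metadata_study_id=1234,
#    input_dir='/path/to/processed_data_dir')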