def test_run_process_sff_gz_through_split_lib_FLX(self):
        """run_process_sff_through_pick_otus runs without error: Convert to \
FLX"""

        # The workflow moves the mapping file into the output dir; schedule
        # the moved copy for cleanup.
        moved_mapping_file = join(self.wf_out, split(self.fasting_mapping_fp)[-1])
        self.files_to_remove.append(moved_mapping_file)

        # Run the gzipped-sff -> split-libraries workflow with FLX conversion
        # enabled.
        run_process_sff_through_split_lib(
            0,
            "Fasting_subset",
            sff_input_fp=self.sff_gz_fp,
            mapping_fp=self.fasting_mapping_fp,
            output_dir=self.wf_out,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            convert_to_flx=True,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates,
        )

        # Expected output filepaths produced by the workflow.
        split_lib_seqs_fp = join(self.wf_out, "split_libraries", "seqs.fna")
        sff_fp = join(self.wf_out, "Fasting_subset_FLX.sff")
        sff_seqs_fp = join(self.wf_out, "Fasting_subset_FLX.fna")
        sff_qual_fp = join(self.wf_out, "Fasting_subset_FLX.qual")
        sff_flow_fp = join(self.wf_out, "Fasting_subset_FLX.txt")

        # define files to remove
        self.files_to_remove.append(sff_fp)
        self.files_to_remove.append(sff_seqs_fp)
        self.files_to_remove.append(sff_qual_fp)
        self.files_to_remove.append(sff_flow_fp)

        def _head(fp):
            # Read the leading records of a file, closing the handle
            # afterwards (the original leaked the open file objects).
            f = open(fp, "U")
            try:
                return get_top_fastq_two_lines(f)
            finally:
                f.close()

        # get the head of each generated file
        split_lib_head = _head(split_lib_seqs_fp)
        raw_seq_head = _head(sff_seqs_fp)
        raw_qual_head = _head(sff_qual_fp)
        raw_flow_head = _head(sff_flow_fp)

        # check results
        self.assertEqual("".join(split_lib_head), exp_FLX_split_lib_head)
        self.assertEqual("".join(raw_seq_head), exp_FLX_raw_seq_head)
        self.assertEqual("".join(raw_qual_head), exp_FLX_raw_qual_head)
        # NOTE(review): an FLX run is compared against the Ti flow
        # expectation; presumably the flowgram text is identical for both
        # platforms here -- confirm this is intentional.
        self.assertEqual("".join(raw_flow_head), exp_Ti_raw_flow_head)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.wf_out, "log*.txt"))[0]
        self.assertTrue(getsize(log_fp) > 0)
    def test_run_process_fasta_through_split_lib(self):
        """run_run_process_fasta_through_split_lib runs without error"""

        # The workflow writes a generated mapping file; schedule it for
        # cleanup.
        self.files_to_remove.append(join(self.wf_out, "fasta_mapping_file.txt"))

        # process the sequence data
        run_process_fasta_through_split_lib(
            0,
            "Fasting_subset",
            input_fp=",".join(self.fasta_fps),
            mapping_fp=self.fasta_map_fp,
            output_dir=self.wf_out,
            command_handler=call_commands_serially,
            params=self.params,
            qiime_config=self.qiime_config,
            write_to_all_fasta=False,
            status_update_callback=no_status_updates,
        )

        # get the split-library sequence fpath
        split_lib_seqs_fp = join(self.wf_out, "split_libraries", "seqs.fna")

        # Grab the leading records, closing the handle afterwards (the
        # original leaked the open file object).
        seqs_f = open(split_lib_seqs_fp, "U")
        try:
            split_lib_head = get_top_fastq_two_lines(seqs_f)
        finally:
            seqs_f.close()

        # Compare only the sequence lines (indices 1 and 3); presumably the
        # defline labels are not stable across runs -- confirm.
        split_lib_seqs_only = [split_lib_head[1], split_lib_head[3]]
        self.assertEqual("".join(split_lib_seqs_only), exp_fasta_split_lib_seqs_only)

        # Check that the log file is created and has size > 0
        log_fp = glob(join(self.wf_out, "log*.txt"))[0]
        self.assertTrue(getsize(log_fp) > 0)
def submit_illumina_and_split_lib(data_access,fastq_files,metadata_study_id,
                                  input_dir):
    """
        Illumina Loading: This function takes the fasta filenames and using 
        that path, determines the location of the split-library and picked-otu
        files.  Once file locations have been determined, it moves the files to
        the DB machine and load the files into the DB.
    """
    
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()
    
    # check if study exists
    study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)
    
    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ"
    alphabet += alphabet.lower()
    alphabet += "01234567890"
    random_fname=''.join([choice(alphabet) for i in range(10)])
    tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S")
    
    # get fastq filenames
    fastq_filenames=fastq_files.split(',')
    seq_run_id=0  
    analysis_id=0    
    split_lib_input_checksums=[]

    ### by disabling constraints you can speed up loading as well, but shouldn't
    ### be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    #split the fastq filenames and determine filepaths
    for fastq_fname in fastq_filenames:
        input_fname, input_ext = splitext(split(fastq_fname)[-1])
        input_basename, input_ext = splitext(fastq_fname)
        
        # get analysis notes
        analysis_notes=split(input_basename)[0]
        
        # get md5 for raw fastq files
        fastq_md5 = safe_md5(open(fastq_fname)).hexdigest()
        print 'MD5 is: %s' % str(fastq_md5)

        # create an analysis row in analysis table
        if analysis_id==0:
            analysis_id=data_access.createAnalysis(metadata_study_id)
        
        # check if fastq info already loaded
        fastq_exists=data_access.checkIfSFFExists(fastq_md5)
        print 'fastq in database? %s' % str(fastq_exists)
        
        # if fastq info not loaded, then insert into DB
        if not fastq_exists:
            if seq_run_id==0:
                seq_run_id=data_access.createSequencingRun(True,'ILLUMINA',
                                                           None,seq_run_id)
            
            # get sequence count
            if fastq_fname.endswith('.gz'):
                count_seqs_cmd = "zcat %s | grep ^@ | wc -l" % (fastq_fname)
            else:
                count_seqs_cmd="grep ^@ %s | wc -l" % (fastq_fname)
            o,e,r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()
            
            # get header length and # of flows (length of seq)
            fastq_fname_open=open(fastq_fname)
            first_seq_fastq=get_top_fastq_two_lines(fastq_fname_open)
            header_length=len(first_seq_fastq[1])
            num_flows=len(first_seq_fastq[1])
            
            # insert fastq info
            valid=data_access.addSFFFileInfo(True,input_fname,
                                          seq_counts,
                                          header_length,
                                          None,
                                          num_flows,
                                          None,
                                          None,
                                          None,
                                          fastq_md5,seq_run_id)
        else:
            seq_run_id=data_access.getSeqRunIDUsingMD5(fastq_md5)
    
    print 'sequence_run_id is: %s' % str(seq_run_id)
    
    # get md5 for split-library input
    split_lib_input_md5sum=safe_md5(MD5Wrap(fastq_filenames)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed ILLUMINA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id
    
    # update analysis table with seq_run_id
    valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum