def test_fasta_to_tab_delim(self):
        """fasta_to_tab_delim converts FASTA records to tab-delimited rows"""
        # three FASTA records; each label carries the barcode annotations
        fasta_text = """>a RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
123123123
>d RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
atcasdad
>h RUN1 orig_bc=AAAA new_bc=AAAA bc_diffs=0
10 11 12"""
        fasta_lines = fasta_text.splitlines()

        # one expected row per input record; both leading ids are the value 1
        # passed as the second and third arguments below
        expected = [
            '1\t1\ta\ta\tRUN1\tAAAA\tAAAA\t0\t9\tf5bb0c8de146c67b44babbf4e6584cc0\t123123123',
            '1\t1\td\td\tRUN1\tAAAA\tAAAA\t0\t8\t1fae8caaf715bdc710b99e8c3e843092\tatcasdad',
            '1\t1\th\th\tRUN1\tAAAA\tAAAA\t0\t8\tb4c2a347f5d0453c4fdae6d5c7b5bc78\t10 11 12',
        ]

        observed = list(fasta_to_tab_delim(fasta_lines, 1, 1))
        self.assertEqual(observed, expected)
def load_split_lib_sequences(data_access,input_dir,analysis_id, seq_run_id,
                             split_lib_input_md5sum):
    """Load the split-library info, sequences and OTU failures into the DB.

    Steps performed, in order:
      1. Read the optional split-library histogram and log files.
      2. Scan the workflow log (first 'log*.txt' in input_dir) for the
         split-libraries command that was run.
      3. Register the split-library run via data_access.loadSplitLibInfo.
      4. Stream 'split_libraries/seqs.fna' through fasta_to_tab_delim and
         load it in batches via data_access.loadFNAFile.
      5. Best effort: load 'gg_97_otus/all_failures.txt' via
         data_access.loadOTUFailuresAll. Any error in this step is reported
         on stdout and otherwise ignored.

    Parameters:
      data_access: object exposing loadSplitLibInfo, loadFNAFile,
          loadOTUFailuresAll and getSFFDatabaseConnection.
      input_dir: directory holding the 'split_libraries' sub-directory, the
          'log*.txt' workflow log and (optionally) 'gg_97_otus'.
      analysis_id: passed to loadSplitLibInfo and used to look up the
          otu_picking_run_id in the 'analysis' table.
      seq_run_id: sequence run id handed to fasta_to_tab_delim.
      split_lib_input_md5sum: passed through to loadSplitLibInfo.

    Returns the split-library run id reported by loadSplitLibInfo.

    Raises:
      ValueError: if loadSplitLibInfo or loadFNAFile report a failure.
      IndexError: if input_dir contains no 'log*.txt' file.
      IOError: if the workflow log or seqs.fna cannot be opened.
    """
    # NOTE: print(...) with a single argument and raise X('msg') are used
    # throughout; both behave identically under Python 2.

    # define the split library file paths using the original fasta input
    # directory
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_hist = join(input_dir, 'split_libraries', 'histograms.txt')
    split_lib_log = join(input_dir, 'split_libraries', 'split_library_log.txt')

    # this needs to be a try/except since FASTA input does not have these
    # files; if either one is unreadable both values are reset to None
    try:
        with open(split_lib_hist) as hist_file:
            split_hist_str = hist_file.read()
        with open(split_lib_log) as split_log_file:
            split_log_str = split_log_file.read()
    except IOError:
        split_hist_str = None
        split_log_str = None

    # timestamp and svn version of Qiime used for the analysis
    # This is temporarily defined, however a script will be used to
    # determine this value
    svn_version = '1418'
    run_date = datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    print(run_date)

    # get the log file data ('U' requests universal-newline reading)
    full_log_fp = glob(join(input_dir, 'log*.txt'))[0]
    with open(full_log_fp, 'U') as workflow_log:
        log_lines = workflow_log.readlines()

    # from the workflow log file get the split-library cmd; the last matching
    # line wins, and the default below is kept when nothing matches
    split_lib_cmd = "Split-libraries was not run due to this being a FASTA-file"
    for log_line in log_lines:
        if 'split_libraries_fastq.py' in log_line:
            split_lib_cmd = log_line
        elif 'parallel_pick_otus_uclust_ref.py' in log_line:
            # pick-otus command: skipped here so it is never picked up by the
            # 'split_libraries.py' check below
            continue
        elif 'split_libraries.py' in log_line:
            split_lib_cmd = log_line

    # Insert the split-library log information in the DB
    valid, split_library_run_id = data_access.loadSplitLibInfo(
        True, analysis_id, run_date, split_lib_cmd, svn_version,
        split_log_str, split_hist_str, split_lib_input_md5sum)

    print("Split-Lib ID: %s" % split_library_run_id)
    if not valid:
        raise ValueError(
            'Error: Unable to load split-library info to database server!')

    print("Finished loading the split-library log information!")

    # process and load_fna_data
    print("starting new fna load")
    start = time.time()

    # The output values and types for each value are as follows (one entry
    # in `types` per field):
    #  0: sequence run id (integer)
    #  1: split-library run id (integer) -- presumably; inferred from the
    #     arguments passed to fasta_to_tab_delim below, TODO confirm
    #  2: sample id (text)
    #  3: barcode read group tag (text)
    #  4: read id (text)
    #  5: original barcode (text)
    #  6: new barcode (text)
    #  7: number of barcode diffs (integer)
    #  8: sequence length (integer)
    #  9: sequence md5 hash (text)
    # 10: sequence string (text)
    # NOTE(review): the exact order of fields 2-4 should be confirmed against
    # fasta_to_tab_delim.
    # define the data types for oracle
    types = ['i', 'i', 's', 's', 's', 's', 's', 'i', 'i', 'fc', 's']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    # Rebuilding the "SFF"."PK_SPLIT_LIBRARY_READ_MAP" index at this point is
    # possible but shouldn't be necessary.

    # using the generator, load the sequences; the with-block guarantees the
    # FASTA handle is closed even when a batch fails to load
    with open(split_lib_seqs) as open_fasta:
        tab_rows = fasta_to_tab_delim(open_fasta, seq_run_id,
                                      split_library_run_id)
        batches = input_set_generator(tab_rows, cur, types, buffer_size=500)
        for batch_number, res in enumerate(batches):
            print('running %i' % batch_number)
            valid = data_access.loadFNAFile(True, res)
            if not valid:
                raise ValueError(
                    'Error: Unable to load FNA file into database!')

    # time.time() measures wall-clock time; the message text is kept as-is
    end = time.time()
    print('Total processor time elapsed: %s' % str(end - start))

    print('Finished loading split_library FNA file.')

    try:
        ### MOVING THIS INTO SEQUENCE LOADING SINCE RELIES ON
        ### SPLIT_LIBRARY_READ_MAP
        # try/except necessary since some datasets are metagenomes,
        # which do not have OTU failures

        # Get otu_picking_run_id
        # NOTE(review): the statement is built by string interpolation; if
        # analysis_id can ever come from untrusted input, switch to a bind
        # variable supported by the DB driver.
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        statement = ('select otu_picking_run_id from analysis '
                     'where analysis_id=%s' % (str(analysis_id)))
        results = cur.execute(statement)
        # if the query yields no rows the name stays unbound and the
        # resulting NameError is handled by the except clause below
        for row in results:
            otu_picking_run_id = row[0]

        # get the otu-picking failures file and tag every entry with the
        # otu-picking run id
        pick_otus_failures = join(input_dir, 'gg_97_otus', 'all_failures.txt')
        with open(pick_otus_failures, 'U') as failures_file:
            otu_failures = ['%s\t%s' % (line.strip('\n'),
                                        str(otu_picking_run_id))
                            for line in failures_file]

        # define oracle data types
        types = ['s', 'i']
        # NOTE(review): a connection is requested a second time here, exactly
        # as before; whether the earlier cursor could be reused depends on
        # getSFFDatabaseConnection -- confirm before changing.
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()

        # iterate over OTU failures and load them
        failure_sets = input_set_generator(otu_failures, cur, types,
                                           buffer_size=10000)
        for set_count, input_set in enumerate(failure_sets, 1):
            valid = data_access.loadOTUFailuresAll(True, input_set)
            if not valid:
                raise ValueError(
                    'Error: Unable to load OTU failures data into database!')
            print("loading OTU failure set: %s" % set_count)

        print('Successfully loaded the OTU failures into the database!')
    except Exception:
        # best effort: report and carry on, but (unlike a bare except) let
        # KeyboardInterrupt and SystemExit propagate
        print("Unable to load OTU failures!")
    print('End of function')

    return split_library_run_id