def test_input_set_generator(self):
        """Test that input_set_generator yields column-oriented typed sets.

        Feeds five tab-delimited rows (int, str, float columns) through
        the generator with a buffer size of 2 and verifies it yields
        three column-major batches (2 + 2 + 1 rows) and then raises
        StopIteration.
        """
        # BUG FIX: renamed local from ``input`` to avoid shadowing the builtin.
        in_lines = """#int\tstr\tfloat
1\tasd\t0.0
2\t123\t0.2
3\t\t0.3
4\tzxc\t0.4
5\tqaz\t0.5""".splitlines()
        con = MockConnection()
        cursor = con.cursor()
        # one type code per column: integer, string, float
        types = ['i','s','f']

        gen = input_set_generator(in_lines,cursor,types,2,type_lookup=type_lookup_mock)
        # each expected set is [ints, strs, floats] for up to 2 rows
        exp1 = [[1,2],['asd','123'],[0.0,0.2]]
        exp2 = [[3,4],['','zxc'],[0.3,0.4]]
        exp3 = [[5],['qaz'],[0.5]]
        obs1 = gen.next()
        obs2 = gen.next()
        obs3 = gen.next()

        # generator must be exhausted after three sets
        self.assertRaises(StopIteration, gen.next)

        self.assertEqual(obs1, exp1)
        self.assertEqual(obs2, exp2)
        self.assertEqual(obs3, exp3)
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    fasta_file = dict(MinimalFastaParser(open(opts.input_file,'U')))
    fname=split(opts.input_file)[-1].split('_')
    ref_dataset=fname[0]
    if ref_dataset=='gg':
        reference_dataset='GREENGENES_REFERENCE'
    threshold=fname[1]
    print threshold
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        if opts.submit_to_test_db:
            data_access = data_access_factory(DataAccessType.qiime_test)
        else:
            data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass
        
    prokmsas=[]
    for prok_id in fasta_file:
        prokmsas.append('%s\t%s\t%s' % (str(prok_id),str(threshold),
                                            reference_dataset))
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    data_types=['s','i','s']
    for input_set in input_set_generator(prokmsas, cur,data_types):
        valid=data_access.loadSeqToSourceMap(True,input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load Sequence to Source Map!'
def load_otu_mapping(data_access, input_dir, analysis_id):
    """ Load the OTU table into the DB """
    
    # For OTU Tables
    # read in the workflow log file and determine timestamp and svn version of
    # Qiime used for the analysis
    pOTUs_threshold = '97'
    ref_set_threshold = '97'
    pOTUs_method='UCLUST_REF'
    reference_set_name='GREENGENES_REFERENCE'
    otus_log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).read()
    log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).readlines()
    
    #from the workflow log file get the pick-otus cmd
    for substr in log_str:
        if 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd=substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd=substr
    
    # define values for otu_picking_run table
    otu_run_set_id = 0
    svn_version = '1418' # This is temporarily defined, however will use script to dtermine this value
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")    
    pick_otus_map = join(input_dir, 'gg_97_otus', 'exact_uclust_ref_otus.txt')
    
    # get md5 for split-lib seq file
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_seqs_md5=safe_md5(open(split_lib_seqs)).hexdigest()
    
    # Insert the otu-picking log information in the DB
    print 'calling loadAllOTUInfo with analysis_id %s' % str(analysis_id)
    valid,new_otu_run_set_id,otu_picking_run_id=data_access.loadAllOTUInfo(True,
                                  otu_run_set_id, run_date,
                                  pOTUs_method, pOTUs_threshold,
                                  svn_version, pick_otus_cmd, otus_log_str,
                                  split_lib_seqs_md5,reference_set_name,
                                  ref_set_threshold, analysis_id)
    if not valid:
        raise ValueError, 'Error: Unable to load OTU run data into database!'
    else:
        print "Finished registering OTU run!"
    
    # define OTU mapping
    otu_map=[]
    otu_to_seqid = fields_to_dict(open(pick_otus_map, 'U'))
    for otu in otu_to_seqid:
        for sample in otu_to_seqid[otu]:
            otu_map.append('%s\t%s\t%s\t%s' % (otu,sample,new_otu_run_set_id, 
                                               reference_set_name))
    print 'Finished setting otu_map.'
    
    # define oracle data types
    types = ['s','s','i','s']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    #print 'Starting PK_SPLIT_LIBRARY_READ_MAP index rebuild...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    print 'Fisnished rebuilding index PK_SPLIT_LIBRARY_READ_MAP.'
    cur = con.cursor()
    set_count = 1
    
    # prepare the OTU table for laoding
    print 'Loading OTU Table into the database!'
    pick_otus_table = join(input_dir, 'gg_97_otus',
                           'exact_uclust_ref_otu_table.txt')
    otu_table_lines=open(pick_otus_table).readlines()
    sample_ids, otu_ids, otu_table, lineages = \
                                    parse_classic_otu_table(otu_table_lines)
    
    # convert OTU table to tab-delimited list
    otu_table_load=[]
    for i,otu in enumerate(otu_ids):
        for j,sample in enumerate(sample_ids):
            if otu_table[i][j]>0:
                otu_table_load.append("%s\t%s\t%s\t%s" % \
                                (otu,sample,new_otu_run_set_id,otu_table[i][j]))

    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    # load otu table into DB
    data_types=['s','s','i','f']   
    set_count = 0      
    for input_set in input_set_generator(otu_table_load, cur,data_types,\
                                         buffer_size=1000):
        valid=data_access.loadOTUTable(True,input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load OTU table!'
        print "loading OTU Table: %s" % set_count
        set_count += 1
    
    print 'Successfully loaded the OTU Table into the database!'
    print 'End of function' 
def load_split_lib_sequences(data_access,input_dir,analysis_id, seq_run_id,
                             split_lib_input_md5sum):
    """ This function loads the split-library seqs into DB """
    
    # define the split library file paths using the original fasta input 
    # directory
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_hist = join(input_dir, 'split_libraries', 'histograms.txt')
    split_lib_log = join(input_dir, 'split_libraries', 'split_library_log.txt')
    
    # this needs to be a try/except since FASTA files does not have these files
    try:
        split_hist_str = open(split_lib_hist).read()
        split_log_str = open(split_lib_log).read()
    except IOError:
        split_hist_str=None
        split_log_str=None
    
    # read in the workflow log file and determine timestamp and svn version of
    # Qiime used for the analysis
    svn_version = '1418' # This is temporarily defined, however will use script to dtermine this value
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    print run_date
    
    # get the log file data
    full_log_fp = glob(join(input_dir, 'log*.txt'))[0]
    full_log_str = open(full_log_fp, 'U').read()
    log_str = open(full_log_fp, 'U').readlines()
    
    split_lib_cmd="Split-libraries was not run due to this being a FASTA-file"
    pick_otus_cmd=''
    # from the workflow log file get the split-library and pick-otus cmds
    for substr in log_str:
        if 'split_libraries_fastq.py' in substr:
            split_lib_cmd=substr
        elif 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd=substr
        elif 'split_libraries.py' in substr:
            split_lib_cmd=substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd=substr

    # Insert the split-library log information in the DB
    valid,split_library_run_id=data_access.loadSplitLibInfo(True,analysis_id,\
                                     run_date, split_lib_cmd,\
                                     svn_version, split_log_str, \
                                     split_hist_str, split_lib_input_md5sum)
                                     
    print "Split-Lib ID: %s" % split_library_run_id
    if not valid:
        raise ValueError,'Error: Unable to load split-library info to database server!'
    
    print "Finished loading the split-library log information!"

    # process and load_fna_data
    print "starting new fna load"
    start = time.time()

    ''' 
    The output values and types for each value are as follows:
    0: sequence run id (integer)
    1: sample id (text)
    2: barcode read group tag (text)
    3: read id (text)    
    4: original barcode (text)
    5: new barcode (text)
    6: number of barcode diffs (integer)
    7: sequence length (integer)
    8: sequence md5 hash (text)
    9: sequence string (text)
    '''
    # define the data types for oracle
    types = ['i','i', 's', 's', 's', 's', 's', 'i', 'i', 'fc', 's']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    ### this allows for rebuilding indices but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()
    open_fasta = open(split_lib_seqs)
    iterator=0
    
    # using the generator, load the sequences
    for res in input_set_generator(fasta_to_tab_delim(open_fasta, seq_run_id,\
                                    split_library_run_id), cur, types,\
                                    buffer_size=500):
        #print str(res)
        print 'running %i' % (iterator)
        iterator=iterator+1
        valid = data_access.loadFNAFile(True, res)
        if not valid:
            raise ValueError, 'Error: Unable to load FNA file into database!'

    open_fasta.close()

    end = time.time()
    print 'Total processor time elapsed: %s' % str(end - start)

    print 'Finished loading split_library FNA file.'
    
    try:
        ### MOVING THIS INTO SEQUENCE LOADING SINCE RELIES ON SPLIT_LIBRARY_READ_MAP
        # try/except necessary since some datasets are metagenomes, 
        # which do not have OTU failures
    
        # Get otu_picking_run_id
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        statement='select otu_picking_run_id from analysis where analysis_id=%s' % (str(analysis_id))
        results = cur.execute(statement)
        for i in results:
            otu_picking_run_id=i[0]
        
        # get the otu-picking failures file
        pick_otus_failures = join(input_dir, 'gg_97_otus', 'all_failures.txt')

        lines = open(pick_otus_failures,'U')
        otu_failures = []
        for line in lines:
            otu_failures.append('%s\t%s'% (line.strip('\n'),str(otu_picking_run_id)))
        
        # define oracle data types
        types=['s','i']
        con=data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        set_count = 1
        
        # iterate over OTU failures and load them 
        for input_set in input_set_generator(otu_failures, cur, types, buffer_size=10000):
            valid = data_access.loadOTUFailuresAll(True, input_set)
            if not valid:
                raise ValueError, 'Error: Unable to load OTU failures data into database!'
            print "loading OTU failure set: %s" % set_count
            set_count += 1

        print 'Successfully loaded the OTU failures into the database!'
    except:
        print "Unable to load OTU failures!"
    print 'End of function'
    
    return split_library_run_id