def test_input_set_generator(self):
    """Test that we're generating data sets to input"""
    input_lines = """#int\tstr\tfloat
1\tasd\t0.0
2\t123\t0.2
3\t\t0.3
4\tzxc\t0.4
5\tqaz\t0.5""".splitlines()
    con = MockConnection()
    cursor = con.cursor()
    types = ['i', 's', 'f']
    gen = input_set_generator(input_lines, cursor, types, 2,
                              type_lookup=type_lookup_mock)

    exp1 = [[1, 2], ['asd', '123'], [0.0, 0.2]]
    exp2 = [[3, 4], ['', 'zxc'], [0.3, 0.4]]
    exp3 = [[5], ['qaz'], [0.5]]

    obs1 = gen.next()
    obs2 = gen.next()
    obs3 = gen.next()
    self.assertRaises(StopIteration, gen.next)

    self.assertEqual(obs1, exp1)
    self.assertEqual(obs2, exp2)
    self.assertEqual(obs3, exp3)
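# The pieces this test exercises are defined elsewhere; what follows is a
# minimal, illustrative reconstruction inferred from the assertions above,
# NOT the production code. In particular, the real input_set_generator
# presumably uses the cursor to build cx_Oracle bind arrays, whereas this
# sketch ignores the cursor and yields plain Python lists.

class MockConnection(object):
    """Stand-in for a cx_Oracle connection; its cursor is never used."""
    def cursor(self):
        return None

def type_lookup_mock(type_code):
    """Map a one-letter type code to a plain Python converter."""
    return {'i': int, 's': str, 'f': float}[type_code]

def input_set_generator(data, cursor, types, buffer_size=1000,
                        type_lookup=type_lookup_mock):
    """Yield column-major batches of at most buffer_size rows.

    Lines starting with '#' are skipped as headers; every other line is
    split on tabs and each field is converted with the callable that
    type_lookup returns for its column's type code.
    """
    rows = []
    for line in data:
        line = line.rstrip('\n')
        if not line or line.startswith('#'):
            continue
        rows.append([type_lookup(code)(field)
                     for code, field in zip(types, line.split('\t'))])
        if len(rows) == buffer_size:
            yield [list(col) for col in zip(*rows)]
            rows = []
    if rows:
        # flush the final, possibly short, batch
        yield [list(col) for col in zip(*rows)]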
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    fasta_file = dict(MinimalFastaParser(open(opts.input_file, 'U')))

    # the reference dataset and clustering threshold are encoded in the
    # input filename, e.g. gg_97_...
    fname = split(opts.input_file)[-1].split('_')
    ref_dataset = fname[0]
    if ref_dataset == 'gg':
        reference_dataset = 'GREENGENES_REFERENCE'
    else:
        # fail loudly rather than hitting a NameError on reference_dataset
        # below
        raise ValueError, 'Unrecognized reference dataset: %s' % ref_dataset
    threshold = fname[1]
    print threshold

    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig, DataAccessType
        import cx_Oracle
        if opts.submit_to_test_db:
            data_access = data_access_factory(DataAccessType.qiime_test)
        else:
            data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"

    # build one tab-delimited row per sequence:
    # sequence id, threshold, reference set name
    prokmsas = []
    for prok_id in fasta_file:
        prokmsas.append('%s\t%s\t%s' % (str(prok_id), str(threshold),
                                        reference_dataset))

    # load the sequence-to-source map in batches
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    data_types = ['s', 'i', 's']
    for input_set in input_set_generator(prokmsas, cur, data_types):
        valid = data_access.loadSeqToSourceMap(True, input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load Sequence to Source Map!'
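# The one-letter codes in data_types ('s', 'i', 'f', plus 'fc' in the
# sequence loader below) are resolved by the production type_lookup into
# cx_Oracle bind types. A plausible mapping, shown purely as an
# illustration; the real lookup table is not part of this module:

import cx_Oracle

ORACLE_BIND_TYPES = {
    'i': cx_Oracle.NUMBER,        # integer columns
    'f': cx_Oracle.NATIVE_FLOAT,  # float columns
    's': cx_Oracle.STRING,        # variable-length text
    'fc': cx_Oracle.FIXED_CHAR,   # fixed-width text, e.g. md5 hex digests
}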
def load_otu_mapping(data_access, input_dir, analysis_id):
    """ Load the OTU table into the DB """

    # For OTU Tables
    # read in the workflow log file and determine timestamp and svn version
    # of QIIME used for the analysis
    pOTUs_threshold = '97'
    ref_set_threshold = '97'
    pOTUs_method = 'UCLUST_REF'
    reference_set_name = 'GREENGENES_REFERENCE'
    otus_log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).read()
    log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).readlines()

    # from the workflow log file get the pick-otus cmd
    for substr in log_str:
        if 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd = substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd = substr

    # define values for otu_picking_run table
    otu_run_set_id = 0
    # temporarily hard-coded; will eventually be determined by script
    svn_version = '1418'
    run_date = datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    pick_otus_map = join(input_dir, 'gg_97_otus', 'exact_uclust_ref_otus.txt')

    # get md5 for the split-library sequence file
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_seqs_md5 = safe_md5(open(split_lib_seqs)).hexdigest()

    # insert the otu-picking log information into the DB
    print 'calling loadAllOTUInfo with analysis_id %s' % str(analysis_id)
    valid, new_otu_run_set_id, otu_picking_run_id = data_access.loadAllOTUInfo(
        True, otu_run_set_id, run_date, pOTUs_method, pOTUs_threshold,
        svn_version, pick_otus_cmd, otus_log_str, split_lib_seqs_md5,
        reference_set_name, ref_set_threshold, analysis_id)
    if not valid:
        raise ValueError, 'Error: Unable to load OTU run data into database!'
    print "Finished registering OTU run!"

    # define OTU mapping
    otu_map = []
    otu_to_seqid = fields_to_dict(open(pick_otus_map, 'U'))
    for otu in otu_to_seqid:
        for sample in otu_to_seqid[otu]:
            otu_map.append('%s\t%s\t%s\t%s' % (otu, sample,
                                               new_otu_run_set_id,
                                               reference_set_name))
    print 'Finished setting otu_map.'

    # define oracle data types
    types = ['s', 's', 'i', 's']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    ### index rebuild is disabled; shouldn't be necessary
    #print 'Starting PK_SPLIT_LIBRARY_READ_MAP index rebuild...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #print 'Finished rebuilding index PK_SPLIT_LIBRARY_READ_MAP.'
    cur = con.cursor()
    set_count = 1

    # prepare the OTU table for loading
    print 'Loading OTU Table into the database!'
    pick_otus_table = join(input_dir, 'gg_97_otus',
                           'exact_uclust_ref_otu_table.txt')
    otu_table_lines = open(pick_otus_table).readlines()
    sample_ids, otu_ids, otu_table, lineages = \
        parse_classic_otu_table(otu_table_lines)

    # convert the OTU table to a tab-delimited list, keeping only
    # non-zero counts
    otu_table_load = []
    for i, otu in enumerate(otu_ids):
        for j, sample in enumerate(sample_ids):
            if otu_table[i][j] > 0:
                otu_table_load.append("%s\t%s\t%s\t%s" %
                    (otu, sample, new_otu_run_set_id, otu_table[i][j]))

    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    # load the OTU table into the DB in batches
    data_types = ['s', 's', 'i', 'f']
    set_count = 0
    for input_set in input_set_generator(otu_table_load, cur, data_types,
                                         buffer_size=1000):
        valid = data_access.loadOTUTable(True, input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load OTU table!'
        print "loading OTU Table: %s" % set_count
        set_count += 1

    print 'Successfully loaded the OTU Table into the database!'
    print 'End of function'
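# For reference: each line of exact_uclust_ref_otus.txt is an OTU ID
# followed by the tab-separated IDs of its member sequences, and
# fields_to_dict keys each line on its first field. A tiny illustrative
# example (the sequence IDs are made up):
#
#   >>> fields_to_dict(['0\tPC.634_1\tPC.634_5', '1\tPC.481_2'])
#   {'0': ['PC.634_1', 'PC.634_5'], '1': ['PC.481_2']}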
def load_split_lib_sequences(data_access, input_dir, analysis_id, seq_run_id,
                             split_lib_input_md5sum):
    """ This function loads the split-library seqs into the DB """

    # define the split library file paths using the original fasta input
    # directory
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_hist = join(input_dir, 'split_libraries', 'histograms.txt')
    split_lib_log = join(input_dir, 'split_libraries',
                         'split_library_log.txt')

    # this needs to be a try/except since FASTA inputs do not have these
    # files
    try:
        split_hist_str = open(split_lib_hist).read()
        split_log_str = open(split_lib_log).read()
    except IOError:
        split_hist_str = None
        split_log_str = None

    # read in the workflow log file and determine timestamp and svn version
    # of QIIME used for the analysis
    # temporarily hard-coded; will eventually be determined by script
    svn_version = '1418'
    run_date = datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    print run_date

    # get the log file data
    full_log_fp = glob(join(input_dir, 'log*.txt'))[0]
    full_log_str = open(full_log_fp, 'U').read()
    log_str = open(full_log_fp, 'U').readlines()
    split_lib_cmd = "Split-libraries was not run due to this being a FASTA-file"
    pick_otus_cmd = ''

    # from the workflow log file get the split-library and pick-otus cmds
    for substr in log_str:
        if 'split_libraries_fastq.py' in substr:
            split_lib_cmd = substr
        elif 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd = substr
        elif 'split_libraries.py' in substr:
            split_lib_cmd = substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd = substr

    # insert the split-library log information into the DB
    valid, split_library_run_id = data_access.loadSplitLibInfo(
        True, analysis_id, run_date, split_lib_cmd, svn_version,
        split_log_str, split_hist_str, split_lib_input_md5sum)
    print "Split-Lib ID: %s" % split_library_run_id
    if not valid:
        raise ValueError, \
            'Error: Unable to load split-library info to database server!'
    print "Finished loading the split-library log information!"

    # process and load the FNA data
    print "starting new fna load"
    start = time.time()

    '''
    The output values and types for each value are as follows:
     0: sequence run id (integer)
     1: split library run id (integer)
     2: sample id (text)
     3: barcode read group tag (text)
     4: read id (text)
     5: original barcode (text)
     6: new barcode (text)
     7: number of barcode diffs (integer)
     8: sequence length (integer)
     9: sequence md5 hash (text)
    10: sequence string (text)
    '''
    # define the data types for oracle
    types = ['i', 'i', 's', 's', 's', 's', 's', 'i', 'i', 'fc', 's']

    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    ### this allows for rebuilding indices but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()

    open_fasta = open(split_lib_seqs)
    iterator = 0
    # using the generator, load the sequences in batches
    # (a hedged sketch of fasta_to_tab_delim appears after this function)
    for res in input_set_generator(fasta_to_tab_delim(open_fasta, seq_run_id,
                                                      split_library_run_id),
                                   cur, types, buffer_size=500):
        #print str(res)
        print 'running %i' % iterator
        iterator = iterator + 1
        valid = data_access.loadFNAFile(True, res)
        if not valid:
            raise ValueError, 'Error: Unable to load FNA file into database!'
    open_fasta.close()

    end = time.time()
    print 'Total processor time elapsed: %s' % str(end - start)
    print 'Finished loading split_library FNA file.'
    ### MOVING THIS INTO SEQUENCE LOADING SINCE IT RELIES ON
    ### SPLIT_LIBRARY_READ_MAP
    # try/except is necessary since some datasets are metagenomes, which do
    # not have an OTU failures file
    try:
        # get the otu_picking_run_id for this analysis
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        statement = ('select otu_picking_run_id from analysis '
                     'where analysis_id=%s' % str(analysis_id))
        results = cur.execute(statement)
        for i in results:
            otu_picking_run_id = i[0]

        # get the otu-picking failures file
        pick_otus_failures = join(input_dir, 'gg_97_otus',
                                  'all_failures.txt')
        lines = open(pick_otus_failures, 'U')

        otu_failures = []
        for line in lines:
            otu_failures.append('%s\t%s' % (line.strip('\n'),
                                            str(otu_picking_run_id)))

        # define oracle data types
        types = ['s', 'i']
        con = data_access.getSFFDatabaseConnection()
        cur = con.cursor()
        set_count = 1

        # iterate over the OTU failures and load them in batches
        for input_set in input_set_generator(otu_failures, cur, types,
                                             buffer_size=10000):
            valid = data_access.loadOTUFailuresAll(True, input_set)
            if not valid:
                raise ValueError, \
                    'Error: Unable to load OTU failures data into database!'
            print "loading OTU failure set: %s" % set_count
            set_count += 1

        print 'Successfully loaded the OTU failures into the database!'
    except IOError:
        # metagenome runs have no all_failures.txt, so skip failure loading
        # (narrowed from a bare except so real load errors still propagate)
        print "Unable to load OTU failures!"

    print 'End of function'

    return split_library_run_id
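# fasta_to_tab_delim is defined elsewhere; the sketch below is a hedged
# reconstruction assuming the standard split_libraries header layout
# (>SampleID_readnum read_id orig_bc=... new_bc=... bc_diffs=N). The
# production implementation may differ, hence the _sketch suffix.

from hashlib import md5

def fasta_to_tab_delim_sketch(fasta_f, seq_run_id, split_library_run_id):
    """Yield one tab-delimited row per read, in the column order listed
    in load_split_lib_sequences."""
    for label, seq in MinimalFastaParser(fasta_f):
        parts = label.split()
        read_group_tag = parts[0]                     # e.g. PC.634_1
        sample_id = read_group_tag.rsplit('_', 1)[0]  # e.g. PC.634
        read_id = parts[1]
        orig_bc = parts[2].split('=')[1]              # orig_bc=...
        new_bc = parts[3].split('=')[1]               # new_bc=...
        bc_diffs = parts[4].split('=')[1]             # bc_diffs=N
        yield '\t'.join(map(str, [seq_run_id, split_library_run_id,
                                  sample_id, read_group_tag, read_id,
                                  orig_bc, new_bc, bc_diffs, len(seq),
                                  md5(seq).hexdigest(), seq]))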