def test_get_header_info(self):
    """get_header_info should return a sff file common header as a dict"""
    observed = get_header_info(self.rec)
    # 11 fields are expected in the common header
    self.assertEqual(len(observed), 11)
    # spot-check the key-related fields
    expected_fields = [('Key Length', '4'), ('Key Sequence', 'TCAG')]
    for field, value in expected_fields:
        self.assertEqual(observed[field], value)
def submit_sff_and_split_lib(data_access,fasta_files,metadata_study_id): """ SFF Loading: This function takes the fasta filenames and using that path, determines the location of the split-library and picked-otu files. Once file locations have been determined, it moves the files to the DB machine and load the files into the DB. """ # get database connection and cursor con = data_access.getSFFDatabaseConnection() cur = con.cursor() ### this may help in speeding up loading but shouldn't be necessary #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...' #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ') #cur = con.cursor() # check if study exists study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id) print "Study ID exists: " + str(study_id_exists) # create a temp filename alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ" alphabet += alphabet.lower() alphabet += "01234567890" random_fname=''.join([choice(alphabet) for i in range(10)]) tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S") # get a list of filenames fasta_filenames=fasta_files.split(',') seq_run_id=0 analysis_id=0 split_lib_input_checksums=[] fasta_qual_files=[] ### by disabling constraints you can speed up loading as well, but shouldn't ### be necessary #valid = data_access.disableTableConstraints() #print "Disabled table constraints" #split the fasta filenames and determine filepaths for fasta_fname in fasta_filenames: input_fname, input_ext = splitext(split(fasta_fname)[-1]) input_basename, input_ext = splitext(fasta_fname) input_dir = split(input_basename)[:-1][0] # get the sff basename if re.search('0\d$', input_fname)==None or re.search('0\d$', input_fname).group()==None: sff_basename=input_fname else: sff_basename=input_fname[:-2] if re.search('0\d_FLX$', sff_basename)==None or re.search('0\d_FLX$', sff_basename).group()==None: sff_basename=sff_basename else: sff_basename=sff_basename[:-6] print 'sff_basename: %s' % sff_basename # get analysis notes 
analysis_notes=split(input_basename)[0] # using the fasta basename, define qual and flow files qual_fname=join(input_basename+'.qual') flow_fname=join(input_basename+'.txt') fasta_qual_files.append(fasta_fname) fasta_qual_files.append(qual_fname) # Run the Oracle process_sff_files load package ## Get the location and name of the SFF file, get it's MD5. .SFF is one # directory up from the other files rev = dirname(fasta_fname)[::-1] # check for sffs in the processed folder...only occurs for Ti processing sffs_in_processed_folder = glob(join(input_dir, '*_FLX.sff')) if len(sffs_in_processed_folder) == 0: sff_file_dir = split(input_dir)[0] else: sff_file_dir=input_dir # get SFF file sff_file = join(sff_file_dir, input_fname + '.sff') # get md5 of SFF sff_md5 = safe_md5(open(sff_file)).hexdigest() print 'MD5 is: %s' % str(sff_md5) # create an analysis if analysis_id==0: analysis_id=data_access.createAnalysis(metadata_study_id) # check if SFF info was already loaded into DB sff_exists=data_access.checkIfSFFExists(sff_md5) print 'sff in database? 
%s' % str(sff_exists) #if True: if not sff_exists: print 'flow_fname: %s' % flow_fname sff_header=get_header_info(open(flow_fname)) # get instrument info if sff_header['# of Flows']=='400': instrument_code='GS FLX' elif sff_header['# of Flows']=='168': instrument_code='GS2-' elif sff_header['# of Flows']=='800': instrument_code='Titanium' else: instrument_code='UNKNOWN' print 'Instrument Code: %s' % instrument_code # load SFF info if seq_run_id==0: seq_run_id=data_access.createSequencingRun(True,instrument_code, sff_header['Version'],seq_run_id) valid=data_access.addSFFFileInfo(True,sff_basename, sff_header['# of Reads'], sff_header['Header Length'], sff_header['Key Length'], sff_header['# of Flows'], sff_header['Flowgram Code'], sff_header['Flow Chars'], sff_header['Key Sequence'], sff_md5,seq_run_id) else: valid=data_access.addSFFFileInfo(True,sff_basename, sff_header['# of Reads'], sff_header['Header Length'], sff_header['Key Length'], sff_header['# of Flows'], sff_header['Flowgram Code'], sff_header['Flow Chars'], sff_header['Key Sequence'], sff_md5,seq_run_id) else: seq_run_id=data_access.getSeqRunIDUsingMD5(sff_md5) print 'sequence_run_id is: %s' % str(seq_run_id) # get md5 of fna/qual files print fasta_qual_files split_lib_input_md5sum=safe_md5(MD5Wrap(fasta_qual_files)).hexdigest() print split_lib_input_md5sum print 'Finished loading the processed SFF data!' print 'Run ID: %s' % seq_run_id print 'Analysis ID: %s' % analysis_id # add seq_run_id to Analysis table valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id) if not valid: raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!' return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum