# Example #1
 def test_get_header_info(self):
     """get_header_info should parse an sff common header into a dict."""
     info = get_header_info(self.rec)
     self.assertEqual(len(info), 11)
     expected = {'Key Length': '4', 'Key Sequence': 'TCAG'}
     for field, value in expected.items():
         self.assertEqual(info[field], value)
def submit_sff_and_split_lib(data_access,fasta_files,metadata_study_id):
    """
       SFF Loading: This function takes the fasta filenames and using that path,
       determines the location of the split-library and picked-otu files.  Once 
       file locations have been determined, it moves the files to the DB machine
       and load the files into the DB.
    """
    # get database connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()
    
    # check if study exists
    study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # create a temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ"
    alphabet += alphabet.lower()
    alphabet += "01234567890"
    random_fname=''.join([choice(alphabet) for i in range(10)])
    tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S")
    
    # get a list of filenames
    fasta_filenames=fasta_files.split(',')
    seq_run_id=0  
    analysis_id=0    
    split_lib_input_checksums=[]
    fasta_qual_files=[]
    
    ### by disabling constraints you can speed up loading as well, but shouldn't
    ### be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    #split the fasta filenames and determine filepaths
    for fasta_fname in fasta_filenames:
        input_fname, input_ext = splitext(split(fasta_fname)[-1])
        input_basename, input_ext = splitext(fasta_fname)
        input_dir = split(input_basename)[:-1][0]
        
        # get the sff basename
        if re.search('0\d$', input_fname)==None or re.search('0\d$', 
                                                    input_fname).group()==None:
            sff_basename=input_fname
        else:
            sff_basename=input_fname[:-2]

        if re.search('0\d_FLX$', sff_basename)==None or re.search('0\d_FLX$', 
                                                    sff_basename).group()==None:
            sff_basename=sff_basename
        else:
            sff_basename=sff_basename[:-6]

        print 'sff_basename: %s' % sff_basename

        # get analysis notes
        analysis_notes=split(input_basename)[0]
        
        # using the fasta basename, define qual and flow files
        qual_fname=join(input_basename+'.qual')
        flow_fname=join(input_basename+'.txt')
        fasta_qual_files.append(fasta_fname)
        fasta_qual_files.append(qual_fname)
        
        # Run the Oracle process_sff_files load package
        ## Get the location and name of the SFF file, get it's MD5. .SFF is one 
        # directory up from the other files
        rev = dirname(fasta_fname)[::-1]
        
        # check for sffs in the processed folder...only occurs for Ti processing
        sffs_in_processed_folder = glob(join(input_dir, '*_FLX.sff'))
        if len(sffs_in_processed_folder) == 0:
            sff_file_dir = split(input_dir)[0]
        else:
            sff_file_dir=input_dir
        
        # get SFF file
        sff_file = join(sff_file_dir, input_fname + '.sff')
        
        # get md5 of SFF
        sff_md5 = safe_md5(open(sff_file)).hexdigest()
        
        print 'MD5 is: %s' % str(sff_md5)

        # create an analysis
        if analysis_id==0:
            analysis_id=data_access.createAnalysis(metadata_study_id)
        
        # check if SFF info was already loaded into DB
        sff_exists=data_access.checkIfSFFExists(sff_md5)
        print 'sff in database? %s' % str(sff_exists)
        
        #if True:
        if not sff_exists:
            print 'flow_fname: %s' % flow_fname
            sff_header=get_header_info(open(flow_fname))
            
            # get instrument info
            if sff_header['# of Flows']=='400':
                instrument_code='GS FLX'
            elif sff_header['# of Flows']=='168':
                instrument_code='GS2-'
            elif sff_header['# of Flows']=='800':
                instrument_code='Titanium'
            else:
                instrument_code='UNKNOWN'
            print 'Instrument Code: %s' % instrument_code
            
            # load SFF info
            if seq_run_id==0:
                seq_run_id=data_access.createSequencingRun(True,instrument_code,
                                            sff_header['Version'],seq_run_id)
                valid=data_access.addSFFFileInfo(True,sff_basename,
                                             sff_header['# of Reads'],
                                             sff_header['Header Length'],
                                             sff_header['Key Length'],
                                             sff_header['# of Flows'],
                                             sff_header['Flowgram Code'],
                                             sff_header['Flow Chars'],
                                             sff_header['Key Sequence'],
                                             sff_md5,seq_run_id)
            else:
                valid=data_access.addSFFFileInfo(True,sff_basename,
                                             sff_header['# of Reads'],
                                             sff_header['Header Length'],
                                             sff_header['Key Length'],
                                             sff_header['# of Flows'],
                                             sff_header['Flowgram Code'],
                                             sff_header['Flow Chars'],
                                             sff_header['Key Sequence'],
                                             sff_md5,seq_run_id)

        else:
            seq_run_id=data_access.getSeqRunIDUsingMD5(sff_md5)
    
    print 'sequence_run_id is: %s' % str(seq_run_id)
            
    
    # get md5 of fna/qual files
    print fasta_qual_files
    split_lib_input_md5sum=safe_md5(MD5Wrap(fasta_qual_files)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed SFF data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id
    
    # add seq_run_id to Analysis table
    valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum
# Example #3
 def test_get_header_info(self):
     """get_header_info should return a dict of sff common-header fields."""
     parsed = get_header_info(self.rec)
     self.assertEqual(
         (len(parsed), parsed['Key Length'], parsed['Key Sequence']),
         (11, '4', 'TCAG'))