Example #1
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    
    # if this is not a demultiplexed run, 
    if barcode_type == 'not-barcoded':
        if sample_id is None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")
    
    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error(
             "Only valid phred offsets are: %s" %
             ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file 
        # figure it out...
        phred_to_ascii_f = None
    
    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')
    
    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)
    
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None
    
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    
    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping "
                            "files must be provided.")
    
    output_dir = opts.output_dir
    create_dir(output_dir)
    
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    
    if store_qual_scores:
        qual_f = open(qual_fp_temp,'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h,q):
            qual_f.write('>%s\n%s\n' % (h,q))
    else:
        def qual_writer(h,q):
            pass
    
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp,'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h,s,q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
    else:
        def fastq_writer(h,s,q):
            pass
    
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp,'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp,'w')
    
    for sequence_read_fp, barcode_read_fp, mapping_fp in\
      zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
           check_map(mapping_f, 
                     disable_primer_check=True, 
                     has_barcodes=barcode_read_fp is not None)
        
        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = \
             dict([(DNA.rc(k),v) for k,v in barcode_to_sample_id.items()])
        
        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
             get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay codes. "+\
                "Do they need to be reverse complimented? If these are not "+\
                "golay barcodes pass --barcode_type 12 to disable barcode "+\
                "error correction, or pass --barcode_type # if the barcodes "+\
                "are not 12 base pairs, where # is the size of the barcodes. "+
                "  Invalid codes:\n\t%s" % \
                ' '.join(invalid_golay_barcodes))
        
        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %\
          (mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %\
          (sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest())))
       
        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp,'U')
        

        seq_id = start_seq_id
        
        if barcode_read_fp is not None:
            
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\
              (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))
              
            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp,'U')
            seq_generator = process_fastq_single_end_read_file(
               sequence_read_f,
               barcode_read_f,
               barcode_to_sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               rev_comp_barcode=rev_comp_barcode,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               barcode_correction_fn=barcode_correction_fn,
               max_barcode_errors=max_barcode_errors,
               phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
               sequence_read_f,
               sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               phred_to_ascii_f=phred_to_ascii_f)
        
        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header,sequence))
            qual_writer(fasta_header,quality)
            fastq_writer(fasta_header,sequence,quality)
            
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')
        
    output_f.close()
    rename(output_fp_temp,output_fp)
    
    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp,qual_fp)
    
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp,output_fastq_fp)
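
The qual_writer/fastq_writer definitions above decide once, before the read loop, whether output is stored, rather than re-checking store_qual_scores on every read. A minimal standalone sketch of the same pattern (names here are illustrative, not QIIME's):

def make_qual_writer(store_qual, qual_f):
    # Bind either a real writer or a no-op up front so the
    # per-read loop never has to branch on store_qual.
    if store_qual:
        def qual_writer(header, qual):
            qual_f.write('>%s\n%s\n' % (header, qual))
    else:
        def qual_writer(header, qual):
            pass  # no-op: qual strings are discarded
    return qual_writer

# usage: writer = make_qual_writer(False, None); writer('seq1', 'IIII')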
Example #2
def log_input_md5s(logger, fps):
    logger.write("Input file md5 sums:\n")
    for fp in fps:
        if fp is not None:
            logger.write("%s: %s\n" % (fp, safe_md5(open(fp)).hexdigest()))
    logger.write("\n")
def submit_fasta_and_split_lib(data_access,fasta_files,metadata_study_id,
                                  input_dir):
    """
        FASTA Loading: This function takes the fasta filenames and, using that
        path, determines the location of the split-library and picked-otu files.
        Once file locations have been determined, it moves the files to the DB
        machine and loads the files into the DB.
    """
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    # check if study exists
    study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)
    
    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ"
    alphabet += alphabet.lower()
    alphabet += "01234567890"
    random_fname=''.join([choice(alphabet) for i in range(10)])
    tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S")
    
    # get fasta filenames
    fasta_filenames=fasta_files.split(',')
    seq_run_id=0  
    analysis_id=0    
    split_lib_input_checksums=[]

    ### by disabling constraints you can speed up loading as well, but shouldn't
    ### be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    # split the fasta filenames and determine filepaths
    for fasta_fname in fasta_filenames:
        input_fname, input_ext = splitext(split(fasta_fname)[-1])
        input_basename, input_ext = splitext(fasta_fname)
        
        # define analysis notes
        analysis_notes=split(input_basename)[0]
        
        # get md5 for raw fasta files
        fasta_md5 = safe_md5(open(fasta_fname)).hexdigest()
        print 'MD5 is: %s' % str(fasta_md5)

        # create an analysis row in analysis table
        if analysis_id==0:
            analysis_id=data_access.createAnalysis(metadata_study_id)
        
        # check if fasta info already loaded
        fasta_exists=data_access.checkIfSFFExists(fasta_md5)
        print 'fasta in database? %s' % str(fasta_exists)
        
        # if fasta info not loaded, then insert into DB
        if not fasta_exists:
            if seq_run_id==0:
                seq_run_id=data_access.createSequencingRun(True,'FASTA',
                                                           None,seq_run_id)
            
            # get sequence count 
            count_seqs_cmd="grep '^>' %s | wc -l" % (fasta_fname)
            o,e,r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()
            
            # add fasta info
            valid=data_access.addSFFFileInfo(True,input_fname,
                                          seq_counts,
                                          None,
                                          None,
                                          None,
                                          None,
                                          None,
                                          None,
                                          fasta_md5,seq_run_id)
        else:
            seq_run_id=data_access.getSeqRunIDUsingMD5(fasta_md5)
             
    print 'sequence_run_id is: %s' % str(seq_run_id)
    
    # get md5 sum for input to split-libraries
    split_lib_input_md5sum=safe_md5(MD5Wrap(fasta_filenames)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed FASTA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id

    # update analysis table with seq_run_id
    valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'
                                              
    return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum
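
safe_md5 (imported from QIIME's util module) digests an open file handle, and MD5Wrap lets several files feed one checksum stream. A minimal chunked-MD5 helper matching the behavior assumed above (a sketch, not QIIME's implementation):

import hashlib

def chunked_md5(f, block_size=65536):
    # Read fixed-size blocks so large FASTA/FASTQ files never have
    # to fit in memory; returns a hashlib digest object, so callers
    # can ask for .hexdigest() just as they do with safe_md5.
    digest = hashlib.md5()
    for block in iter(lambda: f.read(block_size), b''):
        digest.update(block)
    return digest

# usage: chunked_md5(open(fasta_fname, 'rb')).hexdigest()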
Example #4
def log_input_md5s(logger,fps):
    logger.write("Input file md5 sums:\n")
    for fp in fps:
        if fp is not None:
            logger.write("%s: %s\n" % (fp, safe_md5(open(fp)).hexdigest()))
    logger.write("\n")
def submit_illumina_and_split_lib(data_access,fastq_files,metadata_study_id,
                                  input_dir):
    """
        Illumina Loading: This function takes the fastq filenames and, using
        that path, determines the location of the split-library and picked-otu
        files.  Once file locations have been determined, it moves the files to
        the DB machine and loads the files into the DB.
    """
    
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()
    
    # check if study exists
    study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)
    
    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ"
    alphabet += alphabet.lower()
    alphabet += "01234567890"
    random_fname=''.join([choice(alphabet) for i in range(10)])
    tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S")
    
    # get fastq filenames
    fastq_filenames=fastq_files.split(',')
    seq_run_id=0  
    analysis_id=0    
    split_lib_input_checksums=[]

    ### by disabling constraints you can speed up loading as well, but shouldn't
    ### be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    #split the fastq filenames and determine filepaths
    for fastq_fname in fastq_filenames:
        input_fname, input_ext = splitext(split(fastq_fname)[-1])
        input_basename, input_ext = splitext(fastq_fname)
        
        # get analysis notes
        analysis_notes=split(input_basename)[0]
        
        # get md5 for raw fastq files
        fastq_md5 = safe_md5(open(fastq_fname)).hexdigest()
        print 'MD5 is: %s' % str(fastq_md5)

        # create an analysis row in analysis table
        if analysis_id==0:
            analysis_id=data_access.createAnalysis(metadata_study_id)
        
        # check if fastq info already loaded
        fastq_exists=data_access.checkIfSFFExists(fastq_md5)
        print 'fastq in database? %s' % str(fastq_exists)
        
        # if fastq info not loaded, then insert into DB
        if not fastq_exists:
            if seq_run_id==0:
                seq_run_id=data_access.createSequencingRun(True,'ILLUMINA',
                                                           None,seq_run_id)
            
            # get sequence count
            if fastq_fname.endswith('.gz'):
                count_seqs_cmd = "zcat %s | grep ^@ | wc -l" % (fastq_fname)
            else:
                count_seqs_cmd="grep ^@ %s | wc -l" % (fastq_fname)
            o,e,r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()
            
            # get header length and # of flows (length of seq)
            fastq_fname_open=open(fastq_fname)
            first_seq_fastq=get_top_fastq_two_lines(fastq_fname_open)
            header_length=len(first_seq_fastq[0])
            num_flows=len(first_seq_fastq[1])
            
            # insert fastq info
            valid=data_access.addSFFFileInfo(True,input_fname,
                                          seq_counts,
                                          header_length,
                                          None,
                                          num_flows,
                                          None,
                                          None,
                                          None,
                                          fastq_md5,seq_run_id)
        else:
            seq_run_id=data_access.getSeqRunIDUsingMD5(fastq_md5)
    
    print 'sequence_run_id is: %s' % str(seq_run_id)
    
    # get md5 for split-library input
    split_lib_input_md5sum=safe_md5(MD5Wrap(fastq_filenames)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed ILLUMINA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id
    
    # update analysis table with seq_run_id
    valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum
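
One caveat with the sequence counting above: "grep ^@" can overcount, because FASTQ quality lines may also begin with '@'. A safer pure-Python count, assuming well-formed four-line records (a sketch, not what the loader actually runs):

def count_fastq_records(f):
    # Every well-formed FASTQ record is exactly four lines, so the
    # record count is simply the line count divided by four.
    n_lines = sum(1 for _ in f)
    return n_lines // 4

# usage: count_fastq_records(open(fastq_fname))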
def submit_sff_and_split_lib(data_access,fasta_files,metadata_study_id):
    """
       SFF Loading: This function takes the fasta filenames and, using that path,
       determines the location of the split-library and picked-otu files.  Once
       file locations have been determined, it moves the files to the DB machine
       and loads the files into the DB.
    """
    # get database connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()
    
    # check if study exists
    study_id_exists=data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # create a temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUZWXYZ"
    alphabet += alphabet.lower()
    alphabet += "01234567890"
    random_fname=''.join([choice(alphabet) for i in range(10)])
    tmp_filename ='_'+random_fname+'_'+strftime("%Y_%m_%d_%H_%M_%S")
    
    # get a list of filenames
    fasta_filenames=fasta_files.split(',')
    seq_run_id=0  
    analysis_id=0    
    split_lib_input_checksums=[]
    fasta_qual_files=[]
    
    ### by disabling constraints you can speed up loading as well, but shouldn't
    ### be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    #split the fasta filenames and determine filepaths
    for fasta_fname in fasta_filenames:
        input_fname, input_ext = splitext(split(fasta_fname)[-1])
        input_basename, input_ext = splitext(fasta_fname)
        input_dir = split(input_basename)[0]
        
        # get the sff basename
        match = re.search('0\d$', input_fname)
        if match is None:
            sff_basename=input_fname
        else:
            sff_basename=input_fname[:-2]

        if re.search('0\d_FLX$', sff_basename) is not None:
            sff_basename=sff_basename[:-6]

        print 'sff_basename: %s' % sff_basename

        # get analysis notes
        analysis_notes=split(input_basename)[0]
        
        # using the fasta basename, define qual and flow files
        qual_fname=input_basename+'.qual'
        flow_fname=input_basename+'.txt'
        fasta_qual_files.append(fasta_fname)
        fasta_qual_files.append(qual_fname)
        
        # Run the Oracle process_sff_files load package
        ## Get the location and name of the SFF file, get its MD5. .SFF is one
        # directory up from the other files
        
        # check for sffs in the processed folder...only occurs for Ti processing
        sffs_in_processed_folder = glob(join(input_dir, '*_FLX.sff'))
        if len(sffs_in_processed_folder) == 0:
            sff_file_dir = split(input_dir)[0]
        else:
            sff_file_dir=input_dir
        
        # get SFF file
        sff_file = join(sff_file_dir, input_fname + '.sff')
        
        # get md5 of SFF
        sff_md5 = safe_md5(open(sff_file)).hexdigest()
        
        print 'MD5 is: %s' % str(sff_md5)

        # create an analysis
        if analysis_id==0:
            analysis_id=data_access.createAnalysis(metadata_study_id)
        
        # check if SFF info was already loaded into DB
        sff_exists=data_access.checkIfSFFExists(sff_md5)
        print 'sff in database? %s' % str(sff_exists)
        
        if not sff_exists:
            print 'flow_fname: %s' % flow_fname
            sff_header=get_header_info(open(flow_fname))
            
            # get instrument info
            if sff_header['# of Flows']=='400':
                instrument_code='GS FLX'
            elif sff_header['# of Flows']=='168':
                instrument_code='GS2-'
            elif sff_header['# of Flows']=='800':
                instrument_code='Titanium'
            else:
                instrument_code='UNKNOWN'
            print 'Instrument Code: %s' % instrument_code
            
            # load SFF info
            if seq_run_id==0:
                seq_run_id=data_access.createSequencingRun(True,instrument_code,
                                            sff_header['Version'],seq_run_id)
            valid=data_access.addSFFFileInfo(True,sff_basename,
                                             sff_header['# of Reads'],
                                             sff_header['Header Length'],
                                             sff_header['Key Length'],
                                             sff_header['# of Flows'],
                                             sff_header['Flowgram Code'],
                                             sff_header['Flow Chars'],
                                             sff_header['Key Sequence'],
                                             sff_md5,seq_run_id)

        else:
            seq_run_id=data_access.getSeqRunIDUsingMD5(sff_md5)
    
    print 'sequence_run_id is: %s' % str(seq_run_id)
            
    
    # get md5 of fna/qual files
    print fasta_qual_files
    split_lib_input_md5sum=safe_md5(MD5Wrap(fasta_qual_files)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed SFF data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id
    
    # add seq_run_id to Analysis table
    valid=data_access.updateAnalysisWithSeqRunID(True,analysis_id,seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id,input_dir,seq_run_id,split_lib_input_md5sum
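
submit_sff_and_split_lib infers the instrument from the '# of Flows' field of the SFF header (400 flows for GS FLX, 168 for GS2-, 800 for Titanium). The same decision, table-driven (a sketch equivalent to the if/elif chain above):

_FLOWS_TO_INSTRUMENT = {'400': 'GS FLX', '168': 'GS2-', '800': 'Titanium'}

def instrument_from_flows(num_flows):
    # Unknown flow counts fall back to 'UNKNOWN', matching the else branch.
    return _FLOWS_TO_INSTRUMENT.get(num_flows, 'UNKNOWN')

# usage: instrument_from_flows(sff_header['# of Flows'])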
def load_otu_mapping(data_access, input_dir, analysis_id):
    """ Load the OTU table into the DB """
    
    # For OTU Tables
    # read in the workflow log file and determine timestamp and svn version of
    # Qiime used for the analysis
    pOTUs_threshold = '97'
    ref_set_threshold = '97'
    pOTUs_method='UCLUST_REF'
    reference_set_name='GREENGENES_REFERENCE'
    otus_log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).read()
    log_str = otus_log_str.splitlines(True)
    
    #from the workflow log file get the pick-otus cmd
    for substr in log_str:
        if 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd=substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd=substr
    
    # define values for otu_picking_run table
    otu_run_set_id = 0
    svn_version = '1418' # This is temporarily hard-coded; a script will eventually determine this value
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")    
    pick_otus_map = join(input_dir, 'gg_97_otus', 'exact_uclust_ref_otus.txt')
    
    # get md5 for split-lib seq file
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_seqs_md5=safe_md5(open(split_lib_seqs)).hexdigest()
    
    # Insert the otu-picking log information in the DB
    print 'calling loadAllOTUInfo with analysis_id %s' % str(analysis_id)
    valid,new_otu_run_set_id,otu_picking_run_id=data_access.loadAllOTUInfo(True,
                                  otu_run_set_id, run_date,
                                  pOTUs_method, pOTUs_threshold,
                                  svn_version, pick_otus_cmd, otus_log_str,
                                  split_lib_seqs_md5,reference_set_name,
                                  ref_set_threshold, analysis_id)
    if not valid:
        raise ValueError, 'Error: Unable to load OTU run data into database!'
    else:
        print "Finished registering OTU run!"
    
    # define OTU mapping
    otu_map=[]
    otu_to_seqid = fields_to_dict(open(pick_otus_map, 'U'))
    for otu in otu_to_seqid:
        for sample in otu_to_seqid[otu]:
            otu_map.append('%s\t%s\t%s\t%s' % (otu,sample,new_otu_run_set_id, 
                                               reference_set_name))
    print 'Finished setting otu_map.'
    
    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    #print 'Starting PK_SPLIT_LIBRARY_READ_MAP index rebuild...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #print 'Finished rebuilding index PK_SPLIT_LIBRARY_READ_MAP.'
    set_count = 1
    
    # prepare the OTU table for loading
    print 'Loading OTU Table into the database!'
    pick_otus_table = join(input_dir, 'gg_97_otus',
                           'exact_uclust_ref_otu_table.txt')
    otu_table_lines=open(pick_otus_table).readlines()
    sample_ids, otu_ids, otu_table, lineages = \
                                    parse_classic_otu_table(otu_table_lines)
    
    # convert OTU table to tab-delimited list
    otu_table_load=[]
    for i,otu in enumerate(otu_ids):
        for j,sample in enumerate(sample_ids):
            if otu_table[i][j]>0:
                otu_table_load.append("%s\t%s\t%s\t%s" % \
                                (otu,sample,new_otu_run_set_id,otu_table[i][j]))

    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    
    # load otu table into DB
    data_types=['s','s','i','f']   
    set_count = 0      
    for input_set in input_set_generator(otu_table_load, cur,data_types,\
                                         buffer_size=1000):
        valid=data_access.loadOTUTable(True,input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load OTU table!'
        print "loading OTU Table: %s" % set_count
        set_count += 1
    
    print 'Successfully loaded the OTU Table into the database!'
    print 'End of function' 
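
input_set_generator (from the QIIME DB support code) feeds loadOTUTable fixed-size batches of rows so each call inserts a bounded amount of data. A generic batching generator with the same shape (a sketch; the real helper also coerces each field according to the data_types list):

def batch_rows(rows, buffer_size=1000):
    # Yield successive lists of at most buffer_size rows; the caller
    # issues one database load per yielded batch.
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == buffer_size:
            yield batch
            batch = []
    if batch:
        yield batch

# usage:
# for input_set in batch_rows(otu_table_load, buffer_size=1000):
#     data_access.loadOTUTable(True, input_set)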
Example #8
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(
            set([
                len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)
            ])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                DNA.rc(k): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
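
Both versions of main() write every output to a *.incomplete path and only rename() it to the final name after the read loop finishes, so an interrupted run never leaves a truncated seqs.fna that looks complete. The pattern in isolation (a sketch using os.rename; names are illustrative):

import os

def write_atomically(final_path, records):
    # Write under a temporary name; the final name only appears
    # if every record was written without an exception.
    tmp_path = final_path + '.incomplete'
    out = open(tmp_path, 'w')
    try:
        for header, sequence in records:
            out.write('>%s\n%s\n' % (header, sequence))
    finally:
        out.close()
    os.rename(tmp_path, final_path)

# usage: write_atomically('seqs.fna', [('seq1', 'ACGT')])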
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    mapping_fps = opts.mapping_fps
    
    retain_unassigned_reads = opts.retain_unassigned_reads
    max_bad_run_length = opts.max_bad_run_length
    last_bad_quality_char = opts.last_bad_quality_char
    min_per_read_length = opts.min_per_read_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None
    
    if len(sequence_read_fps) != len(barcode_read_fps):
        option_parser.error("Same number of sequence and barcode files "
                            "must be provided.")
    
    output_dir = opts.output_dir
    create_dir(output_dir)
    
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp,'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp,'w')
    
    for sequence_read_fp, barcode_read_fp, mapping_fp in\
      zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
           check_map(mapping_f, disable_primer_check=True)

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
             get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay codes. "+\
                "Do they need to be reverse complimented? If these are not "+\
                "golay barcodes pass --barcode_type 12 to disable barcode "+\
                "error correction. Invalid codes:\n\t%s" % \
                ' '.join(invalid_golay_barcodes))
        
        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %\
          (mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %\
          (sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest())))
        log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\
          (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))
        
        sequence_read_f = open(sequence_read_fp,'U')
        barcode_read_f = open(barcode_read_fp,'U')
        seq_id = start_seq_id
        for fasta_header, sequence, quality, seq_id in \
            process_fastq_single_end_read_file(
               sequence_read_f,
               barcode_read_f,
               barcode_to_sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               last_bad_quality_char=last_bad_quality_char,
               min_per_read_length=min_per_read_length,
               rev_comp=rev_comp,
               rev_comp_barcode=rev_comp_barcode,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               barcode_correction_fn=barcode_correction_fn,
               max_barcode_errors=max_barcode_errors):
            output_f.write('>%s\n%s\n' % (fasta_header,sequence))
        start_seq_id = seq_id + 1                                       
        log_f.write('\n---\n\n')
        
    output_f.close()
    rename(output_fp_temp,output_fp)
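
DNA.rc, used in the other examples when rev_comp_mapping_barcodes is set, reverse complements each barcode key from the mapping file before reads are matched against it. A minimal stand-in for plain ACGT barcodes (illustrative only, not the library function):

_COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

def reverse_complement(barcode):
    # Complement each base, then read the result in reverse order.
    return ''.join(_COMPLEMENT[base] for base in reversed(barcode))

# remap barcode keys the way rev_comp_mapping_barcodes does:
# barcode_to_sample_id = dict((reverse_complement(k), v)
#                             for k, v in barcode_to_sample_id.items())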