Exemple #1
0
 def __init__(self):
     """
     Set up the instance from the parsed command-line options.

     Runs parse_args(), copies every attribute of the shared ``args``
     object onto the instance, validates the settings, prepares the
     output and finally starts the logger at DEBUG or INFO level
     depending on ``self.debug``.
     """
     parse_args()
     self.__dict__.update(vars(args))
     self.validate_settings()
     self.initialize_output()
     # Only the verbosity differs between debug and normal runs
     log_level = logging.DEBUG if self.debug else logging.INFO
     initialize_logger(self.log_file, log_level)
Exemple #2
0
 def __init__(self):
     """
     Set up the instance from the parsed command-line options.

     Runs parse_args(), mirrors every attribute of the shared ``args``
     object onto the instance, validates the settings, prepares the
     output and then configures the logger with the instance's log file
     and debug flag.
     """
     parse_args()
     parsed_options = vars(args)
     self.__dict__.update(parsed_options)
     self.validate_settings()
     self.initialize_output()
     initialize_logger(log, debug=self.debug, log_file=self.log_file)
Exemple #3
0
    """
    log.info(
        "Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection(raw_data_file)
    with FastqWriter(output_fastq) as writer:
        for record in FastqReader(input_fastq):
            seq_count += 1
            zmw_name = '/'.join(record.name.strip().split('/')[:2])
            zmw = raw_data[zmw_name]
            zmw_snr = min(zmw.zmwMetric("HQRegionSNR"))
            print zmw_name, zmw_snr
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord(record)
    percentage = round(100.0 * pass_count / seq_count)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(
        pass_count, seq_count, percentage))


if __name__ == '__main__':
    import sys

    # Four positional arguments are read below; exit with a usage message
    # instead of an IndexError traceback when any of them is missing.
    if len(sys.argv) < 5:
        sys.exit("Usage: {0} INPUT_FILE RAW_DATA OUTPUT_FILE MIN_SNR".format(
            sys.argv[0]))

    input_file = sys.argv[1]
    raw_data = sys.argv[2]
    output_file = sys.argv[3]
    # The SNR threshold arrives as text and must be numeric
    min_snr = float(sys.argv[4])

    initialize_logger()
    snr_filter(input_file, raw_data, output_file, min_snr)
 def __init__(self):
     """
     Set up the instance from the parsed command-line options.

     Runs parse_args(), mirrors every attribute of the shared ``args``
     object onto the instance, validates the settings, prepares the
     output and then configures the logger with the instance's log file
     and debug flag.
     """
     parse_args()
     parsed_options = vars(args)
     self.__dict__.update(parsed_options)
     self.validate_settings()
     self.initialize_output()
     initialize_logger(log, debug=self.debug, log_file=self.log_file)
    """
    log.info("Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr))
    seq_count = 0
    pass_count = 0
    raw_data = BasH5Collection( raw_data_file )
    with FastqWriter( output_fastq ) as writer:
        for record in FastqReader( input_fastq ):
            seq_count += 1
            zmw_name = '/'.join( record.name.strip().split('/')[:2] )
            zmw = raw_data[zmw_name]
            zmw_snr = min( zmw.zmwMetric("HQRegionSNR") )
            print zmw_name, zmw_snr
            if zmw_snr >= min_snr:
                pass_count += 1
                writer.writeRecord( record )
    percentage = round(100.0*pass_count/seq_count)
    log.info("{0} sequences of {1} ({2}%) passed filtering".format(pass_count,
                                                                   seq_count,
                                                                   percentage))


if __name__ == '__main__':
    import sys

    # Four positional arguments are read below; exit with a usage message
    # instead of an IndexError traceback when any of them is missing.
    if len(sys.argv) < 5:
        sys.exit("Usage: {0} INPUT_FILE RAW_DATA OUTPUT_FILE MIN_SNR".format(
            sys.argv[0]))

    input_file = sys.argv[1]
    raw_data = sys.argv[2]
    output_file = sys.argv[3]
    # The SNR threshold arrives as text and must be numeric
    min_snr = float(sys.argv[4])

    initialize_logger()
    snr_filter(input_file, raw_data, output_file, min_snr)
Exemple #6
0
def parse_args():
    """
    Parse and validate the command-line options for the pipeline.

    The parsed values are written onto the existing ``args`` object (it is
    passed to ``parser.parse_args(namespace=args)``) instead of being
    returned. Once parsed, the logger is initialised on stdout, the
    reference files and the optional consensus tool are checked, and the
    numeric parameters are range-validated.

    Raises:
        ValueError: if validate_file() returns None for the alignment
            reference.
        SystemExit: if ``--version`` is given (after printing the version).
    """
    desc = 'A pipeline tool for analyzing PacBio sequenced rRNA amplicons'
    parser = argparse.ArgumentParser(description=desc)

    # Short alias that keeps the long list of option declarations readable
    add = parser.add_argument
    add('input_file',
        metavar='FILE',
        help="File of rRNA sequencing data to use")
    add('-r',
        '--raw_data',
        metavar='FILE',
        help='BasH5, BaxH5 or FOFN of raw H5-format sequence data')
    add('-a',
        '--min_accuracy',
        type=float,
        metavar='FLOAT',
        default=MIN_ACCURACY,
        help='Minimum predicted sequence accuracy to allow (%s)' %
        MIN_ACCURACY)
    add('-l',
        '--min_length',
        type=int,
        metavar='INT',
        default=MIN_LENGTH,
        help='Minimum length sequence to allow (%s)' % MIN_LENGTH)
    add('-s',
        '--min_snr',
        type=float,
        metavar='FLOAT',
        default=MIN_SNR,
        help='Minimum Signal-to-Noise ratio to allow (%s)' % MIN_SNR)
    add('-d',
        '--distance',
        type=float,
        metavar='FLOAT',
        default=DIST,
        help='Distance at which to cluster sequences (%s)' % DIST)
    add('-t',
        '--step',
        type=float,
        metavar='FLOAT',
        default=STEP,
        help="Step-size to use when clustering iteratively")
    # Stored as args.nproc rather than args.num_processes
    add('-n',
        '--num_processes',
        type=int,
        metavar='INT',
        default=NPROC,
        dest='nproc',
        help='Number of processors to use (%s)' % NPROC)
    add('-f',
        '--fraction',
        type=float,
        metavar='FLOAT',
        default=FRACTION,
        help='Fraction of full-length to require of each read (%s)' % FRACTION)
    # Stored as args.output_dir
    add('-o',
        '--output',
        dest='output_dir',
        metavar='DIR',
        default='rna_pipeline_run',
        help="Specify the output folder")
    add('-q',
        '--min_qv',
        type=int,
        metavar='INT',
        default=MIN_QV,
        help='Minimum QV to allow after sequence masking (%s)' % MIN_QV)
    add('-c',
        '--min_cluster_size',
        type=int,
        metavar='INT',
        default=MIN_CLUSTER_SIZE,
        help='Minimum cluster to generate consensus sequences (%s)' %
        MIN_CLUSTER_SIZE)
    # Stored as args.clusteringMethod; values are restricted to CLUSTER_METHODS
    add('--clustering_method',
        metavar='METHOD',
        dest='clusteringMethod',
        default=DEFAULT_METHOD,
        choices=CLUSTER_METHODS,
        help="Distance algorithm to use in clustering (%s)" % DEFAULT_METHOD)
    add('--precluster_diffs',
        type=int,
        metavar='INT',
        default=PRECLUSTER_DIFFS,
        help='Maximum number of differences to allow in pre-clustering (%s)' %
        PRECLUSTER_DIFFS)
    add('-A',
        '--alignment_reference',
        metavar='REF',
        default='silva.both.align',
        help="Reference MSA for aligning query sequences")
    add('-C',
        '--chimera_reference',
        metavar='REF',
        default='silva.gold.align',
        help="Reference MSA for Chimera detection")
    add('--sub_cluster',
        action="store_true",
        help="Subcluster each OTU to separate individual rDNA alleles")
    # The two --disable_* flags use store_false, so args.enable_iteration and
    # args.enable_consensus default to True and are cleared by the flag.
    add('--disable_iteration',
        action='store_false',
        dest='enable_iteration',
        help="Turn off the iterative Clustering and Resequencing steps")
    # NOTE(review): this help text is identical to --disable_iteration's and
    # looks like a copy-paste; confirm and reword to describe consensus.
    add('--disable_consensus',
        action='store_false',
        dest='enable_consensus',
        help="Turn off the iterative Clustering and Resequencing steps")
    add('--blasr',
        metavar='BLASR_PATH',
        help="Specify the path to the Blasr executable")
    add('--mothur',
        metavar='MOTHUR_PATH',
        default='mothur',
        help="Specify the path to the Mothur executable")
    add('--debug', action='store_true', help="Turn on DEBUG message logging")
    add('--test_mode',
        action='store_true',
        help="Turn on current modifications being tested")

    # Custom action: print the version string and stop as soon as --version
    # is encountered on the command line.
    # NOTE(review): the banner says "HLA Analysis Pipeline" while the parser
    # description refers to rRNA amplicons; confirm which name is intended.
    class PrintVersionAction(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            print "\tHLA Analysis Pipeline version: %s" % __VERSION__
            raise SystemExit

    add("--version", nargs=0, action=PrintVersionAction)

    # Fill the pre-existing `args` object in place; nothing is returned
    parser.parse_args(namespace=args)
    initialize_logger(log, stream=sys.stdout, debug=args.debug)

    # Validate any input files
    # A missing alignment reference is fatal
    if validate_file(args.alignment_reference) is None:
        msg = 'No alignment reference specified and default 16S Alignment reference not detected!'
        log.error(msg)
        raise ValueError(msg)

    # A missing chimera reference is not fatal: reuse the alignment reference
    if validate_file(args.chimera_reference) is None:
        msg = 'Default Chimera Reference not detected in PATH, falling back to Alignment Reference'
        log.warn(msg)
        args.chimera_reference = args.alignment_reference

    # Consensus is switched off with a warning when which('gcon.py') finds nothing
    if args.enable_consensus and which('gcon.py') is None:
        msg = 'No copies of pbdagcon/gcon.py detected in PATH, disabling consensus'
        log.warn(msg)
        args.enable_consensus = False

    # Validate numerical parameters
    # NOTE(review): minimum=0 is passed for NumProc; confirm whether a value
    # of zero processes is really meant to be accepted.
    validate_int('NumProc', args.nproc, minimum=0)
    validate_float('Distance',
                   args.distance,
                   minimum=MIN_DIST,
                   maximum=MAX_DIST)