def __init__(self):
    """
    Build the instance from the command-line options and start logging.

    Runs parse_args(), copies every attribute of the shared ``args``
    namespace onto this instance, then calls validate_settings() and
    initialize_output() before initializing the logger on ``self.log_file``.
    """
    parse_args()
    parsed_options = vars(args)
    self.__dict__.update(parsed_options)

    self.validate_settings()
    self.initialize_output()

    # DEBUG verbosity only when the debug attribute is truthy, otherwise INFO
    level = logging.DEBUG if self.debug else logging.INFO
    initialize_logger(self.log_file, level)
def __init__(self):
    """
    Build the instance from the command-line options and start logging.

    Runs parse_args(), copies every attribute of the shared ``args``
    namespace onto this instance, then calls validate_settings() and
    initialize_output() before handing ``log`` to initialize_logger().
    """
    parse_args()
    parsed_options = vars(args)
    self.__dict__.update(parsed_options)

    self.validate_settings()
    self.initialize_output()

    # The log file and debug flag are read from the attributes copied above
    log_path = self.log_file
    debug_enabled = self.debug
    initialize_logger(log, log_file=log_path, debug=debug_enabled)
""" log.info( "Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr)) seq_count = 0 pass_count = 0 raw_data = BasH5Collection(raw_data_file) with FastqWriter(output_fastq) as writer: for record in FastqReader(input_fastq): seq_count += 1 zmw_name = '/'.join(record.name.strip().split('/')[:2]) zmw = raw_data[zmw_name] zmw_snr = min(zmw.zmwMetric("HQRegionSNR")) print zmw_name, zmw_snr if zmw_snr >= min_snr: pass_count += 1 writer.writeRecord(record) percentage = round(100.0 * pass_count / seq_count) log.info("{0} sequences of {1} ({2}%) passed filtering".format( pass_count, seq_count, percentage)) if __name__ == '__main__': import sys input_file = sys.argv[1] raw_data = sys.argv[2] output_file = sys.argv[3] min_snr = float(sys.argv[4]) initialize_logger() snr_filter(input_file, raw_data, output_file, min_snr)
def __init__(self):
    """
    Initialize the instance from the parsed command-line options.

    Steps, in order: parse_args(), copy the attributes of the shared
    ``args`` namespace into ``self.__dict__``, validate_settings(),
    initialize_output(), and finally initialize_logger() on ``log``.
    """
    parse_args()
    option_values = vars(args)
    self.__dict__.update(option_values)

    self.validate_settings()
    self.initialize_output()

    # Logger configuration comes from the options that were just copied over
    target_log_file = self.log_file
    want_debug = self.debug
    initialize_logger(log, log_file=target_log_file, debug=want_debug)
""" log.info("Filtering sequences below {0} Signal-To-Noise Ratio".format(min_snr)) seq_count = 0 pass_count = 0 raw_data = BasH5Collection( raw_data_file ) with FastqWriter( output_fastq ) as writer: for record in FastqReader( input_fastq ): seq_count += 1 zmw_name = '/'.join( record.name.strip().split('/')[:2] ) zmw = raw_data[zmw_name] zmw_snr = min( zmw.zmwMetric("HQRegionSNR") ) print zmw_name, zmw_snr if zmw_snr >= min_snr: pass_count += 1 writer.writeRecord( record ) percentage = round(100.0*pass_count/seq_count) log.info("{0} sequences of {1} ({2}%) passed filtering".format(pass_count, seq_count, percentage)) if __name__ == '__main__': import sys input_file = sys.argv[1] raw_data = sys.argv[2] output_file = sys.argv[3] min_snr = float(sys.argv[4]) initialize_logger() snr_filter(input_file, raw_data, output_file, min_snr)
def parse_args():
    """
    Parse and validate the command-line options for the rRNA amplicon pipeline.

    The parsed values are not returned: they are written onto the existing
    ``args`` namespace object that is defined outside this function. Once
    parsing is done the logger is initialized on stdout, the reference files
    are checked and two numerical options are passed to the validators.

    Adjustments made to ``args`` after parsing:
      * ``chimera_reference`` is replaced by ``alignment_reference`` when
        validate_file() returns None for it (a warning is logged).
      * ``enable_consensus`` is set to False when which() cannot find
        ``gcon.py`` (a warning is logged).

    Raises:
        ValueError: when validate_file() returns None for the alignment
            reference.
        SystemExit: when ``--version`` is given, after printing the version.
    """
    desc = 'A pipeline tool for analyzing PacBio sequenced rRNA amplicons'
    parser = argparse.ArgumentParser(description=desc)

    add = parser.add_argument
    add('input_file', metavar='FILE',
        help="File of rRNA sequencing data to use")
    add('-r', '--raw_data', metavar='FILE',
        help='BasH5, BaxH5 or FOFN of raw H5-format sequence data')
    add('-a', '--min_accuracy', type=float, metavar='FLOAT',
        default=MIN_ACCURACY,
        help='Minimum predicted sequence accuracy to allow (%s)' % MIN_ACCURACY)
    add('-l', '--min_length', type=int, metavar='INT',
        default=MIN_LENGTH,
        help='Minimum length sequence to allow (%s)' % MIN_LENGTH)
    add('-s', '--min_snr', type=float, metavar='FLOAT',
        default=MIN_SNR,
        help='Minimum Signal-to-Noise ratio to allow (%s)' % MIN_SNR)
    add('-d', '--distance', type=float, metavar='FLOAT',
        default=DIST,
        help='Distance at which to cluster sequences (%s)' % DIST)
    # The default is shown in the help text, like every sibling numeric option
    add('-t', '--step', type=float, metavar='FLOAT',
        default=STEP,
        help="Step-size to use when clustering iteratively (%s)" % STEP)
    add('-n', '--num_processes', type=int, metavar='INT',
        default=NPROC, dest='nproc',
        help='Number of processors to use (%s)' % NPROC)
    add('-f', '--fraction', type=float, metavar='FLOAT',
        default=FRACTION,
        help='Fraction of full-length to require of each read (%s)' % FRACTION)
    add('-o', '--output', dest='output_dir', metavar='DIR',
        default='rna_pipeline_run',
        help="Specify the output folder")
    add('-q', '--min_qv', type=int, metavar='INT',
        default=MIN_QV,
        help='Minimum QV to allow after sequence masking (%s)' % MIN_QV)
    add('-c', '--min_cluster_size', type=int, metavar='INT',
        default=MIN_CLUSTER_SIZE,
        help='Minimum cluster to generate consensus sequences (%s)'
             % MIN_CLUSTER_SIZE)
    add('--clustering_method', metavar='METHOD',
        dest='clusteringMethod', default=DEFAULT_METHOD,
        choices=CLUSTER_METHODS,
        help="Distance algorithm to use in clustering (%s)" % DEFAULT_METHOD)
    add('--precluster_diffs', type=int, metavar='INT',
        default=PRECLUSTER_DIFFS,
        help='Maximum number of differences to allow in pre-clustering (%s)'
             % PRECLUSTER_DIFFS)
    add('-A', '--alignment_reference', metavar='REF',
        default='silva.both.align',
        help="Reference MSA for aligning query sequences")
    add('-C', '--chimera_reference', metavar='REF',
        default='silva.gold.align',
        help="Reference MSA for Chimera detection")
    add('--sub_cluster', action="store_true",
        help="Subcluster each OTU to separate individual rDNA alleles")
    add('--disable_iteration', action='store_false', dest='enable_iteration',
        help="Turn off the iterative Clustering and Resequencing steps")
    # This help text used to be a copy of the --disable_iteration one
    add('--disable_consensus', action='store_false', dest='enable_consensus',
        help="Turn off the generation of cluster consensus sequences")
    add('--blasr', metavar='BLASR_PATH',
        help="Specify the path to the Blasr executable")
    add('--mothur', metavar='MOTHUR_PATH', default='mothur',
        help="Specify the path to the Mothur executable")
    add('--debug', action='store_true',
        help="Turn on DEBUG message logging")
    add('--test_mode', action='store_true',
        help="Turn on current modifications being tested")

    class PrintVersionAction(argparse.Action):
        """Print the pipeline version and exit as soon as the flag is seen."""

        def __call__(self, parser, namespace, values, option_string=None):
            # NOTE(review): the banner says "HLA" although this tool describes
            # itself as an rRNA pipeline - looks copy-pasted, confirm wording.
            # Parenthesised single-argument form prints the same on Python 2
            # and is also valid on Python 3.
            print("\tHLA Analysis Pipeline version: %s" % __VERSION__)
            raise SystemExit

    add("--version", nargs=0, action=PrintVersionAction)

    # Results land on the shared namespace object instead of a fresh one
    parser.parse_args(namespace=args)

    initialize_logger(log, stream=sys.stdout, debug=args.debug)

    # Validate any input files
    if validate_file(args.alignment_reference) is None:
        msg = 'No alignment reference specified and default 16S Alignment reference not detected!'
        log.error(msg)
        raise ValueError(msg)
    if validate_file(args.chimera_reference) is None:
        msg = 'Default Chimera Reference not detected in PATH, falling back to Alignment Reference'
        # Logger.warn is a deprecated alias; warning() is the supported name
        log.warning(msg)
        args.chimera_reference = args.alignment_reference
    if args.enable_consensus and which('gcon.py') is None:
        msg = 'No copies of pbdagcon/gcon.py detected in PATH, disabling consensus'
        log.warning(msg)
        args.enable_consensus = False

    # Validate numerical parameters
    validate_int('NumProc', args.nproc, minimum=0)
    validate_float('Distance', args.distance, minimum=MIN_DIST, maximum=MAX_DIST)