def setup_logging(log_dir, debug):
    logfile = os.path.join(log_dir, 'abstar.log')
    debug = True if debug > 0 else False
    # print_debug = True if debug == 2 else False
    log.setup_logging(logfile, debug=debug)
    global logger
    logger = log.get_logger('abstar')
def run_standalone(args):
    validate_args(args)
    global logger
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)
def build_output(vdjs, output_type, pretty, padding):
    logger = log.get_logger()
    try:
        vdjs = [vdj for vdj in vdjs if vdj.rearrangement]
        if output_type.lower() == 'json':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_json_output(vdj, pretty, padding))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'imgt':
            header, firstvals = _imgt_summary_output(vdjs[0], header=True)
            output = [header, firstvals]
            for vdj in vdjs[1:]:
                try:
                    output.append(_imgt_summary_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'hadoop':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_hadoop_minimal_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        return output
    except:
        logger.debug('FILE-LEVEL OUTPUT ERROR: sequences {} - {}, output_type = {}'.format(
            vdjs[0].id, vdjs[-1].id, output_type))
        logger.debug(traceback.format_exc())
def run_abstar(sequence_file, output_directory, args):
    '''
    Wrapper function to multiprocess (or not) the assignment of V, D and J
    germline genes. Also writes the JSON-formatted output to file.

    Input is a FASTA-formatted file of antibody sequences and the output directory.
    Optional input items include the species (supported species: 'human');
    length of the unique antibody identifier (UAID); and debug mode
    (which forces single-threading and prints more verbose errors).

    Output is the number of functional antibody sequences identified in the
    input file.
    '''
    try:
        # setup logging
        global logger
        logger = log.get_logger(__name__)
        assigned_log = ''
        unassigned_log = ''
        # identify output file
        output_filename = os.path.basename(sequence_file)
        if args.output_type == 'json':
            output_file = os.path.join(output_directory, output_filename + '.json')
        elif args.output_type in ['imgt', 'hadoop']:
            output_file = os.path.join(output_directory, output_filename + '.txt')
        # start assignment
        assigner = ASSIGNERS[args.assigner]
        assigner(sequence_file, args.species)
        # process all of the successfully assigned sequences
        assigned = [Antibody(vdj, args.species) for vdj in assigner.assigned]
        for ab in assigned:
            ab.annotate()
            if args.debug:
                assigned_log += ab.format_log()
        results = get_abstar_results(assigned, pretty=args.pretty,
                                     padding=args.padding, raw=args.raw)
        write_output(results, output_file, args.output_type)
        # capture the log for all unsuccessful sequences
        for vdj in assigner.unassigned:
            unassigned_log += vdj.format_log()
        return (len(assigned), assigned_log, unassigned_log)

        # vdj_output = process_sequence_file(seq_file, args)
        # if not vdj_output:
        #     return None
        # clean_vdjs = [vdj for vdj in vdj_output if vdj.rearrangement]
        # output_count = write_output(clean_vdjs, output_file, args.output_type, args.pretty, args.padding)
        # return (output_file, output_count)
    except:
        logger.debug(traceback.format_exc())
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
def compress(d, output, fmt='gz', logger=None):
    '''
    Creates a compressed/uncompressed tar file.

    Args:

        d: Can be one of three things:

            1. the path to a single file, as a string
            2. the path to a single directory, as a string
            3. an iterable of file or directory paths

        output (str): Output file path.

        fmt: Compression method. Options are ``'gz'`` (gzip),
            ``'bz2'`` (bzip2) and ``'none'`` (uncompressed).
            Default is ``'gz'``.
    '''
    if not logger:
        logger = log.get_logger('s3')
    if type(d) not in [list, tuple]:
        d = [d, ]
    d = [os.path.expanduser(_d) for _d in d]
    print_compress_info(d, output, fmt, logger)
    if fmt.lower() == 'none':
        fmt = ''
    elif fmt.lower() not in ['gz', 'bz2']:
        logger.info('Compression option ("{}") is invalid.\nFalling back to uncompressed.'.format(fmt))
        fmt = ''
    output = os.path.expanduser(output)
    tar = tarfile.open(output, 'w:{}'.format(fmt))
    for obj in d:
        tar.add(obj)
    tar.close()
    return output
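# Usage sketch (hypothetical paths): bundle a results directory into a gzipped
# tarball. `compress` also accepts a single file path or a list of paths, and
# falls back to an uncompressed tar if `fmt` is not 'gz' or 'bz2'.
def _example_compress():
    archive = compress('/path/to/results', '/path/to/results.tar.gz', fmt='gz')
    return archive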
def print_compress_info(d, output, compress, logger):
    if not logger:
        logger = log.get_logger('s3')
    dirs = [obj for obj in d if os.path.isdir(obj)]
    files = [obj for obj in d if os.path.isfile(obj)]
    logger.info('')
    logger.info('')
    logger.info('')
    logger.info('-' * 25)
    logger.info('COMPRESSING DATA')
    logger.info('-' * 25)
    logger.info('')
    logger.info('Output file: {}'.format(output))
    logger.info('Compression: {}'.format(compress.lower()))
    if dirs:
        d = 'directories' if len(dirs) > 1 else 'directory'
        logger.info('Found {} {} to compress: {}'.format(len(dirs), d, ', '.join(dirs)))
    if files:
        f = 'files' if len(files) > 1 else 'file'
        logger.info('Found {} {} to compress: {}'.format(len(files), f, ', '.join(files)))
def configure(access_key=None, secret_key=None, logger=None):
    '''
    Configures s3cmd prior to first use.

    If no arguments are provided, you will be prompted to enter
    the access key and secret key interactively.

    Args:

        access_key (str): AWS access key

        secret_key (str): AWS secret key
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not all([access_key, secret_key]):
        logger.info('')
        access_key = input('AWS Access Key: ')
        secret_key = input('AWS Secret Key: ')
    _write_config(access_key, secret_key)
    logger.info('')
    logger.info('Completed writing S3 config file.')
    logger.info('')
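# Usage sketch (placeholder keys): configure s3cmd non-interactively by passing
# both keys; omitting either argument falls back to the interactive prompts above.
def _example_configure():
    configure(access_key='AKIA...', secret_key='...')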
def put(f, s3_path, multipart_chunk_size_mb=500, logger=None):
    '''
    Uploads a single file to S3, using s3cmd.

    Args:

        f (str): Path to a single file.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of ``f``. For example::

                put(f='/path/to/myfile.tar.gz', s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of
            ``s3://my_bucket/path/to/myfile.tar.gz``
    '''
    if not logger:
        logger = log.get_logger('s3')
    fname = os.path.basename(f)
    target = os.path.join(s3_path, fname)
    s3cmd_cline = 's3cmd put {} {} --multipart-chunk-size-mb {}'.format(
        f, target, multipart_chunk_size_mb)
    print_put_info(fname, target, logger)
    s3cmd = sp.Popen(s3cmd_cline, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    stdout, stderr = s3cmd.communicate()
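# Usage sketch (hypothetical bucket/path): upload a tarball to S3. The S3 key
# becomes the basename of the local file, as described in the docstring above.
def _example_put():
    put('/path/to/myfile.tar.gz', 's3://my_bucket/path/to/', multipart_chunk_size_mb=500)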
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format
    recognized by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp``
    are required.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with
        ``uuid.uuid4()``. In either case, when given a single sequence, ``abstar.run()``
        will return a single AbTools ``Sequence`` object.

        If running multiple sequences, you can either pass each sequence as a positional
        argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using
        sequences parsed from a FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools
        ``Sequence`` objects, one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the
        input file is extremely large), you can pass the input file path directly, along
        with a temp directory and output directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the
        above case, ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the
        directory path using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or
        uncompressed) that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to
        processing with AbStar. By default, PANDAseq's 'simple bayesian' read merging
        algorithm is used, although alternate algorithms can be selected with
        ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the
        `IMGT Summary file`_. This option is provided to minimize the effort needed to
        convert existing IMGT-based pipelines to AbStar. Alternate output is only
        available when passing an input file or directory; passing individual sequences
        or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

    .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when directly
            downloading files from BaseSpace, and all subdirectories will be created
            by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing
            read merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided,
            the log will be written to ``/path/to/project_dir/abstar.log``. If output is
            provided, log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence
            region downstream of the J-gene. If False, the isotype will not be determined.
            Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode
            input RNA. A positive integer results in the UMID being parsed from the start
            of the read (or merged read), a negative integer results in parsing from the
            end of the read. Default is 0, which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable.
            If False, JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary
            table produced by IMGT High-V/Quest, to maintain a level of compatibility
            with existing IMGT-based pipelines. JSON output is much more detailed.
            Default is 'json'.

        merge (bool): If True, input must be paired-read FASTQ files (gzip compressed or
            uncompressed) which will be merged with PANDAseq prior to processing with
            AbStar. If ``basespace`` is True, ``merge`` is automatically set to True.
            Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or
            'uparse'. Default is 'simple_bayesian', which is the default PANDAseq
            algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log
            is much more verbose, and temporary files are not removed. Default is
            ``False``.


    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence``
        object.

        If the input is a list of sequences, ``run`` returns a list of AbTools
        ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output
        files.
    '''
    warnings.filterwarnings("ignore")
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [Sequence(args[0]), ]
        except:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except:
                print('ERROR: invalid format for sequence input:')
                for a in args:
                    print(a)
                sys.exit(1)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)
    # only attach parsed sequences when positional args were given;
    # file/directory input is supplied via kwargs
    if args:
        kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    output = main(args)
    # if args.sequences is not None:
    #     output = [Sequence(o) for o in output]
    #     if len(output) == 1:
    #         return output[0]
    return output
def main(args, logfile=None):
    global logger
    logger = log.get_logger('demultiplex')
    print_start_info()
    if all([args.index is None, args.index_file is None]):
        err = 'Indexes must be provided, either using --index or --index-file'
        raise RuntimeError(err)
    log_options(args, logfile=logfile)
    make_directories(args)
    open(args.output, 'w').write('')
    db = mongodb.get_db(args.db, ip=args.ip, port=args.port,
                        user=args.user, password=args.password)
    plate_map = parse_plate_map(args.plate_map)
    # all_seqs = []
    collections = mongodb.get_collections(db, args.collection,
                                          prefix=args.collection_prefix,
                                          suffix=args.collection_suffix)
    for collection in collections:
        if collection not in plate_map:
            logger.info('\n\n{} was not found in the supplied plate map file.'.format(collection))
            continue
        plate_names = plate_map[collection]
        for plate_num, plate_name in enumerate(plate_names):
            if plate_name is None:
                continue
            print_plate_info(plate_name, collection)
            indexes = get_indexes(args.index, args.index_file, args.index_length, plate_num)
            for chain in ['heavy', 'kappa', 'lambda']:
                plate_seqs = []
                logger.info('')
                logger.info('Querying for {} chain sequences'.format(chain))
                score_cutoff = args.score_cutoff_heavy if chain == 'heavy' else args.score_cutoff_light
                sequences = get_sequences(db, collection, chain, score_cutoff)
                logger.info('QUERY RESULTS: {} {} chain sequences met the quality threshold'.format(
                    len(sequences), chain.lower()))
                bins = bin_by_index(sequences, indexes, args.index_length, args.index_position,
                                    args.index_reverse_complement, args.raw_seq_field)
                if args.minimum_well_size == 'relative':
                    min_well_size = int(len(sequences) / float(args.minimum_well_size_denom))
                else:
                    min_well_size = int(args.minimum_well_size)
                min_max_well_size = max(min_well_size, args.minimum_max_well_size)
                if max([len(b) for b in list(bins.values())]) < int(min_max_well_size):
                    logger.info('The biggest well had fewer than {} sequences, so the plate was not processed'.format(
                        min_max_well_size))
                    continue
                for b in sorted(bins.keys()):
                    if len(bins[b]) < 25:
                        continue
                    print_bin_info(b)
                    if args.raw_sequence_dir is not None:
                        rs_handle = open(os.path.join(args.raw_sequence_dir,
                                                      '{}-{}_{}'.format(plate_name, b, chain)), 'w')
                        rs_handle.write('\n'.join(['>{}\n{}'.format(s[0], s[1]) for s in bins[b]]))
                        rs_handle.close()
                    consentroid = cdhit_clustering(bins[b], b, plate_name, args.temp_dir,
                                                   len(sequences), args.minimum_well_size,
                                                   args.minimum_well_size_denom,
                                                   args.minimum_cluster_fraction,
                                                   args.raw_sequence_dir, args.alignment_pixel_dir,
                                                   args.consensus, args.cdhit_threshold, chain)
                    if consentroid:
                        consentroid_name = '{}-{}'.format(plate_name, b)
                        plate_seqs.append((consentroid_name, consentroid))
                log_output(bins, plate_seqs, min_well_size)
                # all_seqs.extend(plate_seqs)
                write_output(plate_seqs, args.output)
    logger.info('')
    logger.info('')
from __future__ import absolute_import, division, print_function, unicode_literals

from multiprocessing import cpu_count
import os
from subprocess import Popen, PIPE
import sys

from Bio import SeqIO

from .utils.pandaseq import pair_files

from abutils.utils.log import get_logger
from abutils.utils.pipeline import list_files, make_dir


logger = get_logger('preprocess')


def quality_trim(input_directory=None, output_directory=None,
                 quality_cutoff=20, length_cutoff=50,
                 quality_type='sanger', compress_output=True, file_pairs=None,
                 singles_directory=None, nextseq=False, paired_reads=True,
                 allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
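# Usage sketch (hypothetical paths, inferred from the signature above): quality
# trim a directory of FASTQ files with the default cutoffs. The function body and
# return value are not shown in this excerpt, so neither is relied on here.
def _example_quality_trim():
    quality_trim(input_directory='/path/to/raw_fastqs',
                 output_directory='/path/to/trimmed',
                 quality_cutoff=20,
                 length_cutoff=50)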
def run(**kwargs):
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('clonify')
    main(args)
        cluster_sizes = update_db(clusters, collection_group)
    else:
        cluster_sizes = [c.size for c in clusters]
    print_finished(cluster_sizes)


def run_standalone(args):
    validate_args(args)
    global logger
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)


def run(**kwargs):
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('clonify')
    main(args)


if __name__ == '__main__':
    args = parse_args()
    validate_args(args)
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)
def compress_and_upload(data, compressed_file, s3_path,
                        multipart_chunk_size_mb=500, method='gz',
                        delete=False, access_key=None, secret_key=None):
    '''
    Compresses data and uploads to S3.

    S3 upload uses ``s3cmd``, so you must either:

        1) Manually configure ``s3cmd`` prior to use (typically using ``s3cmd --configure``).
        2) Configure ``s3cmd`` using ``s3.configure()``.
        3) Pass your access key and secret key to ``compress_and_upload``, which will
           automatically configure s3cmd.

    .. note::

        ``s3cmd`` configuration only needs to be done once per computer, which means
        that relaunching a cloud instance or Docker image will require re-configuration
        of ``s3cmd``.

    Args:

        data: Can be one of three things:

            1) Path to a single file
            2) Path to a directory
            3) A list of one or more paths to files or directories

        compressed_file (str): Path to the compressed file. Required.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of the ``compressed_file``. For example::

                compress_and_upload(data='/path/to/data',
                                    compressed_file='/path/to/compressed.tar.gz',
                                    s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of ``s3://my_bucket/path/to/compressed.tar.gz``

        method (str): Compression method. Options are ``'gz'`` (gzip) or ``'bz2'`` (bzip2).
            Default is ``'gz'``.

        delete (bool): If ``True``, the ``compressed_file`` will be deleted after upload
            to S3. Default is ``False``.

        access_key (str): AWS access key.

        secret_key (str): AWS secret key.
    '''
    logger = log.get_logger('s3')
    if all([access_key, secret_key]):
        configure(access_key=access_key, secret_key=secret_key, logger=logger)
    compress(data, compressed_file, fmt=method, logger=logger)
    put(compressed_file, s3_path, multipart_chunk_size_mb=multipart_chunk_size_mb, logger=logger)
    if delete:
        os.unlink(compressed_file)
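# Usage sketch (hypothetical bucket and paths): compress a run directory and push
# it to S3 in one call, deleting the local tarball after upload. Reading the AWS
# keys from environment variables is purely illustrative.
def _example_compress_and_upload():
    import os
    compress_and_upload(data='/path/to/run_data',
                        compressed_file='/path/to/run_data.tar.gz',
                        s3_path='s3://my_bucket/runs/',
                        method='gz',
                        delete=True,
                        access_key=os.environ.get('AWS_ACCESS_KEY_ID'),
                        secret_key=os.environ.get('AWS_SECRET_ACCESS_KEY'))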
import json
import os
import platform
import sys
import time

from BaseSpacePy.api.BaseSpaceAPI import BaseSpaceAPI
from BaseSpacePy.model.QueryParameters import QueryParameters as qp

from abutils.utils import log
from abutils.utils.pipeline import make_dir

if sys.version_info[0] > 2:
    raw_input = input

logger = log.get_logger('basespace')


class BaseSpace(object):
    def __init__(self, project_id=None, project_name=None, get_all_projects=False):
        super(BaseSpace, self).__init__()
        # BaseSpace credentials
        creds = self._get_credentials()
        self.client_key = creds['client_id']
        self.client_secret = creds['client_secret']
        self.access_token = creds['access_token']
        self.version = creds['version']
        self.api_server = creds['api_server']