def setup_logging(log_dir, debug):
    logfile = os.path.join(log_dir, 'abstar.log')
    # check the verbosity level before coercing debug to a boolean,
    # otherwise the == 2 comparison could never be True
    print_debug = True if debug == 2 else False
    debug = True if debug > 0 else False
    log.setup_logging(logfile, debug=debug)
    global logger
    logger = log.get_logger('abstar')
def build_output(vdjs, output_type, pretty, padding):
    logger = log.get_logger()
    try:
        vdjs = [vdj for vdj in vdjs if vdj.rearrangement]
        if output_type.lower() == 'json':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_json_output(vdj, pretty, padding))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'imgt':
            header, firstvals = _imgt_summary_output(vdjs[0], header=True)
            output = [header, firstvals]
            for vdj in vdjs[1:]:
                try:
                    output.append(_imgt_summary_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'hadoop':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_hadoop_minimal_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        return output
    except:
        logger.debug('FILE-LEVEL OUTPUT ERROR: sequences {} - {}, output_type = {}'.format(
            vdjs[0].id, vdjs[-1].id, output_type))
        logger.debug(traceback.format_exc())
def run_standalone(args):
    logfile = args.log if args.log else os.path.join(args.output, 'abcompare.log')
    log.setup_logging(logfile)
    global logger
    logger = log.get_logger('abcompare')
    main(args)
def put(f, s3_path, multipart_chunk_size_mb=500, logger=None):
    '''
    Uploads a single file to S3, using s3cmd.

    Args:

        f (str): Path to a single file.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of ``f``. For example::

                put(f='/path/to/myfile.tar.gz', s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of
            ``s3://my_bucket/path/to/myfile.tar.gz``
    '''
    if not logger:
        logger = log.get_logger('s3')
    fname = os.path.basename(f)
    target = os.path.join(s3_path, fname)
    s3cmd_cline = 's3cmd put {} {} --multipart-chunk-size-mb {}'.format(f, target, multipart_chunk_size_mb)
    print_put_info(fname, target, logger)
    s3cmd = sp.Popen(s3cmd_cline, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    stdout, stderr = s3cmd.communicate()
def find_insertions(blast_result):
    '''
    Identifies and annotates/fixes insertions. Frameshift insertions (those with a length
    not evenly divisible by 3) will be removed. Codon-length insertions will be annotated.

    Input is a BlastResult object.

    Output is a list of insertion annotations (an empty list if there were no
    codon-length insertions).
    '''
    logger = log.get_logger(__name__)
    try:
        insertions = []
        o = 0
        for i in re.finditer('-+', blast_result.germline_alignment):
            s = i.start() - o
            e = i.end() - o
            l = e - s
            if l % 3 == 0 or l > 3:
                insertions.append(_annotate_insertion(blast_result, s, e))
            else:
                blast_result = _fix_frameshift_insertion(blast_result, s, e)
                o += l
        return insertions if insertions else []
    except:
        logger.debug('FIND INSERTIONS ERROR: {}, {}'.format(blast_result.id,
                                                            blast_result.input_sequence))
        logger.debug(traceback.format_exc())
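# The offset bookkeeping above can be hard to follow. Below is a minimal, self-contained
# sketch of the same idea (the alignment string is hypothetical and no BlastResult is
# involved): gaps in the gapped germline alignment mark query insertions, and the offset
# shifts later coordinates left once a frameshift insertion has been excised.
import re

germline_alignment = 'ACGT--ACGTACG---ACGT'  # hypothetical gapped germline alignment

offset = 0
for gap in re.finditer('-+', germline_alignment):
    start = gap.start() - offset
    end = gap.end() - offset
    length = end - start
    if length % 3 == 0:
        print('codon-length insertion at {}-{} (length {})'.format(start, end, length))
    else:
        # a frameshift insertion would be removed, so downstream coordinates shift left
        print('frameshift insertion at {}-{} (length {}), removing'.format(start, end, length))
        offset += length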
def get_junction(vdj, germ=False):
    global logger
    logger = log.get_logger(__name__)
    try:
        return Junction(vdj, germ)
    except Exception:
        logger.debug('JUNCTION ERROR: {id}'.format(id=vdj.id))
        logger.debug(traceback.format_exc())
def parse_codons(vdj, gapped):
    logger = log.get_logger(__name__)
    try:
        codons = Codons(vdj, gapped)
        return codons
    except:
        logger.debug('PARSE CODONS ERROR: {}, {}'.format(vdj.id, vdj.raw_input))
        logger.debug(traceback.format_exc())
def aa_mutations(blast_result):
    global logger
    logger = log.get_logger(__name__)
    try:
        return MutationsAA(blast_result)
    except:
        logger.debug('AA MUTATIONS ERROR: {}, {}\n'.format(blast_result.id,
                                                           blast_result.input_sequence))
        logger.debug(traceback.format_exc())
def compress_and_upload(data, compressed_file, s3_path, multipart_chunk_size_mb=500,
                        method='gz', delete=False, access_key=None, secret_key=None):
    '''
    Compresses data and uploads to S3.

    S3 upload uses ``s3cmd``, so you must either:

        1) Manually configure ``s3cmd`` prior to use (typically using ``s3cmd --configure``).

        2) Configure ``s3cmd`` using ``s3.configure()``.

        3) Pass your access key and secret key to ``compress_and_upload``, which will
           automatically configure s3cmd.

    .. note::

        ``s3cmd`` configuration only needs to be done once per computer, which means that
        relaunching a cloud instance or Docker image will require re-configuration of ``s3cmd``.

    Args:

        data: Can be one of three things:

            1) Path to a single file

            2) Path to a directory

            3) A list of one or more paths to files or directories

        compressed_file (str): Path to the compressed file. Required.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename will be
            the basename of the ``compressed_file``. For example::

                compress_and_upload(data='/path/to/data',
                                    compressed_file='/path/to/compressed.tar.gz',
                                    s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of ``s3://my_bucket/path/to/compressed.tar.gz``

        method (str): Compression method. Options are ``'gz'`` (gzip) or ``'bz2'`` (bzip2).
            Default is ``'gz'``.

        delete (bool): If ``True``, the ``compressed_file`` will be deleted after
            upload to S3. Default is ``False``.

        access_key (str): AWS access key.

        secret_key (str): AWS secret key.
    '''
    logger = log.get_logger('s3')
    if all([access_key, secret_key]):
        configure(access_key=access_key, secret_key=secret_key, logger=logger)
    compress(data, compressed_file, compress=method, logger=logger)
    put(compressed_file, s3_path, multipart_chunk_size_mb=multipart_chunk_size_mb, logger=logger)
    if delete:
        os.unlink(compressed_file)
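# A hedged usage sketch (bucket name and paths are hypothetical, and the import assumes this
# function lives in a module importable as abtools.s3): compress a results directory to a
# gzipped tarball, push it to S3, and delete the local tarball afterward.
from abtools import s3

s3.compress_and_upload(data='/data/run_001/results',
                       compressed_file='/data/run_001/results.tar.gz',
                       s3_path='s3://my-sequencing-bucket/runs/',
                       method='gz',
                       delete=True)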
def get_isotype(vdj):
    logger = log.get_logger(__name__)
    try:
        mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        isotype_file = os.path.join(mod_dir, 'ssw/isotypes/{}_isotypes.fasta'.format(vdj.species))
        isotype_seqs = [Sequence(s) for s in SeqIO.parse(open(isotype_file, 'r'), 'fasta')]
        isotype_seqs += [Sequence((s.id, s.reverse_complement)) for s in isotype_seqs]
        return Isotype(vdj, isotype_seqs)
    except:
        logger.debug('ISOTYPE ERROR: {}\t{}'.format(vdj.id, vdj.raw_query))
        logger.debug(traceback.format_exc())
def run_abstar(sequence_file, output_directory, args):
    '''
    Wrapper function to multiprocess (or not) the assignment of V, D and J
    germline genes. Also writes the JSON-formatted output to file.

    Input is a FASTA-formatted file of antibody sequences and the output directory.
    Optional input items include the species (supported species: 'human'); length of
    the unique antibody identifier (UAID); and debug mode (which forces single-threading
    and prints more verbose errors).

    Output is the number of functional antibody sequences identified in the input file.
    '''
    try:
        # setup logging
        global logger
        logger = log.get_logger(__name__)
        assigned_log = ''
        unassigned_log = ''
        # identify output file
        output_filename = os.path.basename(sequence_file)
        if args.output_type == 'json':
            output_file = os.path.join(output_directory, output_filename + '.json')
        elif args.output_type in ['imgt', 'hadoop']:
            output_file = os.path.join(output_directory, output_filename + '.txt')
        # start assignment
        assigner = ASSIGNERS[args.assigner]
        assigner(sequence_file, args.species)
        # process all of the successfully assigned sequences
        assigned = [Antibody(vdj, args.species) for vdj in assigner.assigned]
        for ab in assigned:
            ab.annotate()
            if args.debug:
                assigned_log += ab.format_log()
        results = get_abstar_results(assigned, pretty=args.pretty,
                                     padding=args.padding, raw=args.raw)
        write_output(results, output_file, args.output_type)
        # capture the log for all unsuccessful sequences
        # (assumed to be exposed by the assigner as ``unassigned``)
        unassigned = assigner.unassigned
        for vdj in unassigned:
            unassigned_log += vdj.format_log()
        return (len(assigned), assigned_log, unassigned_log)

        # vdj_output = process_sequence_file(seq_file, args)
        # if not vdj_output:
        #     return None
        # clean_vdjs = [vdj for vdj in vdj_output if vdj.rearrangement]
        # output_count = write_output(clean_vdjs, output_file, args.output_type,
        #                             args.pretty, args.padding)
        # return (output_file, output_count)
    except:
        logger.debug(traceback.format_exc())
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
def cdhit(seqs, out_file=None, temp_dir=None, threshold=0.975, make_db=True,
          quiet=False, threads=0, max_memory=800, debug=False):
    # '''
    # Perform CD-HIT clustering on a set of sequences.
    #
    # Inputs are an iterable of sequences, which can be in any format that
    # abtools.sequence.Sequence can handle.
    #
    # Returns the centroid file name and cluster file name (from CD-HIT).
    # If ::make_db:: is True (default), a SQLite3 connection and database path are also returned.
    # '''
    logger = log.get_logger('cluster')
    start_time = time.time()
    seqs = [Sequence(s) for s in seqs]
    if not quiet:
        logger.info('CD-HIT: clustering {} sequences'.format(len(seqs)))
    if out_file is None:
        out_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False)
        ofile = out_file.name
    else:
        ofile = os.path.expanduser(out_file)
    ifile = _make_cdhit_input(seqs, temp_dir)
    cdhit_cmd = 'cd-hit -i {} -o {} -c {} -n 5 -d 0 -T {} -M {}'.format(
        ifile, ofile, threshold, threads, max_memory)
    cluster = sp.Popen(cdhit_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = cluster.communicate()
    if debug:
        print(stdout)
        print(stderr)
    else:
        os.unlink(ifile)
    if not quiet:
        logger.info('CD-HIT: clustering took {:.2f} seconds'.format(time.time() - start_time))
    cfile = ofile + '.clstr'
    if make_db:
        if not quiet:
            logger.info('CD-HIT: building a SQLite3 database')
        seq_db, db_path = _build_seq_db(seqs, direc=temp_dir)
        return ofile, cfile, seq_db, db_path
    return ofile, cfile
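# A hedged usage sketch, assuming this function lives in a module importable as
# abtools.cluster and that the cd-hit binary is on the PATH. The input sequences and
# identity threshold are illustrative.
from abtools.cluster import cdhit

seqs = [('seq1', 'CAGGTGCAGCTGGTGCAGTCTGG'),
        ('seq2', 'CAGGTGCAGCTGGTGCAGTCTGG'),
        ('seq3', 'GAGGTGCAGCTGGTGGAGTCTGG')]

# skip the SQLite database if only the centroid and cluster files are needed
centroid_file, cluster_file = cdhit(seqs, threshold=0.975, make_db=False)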
def nt_mutations(blast_result):
    global logger
    logger = log.get_logger(__name__)
    try:
        return MutationsNT(blast_result)
    except:
        logger.debug('NT MUTATIONS ERROR: {}, {}\n'.format(blast_result.id,
                                                           blast_result.input_sequence))
        logger.debug('QUERY ALIGNMENT: {}'.format(blast_result.query_alignment))
        logger.debug('GERMLINE ALIGNMENT: {}'.format(blast_result.germline_alignment))
        logger.debug(blast_result.regions.raw_positions)
        logger.debug(blast_result.regions.adjusted_positions)
        logger.debug(blast_result.regions.nt_seqs)
        logger.debug(blast_result.regions.germline_nt_seqs)
        logger.debug(traceback.format_exc())
def check_productivity(vdj):
    logger = log.get_logger(__name__)
    try:
        problems = 0
        problems += stop_codons(vdj)
        problems += ambig_codons(vdj)
        problems += vdj_agreement(vdj)
        problems += conserved_junc_residues(vdj)
        problems += rearrangement(vdj)
        problems += junction_frame(vdj)
        problems += indels(vdj)
        if problems:
            return 'no'
        return 'yes'
    except:
        logger.debug('PRODUCTIVITY CHECK ERROR: {}'.format(vdj.id))
        logger.debug(traceback.format_exc())
def regions(blast_result):
    '''
    Returns a VarRegions (or JoinRegions) object, containing a variety of information
    about Variable (or Joining) gene regions. Both Regions objects contain equivalent
    attributes to ease downstream handling.

    Input is a BlastResult object for a variable or joining gene.
    '''
    global logger
    logger = log.get_logger(__name__)
    try:
        if blast_result.gene_type == 'variable':
            return VarRegions(blast_result)
        if blast_result.gene_type == 'joining':
            return JoinRegions(blast_result)
    except Exception:
        logger.debug('REGIONS ERROR: {} {}'.format(blast_result.id, blast_result.seq))
        logger.debug(traceback.format_exc())
def compress(d, output, compress='gz', logger=None):
    '''
    Creates a compressed/uncompressed tar file.

    Args:

        d: Can be one of three things:

            1. the path to a single file, as a string

            2. the path to a single directory, as a string

            3. an iterable of file or directory paths

        output (str): Output file path.

        compress: Compression method. Options are ``'gz'`` (gzip),
            ``'bz2'`` (bzip2) and ``'none'`` (uncompressed). Default is ``'gz'``.
    '''
    if not logger:
        logger = log.get_logger('s3')
    if type(d) not in [list, tuple]:
        d = [d, ]
    d = [os.path.expanduser(_d) for _d in d]
    print_compress_info(d, output, compress, logger)
    if compress.lower() == 'none':
        compress = ''
    elif compress.lower() not in ['gz', 'bz2']:
        logger.info('Compression option ("{}") is invalid.\nFalling back to uncompressed.'.format(compress))
        compress = ''
    output = os.path.expanduser(output)
    tar = tarfile.open(output, 'w:{}'.format(compress))
    for obj in d:
        tar.add(obj)
    tar.close()
    return output
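# A quick usage sketch, assuming the compress() function above is in scope; the paths are
# hypothetical. The same call works for a single file, a directory, or a list of paths.
archive = compress(['/data/run_001/alignments', '/data/run_001/summary.txt'],
                   output='/data/run_001.tar',
                   compress='none')
print(archive)  # -> /data/run_001.tar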
def print_compress_info(d, output, compress, logger):
    if not logger:
        logger = log.get_logger('s3')
    dirs = [obj for obj in d if os.path.isdir(obj)]
    files = [obj for obj in d if os.path.isfile(obj)]
    logger.info('')
    logger.info('')
    logger.info('')
    logger.info('-' * 25)
    logger.info('COMPRESSING DATA')
    logger.info('-' * 25)
    logger.info('')
    logger.info('Output file: {}'.format(output))
    logger.info('Compression: {}'.format(compress.lower()))
    if dirs:
        d = 'directories' if len(dirs) > 1 else 'directory'
        logger.info('Found {} {} to compress: {}'.format(len(dirs), d, ', '.join(dirs)))
    if files:
        f = 'files' if len(files) > 1 else 'file'
        logger.info('Found {} {} to compress: {}'.format(len(files), f, ', '.join(files)))
def configure(access_key=None, secret_key=None, logger=None):
    '''
    Configures s3cmd prior to first use.

    If no arguments are provided, you will be prompted to enter
    the access key and secret key interactively.

    Args:

        access_key (str): AWS access key

        secret_key (str): AWS secret key
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not all([access_key, secret_key]):
        logger.info('')
        access_key = raw_input('AWS Access Key: ')
        secret_key = raw_input('AWS Secret Key: ')
    _write_config(access_key, secret_key)
    logger.info('')
    logger.info('Completed writing S3 config file.')
    logger.info('')
def run_standalone(args):
    global logger
    logger = log.get_logger('barcodes')
    main(args)


def main(args):
    for f in list_files(args.input):
        # experiment = get_experiment(f, args)
        wb = load_workbook(f)
        ws = wb[wb.get_sheet_names()[0]]
        plate_blocks = get_plate_blocks(ws, args)
        plural = '' if len(plate_blocks) <= 2 else 's'
        logger.info('\nFound {} plate{} in the input file'.format(len(plate_blocks) - 1, plural))
        # logger.info('Experiment name: {}\n'.format(experiment))
        # plates = parse_plates(plate_blocks[1:], args)
        plates = parse_barcodes(plate_blocks[1:], args)
        write_output(plates, args)
    logger.info('')


if __name__ == '__main__':
    args = parse_args()
    if args.log is None:
        args.log = os.path.join(args.output, 'barcodes.log')
    log.setup_logging(args.log)
    logger = log.get_logger('barcodes')
    main(args)
def run_standalone(args):
    logfile = args.log if args.log else os.path.join(args.output_dir, 'abfinder.log')
    log.setup_logging(logfile)
    global logger
    logger = log.get_logger('abfinder')
    main(args)
def run(**kwargs):
    '''
    Mines NGS datasets for identity to known antibody sequences.

    All of ``db``, ``output_dir``, ``temp_dir`` and ``standard`` are required.

    Args:

        db (str): Name of a MongoDB database to query.

        collection (str): Name of a MongoDB collection. If not provided, all
            collections in ``db`` will be processed iteratively.

        output_dir (str): Path to the output directory, into which
            identity/divergence figures will be deposited.

        temp_dir (str): Path to a temporary directory.

        log (str): Path to a log file. If not provided, log information will not be retained.

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (str): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        standard (path): Path to a FASTA-formatted file containing one or more 'standard'
            sequences, against which the NGS sequences will be compared.

        chain (str): Antibody chain. Choices are 'heavy', 'kappa', 'lambda', and 'light'.
            Default is 'heavy'. Only NGS sequences matching ``chain`` (with 'light' covering
            both 'kappa' and 'lambda') will be compared to the ``standard`` sequences.

        update (bool): If ``True``, the MongoDB record for each NGS sequence will be updated
            with identity information for each standard. If ``False``, the update is skipped.
            Default is ``True``.

        is_aa (bool): If ``True``, the ``standard`` sequences are amino acid sequences.
            If ``False``, they are nucleotide sequences. Default is ``False``.

        x_min (int): Minimum x-axis value on identity/divergence plots.

        x_max (int): Maximum x-axis value on identity/divergence plots.

        y_min (int): Minimum y-axis value on identity/divergence plots.

        y_max (int): Maximum y-axis value on identity/divergence plots.

        gridsize (int): Relative size of hexbin grids.

        mincount (int): Minimum number of sequences in a hexbin for the bin to be colored.
            Default is 3.

        colormap (str, colormap): Colormap to be used for identity/divergence plots.
            Default is ``Blues``.

        debug (bool): If ``True``, more verbose logging.
    '''
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abfinder')
    main(args)
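# A hedged usage sketch, assuming this run() function is importable as abtools.abfinder.run.
# The database, collection, and file paths are hypothetical; it compares heavy-chain NGS
# reads to a known 'standard' antibody sequence.
from abtools import abfinder

abfinder.run(db='my_ngs_database',
             collection='subject1_timepoint2',
             output_dir='/data/abfinder/figures',
             temp_dir='/data/abfinder/temp',
             standard='/data/standards/my_standard_heavy.fasta',
             chain='heavy',
             is_aa=False)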
    print_standards_info(standards)
    collections = mongodb.get_collections(db, args.collection, prefix=args.collection_prefix)
    print_collections_info(collections)
    for collection in collections:
        indexed = False
        print_single_collection(collection)
        if args.remove_padding:
            print_remove_padding()
            mongodb.remove_padding(db, collection)
        seq_files = get_sequences(db, collection, args.temp_dir, args)
        for standard in standards:
            print_single_standard(standard)
            scores = run_jobs(seq_files, standard, args)
            if args.output_dir:
                make_figure(standard.id, scores, collection, args)
            if args.update:
                if not indexed:
                    mongodb.index(db, collection, 'seq_id')
                    indexed = True
                update_db(db, standard.id, scores, collection, args)
        clean_up(seq_files)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output_dir, 'abfinder.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abfinder')
    main(args)
def run(**kwargs):
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('barcodes')
    main(args)
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

import os
import sys
import glob
import subprocess as sp
from multiprocessing import cpu_count

from abtools import log


logger = log.get_logger('basespace')


def list_files(d):
    return sorted([f for f in glob.glob(d + '/*') if os.path.isfile(f)])


def pair_files(files, nextseq):
    pairs = {}
    for f in files:
        if nextseq:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:3])
        else:
            f_prefix = '_'.join(os.path.basename(f).split('_')[:2])
        if f_prefix in pairs:
            pairs[f_prefix].append(f)
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``

        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``

        3) a single FASTA/Q-formatted input file, passed via ``input``

        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format recognized
    by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)

        - a list/tuple of the format ``[sequence_id, sequence]``

        - a BioPython SeqRecord object

        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp`` are required.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with ``uuid.uuid4()``.
        In either case, when given a single sequence, ``abstar.run()`` will return a single AbTools
        ``Sequence`` object. If running multiple sequences, you can either pass each sequence as a
        positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using sequences
        parsed from a FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools ``Sequence``
        objects, one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the input file
        is extremely large), you can pass the input file path directly, along with a temp directory
        and output directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the above case,
        ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the directory path
        using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or uncompressed)
        that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to processing with
        AbStar. By default, PANDAseq's 'simple bayesian' read merging algorithm is used, although
        alternate algorithms can be selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the `IMGT Summary file`_.
        This option is provided to minimize the effort needed to convert existing IMGT-based pipelines
        to AbStar. Alternate output is only available when passing an input file or directory; passing
        individual sequences or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

    .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when directly downloading
            files from BaseSpace, and all subdirectories will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing read
            merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided, the log
            will be written to ``/path/to/project_dir/abstar.log``. If ``output`` is provided,
            the log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence region
            downstream of the J-gene. If False, the isotype will not be determined. Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode input RNA.
            A positive integer results in the UMID being parsed from the start of the read (or
            merged read), a negative integer results in parsing from the end of the read.
            Default is 0, which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable. If False,
            JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary table
            produced by IMGT High-V/Quest, to maintain a level of compatibility with existing
            IMGT-based pipelines. JSON output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTA files (gzip compressed or
            uncompressed) which will be merged with PANDAseq prior to processing with AbStar.
            If ``basespace`` is True, ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or 'uparse'.
            Default is 'simple_bayesian', which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log is
            much more verbose, and temporary files are not removed. Default is ``False``.

    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence`` object.

        If the input is a list of sequences, ``run`` returns a list of AbTools ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output files.
    '''
    warnings.filterwarnings("ignore")
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [Sequence(args[0]), ]
        except:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except:
                print('ERROR: invalid format for sequence input:')
                for a in args:
                    print(a)
                sys.exit(1)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)
    kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    output = main(args)
    # if args.sequences is not None:
    #     output = [Sequence(o) for o in output]
    #     if len(output) == 1:
    #         return output[0]
    return output
from multiprocessing import cpu_count
import os
from subprocess import Popen, PIPE
import sys

from Bio import SeqIO

from utils.pandaseq import pair_files

from abtools.log import get_logger
from abtools.pipeline import list_files, make_dir


logger = get_logger('preprocess')


def quality_trim(input_directory=None, output_directory=None,
                 quality_cutoff=20, length_cutoff=50,
                 quality_type='sanger', compress_output=True, file_pairs=None,
                 singles_directory=None, nextseq=False, paired_reads=True,
                 allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality trimmed.
            If the directory contains paired reads, they should follow the Illumina
            MiSeq naming scheme. If you have paired reads
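# For reference, a wrapper like quality_trim() would typically assemble a sickle command
# along these lines for one pair of reads. The file names are hypothetical and the exact
# flags depend on the options above; this is a sketch, not the wrapper's actual command.
from subprocess import Popen, PIPE

sickle_cmd = ('sickle pe -f sample1_R1.fastq -r sample1_R2.fastq '
              '-o sample1_R1.trimmed.fastq -p sample1_R2.trimmed.fastq '
              '-s sample1_singles.fastq -t sanger -q 20 -l 50')
p = Popen(sickle_cmd, shell=True, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()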
def mongoimport(json, database, ip='localhost', port=27017,
                user=None, password=None,
                delim='_', delim1=None, delim2=None,
                delim_occurance=1, delim1_occurance=1, delim2_occurance=1):
    '''
    Performs mongoimport on one or more json files.

    Args:

        json: Can be one of several things:

            - path to a single JSON file

            - an iterable (list or tuple) of one or more JSON file paths

            - path to a directory containing one or more JSON files

        database (str): Name of the database into which the JSON files will be imported

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB database. Default is ``27017``.

        user (str): Username for the MongoDB database, if authentication is enabled.
            Default is ``None``, which results in attempting connection without authentication.

        password (str): Password for the MongoDB database, if authentication is enabled.
            Default is ``None``, which results in attempting connection without authentication.

        delim (str): Delimiter, when generating collection names using a single delimiter.
            Default is ``_``

        delim_occurance (int): Occurrence at which to split filename when using a single
            delimiter. Default is ``1``

        delim1 (str): Left delimiter when splitting with two delimiters. Default is None.

        delim1_occurance (int): Occurrence of ``delim1`` at which to split filename.
            Default is ``1``

        delim2 (str): Right delimiter when splitting with two delimiters. Default is None.

        delim2_occurance (int): Occurrence of ``delim2`` at which to split filename.
            Default is ``1``
    '''
    logger = log.get_logger('mongodb')
    _print_mongoimport_info(logger)
    if type(json) in (list, tuple):
        pass
    elif os.path.isdir(json):
        from abtools.utils.pipeline import list_files
        json = list_files(json)
    else:
        json = [json, ]
    jsons = sorted([os.path.expanduser(j) for j in json if j.endswith('.json')])
    collections = _get_import_collections(jsons, delim, delim_occurance,
                                          delim1, delim1_occurance,
                                          delim2, delim2_occurance)
    logger.info('Found {} files to import'.format(len(jsons)))
    logger.info('')
    for i, (json_file, collection) in enumerate(zip(jsons, collections)):
        logger.info('[ {} ] {} --> {}'.format(i + 1, os.path.basename(json_file), collection))
        # logger.info("Performing mongoimport on {}.".format(os.path.basename(json_file)))
        # logger.info("Importing the file into collection {}.".format(collection))
        if all([user, password]):
            host = '--host {} --port {} --username {} --password {}'.format(ip, port, user, password)
        else:
            host = '--host {} --port {}'.format(ip, port)
        mongo_cmd = "mongoimport {} --db {} --collection {} --file {}".format(
            host, database, collection, json_file)
        mongo = sp.Popen(mongo_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
        stdout, stderr = mongo.communicate()
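# To make the single-delimiter options concrete, here is a small, hypothetical illustration
# of the kind of filename-to-collection mapping described above. The helper below is
# illustrative only; it is not the private _get_import_collections() implementation.
import os

def collection_from_filename(json_file, delim='_', delim_occurance=1):
    # keep everything before the Nth occurrence of the delimiter
    basename = os.path.splitext(os.path.basename(json_file))[0]
    return delim.join(basename.split(delim)[:delim_occurance])

print(collection_from_filename('Subject1_heavy_timepoint2.json'))                      # Subject1
print(collection_from_filename('Subject1_heavy_timepoint2.json', delim_occurance=2))   # Subject1_heavy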
def _print_remove_padding():
    logger = log.get_logger('mongodb')
    logger.info('Removing MongoDB padding...')
def run(**kwargs):
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('demultiplex')
    main(args)
def run(**kwargs):
    '''
    Corrects antibody reads using UAIDs (molecular barcodes) or identity-based clustering.

    Either ``json`` or ``db`` is required.

    Args:

        db (str): Name of a MongoDB database to query.

        collection (str): Name of a MongoDB collection. If not provided, all collections
            in ``db`` will be iteratively processed.

        json: Can be one of two things:

            1. Path to a JSON file, containing sequence data annotated by AbStar.

            2. Path to a directory, containing one or more JSON files of AbStar-annotated data.

        output (str): Path to the output directory, into which corrected FASTA files will be
            deposited. If it does not exist, it will be created.

        log (str): Path to the log file. If the parent directory doesn't exist, it will be created.

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        min_seqs (int): Minimum number of sequences for a centroid/consensus sequence to be
            calculated. After clustering (either by identity or using UAIDs), clusters with
            at least ``min_seqs`` sequences will be retained for consensus/centroid calculation.
            Default is ``1``.

        uaid (bool): If ``True``, use Unique Antibody IDs (UAIDs) for error correction.
            Sequences will be binned by UAID and the sequences in each bin will be used to
            compute a centroid or consensus sequence. If ``False``, sequences will be clustered
            by identity and each cluster will be used for consensus/centroid determination.

        parse_uaids (int): If UAIDs haven't been pre-parsed by AbStar, indicate the length of
            the UAID sequence (in nucleotides) and the UAIDs will be parsed during correction.
            If ``parse_uaids`` is negative, the UAID will be parsed from the end of the sequence.
            Default is ``0``, which does not parse a UAID sequence.

        consensus (bool): If ``True``, consensus sequences are calculated. If ``False``,
            centroid sequences are calculated. Default is ``True``.

        identity_threshold (float): Identity threshold, if clustering by identity (not UAIDs).
            Must be a float between 0 and 1. Default is 0.975.

        only_largest_cluster (bool): When clustering using UAIDs, there is some probability
            that different sequences get labeled with the same UAID. To limit incorrect
            consensus/centroid calculation, sequences in each UAID bin are clustered using
            ``identity_threshold`` before calculating consensus/centroid sequences. By default,
            all UAID clusters that meet the ``min_seqs`` size threshold are used to generate
            consensus/centroid sequences. If that behavior is not desired, setting
            ``only_largest_cluster`` to ``True`` results in only the largest UAID cluster being
            used to generate centroid/consensus sequences.

        nr (bool): If ``True``, a non-redundant sequence dataset will be generated using
            ``sort | uniq``. This is much faster than normal sequence clustering with CD-HIT,
            but can only be performed at an identity threshold of 100%.

            .. note::

                Using ``nr`` may produce different results than clustering sequences with
                ``identity_threshold`` set to ``1.0``.
                This is because sequences of different lengths that are otherwise identical
                will not be collapsed when using ``nr`` but will be collapsed using normal clustering.

        germs (str): Path to a file containing germline V-gene sequences. When clustering with
            ``min_seqs`` equal to 2, the appropriate germline sequence will be added to the
            alignment to serve as a consensus tiebreaker.

        aa (bool): If ``True``, perform sequence clustering (either using ``identity_threshold``
            or ``nr``) using amino acid sequences. Default is ``False``, which performs
            clustering using nucleotide sequences.

        debug (bool): If ``True``, logging is more verbose.
    '''
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abcorrect')
    main(args)
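# A hedged usage sketch, assuming this run() function is importable as abtools.abcorrect.run.
# The database name and output path are hypothetical; this performs UAID-based correction,
# keeping only UAID bins with at least 3 reads and calculating consensus sequences.
from abtools import abcorrect

abcorrect.run(db='my_ngs_database',
              collection='subject1_timepoint2',
              output='/data/corrected_fastas',
              uaid=True,
              min_seqs=3,
              consensus=True)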
def run_standalone(args):
    global logger
    logger = log.get_logger('demultiplex')
    main(args)


def main(args):
    for f in list_files(args.input):
        experiment = get_experiment(f, args)
        wb = load_workbook(f)
        ws = wb[wb.get_sheet_names()[0]]
        plate_blocks = get_plate_blocks(ws, args)
        plural = '' if len(plate_blocks) <= 2 else 's'
        logger.info('\nFound {} plate{} in the input file'.format(len(plate_blocks) - 1, plural))
        logger.info('Experiment name: {}\n'.format(experiment))
        plates = parse_plates(plate_blocks[1:], args)
        write_output(plates, experiment, args)
    logger.info('')


if __name__ == '__main__':
    args = parse_args()
    if args.log is None:
        args.log = os.path.join(args.output, 'platemap.log')
    log.setup_logging(args.log)
    logger = log.get_logger('demultiplex')
    main(args)
    s1_all_vgenes = get_vgenes(db, s1, args.chain)
    print_pair_info(s1, s2)
    s1_vgenes, s2_vgenes = get_vgenes(db, s2, args.chain, prev_data=s1_all_vgenes)
    logger.info('')
    logger.info('Calculating similarities...')
    median, counts, bins, similarities = calculate_similarities(s1_vgenes, s2_vgenes, args)
    write_output(s1, s2, median, counts, bins, similarities, args)
    scores = update_scores(s1, s2, median, scores)
    if args.control_similarity:
        logger.info('')
        logger.info('Calculating control similarities...')
        cmedian, ccounts, cbins, csimilarities = calculate_control_similarities(s1_vgenes, s2_vgenes, args)
        write_output(s1, s2, cmedian, ccounts, cbins, csimilarities, args)
        cscores = update_scores(s1, s2, cmedian, cscores)
    prev1 = s1
    print_final_results(scores)
    print_final_results(cscores, control=True)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output, 'abcompare.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abcompare')
    main(args)
def run(**kwargs):
    '''
    Performs repertoire-level comparison of antibody sequencing datasets.

    Currently, the only metric for comparison is V-gene usage frequency. Additional measures
    are in the works (such as comparisons based on clonality).

    Args:

        db (str): MongoDB database name.

        collection1 (str): Name of the first MongoDB collection to query for comparison.
            If both ``collection1`` and ``collection2`` are provided, ``collection1`` will be
            compared only to ``collection2``. If neither ``collection1`` nor ``collection2``
            are provided, all collections in ``db`` will be processed iteratively (all pairwise
            comparisons will be made). If ``collection1`` is provided but ``collection2`` is not,
            ``collection1`` will be iteratively compared to all other collections in ``db``.

        collection2 (str): Name of the second MongoDB collection to query for comparison.
            If both ``collection1`` and ``collection2`` are provided, ``collection1`` will be
            compared only to ``collection2``. If neither ``collection1`` nor ``collection2``
            are provided, all collections in ``db`` will be processed iteratively (all pairwise
            comparisons will be made).

        collection_prefix (str): All collections beginning with ``collection_prefix`` will be
            iteratively compared (all pairwise comparisons will be made).

        ip (str): IP address of the MongoDB server. Default is ``localhost``.

        port (int): Port of the MongoDB server. Default is ``27017``.

        user (str): Username with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        password (str): Password with which to connect to the MongoDB database. If either of
            ``user`` or ``password`` is not provided, the connection to the MongoDB database
            will be attempted without authentication.

        chunksize (int): Number of sequences for each iteration. Default is 100,000.

        iterations (int): Number of iterations to perform on each pair of samples.
            Default is 10,000.

        method (str): Similarity/divergence method to use for comparison.
            Default is ``marisita-horn``. Options are:

            - ``marisita-horn``

            - ``kullback-leibler``

            - ``jensen-shannon``

            - ``jaccard``

            - ``bray-curtis``

            - ``renkonen``

            - ``cosine``

        control_similarity (bool): If ``True``, control similarity/divergence will be calculated,
            in which each sample is also compared to itself. Default is ``False``.

        chain (str): Antibody chain to be used for comparison. Options are ``heavy``, ``kappa``
            and ``lambda``. Default is ``heavy``.
    '''
    args = Args(**kwargs)
    global logger
    logger = log.get_logger('abcompare')
    main(args)
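# A hedged usage sketch, assuming this run() function is importable as abtools.abcompare.run.
# The database and collection names are hypothetical; an output path (referenced by the
# standalone script above as args.output) is assumed to be accepted as a keyword as well.
from abtools import abcompare

abcompare.run(db='my_ngs_database',
              collection1='subject1_timepoint1',
              collection2='subject1_timepoint2',
              output='/data/abcompare',
              method='marisita-horn',
              iterations=10000,
              chain='heavy')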
        seqs = get_seqs(db, collection, args, make_seq_db=False)
        unique_file = unix_sort_unique(seqs, args)
        write_nr_output(collection, unique_file, collection_start, args)
    else:
        seq_db_path = get_seqs(db, collection, args)
        initial_clusters = initial_clustering(seq_db_path, args)
        if args.min_seqs == 1:
            singletons = [ic for ic in initial_clusters if ic.size == 1]
            initial_clusters = [ic for ic in initial_clusters if ic.size > 1]
            logger.info('{} clusters contained only a single sequence. Processing singletons...'.format(len(singletons)))
            singleton_consentroids = process_singleton_clusters(singletons, seq_db_path, args)
            logger.info('')
        else:
            singleton_consentroids = []
        consentroids = process_initial_clusters(initial_clusters, seq_db_path, args)
        consentroids += singleton_consentroids
        sequences, sizes = zip(*consentroids)
        write_output(sample_name, sequences, sizes, collection_start, args)
        for ic in initial_clusters:
            ic.cleanup()
        remove_sqlite_db(args)


if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    logfile = args.log if args.log else os.path.join(args.output, 'abcorrect.log')
    log.setup_logging(logfile)
    logger = log.get_logger('abcorrect')
    main(args)
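# The non-redundant branch above relies on a helper (unix_sort_unique) whose implementation
# isn't shown here. Below is a minimal sketch of the sort | uniq approach it describes;
# the file handling and the .sequence attribute access are assumptions, not the real helper.
import subprocess as sp
import tempfile

def sort_unique_sketch(seqs, temp_dir=None):
    # write one nucleotide sequence per line, then collapse exact duplicates with sort | uniq;
    # this only removes 100%-identical (and equal-length) reads
    in_handle = tempfile.NamedTemporaryFile(mode='w', dir=temp_dir, delete=False)
    in_handle.write('\n'.join(s.sequence for s in seqs))
    in_handle.close()
    unique_file = in_handle.name + '.unique'
    cmd = 'sort {} | uniq > {}'.format(in_handle.name, unique_file)
    p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    p.communicate()
    return unique_file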