Example #1
def setup_logging(log_dir, debug):
    logfile = os.path.join(log_dir, 'abstar.log')
    debug = debug > 0
    # print_debug = True if debug == 2 else False
    log.setup_logging(logfile, debug=debug)
    global logger
    logger = log.get_logger('abstar')
Example #2
def run_standalone(args):
    validate_args(args)
    global logger
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)
Example #3
def build_output(vdjs, output_type, pretty, padding):
    logger = log.get_logger()
    try:
        vdjs = [vdj for vdj in vdjs if vdj.rearrangement]
        if output_type.lower() == 'json':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_json_output(vdj, pretty, padding))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'imgt':
            header, firstvals = _imgt_summary_output(vdjs[0], header=True)
            output = [
                header,
                firstvals,
            ]
            for vdj in vdjs[1:]:
                try:
                    output.append(_imgt_summary_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        elif output_type.lower() == 'hadoop':
            output = []
            for vdj in vdjs:
                try:
                    output.append(_hadoop_minimal_output(vdj))
                except AttributeError:
                    logger.debug('OUTPUT ERROR: {}'.format(vdj.id))
        return output
    except:
        logger.debug(
            'FILE-LEVEL OUTPUT ERROR: sequences {} - {}, output_type = {}'.
            format(vdjs[0].id, vdjs[-1].id, output_type))
        logger.debug(traceback.format_exc())
Example #4
File: jobs.py  Project: menis/abstar
def run_abstar(sequence_file, output_directory, args):
    '''
    Wrapper function to multiprocess (or not) the assignment of V, D and J
    germline genes. Also writes the JSON-formatted output to file.

    Input is a FASTA-formatted file of antibody sequences and the output directory.
    Optional input items include the species (supported species: 'human'); length of
    the unique antibody identifier (UAID); and debug mode (which forces single-threading
    and prints more verbose errors).

    Output is the number of functional antibody sequences identified in the input file.
    '''
    try:
        # setup logging
        global logger
        logger = log.get_logger(__name__)
        assigned_log = ''
        unassigned_log = ''
        # identify output file
        output_filename = os.path.basename(sequence_file)
        if args.output_type == 'json':
            output_file = os.path.join(output_directory, output_filename + '.json')
        elif args.output_type in ['imgt', 'hadoop']:
            output_file = os.path.join(output_directory, output_filename + '.txt')
        # start assignment
        assigner = ASSIGNERS[args.assigner]
        assigner(sequence_file, args.species)
        # process all of the successfully assigned sequences
        assigned = [Antibody(vdj, args.species) for vdj in assigner.assigned]
        for ab in assigned:
            ab.annotate()
            if args.debug:
                assigned_log += ab.format_log()
        results = get_abstar_results(assigned,
                                     pretty=args.pretty,
                                     padding=args.padding,
                                     raw=args.raw)
        write_output(results, output_file, args.output_type)
        # capture the log for all unsuccessful sequences
        # NOTE: assumes the assigner exposes an `unassigned` attribute, mirroring
        # the `assigner.assigned` attribute used above
        unassigned = [Antibody(vdj, args.species) for vdj in assigner.unassigned]
        for vdj in unassigned:
            unassigned_log += vdj.format_log()

        return (len(assigned), assigned_log, unassigned_log)

    #     vdj_output = process_sequence_file(seq_file, args)
    #     if not vdj_output:
    #         return None
    #     clean_vdjs = [vdj for vdj in vdj_output if vdj.rearrangement]
    #     output_count = write_output(clean_vdjs, output_file, args.output_type, args.pretty, args.padding)
    #     return (output_file, output_count)
    except:
        logger.debug(traceback.format_exc())
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
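For context, a minimal invocation sketch for ``run_abstar`` follows. It assumes an argparse-style namespace carrying only the attributes the function reads (``output_type``, ``assigner``, ``species``, ``debug``, ``pretty``, ``padding``, ``raw``); the import path and the assigner name are placeholders, not confirmed by this listing.

from argparse import Namespace

from abstar.jobs import run_abstar  # hypothetical import path; adjust to your checkout

# only the attributes read by run_abstar are populated; values are placeholders
args = Namespace(output_type='json', assigner='blastn', species='human',
                 debug=False, pretty=False, padding=0, raw=False)

# returns (number of assigned sequences, assigned log, unassigned log)
count, assigned_log, unassigned_log = run_abstar('/path/to/seqs.fasta',
                                                 '/path/to/output', args)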
Example #5
File: s3.py  Project: menis/abutils
def compress(d, output, fmt='gz', logger=None):
    '''
    Creates a compressed/uncompressed tar file.

    Args:

        d: Can be one of three things:

            1. the path to a single file, as a string

            2. the path to a single directory, as a string

            3. an iterable of file or directory paths

        output (str): Output file path.

        fmt: Compression method. Options are ``'gz'`` (gzip),
            ``'bz2'`` (bzip2) and ``'none'`` (uncompressed). Default is ``'gz'``.
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not isinstance(d, (list, tuple)):
        d = [d]
    d = [os.path.expanduser(_d) for _d in d]
    print_compress_info(d, output, fmt, logger)
    if fmt.lower() == 'none':
        fmt = ''
    elif fmt.lower() not in ['gz', 'bz2']:
        logger.info(
            'Compression option ("{}") is invalid.\nFalling back to uncompressed.'
            .format(fmt))
        fmt = ''
    output = os.path.expanduser(output)
    tar = tarfile.open(output, 'w:{}'.format(fmt))
    for obj in d:
        tar.add(obj)
    tar.close()
    return output
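A short usage sketch for ``compress``, assuming the module is importable as ``abutils.utils.s3`` (consistent with the ``abutils.utils`` imports elsewhere in this listing); paths are placeholders:

from abutils.utils import s3  # assumed module path

# bundle a results directory and a log file into a single gzipped tarball
archive = s3.compress(['/path/to/results', '/path/to/run.log'],
                      '/path/to/archive.tar.gz', fmt='gz')
# compress() returns the (user-expanded) output path
print(archive)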
Example #6
File: s3.py  Project: menis/abutils
def print_compress_info(d, output, compress, logger):
    if not logger:
        logger = log.get_logger('s3')
    dirs = [obj for obj in d if os.path.isdir(obj)]
    files = [obj for obj in d if os.path.isfile(obj)]
    logger.info('')
    logger.info('')
    logger.info('')
    logger.info('-' * 25)
    logger.info('COMPRESSING DATA')
    logger.info('-' * 25)
    logger.info('')
    logger.info('Output file: {}'.format(output))
    logger.info('Compression: {}'.format(compress.lower()))
    if dirs:
        d = 'directories' if len(dirs) > 1 else 'directory'
        logger.info('Found {} {} to compress: {}'.format(
            len(dirs), d, ', '.join(dirs)))
    if files:
        f = 'files' if len(files) > 1 else 'file'
        logger.info('Found {} {} to compress: {}'.format(
            len(files), f, ', '.join(files)))
Example #7
File: s3.py  Project: menis/abutils
def configure(access_key=None, secret_key=None, logger=None):
    '''
    Configures s3cmd prior to first use.

    If no arguments are provided, you will be prompted to enter
    the access key and secret key interactively.

    Args:

        access_key (str): AWS access key

        secret_key (str): AWS secret key
    '''
    if not logger:
        logger = log.get_logger('s3')
    if not all([access_key, secret_key]):
        logger.info('')
        access_key = input('AWS Access Key: ')
        secret_key = input('AWS Secret Key: ')
    _write_config(access_key, secret_key)
    logger.info('')
    logger.info('Completed writing S3 config file.')
    logger.info('')
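A usage sketch for ``configure``; passing both keys skips the interactive prompts (module path assumed as above, key values are placeholders):

from abutils.utils import s3  # assumed module path

# non-interactive configuration; omit the keys to be prompted for them instead
s3.configure(access_key='<aws-access-key>', secret_key='<aws-secret-key>')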
Example #8
File: s3.py  Project: menis/abutils
def put(f, s3_path, multipart_chunk_size_mb=500, logger=None):
    '''
    Uploads a single file to S3, using s3cmd.

    Args:

        f (str): Path to a single file.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
            will be the basename of ``f``. For example::

                put(f='/path/to/myfile.tar.gz', s3_path='s3://my_bucket/path/to/')

            will result in an uploaded S3 path of ``s3://my_bucket/path/to/myfile.tar.gz``
    '''
    if not logger:
        logger = log.get_logger('s3')
    fname = os.path.basename(f)
    target = os.path.join(s3_path, fname)
    s3cmd_cline = 's3cmd put {} {} --multipart-chunk-size-mb {}'.format(
        f, target, multipart_chunk_size_mb)
    print_put_info(fname, target, logger)
    s3cmd = sp.Popen(s3cmd_cline, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    stdout, stderr = s3cmd.communicate()
Example #9
def run(*args, **kwargs):
    '''
    Runs AbStar.

    Input sequences can be provided in several different formats:

        1) individual sequences as positional arguments: ``run(seq1, seq2, temp=temp, output=output)``
        2) a list of sequences, as an argument: ``run([seq1, seq2], temp=temp, output=output)``
        3) a single FASTA/Q-formatted input file, passed via ``input``
        4) a directory of FASTA/Q-formatted files, passed via ``input``

    When passing sequences (not FASTA/Q files), the sequences can be in any format recognized
    by ``abtools.sequence.Sequence``, including:

        - a raw nucleotide sequence, as a string (a random sequence ID will be assigned)
        - a list/tuple of the format ``[sequence_id, sequence]``
        - a BioPython SeqRecord object
        - an AbTools Sequence object

    Either sequences, ``project_dir``, or all of ``input``, ``output`` and ``temp`` are required.


    Examples:

        If processing a single sequence, you can pass the raw sequence, as a string::

            import abstar

            result = abstar.run('ATGC')

        or a list/tuple of the format ``[sequence_id, sequence]``::

            result = abstar.run(['seq1', 'ATGC'])

        If you pass just the raw sequence, a random sequence ID will be generated with ``uuid.uuid4()``.
        In either case, when given a single sequence, ``abstar.run()`` will return a single AbTools ``Sequence``
        object. If running multiple sequences, you can either pass each sequence as a positional argument::

            result_list = run(['seq1', 'ATGC'], ['seq2', 'CGTA'])

        or you can pass a list of sequences as the first argument, in this case using sequences parsed from a
        FASTA file using Biopython::

            from Bio import SeqIO

            fasta = open('my_sequences.fasta', 'r')
            seqs = [s for s in SeqIO.parse(fasta, 'fasta')]
            result_list = abstar.run(seqs)

        When given multiple sequences, ``abstar.run()`` will return a list of AbTools ``Sequence`` objects,
        one per input sequence.

        If you'd prefer not to parse the FASTQ/A file into a list (for example, if the input file is
        extremely large), you can pass the input file path directly, along with a temp directory and output
        directory::

            result_files = abstar.run(input='/path/to/my_sequences.fasta',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        Given a file path, ``abstar.run()`` returns a list of output file paths. In the above case,
        ``result_files`` will be a list containing a single output file path:
        ``/path/to/output/my_sequences.json``.

        If you have a directory containing multiple FASTQ/A files, you can pass the directory path
        using ``input``::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output')

        As before, ``result_files`` will contain a list of output file paths.

        If your input directory contains paired FASTQ files (gzip compressed or uncompressed)
        that need to be merged prior to processing with AbStar::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      merge=True)

        The paired read files in ``input`` will be merged with PANDAseq prior to processing with AbStar.
        By default, PANDAseq's 'simple bayesian' read merging algorithm is used, although alternate
        algorithms can be selected with ``pandaseq_algo``.

        AbStar also provides an alternate CSV-formatted output type that mimics the `IMGT Summary file`_.
        This option is provided to minimize the effort needed to convert existing
        IMGT-based pipelines to AbStar. Alternate output is only available when passing an input file or
        directory; passing individual sequences or a list of sequences will always return Sequence objects.
        To produce IMGT-formatted output::

            result_files = abstar.run(input='/path/to/input',
                                      temp='/path/to/temp',
                                      output='/path/to/output',
                                      output_type='imgt')

        .. _IMGT Summary file: http://www.imgt.org/IMGT_vquest/share/textes/imgtvquest.html#Esummary


    Args:

        project_dir (str): Path to the project directory. Most useful when downloading
            files directly from BaseSpace; all subdirectories will be created by AbStar.

        input (str): Path to input directory, containing FASTA/Q files. If performing
            read merging with PANDAseq, paired FASTQ files may be gzip compressed.

        output (str): Path to output directory.

        temp (str): Path to temp directory, where intermediate job files will be stored.

        log (str): Path to log file. If not provided and ``project_dir`` is provided,
            the log will be written to ``/path/to/project_dir/abstar.log``. If output is
            provided, log will be written to ``/path/to/output/abstar.log``.

        species (str): Species of the antibody sequences. Choices are 'human', 'macaque',
            'mouse' and 'rabbit'. Default is 'human'.

        isotype (bool): If True, the isotype will be inferred by aligning the sequence region
            downstream of the J-gene. If False, the isotype will not be determined.
            Default is True.

        uid (int): Length (in nucleotides) of the Unique Molecular ID used to barcode input RNA.
            A positive integer results in the UMID being parsed from the start of the read (or merged
            read), a negative integer results in parsing from the end of the read. Default is 0,
            which results in no UMID parsing.

        gzip (bool): If True, compresses output files with gzip. Default is False.

        pretty (bool): If True, formats JSON output files to be more human-readable. If False,
            JSON output files contain one record per line. Default is False.

        output_type (str): Options are 'json' or 'imgt'. IMGT output mimics the Summary
            table produced by IMGT High-V/Quest, to maintain a level of compatibility with
            existing IMGT-based pipelines. JSON output is much more detailed. Default is 'json'.

        merge (bool): If True, input must be paired-read FASTQ files (gzip compressed or uncompressed)
            which will be merged with PANDAseq prior to processing with AbStar. If ``basespace`` is True,
            ``merge`` is automatically set to True. Default is False.

        pandaseq_algo (str): Define merging algorithm to be used by PANDAseq. Options are
            'simple_bayesian', 'ea_util', 'flash', 'pear', 'rdp_mle', 'stitch', or 'uparse'. Default is
            'simple_bayesian', which is the default PANDAseq algorithm.

        debug (bool): If ``True``, ``abstar.run()`` runs in single-threaded mode, the log is much more verbose,
            and temporary files are not removed. Default is ``False``.


    Returns:

        If the input is a single sequence, ``run`` returns a single AbTools ``Sequence`` object.

        If the input is a list of sequences, ``run`` returns a list of AbTools ``Sequence`` objects.

        If the input is a file or a directory of files, ``run`` returns a list of output files.
    '''

    warnings.filterwarnings("ignore")
    # sequences stays None when input is supplied via kwargs (a file or directory)
    # rather than as positional sequence arguments
    sequences = None
    if len(args) == 1:
        # if there's a single arg, need to check if it's a single sequence...
        try:
            sequences = [
                Sequence(args[0]),
            ]
        except:
            # ...or a list of sequences
            try:
                sequences = [Sequence(s) for s in args[0]]
            except:
                print('ERROR: invalid format for sequence input:')
                for a in args:
                    print(a)
                sys.exit(1)
    # if multiple args, assume each is a sequence
    elif len(args) > 1:
        try:
            sequences = [Sequence(s) for s in args]
        except:
            print('ERROR: invalid format for sequence input:')
            for a in args:
                print(a)
            sys.exit(1)
    kwargs['sequences'] = sequences
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('abstar')
    output = main(args)
    # if args.sequences is not None:
    #     output = [Sequence(o) for o in output]
    #     if len(output) == 1:
    #         return output[0]
    return output
Example #10
def main(args, logfile=None):
    global logger
    logger = log.get_logger('demultiplex')
    print_start_info()
    if all([args.index is None, args.index_file is None]):
        err = 'Indexes must be provided, either using --index or --index-file'
        raise RuntimeError(err)
    log_options(args, logfile=logfile)
    make_directories(args)
    open(args.output, 'w').write('')
    db = mongodb.get_db(args.db,
                        ip=args.ip,
                        port=args.port,
                        user=args.user,
                        password=args.password)
    plate_map = parse_plate_map(args.plate_map)
    # all_seqs = []
    collections = mongodb.get_collections(db,
                                          args.collection,
                                          prefix=args.collection_prefix,
                                          suffix=args.collection_suffix)
    for collection in collections:
        if collection not in plate_map:
            logger.info(
                '\n\n{} was not found in the supplied plate map file.'.format(
                    collection))
            continue
        plate_names = plate_map[collection]
        for plate_num, plate_name in enumerate(plate_names):
            if plate_name is None:
                continue
            print_plate_info(plate_name, collection)
            indexes = get_indexes(args.index, args.index_file,
                                  args.index_length, plate_num)
            for chain in ['heavy', 'kappa', 'lambda']:
                plate_seqs = []
                logger.info('')
                logger.info('Querying for {} chain sequences'.format(chain))
                score_cutoff = args.score_cutoff_heavy if chain == 'heavy' else args.score_cutoff_light
                sequences = get_sequences(db, collection, chain, score_cutoff)
                logger.info(
                    'QUERY RESULTS: {} {} chain sequences met the quality threshold'
                    .format(len(sequences), chain.lower()))
                bins = bin_by_index(sequences, indexes, args.index_length,
                                    args.index_position,
                                    args.index_reverse_complement,
                                    args.raw_seq_field)
                if args.minimum_well_size == 'relative':
                    min_well_size = int(
                        len(sequences) / float(args.minimum_well_size_denom))
                else:
                    min_well_size = int(args.minimum_well_size)
                min_max_well_size = max(min_well_size,
                                        args.minimum_max_well_size)
                if max([len(b) for b in list(bins.values())
                        ]) < int(min_max_well_size):
                    logger.info(
                        'The biggest well had fewer than {} sequences, so the plate was not processed'
                        .format(min_max_well_size))
                    continue
                for b in sorted(bins.keys()):
                    if len(bins[b]) < 25:
                        continue
                    print_bin_info(b)
                    if args.raw_sequence_dir is not None:
                        rs_handle = open(
                            os.path.join(
                                args.raw_sequence_dir,
                                '{}-{}_{}'.format(plate_name, b, chain)),
                            'w')
                        rs_handle.write('\n'.join(
                            ['>{}\n{}'.format(s[0], s[1]) for s in bins[b]]))
                        rs_handle.close()
                    consentroid = cdhit_clustering(
                        bins[b], b, plate_name, args.temp_dir, len(sequences),
                        args.minimum_well_size, args.minimum_well_size_denom,
                        args.minimum_cluster_fraction, args.raw_sequence_dir,
                        args.alignment_pixel_dir, args.consensus,
                        args.cdhit_threshold, chain)
                    if consentroid:
                        consentroid_name = '{}-{}'.format(plate_name, b)
                        plate_seqs.append((consentroid_name, consentroid))
                log_output(bins, plate_seqs, min_well_size)
                # all_seqs.extend(plate_seqs)
                write_output(plate_seqs, args.output)
                logger.info('')
    logger.info('')
Example #11
from __future__ import absolute_import, division, print_function, unicode_literals

from multiprocessing import cpu_count
import os
from subprocess import Popen, PIPE
import sys

from Bio import SeqIO

from .utils.pandaseq import pair_files

from abutils.utils.log import get_logger
from abutils.utils.pipeline import list_files, make_dir


logger = get_logger('preprocess')


def quality_trim(input_directory=None, output_directory=None,
        quality_cutoff=20, length_cutoff=50,
        quality_type='sanger', compress_output=True, file_pairs=None,
        singles_directory=None, nextseq=False, paired_reads=True,
        allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
Example #12
def run(**kwargs):
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('clonify')
    main(args)
Example #13
            cluster_sizes = update_db(clusters, collection_group)
        else:
            cluster_sizes = [c.size for c in clusters]
        print_finished(cluster_sizes)


def run_standalone(args):
    validate_args(args)
    global logger
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)


def run(**kwargs):
    args = Args(**kwargs)
    validate_args(args)
    global logger
    logger = log.get_logger('clonify')
    main(args)


if __name__ == '__main__':
    args = parse_args()
    validate_args(args)
    logfile = get_logfile(args)
    log.setup_logging(logfile, debug=args.debug)
    logger = log.get_logger('clonify')
    main(args)
Example #14
File: s3.py  Project: menis/abutils
def compress_and_upload(data,
                        compressed_file,
                        s3_path,
                        multipart_chunk_size_mb=500,
                        method='gz',
                        delete=False,
                        access_key=None,
                        secret_key=None):
    '''
    Compresses data and uploads to S3.

    S3 upload uses ``s3cmd``, so you must either:

        1) Manually configure ``s3cmd`` prior to use (typically using ``s3cmd --configure``).

        2) Configure ``s3cmd`` using ``s3.configure()``.

        3) Pass your access key and secret key to ``compress_and_upload``, which will automatically configure s3cmd.

    .. note::

        ``s3cmd`` configuration only needs to be done once per computer,
        which means that relaunching a cloud instance or Docker image will
        require re-configuration of ``s3cmd``.

    Args:

        data: Can be one of three things:

            1) Path to a single file

            2) Path to a directory

            3) A list of one or more paths to files or directories

        compressed_file (str): Path to the compressed file. Required.

        s3_path (str): The S3 path, with the filename omitted. The S3 filename
          will be the basename of the ``compressed_file``. For example::

            compress_and_upload(data='/path/to/data',
                                compressed_file='/path/to/compressed.tar.gz',
                                s3_path='s3://my_bucket/path/to/')

          will result in an uploaded S3 path of ``s3://my_bucket/path/to/compressed.tar.gz``

        method (str): Compression method. Options are ``'gz'`` (gzip) or ``'bz2'`` (bzip2).
            Default is ``'gz'``.

        delete (bool): If ``True``, the ``compressed_file`` will be deleted after upload
            to S3. Default is ``False``.

        access_key (str): AWS access key.

        secret_key (str): AWS secret key.
    '''
    logger = log.get_logger('s3')
    if all([access_key, secret_key]):
        configure(access_key=access_key, secret_key=secret_key, logger=logger)
    compress(data, compressed_file, fmt=method, logger=logger)
    put(compressed_file,
        s3_path,
        multipart_chunk_size_mb=multipart_chunk_size_mb,
        logger=logger)
    if delete:
        os.unlink(compressed_file)
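An end-to-end sketch tying the pieces above together with ``compress_and_upload``; the module path, bucket, and local paths are placeholders:

from abutils.utils import s3  # assumed module path

# compress the results directory into a tarball, upload it with s3cmd,
# then delete the local tarball
s3.compress_and_upload(data='/path/to/results',
                       compressed_file='/path/to/results.tar.gz',
                       s3_path='s3://my_bucket/backups/',
                       method='gz',
                       delete=True)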
Example #15
import json
import os
import platform
import sys
import time

from BaseSpacePy.api.BaseSpaceAPI import BaseSpaceAPI
from BaseSpacePy.model.QueryParameters import QueryParameters as qp

from abutils.utils import log
from abutils.utils.pipeline import make_dir

if sys.version_info[0] > 2:
    raw_input = input

logger = log.get_logger('basespace')


class BaseSpace(object):
    def __init__(self,
                 project_id=None,
                 project_name=None,
                 get_all_projects=False):
        super(BaseSpace, self).__init__()
        # BaseSpace credentials
        creds = self._get_credentials()
        self.client_key = creds['client_id']
        self.client_secret = creds['client_secret']
        self.access_token = creds['access_token']
        self.version = creds['version']
        self.api_server = creds['api_server']