Example #1
def _process_zip_files(zip_files):
    out_dirs = []
    for z in zip_files:
        out_path = '.'.join(z.split('.')[:-1])
        make_dir(out_path)
        zhandle = ZipFile(z)
        zhandle.extractall(out_path)
        out_dirs.append(out_path)
    return out_dirs
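A rough usage sketch of the helper above (the archive path is illustrative; make_dir is assumed to be the library's directory-creation helper and ZipFile is zipfile.ZipFile from the standard library):

# hypothetical archive; the contents are extracted into 'runs/plate_01'
extracted_dirs = _process_zip_files(['runs/plate_01.zip'])
# extracted_dirs == ['runs/plate_01']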
Example #2
def main(args):
    _print_start_info(args)
    if args.sleep:
        countdown(args)
    for d in [args.output, args.temp_dir]:
        make_dir(d)
    if args.consensus and args.germs:
        germs = parse_germs(args.germs)
    else:
        germs = args.germs
    # check whether JSON files have been passed
    if args.json is not None and all([args.db is None, args.collection is None]):
        if os.path.isfile(args.json) and args.json.endswith('.json'):
            collections = [args.json, ]
        else:
            collections = list_files(args.json, extension='json')
        db = None
        sample_names = [os.path.basename(c).replace('.json', '') for c in collections]
    # check whether MINIMAL files have been passed
    # (elif, so the MongoDB fallback below doesn't clobber JSON input)
    elif args.minimal_input is not None and all([args.db is None, args.collection is None]):
        if os.path.isfile(args.minimal_input) and args.minimal_input.endswith('.txt'):
            collections = [args.minimal_input, ]
        else:
            collections = list_files(args.minimal_input, extension='txt')
        db = None
        sample_names = [os.path.basename(c).replace('.txt', '') for c in collections]
    # otherwise, get sequences from MongoDB
    else:
        db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
        collections = mongodb.get_collections(db, collection=args.collection)
        sample_names = collections
    for collection, sample_name in zip(collections, sample_names):
        collection_start = time.time()
        print_collection_info(collection, sample_name)
        if args.non_redundant:
            seqs = get_seqs(db, collection, args, make_seq_db=False)
            unique_file = unix_sort_unique(seqs, args)
            write_nr_output(collection, unique_file, collection_start, args)
        else:
            seq_db_path = get_seqs(db, collection, args)
            initial_clusters = initial_clustering(seq_db_path, args)
            if args.min_seqs == 1:
                singletons = [ic for ic in initial_clusters if ic.size == 1]
                initial_clusters = [ic for ic in initial_clusters if ic.size > 1]
                logger.info('{} clusters contained only a single sequence. Processing singletons...'.format(len(singletons)))
                singleton_consentroids = process_singleton_clusters(singletons, seq_db_path, args)
                logger.info('')
            else:
                singleton_consentroids = []
            consentroids = process_initial_clusters(initial_clusters, seq_db_path, args)
            consentroids += singleton_consentroids
            sequences, sizes = zip(*consentroids)
            write_output(sample_name, sequences, sizes, collection_start, args)
            for ic in initial_clusters:
                ic.cleanup()
            remove_sqlite_db(args)
Example #3
def split_file(json, args):
    split_files = []
    temp_dir = args.temp if args.temp is not None else os.path.join(args.mongo_input_dir, "temp")
    make_dir(temp_dir)
    with open(json) as f:
        for chunk in itertools.izip_longest(*[f] * args.split_file_lines):
            chunk = [c for c in chunk if c is not None]
            fname = os.path.join(temp_dir, str(uuid.uuid4()) + ".json")
            open(fname, "w").write("".join(chunk))
            split_files.append(fname)
    return split_files
Example #4
def split_file(json, args):
    split_files = []
    temp_dir = args.temp if args.temp is not None else os.path.join(
        args.mongo_input_dir, 'temp')
    make_dir(temp_dir)
    with open(json) as f:
        for chunk in itertools.izip_longest(*[f] * args.split_file_lines):
            chunk = [c for c in chunk if c is not None]
            fname = os.path.join(temp_dir, str(uuid.uuid4()) + '.json')
            open(fname, 'w').write(''.join(chunk))
            split_files.append(fname)
    return split_files
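Note that itertools.izip_longest exists only in Python 2; on Python 3 the same chunking idiom uses itertools.zip_longest. A self-contained Python 3 sketch of the approach (split_json_lines and its defaults are illustrative, not part of the library):

import itertools
import os
import uuid

def split_json_lines(path, out_dir, lines_per_chunk=1000):
    # zip_longest(*[f] * n) yields n consecutive lines per iteration,
    # padding the final chunk with None once the file is exhausted.
    os.makedirs(out_dir, exist_ok=True)
    chunk_files = []
    with open(path) as f:
        for chunk in itertools.zip_longest(*[f] * lines_per_chunk):
            lines = [line for line in chunk if line is not None]
            chunk_path = os.path.join(out_dir, str(uuid.uuid4()) + '.json')
            with open(chunk_path, 'w') as out:
                out.write(''.join(lines))
            chunk_files.append(chunk_path)
    return chunk_files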
Example #5
def download(download_directory, project_id=None, project_name=None):
    '''
    Downloads sequencing data from BaseSpace (Illumina's cloud storage platform).

    Before accessing BaseSpace through the AbStar API, you need to set up a
    credentials file:

    1. You need a BaseSpace access token. The easiest way to get one is to
       set up a BaseSpace developer account following
       `these instructions <https://support.basespace.illumina.com/knowledgebase/articles/403618-python-run-downloader>`_

    2. Make a BaseSpace credentials file using your developer credentials::

        $ make_basespace_credfile

    and follow the instructions.


    Examples:

        If you know the name of the project you'd like to download::

            from abstar.utils import basespace

            basespace.download('/path/to/download_directory', project_name='MyProject')

        If you know the ID of the project you'd like to download::

            basespace.download('/path/to/download_directory', project_id='ABC123')

        If neither ``project_id`` nor ``project_name`` is provided, a list of your available
        BaseSpace projects will be displayed, and you can select a project from that list::

            basespace.download('/path/to/download_directory')

    Args:

        download_directory (str): Directory into which the raw sequence files should
            be downloaded. If the directory does not exist, it will be created.

        project_id (str): ID of the project to be downloaded.

        project_name (str): Name of the project to be downloaded.

    Returns:

        int: The number of sequence files downloaded.
    '''
    make_dir(download_directory)
    bs = BaseSpace(project_id, project_name)
    return bs.download(download_directory)
Example #6
def schief_csv_output(pairs, output_file, sep=',', legacy_abstar=True):
    make_dir(os.path.dirname(output_file))
    header = _get_schief_output_header(sep)
    output = [
        header,
    ]
    for p in sorted(pairs, key=lambda x: _get_name(x)):
        name = _get_name(p)
        line = [
            name,
        ]
        line += _get_pair_metadata(p)
        line += _schief_output_line(p.heavy, legacy_abstar)
        line += _schief_output_line(p.light, legacy_abstar)
        output.append(sep.join([str(l) for l in line]))
    open(output_file, 'w').write('\n'.join(output))
Example #7
def fastqc(input_directory, output_directory=None, threads=-1):
    '''
    Performs FASTQC analysis on raw NGS data.


    Args:

        input_directory (str): Path to the input directory, containing one
            or more FASTQ files (either gzip compressed or uncompressed).

        output_directory (str): Path to the output directory, where the FASTQC
            results will be deposited. If not provided, a directory named
            'fastqc_reports' will be created in the parent directory of
            ``input_directory``

        threads (int): Number of threads to be used (passed to the ``-t`` flag
            when running ``fastqc``). Default is -1, which uses all cores.


    Returns:

        str: path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'fastqc_reports')
    make_dir(output_directory)
    files = list_files(input_directory)
    if threads < 0:
        threads = cpu_count()
    fastqc_cmd = 'fastqc --noextract -o={} -t={} {}'.format(output_directory,
        threads, ' '.join(files))
    p = Popen(fastqc_cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    logger.debug(stdout)
    logger.debug(stderr)
    return output_directory
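A brief usage sketch of the wrapper above (the input path is illustrative; fastqc must be installed and on the PATH):

# run FastQC on every FASTQ file in a directory, using 8 threads;
# reports are written to a 'fastqc_reports' directory created next to the input directory
report_dir = fastqc('/data/run_01/fastqs', threads=8)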
Example #8
def quality_trim(input_directory=None, output_directory=None,
        quality_cutoff=20, length_cutoff=50,
        quality_type='sanger', compress_output=True, file_pairs=None,
        singles_directory=None, nextseq=False, paired_reads=True,
        allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
            that do not follow the MiSeq naming scheme, you can group the paired
            read files yourself and pass them via ``file_pairs``.

        output_directory (str): Path to the output directory, into which quality-
            trimmed read files will be deposited. If not provided, a directory
            will be created in the parent directory of ``input_directory``.
            Required if using ``file_pairs`` instead of ``input_directory``.

        quality_cutoff (int): Quality score at which to truncate reads. Default
            is ``20``.

        length_cutoff (int): Reads will be discarded if, after quality trimming,
            the length is shorter than this cutoff. Default is ``50``.

        quality_type (str): Quality score type. Options are ``solexa``, ``illumina``,
            and ``sanger``. ``illumina`` is equivalent to Casava 1.3-1.7 and ``sanger`` is
            Casava >= 1.8. Default is ``sanger``.

        compress_output (bool): If ``True``, output files will be gzip compressed.
            Default is ``True``.

        file_pairs (list): If input files are paired-end reads that don't follow Illumina's
            MiSeq naming scheme, you can pass a list of lists/tuples, with each list/tuple
            containing a pair of read file paths.

        singles_directory (str): Path to singles output directory. If processing paired reads
            and one read of the pair passes quality/length filters and the other doesn't,
            the single passing read will be written to this directory. Default is ``None``, which
            results in the single sequences being discarded and not written to file.

        nextseq (bool): Set to ``True`` if the sequencing data comes from a NextSeq run. The
            file naming scheme for NextSeq runs differs from that of MiSeq runs, and setting
            this option will allow NextSeq paired read files to be processed appropriately.
            Default is ``False``.

        paired_reads (bool): If ``True``, reads will be processed as paired reads. If ``False``,
            each read will be processed separately. It is not advisable to process paired
            reads with ``paired_reads`` set to ``False`` because if paired read files are
            processed separately and one read passes filters while the paired read doesn't,
            this may cause problems with downstream processes (like read merging).

        allow_5prime_trimming (bool): If ``True``, quality trimming will be performed
            on the 5' end of the reads as well as the 3' end. Default is ``False``.


    Returns:

        str: Path to the output directory
    '''
    if input_directory is None and any([file_pairs is None, output_directory is None]):
        err = '\nERROR: Either an input_directory must be provided or '
        err += 'both file_pairs and an output_directory must be provided.\n'
        print(err)
        sys.exit(1)
    if file_pairs:
        files = file_pairs
    else:
        input_directory = os.path.normpath(input_directory)
        if output_directory is None:
            oparent = os.path.dirname(input_directory)
            output_directory = os.path.join(oparent, 'quality_trimmed')
        make_dir(output_directory)
        if paired_reads:
            files = list_files(input_directory)
            file_pairs = pair_files(files, nextseq)
            files = file_pairs.values()
        else:
            files = [[f] for f in list_files(input_directory)]
    for f in files:
        logger.info(f)
        if len(f) == 2:
            paired_end = True
        elif len(f) == 1:
            paired_end = False
        else:
            err = 'ERROR: Each batch of files must contain either 1 (single-end reads) or '
            err += '2 (paired-end reads) files. This batch contains {} files:\n{}'.format(
                len(f), '\n'.join(f))
            err2 = 'If you have paired-end reads that do not follow the Illumina naming scheme, '
            err2 += 'you can pass pairs of filenames (a list of lists/tuples) with the <file_pairs> option. '
            err2 += 'If using <file_pairs>, the output directory must also be provided.'
            logger.info(err)
            logger.info(err2)
            continue
        f.sort()
        # set basic sickle cmd options
        sickle = 'sickle pe' if paired_end else 'sickle se'
        sickle += ' -t {}'.format(quality_type)
        sickle += ' -l {}'.format(length_cutoff)
        sickle += ' -q {}'.format(quality_cutoff)
        if compress_output:
            sickle += ' -g'
        if not allow_5prime_trimming:
            sickle += ' -x'
        # compute input/output filenames, add to sickle cmd
        sickle += ' -f {}'.format(f[0])
        o1_basename = os.path.basename(f[0]).rstrip('.gz')
        if compress_output:
            o1_basename += '.gz'
        sickle += ' -o {}'.format(os.path.join(output_directory, o1_basename))
        if paired_end:
            sickle += ' -r {}'.format(f[1])
            o2_basename = os.path.basename(f[1]).rstrip('.gz')
            if compress_output:
                o2_basename += '.gz'
            sickle += ' -p {}'.format(os.path.join(output_directory, o2_basename))
        # compute singles output filename, add to sickle cmd
        if paired_end:
            if singles_directory is not None:
                sfilename = '{}_{}_singles.fastq'.format(
                    o1_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'),
                    o2_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'))
                if compress_output:
                    sfilename += '.gz'
                sickle += ' -s {}'.format(os.path.join(singles_directory, sfilename))
            else:
                sickle += ' -s /dev/null'
        if print_debug:
            print(sickle)
        # run sickle
        p = Popen(sickle, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
        if print_debug:
            print(stdout)
            print('')
            print(stderr)
            print('')
    return output_directory
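A usage sketch for reads that don't follow the MiSeq naming scheme, using the file_pairs option described above (paths are illustrative; output_directory is required in this mode, and each pair is given as a list so the function can sort it in place):

pairs = [
    ['/data/sampleA_R1.fastq.gz', '/data/sampleA_R2.fastq.gz'],
    ['/data/sampleB_R1.fastq.gz', '/data/sampleB_R2.fastq.gz'],
]
trimmed_dir = quality_trim(file_pairs=pairs,
                           output_directory='/data/quality_trimmed',
                           quality_cutoff=20, length_cutoff=50)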
Example #9
def adapter_trim(input_directory, output_directory=None,
        adapter_5prime=None, adapter_3prime=None,
        adapter_5prime_anchored=None, adapter_3prime_anchored=None,
        adapter_both=None, compress_output=True):
    '''
    Trims adapters with cutadapt.

    Args:

        input_directory (str): Path to a directory of FASTQ files
            to be adapter trimmed. Required.

        output_directory (str): Path to the output directory. If
            not provided, a directory will be created in the parent
            directory of ``input_directory``.

        adapter_5prime (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 5' end of reads.

        adapter_3prime (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 3' end of reads.

        adapter_5prime_anchored (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 5' end of reads. The adapter match
            must be anchored at the 5' end of the read, which is stricter
            than ``adapter_5prime``.

        adapter_3prime_anchored (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 3' end of reads. The adapter match
            must be anchored at the 3' end of the read, which is stricter
            than ``adapter_3prime``.

        adapter_both (str): Path to a FASTA-formatted file of adapters
            that will be trimmed from either end of the reads.

        compress_output (bool): If ``True``, output files will be gzip
            compressed. Default is ``True``.

    Returns:

        str: Path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'adapter_trimmed')
    make_dir(output_directory)
    files = list_files(input_directory)
    # parse adapter FASTA files, compile adapter option list
    adapters = []
    opts = ['-g', '-a', '-b']
    adapt_files = [adapter_5prime, adapter_3prime, adapter_both]
    for o, a in zip(opts, adapt_files):
        if a is None:
            continue
        adapts = [str(s.seq) for s in SeqIO.parse(open(a, 'r'), 'fasta')]
        adapters += [' '.join(z) for z in zip([o] * len(adapts), adapts)]
    if adapter_5prime_anchored is not None:
        adapts = ['^{}'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_5prime_anchored, 'r'), 'fasta')]
        adapters += ['-g {}'.format(a) for a in adapts]
    if adapter_3prime_anchored is not None:
        adapts = ['{}$'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_3prime_anchored, 'r'), 'fasta')]
        adapters += ['-a {}'.format(a) for a in adapts]
    # process input files
    for ifile in files:
        oname = os.path.basename(ifile).rstrip('.gz')
        if compress_output:
            oname += '.gz'
        ofile = os.path.join(output_directory, oname)
        # set up cutadapt command
        adapter_string = ' '.join(adapters)
        cutadapt = 'cutadapt -o {} {} {}'.format(ofile, adapter_string, ifile)
        # run cutadapt
        p = Popen(cutadapt, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
    return output_directory
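A usage sketch for the cutadapt wrapper above (the input directory and FASTA adapter paths are illustrative; cutadapt must be installed and on the PATH):

trimmed_dir = adapter_trim('/data/run_01/fastqs',
                           adapter_5prime='/data/adapters/5prime.fasta',
                           adapter_3prime='/data/adapters/3prime.fasta')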