Ejemplo n.º 1
0
def abi_to_fasta(input, output):
    '''
    Converts ABI or AB1 files to FASTA format.

    One FASTA file is written per processed directory: one for ``input``
    itself, plus one for every entry returned by ``_process_zip_files``
    when ``input`` contains zip archives. Each output file is named after
    the basename of the directory it was built from, with a ``.fasta``
    extension.

    Args:

        input (str): Path to a file or directory containing abi/ab1 files or
            zip archives of abi/ab1 files

        output (str): Path to a directory for the output FASTA files
    '''
    direcs = [input, ]
    # unzip any zip archives
    # NOTE(review): assumes _process_zip_files returns the directories into
    # which the archives were extracted -- confirm against its definition.
    zip_files = list_files(input, ['zip'])
    if zip_files:
        direcs.extend(_process_zip_files(zip_files))
    # convert files
    for d in direcs:
        files = list_files(d, ['ab1', 'abi'])
        seqs = []
        for f in files:
            # close each trace file as soon as its single record is parsed,
            # so large directories do not exhaust file descriptors
            with open(f, 'rb') as handle:
                seqs.append(SeqIO.read(handle, 'abi'))
        fastas = ['>{}\n{}'.format(s.id, str(s.seq)) for s in seqs]
        ofile = os.path.basename(os.path.normpath(d)) + '.fasta'
        opath = os.path.join(output, ofile)
        # the context manager guarantees the FASTA data is flushed and the
        # handle closed before moving on to the next directory
        with open(opath, 'w') as ohandle:
            ohandle.write('\n'.join(fastas))
Ejemplo n.º 2
0
def abi_to_fasta(input, output):
    '''
    Converts ABI or AB1 files to FASTA format.

    ``input`` is processed first; if it contains zip archives, every entry
    returned by ``_process_zip_files`` is processed as well. For each
    processed location a single FASTA file, named
    ``<basename of the location>.fasta``, is written into ``output``.

    Args:

        input (str): Path to a file or directory containing abi/ab1 files or
            zip archives of abi/ab1 files

        output (str): Path to a directory for the output FASTA files
    '''
    direcs = [
        input,
    ]
    # unzip any zip archives
    # NOTE(review): presumably _process_zip_files yields extraction
    # directories; verify against its definition.
    zip_files = list_files(input, ['zip'])
    if zip_files:
        direcs.extend(_process_zip_files(zip_files))
    # convert files
    for d in direcs:
        fastas = []
        for f in list_files(d, ['ab1', 'abi']):
            # open/close each trace file explicitly instead of leaking the
            # handle created inline in a comprehension
            with open(f, 'rb') as trace:
                record = SeqIO.read(trace, 'abi')
            fastas.append('>{}\n{}'.format(record.id, str(record.seq)))
        ofile = os.path.basename(os.path.normpath(d)) + '.fasta'
        opath = os.path.join(output, ofile)
        # write through a context manager so the output is flushed and closed
        with open(opath, 'w') as fasta_out:
            fasta_out.write('\n'.join(fastas))
Ejemplo n.º 3
0
def main(args):
    '''
    Runs the clustering workflow for every input collection.

    The input source is selected in this order of precedence (the first two
    only apply when neither ``args.db`` nor ``args.collection`` is set):

      1. JSON file(s) given by ``args.json``
      2. minimal-format text file(s) given by ``args.minimal_input``
      3. otherwise, collections from MongoDB

    For each collection, either non-redundant output is written (when
    ``args.non_redundant`` is set) or the sequences are clustered and the
    per-cluster results are written with ``write_output``.

    Args:

        args: Parsed run options. The attributes read here are ``sleep``,
            ``output``, ``temp_dir``, ``consensus``, ``germs``, ``json``,
            ``minimal_input``, ``db``, ``collection``, ``ip``, ``port``,
            ``user``, ``password``, ``non_redundant`` and ``min_seqs``.
    '''
    _print_start_info(args)
    if args.sleep:
        countdown(args)
    for d in [args.output, args.temp_dir]:
        make_dir(d)
    # NOTE(review): ``germs`` is not used again within this function.
    if args.consensus and args.germs:
        germs = parse_germs(args.germs)
    else:
        germs = args.germs
    no_mongo_target = all([args.db is None, args.collection is None])
    # check whether JSON files have been passed
    if args.json is not None and no_mongo_target:
        if os.path.isfile(args.json) and args.json.endswith('.json'):
            collections = [args.json, ]
        else:
            collections = list_files(args.json, extension='json')
        db = None
        sample_names = [os.path.basename(c).replace('.json', '') for c in collections]
    # check whether MINIMAL files have been passed:
    # (must be ``elif``: as a separate ``if``, its ``else`` clause replaced
    # the JSON collections with MongoDB collections whenever JSON input was
    # given without minimal input)
    elif args.minimal_input is not None and no_mongo_target:
        if os.path.isfile(args.minimal_input) and args.minimal_input.endswith('.txt'):
            collections = [args.minimal_input, ]
        else:
            collections = list_files(args.minimal_input, extension='txt')
        db = None
        sample_names = [os.path.basename(c).replace('.txt', '') for c in collections]
    # otherwise, get sequences from MongoDB
    else:
        db = mongodb.get_db(args.db, args.ip, args.port, args.user, args.password)
        collections = mongodb.get_collections(db, collection=args.collection)
        sample_names = collections
    for collection, sample_name in zip(collections, sample_names):
        collection_start = time.time()
        print_collection_info(collection, sample_name)
        if args.non_redundant:
            seqs = get_seqs(db, collection, args, make_seq_db=False)
            unique_file = unix_sort_unique(seqs, args)
            write_nr_output(collection, unique_file, collection_start, args)
        else:
            seq_db_path = get_seqs(db, collection, args)
            initial_clusters = initial_clustering(seq_db_path, args)
            if args.min_seqs == 1:
                # single-sequence clusters are split off and handled by a
                # dedicated routine; only multi-sequence clusters go through
                # process_initial_clusters
                singletons = [ic for ic in initial_clusters if ic.size == 1]
                initial_clusters = [ic for ic in initial_clusters if ic.size > 1]
                logger.info('{} clusters contained only a single sequence. Processing singletons...'.format(len(singletons)))
                singleton_consentroids = process_singleton_clusters(singletons, seq_db_path, args)
                logger.info('')
            else:
                singleton_consentroids = []
            consentroids = process_initial_clusters(initial_clusters, seq_db_path, args)
            consentroids += singleton_consentroids
            # each consentroid is unpacked as a (sequence, size) pair
            # NOTE(review): this unpacking raises ValueError if no
            # consentroids were produced -- confirm that cannot happen.
            sequences, sizes = zip(*consentroids)
            write_output(sample_name, sequences, sizes, collection_start, args)
            for ic in initial_clusters:
                ic.cleanup()
            remove_sqlite_db(args)
Ejemplo n.º 4
0
 def _get_builtin_matrix(matrix_name):
     '''
     Returns the path to a built-in matrix file.

     Built-in matrices are the files found by ``list_files`` in the
     ``matrices`` directory located next to this module. The lookup is
     performed on the lowercased ``matrix_name``.

     Args:

         matrix_name (str): Name of the matrix (matched case-insensitively
             against the lowercase file names in the matrices directory).

     Returns:

         str: Path to the matrix file.

     Raises:

         RuntimeError: If ``matrix_name`` does not match any built-in matrix.
     '''
     matrix_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matrices')
     matrices = [os.path.basename(f) for f in list_files(matrix_dir)]
     if matrix_name.lower() not in matrices:
         err = 'The matrix name you provided ({}) is not built-in. '.format(matrix_name)
         err += 'Built in matrices are: {}'.format(', '.join(matrices))
         # pass the message along; a bare RuntimeError() gave callers no
         # indication of what went wrong
         raise RuntimeError(err)
     return os.path.join(matrix_dir, matrix_name.lower())
Ejemplo n.º 5
0
 def _get_builtin_matrix(matrix_name):
     '''
     Resolves a built-in matrix name to the path of its matrix file.

     The available matrices are the files that ``list_files`` reports for
     the ``matrices`` directory next to this module; ``matrix_name`` is
     lowercased before it is compared against their file names.

     Args:

         matrix_name (str): Name of the requested matrix.

     Returns:

         str: Path to the matching file inside the matrices directory.

     Raises:

         RuntimeError: If no built-in matrix has the requested name. The
             message lists the available matrices.
     '''
     matrix_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'matrices')
     matrices = [os.path.basename(f) for f in list_files(matrix_dir)]
     wanted = matrix_name.lower()
     if wanted not in matrices:
         err = 'The matrix name you provided ({}) is not built-in. '.format(
             matrix_name)
         err += 'Built in matrices are: {}'.format(', '.join(matrices))
         # the message was previously built but never attached to the
         # exception
         raise RuntimeError(err)
     return os.path.join(matrix_dir, wanted)
Ejemplo n.º 6
0
def main(args):
    '''
    Processes every input workbook found under ``args.input``.

    For each file: the experiment name is obtained, the first worksheet of
    the workbook is split into plate blocks, every block except the first is
    parsed into plates, and the result is written with ``write_output``.

    Args:

        args: Parsed run options; ``args.input`` is read here and ``args`` is
            forwarded to the helper functions.
    '''
    for input_file in list_files(args.input):
        experiment = get_experiment(input_file, args)
        workbook = load_workbook(input_file)
        first_sheet_name = workbook.get_sheet_names()[0]
        worksheet = workbook[first_sheet_name]
        blocks = get_plate_blocks(worksheet, args)
        # the first block is not a plate, so it is excluded from the count
        # and from parsing
        if len(blocks) <= 2:
            suffix = ''
        else:
            suffix = 's'
        found_msg = '\nFound {} plate{} in the input file'.format(
            len(blocks) - 1, suffix)
        logger.info(found_msg)
        logger.info('Experiment name: {}\n'.format(experiment))
        parsed_plates = parse_plates(blocks[1:], args)
        write_output(parsed_plates, experiment, args)
        logger.info('')
Ejemplo n.º 7
0
 def _get_matrix_file(self, match=None, mismatch=None, matrix=None):
     '''
     Returns the path to the scoring matrix file to be used.

     If ``self._matrix`` is set, it is treated as the name of a matrix in
     the ``utils/matrices`` directory next to this module. Otherwise a name
     of the form ``match<M>mismatch<N>`` is derived from the absolute values
     of ``match`` and ``mismatch``; if no such built-in matrix exists, one
     is built with ``self._build_matrix_from_params``.

     Args:

         match (int): Match score. Required when ``self._matrix`` is None.

         mismatch (int): Mismatch score. Required when ``self._matrix`` is
             None.

         matrix: Not consulted by this method (``self._matrix`` is used
             instead); retained for interface compatibility.

     Returns:

         str: Path to the matrix file.

     Raises:

         RuntimeError: If ``self._matrix`` is set but names no file in the
             matrices directory.
     '''
     matrix_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'utils/matrices')
     builtins = ['blosum62', 'match3mismatch2', 'match1mismatch0']
     if self._matrix is not None:
         matrix_name = self._matrix
     else:
         matrix_name = 'match{}mismatch{}'.format(abs(match), abs(mismatch))
     if matrix_name.lower() in builtins:
         # return the lowercased name, matching the case-insensitive check
         # above and the lookup further below
         return os.path.join(matrix_dir, matrix_name.lower())
     builtin_names = [os.path.basename(f) for f in list_files(matrix_dir)]
     if self._matrix is not None:
         if self._matrix.lower() in builtin_names:
             return os.path.join(matrix_dir, self._matrix.lower())
         else:
             # report the name that was actually looked up (self._matrix),
             # not the unused ``matrix`` argument
             err = 'The supplied matrix name ({}) does not exist. '.format(self._matrix)
             err += 'Built-in matrices are: {}'.format(', '.join(builtins))
             raise RuntimeError(err)
     else:
         self._build_matrix_from_params(match, mismatch, os.path.join(matrix_dir, matrix_name))
         return os.path.join(matrix_dir, matrix_name)
Ejemplo n.º 8
0
def fastqc(input_directory, output_directory=None, threads=-1):
    '''
    Performs FASTQC analysis on raw NGS data.


    Args:

        input_directory (str): Path to the input directory, containing one
            or more FASTQ files (either gzip compressed or uncompressed).

        output_directory (str): Path to the output directory, where the FASTQC
            results will be deposited. If not provided, a directory named
            'fastqc_reports' will be created in the parent directory of
            ``input_directory``

        threads (int): Number of threads to be used (passed to the ``-t`` flag
            when running ``fastqc``). Default is -1, which uses all cores.


    Returns:

        str: path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'fastqc_reports')
    make_dir(output_directory)
    files = list_files(input_directory)
    if threads < 0:
        threads = cpu_count()
    # Build the command as an argument list and run it without a shell, so
    # that paths containing spaces or shell metacharacters reach fastqc
    # intact. The individual arguments are the same tokens the original
    # shell string produced.
    fastqc_cmd = ['fastqc',
                  '--noextract',
                  '-o={}'.format(output_directory),
                  '-t={}'.format(threads)]
    fastqc_cmd.extend(str(f) for f in files)
    p = Popen(fastqc_cmd, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    logger.debug(stdout)
    logger.debug(stderr)
    return output_directory
Ejemplo n.º 9
0
 def _get_matrix_file(self, match=None, mismatch=None, matrix=None):
     '''
     Locates (or builds) the scoring matrix file to be used.

     When ``self._matrix`` is set it names a matrix in the
     ``utils/matrices`` directory next to this module. When it is not set,
     the name ``match<M>mismatch<N>`` is formed from the absolute values of
     ``match`` and ``mismatch``, and the matrix is generated with
     ``self._build_matrix_from_params`` unless it is one of the hard-coded
     built-ins.

     Args:

         match (int): Match score; needed only if ``self._matrix`` is None.

         mismatch (int): Mismatch score; needed only if ``self._matrix`` is
             None.

         matrix: Unused here (the method reads ``self._matrix``); kept so
             existing callers continue to work.

     Returns:

         str: Path to the matrix file.

     Raises:

         RuntimeError: If ``self._matrix`` is set but no file with that
             (lowercased) name exists in the matrices directory.
     '''
     matrix_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'utils/matrices')
     builtins = ['blosum62', 'match3mismatch2', 'match1mismatch0']
     if self._matrix is not None:
         matrix_name = self._matrix
     else:
         matrix_name = 'match{}mismatch{}'.format(abs(match), abs(mismatch))
     if matrix_name.lower() in builtins:
         # the membership test is case-insensitive, so the returned path
         # must use the lowercased name as well
         return os.path.join(matrix_dir, matrix_name.lower())
     builtin_names = [os.path.basename(f) for f in list_files(matrix_dir)]
     if self._matrix is not None:
         if self._matrix.lower() in builtin_names:
             return os.path.join(matrix_dir, self._matrix.lower())
         else:
             # name the matrix that was actually requested via self._matrix
             # rather than the unused ``matrix`` argument
             err = 'The supplied matrix name ({}) does not exist. '.format(
                 self._matrix)
             err += 'Built-in matrices are: {}'.format(', '.join(builtins))
             raise RuntimeError(err)
     else:
         self._build_matrix_from_params(
             match, mismatch, os.path.join(matrix_dir, matrix_name))
         return os.path.join(matrix_dir, matrix_name)
Ejemplo n.º 10
0
def _strip_file_suffixes(name, suffixes):
    '''
    Removes each of ``suffixes``, in order, from the end of ``name``.

    Unlike ``str.rstrip``, which strips any trailing characters belonging to
    a character set, this only removes a suffix when ``name`` actually ends
    with it.
    '''
    for suffix in suffixes:
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    return name


def quality_trim(input_directory=None, output_directory=None,
        quality_cutoff=20, length_cutoff=50,
        quality_type='sanger', compress_output=True, file_pairs=None,
        singles_directory=None, nextseq=False, paired_reads=True,
        allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
            that do not follow the MiSeq naming scheme, you can group the paired
            read files yourself and pass them to ``file_pairs``.

        output_directory (str): Path to the output directory, into which quality-
            trimmed read files will be deposited. If not provided, a directory
            will be created in the parent directory of ``input_directory``.
            Required if using ``file_pairs`` instead of ``input_directory``.

        quality_cutoff (int): Quality score at which to truncate reads. Default
            is ``20``.

        length_cutoff (int): Reads will be discarded if, after quality trimming,
            the length is shorter than this cutoff. Default is ``50``.

        quality_type (str): Quality score type. Options are ``solexa``, ``illumina``,
            and ``sanger``. ``illumina`` is equivalent to Casava 1.3-1.7 and ``sanger`` is
            Casava >= 1.8. Default is ``sanger``.

        compress_output (bool): If ``True``, output files will be gzip compressed.
            Default is ``True``.

        file_pairs (list): If input files are paired-end reads that don't follow Illumina's
            MiSeq naming scheme, you can pass a list of lists/tuples, with each list/tuple
            containing a pair of read file paths.

        singles_directory (str): Path to singles output directory. If processing paired reads
            and one read of the pair passes quality/length filters and the other doesn't,
            the single passing read will be written to this file. Default is ``None``, which
            results in the single sequences being discarded and not written to file.

        nextseq (bool): Set to ``True`` if the sequencing data comes from a NextSeq run. The
            file naming scheme for NextSeq runs is different than MiSeq runs, and setting
            this option will allow NextSeq paired read files to be processed appropriately.
            Default is ``False``.

        paired_reads (bool): If ``True``, reads will be processed as paired reads. If ``False``,
            each read will be processed separately. It is not advisable to process paired
            reads with ``paired_reads`` set to ``False`` because if paired read files are
            processed separately and one read passes filters while the paired read doesn't,
            this may cause problems with downstream processes (like read merging).

        allow_5prime_trimming (bool): If ``True``, quality trimming will be performed
            on the 5' end of the reads as well as the 3' end. Default is ``False``.

        print_debug (bool): If ``True``, the sickle command and its stdout/stderr are
            printed in addition to being logged. Default is ``False``.


    Returns:

        str: Path to the output directory
    '''
    if input_directory is None and any([file_pairs is None, output_directory is None]):
        err = '\nERROR: Either an input_directory must be provided or '
        err += 'both file_pairs and an output_directory must be provided.\n'
        print(err)
        sys.exit(1)
    if file_pairs:
        files = file_pairs
    else:
        input_directory = os.path.normpath(input_directory)
        if output_directory is None:
            oparent = os.path.dirname(input_directory)
            output_directory = os.path.join(oparent, 'quality_trimmed')
        make_dir(output_directory)
        if paired_reads:
            files = list_files(input_directory)
            file_pairs = pair_files(files, nextseq)
            files = file_pairs.values()
        else:
            files = [[f] for f in list_files(input_directory)]
    for f in files:
        logger.info(f)
        if len(f) == 2:
            paired_end = True
        elif len(f) == 1:
            paired_end = False
        else:
            err = 'ERROR: Each batch of files must contain either 1 (single-end reads) or '
            err += '2 (paired-end reads) files. This batch contains {} files:\n{}'.format(
                len(f), '\n'.join(f))
            # err2 must be initialised with ``=``; starting with ``+=`` raised
            # UnboundLocalError instead of reporting the malformed batch
            err2 = 'If you have paired-end reads that do not follow the Illumina naming scheme, '
            err2 += 'you can pass pairs of filenames (a list of lists/tuples) with the <file_pairs> option. '
            err2 += 'If using <file_pairs>, the output directory must also be provided.'
            logger.info(err)
            logger.info(err2)
            continue
        # sorted() rather than .sort(): batches may be tuples, and the
        # caller's sequence should not be mutated
        f = sorted(f)
        # set basic sickle cmd options
        # The command is built as an argument list and run without a shell so
        # file paths containing spaces or shell metacharacters are passed to
        # sickle unchanged.
        sickle = ['sickle', 'pe' if paired_end else 'se']
        sickle += ['-t', str(quality_type)]
        sickle += ['-l', str(length_cutoff)]
        sickle += ['-q', str(quality_cutoff)]
        if compress_output:
            sickle.append('-g')
        if not allow_5prime_trimming:
            sickle.append('-x')
        # compute input/output filenames, add to sickle cmd
        sickle += ['-f', str(f[0])]
        o1_basename = _strip_file_suffixes(os.path.basename(f[0]), ['.gz'])
        if compress_output:
            o1_basename += '.gz'
        sickle += ['-o', os.path.join(output_directory, o1_basename)]
        if paired_end:
            sickle += ['-r', str(f[1])]
            o2_basename = _strip_file_suffixes(os.path.basename(f[1]), ['.gz'])
            if compress_output:
                o2_basename += '.gz'
            sickle += ['-p', os.path.join(output_directory, o2_basename)]
        # compute singles output filename, add to sickle cmd
        if paired_end:
            if singles_directory is not None:
                read_suffixes = ['.gz', '.fastq', '.fq']
                sfilename = '{}_{}_singles.fastq'.format(
                    _strip_file_suffixes(o1_basename, read_suffixes),
                    _strip_file_suffixes(o2_basename, read_suffixes))
                if compress_output:
                    sfilename += '.gz'
                sickle += ['-s', os.path.join(singles_directory, sfilename)]
            else:
                sickle += ['-s', '/dev/null']
        if print_debug:
            print(' '.join(sickle))
        # run sickle
        p = Popen(sickle, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
        if print_debug:
            print(stdout)
            print('')
            print(stderr)
            print('')
    return output_directory
Ejemplo n.º 11
0
def adapter_trim(input_directory, output_directory=None,
        adapter_5prime=None, adapter_3prime=None,
        adapter_5prime_anchored=None, adapter_3prime_anchored=None,
        adapter_both=None, compress_output=True):
    '''
    Trims adapters with cutadapt.

    Args:

        input_directory (str): Path to a directory of FASTQ files
            to be adapter trimmed. Required.

        output_directory (str): Path to the output directory. If
            not provided, a directory will be created in the parent
            directory of ``input_directory``.

        adapter_5prime (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 5' end of reads.

        adapter_3prime (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 3' end of reads.

        adapter_5prime_anchored (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 5' end of reads. More strictly
            requires the read to be anchored to the 5' end of the read
            than when using ``adapter_5prime``.

        adapter_3prime_anchored (str): Path to a FASTA-formatted file of
            adapters to be trimmed from the 3' end of reads. More strictly
            requires the read to be anchored to the 3' end of the read
            than when using ``adapter_3prime``.

        adapter_both (str): Path to a FASTA-formatted file of adapters
            that will be trimmed from either end of the reads.

        compress_output (bool): If ``True``, output files will be gzip
            compressed. Default is ``True``.

    Returns:

        str: Path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'adapter_trimmed')
    make_dir(output_directory)
    files = list_files(input_directory)
    # parse adapter FASTA files, compile adapter option list
    # (a flat list of cutadapt arguments: flag, sequence, flag, sequence, ...)
    adapters = []
    opts = ['-g', '-a', '-b']
    adapt_files = [adapter_5prime, adapter_3prime, adapter_both]
    for o, a in zip(opts, adapt_files):
        if a is None:
            continue
        # read inside a context manager so the FASTA handle is closed
        with open(a, 'r') as handle:
            adapts = [str(s.seq) for s in SeqIO.parse(handle, 'fasta')]
        for adapt in adapts:
            adapters += [o, adapt]
    if adapter_5prime_anchored is not None:
        with open(adapter_5prime_anchored, 'r') as handle:
            adapts = ['^{}'.format(str(s.seq)) for s in SeqIO.parse(handle, 'fasta')]
        for adapt in adapts:
            adapters += ['-g', adapt]
    if adapter_3prime_anchored is not None:
        with open(adapter_3prime_anchored, 'r') as handle:
            adapts = ['{}$'.format(str(s.seq)) for s in SeqIO.parse(handle, 'fasta')]
        for adapt in adapts:
            adapters += ['-a', adapt]
    # process input files
    for ifile in files:
        oname = os.path.basename(ifile)
        # remove a real '.gz' suffix only; str.rstrip('.gz') would also eat
        # any trailing '.', 'g' or 'z' characters of the name itself
        if oname.endswith('.gz'):
            oname = oname[:-len('.gz')]
        if compress_output:
            oname += '.gz'
        ofile = os.path.join(output_directory, oname)
        # set up cutadapt command
        # Passed as an argument list (no shell) so that file paths and the
        # '^' / '$' anchor characters are never interpreted by a shell.
        cutadapt = ['cutadapt', '-o', ofile] + adapters + [str(ifile)]
        # run cutadapt
        p = Popen(cutadapt, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
    return output_directory