def copy_from_basemount(basemount_directory, destination_directory):
    # collect all FASTQ files under the mounted BaseMount directory
    make_dir(os.path.abspath(destination_directory))
    fastqs = []
    for (path, dirs, files) in os.walk(basemount_directory):
        for f in files:
            if f.endswith('.fastq.gz'):
                fastqs.append(os.path.join(path, f))
    print('')
    print('')
    print('========================================')
    print('Copying files from BaseMount')
    print('========================================')
    print('')
    print('Found {0} FASTQ files.'.format(len(fastqs)))
    print('')
    print('Copying to {}:'.format(destination_directory))
    start = datetime.now()
    progress_bar(0, len(fastqs), start_time=start,
                 completion_string="It's done! Good job.")
    for i, fastq in enumerate(fastqs):
        dest = os.path.join(destination_directory, os.path.basename(fastq))
        copyfile(fastq, dest)
        progress_bar(i + 1, len(fastqs), start_time=start)
    print('\n')
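# Usage sketch (not part of the original module; all paths are hypothetical):
# copy every FASTQ file from a mounted BaseMount project into a local raw-data
# directory. Assumes BaseMount is already mounted at ~/basemount.
def _example_copy_from_basemount():
    copy_from_basemount(os.path.expanduser('~/basemount/Projects/MyProject/Samples'),
                        '/data/run01/raw_fastqs')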
def split_file(json, args):
    split_files = []
    temp_dir = args.temp if args.temp is not None else os.path.join(args.mongo_input_dir, 'temp')
    make_dir(temp_dir)
    with open(json) as f:
        # group the file iterator into fixed-size chunks of <args.split_file_lines> lines
        # (itertools.zip_longest; izip_longest in Python 2)
        for chunk in itertools.zip_longest(*[f] * args.split_file_lines):
            chunk = [c for c in chunk if c is not None]
            fname = os.path.join(temp_dir, str(uuid.uuid4()) + '.json')
            with open(fname, 'w') as out:
                out.write(''.join(chunk))
            split_files.append(fname)
    return split_files
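# Usage sketch (not part of the original module; paths and values are hypothetical):
# split a large line-delimited JSON file into 1000-line chunks before import.
# The Namespace simply mimics the argparse attributes that split_file reads.
def _example_split_file():
    from argparse import Namespace
    args = Namespace(temp=None,
                     mongo_input_dir='/data/mongo_input',
                     split_file_lines=1000)
    return split_file('/data/mongo_input/abstar_output.json', args)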
def download(download_directory, project_id=None, project_name=None):
    '''
    Downloads sequencing data from BaseSpace (Illumina's cloud storage platform).

    Before accessing BaseSpace through the AbStar API, you need to set up a credentials file:

    1. You need a BaseSpace access token. The easiest way to do this is to set up a
       BaseSpace developer account following
       `these instructions <https://support.basespace.illumina.com/knowledgebase/articles/403618-python-run-downloader>`_

    2. Make a BaseSpace credentials file using your developer credentials::

        $ make_basespace_credfile

       and follow the instructions.

    Examples:

        If you know the name of the project you'd like to download::

            from abstar.utils import basespace
            basespace.download('/path/to/download_directory', project_name='MyProject')

        If you know the ID of the project you'd like to download::

            basespace.download('/path/to/download_directory', project_id='ABC123')

        If neither ``project_id`` nor ``project_name`` is provided, a list of your
        available BaseSpace projects will be displayed and you can select a project
        from that list::

            basespace.download('/path/to/download_directory')

    Args:

        download_directory (str): Directory into which the raw sequence files should
            be downloaded. If the directory does not exist, it will be created.

        project_id (str): ID of the project to be downloaded.

        project_name (str): Name of the project to be downloaded.

    Returns:

        int: The number of sequence files downloaded.
    '''
    make_dir(download_directory)
    bs = BaseSpace(project_id, project_name)
    return bs.download(download_directory)
def make_directories(args):
    output_dir = os.path.dirname(args.output)
    make_dir(output_dir)
    make_dir(args.temp_dir)
    if args.raw_sequence_dir is not None:
        make_dir(args.raw_sequence_dir)
    if args.alignment_pixel_dir is not None:
        make_dir(args.alignment_pixel_dir)
def fastqc(input_directory, output_directory=None, threads=-1):
    '''
    Performs FASTQC analysis on raw NGS data.

    Args:

        input_directory (str): Path to the input directory, containing one
            or more FASTQ files (either gzip compressed or uncompressed).

        output_directory (str): Path to the output directory, where the FASTQC
            results will be deposited. If not provided, a directory named
            'fastqc_reports' will be created in the parent directory of
            ``input_directory``.

        threads (int): Number of threads to be used (passed to the ``-t`` flag
            when running ``fastqc``). Default is -1, which uses all cores.

    Returns:

        str: path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'fastqc_reports')
    make_dir(output_directory)
    files = list_files(input_directory)
    if threads < 0:
        threads = cpu_count()
    fastqc_cmd = 'fastqc --noextract -o={} -t={} {}'.format(output_directory, threads, ' '.join(files))
    p = Popen(fastqc_cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    logger.debug(stdout)
    logger.debug(stderr)
    return output_directory
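# Usage sketch (not part of the original module; path is hypothetical): run FastQC
# on a directory of raw FASTQ files with 8 threads. Because no output_directory is
# given, reports land in a 'fastqc_reports' directory next to the input directory.
def _example_fastqc():
    return fastqc('/data/run01/raw_fastqs', threads=8)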
def quality_trim(input_directory=None, output_directory=None,
                 quality_cutoff=20, length_cutoff=50,
                 quality_type='sanger', compress_output=True, file_pairs=None,
                 singles_directory=None, nextseq=False, paired_reads=True,
                 allow_5prime_trimming=False, print_debug=False):
    '''
    Performs quality trimming with sickle.

    Args:

        input_directory (str): Path to a directory of files to be quality
            trimmed. If the directory contains paired reads, they should
            follow the Illumina MiSeq naming scheme. If you have paired reads
            that do not follow the MiSeq naming scheme, you can group the paired
            read files yourself and pass them to ``--file-pairs``.

        output_directory (str): Path to the output directory, into which quality-
            trimmed read files will be deposited. If not provided, a directory
            will be created in the parent directory of ``input_directory``.
            Required if using ``file_pairs`` instead of ``input_directory``.

        quality_cutoff (int): Quality score at which to truncate reads. Default
            is ``20``.

        length_cutoff (int): Reads will be discarded if, after quality trimming,
            the length is shorter than this cutoff. Default is ``50``.

        quality_type (str): Quality score type. Options are ``solexa``, ``illumina``,
            and ``sanger``. ``illumina`` is equivalent to Casava 1.3-1.7 and
            ``sanger`` is Casava >= 1.8. Default is ``sanger``.

        compress_output (bool): If ``True``, output files will be gzip compressed.
            Default is ``True``.

        file_pairs (list): If input files are paired-end reads that don't follow
            Illumina's MiSeq naming scheme, you can pass a list of lists/tuples,
            with each list/tuple containing a pair of read file paths.

        singles_directory (str): Path to the singles output directory. If processing
            paired reads and one read of the pair passes quality/length filters
            while the other doesn't, the single passing read will be written to
            this directory. Default is ``None``, which results in the single
            sequences being discarded and not written to file.

        nextseq (bool): Set to ``True`` if the sequencing data comes from a NextSeq
            run. The file naming scheme for NextSeq runs is different than for
            MiSeq runs, and setting this option allows NextSeq paired read files
            to be processed appropriately. Default is ``False``.

        paired_reads (bool): If ``True``, reads will be processed as paired reads.
            If ``False``, each read will be processed separately. It is not
            advisable to process paired reads with ``paired_reads`` set to
            ``False``, because if paired read files are processed separately and
            one read passes filters while its mate doesn't, this may cause
            problems with downstream processes (like read merging).

        allow_5prime_trimming (bool): If ``True``, quality trimming will be performed
            on the 5' end of the reads as well as the 3' end. Default is ``False``.

    Returns:

        str: Path to the output directory
    '''
    if input_directory is None and any([file_pairs is None, output_directory is None]):
        err = '\nERROR: Either an input_directory must be provided or '
        err += 'both file_pairs and an output_directory must be provided.\n'
        print(err)
        sys.exit(1)
    if file_pairs:
        files = file_pairs
    else:
        input_directory = os.path.normpath(input_directory)
        if output_directory is None:
            oparent = os.path.dirname(input_directory)
            output_directory = os.path.join(oparent, 'quality_trimmed')
        make_dir(output_directory)
        if paired_reads:
            files = list_files(input_directory)
            file_pairs = pair_files(files, nextseq)
            files = file_pairs.values()
        else:
            files = [[f] for f in list_files(input_directory)]
    for f in files:
        logger.info(f)
        if len(f) == 2:
            paired_end = True
        elif len(f) == 1:
            paired_end = False
        else:
            err = 'ERROR: Each batch of files must contain either 1 (single-end reads) or '
            err += '2 (paired-end reads) files. This batch contains {} files:\n{}'.format(
                len(f), '\n'.join(f))
            err2 = 'If you have paired-end reads that do not follow the Illumina naming scheme, '
            err2 += 'you can pass pairs of filenames (a list of lists/tuples) with the <file_pairs> option. '
            err2 += 'If using <file_pairs>, the output directory must also be provided.'
            logger.info(err)
            logger.info(err2)
            continue
        f.sort()
        # set basic sickle cmd options
        sickle = 'sickle pe' if paired_end else 'sickle se'
        sickle += ' -t {}'.format(quality_type)
        sickle += ' -l {}'.format(length_cutoff)
        sickle += ' -q {}'.format(quality_cutoff)
        if compress_output:
            sickle += ' -g'
        if not allow_5prime_trimming:
            sickle += ' -x'
        # compute input/output filenames, add to sickle cmd
        sickle += ' -f {}'.format(f[0])
        o1_basename = os.path.basename(f[0]).rstrip('.gz')
        if compress_output:
            o1_basename += '.gz'
        sickle += ' -o {}'.format(os.path.join(output_directory, o1_basename))
        if paired_end:
            sickle += ' -r {}'.format(f[1])
            o2_basename = os.path.basename(f[1]).rstrip('.gz')
            if compress_output:
                o2_basename += '.gz'
            sickle += ' -p {}'.format(os.path.join(output_directory, o2_basename))
        # compute singles output filename, add to sickle cmd
        if paired_end:
            if singles_directory is not None:
                sfilename = '{}_{}_singles.fastq'.format(
                    o1_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'),
                    o2_basename.rstrip('.gz').rstrip('.fastq').rstrip('.fq'))
                if compress_output:
                    sfilename += '.gz'
                sickle += ' -s {}'.format(os.path.join(singles_directory, sfilename))
            else:
                sickle += ' -s /dev/null'
        if print_debug:
            print(sickle)
        # run sickle
        p = Popen(sickle, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
        if print_debug:
            print(stdout)
            print('')
            print(stderr)
            print('')
    return output_directory
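# Usage sketch (not part of the original module; paths are hypothetical): quality
# trim paired MiSeq reads, keeping orphaned reads (those whose mate failed the
# filters) in a separate singles directory instead of discarding them.
def _example_quality_trim():
    return quality_trim(input_directory='/data/run01/raw_fastqs',
                        singles_directory='/data/run01/singles',
                        quality_cutoff=20,
                        length_cutoff=50)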
def adapter_trim(input_directory, output_directory=None,
                 adapter_5prime=None, adapter_3prime=None,
                 adapter_5prime_anchored=None, adapter_3prime_anchored=None,
                 adapter_both=None, compress_output=True):
    '''
    Trims adapters with cutadapt.

    Args:

        input_directory (str): Path to a directory of FASTQ files to be adapter
            trimmed. Required.

        output_directory (str): Path to the output directory. If not provided, a
            directory will be created in the parent directory of ``input_directory``.

        adapter_5prime (str): Path to a FASTA-formatted file of adapters to be
            trimmed from the 5' end of reads.

        adapter_3prime (str): Path to a FASTA-formatted file of adapters to be
            trimmed from the 3' end of reads.

        adapter_5prime_anchored (str): Path to a FASTA-formatted file of adapters to
            be trimmed from the 5' end of reads. More strictly requires the adapter
            to be anchored to the 5' end of the read than when using ``adapter_5prime``.

        adapter_3prime_anchored (str): Path to a FASTA-formatted file of adapters to
            be trimmed from the 3' end of reads. More strictly requires the adapter
            to be anchored to the 3' end of the read than when using ``adapter_3prime``.

        adapter_both (str): Path to a FASTA-formatted file of adapters that will be
            trimmed from either end of the reads.

        compress_output (bool): If ``True``, output files will be gzip compressed.
            Default is ``True``.

    Returns:

        str: Path to the output directory
    '''
    input_directory = os.path.normpath(input_directory)
    if output_directory is None:
        oparent = os.path.dirname(input_directory)
        output_directory = os.path.join(oparent, 'adapter_trimmed')
    make_dir(output_directory)
    files = list_files(input_directory)
    # parse adapter FASTA files, compile adapter option list
    adapters = []
    opts = ['-g', '-a', '-b']
    adapt_files = [adapter_5prime, adapter_3prime, adapter_both]
    for o, a in zip(opts, adapt_files):
        if a is None:
            continue
        adapts = [str(s.seq) for s in SeqIO.parse(open(a, 'r'), 'fasta')]
        adapters += [' '.join(z) for z in zip([o] * len(adapts), adapts)]
    if adapter_5prime_anchored is not None:
        adapts = ['^{}'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_5prime_anchored, 'r'), 'fasta')]
        adapters += ['-g {}'.format(a) for a in adapts]
    if adapter_3prime_anchored is not None:
        adapts = ['{}$'.format(str(s.seq)) for s in SeqIO.parse(open(adapter_3prime_anchored, 'r'), 'fasta')]
        adapters += ['-a {}'.format(a) for a in adapts]
    # process input files
    for ifile in files:
        oname = os.path.basename(ifile).rstrip('.gz')
        if compress_output:
            oname += '.gz'
        ofile = os.path.join(output_directory, oname)
        # set up cutadapt command
        adapter_string = ' '.join(adapters)
        cutadapt = 'cutadapt -o {} {} {}'.format(ofile, adapter_string, ifile)
        # run cutadapt
        p = Popen(cutadapt, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        logger.debug(stdout)
        logger.debug(stderr)
    return output_directory
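# Usage sketch (not part of the original module; paths are hypothetical): trim 5'
# and 3' adapters (each supplied as a FASTA file of adapter sequences) from all
# quality-trimmed FASTQ files in a directory.
def _example_adapter_trim():
    return adapter_trim('/data/run01/quality_trimmed',
                        adapter_5prime='/data/adapters/5prime.fasta',
                        adapter_3prime='/data/adapters/3prime.fasta')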
def validate_args(args):
    for d in [args.output, args.temp]:
        if d is not None:
            make_dir(d)