def peek(sra, outdir=None):
    """Return the fastq files that will be extracted from an SRA archive.

    The first 1000 spots of `sra` are dumped with ``fastq-dump
    --split-files --gzip`` and the resulting files are inspected.

    Parameters
    ----------
    sra : string
        The archive to inspect. It is interpolated verbatim into the
        ``fastq-dump`` command line.
    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards. In that case the returned file names refer to files
        that no longer exist.

    Returns
    -------
    files : list
        A sorted list of fastq formatted files that are contained in the
        archive.
    format : string
        The quality score format in the :term:`fastq` formatted files,
        as returned by ``Fastq.guessFormat`` for the first file.
    datatype
        The result of ``Fastq.guessDataType`` for the first file.

    Raises
    ------
    AssertionError
        If exactly two files are produced and they are not named
        ``*_1.fastq.gz`` and ``*_2.fastq.gz``.
    IndexError
        If no ``*.fastq.gz`` file is found in the working directory.
    """
    workdir = tempfile.mkdtemp() if outdir is None else outdir

    # The try/finally guarantees that a temporary directory is removed
    # even if fastq-dump or the format detection fails.
    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single read,
        #                   fastq-dump will split. There might be a joining
        #                   sequence. The output would thus be:
        #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
        #                   prefix_3.fastq.gz
        #                   You want files 1 and 3.
        statement = ("fastq-dump --split-files --gzip -X 1000 "
                     "--outdir %(workdir)s %(sra)s")
        E.run(statement % {"workdir": workdir, "sra": sra})

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith("_1.fastq.gz") and \
                ff[1].endswith("_2.fastq.gz"), \
                "unexpected fastq-dump output: %s" % ", ".join(ff)
        elif len(f) == 3:
            if ff[2].endswith("_3.fastq.gz"):
                # reads 1 and 3 are the mates, read 2 is the joining
                # sequence.
                wanted = ("_1.fastq.gz", "_3.fastq.gz")
            else:
                # prefix.fastq.gz holds the unpaired reads; the mates
                # are in files 1 and 2.
                wanted = ("_1.fastq.gz", "_2.fastq.gz")
            # Filter the already sorted list so that the order of the
            # result is deterministic.
            f = [x for x in f if x.endswith(wanted)]
        # A single file (prefix.fastq.gz) needs no further processing.

        # check format of fastqs in .sra; each detection gets a fresh
        # handle, which is closed as soon as it has been read.
        with IOTools.openFile(f[0], "r") as inf:
            fastq_format = Fastq.guessFormat(inf, raises=False)
        with IOTools.openFile(f[0], "r") as inf:
            fastq_datatype = Fastq.guessDataType(inf, raises=True)
    finally:
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format, fastq_datatype
def peek(sra, outdir=None):
    """Return the fastq files that will be extracted from an SRA archive.

    The first 1000 spots of `sra` are dumped with ``fastq-dump
    --split-files --gzip`` and the resulting files are inspected.

    Parameters
    ----------
    sra : string
        The archive to inspect. It is interpolated verbatim into the
        ``fastq-dump`` command line.
    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards. In that case the returned file names refer to files
        that no longer exist.

    Returns
    -------
    files : list
        A sorted list of fastq formatted files that are contained in the
        archive.
    format : string
        The quality score format in the :term:`fastq` formatted files,
        as returned by ``Fastq.guessFormat`` for the first file.
    datatype
        The result of ``Fastq.guessDataType`` for the first file.

    Raises
    ------
    AssertionError
        If exactly two files are produced and they are not named
        ``*_1.fastq.gz`` and ``*_2.fastq.gz``.
    IndexError
        If no ``*.fastq.gz`` file is found in the working directory.
    """
    workdir = tempfile.mkdtemp() if outdir is None else outdir

    # The try/finally guarantees that a temporary directory is removed
    # even if fastq-dump or the format detection fails.
    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If file contains paired end data:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
        #    *special case: if paired reads are stored in a single read,
        #                   fastq-dump will split. There might be a joining
        #                   sequence. The output would thus be:
        #                   prefix_1.fastq.gz, prefix_2.fastq.gz,
        #                   prefix_3.fastq.gz
        #                   You want files 1 and 3.
        statement = ("fastq-dump --split-files --gzip -X 1000 "
                     "--outdir %(workdir)s %(sra)s")
        E.run(statement % {"workdir": workdir, "sra": sra})

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith("_1.fastq.gz") and \
                ff[1].endswith("_2.fastq.gz"), \
                "unexpected fastq-dump output: %s" % ", ".join(ff)
        elif len(f) == 3:
            if ff[2].endswith("_3.fastq.gz"):
                # reads 1 and 3 are the mates, read 2 is the joining
                # sequence.
                wanted = ("_1.fastq.gz", "_3.fastq.gz")
            else:
                # prefix.fastq.gz holds the unpaired reads; the mates
                # are in files 1 and 2.
                wanted = ("_1.fastq.gz", "_2.fastq.gz")
            # Filter the already sorted list so that the order of the
            # result is deterministic.
            f = [x for x in f if x.endswith(wanted)]
        # A single file (prefix.fastq.gz) needs no further processing.

        # check format of fastqs in .sra; each detection gets a fresh
        # handle, which is closed as soon as it has been read.
        with IOTools.openFile(f[0], "r") as inf:
            fastq_format = Fastq.guessFormat(inf, raises=False)
        with IOTools.openFile(f[0], "r") as inf:
            fastq_datatype = Fastq.guessDataType(inf, raises=True)
    finally:
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format, fastq_datatype