def test_filehandle(self):
    """An already-open file handle is yielded by `open_file` as-is."""
    with tempfile.TemporaryFile('r') as original_handle:
        with open_file(original_handle) as yielded_handle:
            # Identity, not equality: the very same object must come back.
            self.assertTrue(original_handle is yielded_handle)
        # Leaving `open_file` must not close a handle it was handed.
        self.assertFalse(original_handle.closed)
def test_file_closed(self):
    """A handle that `open_file` opened from a path is closed on exit."""
    # Scope the temporary file with `with` so its own handle is released
    # deterministically; it used to be left open for the garbage collector.
    with tempfile.NamedTemporaryFile('r') as tmp:
        with open_file(tmp.name) as fh:
            pass
        self.assertTrue(fh.closed)
def test_hdf5IO_open(self):
    """A path to an HDF5 file is yielded by `open_file` as an `h5py.File`."""
    # delete=False: the path has to outlive the temporary handle so that
    # h5py and `open_file` can reopen it by name.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        name = tmp.name

    try:
        # Replace the empty placeholder with a valid (empty) HDF5 file.
        h5py.File(name, 'w').close()

        with open_file(name) as fh_inner:
            self.assertTrue(isinstance(fh_inner, h5py.File))
    finally:
        # Remove the file even when the assertion or `open_file` raises;
        # previously a failure left the temporary file behind.
        os.remove(name)
def test_file_closed_harder(self):
    """File gets closed in decorator, even if exceptions happen."""
    # Scope the temporary file so its own handle is always released; it
    # used to be left open until garbage collection.
    with tempfile.NamedTemporaryFile('r') as tmp:
        try:
            with open_file(tmp.name) as fh:
                raise TypeError
        except TypeError:
            # The exception propagated out of the context manager; the
            # handle it opened must be closed by now.
            self.assertTrue(fh.closed)
        else:
            # If we're here, no exceptions have been raised inside the
            # try clause, so the context manager swallowed them. No
            # good. Report it as a test failure rather than an error.
            self.fail("`open_file` didn't propagate exceptions")
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocessed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath

    Returns
    -------
    tuple
        ``(study_accession, submission_accession)``. Both are None when
        `send` is False.

    Raises
    ------
    ValueError
        If the current INSDC status is "submitting" or "success", or if the
        prep template's investigation type is neither an official nor a
        user-defined term of the ENA ontology
    IOError
        If the output directory already exists
    ComputeError
        If `send` is True and either returned accession is None

    Notes
    -----
    If fastq_dir_fp is passed, it must not contain any empty files, or
    gzipped empty files
    """
    # NOTE: the docstring above keeps a single string-format placeholder for
    # the `action` type; presumably it is filled in elsewhere in the module,
    # so no other percent sign may be added to it.
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ("submitting", "success"):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        # NOTE(review): if anything below raises before the final status
        # update, the status stays "submitting" and the guard above rejects
        # a retry -- confirm that callers reset it.
        preprocessed_data.update_insdc_status("submitting")

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id("ENA", "ontology"))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = "Other"
        new_investigation_type = current_type
    else:
        # This should never happen. The offending value is interpolated into
        # the message (it used to be emitted with a literal placeholder).
        raise ValueError(
            "Unrecognized investigation type: '%s'. This term "
            "is neither one of the official terms nor one of the "
            "user-defined terms in the ENA ontology" % current_type
        )

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will put
        # all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [path for _, path, ftype in preprocessed_data.get_filepaths()
                 if ftype == "preprocessed_demux"][0]

        # Keep track of which files were actually in the demux file so that we
        # can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh,
                                                      list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                wrote_sequences = False
                with gzopen(sample_fp, "w") as fh:
                    for record in iterator:
                        fh.write(record)
                        wrote_sequences = True

                # Do not leave a gzipped file with no records behind
                if not wrote_sequences:
                    remove(sample_fp)

    output_dir = fastq_dir_fp + "_submission"

    samp_fp = join(fastq_dir_fp, "sample_metadata.txt")
    prep_fp = join(fastq_dir_fp, "prep_metadata.txt")

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp("study.xml")
    # NOTE: from here on `sample_fp` names the sample XML, not a FASTQ file
    sample_fp = get_output_fp("sample.xml")
    experiment_fp = get_output_fp("experiment.xml")
    run_fp = get_output_fp("run.xml")
    submission_fp = get_output_fp("submission.xml")

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError("The output folder already exists: %s" % output_dir)

    with open(samp_fp, "U") as st, open(prep_fp, "U") as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str,
            study.title,
            study.info["study_abstract"],
            investigation_type,
            st,
            pt,
            fastq_dir_fp,
            new_investigation_type=new_investigation_type,
            pmids=study.pmids,
        )

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp,
                                   submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status("failed")

            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status("success", study_accession,
                                                  submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
def test_hdf5IO(self):
    """An already-open `h5py.File` is yielded by `open_file` as-is."""
    # The mode is spelled out because h5py's implicit default became
    # read-only in h5py 3.0, which cannot create this new in-memory file
    # (core driver with backing_store=False; see the h5py File docs).
    f = h5py.File('test', mode='a', driver='core', backing_store=False)
    try:
        with open_file(f) as fh:
            self.assertTrue(fh is f)
    finally:
        # Release the HDF5 handle; it used to be left open.
        f.close()
def test_BytesIO(self):
    """An in-memory `BytesIO` buffer is yielded by `open_file` as-is."""
    buffer = BytesIO(b"File contents")
    with open_file(buffer) as yielded:
        # Identity check: the same object, not a wrapper or a copy.
        same_object = yielded is buffer
        self.assertTrue(same_object)
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocessed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath

    Returns
    -------
    tuple
        ``(study_accession, submission_accession)``. Both are None when
        `send` is False.

    Raises
    ------
    ValueError
        If the current INSDC status is 'submitting' or 'success', or if the
        prep template's investigation type is neither an official nor a
        user-defined term of the ENA ontology
    IOError
        If the output directory already exists
    ComputeError
        If `send` is True and either returned accession is None
    """
    # NOTE: the docstring above keeps a single string-format placeholder for
    # the `action` type; presumably it is filled in elsewhere in the module,
    # so no other percent sign may be added to it.
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ('submitting', 'success'):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        # NOTE(review): if anything below raises before the final status
        # update, the status stays 'submitting' and the guard above rejects
        # a retry -- confirm that callers reset it.
        preprocessed_data.update_insdc_status('submitting')

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = 'Other'
        new_investigation_type = current_type
    else:
        # This should never happen. The offending value is interpolated into
        # the message (it used to be emitted with a literal placeholder).
        raise ValueError("Unrecognized investigation type: '%s'. This term "
                         "is neither one of the official terms nor one of the "
                         "user-defined terms in the ENA ontology"
                         % current_type)

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will put
        # all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [
            path for _, path, ftype in preprocessed_data.get_filepaths()
            if ftype == 'preprocessed_demux'
        ][0]

        # Keep track of which files were actually in the demux file so that we
        # can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh,
                                                      list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                with gzopen(sample_fp, 'w') as fh:
                    for record in iterator:
                        fh.write(record)

    output_dir = fastq_dir_fp + '_submission'

    samp_fp = join(fastq_dir_fp, 'sample_metadata.txt')
    prep_fp = join(fastq_dir_fp, 'prep_metadata.txt')

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp('study.xml')
    # NOTE: from here on `sample_fp` names the sample XML, not a FASTQ file
    sample_fp = get_output_fp('sample.xml')
    experiment_fp = get_output_fp('experiment.xml')
    run_fp = get_output_fp('run.xml')
    submission_fp = get_output_fp('submission.xml')

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError('The output folder already exists: %s' % output_dir)

    with open(samp_fp, 'U') as st, open(prep_fp, 'U') as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str, study.title,
            study.info['study_abstract'], investigation_type, st, pt,
            fastq_dir_fp, new_investigation_type=new_investigation_type,
            pmids=study.pmids)

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp,
                                   submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status('failed')

            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status('success', study_accession,
                                                  submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
    """Generates demultiplexed fastq

    Parameters
    ----------
    rewrite_fastq : bool, optional
        If true, it forces the rewrite of the fastq files
    mtime : float, optional
        The time to use when creating the gz files. If None, the current time
        will be used by gzip.GzipFile. This is useful for testing.

    Returns
    -------
    demux_samples
        Set of successfully demultiplexed samples

    Notes
    -----
    - As a performance feature, this method will check if self.full_ebi_dir
    already exists and, if it does (and rewrite_fastq is False), the script
    will assume that in a previous execution this step was performed
    correctly and will simply read the file names from self.full_ebi_dir
    - When the object is created (init), samples, samples_prep and
    sample_demux_fps hold values for all available samples in the database.
    Here some of those values will be deleted (del's, within the loops) for
    those cases where the fastq.gz files weren't written or exist. This is
    an indication that they had no sequences and this kind of files are not
    accepted in EBI

    Raises
    ------
    EBISubmissionError
        - The demux file couldn't be read
        - All samples are removed
    """
    ar = self.artifact

    dir_not_exists = not isdir(self.full_ebi_dir)
    if dir_not_exists or rewrite_fastq:
        # Create the directory only when it is missing: `makedirs` raises
        # when the target already exists, which made `rewrite_fastq=True`
        # fail on any directory left by a previous run.
        if dir_not_exists:
            makedirs(self.full_ebi_dir)

        # An artifact will hold only one file of type `preprocessed_demux`
        # Thus, we only use the first one (the only one present)
        demux = [
            path for _, path, ftype in ar.filepaths
            if ftype == 'preprocessed_demux'
        ][0]

        demux_samples = set()
        with open_file(demux) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)
            for s, i in to_per_sample_ascii(demux_fh,
                                            self.prep_template.keys()):
                sample_fp = self.sample_demux_fps[s]
                wrote_sequences = False
                with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                    for record in i:
                        fh.write(record)
                        wrote_sequences = True

                if wrote_sequences:
                    demux_samples.add(s)
                else:
                    # No records for this sample: forget it and remove the
                    # empty gz file that was just created
                    del self.samples[s]
                    del self.samples_prep[s]
                    del self.sample_demux_fps[s]
                    remove(sample_fp)
    else:
        # Reuse a previous run: the samples are the `<name>.fastq.gz` files
        # found directly inside self.full_ebi_dir
        demux_samples = set()
        extension = '.fastq.gz'
        extension_len = len(extension)
        for f in listdir(self.full_ebi_dir):
            fpath = join(self.full_ebi_dir, f)
            if isfile(fpath) and f.endswith(extension):
                demux_samples.add(f[:-extension_len])

        # Drop every known sample that has no fastq.gz file on disk
        missing_samples = set(self.samples.keys()).difference(
            set(demux_samples))
        for ms in missing_samples:
            del self.samples[ms]
            del self.samples_prep[ms]
            del self.sample_demux_fps[ms]

    if not demux_samples:
        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)
    return demux_samples
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
    """Generates demultiplexed fastq

    Parameters
    ----------
    rewrite_fastq : bool, optional
        If true, it forces the rewrite of the fastq files
    mtime : float, optional
        The time to use when creating the gz files. If None, the current time
        will be used by gzip.GzipFile. This is useful for testing.

    Returns
    -------
    demux_samples
        Set of successfully demultiplexed samples

    Notes
    -----
    - As a performance feature, this method will check if self.full_ebi_dir
    already exists and, if it does (and rewrite_fastq is False), the script
    will assume that in a previous execution this step was performed
    correctly and will simply read the file names from self.full_ebi_dir
    - When the object is created (init), samples, samples_prep and
    sample_demux_fps hold values for all available samples in the database.
    Here some of those values will be deleted (del's, within the loops) for
    those cases where the fastq.gz files weren't written or exist. This is
    an indication that they had no sequences and this kind of files are not
    accepted in EBI

    Raises
    ------
    EBISubmissionError
        - The demux file couldn't be read
        - All samples are removed
    """
    ar = self.artifact

    dir_not_exists = not isdir(self.full_ebi_dir)
    if dir_not_exists or rewrite_fastq:
        # `makedirs` raises when the target directory is already there, so
        # a forced rewrite over an existing directory must skip creation.
        if dir_not_exists:
            makedirs(self.full_ebi_dir)

        # An artifact will hold only one file of type `preprocessed_demux`
        # Thus, we only use the first one (the only one present)
        demux = [path for _, path, ftype in ar.filepaths
                 if ftype == 'preprocessed_demux'][0]

        demux_samples = set()
        with open_file(demux) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)
            for s, i in to_per_sample_ascii(demux_fh,
                                            self.prep_template.keys()):
                sample_fp = self.sample_demux_fps[s]
                wrote_sequences = False
                with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                    for record in i:
                        fh.write(record)
                        wrote_sequences = True

                if wrote_sequences:
                    demux_samples.add(s)
                else:
                    # Nothing was written for this sample: drop it from the
                    # bookkeeping dicts and delete the empty gz file
                    del self.samples[s]
                    del self.samples_prep[s]
                    del self.sample_demux_fps[s]
                    remove(sample_fp)
    else:
        # The directory exists and no rewrite was requested: take the
        # sample names from the `<name>.fastq.gz` files already in it
        demux_samples = set()
        extension = '.fastq.gz'
        extension_len = len(extension)
        for f in listdir(self.full_ebi_dir):
            fpath = join(self.full_ebi_dir, f)
            if isfile(fpath) and f.endswith(extension):
                demux_samples.add(f[:-extension_len])

        # Samples without a fastq.gz file on disk are removed
        missing_samples = set(self.samples.keys()).difference(
            set(demux_samples))
        for ms in missing_samples:
            del self.samples[ms]
            del self.samples_prep[ms]
            del self.sample_demux_fps[ms]

    if not demux_samples:
        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)
    return demux_samples