Ejemplo n.º 1
0
def submit_to_EBI(job):
    """Run the EBI submission requested by a processing job

    The artifact id and the submission type are read from the job
    parameters (keys ``'artifact'`` and ``'submission_type'``). Everything
    runs inside a single ``qdb.sql_connection.TRN`` context.

    Parameters
    ----------
    job : qiita_db.processing_job.ProcessingJob
        The processing job performing the task

    Raises
    ------
    EBISubmissionError
        If a different EBI submission job of the same study is in the
        'running' or 'queued' status
    """
    with qdb.sql_connection.TRN:
        parameters = job.parameters.values
        artifact_id = int(parameters['artifact'])
        submission_type = parameters['submission_type']
        study = qdb.artifact.Artifact(artifact_id).study

        # each entry is a 5-tuple; only the job id (first field) and the job
        # status (third field) matter here
        for other_job_id, _, status, _, _ in study._ebi_submission_jobs():
            if status not in ('running', 'queued') or other_job_id == job.id:
                continue
            raise EBISubmissionError(
                "Cannot perform parallel EBI submission for "
                "the same study. Current job running: %s" % status)

        submit_EBI(artifact_id, submission_type, True)
        job._set_status('success')
Ejemplo n.º 2
0
    def parse_EBI_reply(self, curl_result, test=False):
        """Parse and verify reply from EBI after sending XML files

        Parameters
        ----------
        curl_result : str
            The reply sent by EBI after sending XML files
        test : bool
            If true, and the reply reports success, the accessions are not
            parsed: a placeholder study accession ('MyStudyAccession') and
            empty dictionaries are returned instead

        Returns
        -------
        str
            The study accession number. None if the reply has no STUDY tag
        dict of {str: str}
            The sample accession numbers, keyed by sample id
        dict of {str: str}
            The biosample accession numbers, keyed by sample id
        dict of {str: str}
            The experiment accession numbers, keyed by sample id
        dict of {str: str}
            The run accession numbers, keyed by sample id

        Raises
        ------
        EBISubmissionError
            If curl_result is not a valid XML file
            If the ebi submission has not been successful
            If multiple study tags are found in the curl result

        Notes
        -----
        Aliases found in the reply are translated to sample ids through
        self._sample_aliases, self._experiment_aliases and self._run_aliases;
        an alias missing from those mappings is not handled here.
        """
        try:
            root = ET.fromstring(curl_result)
        except ParseError:
            # the full reply only goes to the log; the user gets the log id
            error_msg = ("The curl result from the EBI submission doesn't "
                         "look like an XML file:\n%s" % curl_result)
            le = LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(
                "The curl result from the EBI submission doesn't look like "
                "an XML file. Contact and admin for more information. "
                "Log id: %d" % le.id)

        if root.get('success') != 'true':
            # here we want to parse out the errors so the failures are clearer.
            # Messages are kept in document order without duplicates; ERROR
            # tags without text (elem.text is None) are skipped because they
            # would make str.join fail and hide the real submission error
            errors = []
            for elem in root.iter("ERROR"):
                text = elem.text
                if text and text not in errors:
                    errors.append(text)

            raise EBISubmissionError("The EBI submission failed:\n%s" %
                                     '\n'.join(errors))
        if test:
            study_accession = 'MyStudyAccession'
            sample_accessions = {}
            biosample_accessions = {}
            experiment_accessions = {}
            run_accessions = {}

            return (study_accession, sample_accessions, biosample_accessions,
                    experiment_accessions, run_accessions)

        # only STUDY tags that are direct children of the root are considered
        study_elem = root.findall("STUDY")
        if study_elem:
            if len(study_elem) > 1:
                raise EBISubmissionError(
                    "Multiple study tags found in EBI reply: %d" %
                    len(study_elem))
            study_elem = study_elem[0]
            study_accession = study_elem.get('accession')
        else:
            study_accession = None

        sample_accessions = {}
        biosample_accessions = {}
        for elem in root.iter("SAMPLE"):
            alias = elem.get('alias')
            sample_id = self._sample_aliases[alias]
            sample_accessions[sample_id] = elem.get('accession')
            # the biosample accession lives in the nested EXT_ID tag
            ext_id = elem.find('EXT_ID')
            biosample_accessions[sample_id] = ext_id.get('accession')

        def data_retriever(key, trans_dict):
            # maps every `key` tag of the reply to its accession, translating
            # the tag alias through trans_dict
            res = {}
            for elem in root.iter(key):
                alias = elem.get('alias')
                res[trans_dict[alias]] = elem.get('accession')
            return res

        experiment_accessions = data_retriever("EXPERIMENT",
                                               self._experiment_aliases)
        run_accessions = data_retriever("RUN", self._run_aliases)

        return (study_accession, sample_accessions, biosample_accessions,
                experiment_accessions, run_accessions)
Ejemplo n.º 3
0
    def __init__(self, artifact_id, action):
        """Validate the artifact and collect what an EBI submission needs

        Parameters
        ----------
        artifact_id : int
            The id of the artifact to submit
        action : str
            The EBI submission action; must be in self.valid_ebi_actions

        Raises
        ------
        EBISubmissionError
            If the action is not a valid EBI submission action
            If the artifact cannot be submitted to EBI
            If the artifact has already been submitted and the action is not
            'MODIFY'
            If any of these checks fail (reported together in one message):
            unrecognized investigation type, missing columns in the sample or
            prep template, samples without a valid platform or instrument
            model

        Notes
        -----
        Every error is logged via LogEntry.create('Runtime', ...) before it is
        raised. The submission paths are only computed here; nothing is
        written to disk by this method.
        """
        # validation problems are accumulated here and reported together
        error_msgs = []

        if action not in self.valid_ebi_actions:
            error_msg = ("%s is not a valid EBI submission action, valid "
                         "actions are: %s" %
                         (action, ', '.join(self.valid_ebi_actions)))
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
        self.action = action
        self.artifact = Artifact(artifact_id)
        if not self.artifact.can_be_submitted_to_ebi:
            error_msg = ("Artifact %d cannot be submitted to EBI" %
                         self.artifact.id)
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        self.study = self.artifact.study
        self.sample_template = self.study.sample_template
        # If we reach this point, there should be only one prep template
        # attached to the artifact. By design, each artifact has at least one
        # prep template. Artifacts with more than one prep template cannot be
        # submitted to EBI, so the attribute 'can_be_submitted_to_ebi' should
        # be set to false, which is checked in the previous if statement
        self.prep_template = self.artifact.prep_templates[0]

        # resubmission is only allowed for the 'MODIFY' action
        if self.artifact.is_submitted_to_ebi and action != 'MODIFY':
            error_msg = ("Cannot resubmit! Artifact %d has already "
                         "been submitted to EBI." % artifact_id)
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        self.artifact_id = artifact_id
        self.study_title = self.study.title
        self.study_abstract = self.study.info['study_abstract']

        # official ENA terms are used as is; user-defined terms are submitted
        # as 'Other' and the actual term is kept in new_investigation_type
        it = self.prep_template.investigation_type
        if it in ena_ontology.terms:
            self.investigation_type = it
            self.new_investigation_type = None
        elif it in ena_ontology.user_defined_terms:
            self.investigation_type = 'Other'
            self.new_investigation_type = it
        else:
            # This should never happen
            error_msgs.append("Unrecognized investigation type: '%s'. This "
                              "term is neither one of the official terms nor "
                              "one of the user-defined terms in the ENA "
                              "ontology." % it)
        # all submission files live in '<artifact_id>_ebi_submission' under
        # the first 'preprocessed_data' mountpoint
        _, base_fp = get_mountpoint("preprocessed_data")[0]
        self.ebi_dir = '%d_ebi_submission' % artifact_id
        self.full_ebi_dir = join(base_fp, self.ebi_dir)
        self.ascp_reply = join(self.full_ebi_dir, 'ascp_reply.txt')
        self.curl_reply = join(self.full_ebi_dir, 'curl_reply.xml')
        self.xml_dir = join(self.full_ebi_dir, 'xml_dir')
        # the XML filepaths are not known yet at construction time
        self.study_xml_fp = None
        self.sample_xml_fp = None
        self.experiment_xml_fp = None
        self.run_xml_fp = None
        self.submission_xml_fp = None
        self.publications = self.study.publications

        # getting the restrictions
        st_restrictions = [self.sample_template.columns_restrictions['EBI']]
        pt_restrictions = [self.prep_template.columns_restrictions['EBI']]
        if self.artifact.data_type in TARGET_GENE_DATA_TYPES:
            # adding restrictions on primer and barcode as these are
            # conditionally required for target gene
            pt_restrictions.append(
                PREP_TEMPLATE_COLUMNS_TARGET_GENE['demultiplex'])
        st_missing = self.sample_template.check_restrictions(st_restrictions)
        pt_missing = self.prep_template.check_restrictions(pt_restrictions)
        # testing if there are any missing columns
        if st_missing:
            error_msgs.append("Missing column in the sample template: %s" %
                              ', '.join(list(st_missing)))
        if pt_missing:
            error_msgs.append("Missing column in the prep template: %s" %
                              ', '.join(list(pt_missing)))

        # generating all samples from sample template
        self.samples = {}
        self.samples_prep = {}
        self.sample_demux_fps = {}
        get_output_fp = partial(join, self.full_ebi_dir)
        # nvp: samples without a valid platform
        # nvim: samples with a valid platform but no valid instrument model
        nvp = []
        nvim = []
        for k, v in viewitems(self.sample_template):
            # only samples that are also in the prep template are submitted
            if k not in self.prep_template:
                continue
            sample_prep = self.prep_template[k]

            # validating required fields
            if ('platform' not in sample_prep
                    or sample_prep['platform'] is None):
                nvp.append(k)
            else:
                # platforms and instrument models are compared in upper case
                platform = sample_prep['platform'].upper()
                if platform not in self.valid_platforms:
                    nvp.append(k)
                else:
                    if ('instrument_model' not in sample_prep
                            or sample_prep['instrument_model'] is None):
                        nvim.append(k)
                    else:
                        im = sample_prep['instrument_model'].upper()
                        if im not in self.valid_platforms[platform]:
                            nvim.append(k)

            # the sample is recorded even when its validation failed; any
            # failure makes the constructor raise below
            self.samples[k] = v
            self.samples_prep[k] = sample_prep
            self.sample_demux_fps[k] = get_output_fp("%s.fastq.gz" % k)

        if nvp:
            error_msgs.append("These samples do not have a valid platform "
                              "(instrumet model wasn't checked): %s" %
                              (', '.join(nvp)))
        if nvim:
            error_msgs.append("These samples do not have a valid instrument "
                              "model: %s" % (', '.join(nvim)))
        # report all the accumulated problems in a single error
        if error_msgs:
            error_msgs = ("Errors found during EBI submission for study #%d, "
                          "artifact #%d and prep template #%d:\n%s" %
                          (self.study.id, artifact_id, self.prep_template.id,
                           '\n'.join(error_msgs)))
            LogEntry.create('Runtime', error_msgs)
            raise EBISubmissionError(error_msgs)

        # alias -> id translation tables, empty at construction time
        self._sample_aliases = {}
        self._experiment_aliases = {}
        self._run_aliases = {}

        # accessions already stored in the sample and prep templates
        self._ebi_sample_accessions = \
            self.sample_template.ebi_sample_accessions
        self._ebi_experiment_accessions = \
            self.prep_template.ebi_experiment_accessions
Ejemplo n.º 4
0
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Write, or reuse, the per-sample fastq.gz files of the submission

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, self.full_ebi_dir is removed and the files are written
            again even if the folder already exists
        mtime : float, optional
            Forwarded to self._generate_demultiplexed_fastq_demux; it is not
            used for 'per_sample_FASTQ' artifacts nor when the files are
            reused

        Returns
        -------
        set of str
            The names of the samples that have a demultiplexed file

        Raises
        ------
        EBISubmissionError
            If no sample ends up with a demultiplexed file

        Notes
        -----
        - If self.full_ebi_dir exists and rewrite_fastq is false, a previous
        execution is assumed to have succeeded and the sample names are read
        from the '*.fastq.gz' file names found in that folder
        - Samples without a file are removed from self.samples,
        self.samples_prep and self.sample_demux_fps
        """
        reuse_existing = isdir(self.full_ebi_dir) and not rewrite_fastq

        if reuse_existing:
            suffix = '.fastq.gz'
            demux_samples = set()
            for entry in listdir(self.full_ebi_dir):
                if not isfile(join(self.full_ebi_dir, entry)):
                    continue
                if entry.endswith(suffix):
                    demux_samples.add(entry[:-len(suffix)])

            missing_samples = set(
                self.samples.keys()).difference(demux_samples)
        else:
            # always start from an empty folder
            if isdir(self.full_ebi_dir):
                rmtree(self.full_ebi_dir)
            makedirs(self.full_ebi_dir)

            if self.artifact.artifact_type != 'per_sample_FASTQ':
                missing_samples = []
                demux_samples = self._generate_demultiplexed_fastq_demux(mtime)
            else:
                demux_samples, missing_samples = \
                    self._generate_demultiplexed_fastq_per_sample_FASTQ()

        # forget the samples that have no file to submit
        for sample_name in missing_samples:
            del self.samples[sample_name]
            del self.samples_prep[sample_name]
            del self.sample_demux_fps[sample_name]

        if demux_samples:
            return demux_samples

        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)
Ejemplo n.º 5
0
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files: an existing
            self.full_ebi_dir is removed and created again
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        set of str
            The names of the successfully demultiplexed samples

        Notes
        -----
        - As a performance feature, this method will check if
        self.full_ebi_dir already exists and, if it does (and rewrite_fastq is
        false), the script will assume that in a previous execution this step
        was performed correctly and will simply read the file names from
        self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values will be deleted (del's, within the loops) for
        those cases where the fastq.gz files weren't written or don't exist.
        This is an indication that they had no sequences and this kind of
        files are not accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        # function-scope import: only this block needs it
        from shutil import rmtree

        ar = self.artifact

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            # a forced rewrite starts from scratch; makedirs fails if the
            # folder is still there
            if not dir_not_exists:
                rmtree(self.full_ebi_dir)
            makedirs(self.full_ebi_dir)

            # An artifact will hold only one file of type `preprocessed_demux`
            # Thus, we only use the first one (the only one present)
            demux = [
                path for _, path, ftype in ar.filepaths
                if ftype == 'preprocessed_demux'
            ][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        # empty files are not accepted by EBI: drop the
                        # sample and the file that was just created
                        del (self.samples[s])
                        del (self.samples_prep[s])
                        del (self.sample_demux_fps[s])
                        remove(sample_fp)
        else:
            # reuse the files written by a previous execution
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            missing_samples = set(self.samples.keys()).difference(
                set(demux_samples))
            for ms in missing_samples:
                del (self.samples[ms])
                del (self.samples_prep[ms])
                del (self.sample_demux_fps[ms])

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples
Ejemplo n.º 6
0
Archivo: ebi.py Proyecto: mcmk3/qiita
    def _generate_demultiplexed_fastq_per_sample_FASTQ(self):
        """Copy/compress the per sample FASTQ files into the submission folder

        The forward (and, if present, reverse) raw files of the artifact are
        matched to the samples through the run prefix: a file belongs to a
        sample if its basename starts with the sample run prefix. If the prep
        template has no 'run_prefix' category, the sample name without its
        first dot-separated field is used as prefix.

        Returns
        -------
        set of str
            The names of the samples that were matched to a file
        set of str
            The names in self.samples that were not matched to any file

        Raises
        ------
        EBISubmissionError
            If compressing a file fails
            If there are files that do not match any sample

        Notes
        -----
        self.per_sample_FASTQ_reverse is set to True when the artifact has
        reverse reads.
        """
        # function-scope import so the block stays self-contained
        try:
            from shlex import quote
        except ImportError:  # Python 2
            from pipes import quote

        # helper function to write files in this method
        def _rename_file(fp, new_fp):
            if fp.endswith('.gz'):
                # already compressed: a plain copy is enough
                copyfile(fp, new_fp)
            else:
                # the command goes through a shell (it relies on the `>`
                # redirection), so both paths are quoted to survive spaces
                # and shell metacharacters
                cmd = "gzip -c %s > %s" % (quote(fp), quote(new_fp))
                stdout, stderr, rv = system_call(cmd)
                if rv != 0:
                    error_msg = ("Error:\nStd output:%s\nStd error:%s" %
                                 (stdout, stderr))
                    raise EBISubmissionError(error_msg)

        # (basename, full path) of every raw file, sorted by full path
        fwd_reads = []
        rev_reads = []
        for x in self.artifact.filepaths:
            if x['fp_type'] == 'raw_forward_seqs':
                fwd_reads.append((basename(x['fp']), x['fp']))
            elif x['fp_type'] == 'raw_reverse_seqs':
                rev_reads.append((basename(x['fp']), x['fp']))
        fwd_reads.sort(key=lambda x: x[1])
        rev_reads.sort(key=lambda x: x[1])
        if rev_reads:
            self.per_sample_FASTQ_reverse = True

        # merging forward and reverse into a single list, note that at this
        # stage the files have passed multiple rounds of reviews: validator
        # when the artifact was created, the summary generator, etc. so we can
        # assure that if a rev exists for 1 fwd, there is one for all of them
        fps = []
        for f, r in zip_longest(fwd_reads, rev_reads):
            sample_name = f[0]
            fwd_read = f[1]
            rev_read = r[1] if r is not None else None
            fps.append((sample_name, (fwd_read, rev_read)))

        # (sample name, run prefix) pairs, sorted by run prefix
        if 'run_prefix' in self.prep_template.categories():
            rps = list(self.prep_template.get_category('run_prefix').items())
        else:
            rps = [(v, v.split('.', 1)[1]) for v in self.prep_template.keys()]
        rps.sort(key=lambda x: x[1])

        demux_samples = set()
        for sn, rp in rps:
            for i, (bn, fp) in enumerate(fps):
                if bn.startswith(rp):
                    demux_samples.add(sn)
                    new_fp = self.sample_demux_fps[sn] + self.FWD_READ_SUFFIX
                    _rename_file(fp[0], new_fp)

                    if fp[1] is not None:
                        new_fp = self.sample_demux_fps[
                            sn] + self.REV_READ_SUFFIX
                        _rename_file(fp[1], new_fp)
                    # each file is used only once; deleting while iterating
                    # is safe because we leave the loop right away
                    del fps[i]
                    break
        # anything left in fps did not match any sample
        if fps:
            error_msg = (
                'Discrepancy between filepaths and sample names. Extra'
                ' filepaths: %s' % ', '.join([fp[0] for fp in fps]))
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        return demux_samples, \
            set(self.samples.keys()).difference(set(demux_samples))