def display_template(self, preprocessed_data_id, msg, msg_level):
    """Render the EBI submission page for a preprocessed data

    Shared by the handler entry points so the page is built in one place.

    Parameters
    ----------
    preprocessed_data_id : int or str
        The preprocessed data id; it is cast to int before use
    msg : str
        Message to show in the page. It is replaced when the preprocessed
        data has zero or more than one demultiplexed file
    msg_level : str
        Level of the message. It is replaced by 'danger' or 'success'
        depending on the number of demultiplexed files found

    Raises
    ------
    HTTPError
        404 if the preprocessed data does not exist
        403 if the current user is not an admin
    """
    preprocessed_data_id = int(preprocessed_data_id)

    try:
        preprocessed_data = PreprocessedData(preprocessed_data_id)
    except QiitaDBUnknownIDError:
        raise HTTPError(404, "PreprocessedData %d does not exist!" %
                             preprocessed_data_id)

    # Only admins are allowed to reach the submission page
    current = self.current_user
    if current.level != 'admin':
        raise HTTPError(403, "No permissions of admin, "
                             "get/EBISubmitHandler: %s!" % current.id)

    prep_template = PrepTemplate(preprocessed_data.prep_template)
    sample_template = SampleTemplate(preprocessed_data.study)
    study = Study(preprocessed_data.study)

    n_samples = len(prep_template)
    n_headers = len(sample_template.categories())
    stats = [('Number of samples', n_samples),
             ('Number of metadata headers', n_headers)]

    demux_paths = []
    for _, fpath, fptype in preprocessed_data.get_filepaths():
        if fptype == 'preprocessed_demux':
            demux_paths.append(fpath)

    n_demux = len(demux_paths)
    if n_demux == 0:
        msg = ("Study does not appear to have demultiplexed "
               "sequences associated")
        msg_level = 'danger'
    elif n_demux > 1:
        msg = ("Study appears to have multiple demultiplexed files!")
        msg_level = 'danger'
    else:
        # Exactly one demux file: report its number of sequences
        file_stats = demux_stats(demux_paths[0])
        stats.append(('Number of sequences', file_stats.n))
        msg_level = 'success'

    # Check if the templates have all the required columns for EBI
    pt_missing_cols = prep_template.check_restrictions(
        [PREP_TEMPLATE_COLUMNS['EBI']])
    st_missing_cols = sample_template.check_restrictions(
        [SAMPLE_TEMPLATE_COLUMNS['EBI']])

    pt_complete = len(pt_missing_cols) == 0
    st_complete = len(st_missing_cols) == 0
    allow_submission = pt_complete and st_complete

    if allow_submission:
        ebi_disabled_msg = None
    else:
        reasons = ["Submission to EBI disabled due to missing columns:"]
        if not pt_complete:
            reasons.append("Columns missing in prep template: %s"
                           % ', '.join(pt_missing_cols))
        if not st_complete:
            reasons.append("Columns missing in sample template: %s"
                           % ', '.join(st_missing_cols))
        ebi_disabled_msg = "<br/>".join(reasons)

    self.render('ebi_submission.html',
                study_title=study.title, stats=stats, message=msg,
                study_id=study.id, level=msg_level,
                preprocessed_data_id=preprocessed_data_id,
                investigation_type=prep_template.investigation_type,
                allow_submission=allow_submission,
                ebi_disabled_msg=ebi_disabled_msg)
class EBISubmission(object):
    """Define an EBI submission, generate submission files and submit

    Submit a preprocessed data to EBI

    The steps for EBI submission are:
    1. Validate that we have all required info to submit
    2. Generate per sample demultiplexed files
    3. Generate XML files for submission
    4. Submit sequences files
    5. Submit XML files. The answer has the EBI submission numbers.

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocessed data id to submit
    action : str
        The action to perform. Valid options see
        EBISubmission.valid_ebi_actions

    Raises
    ------
    EBISubmissionError
        - If the action is not in EBISubmission.valid_ebi_actions
        - If the preprocessed data has been already submitted to EBI
          (unless the action is 'MODIFY')
        - If the status of the study attached to the preprocessed data is
          submitting
        - If the prep template investigation type is not in the
          ena_ontology.terms or not in the ena_ontology.user_defined_terms
        - If the submission is missing required EBI fields either in the
          sample or prep template
        - If the sample preparation metadata doesn't have a platform field
          or it isn't a EBISubmission.valid_platforms
    """

    valid_ebi_actions = ('ADD', 'VALIDATE', 'MODIFY')
    # One-element tuple: the trailing comma matters, without it this is a
    # plain string and `in` would do substring matching
    valid_ebi_submission_states = ('submitting',)
    # valid_platforms dict of 'platform': ['valid_instrument_models']
    valid_platforms = {'LS454': ['454 GS', '454 GS 20', '454 GS FLX',
                                 '454 GS FLX+', '454 GS FLX TITANIUM',
                                 '454 GS JUNIOR', 'UNSPECIFIED'],
                       'ILLUMINA': ['ILLUMINA GENOME ANALYZER',
                                    'ILLUMINA GENOME ANALYZER II',
                                    'ILLUMINA GENOME ANALYZER IX',
                                    'ILLUMINA HISEQ 2500',
                                    'ILLUMINA HISEQ 2000',
                                    'ILLUMINA HISEQ 1500',
                                    'ILLUMINA HISEQ 1000',
                                    'ILLUMINA MISEQ',
                                    'ILLUMINA HISCANSQ',
                                    'HISEQ X TEN',
                                    'NEXTSEQ 500',
                                    'UNSPECIFIED']}
    xmlns_xsi = "http://www.w3.org/2001/XMLSchema-instance"
    xsi_noNSL = "ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_3/SRA.%s.xsd"
    experiment_library_fields = ['library_strategy']

    def __init__(self, preprocessed_data_id, action):
        # Validation problems that do not prevent further checks are
        # accumulated here and reported together at the end
        error_msgs = []

        if action not in self.valid_ebi_actions:
            error_msg = ("%s is not a valid EBI submission action, valid "
                         "actions are: %s" %
                         (action, ', '.join(self.valid_ebi_actions)))
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
        self.action = action
        self.preprocessed_data = PreprocessedData(preprocessed_data_id)
        self.study = Study(self.preprocessed_data.study)
        self.sample_template = SampleTemplate(self.study.sample_template)
        self.prep_template = PrepTemplate(self.preprocessed_data.prep_template)

        if self.preprocessed_data.is_submitted_to_ebi and action != 'MODIFY':
            error_msg = ("Cannot resubmit! Preprocessed data %d has already "
                         "been submitted to EBI." % preprocessed_data_id)
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        status = self.study.ebi_submission_status
        if status in self.valid_ebi_submission_states:
            error_msg = ("Cannot perform parallel EBI submission for the same "
                         "study. Current status of the study: %s" % status)
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        self.preprocessed_data_id = preprocessed_data_id
        self.study_title = self.study.title
        self.study_abstract = self.study.info['study_abstract']

        it = self.prep_template.investigation_type
        if it in ena_ontology.terms:
            self.investigation_type = it
            self.new_investigation_type = None
        elif it in ena_ontology.user_defined_terms:
            self.investigation_type = 'Other'
            self.new_investigation_type = it
        else:
            # This should never happen
            error_msgs.append("Unrecognized investigation type: '%s'. This "
                              "term is neither one of the official terms nor "
                              "one of the user-defined terms in the ENA "
                              "ontology." % it)

        _, base_fp = get_mountpoint("preprocessed_data")[0]
        self.ebi_dir = '%d_ebi_submission' % preprocessed_data_id
        self.full_ebi_dir = join(base_fp, self.ebi_dir)
        self.ascp_reply = join(self.full_ebi_dir, 'ascp_reply.txt')
        self.curl_reply = join(self.full_ebi_dir, 'curl_reply.xml')
        self.xml_dir = join(self.full_ebi_dir, 'xml_dir')
        # The XML filepaths stay None until generate_xml_files writes them
        self.study_xml_fp = None
        self.sample_xml_fp = None
        self.experiment_xml_fp = None
        self.run_xml_fp = None
        self.submission_xml_fp = None
        self.pmids = self.study.pmids

        # getting the restrictions
        st_missing = self.sample_template.check_restrictions(
            [self.sample_template.columns_restrictions['EBI']])
        pt_missing = self.prep_template.check_restrictions(
            [self.prep_template.columns_restrictions['EBI']])
        # testing if there are any missing columns
        if st_missing:
            error_msgs.append("Missing column in the sample template: %s" %
                              ', '.join(list(st_missing)))
        if pt_missing:
            error_msgs.append("Missing column in the prep template: %s" %
                              ', '.join(list(pt_missing)))

        # generating all samples from sample template
        self.samples = {}
        self.samples_prep = {}
        self.sample_demux_fps = {}
        get_output_fp = partial(join, self.full_ebi_dir)
        # nvp: samples with a non valid platform
        # nvim: samples with a non valid instrument model
        nvp = []
        nvim = []
        for k, v in viewitems(self.sample_template):
            # Only the samples present in the prep template are submitted
            if k not in self.prep_template:
                continue
            sample_prep = self.prep_template[k]

            # validating required fields
            if ('platform' not in sample_prep or
                    sample_prep['platform'] is None):
                nvp.append(k)
            else:
                platform = sample_prep['platform'].upper()
                if platform not in self.valid_platforms:
                    nvp.append(k)
                elif ('instrument_model' not in sample_prep or
                        sample_prep['instrument_model'] is None):
                    nvim.append(k)
                else:
                    im = sample_prep['instrument_model'].upper()
                    if im not in self.valid_platforms[platform]:
                        nvim.append(k)

            self.samples[k] = v
            self.samples_prep[k] = sample_prep
            self.sample_demux_fps[k] = get_output_fp("%s.fastq.gz" % k)

        if nvp:
            error_msgs.append("These samples do not have a valid platform "
                              "(instrumet model wasn't checked): %s" % (
                                  ', '.join(nvp)))
        if nvim:
            error_msgs.append("These samples do not have a valid instrument "
                              "model: %s" % (', '.join(nvim)))
        if error_msgs:
            error_msgs = ("Errors found during EBI submission for study #%d, "
                          "preprocessed data #%d and prep template #%d:\n%s"
                          % (self.study.id, preprocessed_data_id,
                             self.prep_template.id, '\n'.join(error_msgs)))
            LogEntry.create('Runtime', error_msgs)
            raise EBISubmissionError(error_msgs)

        # alias -> sample name maps, filled while generating the XML and
        # used by parse_EBI_reply to translate the EBI answer back
        self._sample_aliases = {}
        self._experiment_aliases = {}
        self._run_aliases = {}

        self._ebi_sample_accessions = \
            self.sample_template.ebi_sample_accessions
        self._ebi_experiment_accessions = \
            self.prep_template.ebi_experiment_accessions

    def _get_study_alias(self):
        """Format alias using the organization prefix and the study id"""
        study_alias_format = '%s_sid_%s'
        return study_alias_format % (
            qiita_config.ebi_organization_prefix,
            escape(clean_whitespace(str(self.study.id))))

    def _get_sample_alias(self, sample_name):
        """Format alias using the study alias and `sample_name`

        The alias is recorded in ``self._sample_aliases``
        """
        alias = "%s:%s" % (self._get_study_alias(),
                           escape(clean_whitespace(str(sample_name))))
        self._sample_aliases[alias] = sample_name
        return alias

    def _get_experiment_alias(self, sample_name):
        """Format alias using the prep template id and `sample_name`

        The alias is recorded in ``self._experiment_aliases``
        """
        exp_alias_format = '%s_ptid_%s:%s'
        alias = exp_alias_format % (
            qiita_config.ebi_organization_prefix,
            escape(clean_whitespace(str(self.prep_template.id))),
            escape(clean_whitespace(str(sample_name))))
        self._experiment_aliases[alias] = sample_name
        return alias

    def _get_submission_alias(self):
        """Format alias using ``self.preprocessed_data_id``"""
        safe_preprocessed_data_id = escape(
            clean_whitespace(str(self.preprocessed_data_id)))
        submission_alias_format = '%s_submission_%s'
        return submission_alias_format % (qiita_config.ebi_organization_prefix,
                                          safe_preprocessed_data_id)

    def _get_run_alias(self, sample_name):
        """Format alias using ``self.preprocessed_data_id`` and `sample_name`

        The alias is recorded in ``self._run_aliases``
        """
        alias = '%s_ppdid_%s:%s' % (
            qiita_config.ebi_organization_prefix,
            escape(clean_whitespace(str(self.preprocessed_data_id))),
            sample_name)
        self._run_aliases[alias] = sample_name
        return alias

    def _get_library_name(self, sample_name):
        """Format the library name using `sample_name`"""
        return escape(clean_whitespace(sample_name))

    def _add_dict_as_tags_and_values(self, parent_node,
                                     attribute_element_name, data_dict):
        """Format key/value data using a common EBI XML motif

        For each key (in sorted order) an `attribute_element_name` element
        with TAG and VALUE children is added to `parent_node`. None values
        are written as "Unknown".
        """
        for attr, val in sorted(data_dict.items()):
            if val is None:
                val = "Unknown"
            attribute_element = ET.SubElement(parent_node,
                                              attribute_element_name)
            tag = ET.SubElement(attribute_element, 'TAG')
            tag.text = clean_whitespace(attr)
            value = ET.SubElement(attribute_element, 'VALUE')
            value.text = clean_whitespace(val)

    def _get_pmid_element(self, study_links, pmid):
        """Add a PUBMED STUDY_LINK element for `pmid` to `study_links`"""
        study_link = ET.SubElement(study_links, 'STUDY_LINK')
        xref_link = ET.SubElement(study_link, 'XREF_LINK')

        db = ET.SubElement(xref_link, 'DB')
        db.text = 'PUBMED'

        _id = ET.SubElement(xref_link, 'ID')
        _id.text = str(pmid)

    def generate_study_xml(self):
        """Generates the string for study XML file

        Returns
        -------
        ET.Element
            Object with study XML values
        """
        study_set = ET.Element('STUDY_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            'xsi:noNamespaceSchemaLocation': self.xsi_noNSL % "study"})

        study = ET.SubElement(study_set, 'STUDY', {
            'alias': self._get_study_alias(),
            'center_name': qiita_config.ebi_center_name}
        )

        descriptor = ET.SubElement(study, 'DESCRIPTOR')
        study_title = ET.SubElement(descriptor, 'STUDY_TITLE')
        study_title.text = escape(clean_whitespace(self.study_title))

        if self.investigation_type == 'Other':
            ET.SubElement(descriptor, 'STUDY_TYPE', {
                'existing_study_type': 'Other',
                'new_study_type': escape(clean_whitespace(
                    self.new_investigation_type))}
            )
        else:
            ET.SubElement(descriptor, 'STUDY_TYPE', {
                'existing_study_type': escape(clean_whitespace(
                    self.investigation_type))}
            )

        study_abstract = ET.SubElement(descriptor, 'STUDY_ABSTRACT')
        study_abstract.text = clean_whitespace(escape(self.study_abstract))

        # Add pubmed IDs
        if self.pmids:
            study_links = ET.SubElement(study, 'STUDY_LINKS')
            for pmid in self.pmids:
                self._get_pmid_element(study_links, pmid)

        return study_set

    def generate_sample_xml(self, samples=None):
        """Generates the sample XML file

        Parameters
        ----------
        samples : list of str, optional
            The list of samples to be included in the sample xml. If not
            provided or an empty list is provided, all the samples are used

        Returns
        -------
        ET.Element
            Object with sample XML values
        """
        sample_set = ET.Element('SAMPLE_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "sample"})

        if not samples:
            samples = viewkeys(self.samples)

        for sample_name in sorted(samples):
            # Work on a copy: the popped fields get their own elements and
            # whatever is left is written as generic sample attributes
            sample_info = dict(self.samples[sample_name])
            sample = ET.SubElement(sample_set, 'SAMPLE', {
                'alias': self._get_sample_alias(sample_name),
                'center_name': qiita_config.ebi_center_name}
            )

            sample_title = ET.SubElement(sample, 'TITLE')
            sample_title.text = escape(clean_whitespace(sample_name))

            sample_sample_name = ET.SubElement(sample, 'SAMPLE_NAME')
            taxon_id = ET.SubElement(sample_sample_name, 'TAXON_ID')
            text = sample_info.pop('taxon_id')
            taxon_id.text = escape(clean_whitespace(text))

            scientific_name = ET.SubElement(
                sample_sample_name, 'SCIENTIFIC_NAME')
            text = sample_info.pop('scientific_name')
            scientific_name.text = escape(clean_whitespace(text))

            description = ET.SubElement(sample, 'DESCRIPTION')
            text = sample_info.pop('description')
            description.text = escape(clean_whitespace(text))

            if sample_info:
                sample_attributes = ET.SubElement(sample, 'SAMPLE_ATTRIBUTES')
                self._add_dict_as_tags_and_values(sample_attributes,
                                                  'SAMPLE_ATTRIBUTE',
                                                  sample_info)

        return sample_set

    def _generate_spot_descriptor(self, design, platform):
        """This XML element (and its subelements) must be written for every
        sample, but its generation depends on only study-level information.
        Therefore, we can break it out into its own method.
        """
        # This section applies only to the LS454 platform. Compare by value:
        # an identity test (`is not`) against a string literal is not
        # reliable for strings that were built at runtime
        if platform != 'LS454':
            return

        # There is some hard-coded information in here, but this is what we
        # have always done in the past...
        spot_descriptor = ET.SubElement(design, 'SPOT_DESCRIPTOR')
        ET.SubElement(spot_descriptor, 'SPOT_DECODE_SPEC')
        read_spec = ET.SubElement(spot_descriptor, 'READ_SPEC')

        read_index = ET.SubElement(read_spec, 'READ_INDEX')
        read_index.text = '0'
        read_class = ET.SubElement(read_spec, 'READ_CLASS')
        read_class.text = 'Application Read'
        read_type = ET.SubElement(read_spec, 'READ_TYPE')
        read_type.text = 'Forward'
        base_coord = ET.SubElement(read_spec, 'BASE_COORD')
        base_coord.text = '1'

    def generate_experiment_xml(self, samples=None):
        """Generates the experiment XML file

        Parameters
        ----------
        samples : list of str, optional
            The list of samples to be included in the experiment xml. If
            None, all the samples are used

        Returns
        -------
        ET.Element
            Object with experiment XML values
        """
        # Reference the study by accession when it has one, by alias
        # otherwise
        study_accession = self.study.ebi_study_accession
        if study_accession:
            study_ref_dict = {'accession': study_accession}
        else:
            study_ref_dict = {'refname': self._get_study_alias()}

        experiment_set = ET.Element('EXPERIMENT_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "experiment"})

        samples = samples if samples is not None else viewkeys(self.samples)

        for sample_name in sorted(samples):
            experiment_alias = self._get_experiment_alias(sample_name)
            # Work on a copy: the popped fields get their own elements and
            # whatever is left is written as generic experiment attributes
            sample_prep = dict(self.samples_prep[sample_name])
            if self._ebi_sample_accessions[sample_name]:
                sample_descriptor_dict = {
                    'accession': self._ebi_sample_accessions[sample_name]}
            else:
                sample_descriptor_dict = {
                    'refname': self._get_sample_alias(sample_name)}

            platform = sample_prep.pop('platform')
            experiment = ET.SubElement(experiment_set, 'EXPERIMENT', {
                'alias': experiment_alias,
                'center_name': qiita_config.ebi_center_name}
            )
            title = ET.SubElement(experiment, 'TITLE')
            title.text = experiment_alias
            ET.SubElement(experiment, 'STUDY_REF', study_ref_dict)

            design = ET.SubElement(experiment, 'DESIGN')
            design_description = ET.SubElement(design,
                                               'DESIGN_DESCRIPTION')
            edd = sample_prep.pop('experiment_design_description')
            design_description.text = escape(clean_whitespace(edd))
            ET.SubElement(design, 'SAMPLE_DESCRIPTOR', sample_descriptor_dict)

            # this is the library construction section. The only required
            # field is library_construction_protocol, the others are
            # optional
            library_descriptor = ET.SubElement(design, 'LIBRARY_DESCRIPTOR')
            library_name = ET.SubElement(library_descriptor, 'LIBRARY_NAME')
            library_name.text = self._get_library_name(sample_name)

            # hardcoding some values,
            # see https://github.com/biocore/qiita/issues/1485
            library_source = ET.SubElement(library_descriptor,
                                           "LIBRARY_SOURCE")
            library_source.text = "METAGENOMIC"
            library_selection = ET.SubElement(library_descriptor,
                                              "LIBRARY_SELECTION")
            library_selection.text = "PCR"
            library_layout = ET.SubElement(library_descriptor,
                                           "LIBRARY_LAYOUT")
            ET.SubElement(library_layout, "SINGLE")

            lcp = ET.SubElement(library_descriptor,
                                "LIBRARY_CONSTRUCTION_PROTOCOL")
            lcp.text = escape(clean_whitespace(
                sample_prep.pop('library_construction_protocol')))

            # these are not required fields but, if present, add them in
            # the right format
            for field in self.experiment_library_fields:
                if field in sample_prep:
                    element = ET.SubElement(library_descriptor, field.upper())
                    element.text = sample_prep.pop(field)

            self._generate_spot_descriptor(design, platform)

            platform_element = ET.SubElement(experiment, 'PLATFORM')
            platform_info = ET.SubElement(platform_element, platform.upper())
            instrument_model = ET.SubElement(platform_info, 'INSTRUMENT_MODEL')
            instrument_model.text = sample_prep.pop('instrument_model')

            if sample_prep:
                experiment_attributes = ET.SubElement(
                    experiment, 'EXPERIMENT_ATTRIBUTES')
                self._add_dict_as_tags_and_values(experiment_attributes,
                                                  'EXPERIMENT_ATTRIBUTE',
                                                  sample_prep)

        return experiment_set

    def generate_run_xml(self):
        """Generates the run XML file

        Returns
        -------
        ET.Element
            Object with run XML values

        Notes
        -----
        The per sample fastq files in ``self.sample_demux_fps`` are opened to
        compute their MD5 checksum, so they must exist before calling this
        method
        """
        run_set = ET.Element('RUN_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "run"})
        for sample_name, sample_prep in sorted(viewitems(self.samples_prep)):
            sample_prep = dict(sample_prep)

            # Reference the experiment by accession when it has one, by
            # alias otherwise
            if self._ebi_experiment_accessions[sample_name]:
                experiment_ref_dict = {
                    'accession': self._ebi_experiment_accessions[sample_name]}
            else:
                experiment_alias = self._get_experiment_alias(sample_name)
                experiment_ref_dict = {'refname': experiment_alias}

            # We only submit fastq
            file_type = 'fastq'
            file_path = self.sample_demux_fps[sample_name]

            with open(file_path) as fp:
                md5 = safe_md5(fp).hexdigest()

            run = ET.SubElement(run_set, 'RUN', {
                'alias': self._get_run_alias(sample_name),
                'center_name': qiita_config.ebi_center_name}
            )
            ET.SubElement(run, 'EXPERIMENT_REF', experiment_ref_dict)
            data_block = ET.SubElement(run, 'DATA_BLOCK')
            files = ET.SubElement(data_block, 'FILES')
            ET.SubElement(files, 'FILE', {
                'filename': join(self.ebi_dir, basename(file_path)),
                'filetype': file_type,
                'quality_scoring_system': 'phred',
                'checksum_method': 'MD5',
                'checksum': md5}
            )

        return run_set

    def generate_submission_xml(self, submission_date=None):
        """Generates the submission XML file

        Parameters
        ----------
        submission_date : date, optional
            Date when the submission was created, when None date.today() will
            be used.

        Returns
        -------
        ET.Element
            Object with submission XML values

        Notes
        -----
        EBI requires a date when the submission will be automatically made
        public. This date is generated from the submission date + 365 days.
        The hold date is only added when the action is 'ADD'. An ACTION
        element is only added for the XML files whose filepath attribute
        has been set.
        """
        submission_set = ET.Element('SUBMISSION_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "submission"})
        submission = ET.SubElement(submission_set, 'SUBMISSION', {
            'alias': self._get_submission_alias(),
            'center_name': qiita_config.ebi_center_name}
        )

        actions = ET.SubElement(submission, 'ACTIONS')

        if self.study_xml_fp:
            study_action = ET.SubElement(actions, 'ACTION')
            ET.SubElement(study_action, self.action, {
                'schema': 'study',
                'source': basename(self.study_xml_fp)}
            )

        if self.sample_xml_fp:
            sample_action = ET.SubElement(actions, 'ACTION')
            ET.SubElement(sample_action, self.action, {
                'schema': 'sample',
                'source': basename(self.sample_xml_fp)}
            )

        if self.experiment_xml_fp:
            experiment_action = ET.SubElement(actions, 'ACTION')
            ET.SubElement(experiment_action, self.action, {
                'schema': 'experiment',
                'source': basename(self.experiment_xml_fp)}
            )

        if self.run_xml_fp:
            run_action = ET.SubElement(actions, 'ACTION')
            ET.SubElement(run_action, self.action, {
                'schema': 'run', 'source': basename(self.run_xml_fp)}
            )

        if submission_date is None:
            submission_date = date.today()
        if self.action == 'ADD':
            hold_action = ET.SubElement(actions, 'ACTION')
            ET.SubElement(hold_action, 'HOLD', {
                'HoldUntilDate': str(submission_date + timedelta(365))}
            )

        return submission_set

    def write_xml_file(self, element, fp):
        """Writes an XML file after calling one of the XML generation
        functions

        Parameters
        ----------
        element : ET.Element
            The Element to be written
        fp : str
            The filepath to which the XML will be written
        """
        create_dir(self.xml_dir)
        ET.ElementTree(element).write(fp, encoding='UTF-8')

    def generate_xml_files(self):
        """Generate all the XML files

        The filepath attributes (``study_xml_fp``, ``sample_xml_fp``,
        ``experiment_xml_fp``, ``run_xml_fp`` and ``submission_xml_fp``) are
        set for the files that are written.
        """
        get_output_fp = partial(join, self.xml_dir)

        # There are really only 2 main cases for EBI submission: ADD and
        # MODIFY and the only exception is in MODIFY
        if self.action != 'MODIFY':
            # The study.xml file needs to be generated if and only if the
            # study does NOT have an ebi_study_accession
            if not self.study.ebi_study_accession:
                self.study_xml_fp = get_output_fp('study.xml')
                self.write_xml_file(self.generate_study_xml(),
                                    self.study_xml_fp)

            # The sample.xml file needs to be generated if and only if there
            # are samples in the current submission that do NOT have an
            # ebi_sample_accession
            new_samples = {
                sample for sample, accession in viewitems(
                    self.sample_template.ebi_sample_accessions)
                if accession is None}
            new_samples = new_samples.intersection(self.samples)
            if new_samples:
                self.sample_xml_fp = get_output_fp('sample.xml')
                self.write_xml_file(self.generate_sample_xml(new_samples),
                                    self.sample_xml_fp)

            # The experiment.xml needs to be generated if and only if there
            # are samples in the current submission that do NOT have an
            # ebi_experiment_accession
            new_samples = {
                sample for sample, accession in viewitems(
                    self.prep_template.ebi_experiment_accessions)
                if accession is None}
            new_samples = new_samples.intersection(self.samples)
            if new_samples:
                self.experiment_xml_fp = get_output_fp('experiment.xml')
                self.write_xml_file(self.generate_experiment_xml(new_samples),
                                    self.experiment_xml_fp)

            # Generate the run.xml as it should always be generated
            self.run_xml_fp = get_output_fp('run.xml')
            self.write_xml_file(self.generate_run_xml(), self.run_xml_fp)

            self.submission_xml_fp = get_output_fp('submission.xml')
        else:
            # When MODIFY we can only modify the sample (sample.xml) and prep
            # (experiment.xml) template. The easiest is to generate both and
            # submit them. Note that we are assuming that Qiita is not
            # allowing to change preprocessing required information
            all_samples = self.sample_template.ebi_sample_accessions
            # The sample template can hold samples that are not part of this
            # submission; the XML generators index self.samples and
            # self.samples_prep, so restrict to the samples we do have
            samples = [s for s in all_samples if s in self.samples]

            # finding unique name for sample xml
            i = 0
            while True:
                self.sample_xml_fp = get_output_fp('sample_%d.xml' % i)
                if not exists(self.sample_xml_fp):
                    break
                i = i + 1
            self.write_xml_file(self.generate_sample_xml(samples),
                                self.sample_xml_fp)

            # finding unique name for experiment xml
            i = 0
            while True:
                self.experiment_xml_fp = get_output_fp(
                    'experiment_%d.xml' % i)
                if not exists(self.experiment_xml_fp):
                    break
                i = i + 1
            self.write_xml_file(self.generate_experiment_xml(samples),
                                self.experiment_xml_fp)

            # finding unique name for submission xml
            i = 0
            while True:
                self.submission_xml_fp = get_output_fp(
                    'submission_%d.xml' % i)
                if not exists(self.submission_xml_fp):
                    break
                i = i + 1

        # The submission.xml is always generated
        self.write_xml_file(self.generate_submission_xml(),
                            self.submission_xml_fp)

    def generate_curl_command(
            self,
            ebi_seq_xfer_user=qiita_config.ebi_seq_xfer_user,
            ebi_seq_xfer_pass=qiita_config.ebi_seq_xfer_pass,
            ebi_dropbox_url=qiita_config.ebi_dropbox_url):
        """Generates the curl command for submission

        Parameters
        ----------
        ebi_seq_xfer_user : str
            The user to use when submitting to EBI
        ebi_seq_xfer_pass : str
            The user password issued by EBI for REST submissions
        ebi_dropbox_url : str
            The dropbox url

        Returns
        -------
        curl_command
            The curl string to be executed

        Notes
        -----
        - The XML files must be generated before executing this function:
          only the files whose filepath attribute is not None are attached
          to the command
        """
        url = '?auth=ENA%20{0}%20{1}'.format(quote(ebi_seq_xfer_user),
                                             quote(ebi_seq_xfer_pass))
        curl_cmd = ['curl -sS -k']
        # Attach only the XML files that have been generated
        if self.submission_xml_fp is not None:
            curl_cmd.append(' -F "SUBMISSION=@%s"' % self.submission_xml_fp)
        if self.study_xml_fp is not None:
            curl_cmd.append(' -F "STUDY=@%s"' % self.study_xml_fp)
        if self.sample_xml_fp is not None:
            curl_cmd.append(' -F "SAMPLE=@%s"' % self.sample_xml_fp)
        if self.run_xml_fp is not None:
            curl_cmd.append(' -F "RUN=@%s"' % self.run_xml_fp)
        if self.experiment_xml_fp is not None:
            curl_cmd.append(' -F "EXPERIMENT=@%s"' % self.experiment_xml_fp)
        curl_cmd.append(' "%s"' % join(ebi_dropbox_url, url))

        return ''.join(curl_cmd)

    def generate_send_sequences_cmd(self):
        """Generate the ascp commands to send the sequences to EBI

        Returns
        -------
        list of str
            The ascp commands to be executed

        Notes
        -----
        - The per sample fastq filepaths are taken from
          ``self.sample_demux_fps``
        """
        fastqs = [sfp for _, sfp in viewitems(self.sample_demux_fps)]
        # distribute the fastqs in at most 10 groups, one command per
        # non-empty group
        fastqs_div = [fastqs[i::10] for i in range(10) if fastqs[i::10]]
        ascp_commands = []
        for f in fastqs_div:
            ascp_commands.append('ascp --ignore-host-key -d -QT -k2 '
                                 '{0} {1}@{2}:./{3}/'.format(
                                     ' '.join(f),
                                     qiita_config.ebi_seq_xfer_user,
                                     qiita_config.ebi_seq_xfer_url,
                                     self.ebi_dir))

        return ascp_commands

    def parse_EBI_reply(self, curl_result):
        """Parse and verify reply from EBI after sending XML files

        Parameters
        ----------
        curl_result : str
            The reply sent by EBI after sending XML files

        Returns
        -------
        str
            The study accession number. None if the reply has no STUDY tag
        dict of {str: str}
            The sample accession numbers, keyed by sample id
        dict of {str: str}
            The biosample accession numbers, keyed by sample id
        dict of {str: str}
            The experiment accession numbers, keyed by sample id
        dict of {str: str}
            The run accession numbers, keyed by sample id

        Raises
        ------
        EBISubmissionError
            If curl_result is not a valid XML file
            If the ebi submission has not been successful
            If multiple study tags are found in the curl result

        Notes
        -----
        The aliases found in the reply are translated to sample ids with the
        maps filled by the alias helpers, so the XML generation must have run
        on this same object before parsing the reply
        """
        try:
            root = ET.fromstring(curl_result)
        except ParseError:
            error_msg = ("The curl result from the EBI submission doesn't "
                         "look like an XML file:\n%s" % curl_result)
            le = LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(
                "The curl result from the EBI submission doesn't look like "
                "an XML file. Contact and admin for more information. "
                "Log id: %d" % le.id)

        success = root.get('success') == 'true'
        if not success:
            raise EBISubmissionError("The EBI submission failed:\n%s"
                                     % curl_result)

        study_elem = root.findall("STUDY")
        if study_elem:
            if len(study_elem) > 1:
                raise EBISubmissionError(
                    "Multiple study tags found in EBI reply: %d"
                    % len(study_elem))
            study_elem = study_elem[0]
            study_accession = study_elem.get('accession')
        else:
            study_accession = None

        sample_accessions = {}
        biosample_accessions = {}
        for elem in root.iter("SAMPLE"):
            alias = elem.get('alias')
            sample_id = self._sample_aliases[alias]
            sample_accessions[sample_id] = elem.get('accession')
            ext_id = elem.find('EXT_ID')
            biosample_accessions[sample_id] = ext_id.get('accession')

        def data_retriever(key, trans_dict):
            # Map sample id -> accession for every `key` element, using
            # trans_dict to translate the alias into the sample id
            res = {}
            for elem in root.iter(key):
                alias = elem.get('alias')
                res[trans_dict[alias]] = elem.get('accession')
            return res

        experiment_accessions = data_retriever("EXPERIMENT",
                                               self._experiment_aliases)
        run_accessions = data_retriever("RUN", self._run_aliases)

        return (study_accession, sample_accessions, biosample_accessions,
                experiment_accessions, run_accessions)

    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        demux_samples
            Set of successful demultiplexed samples

        Notes
        -----
        - As a performance feature, this method will check if
          self.full_ebi_dir already exists and, if it does, the script will
          assume that in a previous execution this step was performed
          correctly and will simply read the file names from
          self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
          sample_demux_fps hold values for all available samples in the
          database. Here some of those values will be deleted (del's, within
          the loops) for those cases where the fastq.gz files weren't written
          or exist. This is an indication that they had no sequences and this
          kind of files are not accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        ppd = self.preprocessed_data

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            # Only create the directory when it is missing: makedirs raises
            # if the directory already exists, which is the case when a
            # rewrite is forced over a previous run
            if dir_not_exists:
                makedirs(self.full_ebi_dir)

            demux = [path for _, path, ftype in ppd.get_filepaths()
                     if ftype == 'preprocessed_demux'][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        # No sequences for this sample: drop it from the
                        # submission and remove the empty file
                        del(self.samples[s])
                        del(self.samples_prep[s])
                        del(self.sample_demux_fps[s])
                        remove(sample_fp)
        else:
            # Reuse the files from a previous execution: the sample names
            # are the file names without the extension
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            missing_samples = set(self.samples.keys()).difference(
                set(demux_samples))
            for ms in missing_samples:
                del(self.samples[ms])
                del(self.samples_prep[ms])
                del(self.sample_demux_fps[ms])

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples