def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element,
    if not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (1/2)")
        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)
        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] cause there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4, communicate with archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                                    "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d "
                                        "new placements"
                                        % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                        "new placements"
                                        % len(novel_fragments))
        # values need to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # fragments that get rejected by a SEPP run don't show up in
        # the placement file; however, being rejected is valuable
        # information and should be stored in the archive as well.
        # Thus, we avoid re-computation for rejected fragments in the
        # future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""
        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                        "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        # remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string, and convert all other placements
        # from string to json
        placements = {frag: json.loads(placements[frag])
                      for frag, plc in placements.items() if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(ArtifactInfo('deblur reference hit table', 'BIOM',
                                  [(final_biom_hit, 'biom'),
                                   (final_seqs_hit, 'preprocessed_fasta'),
                                   (fp_phylogeny, 'plain_text')],
                                  new_placements))

    return True, ainfo, ""

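
# The placeholder-output trick above (shipping an empty but still valid BIOM
# when deblur finds no reference hits) can be exercised on its own. A minimal
# sketch, assuming the biom-format package is installed; the filepath and
# function name are hypothetical, the table construction is taken verbatim
# from the function above.
from biom import Table
from biom.util import biom_open


def write_empty_biom(fp):
    # Table(data, observation_ids, sample_ids): all empty yields a 0x0 table
    # that is nonetheless a valid HDF5 BIOM file Qiita will accept
    t = Table([], [], [])
    with biom_open(fp, 'w') as f:
        t.to_hdf5(f, 'qp-deblur generated')
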
def call_qiime2(qclient, job_id, parameters, out_dir):
    """helper method to call QIIME 2

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to process
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    q2plugin = parameters.pop('qp-hide-plugin')
    q2method = parameters.pop('qp-hide-method').replace('-', '_')
    pm = qiime2.sdk.PluginManager()
    method = pm.plugins[q2plugin].actions[q2method]

    out_dir = join(out_dir, q2method)
    # making sure that we always start with an empty folder
    if not exists(out_dir):
        mkdir(out_dir)

    # let's generate the parameters, first remove the hidden parameters. We
    # are going to separate them into q2params and q2inputs as the inputs
    # need to be retrieved from qiita and converted to qza
    label = 'qp-hide-param'
    label_len = len(label)
    q2params = {}
    q2inputs = {}
    method_inputs = method.signature.inputs.copy()
    method_params = method.signature.parameters.copy()
    artifact_id = None
    analysis_id = None
    biom_fp = None
    tree_fp = None
    tree_fp_check = False
    for k in list(parameters):
        if k in parameters and k.startswith(label):
            key = parameters.pop(k)
            val = parameters.pop(k[label_len:])
            if key in method_inputs.keys():
                if key == 'phylogeny':
                    if val == '':
                        continue
                    # there is a chance that we parse/loop over the phylogeny
                    # option before the artifact so tree_fp will still be
                    # None; thus we will need to check this after we are done
                    # with this loop
                    if val == 'Artifact tree, if exists':
                        tree_fp_check = True
                    fpath = val
                    qiita_name = QIITA_Q2_SEMANTIC_TYPE[key]
                    if qiita_name['expression']:
                        # for these cases we need an expression so for
                        # simplicity using the first one [0]
                        artifact_method = '%s[%s]' % (
                            qiita_name['name'], qiita_name['expression'][0])
                elif key in ('classifier', 'data'):
                    fpath = val
                    artifact_method = None
                    k = key
                else:
                    # this is going to be an artifact so let's collect the
                    # filepath here, this will also allow us to collect the
                    # analysis_id
                    artifact_id = val
                    ainfo = qclient.get(
                        "/qiita_db/artifacts/%s/" % artifact_id)
                    if ainfo['analysis'] is None:
                        msg = ('Artifact "%s" is not an analysis '
                               'artifact.' % val)
                        return False, None, msg
                    analysis_id = ainfo['analysis']
                    dt = method_inputs[key].qiime_type.to_ast()['name']
                    if 'qza' not in ainfo['files']:
                        # at this stage in qiita we only have 2 types of
                        # artifacts: biom / plain_text
                        if Q2_QIITA_SEMANTIC_TYPE[dt] == 'BIOM':
                            fpath = ainfo['files']['biom'][0]
                            biom_fp = fpath
                        else:
                            fpath = ainfo['files']['plain_text'][0]
                    else:
                        fpath = ainfo['files']['qza'][0]
                    # if it's a BIOM and there is a plain_text, it is the
                    # result of the archive at this stage: a tree
                    if Q2_QIITA_SEMANTIC_TYPE[dt] == 'BIOM':
                        if 'plain_text' in ainfo['files']:
                            tree_fp = ainfo['files']['plain_text'][0]
                        if biom_fp is None and 'biom' in ainfo['files']:
                            biom_fp = ainfo['files']['biom'][0]

                    q2artifact_name = Q2_QIITA_SEMANTIC_TYPE[
                        method_inputs[key].qiime_type.to_ast()['name']]
                    qiita_name = QIITA_Q2_SEMANTIC_TYPE[q2artifact_name]
                    if qiita_name['expression']:
                        # for these cases we need an expression so for
                        # simplicity using the first one [0]
                        artifact_method = '%s[%s]' % (
                            qiita_name['name'], qiita_name['expression'][0])
                    else:
                        artifact_method = qiita_name['name']
                q2inputs[key] = (fpath, artifact_method)
            elif key == 'qp-hide-metadata-field':
                if val == '':
                    msg = ("Error: You didn't write a metadata field in "
                           "'%s'" % k[label_len:])
                    return False, None, msg
                q2inputs['metadata'] = (val, val)
            else:
                if val in ('', 'None'):
                    continue
                # let's bring back the original name of these parameters
                mkey = method_params[key]
                value_pair = (q2method, key)
                if (q2plugin == 'diversity' and
                        value_pair in RENAME_COMMANDS):
                    val = RENAME_COMMANDS[value_pair][val]
                # if the view_type is set convert to set
                if mkey.view_type is set:
                    val = {val}
                else:
                    val = qiime2.sdk.util.parse_primitive(
                        mkey.qiime_type.to_ast(), val)
                q2params[key] = val
        elif k in ('qp-hide-metadata', 'qp-hide-FeatureData[Taxonomy]'):
            # remember, if we need metadata, we will always have
            # qp-hide-metadata and optionally we will have
            # qp-hide-metadata-field
            key = parameters.pop(k)
            if key in parameters:
                q2params['metadata'] = qiime2.Artifact.load(
                    parameters.pop(key)).view(qiime2.Metadata)
            else:
                q2inputs[key] = ('', '')

    # if 'metadata' is in q2inputs but 'where' exists and is empty in
    # q2params, remove the parameter metadata
    # NOTE: AFAIK there is no way to differentiate between sample and prep
    # metadata in Q2 so the need to remove for filter_features
    if ('metadata' in q2inputs and 'where' in q2params and
            not q2params['where']):
        q2inputs.pop('metadata')

    # if we are here, we need to use the internal tree from the artifact
    if tree_fp_check:
        q2inputs['phylogeny'] = (tree_fp, q2inputs['phylogeny'][1])

    # let's process/import inputs
    qclient.update_job_step(
        job_id, "Step 2 of 4: Converting Qiita artifacts to Q2 artifact")

    for k, (fpath, dt) in q2inputs.items():
        if k in ('metadata', 'sample_metadata'):
            metadata = qclient.get(
                "/qiita_db/analysis/%s/metadata/" % str(analysis_id))
            metadata = pd.DataFrame.from_dict(metadata, orient='index')
            # the reason we need to save and load the mapping file is
            # so Qiime2 assigns the expected data types to the columns
            metadata_fp = join(out_dir, 'metadata.txt')
            metadata.to_csv(metadata_fp, index_label='#SampleID',
                            na_rep='', sep='\t', encoding='utf-8')
            q2Metadata = qiime2.Metadata.load(metadata_fp)
            if fpath:
                q2params[k] = q2Metadata.get_column(fpath)
            else:
                q2params[k] = q2Metadata
        elif k == 'FeatureData[Taxonomy]':
            try:
                qza = qiime2.Artifact.import_data(
                    'FeatureData[Taxonomy]', biom_fp, 'BIOMV210Format')
            except Exception:
                return False, None, ('Error generating taxonomy. Are you '
                                     'sure this artifact has taxonomy?')
            q2params['taxonomy'] = qza
        elif fpath is not None:
            if not fpath.endswith('.qza'):
                try:
                    qza = qiime2.Artifact.import_data(dt, fpath)
                except Exception as e:
                    return False, None, 'Error converting "%s": %s' % (
                        str(dt), str(e))
            elif exists(fpath):
                qza = qiime2.Artifact.load(fpath)
            q2params[k] = qza
        else:
            # adding an else for completeness: if we get here then we should
            # ignore that parameter/input passed. By design, this should
            # only happen in one scenario: the user selected an artifact, in
            # specific a tree, that doesn't exist. This was added while
            # solving https://github.com/biocore/qiita/issues/3039. However,
            # in the future it might be useful to always ignore anything
            # that doesn't exist.
            pass

    # if feature_classifier and classify_sklearn we need to transform the
    # input data to sequences
    if q2plugin == 'feature-classifier' and q2method == 'classify_sklearn':
        ainfo = qclient.get("/qiita_db/artifacts/%s/" % parameters[
            'The feature data to be classified.'])
        biom_fp = ainfo['files']['biom'][0]
        plain_text_fp = None
        if 'plain_text' in ainfo['files']:
            plain_text_fp = ainfo['files']['plain_text'][0]
        biom_table = load_table(biom_fp)
        fna_fp = join(out_dir, 'sequences.fna')
        with open(fna_fp, 'w') as f:
            for _id in biom_table.ids(axis='observation'):
                f.write('>{0}\n{0}\n'.format(_id))
        try:
            q2params['reads'] = qiime2.Artifact.import_data(
                'FeatureData[Sequence]', fna_fp)
        except (ValueError, qiime2.core.exceptions.ValidationError) as e:
            msg = str(e)
            if 'DNAFASTAFormat file' in msg:
                msg = ('Table IDs are not sequences, please confirm that '
                       'this is not a closed reference table?')
            return False, None, 'Error converting "%s": %s' % (
                'Input Table', msg)

    qclient.update_job_step(
        job_id, "Step 3 of 4: Running '%s %s'" % (q2plugin, q2method))
    try:
        results = method(**q2params)
    except Exception as e:
        return False, None, 'Error running: %s' % str(e)

    qclient.update_job_step(job_id, "Step 4 of 4: Processing results")

    out_info = []
    # if feature_classifier and classify_sklearn we need to add the taxonomy
    # to the original table and generate the new artifact
    if q2plugin == 'feature-classifier' and q2method == 'classify_sklearn':
        new_biom = join(out_dir, 'feature-table-with-taxonomy.biom')
        new_qza = join(out_dir, 'feature-table-with-taxonomy.qza')
        df = results[0].view(pd.DataFrame)
        df.rename(columns={'Taxon': 'taxonomy'}, inplace=True)
        df['taxonomy'] = [[y.strip() for y in x]
                          for x in df['taxonomy'].str.split(';')]
        biom_table.add_metadata(df.to_dict(orient='index'),
                                axis='observation')
        with biom_open(new_biom, 'w') as bf:
            biom_table.to_hdf5(bf, 'Generated in Qiita')
        qza = qiime2.Artifact.import_data(
            'FeatureTable[Frequency]', new_biom, 'BIOMV210Format')
        qza.save(new_qza)

        ftc_fps = [(new_biom, 'biom'), (new_qza, 'qza')]
        if plain_text_fp is not None:
            # if we enter here, it means that the input artifact had a tree
            # (saved as plain_text); thus, we need to make sure we make a
            # copy so we don't move the original file
            bn = basename(plain_text_fp)
            new_tree_fp = join(out_dir, bn)
            copyfile(ainfo['files']['plain_text'][0], new_tree_fp)
            ftc_fps.append((new_tree_fp, 'plain_text'))
        out_info.append(ArtifactInfo(
            'Feature Table with Classification', 'BIOM', ftc_fps))

    for aname, q2artifact in zip(results._fields, results):
        aout = join(out_dir, aname)
        if isinstance(q2artifact, qiime2.Visualization):
            qzv_fp = q2artifact.save(aout)
            out_info.append(
                ArtifactInfo(aname, 'q2_visualization', [(qzv_fp, 'qzv')]))
        else:
            qza_fp = q2artifact.save(aout + '.qza')
            q2artifact.export_data(output_dir=aout)
            files = listdir(aout)
            if len(files) != 1:
                msg = ('Error processing results: There are some unexpected '
                       'files: "%s"' % ', '.join(files))
                return False, None, msg
            fp = join(aout, files[0])
            # making sure the newly created file comes with the correct
            # permissions for nginx
            chmod(fp, 0o664)

            if q2artifact.type.name == 'FeatureTable':
                # Re-add the observation metadata if it exists in the input
                # and if this is not one of the plugin/methods that actually
                # changes that information
                if biom_fp is not None and (q2plugin, q2method) not in [
                        ('taxa', 'collapse')]:
                    fin = load_table(biom_fp)
                    fout = load_table(fp)

                    # making sure that the resulting biom is not empty
                    if fout.shape == (0, 0):
                        msg = ('The resulting table is empty, please review '
                               'your parameters')
                        return False, None, msg

                    metadata = {
                        i: fin.metadata(i, axis='observation')
                        for i in fout.ids(axis='observation')}
                    fout.add_metadata(metadata, axis='observation')

                    with biom_open(fp, 'w') as bf:
                        fout.to_hdf5(bf, "Qiita's Qiime2 plugin with "
                                         "observation metadata")

                # if there is a tree, let's copy it and then add it to
                # the new artifact
                if tree_fp is not None:
                    bn = basename(tree_fp)
                    new_tree_fp = join(
                        out_dir, aout, 'from_%s_%s' % (artifact_id, bn))
                    copyfile(tree_fp, new_tree_fp)
                    ai = ArtifactInfo(aname, 'BIOM', [
                        (fp, 'biom'),
                        (new_tree_fp, 'plain_text'),
                        (qza_fp, 'qza')])
                else:
                    ai = ArtifactInfo(
                        aname, 'BIOM', [(fp, 'biom'), (qza_fp, 'qza')])
            else:
                atype = Q2_QIITA_SEMANTIC_TYPE[q2artifact.type.name]
                ai = ArtifactInfo(
                    aname, atype, [(fp, 'plain_text'), (qza_fp, 'qza')])
            out_info.append(ai)

    return True, out_info, ""

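
# Illustrative sketch of the plugin/action lookup call_qiime2 relies on; it
# requires a QIIME 2 environment. The plugin and action names below are
# examples, not values Qiita necessarily passes.
import qiime2.sdk

pm = qiime2.sdk.PluginManager()
action = pm.plugins['diversity'].actions['beta']
# signature.inputs vs signature.parameters is what drives the
# q2inputs/q2params split in call_qiime2 above
print(sorted(action.signature.inputs))
print(sorted(action.signature.parameters))
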
def test_validate_per_sample_FASTQ_preprocessed_fastq(self):
    f1 = join(self.source_dir, 'SKB2.640194_file.fastq')
    f2 = join(self.source_dir, 'SKM4.640180_file.fastq')
    f3 = join(self.source_dir, 'SKB3.640195_file.fastq')
    copyfile(self.fastq, f1)
    copyfile(self.fastq, f2)
    copyfile(self.fastq, f3)
    self._clean_up_files.append(f1)
    self._clean_up_files.append(f2)
    self._clean_up_files.append(f3)

    prep_info = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"}}
    files = {'preprocessed_fastq': [f1, f2, f3]}
    job_id, _ = self._create_template_and_job(prep_info, files,
                                              "per_sample_FASTQ")
    obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
        self.qclient, job_id, prep_info, files)
    self.assertTrue(obs_success)
    filepaths = [(f1 + '.gz', 'preprocessed_fastq'),
                 (f2 + '.gz', 'preprocessed_fastq'),
                 (f3 + '.gz', 'preprocessed_fastq')]
    exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
    self.assertEqual(obs_ainfo, exp)
    self.assertEqual(obs_error, "")
    # making sure the regular fastq files don't exist anymore but the gz
    # versions do
    self.assertFalse(exists(f1))
    self.assertTrue(exists(f1 + '.gz'))

    f1 = join(self.source_dir, 'SKB2.640194_file_R1.fastq')
    f2 = join(self.source_dir, 'SKB2.640194_file_R2.fastq')
    f3 = join(self.source_dir, 'SKB2.640194_file_unmatched_R1.fastq')
    f4 = join(self.source_dir, 'SKB2.640194_file_unmatched_R2.fastq')
    f5 = join(self.source_dir, 'SKM4.640180_file_R1.fastq')
    f6 = join(self.source_dir, 'SKM4.640180_file_R2.fastq')
    f7 = join(self.source_dir, 'SKM4.640180_file_unmatched_R1.fastq')
    f8 = join(self.source_dir, 'SKM4.640180_file_unmatched_R2.fastq')
    f9 = join(self.source_dir, 'SKB3.640195_file_R1.fastq')
    fA = join(self.source_dir, 'SKB3.640195_file_R2.fastq')
    fB = join(self.source_dir, 'SKB3.640195_file_unmatched_R1.fastq')
    fC = join(self.source_dir, 'SKB3.640195_file_unmatched_R2.fastq')
    raw_files = [f1, f2, f3, f4, f5, f6, f7, f8, f9, fA, fB, fC]
    for x in raw_files:
        copyfile(self.fastq, x)
        self._clean_up_files.append(x)

    prep_info = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"}}
    files = {'preprocessed_fastq': raw_files}
    job_id, _ = self._create_template_and_job(prep_info, files,
                                              "per_sample_FASTQ")
    obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
        self.qclient, job_id, prep_info, files)
    self.assertEqual(obs_error, "")
    self.assertTrue(obs_success)
    filepaths = [('%s.gz' % x, 'preprocessed_fastq') for x in raw_files]
    exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
    self.assertEqual(obs_ainfo, exp)

def test_validate(self):
    # Test artifact type error
    job_id, params = self._create_job(
        'NotAType', {'plan_text': 'Will fail before checking this'}, 1)
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, params, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(
        obs_error, "Unknown artifact type NotAType. Supported types: "
                   "alpha_vector, distance_matrix, ordination_results")

    # Test missing metadata error - to be fair, I don't know how this error
    # can happen in the live system, but better be safe than sorry
    job_id, params = self._create_job(
        'distance_matrix', {'plan_text': 'Will fail before checking this'},
        None)
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, params, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(obs_error, "Missing metadata information")

    # Test distance matrix success
    sample_ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
                  '1.SKM9.640192', '1.SKB7.640196']
    dm_fp = self._create_distance_matrix(sample_ids)
    job_id, params = self._create_job(
        'distance_matrix', {'plain_text': [dm_fp]}, 1)
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, params, self.out_dir)
    self.assertTrue(obs_success)
    html_fp = join(self.out_dir, 'index.html')
    exp_ainfo = [ArtifactInfo(None, "distance_matrix",
                              [(dm_fp, 'plain_text'),
                               (html_fp, 'html_summary')])]
    self.assertEqual(obs_ainfo, exp_ainfo)
    self.assertEqual(obs_error, "")

    # Test ordination results success
    ord_res_fp = self._create_ordination_results(sample_ids)
    job_id, params = self._create_job(
        'ordination_results', {'plain_text': [ord_res_fp]}, 1)
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, params, self.out_dir)
    self.assertTrue(obs_success)
    html_fp = join(self.out_dir, 'index.html')
    esf_fp = join(self.out_dir, 'emperor_support_files')
    exp_ainfo = [ArtifactInfo(None, "ordination_results",
                              [(ord_res_fp, 'plain_text'),
                               (html_fp, 'html_summary'),
                               (esf_fp, 'html_summary_dir')])]
    self.assertEqual(obs_ainfo, exp_ainfo)
    self.assertEqual(obs_error, "")

    # Test alpha vector success
    alpha_vector_fp = self._create_alpha_vector(sample_ids)
    job_id, params = self._create_job(
        'alpha_vector', {'plain_text': [alpha_vector_fp]}, 1)
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, params, self.out_dir)
    self.assertTrue(obs_success)
    html_fp = join(self.out_dir, 'index.html')
    sf_fp = join(self.out_dir, 'support_files')
    exp_ainfo = [ArtifactInfo(None, "alpha_vector",
                              [(alpha_vector_fp, 'plain_text'),
                               (html_fp, 'html_summary'),
                               (sf_fp, 'html_summary_dir')])]
    self.assertEqual(obs_ainfo, exp_ainfo)
    self.assertEqual(obs_error, "")

def woltka(qclient, job_id, parameters, out_dir):
    """Run Woltka with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Woltka
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    database_taxonomy, database_gene_coordinates = _process_database_files(
        parameters['Database'])

    errors = []
    ainfo = []

    fp_biom = f'{out_dir}/free.biom'
    fp_alng = f'{out_dir}/alignment.tar'
    if exists(fp_biom) and exists(fp_alng):
        ainfo = [ArtifactInfo('Alignment Profile', 'BIOM',
                              [(fp_biom, 'biom'), (fp_alng, 'log')])]
    else:
        ainfo = []
        errors.append('Missing files from the "Alignment Profile"; please '
                      'contact [email protected] for more information')

    for rank in ['phylum', 'genus', 'species']:
        fp = f'{out_dir}/{rank}.biom'
        if exists(fp):
            # making sure that the tables have taxonomy
            bt = load_table(fp)
            metadata = {x: {'taxonomy': x.split(';')}
                        for x in bt.ids(axis='observation')}
            bt.add_metadata(metadata, axis='observation')
            with biom_open(fp, 'w') as f:
                bt.to_hdf5(f, "woltka")

            ainfo.append(ArtifactInfo(f'Taxonomic Predictions - {rank}',
                                      'BIOM', [(fp, 'biom')]))
        else:
            errors.append(f'Table {rank} was not created, please contact '
                          '[email protected] for more information')

    fp_biom = f'{out_dir}/none.biom'
    if exists(fp_biom):
        ainfo.append(ArtifactInfo('Per genome Predictions', 'BIOM',
                                  [(fp_biom, 'biom')]))
    else:
        errors.append('Table none/per-genome was not created, please '
                      'contact [email protected] for more information')

    if database_gene_coordinates is not None:
        fp_biom = f'{out_dir}/per-gene.biom'
        if exists(fp_biom):
            ainfo.append(ArtifactInfo('Per gene Predictions', 'BIOM',
                                      [(fp_biom, 'biom')]))
        else:
            errors.append('Table per-gene was not created, please contact '
                          '[email protected] for more information')

    if errors:
        return False, ainfo, '\n'.join(errors)
    else:
        return True, ainfo, ""

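
# The taxonomy decoration step above can be reproduced in isolation: Woltka
# writes lineage strings as observation ids, and splitting on ';' yields the
# list form BIOM expects. A minimal, runnable sketch with made-up data
# (requires numpy and biom-format).
import numpy as np
from biom import Table

obs = ['k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Bacteroidetes']
bt = Table(np.array([[1, 2], [3, 4]]), obs, ['S1', 'S2'])
bt.add_metadata({x: {'taxonomy': x.split(';')} for x in obs},
                axis='observation')
print(bt.metadata_to_dataframe('observation'))
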
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo, str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None,
                "Unknown artifact type %s. Supported types: BIOM" % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    if prep_id is not None:
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']
    elif analysis_id is not None:
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/"
                               % analysis_id)
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validating BIOM file")
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep
        # template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM
        # file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same as
            # in the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s)
                          for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table
                # do not match the ones in the prep template and we can't
                # fix it
                error_msg = ('The sample ids in the BIOM table do not '
                             'match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing '
                             'sample ids to the prep information sample '
                             'ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            missing = biom_sample_ids - set(id_map)
            error_msg = ('Your prep information is missing samples that '
                         'are present in your BIOM table: %s'
                         % ', '.join(missing))
            return False, None, error_msg

        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observation ids of the biom table should be the same as the
        # representative sequence ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            try:
                observation_ids.remove(rec_id)
            except ValueError:
                extra_ids.append(rec_id)

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file "
                             "includes observations not found in the BIOM "
                             "table: %s" % ', '.join(extra_ids))
        if observation_ids:
            error_msg.append("The representative set sequence file is "
                             "missing observation ids found in the BIOM "
                             "table: %s" % ', '.join(observation_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""

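
# A standalone sketch of the "Attempt 1" fix above: a run_prefix column maps
# BIOM sample ids back to prep-template ids via Table.update_ids. The data
# below is made up (requires numpy and biom-format).
import numpy as np
from biom import Table

metadata = {'1.SKB2.640194': {'run_prefix': 'prefix1'},
            '1.SKM4.640180': {'run_prefix': 'prefix2'}}
table = Table(np.array([[1, 2]]), ['O1'], ['prefix1', 'prefix2'])
id_map = {v['run_prefix']: k for k, v in metadata.items()}
table.update_ids(id_map, axis='sample')
print(table.ids())  # now the prep-template sample ids
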
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element,
    if not it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')
    # Step 1 get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['seqs-fp']
    # removing input from parameters so it's not part of the final command
    del parameters['seqs-fp']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2 generating command deblur
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating per sample "
                                        "from demux (1/2)")
        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)
        # using the same number of parallel jobs as defined by the command
        n_jobs = parameters['jobs-to-start']
        # [0] cause there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)
        qclient.update_job_step(job_id, "Step 2 of 3: Generating per sample "
                                        "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3 execute deblur
    qclient.update_job_step(job_id, "Step 3 of 3: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('final.biom')
    final_seqs = pb('final.seqs.fa')
    final_biom_16s = pb('final.only-16s.biom')
    final_seqs_na = pb('final.seqs.fa.no_artifacts')

    if not exists(final_biom_16s):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_16s, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_na):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_na, 'w') as f:
            f.write("")

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')]),
             ArtifactInfo('deblur 16S only table', 'BIOM',
                          [(final_biom_16s, 'biom'),
                           (final_seqs_na, 'preprocessed_fasta')])]

    return True, ainfo, ""

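
# For reference, both deblur variants above rely on a system_call helper
# returning (stdout, stderr, return_code). A minimal stand-in built on
# subprocess, assuming qiita_client's helper behaves roughly like this;
# the function name is hypothetical.
import subprocess


def system_call_sketch(cmd):
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, universal_newlines=True)
    std_out, std_err = proc.communicate()
    return std_out, std_err, proc.returncode
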
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list, str
        The results of the job
    """
    pt_sample_ids = set(prep_info)
    with open_file(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep
        # template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux
        # file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" column '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information '
                             'to map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same as
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s)
                           for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s)
                          for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file
                # do not match the ones in the prep template and we can't
                # fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing '
                             'sample ids to the prep information sample '
                             'ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        # this if is important so we don't regenerate the demux file if the
        # user uploads fastq or fna
        if demux_fp != new_demux_fp:
            copy(demux_fp, new_demux_fp)
            demux_fp = new_demux_fp

        with open_file(demux_fp, 'r+') as f:
            for old in f:
                f.move(old, id_map[old])

        # When we fix, we always generate the FASTQ and FASTA files.
        # By setting them to None, they will be generated below
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        to_ascii_file(demux_fp, fastq_fp, out_format='fastq')
        fastq_fp, error_msg = _gzip_file(fastq_fp)
        if error_msg is not None:
            return False, None, error_msg

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        to_ascii_file(demux_fp, fasta_fp, out_format='fasta')
        fasta_fp, error_msg = _gzip_file(fasta_fp)
        if error_msg is not None:
            return False, None, error_msg

    filepaths = [(fastq_fp, 'preprocessed_fastq'),
                 (fasta_fp, 'preprocessed_fasta'),
                 (demux_fp, 'preprocessed_demux')]
    if log_fp:
        filepaths.append((log_fp, 'log'))
    return True, [ArtifactInfo(None, 'Demultiplexed', filepaths)], ""

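
# The "Attempt 2" prefix repair above in isolation: demux ids match the prep
# template once the study prefix is re-attached. Pure Python, runnable as-is
# with made-up ids.
pt_sample_ids = {'1.SKB2.640194', '1.SKM4.640180'}
demux_sample_ids = {'SKB2.640194', 'SKM4.640180'}
prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
assert pt_sample_ids.issuperset(prefixed)
id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
print(id_map)
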
def test_validate_representative_set(self):
    sample_ids = ['1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195',
                  '1.SKB6.640176', '1.SKD6.640190', '1.SKM6.640187',
                  '1.SKD9.640182', '1.SKM8.640201', '1.SKM2.640199']
    biom_fp, job_id, parameters = self._create_job_and_biom(sample_ids,
                                                            template=1)
    fd, fasta_fp = mkstemp(suffix=".fna")
    close(fd)
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n>O2\nATGC\n")
    self._clean_up_files.append(fasta_fp)

    exp_fp = partial(join, self.out_dir)
    exp_index_fp = exp_fp('index.html')
    exp_viz_fp = exp_fp('support_files')
    exp_qza_fp = exp_fp('feature-table.qza')
    with open(exp_index_fp, 'w') as f:
        f.write("my html")
    mkdir(exp_viz_fp)

    parameters = {'template': parameters['template'],
                  'files': dumps({'biom': [biom_fp],
                                  'preprocessed_fasta': [fasta_fp]}),
                  'artifact_type': 'BIOM'}

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)
    self.assertTrue(obs_success)
    files = [(biom_fp, 'biom'), (fasta_fp, 'preprocessed_fasta'),
             (exp_index_fp, 'html_summary'),
             (exp_viz_fp, 'html_summary_dir'),
             (exp_qza_fp, 'qza')]
    self.assertEqual(obs_ainfo, [ArtifactInfo(None, 'BIOM', files)])
    self.assertEqual(obs_error, "")

    # Extra ids
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n>O2\nATGC\n>O3\nATGC\n")
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(
        obs_error,
        "The representative set sequence file includes observations not "
        "found in the BIOM table: O3")

    # Missing ids
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n")
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(
        obs_error,
        "The representative set sequence file is missing observation ids "
        "found in the BIOM table: O2")

def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 7: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 7: Converting to FNA for Shogun")

    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
        samples = make_read_pairs_per_sample(
            fps['raw_forward_seqs'], rs, qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3 align
        sys_msg = "Step 3 of 7: Aligning FNA with Shogun (%d/{0})"
        align_cmd = generate_shogun_align_commands(
            comb_fp, temp_dir, parameters)
        success, msg = _run_commands(
            qclient, job_id, align_cmd, sys_msg, 'Shogun Align')
        if not success:
            return False, None, msg

        # Step 4 taxonomic profile
        sys_msg = "Step 4 of 7: Taxonomic profile with Shogun (%d/{0})"
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        success, msg = _run_commands(
            qclient, job_id, assign_cmd, sys_msg,
            'Shogun taxonomy assignment')
        if not success:
            return False, None, msg

        # Step 5 redistribute profile
        sys_msg = "Step 5 of 7: Redistributed profile with Shogun (%d/{0})"
        levels = ['genus', 'species', 'strain']
        redist_fps = []
        for level in levels:
            redist_cmd, output = generate_shogun_redist_commands(
                profile_fp, temp_dir, parameters, level)
            redist_fps.append(output)
            success, msg = _run_commands(
                qclient, job_id, redist_cmd, sys_msg,
                'Shogun redistribute')
            if not success:
                return False, None, msg

        # Step 6 functional profile
        sys_msg = "Step 6 of 7: Functional profile with Shogun (%d/{0})"
        levels = ['species']
        func_fp = ''
        for level in levels:
            func_cmd, output = generate_shogun_functional_commands(
                profile_fp, temp_dir, parameters, level)
            func_fp = output
            success, msg = _run_commands(
                qclient, job_id, func_cmd, sys_msg, 'Shogun functional')
            if not success:
                return False, None, msg

        # Step 7 convert results to BIOM
        sys_msg = "Step 7 of 7: Converting results to BIOM (%d/{0})"
        func_biom_outputs = []
        redist_biom_outputs = []
        # Converting redistributed files to biom
        redist_levels = ['genus', 'species', 'strain']
        for redist_fp, level in zip(redist_fps, redist_levels):
            biom_cmd, output = generate_biom_conversion_commands(
                redist_fp, out_dir, level, 'redist')
            success, msg = _run_commands(
                qclient, job_id, biom_cmd, sys_msg,
                'Redistribute Biom conversion')
            if not success:
                return False, None, msg
            else:
                redist_biom_outputs.append(output)

        # Converting functional files to biom
        for level in levels:
            func_to_biom_fps = ["kegg.modules.coverage", "kegg.modules",
                                "kegg.pathways.coverage", "kegg.pathways",
                                "kegg", "normalized"]
            for biom_in in func_to_biom_fps:
                biom_in_fp = join(func_fp, "profile.%s.%s.txt"
                                  % (level, biom_in))
                biom_cmd, output = generate_biom_conversion_commands(
                    biom_in_fp, out_dir, level, biom_in)
                success, msg = _run_commands(
                    qclient, job_id, biom_cmd, sys_msg,
                    'Functional Biom conversion')
                if not success:
                    return False, None, msg
                else:
                    func_biom_outputs.append(output)

    func_files_type_name = 'Functional Predictions'
    redist_files_type_name = 'Taxonomic Predictions'
    ainfo = [ArtifactInfo(func_files_type_name, 'BIOM', func_biom_outputs),
             ArtifactInfo(redist_files_type_name, 'BIOM',
                          redist_biom_outputs)]

    return True, ainfo, ""

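
# Hedged sketch of how the "(%d/{0})" progress templates above are likely
# consumed by _run_commands (whose body is not shown in this section): the
# assumption is that '{0}' takes the command count and '%d' the 1-based
# index of the command being executed.
template = "Step 3 of 7: Aligning FNA with Shogun (%d/{0})"
commands = ['align-cmd-a', 'align-cmd-b']  # placeholder commands
msg = template.format(len(commands))
for i, cmd in enumerate(commands, 1):
    print(msg % i)  # "... (1/2)", then "... (2/2)"
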
def test_validate_per_sample_FASTQ_preprocessed_fastq(self):
    prep_info = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"}}
    files = {'preprocessed_fastq': ['/path/to/SKB2.640194_file.fastq',
                                    '/path/to/SKM4.640180_file.fastq',
                                    '/path/to/SKB3.640195_file.fastq']}
    job_id = self._create_template_and_job(prep_info, files,
                                           "per_sample_FASTQ")
    obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
        self.qclient, job_id, prep_info, files)
    self.assertTrue(obs_success)
    filepaths = [
        ('/path/to/SKB2.640194_file.fastq', 'preprocessed_fastq'),
        ('/path/to/SKM4.640180_file.fastq', 'preprocessed_fastq'),
        ('/path/to/SKB3.640195_file.fastq', 'preprocessed_fastq')]
    exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
    self.assertEqual(obs_ainfo, exp)
    self.assertEqual(obs_error, "")

    prep_info = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"}}
    files = {'preprocessed_fastq': [
        '/path/to/SKB2.640194_file_R1.fastq',
        '/path/to/SKB2.640194_file_R2.fastq',
        '/path/to/SKB2.640194_file_unmatched_R1.fastq',
        '/path/to/SKB2.640194_file_unmatched_R2.fastq',
        '/path/to/SKM4.640180_file_R1.fastq',
        '/path/to/SKM4.640180_file_R2.fastq',
        '/path/to/SKM4.640180_file_unmatched_R1.fastq',
        '/path/to/SKM4.640180_file_unmatched_R2.fastq',
        '/path/to/SKB3.640195_file_R1.fastq',
        '/path/to/SKB3.640195_file_R2.fastq',
        '/path/to/SKB3.640195_file_unmatched_R1.fastq',
        '/path/to/SKB3.640195_file_unmatched_R2.fastq']}
    job_id = self._create_template_and_job(prep_info, files,
                                           "per_sample_FASTQ")
    obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
        self.qclient, job_id, prep_info, files)
    self.assertTrue(obs_success)
    filepaths = [
        ('/path/to/SKB2.640194_file_R1.fastq', 'preprocessed_fastq'),
        ('/path/to/SKB2.640194_file_R2.fastq', 'preprocessed_fastq'),
        ('/path/to/SKB2.640194_file_unmatched_R1.fastq',
         'preprocessed_fastq'),
        ('/path/to/SKB2.640194_file_unmatched_R2.fastq',
         'preprocessed_fastq'),
        ('/path/to/SKM4.640180_file_R1.fastq', 'preprocessed_fastq'),
        ('/path/to/SKM4.640180_file_R2.fastq', 'preprocessed_fastq'),
        ('/path/to/SKM4.640180_file_unmatched_R1.fastq',
         'preprocessed_fastq'),
        ('/path/to/SKM4.640180_file_unmatched_R2.fastq',
         'preprocessed_fastq'),
        ('/path/to/SKB3.640195_file_R1.fastq', 'preprocessed_fastq'),
        ('/path/to/SKB3.640195_file_R2.fastq', 'preprocessed_fastq'),
        ('/path/to/SKB3.640195_file_unmatched_R1.fastq',
         'preprocessed_fastq'),
        ('/path/to/SKB3.640195_file_unmatched_R2.fastq',
         'preprocessed_fastq')]
    exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
    self.assertEqual(obs_ainfo, exp)
    self.assertEqual(obs_error, "")

def _validate_multiple(qclient, job_id, prep_info, files, atype):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger'
    artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype : str
        The type of the artifact

    Returns
    -------
    bool, list, str
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types), atype,
                        ', '.join(sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep
        # information so check that the provided files are prefixed with
        # the values in the run_prefix column
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is a special case because we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s"
                    % (len(t_files), num_prefixes,
                       ', '.join(basename(f) for f in t_files)))
            else:
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes
                             if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s"
                        % ', '.join(fps))
                else:
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s"
                            % ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file "
                    "per type: %s"
                    % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types were present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s"
                     % ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s"
                     % '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        filepaths.extend([(fp, fps_type) for fp in fps])

    return True, [ArtifactInfo(None, atype, filepaths)], ""

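
# The run_prefix matching check above, reduced to its core. Runnable with
# the made-up names below: one file matches a prefix, one does not, and one
# prefix matches no file.
run_prefixes = {'s1_L001', 's2_L001'}
t_files = ['s1_L001_R1.fastq.gz', 'unknown_R1.fastq.gz']
rps, fps = [], []
for bn in t_files:
    found = [rp for rp in run_prefixes if bn.startswith(rp)]
    if found:
        rps.extend(found)
    else:
        fps.append(bn)
print(fps)                      # files without a matching prefix
print(run_prefixes - set(rps))  # prefixes without a matching file
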
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Converting to FNA for Shogun")

    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    samples = make_read_pairs_per_sample(
        fps['raw_forward_seqs'], rs, qiime_map)

    # Combining files
    comb_fp = generate_fna_file(out_dir, samples)

    # Formatting parameters
    parameters = _format_params(parameters, SHOGUN_PARAMS)

    # Step 3 align
    align_cmd = generate_shogun_align_commands(comb_fp, out_dir, parameters)
    sys_msg = "Step 3 of 6: Aligning FNA with Shogun (%d/{0})".format(
        len(align_cmd))
    success, msg = _run_commands(
        qclient, job_id, align_cmd, sys_msg, 'Shogun Align')
    if not success:
        return False, None, msg

    # Step 4 taxonomic profile
    sys_msg = "Step 4 of 6: Taxonomic profile with Shogun (%d/{0})"
    assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
        out_dir, parameters)
    success, msg = _run_commands(
        qclient, job_id, assign_cmd, sys_msg,
        'Shogun taxonomy assignment')
    if not success:
        return False, None, msg

    # Step 5 compress the alignment and convert the profile to BIOM
    sys_msg = "Step 5 of 6: Compressing and converting alignment to BIOM"
    qclient.update_job_step(job_id, sys_msg)
    alignment_fp = join(out_dir, 'alignment.%s.%s' % (
        parameters['aligner'], ALN2EXT[parameters['aligner']]))
    xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
    std_out, std_err, return_value = system_call(xz_cmd)
    if return_value != 0:
        error_msg = ("Error during %s:\nStd out: %s\nStd err: %s"
                     "\n\nCommand run was:\n%s"
                     % (sys_msg, std_out, std_err, xz_cmd))
        return False, None, error_msg
    output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                out_dir, 'profile')

    ainfo = [ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                          [(output, 'biom'),
                           ('%s.xz' % alignment_fp, 'log')])]

    # Step 6 redistribute profile
    sys_msg = "Step 6 of 6: Redistributed profile with Shogun (%d/{0})"
    levels = ['phylum', 'genus', 'species']
    redist_fps = []
    for level in levels:
        redist_cmd, output = generate_shogun_redist_commands(
            profile_fp, out_dir, parameters, level)
        redist_fps.append(output)
        success, msg = _run_commands(
            qclient, job_id, redist_cmd, sys_msg, 'Shogun redistribute')
        if not success:
            return False, None, msg

    # Converting redistributed files to biom
    for redist_fp, level in zip(redist_fps, levels):
        biom_in = ["redist", None, '', True]
        output = run_shogun_to_biom(redist_fp, biom_in, out_dir, level,
                                    'redist')
        aname = 'Taxonomic Predictions - %s' % level
        ainfo.append(ArtifactInfo(aname, 'BIOM', [(output, 'biom')]))

    return True, ainfo, ""

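
# Illustrative: how the alignment filepath and xz command above are
# assembled. ALN2EXT is the plugin's aligner-to-extension map; the mapping
# and paths below are assumptions for the sketch, not the plugin's actual
# constants.
from os.path import join

ALN2EXT = {'bowtie2': 'sam', 'utree': 'tsv'}  # assumed values
parameters = {'aligner': 'bowtie2', 'threads': 4}
out_dir = '/tmp/shogun-job'  # hypothetical
alignment_fp = join(out_dir, 'alignment.%s.%s' % (
    parameters['aligner'], ALN2EXT[parameters['aligner']]))
xz_cmd = 'xz -9 -T%s %s' % (parameters['threads'], alignment_fp)
print(xz_cmd)  # xz -9 -T4 /tmp/shogun-job/alignment.bowtie2.sam
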
def humann2(qclient, job_id, parameters, out_dir):
    """Run HUMAnN2 with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run HUMAnN2
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run humann2
    qclient.update_job_step(job_id, "Step 1 of 6: Collecting information")
    artifact_id = parameters['input']
    # removing input from parameters so it's not part of the final command
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 generating command humann2
    qclient.update_job_step(job_id,
                            "Step 2 of 6: Generating HUMAnN2 command")
    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
    commands = generate_humann2_analysis_commands(
        fps['raw_forward_seqs'], rs, qiime_map, out_dir, parameters)

    # Step 3 execute humann2
    msg = "Step 3 of 6: Executing HUMAnN2 job (%d/{0})".format(
        len(commands))
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 4 merge tables
    commands = []
    commands.append(('humann2_join_tables -i {0} -o {0}/genefamilies.biom '
                     '--file_name genefamilies --search-subdirectories '
                     '--verbose').format(out_dir))
    commands.append(('humann2_join_tables -i {0} -o {0}/pathcoverage.biom '
                     '--file_name pathcoverage --search-subdirectories '
                     '--verbose').format(out_dir))
    commands.append(('humann2_join_tables -i {0} -o {0}/pathabundance.biom '
                     '--file_name pathabundance --search-subdirectories '
                     '--verbose').format(out_dir))
    msg = "Step 4 of 6: Merging resulting tables job (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 5 generating re-normalized tables
    commands = []
    commands.append(('humann2_renorm_table -i {0}/genefamilies.biom -u cpm '
                     '-o {0}/genefamilies_cpm.biom').format(out_dir))
    commands.append(('humann2_renorm_table -i {0}/pathcoverage.biom '
                     '-u relab -o {0}/pathcoverage_relab.biom'
                     ).format(out_dir))
    commands.append(('humann2_renorm_table -i {0}/pathabundance.biom '
                     '-u relab -o {0}/pathabundance_relab.biom'
                     ).format(out_dir))
    msg = "Step 5 of 6: Re-normalizing tables (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Step 6 stratifying re-normalized tables
    commands = []
    pb = partial(join, out_dir)
    cmd = "humann2_split_stratified_table --input %s --output %s"
    commands.append(cmd % (pb('genefamilies_cpm.biom'), out_dir))
    commands.append(cmd % (pb('pathcoverage_relab.biom'), out_dir))
    commands.append(cmd % (pb('pathabundance_relab.biom'), out_dir))
    msg = "Step 6 of 6: Stratifying re-normalized tables (%d/3)"
    success, msg = _run_commands(qclient, job_id, commands, msg)
    if not success:
        return False, None, msg

    # Generating 12 artifacts, separation is important for analysis
    ainfo = [
        ArtifactInfo('Gene family table', 'BIOM',
                     [(pb('genefamilies.biom'), 'biom')]),
        ArtifactInfo('Path coverage table', 'BIOM',
                     [(pb('pathcoverage.biom'), 'biom')]),
        ArtifactInfo('Path abundance table', 'BIOM',
                     [(pb('pathabundance.biom'), 'biom')]),
        ArtifactInfo('Gene family CPM table', 'BIOM',
                     [(pb('genefamilies_cpm.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table', 'BIOM',
                     [(pb('pathcoverage_relab.biom'), 'biom')]),
        ArtifactInfo('Path abundance RELAB table', 'BIOM',
                     [(pb('pathabundance_relab.biom'), 'biom')]),
        ArtifactInfo('Gene family CPM table - stratified', 'BIOM',
                     [(pb('genefamilies_cpm_stratified.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table - stratified', 'BIOM',
                     [(pb('pathcoverage_relab_stratified.biom'), 'biom')]),
        ArtifactInfo('Path abundance RELAB table - stratified', 'BIOM',
                     [(pb('pathabundance_relab_stratified.biom'), 'biom')]),
        ArtifactInfo('Gene family CPM table - unstratified', 'BIOM',
                     [(pb('genefamilies_cpm_unstratified.biom'), 'biom')]),
        ArtifactInfo('Path coverage RELAB table - unstratified', 'BIOM',
                     [(pb('pathcoverage_relab_unstratified.biom'),
                       'biom')]),
        ArtifactInfo('Path abundance RELAB table - unstratified', 'BIOM',
                     [(pb('pathabundance_relab_unstratified.biom'),
                       'biom')])]

    return True, ainfo, ""

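
# The partial(join, out_dir) helper used above is a small but recurring
# pattern in these plugins: it pins the output directory so every artifact
# path is built the same way. Runnable sketch; the directory is
# hypothetical.
from functools import partial
from os.path import join

out_dir = '/tmp/humann2-job'  # hypothetical
pb = partial(join, out_dir)
print(pb('genefamilies.biom'))  # /tmp/humann2-job/genefamilies.biom
print(pb('pathcoverage_relab.biom'))
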
def test_woltka_to_array_wol(self): # inserting new prep template prep_info_dict = { 'SKB8.640193': { 'run_prefix': 'S22205_S104_L001_R1' }, 'SKD8.640184': { 'run_prefix': 'S22282_S102_L001_R1' } } database = join(self.db_path, 'wol/WoLmin') pid, aid, job_id = self._helper_woltka_bowtie(prep_info_dict, database) out_dir = mkdtemp() self._clean_up_files.append(out_dir) # retriving info of the prep/artifact just created artifact_info = self.qclient.get("/qiita_db/artifacts/%s/" % aid) directory = { dirname(ffs) for _, fs in artifact_info['files'].items() for ffs in fs } directory = directory.pop() prep_info = artifact_info['prep_information'] prep_info = self.qclient.get('/qiita_db/prep_template/%s/' % prep_info[0]) prep_file = prep_info['prep-file'] url = 'this-is-my-url' main_qsub_fp, merge_qsub_fp = woltka_to_array(directory, out_dir, database, prep_file, url, job_id) self.assertEqual(join(out_dir, f'{job_id}.qsub'), main_qsub_fp) self.assertEqual(join(out_dir, f'{job_id}.merge.qsub'), merge_qsub_fp) with open(main_qsub_fp) as f: main_qsub = f.readlines() with open(merge_qsub_fp) as f: merge_qsub = f.readlines() exp_main_qsub = [ '#!/bin/bash\n', '#PBS -M [email protected]\n', f'#PBS -N {job_id}\n', '#PBS -l nodes=1:ppn=8\n', '#PBS -l walltime=10:00:00\n', '#PBS -l mem=64g\n', f'#PBS -o {out_dir}/{job_id}_' '${PBS_ARRAYID}.log\n', f'#PBS -e {out_dir}/{job_id}_' '${PBS_ARRAYID}.err\n', '#PBS -t 1-2%8\n', f'cd {out_dir}\n', f'{self.environment}\n', 'date\n', 'hostname\n', 'offset=${PBS_ARRAYID}\n', 'step=$(( $offset - 0 ))\n', 'if [[ $step -gt 2 ]]; then exit 0; fi\n', f'args0=$(head -n $step {out_dir}/{job_id}.array-details' ' | tail -n 1)\n', "infile0=$(echo -e $args0 | awk '{ print $1 }')\n", "outfile0=$(echo -e $args0 | awk '{ print $2 }')\n", 'set -e\n', 'cat $infile0*.fastq.gz > $outfile0.fastq.gz; bowtie2 -p 8 -x ' f'{database} -q $outfile0.fastq.gz -S $outfile0.sam --seed 42 ' '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" ' '--score-min "L,0,-0.05" --no-head --no-unal; woltka classify ' '-i $outfile0.sam -o $outfile0.woltka-taxa --no-demux ' f'--lineage {database}.tax --rank phylum,genus,species,free,none; ' f'woltka classify -i $outfile0.sam -c {database}.coords ' '-o $outfile0.woltka-per-gene --no-demux; xz -9 -T8 -c ' '$outfile0.sam > $outfile0.xz\n', 'set +e\n', 'date\n' ] self.assertEqual(main_qsub, exp_main_qsub) exp_merge_qsub = [ '#!/bin/bash\n', '#PBS -M [email protected]\n', f'#PBS -N merge-{job_id}\n', '#PBS -l nodes=1:ppn=6\n', '#PBS -l walltime=4:00:00\n', '#PBS -l mem=48g\n', f'#PBS -o {out_dir}/merge-{job_id}.log\n', f'#PBS -e {out_dir}/merge-{job_id}.err\n', f'cd {out_dir}\n', f'{self.environment}\n', 'date\n', 'hostname\n', 'set -e\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'phylum --glob "*.woltka-taxa/phylum.biom" &\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'genus --glob "*.woltka-taxa/genus.biom" &\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'species --glob "*.woltka-taxa/species.biom" &\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'free --glob "*.woltka-taxa/free.biom" &\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'none --glob "*.woltka-taxa/none.biom" &\n', f'woltka_merge --prep {prep_file} --base {out_dir} --name ' 'per-gene --glob "*.woltka-per-gene" --rename &\n', 'wait\n', f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz\n', f'finish_woltka {url} {job_id} {out_dir}\n', 'date\n' ] self.assertEqual(merge_qsub, exp_merge_qsub) # now let's 
    # now let's test that it finished correctly
    sdir = 'qp_woltka/support_files/'
    copyfile(f'{sdir}/genus.biom', f'{out_dir}/genus.biom')
    copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
    copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
    copyfile(f'{sdir}/species.biom', f'{out_dir}/species.biom')
    copyfile(f'{sdir}/phylum.biom', f'{out_dir}/phylum.biom')
    copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
    copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
    success, ainfo, msg = woltka(self.qclient, job_id, self.params, out_dir)

    self.assertEqual("", msg)
    self.assertTrue(success)

    exp = [
        ArtifactInfo('Alignment Profile', 'BIOM',
                     [(f'{out_dir}/free.biom', 'biom'),
                      (f'{out_dir}/alignment.tar', 'log')]),
        ArtifactInfo('Taxonomic Predictions - phylum', 'BIOM',
                     [(f'{out_dir}/phylum.biom', 'biom')]),
        ArtifactInfo('Taxonomic Predictions - genus', 'BIOM',
                     [(f'{out_dir}/genus.biom', 'biom')]),
        ArtifactInfo('Taxonomic Predictions - species', 'BIOM',
                     [(f'{out_dir}/species.biom', 'biom')]),
        ArtifactInfo('Per genome Predictions', 'BIOM',
                     [(f'{out_dir}/none.biom', 'biom')]),
        ArtifactInfo('Per gene Predictions', 'BIOM',
                     [(f'{out_dir}/per-gene.biom', 'biom')])]
    self.assertCountEqual(ainfo, exp)

    # check that the produced tables have feature taxonomy
    bt = load_table(f'{out_dir}/phylum.biom')
    self.assertCountEqual(
        bt.metadata_to_dataframe('observation').columns,
        ['taxonomy_0', 'taxonomy_1'])
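
# Hypothetical helper (not part of the test suite) showing how the taxonomy
# metadata asserted above can be inspected interactively; the file path is
# an assumption.
def _example_inspect_taxonomy(biom_fp='phylum.biom'):
    from biom import load_table

    bt = load_table(biom_fp)
    # observation metadata is exposed as a DataFrame whose taxonomy levels
    # appear as taxonomy_0, taxonomy_1, ... columns
    return bt.metadata_to_dataframe('observation').columns.tolist()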
def _validate_per_sample_FASTQ(qclient, job_id, prep_info, files, test=False):
    """Validate and fix a new 'per_sample_FASTQ' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    test : boolean, optional
        If True this is being called by a test

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id,
                            "Step 2: Validating 'per_sample_FASTQ' files")

    samples = list(prep_info.keys())
    samples_count = len(samples)

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - {'raw_forward_seqs',
                                         'raw_reverse_seqs',
                                         'preprocessed_fastq'}
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type per_sample_FASTQ. Supported filepath types: "
                     "raw_forward_seqs, raw_reverse_seqs, preprocessed_fastq"
                     % ', '.join(unsupported_fp_types))
        return False, None, error_msg

    if 'raw_forward_seqs' in files:
        if 'preprocessed_fastq' in files:
            error_msg = ("If raw_forward_seqs is provided, "
                         "preprocessed_fastq should not be provided")
            return False, None, error_msg
        read_files = files['raw_forward_seqs']
        read_files_count = len(read_files)
        counts_match = read_files_count == samples_count
    elif 'preprocessed_fastq' in files:
        if 'raw_reverse_seqs' in files:
            error_msg = ("If preprocessed_fastq is provided, "
                         "raw_reverse_seqs should not be provided")
            return False, None, error_msg
        read_files = files['preprocessed_fastq']
        read_files_count = len(read_files)
        # In the preprocessed_fastq case, we either have 1 file per sample
        # or 4 files per sample
        counts_match = ((read_files_count == samples_count) or
                        (read_files_count == 4 * samples_count))
    else:
        error_msg = ("Missing required filepath type: raw_forward_seqs or "
                     "preprocessed_fastq")
        return False, None, error_msg

    # Make sure that we have the same number of files as samples
    if 'raw_reverse_seqs' in files:
        rev_count = len(files['raw_reverse_seqs'])
        counts_match = counts_match and (rev_count == samples_count)
    else:
        rev_count = 0

    if not counts_match:
        error_msg = ("The number of provided files doesn't match the "
                     "number of samples (%d): %d raw_forward_seqs, "
                     "%d raw_reverse_seqs (optional, 0 is ok)"
                     % (samples_count, read_files_count, rev_count))
        return False, None, error_msg

    def _check_files(run_prefixes, read_files, rev_count, files):
        # Check that the provided files match the run prefixes
        fwd_fail = [basename(fp) for fp in read_files
                    if not basename(fp).startswith(tuple(run_prefixes))]
        if rev_count > 0:
            rev_fail = [basename(fp) for fp in files['raw_reverse_seqs']
                        if not basename(fp).startswith(tuple(run_prefixes))]
        else:
            rev_fail = []
        return fwd_fail, rev_fail

    # first let's check via sample names
    run_prefixes = [sid.split('.', 1)[1] for sid in samples]
    fwd_fail, rev_fail = _check_files(run_prefixes, read_files, rev_count,
                                      files)

    # if that doesn't work, let's test via run_prefix
    run_prefix_present = 'run_prefix' in prep_info[samples[0]]
    if (fwd_fail or rev_fail) and run_prefix_present:
        run_prefixes = [v['run_prefix'] for k, v in prep_info.items()]
        if samples_count != len(set(run_prefixes)):
            repeated = ["%s (%d)" % (p, run_prefixes.count(p))
                        for p in set(run_prefixes)
                        if run_prefixes.count(p) > 1]
            error_msg = ("The values for the column 'run_prefix' are not "
                         "unique for each sample. Repeated values: %s"
                         % ', '.join(repeated))
Repeated values: %s" % ', '.join(repeated)) return False, None, error_msg fwd_fail, rev_fail = _check_files(run_prefixes, read_files, rev_count, files) if fwd_fail or rev_fail: error_msg = "The provided files are not prefixed by sample id" if run_prefix_present: error_msg += (" or do not match the run prefix values in the " "prep information.") else: error_msg += "." error_msg += (" Offending files:\n raw_forward_seqs: %s\n" "raw_reverse_seqs: %s" % (', '.join(fwd_fail), ', '.join(rev_fail))) return False, None, error_msg filepaths = [] empty_files = [] for fps_type, fps in files.items(): for fp in fps: try: fp_size = getsize(fp) except OSError: fp_size = 0 # 62 is the size of a gzip empty files that we generate if fp_size <= 100: empty_files.append(basename(fp)) if fps_type in MUST_GZ: fp, error_msg = _gzip_file(fp, test) if error_msg is not None: return False, None, error_msg filepaths.append((fp, fps_type)) if empty_files: error_msg = "Some of the files are empty: %s" % ', '.join(empty_files) return False, None, error_msg return True, [ArtifactInfo(None, 'per_sample_FASTQ', filepaths)], ""
def shogun(qclient, job_id, parameters, out_dir):
    """Run Shogun with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run Shogun
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # Step 1 get the rest of the information needed to run Shogun
    qclient.update_job_step(job_id, "Step 1 of 5: Collecting information")
    artifact_id = parameters['input']
    del parameters['input']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    qiime_map = prep_info['qiime-map']

    # Step 2 converting to fna
    qclient.update_job_step(job_id,
                            "Step 2 of 5: Converting to FNA for Shogun")

    with TemporaryDirectory(dir=out_dir, prefix='shogun_') as temp_dir:
        rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
        samples = make_read_pairs_per_sample(fps['raw_forward_seqs'], rs,
                                             qiime_map)

        # Combining files
        comb_fp = generate_fna_file(temp_dir, samples)

        # Formatting parameters
        parameters = _format_params(parameters, SHOGUN_PARAMS)

        # Step 3 align
        align_cmd = generate_shogun_align_commands(comb_fp, temp_dir,
                                                   parameters)
        sys_msg = "Step 3 of 5: Aligning FNA with Shogun (%d/{0})".format(
            len(align_cmd))
        success, msg = _run_commands(qclient, job_id, align_cmd, sys_msg,
                                     'Shogun Align')
        if not success:
            return False, None, msg

        # Step 4 taxonomic profile
        assign_cmd, profile_fp = generate_shogun_assign_taxonomy_commands(
            temp_dir, parameters)
        sys_msg = ("Step 4 of 5: Taxonomic profile with Shogun "
                   "(%d/{0})").format(len(assign_cmd))
        success, msg = _run_commands(qclient, job_id, assign_cmd, sys_msg,
                                     'Shogun taxonomy assignment')
        if not success:
            return False, None, msg

        # Step 5 convert the taxonomic profile to BIOM
        sys_msg = "Step 5 of 5: Converting output to BIOM"
        qclient.update_job_step(job_id, sys_msg)
        output = run_shogun_to_biom(profile_fp, [None, None, None, True],
                                    out_dir, 'profile')

    ainfo = [ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                          [(output, 'biom')])]

    return True, ainfo, ""
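
# Small sketch of the progress-message convention used above: the total
# number of commands is baked in via str.format, leaving a %d placeholder
# that the runner (assumed to be _run_commands) fills in per command.
def _example_progress_messages(commands):
    sys_msg = "Aligning FNA with Shogun (%d/{0})".format(len(commands))
    # e.g. for three commands this yields "(1/3)", "(2/3)", "(3/3)"
    return [sys_msg % (i + 1) for i in range(len(commands))]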
def _validate_multiple(qclient, job_id, prep_info, files, atype, test=False):
    """Validate and fix a new 'SFF', 'FASTQ', 'FASTA' or 'FASTA_Sanger'
    artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    atype : str
        The type of the artifact
    test : boolean, optional
        If True this is being called by a test

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating '%s' files" % atype)
    req_fp_types, opt_fp_types = FILEPATH_TYPE_DICT[atype]
    all_fp_types = req_fp_types | opt_fp_types

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - all_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type %s. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types), atype,
                        ', '.join(sorted(all_fp_types))))
        return False, None, error_msg

    # Check if the run_prefix column is present in the prep info
    offending = {}
    types_seen = set()
    if 'run_prefix' in prep_info[next(iter(prep_info))]:
        # We can potentially have more than one lane in the prep information
        # so check that the provided files are prefixed with the values in
        # the run_prefix column
        run_prefixes = set(v['run_prefix'] for k, v in prep_info.items())
        num_prefixes = len(run_prefixes)

        # Check those filepath types that are required
        for ftype, t_files in files.items():
            # SFF is a special case because we can have multiple files with
            # the same prefix
            if num_prefixes != len(t_files) and atype != 'SFF':
                offending[ftype] = (
                    "The number of provided files (%d) doesn't match the "
                    "number of run prefix values in the prep info (%d): %s"
                    % (len(t_files), num_prefixes,
                       ', '.join(basename(f) for f in t_files)))
            else:
                rps = []
                fps = []
                for fp in t_files:
                    bn = basename(fp)
                    found = [rp for rp in run_prefixes if bn.startswith(rp)]
                    if found:
                        rps.extend(found)
                    else:
                        fps.append(bn)
                if fps:
                    offending[ftype] = (
                        "The provided files do not match the run prefix "
                        "values in the prep information: %s"
                        % ', '.join(fps))
                else:
                    rps = run_prefixes - set(rps)
                    if rps:
                        offending[ftype] = (
                            "The following run prefixes in the prep "
                            "information file do not match any file: %s"
                            % ', '.join(rps))

            types_seen.add(ftype)
    else:
        # If the run prefix column is not provided, we only allow a single
        # lane, so check that we have a single file for each provided
        # filepath type
        for ftype, t_files in files.items():
            if len(t_files) != 1:
                offending[ftype] = (
                    "Only one file per type is allowed. Please provide the "
                    "column 'run_prefix' if you need more than one file per "
                    "type: %s" % ', '.join(basename(fp) for fp in t_files))

            types_seen.add(ftype)

    # Check that all required filepath types were present
    missing = req_fp_types - types_seen
    if missing:
        error_msg = ("Missing required filepath type(s): %s"
                     % ', '.join(missing))
        return False, None, error_msg

    # Check if there was any offending file
    if offending:
        error_list = ["%s: %s" % (k, v) for k, v in offending.items()]
        error_msg = ("Error creating artifact. Offending files:\n%s"
                     % '\n'.join(error_list))
        return False, None, error_msg

    # Everything is ok
    filepaths = []
    for fps_type, fps in files.items():
        for fp in fps:
            if fps_type in MUST_GZ:
                fp, error_msg = _gzip_file(fp, test)
                if error_msg is not None:
                    return False, None, error_msg
            filepaths.append((fp, fps_type))

    # let's count sequences; this is basically the last check
    errors = []
    artifact_information = []
    if atype not in FILEPATH_TYPE_NO_FQTOOLS:
        for fp, fpt in filepaths:
            cmd = f'fqtools count {fp}'
            std_out, std_err, return_value = system_call(cmd)
            fn = basename(fp)
            if std_err or return_value != 0:
                errors.append(f'{fn}: {std_err}')
            else:
                reads = int(std_out)
                artifact_information.append(
                    {'filename': fn, 'reads': reads, 'file_type': fpt})
        if errors:
            raise ValueError('Found errors: \n %s' % ''.join(errors))
        dname = dirname(fp)
        pd.DataFrame(artifact_information).to_csv(
            f'{dname}/qtp-sequencing-validate-data.csv', index=False)

    return True, [ArtifactInfo(None, atype, filepaths)], ""
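
# Stand-alone sketch of the read-counting step above: `fqtools count` prints
# the number of reads in a FASTQ file to stdout. subprocess replaces
# qiita_client's system_call here only to keep the example self-contained.
def _example_count_reads(fastq_fp):
    import subprocess

    result = subprocess.run(['fqtools', 'count', fastq_fp],
                            capture_output=True, text=True)
    if result.returncode != 0:
        raise ValueError('fqtools count failed: %s' % result.stderr)
    return int(result.stdout)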
def _validate_per_sample_FASTQ(qclient, job_id, prep_info, files):
    """Validate and fix a new 'per_sample_FASTQ' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id,
                            "Step 2: Validating 'per_sample_FASTQ' files")

    samples = list(prep_info.keys())
    samples_count = len(samples)

    # Check if there is any filepath type that is not supported
    unsupported_fp_types = set(files) - {'raw_forward_seqs',
                                         'raw_reverse_seqs',
                                         'preprocessed_fastq'}
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact "
                     "type per_sample_FASTQ. Supported filepath types: "
                     "raw_forward_seqs, raw_reverse_seqs, preprocessed_fastq"
                     % ', '.join(unsupported_fp_types))
        return False, None, error_msg

    if 'raw_forward_seqs' in files:
        if 'preprocessed_fastq' in files:
            error_msg = ("If raw_forward_seqs is provided, "
                         "preprocessed_fastq should not be provided")
            return False, None, error_msg
        read_files = files['raw_forward_seqs']
        read_files_count = len(read_files)
        counts_match = read_files_count == samples_count
    elif 'preprocessed_fastq' in files:
        if 'raw_reverse_seqs' in files:
            error_msg = ("If preprocessed_fastq is provided, "
                         "raw_reverse_seqs should not be provided")
            return False, None, error_msg
        read_files = files['preprocessed_fastq']
        read_files_count = len(read_files)
        # In the preprocessed_fastq case, we either have 1 file per sample
        # or 4 files per sample
        counts_match = ((read_files_count == samples_count) or
                        (read_files_count == 4 * samples_count))
    else:
        error_msg = ("Missing required filepath type: raw_forward_seqs or "
                     "preprocessed_fastq")
        return False, None, error_msg

    # Make sure that we have the same number of files as samples
    if 'raw_reverse_seqs' in files:
        rev_count = len(files['raw_reverse_seqs'])
        counts_match = counts_match and (rev_count == samples_count)
    else:
        rev_count = 0

    if not counts_match:
        error_msg = ("The number of provided files doesn't match the "
                     "number of samples (%d): %d raw_forward_seqs, "
                     "%d raw_reverse_seqs (optional, 0 is ok)"
                     % (samples_count, read_files_count, rev_count))
        return False, None, error_msg

    if 'run_prefix' in prep_info[samples[0]]:
        # The column 'run_prefix' is present in the prep information.
        # Make sure that we have the same number of run_prefix values
        # as samples
        run_prefixes = [v['run_prefix'] for k, v in prep_info.items()]
        if samples_count != len(set(run_prefixes)):
            repeated = ["%s (%d)" % (p, run_prefixes.count(p))
                        for p in set(run_prefixes)
                        if run_prefixes.count(p) > 1]
            error_msg = ("The values for the column 'run_prefix' are not "
                         "unique for each sample. Repeated values: %s"
                         % ', '.join(repeated))
            return False, None, error_msg

        error_msg = ("The provided files do not match the run prefix values "
                     "in the prep information. Offending files: "
                     "raw_forward_seqs: %s, raw_reverse_seqs: %s")
    else:
        # The column 'run_prefix' is not in the prep template. In this case,
        # check that the files are prefixed by the sample ids without the
        # study id
        run_prefixes = [sid.split('.', 1)[1] for sid in samples]
        error_msg = ("The provided files are not prefixed by sample id. "
                     "Please provide the 'run_prefix' column in your prep "
                     "information. Offending files: raw_forward_seqs: %s, "
                     "raw_reverse_seqs: %s")
    # Check that the provided files match the run prefixes
    fwd_fail = [basename(fp) for fp in read_files
                if not basename(fp).startswith(tuple(run_prefixes))]
    if rev_count > 0:
        rev_fail = [basename(fp) for fp in files['raw_reverse_seqs']
                    if not basename(fp).startswith(tuple(run_prefixes))]
    else:
        rev_fail = []

    if fwd_fail or rev_fail:
        error_msg = error_msg % (', '.join(fwd_fail), ', '.join(rev_fail))
        return False, None, error_msg

    filepaths = []
    for fps_type, fps in files.items():
        filepaths.extend([(fp, fps_type) for fp in fps])

    return True, [ArtifactInfo(None, 'per_sample_FASTQ', filepaths)], ""
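
# Minimal sketch (file names and prefixes are made up) of the prefix check
# above: str.startswith accepts a tuple, so a single call tests a file name
# against every allowed run prefix at once.
def _example_prefix_check():
    run_prefixes = ('S22205_S104', 'S22282_S102')
    files = ['S22205_S104_L001_R1.fastq.gz', 'unrelated.fastq.gz']
    offending = [f for f in files if not f.startswith(run_prefixes)]
    return offending  # ['unrelated.fastq.gz']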