def main(): """Program """ parser = get_parser() args = parser.parse_args() infile = args.infile outfile = args.outfile dcc_mode = args.dcc_mode conn = Connection(dcc_mode=dcc_mode) fh = open(infile, 'r') fout = open(outfile, 'w') for line in fh: rec = line.strip("\n").split("\t")[0] if not rec or rec.startswith("#"): fout.write(line) continue rec = conn.get(rec_ids=rec, ignore404=False) aliases = rec["aliases"] for a in aliases: line = [line.strip("\n")] outline = line.extend(aliases) fout.write("\t".join(line) + "\n") fout.close() fh.close()
def main(): """Program """ EXP_PROFILE_ID = "experiment" FILE_PROFILE_ID = "file" VALID_PROFILES = [EXP_PROFILE_ID, FILE_PROFILE_ID] parser = get_parser() args = parser.parse_args() infile = args.infile outfile = args.outfile dcc_mode = args.dcc_mode conn = Connection(dcc_mode) fh = open(infile, 'r') fout = open(outfile, 'w') for line in fh: rec_id = line.strip("\n").split("\t")[0] if not rec_id or rec_id.startswith("#"): continue rec = conn.get(rec_id, ignore404=False) profile = conn.profiles.get_profile_from_id(rec["@id"]) profile_id = profile.name if profile_id not in VALID_PROFILES: raise Exception( "Record identifier '{}' must be an identifer for an object of a type in the set {}." .format(rec_id, VALID_PROFILES)) if profile_id == EXP_PROFILE_ID: # List of FASTQ file objects in JSON format. fastq_recs = conn.get_fastqfiles_on_exp(rec_id) exp_accession = rec["accession"] else: fastq_recs = [conn.get(rec_id, ignore404=False)] exp_accession = fastq_recs[0]["dataset"].split("/")[-1] for fq_rec in fastq_recs: status = fq_rec["status"] error_msg = "" if status == "content error": error_msg = fq_rec["content_error_detail"] fout.write("\t".join([exp_accession, rec_id, error_msg]) + "\n") fout.close() fh.close()
def main(): """Program """ parser = get_parser() args = parser.parse_args() infile = args.infile outfile = args.outfile dcc_mode = args.dcc_mode submitter_lab = args.submitter_lab if not submitter_lab: submitter_lab = encode_utils.LAB_PREFIX.rstrip(":") conn = Connection(dcc_mode=dcc_mode) fh = open(infile, 'r') fout = open(outfile, 'w') for line in fh: alias = line.strip("\n").split("\t")[0] if not alias or alias.startswith("#"): fout.write(line) continue alias_lab_prefix = alias.split(":", 1) try: lab_prefix, alias_name = alias.split(":", 1) except ValueError: if not submitter_lab: raise Exception( "Unknown submitting lab name for alias {}. See description for --submitter-lab argument." .format(alias)) alias = submitter_lab + ":" + alias rec = conn.get(rec_ids=alias, ignore404=False) try: dcc_id = rec["accession"] except KeyError: dcc_id = rec["uuid"] line = [line.strip("\n")] outline = line.append(dcc_id) fout.write("\t".join(line) + "\n") fout.close() fh.close()
def main(): """Program """ parser = get_parser() args = parser.parse_args() infile = args.infile outfile = args.outfile dcc_mode = args.dcc_mode conn = Connection(dcc_mode) fh = open(infile, 'r') fout = open(outfile, 'w') for line in fh: rec_id = line.strip() if not rec_id or rec_id.startswith("#"): continue rec = conn.get(rec_id, ignore404=True) if not rec: print("'{}' not found.".format(rec_id)) fout.write(rec_id + "\n") fout.close() fh.close()
class TestConnection(unittest.TestCase):
    """Tests the ``encode_utils.connection.py`` module."""

    def setUp(self):
        self.conn = Connection(eu.DCC_DEV_MODE, no_log_file=True)

    def test_arbitrary_host(self):
        self.conn = Connection(dcc_mode='test.encodedcc.org', no_log_file=True)

    def test_before_file_post(self):
        """
        Tests the method ``before_post_file()`` for correctly setting the `md5sum` property of a
        file record.
        """
        payload = {
            self.conn.PROFILE_KEY: profiles.Profiles.FILE_PROFILE_ID,
            profiles.Profiles.SUBMITTED_FILE_PROP_NAME: os.path.join(DATA_DIR, "test_fq_40recs.fastq.gz")
        }
        res = self.conn.before_post_file(payload)
        self.assertEqual(res["md5sum"], "a3e7cb3df359d0642ab0edd33ea7e93e")

    def test_get_lookup_ids_from_payload(self):
        """
        Tests the method ``get_lookup_ids_from_payload()`` for returning the correct result when
        given a variety of identifiers (accession, alias, and md5sum).
        """
        accession = "ENCSR502NRF"
        alias = "michael-snyder:SCGPM_SReq-1103_HG7CL_L3_GGCTAC_R1.fastq.gz"
        md5 = "3fef3e25315f105b944691668838b9b5"
        payload = {
            self.conn.ENCID_KEY: accession,
            "aliases": [alias],
            "md5sum": md5
        }
        res = self.conn.get_lookup_ids_from_payload(payload)
        self.assertEqual(sorted(res), sorted([accession, alias, md5]))

    def test_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for returning the correct result when only
        the key ``encode_utils.connection.Connection.PROFILE_KEY`` is set in the payload.
        """
        # Use a valid profile ID that exists as a key in profiles.Profile.PROFILES.
        profile_id = "genetic_modification"
        payload = {}
        payload[self.conn.PROFILE_KEY] = profile_id
        res = self.conn.get_profile_from_payload(payload)
        self.assertEqual(res.name, profile_id)

    def test_2_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for returning the correct result when only
        the key for the `@id` property is set in the payload.
        """
        # Use a valid profile ID that exists as a key in profiles.Profile.PROFILES.
        profile_id = "genetic_modification"
        payload = {}
        payload["@id"] = profile_id
        res = self.conn.get_profile_from_payload(payload)
        self.assertEqual(res.name, profile_id)

    def test_3_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for raising the exception
        ``encode_utils.exceptions.ProfileNotSpecified`` when neither the ``self.PROFILE_KEY`` nor
        the `@id` key is present in the payload.
        """
        payload = {}
        self.assertRaises(ProfileNotSpecified, self.conn.get_profile_from_payload, payload)

    def test_4_get_profile_from_payload(self):
        """
        Tests the method ``get_profile_from_payload()`` for raising the exception
        ``profiles.UnknownProfile`` when an unknown profile is specified in the payload.
        """
        payload = {}
        payload[self.conn.PROFILE_KEY] = "unknown_profile"
        self.assertRaises(profiles.UnknownProfile, self.conn.get_profile_from_payload, payload)

    def test_extract_aws_upload_credentials(self):
        """
        Tests the method ``extract_aws_upload_credentials()`` for extracting the upload
        credentials from a file object's JSON.
        """
        access_key = "access_key"
        secret_key = "secret_key"
        session_token = "session_token"
        upload_url = "upload_url"
        payload = {
            access_key: access_key,
            secret_key: secret_key,
            session_token: session_token,
            upload_url: upload_url
        }
        res = self.conn.extract_aws_upload_credentials(payload)
        aws_creds = {}
        aws_creds["AWS_ACCESS_KEY_ID"] = access_key
        aws_creds["AWS_SECRET_ACCESS_KEY"] = secret_key
        aws_creds["AWS_SESSION_TOKEN"] = session_token
        aws_creds["UPLOAD_URL"] = upload_url
        self.assertEqual(res, aws_creds)

    def test_make_search_url(self):
        """
        Tests the method ``make_search_url()`` for building the correct URL given the query
        arguments to find ChIP-seq assays performed on primary cells from blood.
        """
        query = {
            "assay_title": "ChIP-seq",
            "biosample_type": "primary cell",
            "organ_slims": "blood",
            "type": "Experiment"
        }
        res = self.conn.make_search_url(search_args=query)
        query = "search/?assay_title=ChIP-seq&biosample_type=primary+cell&organ_slims=blood&type=Experiment"
        self.assertEqual(res, os.path.join(self.conn.dcc_mode.url, query))

    def test_get(self):
        res = self.conn.get('experiments/ENCSR502NRF/', frame='object')
        self.assertEqual(res.get('uuid', ""), "e44c59cc-f14a-4722-a9c5-2fe63c2b9533")

    def test_dry_run_enabled(self):
        """
        Tests the method ``check_dry_run()`` for returning True when the ``Connection`` class is
        instantiated in dry-run mode.
        """
        self.conn = Connection(eu.DCC_DEV_MODE, dry_run=True, no_log_file=True)
        self.assertEqual(True, self.conn.check_dry_run())

    def test_bedfile_download(self):
        """
        Tests the method ``download()`` for downloading a tiny BED file record (ENCFF815QOR) of
        size 44 KB into the current directory.
        """
        filepath = self.conn.download(rec_id="ENCFF815QOR", directory=os.getcwd())
        self.assertTrue(os.stat(filepath).st_size > 0)

    def test_doc_download(self):
        """
        Tests the method ``download()`` for downloading a document record (michael-snyder:P-17)
        into the current directory.
        """
        filepath = self.conn.download(rec_id="michael-snyder:P-17", directory=os.getcwd())
        self.assertTrue(os.stat(filepath).st_size > 0)

    def test_autosql_attachment(self):
        """
        Tests the method ``set_attachment()`` for an autosql attachment.
        """
        encoded_uri = self.conn.set_attachment(
            os.path.join(DATA_DIR, "estarr_counts.as"))
        self.assertEqual(encoded_uri['href'], (
            'data:text/autosql;base64,'
            'dGFibGUgZXN0YXJyX2NvdW50cwoiZVNUQVJSIGNvdW50cyIKKApzdHJpbmcgS'
            'UQ7ICJDYW5kaWRhdGUgaWRlbnRpZmllciIKc3RyaW5nIERpcmVjdGlvbjsgIk'
            'Nsb25pbmcgZGlyZWN0aW9uIgp1aW50IFVNSV9jb3VudDsgIlVuaXF1ZSBNb2x'
            'lY3VsYXIgSWRlbnRpZmllciBjb3VudCIKKQ=='))
def main(): """Program """ parser = get_parser() args = parser.parse_args() dcc_mode = args.dcc_mode infile = args.infile protocol_uuid = args.protocol_uuid # connect to DCC conn = Connection(dcc_mode) barplot_description = "Barplot showing the expression of the given gene in the control vs. the treatment. Expression is given in Transcripts Per Million (TPM) and was generated by version 1.2.30 of RSEM's rsem-calculate-expression script." fh = open(infile, 'r') header = fh.readline().strip("\n") if not header.startswith("#"): raise Exception( "First line of input file must be a field-header line starting with a '#'." ) dico = { } # key: library accession, value: {barplot: local_barplot_path, line: line_from_input_file} # store a list of all exp IDs seen in input file so we can later link the # analysis protocol doc to the exp. exp_encids = [] for line in fh: line = line.strip("\n") if not line.strip(): continue line = line.split("\t") dcc_exp_id = line[0].strip() if dcc_exp_id not in exp_encids: exp_encids.append(dcc_exp_id) dcc_rep_id = line[1].strip() rep_json = conn.get(rep_id, ignore404=False) dcc_lib_id = rep_json["library"]["accession"] barplot = line[2].strip() dico[dcc_lib_id] = {"barplot": barplot, "line": line} fh.close() fout = open(OUTPUT_FILE, 'w') fout.write(header + "\tjpeg_dcc_uuid\n") count = 0 for lib_id in dico: # count += 1 barplot = dico[lib_id]["barplot"] download_filename = lib_id + "_relative_knockdown.jpeg" # download_filename is the name the user will get when they downoad the # file from the ENCODE Portal. dcc_uuid = conn.post_document(download_filename=download_filename, document=barplot, document_type="data QA", document_description=barplot_description) line = dico[lib_id]["line"] line.append(dcc_uuid) fout.write("\t".join(line) + "\n") # link document to library conn.link_document(rec_id=lib_id, dcc_document_uuid=dcc_uuid) fout.close() print( "Linking RSEM analysis and plotting protocol document to each experiment" ) for exp in exp_encids: conn.link_document(rec_id=exp, document_id=protocol_uuid)
def main(): """Program """ parser = get_parser() args = parser.parse_args() mode = args.dcc_mode exp_id = args.exp_id conn = Connection(mode) exp_rep_dico = conn.get_fastqfile_replicate_hash(exp_id) exp_json = conn.get(exp_id, ignore404=True) controls = exp_json["possible_controls"] # A list of dicts. # Populate a controls-lookup hash. The keys will be the ctl accessions. Each value will be # the replicates hash (return value of conn.get_fastqfile_replicate_hash(). controls_hash = {} # A dict of dicts. control_bio_rep_counts = [] for c in controls: ctl_accession = c["accession"] controls_hash[ctl_accession] = {} ctl_rep_dico = conn.get_fastqfile_replicate_hash(ctl_accession) controls_hash[ctl_accession]["rep_dico"] = ctl_rep_dico control_bio_rep_counts.append(len(ctl_rep_dico.keys())) # Make sure that all control experiments have the same number of biological replicates. There are # no known rules to apply otherwise. if len(set(control_bio_rep_counts)) != 1: raise Exception( "The controls '{controls}' have different numbers of biological replicates from one another '{rep_nums}'." .format(controls=control_ids, rep_nums=control_bio_rep_counts)) # Make sure that the number of control bio reps equals the number of experiment bio reps: exp_bio_rep_count = len(exp_rep_dico.keys()) if exp_bio_rep_count != control_bio_rep_counts[0]: raise Exception( "The number of experiment replicates '{}' doesn't equal the number of control replicates '{}'." .format(exp_bio_rep_count, control_bio_rep_counts[0])) # Now we'll look at each bio rep on the experiment, in numerical order of # biological_replicate_number from least to greatest. We'll work our way all the down to the # FASTQ files and start populating the File.controlled_by property in the following manner: # # For each control, we'll sort the replicates the same was as we did for the ones on the # experiment, then for the replicate having the same ordinal index, we'll add the FASTQ File # references. sorted_exp_bio_reps = sorted(exp_rep_dico) count = -1 # And now for the nastiest for-loop I've ever written ... this should be cleaned up but the logic # is so rough to implement that it'll be ugly any way we look at it. for b in sorted_exp_bio_reps: # biological_replicate_number count += 1 for t in exp_rep_dico[b]: # technical_replicate_number for read_num in exp_rep_dico[b][t]: for fastq_json in exp_rep_dico[b][t][read_num]: exp_file_acc = fastq_json["accession"] controlled_by = [] for c in controls_hash: ctl_bio_rep_num = sorted( controls_hash[c]["rep_dico"])[count] ctl_tech_reps = controls_hash[c]["rep_dico"][ ctl_bio_rep_num] for ctl_tech_rep_num in ctl_tech_reps: for ctl_encff in ctl_tech_reps[ctl_tech_rep_num][ read_num]: controlled_by.append(ctl_encff["accession"]) conn.patch( { conn.ENCID_KEY: exp_file_acc, "controlled_by": controlled_by }, extend_array_values=False)
class Accession(object):
    """docstring for Accession"""

    def __init__(self, steps, metadata_json, server, lab, award):
        super(Accession, self).__init__()
        self.set_lab_award(lab, award)
        self.analysis = Analysis(metadata_json)
        self.steps_and_params_json = self.file_to_json(steps)
        self.backend = self.analysis.backend
        self.conn = Connection(server)
        self.new_files = []
        self.current_user = self.get_current_user()

    def set_lab_award(self, lab, award):
        global COMMON_METADATA
        COMMON_METADATA['lab'] = lab
        COMMON_METADATA['award'] = award

    def get_current_user(self):
        response = requests.get(self.conn.dcc_url + '/session-properties',
                                auth=self.conn.auth)
        if response.ok:
            user = response.json().get('user')
            if user:
                return user.get('@id')
            raise Exception('Authenticated user not found')
        else:
            raise Exception('Request to portal failed')

    def file_to_json(self, file):
        with open(file) as json_file:
            json_obj = json.load(json_file)
        return json_obj

    def accession_fastqs(self):
        pass

    def wait_for_portal(self):
        pass

    def file_at_portal(self, file):
        self.wait_for_portal()
        md5sum = self.backend.md5sum(file)
        search_param = [('md5sum', md5sum), ('type', 'File')]
        encode_file = self.conn.search(search_param)
        if len(encode_file) > 0:
            return self.conn.get(encode_file[0].get('accession'))

    def raw_fastq_inputs(self, file):
        if not file.task and 'fastqs' in file.filekeys:
            yield file
        if file.task:
            for input_file in file.task.input_files:
                yield from self.raw_fastq_inputs(input_file)

    def raw_files_accessioned(self):
        for file in self.analysis.raw_fastqs:
            if not self.file_at_portal(file.filename):
                return False
        return True

    def accession_file(self, encode_file, gs_file):
        file_exists = self.file_at_portal(gs_file.filename)
        submitted_file_path = {'submitted_file_name': gs_file.filename}
        if not file_exists:
            local_file = self.backend.download(gs_file.filename)[0]
            encode_file['submitted_file_name'] = local_file
            encode_posted_file = self.conn.post(encode_file)
            os.remove(local_file)
            encode_posted_file = self.patch_file(encode_posted_file,
                                                 submitted_file_path)
            self.new_files.append(encode_posted_file)
            return encode_posted_file
        elif (file_exists
              and file_exists.get('status') in ['deleted', 'revoked']):
            encode_file.update(submitted_file_path)
            # Update the file to the current user.
            # TODO: Reverse this when duplicate md5sums are enabled.
            encode_file.update({'submitted_by': self.current_user})
            encode_patched_file = self.patch_file(file_exists, encode_file)
            self.new_files.append(encode_patched_file)
            return encode_patched_file
        return file_exists

    def patch_file(self, encode_file, new_properties):
        new_properties[self.conn.ENCID_KEY] = encode_file.get('accession')
        return self.conn.patch(new_properties, extend_array_values=False)

    def get_or_make_step_run(self, lab_prefix, run_name, step_version, task_name):
        docker_tag = self.analysis.get_tasks(task_name)[0].docker_image.split(':')[1]
        payload = {
            'aliases': ["{}:{}-{}".format(lab_prefix, run_name, docker_tag)],
            'status': 'released',
            'analysis_step_version': step_version
        }
        payload[Connection.PROFILE_KEY] = 'analysis_step_runs'
        print(payload)
        return self.conn.post(payload)

    @property
    def assembly(self):
        assembly = [
            reference for reference in ASSEMBLIES
            if reference in self.analysis.get_tasks('read_genome_tsv')[0]
            .outputs.get('genome', {}).get('ref_fa', '')
        ]
        return assembly[0] if len(assembly) > 0 else ''

    @property
    def lab_pi(self):
        return COMMON_METADATA['lab'].split('/labs/')[1].split('/')[0]

    @property
    def dataset(self):
        return self.file_at_portal(
            self.analysis.raw_fastqs[0].filename).get('dataset')

    def file_from_template(self, file, file_format, output_type, step_run,
                           derived_from, dataset, file_format_type=None):
        file_name = file.filename.split('gs://')[-1].replace('/', '-')
        obj = {
            'status': 'uploading',
            'aliases': ['{}:{}'.format(self.lab_pi, file_name)],
            'file_format': file_format,
            'output_type': output_type,
            'assembly': self.assembly,
            'dataset': dataset,
            'step_run': step_run.get('@id'),
            'derived_from': derived_from,
            'file_size': file.size,
            'md5sum': file.md5sum
        }
        if file_format_type:
            obj['file_format_type'] = file_format_type
        obj[Connection.PROFILE_KEY] = 'file'
        obj.update(COMMON_METADATA)
        return obj

    def get_derived_from_all(self, file, files, inputs=False):
        ancestors = []
        for ancestor in files:
            ancestors.append(
                self.get_derived_from(file,
                                      ancestor.get('derived_from_task'),
                                      ancestor.get('derived_from_filekey'),
                                      ancestor.get('derived_from_output_type'),
                                      ancestor.get('derived_from_inputs')))
        return list(self.flatten(ancestors))

    def flatten(self, nested_list):
        if isinstance(nested_list, str):
            yield nested_list
        if isinstance(nested_list, list):
            for item in nested_list:
                yield from self.flatten(item)

    # Returns a list of accession ids of files on the portal or recently accessioned.
    def get_derived_from(self, file, task_name, filekey, output_type=None, inputs=False):
        derived_from_files = list(
            set(list(self.analysis.search_up(file.task, task_name, filekey, inputs))))
        encode_files = [
            self.file_at_portal(gs_file.filename)
            for gs_file in derived_from_files
        ]
        accessioned_files = encode_files + self.new_files
        accessioned_files = [x for x in accessioned_files if x is not None]
        derived_from_accession_ids = []
        for gs_file in derived_from_files:
            for encode_file in accessioned_files:
                if gs_file.md5sum == encode_file.get('md5sum'):
                    # Optimal peaks can be mistaken for conservative peaks
                    # when their md5sum is the same.
                    if output_type and output_type != encode_file.get('output_type'):
                        continue
                    derived_from_accession_ids.append(encode_file.get('accession'))
        derived_from_accession_ids = list(set(derived_from_accession_ids))
        # Raise an exception when some or all of the derived_from files are missing
        # from the portal.
        if not derived_from_accession_ids:
            raise Exception('Missing all of the derived_from files on the portal')
        if len(derived_from_accession_ids) != len(derived_from_files):
            raise Exception('Missing some of the derived_from files on the portal')
        return [
            '/files/{}/'.format(accession_id)
            for accession_id in derived_from_accession_ids
        ]

    # File object to be accessioned.
    # inputs=True will search for input fastqs in derived_from.
    def make_file_obj(self, file, file_format, output_type, step_run,
                      derived_from_files, file_format_type=None, inputs=False):
        derived_from = self.get_derived_from_all(file, derived_from_files, inputs)
        return self.file_from_template(file, file_format, output_type,
                                       step_run, derived_from, self.dataset,
                                       file_format_type)

    def get_bio_replicate(self, encode_file, string=True):
        replicate = encode_file.get('biological_replicates')[0]
        if string:
            return str(replicate)
        return int(replicate)

    def attach_idr_qc_to(self, encode_file, gs_file):
        # Return early if the qc metric already exists.
        if list(filter(lambda x: 'IDRQualityMetric' in x['@type'],
                       encode_file['quality_metrics'])):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        idr_qc = qc['idr_frip_qc']
        replicate = self.get_bio_replicate(encode_file)
        rep_pr = idr_qc['rep' + replicate + '-pr']
        frip_score = rep_pr['FRiP']
        idr_peaks = qc['ataqc']['rep' + replicate]['IDR peaks'][0]
        step_run = encode_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        qc_object = {}
        qc_object['F1'] = frip_score
        qc_object['N1'] = idr_peaks
        idr_cutoff = self.analysis.metadata['inputs']['atac.idr_thresh']
        # Strongly expects that the plot exists.
        plot_png = next(self.analysis.search_up(gs_file.task, 'idr_pr', 'idr_plot'))
        qc_object.update({
            'step_run': step_run_id,
            'quality_metric_of': [encode_file.get('@id')],
            'IDR_cutoff': idr_cutoff,
            'status': 'released',
            'IDR_plot_rep{}_pr'.format(replicate):
                self.get_attachment(plot_png, 'image/png')
        })
        qc_object.update(COMMON_METADATA)
        qc_object[Connection.PROFILE_KEY] = 'idr-quality-metrics'
        posted_qc = self.conn.post(qc_object, require_aliases=False)
        return posted_qc

    def attach_flagstat_qc_to(self, encode_bam_file, gs_file):
        # Return early if the qc metric already exists.
        if list(filter(lambda x: 'SamtoolsFlagstatsQualityMetric' in x['@type'],
                       encode_bam_file['quality_metrics'])):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        replicate = self.get_bio_replicate(encode_bam_file)
        flagstat_qc = qc['nodup_flagstat_qc']['rep' + replicate]
        for key, value in flagstat_qc.items():
            if '_pct' in key:
                flagstat_qc[key] = '{}%'.format(value)
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        flagstat_qc.update({
            'step_run': step_run_id,
            'quality_metric_of': [encode_bam_file.get('@id')],
            'status': 'released'
        })
        flagstat_qc.update(COMMON_METADATA)
        flagstat_qc[Connection.PROFILE_KEY] = 'samtools-flagstats-quality-metric'
        posted_qc = self.conn.post(flagstat_qc, require_aliases=False)
        return posted_qc

    def attach_cross_correlation_qc_to(self, encode_bam_file, gs_file):
        # Return early if the qc metric already exists.
        if list(filter(lambda x: 'ComplexityXcorrQualityMetric' in x['@type'],
                       encode_bam_file['quality_metrics'])):
            return
        qc = self.backend.read_json(self.analysis.get_files('qc_json')[0])
        plot_pdf = next(self.analysis.search_down(gs_file.task, 'xcor', 'plot_pdf'))
        read_length_file = next(
            self.analysis.search_up(gs_file.task, 'bowtie2', 'read_len_log'))
        read_length = int(self.backend.read_file(read_length_file.filename).decode())
        replicate = self.get_bio_replicate(encode_bam_file)
        xcor_qc = qc['xcor_score']['rep' + replicate]
        pbc_qc = qc['pbc_qc']['rep' + replicate]
        step_run = encode_bam_file.get('step_run')
        if isinstance(step_run, str):
            step_run_id = step_run
        elif isinstance(step_run, dict):
            step_run_id = step_run.get('@id')
        xcor_object = {
            'NRF': pbc_qc['NRF'],
            'PBC1': pbc_qc['PBC1'],
            'PBC2': pbc_qc['PBC2'],
            'NSC': xcor_qc['NSC'],
            'RSC': xcor_qc['RSC'],
            'sample size': xcor_qc['num_reads'],
            'fragment length': xcor_qc['est_frag_len'],
            'quality_metric_of': [encode_bam_file.get('@id')],
            'step_run': step_run_id,
            'paired-end': self.analysis.metadata['inputs']['atac.paired_end'],
            'read length': read_length,
            'status': 'released',
            'cross_correlation_plot': self.get_attachment(plot_pdf, 'application/pdf')
        }
        xcor_object.update(COMMON_METADATA)
        xcor_object[Connection.PROFILE_KEY] = 'complexity-xcorr-quality-metrics'
        posted_qc = self.conn.post(xcor_object, require_aliases=False)
        return posted_qc

    def file_has_qc(self, bam, qc):
        for item in bam['quality_metrics']:
            if item['@type'][0] == qc['@type'][0]:
                return True
        return False

    def get_attachment(self, gs_file, mime_type):
        contents = self.backend.read_file(gs_file.filename)
        # base64-encode and decode to str so the Portal doesn't receive the "b'...'"
        # repr of a bytes object.
        contents = b64encode(contents).decode('ascii')
        obj = {
            'type': mime_type,
            'download': gs_file.filename.split('/')[-1],
            'href': 'data:{};base64,{}'.format(mime_type, contents)
        }
        return obj

    def accession_step(self, single_step_params):
        step_run = self.get_or_make_step_run(
            self.lab_pi,
            single_step_params['dcc_step_run'],
            single_step_params['dcc_step_version'],
            single_step_params['wdl_task_name'])
        accessioned_files = []
        for task in self.analysis.get_tasks(single_step_params['wdl_task_name']):
            for file_params in single_step_params['wdl_files']:
                for wdl_file in [
                        file for file in task.output_files
                        if file_params['filekey'] in file.filekeys
                ]:
                    # Conservative IDR thresholded peaks may have the same md5sum as
                    # the optimal ones.
                    try:
                        obj = self.make_file_obj(
                            wdl_file,
                            file_params['file_format'],
                            file_params['output_type'],
                            step_run,
                            file_params['derived_from_files'],
                            file_format_type=file_params.get('file_format_type'))
                        encode_file = self.accession_file(obj, wdl_file)
                    except Exception as e:
                        if 'Conflict' in str(e) and file_params.get('possible_duplicate'):
                            continue
                        elif 'Missing all of the derived_from' in str(e):
                            continue
                        else:
                            raise
                    # The inputted parameter file assumes that Accession implements the
                    # methods to attach the quality metrics.
                    quality_metrics = file_params.get('quality_metrics', [])
                    for qc in quality_metrics:
                        qc_method = getattr(self, QC_MAP[qc])
                        # Pass the ENCODE file with calculated properties.
                        qc_method(self.conn.get(encode_file.get('accession')), wdl_file)
                    accessioned_files.append(encode_file)
        return accessioned_files

    def accession_steps(self):
        for step in self.steps_and_params_json:
            self.accession_step(step)
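
# A hedged, self-contained sketch (hypothetical helper name and data) of the attachment
# object that get_attachment() above builds: the file contents become a base64 data URI.
from base64 import b64encode

def make_attachment_sketch(filename, contents, mime_type):
    encoded = b64encode(contents).decode("ascii")  # str, not the "b'...'" repr
    return {
        "type": mime_type,
        "download": filename.split("/")[-1],
        "href": "data:{};base64,{}".format(mime_type, encoded),
    }

att = make_attachment_sketch("gs://bucket/xcor_plot.pdf", b"%PDF-1.4", "application/pdf")
assert att["download"] == "xcor_plot.pdf"
assert att["href"] == "data:application/pdf;base64,JVBERi0xLjQ="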