def set_up(self):
    """Validate run options and build self.ACCESSIONS.

    Prints warnings describing the run mode (UPDATE / FORCE / LOGALL),
    then fills self.ACCESSIONS either from self.infile (a file of
    identifiers, or a comma-separated string) or from self.QUERY.
    Exits the program when no identifiers could be collected.
    """
    if self.UPDATE:
        print("WARNING: This run is an UPDATE run objects will be released.")
    else:
        print("Object status will be checked but not changed")
    if self.FORCE:
        print("WARNING: Objects that do not pass audit will be FORCE-released")
    if self.LOGALL:
        print("Logging all statuses")
    if self.infile:
        if os.path.isfile(self.infile):
            # context manager closes the handle (the original leaked it)
            with open(self.infile) as fh:
                self.ACCESSIONS = [line.rstrip("\n") for line in fh]
        else:
            # not a file on disk: treat the value as a comma-separated list
            self.ACCESSIONS = self.infile.split(",")
    elif self.QUERY:
        if "search" in self.QUERY:
            temp = encodedcc.get_ENCODE(self.QUERY, self.connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(self.QUERY, self.connection)]
        if any(temp):
            for obj in temp:
                # prefer accession, then uuid, then @id, then first alias
                if obj.get("accession"):
                    self.ACCESSIONS.append(obj["accession"])
                elif obj.get("uuid"):
                    self.ACCESSIONS.append(obj["uuid"])
                elif obj.get("@id"):
                    self.ACCESSIONS.append(obj["@id"])
                elif obj.get("aliases"):
                    self.ACCESSIONS.append(obj["aliases"][0])
    if len(self.ACCESSIONS) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
def main():
    """Collect identifiers from --object (file or comma-separated list)
    or --query, then GET each one from the ENCODE server."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # context manager closes the handle (the original leaked it)
            with open(args.object) as fh:
                accessions = [line.strip() for line in fh]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # prefer accession, then uuid, then @id, then first alias
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        encodedcc.get_ENCODE(acc, connection)
def main():
    """For each fastq file selected via --infile, --query or --accession,
    stream its reads and print either the read header (--header) or the
    accession plus read length."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        # context manager closes the handle (the original leaked it)
        with open(args.infile) as fh:
            accessions = [line.rstrip("\n") for line in fh]
    elif args.query:
        data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for exp in data:
            files = exp.get("files", [])
            for f in files:
                res = encodedcc.get_ENCODE(f, connection)
                f_type = res.get("file_format", "")
                # only fastq files can be streamed below
                if f_type == "fastq":
                    accessions.append(res["accession"])
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check")
        sys.exit(1)
    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(connection, uri=link):
            if args.header:
                # fastq_read yields bytes; decode before printing
                header = header.decode("UTF-8")
                print(header)
            else:
                sequence = sequence.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)))
def process_link(self, identifier_link, approved_types):
    """Inspect one linked object and recurse into it via get_status()
    when its type is in approved_types and it has not been visited.

    Files that are restricted, or whose analysis_step_version ties them
    only to inactive pipelines, are logged and excluded from release.
    """
    # collection name from the @id path ("/some-type/ID/"), hyphens
    # removed so it can be matched against self.profiles_ref
    item = identifier_link.split("/")[1].replace("-", "")
    subobj = encodedcc.get_ENCODE(identifier_link, self.connection)
    subobjname = subobj["@type"][0]
    restricted_flag = False
    inactive_pipeline_flag = False
    # only known profile types, and only objects not already searched
    if (item in self.profiles_ref) and \
       (identifier_link not in self.searched):
        if (subobjname == 'File'):
            if self.is_restricted(subobj) is True:
                print(subobj['@id'] + ' is restricted, ' +
                      'therefore will not be released')
                restricted_flag = True
                self.searched.append(subobj["@id"])
            if subobj.get('analysis_step_version'):
                # fetch the embedded frame for the pipeline check
                p = self.has_inactive_pipeline(
                    encodedcc.get_ENCODE(identifier_link,
                                         self.connection,
                                         "embedded"))
                if p:
                    print('{} is only associated with inactive pipelines'
                          ' and therefore will not be released: {}'.format(
                              subobj['@id'], p))
                    inactive_pipeline_flag = True
                    self.searched.append(subobj["@id"])
        # expand subobject
        if (subobjname in approved_types) and \
           (restricted_flag is False) and \
           (inactive_pipeline_flag is False):
            self.get_status(
                subobj,
                hi.dictionary_of_lower_levels.get(
                    hi.levels_mapping.get(subobjname)))
def main():
    """Reconcile alternate_accessions across all object types.

    For every profile that declares an alternate_accessions property,
    compares each object's listed alternate accessions against the set
    of accessions actually replaced by it, then clears stale lists and
    patches the corrected set back onto the owning object.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles.keys():
        profile_properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have alternate accessions property
        if profile_properties and profile_properties.get(
                'alternate_accessions'):
            # uuid -> set of accessions this object actually replaced
            uuid_2_alternate_accessions = {}
            objects = encodedcc.get_ENCODE('search/?type=' + object_type,
                                           connection)['@graph']
            for entry in objects:
                if entry.get('alternate_accessions'):
                    replaced_objects_accessions = []
                    for acc in entry.get('alternate_accessions'):
                        replaced_objects_accessions.extend(
                            retreive_list_of_replaced(acc, connection))
                    # mismatch between what is listed and what was replaced
                    if sorted(list(set(
                            replaced_objects_accessions))) != sorted(
                            entry.get('alternate_accessions')):
                        uuid_2_alternate_accessions[entry['uuid']] = \
                            set(replaced_objects_accessions)
            for uuid in uuid_2_alternate_accessions.keys():
                uuid_sets_counter = 0
                # NOTE(review): this inner loop variable shadows the
                # ENC_Key `key` above; harmless since `key` is not used
                # afterwards, but worth renaming.
                for key in uuid_2_alternate_accessions.keys():
                    if uuid_2_alternate_accessions[uuid] <= \
                            uuid_2_alternate_accessions[key]:
                        uuid_sets_counter += 1
                # only patch sets that are not a subset of another set
                if uuid_sets_counter == 1:
                    for acc in list(uuid_2_alternate_accessions[uuid]):
                        to_clean_objects = encodedcc.get_ENCODE(
                            'search/?type=Item&accession=' + acc,
                            connection)['@graph']
                        # clear stale alternate_accessions on old owners
                        for object_to_clean in to_clean_objects:
                            print(object_to_clean['uuid'] +
                                  ' alternate accessions list ' +
                                  str(object_to_clean[
                                      'alternate_accessions']) +
                                  ' is removed')
                            encodedcc.patch_ENCODE(
                                object_to_clean['uuid'],
                                connection,
                                {"alternate_accessions": []})
                    print(uuid + ' is patched with ' +
                          str({"alternate_accessions": list(
                              uuid_2_alternate_accessions[uuid])}))
                    encodedcc.patch_ENCODE(
                        uuid,
                        connection,
                        {"alternate_accessions": list(
                            uuid_2_alternate_accessions[uuid])})
def check_ENCODE(self, idList, connection, otherIdList=[], bothDicts={}):
    """Check each PMID against ENCODE publications and reconcile.

    idList:      PMIDs to look up.
    otherIdList: non-PMID identifiers to try when a PMID is not found.
    bothDicts:   per-PMID extra metadata (categories, published_by, ...).

    NOTE(review): the mutable default arguments ([] / {}) are shared
    across calls — confirm callers always pass their own containers.
    """
    for pmid in idList:
        extraData = bothDicts.get(pmid)
        ENCODEvalue = encodedcc.get_ENCODE(
            "/search/?type=publication&searchTerm=PMID:" + pmid, connection)
        if ENCODEvalue.get("@graph"):
            # publication already present; optionally compare details
            log = "PMID " + pmid + " is listed in ENCODE"
            logger.info('%s' % log)
            uuid = ENCODEvalue.get("@graph")[0].get("uuid")
            if not self.CREATE_ONLY:
                self.compare_entrez_ENCODE(uuid, pmid, connection, extraData)
        else:
            if self.CREATE_ONLY:
                # entrez data was not prefetched in CREATE_ONLY mode
                self.get_entrez([pmid])
            titleEntrez = self.entrezDict[pmid].get("title")
            found = False
            # the PMID may be registered under another identifier;
            # match on the Entrez title
            for otherID in otherIdList:
                titleENCODE = encodedcc.get_ENCODE(
                    "/search/?type=publication&searchTerm=" + otherID,
                    connection)
                if titleENCODE.get("title") == titleEntrez:
                    log = pmid + " is in ENCODE by a different name " + \
                        titleENCODE.get("uuid")
                    logger.warning('%s' % log)
                    self.compare_entrez_ENCODE(titleENCODE.get("uuid"),
                                               pmid, connection, extraData)
                    if self.UPDATE:
                        # record the PMID on the existing publication
                        newIdent = titleENCODE.get("identifiers")
                        newIdent.append("PMID:" + pmid)
                        patch_dict = {"identifiers": newIdent}
                        encodedcc.patch_ENCODE(titleENCODE.get("uuid"),
                                               connection, patch_dict)
                    found = True
            if found is False:
                log = "This publication is not listed in ENCODE " + pmid
                logger.warning('%s' % log)
                if self.CREATE:
                    # not found anywhere: POST a brand-new publication
                    self.POST_COUNT += 1
                    pmidData = self.entrezDict[pmid]
                    log = "POSTing the new object: " + pmid
                    logger.info('%s' % log)
                    post_dict = {
                        "title": pmidData.get("title"),
                        "abstract": pmidData.get("abstract"),
                        "submitted_by": "/users/8b1f8780-b5d6-4fb7-a5a2-ddcec9054288/",
                        "lab": "/labs/encode-consortium/",
                        "award": "/awards/ENCODE/",
                        "categories": extraData.get("categories"),
                        "published_by": extraData.get("published_by"),
                        "date_published": pmidData.get("date_published"),
                        "authors": pmidData.get("authors"),
                        "identifiers": ["PMID:" + pmid],
                        "journal": pmidData.get("journal"),
                        "volume": pmidData.get("volume"),
                        "issue": pmidData.get("issue"),
                        "page": pmidData.get("page"),
                        "status": "published"
                    }
                    if extraData.get("data_used"):
                        post_dict["data_used"] = extraData.get("data_used")
                    encodedcc.new_ENCODE(connection, "publications", post_dict)
def excel_reader(datafile, sheet, update, connection, patchall):
    """Read one sheet of a submission workbook and POST/PATCH each row.

    datafile, sheet: workbook path and sheet name passed to reader().
    update:   when True, rows with no existing object are POSTed.
    patchall: when True, rows matching an existing object are PATCHed
              without prompting; otherwise the user is asked per row.
    Prints a per-sheet summary when done.
    """
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        # add attchments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
        # try each identifier in turn to locate an existing object
        if post_json.get("uuid"):
            temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
        elif post_json.get("aliases"):
            temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]),
                                        connection)
        elif post_json.get("accession"):
            temp = encodedcc.get_ENCODE(post_json["accession"], connection)
        elif post_json.get("@id"):
            temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            # object exists: PATCH (automatically or after confirmation)
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection,
                                           post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print("Object {} already exists. Would you like to patch it instead?".format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection,
                                               post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            # no existing object: POST only in update mode
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
    print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched".format(
        sheet=sheet.upper(), success=success, total=total, error=error,
        patch=patch))
def main():
    """Write output.txt: a tab-separated report of bam files (with their
    library/biosample provenance) for released Graveley-lab ENCODE
    experiments that list possible_controls."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    query = "/search/?type=Experiment&lab.title=Brenton+Graveley%2C+UConn&award.project=ENCODE&status=released&files.file_type=bam"
    data = encodedcc.get_ENCODE(query, connection).get("@graph", [])
    headers = ["File Accession", "Download", "Annotation", "Cell Line",
               "Assembly", "Target", "Experiment Accession",
               "Experiment Aliases", "Control Experiment",
               "Biosample Accession", "Biosample Aliases",
               "Library Accession", "Library Aliases", "Lab",
               "Submitted Name"]
    with open("output.txt", "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, fieldnames=headers, delimiter="\t")
        writer.writeheader()
        for exp in data:
            # only experiments that declare controls are reported
            if exp.get("possible_controls"):
                print("Experiment", exp.get("accession"))
                temp = dict.fromkeys(headers)
                temp["Experiment Accession"] = exp.get("accession")
                temp["Experiment Aliases"] = exp.get("aliases")
                temp["Cell Line"] = exp.get("biosample_term_name")
                temp["Target"] = exp.get("target")
                temp["Control Experiment"] = exp["possible_controls"]
                if exp.get("files"):
                    files = exp["files"]
                else:
                    files = exp["original_files"]
                for f in files:
                    # `file` shadows the builtin; kept as-is here
                    file = encodedcc.get_ENCODE(f, connection)
                    if file.get("file_format", "") == "bam":
                        # this is a bam file and we want it
                        temp["Lab"] = file.get("lab")
                        temp["Annotation"] = file.get("genome_annotation")
                        temp["File Accession"] = file.get("accession")
                        temp["Submitted Name"] = file.get("submitted_file_name")
                        temp["Download"] = connection.server + "/files/" + file["accession"] + "/@@download/" + file["accession"] + ".bam"
                        temp["Assembly"] = file.get("assembly")
                        print("File", file.get("accession"))
                        # walk file -> replicate -> library -> biosample
                        if file.get("replicate"):
                            rep = encodedcc.get_ENCODE(file["replicate"],
                                                       connection)
                            if rep.get("library"):
                                lib = encodedcc.get_ENCODE(rep["library"],
                                                           connection)
                                temp["Library Accession"] = lib.get("accession")
                                temp["Library Aliases"] = lib.get("aliases")
                                print("Library", lib.get("accession"))
                                if lib.get("biosample"):
                                    bio = encodedcc.get_ENCODE(
                                        lib["biosample"], connection)
                                    temp["Biosample Accession"] = bio.get("accession")
                                    temp["Biosample Aliases"] = bio.get("aliases")
                                    print("Biosample", bio.get("accession"))
                        # one output row per bam file
                        writer.writerow(temp)
def find_ENCODE_extras(self, communityList, consortiumList, connection):
    '''finds any publications in the ENCODE database that
    are not in the files provided

    Returns (community-only PMIDs, community non-PMID @ids,
             consortium-only PMIDs, consortium non-PMID @ids).
    '''
    # BUG FIX: the original built these URLs with a backslash
    # line-continuation *inside* the string literal, which embedded the
    # next line's indentation spaces into the query URL. Adjacent
    # string literals concatenate cleanly instead.
    community_url = ("/search/?type=publication&status=published"
                     "&published_by=community&field=identifiers&limit=all")
    consortium_url = ("/search/?type=publication&status=published"
                      "&published_by!=community&field=identifiers&limit=all")

    def split_ids(result):
        # Partition publications into PMIDs and non-PMID @ids,
        # ignoring PMCID aliases.
        pmids = []    # identifiers of the form "...PMID:..."
        others = []   # @ids of publications with no PMID yet
        for pub in result:
            for idNum in pub.get("identifiers", []):
                if "PMID:" in idNum:
                    # this is something that has a pubmed ID
                    pmids.append(idNum)
                elif "PMCID:PMC" in idNum:
                    # this is an alternate PMID
                    pass
                else:
                    # no PMID yet; record the @id so it can be
                    # found and patched later
                    others.append(pub.get("@id"))
        return pmids, others

    communityResult = encodedcc.get_ENCODE(community_url,
                                           connection).get("@graph")
    consortiumResult = encodedcc.get_ENCODE(consortium_url,
                                            connection).get("@graph")
    communityPMIDfromENCODE, communityOtherID = split_ids(communityResult)
    community_ENCODE_Only = list(
        set(communityPMIDfromENCODE) - set(communityList))
    consortiumPMIDfromENCODE, consortiumOtherID = split_ids(consortiumResult)
    consortium_ENCODE_Only = list(
        set(consortiumPMIDfromENCODE) - set(consortiumList))
    return community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID
def main():
    """Print a tab-separated table of experiments that are not yet
    released but whose first possible_control already is."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    data = encodedcc.get_ENCODE(args.query, connection).get("@graph")
    print("Experiment\tStatus\tControl\tStatus")
    for exp in data:
        if exp.get("possible_controls"):
            if exp["status"] != "released":
                # only the first listed control is checked
                c = exp["possible_controls"][0]
                control = encodedcc.get_ENCODE(c, connection)
                if control["status"] == "released":
                    print("{}\t{}\t{}\t{}".format(exp["accession"],
                                                  exp["status"],
                                                  control["accession"],
                                                  control["status"]))
# NOTE(review): the unmatched triple-quote below was in the original;
# it presumably opens a commented-out region that continues past this
# chunk — confirm before removing it.
'''
def run_script(self):
    """Drive the release check over every accession from set_up().

    For each accession: fetch the object, log its audit state, walk its
    status tree via get_status(), and — when UPDATE is set and the audit
    passed (or FORCE is set) — hand eligible child objects to
    releasinator().
    """
    # set_up() gets all the command line arguments and validates them
    # also makes the list of accessions to run from
    self.set_up()
    # statuses treated as already good — logged only with LOGALL
    good = ["released", "current", "disabled", "published", "finished",
            "virtual"]
    # statuses that always get a warning
    bad = ["replaced", "revoked", "deleted", "upload failed",
           "archived", "format check failed", "uploading", "error"]
    # object types this script never releases
    ignore = ["User", "AntibodyCharacterization", "Publication",
              "ReferenceEpigenome"]
    for accession in self.ACCESSIONS:
        self.searched = []
        expandedDict = encodedcc.get_ENCODE(accession, self.connection)
        objectStatus = expandedDict.get("status")
        obj = expandedDict["@type"][0]
        # the "page" frame carries the audit block
        audit = encodedcc.get_ENCODE(accession, self.connection,
                                     "page").get("audit", {})
        passAudit = True
        logger.info('%s' % "{}: {} Status: {}".format(obj, accession,
                                                      objectStatus))
        if audit.get("ERROR", ""):
            logger.warning('%s' % "WARNING: Audit status: ERROR")
            passAudit = False
        if audit.get("NOT_COMPLIANT", ""):
            logger.warning('%s' % "WARNING: Audit status: NOT COMPLIANT")
            passAudit = False
        self.statusDict = {}
        self.get_status(expandedDict)
        if self.FORCE:
            # FORCE overrides a failed audit
            passAudit = True
        named = []
        for key in sorted(self.statusDict.keys()):
            name = self.statusDict[key][0]
            status = self.statusDict[key][1]
            if name not in ignore:
                # log each object type header once
                if name not in named:
                    logger.info('%s' % name.upper())
                if status in good:
                    if self.LOGALL:
                        logger.info('%s' % "{} has status {}".format(
                            key, status))
                elif status in bad:
                    logger.warning('%s' % "WARNING: {} has status {}".format(key, status))
                else:
                    # neither good nor bad: candidate for release
                    logger.info('%s' % "{} has status {}".format(key,
                                                                 status))
                    if self.UPDATE:
                        if passAudit:
                            self.releasinator(name, key, status)
                named.append(name)
    print("Data written to file", self.outfile)
def main():
    """Report one field (default: accession) for each object named in a
    file (--infile) or returned by a search (--search), one tab-separated
    "@id<TAB>field" line per object."""
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        default='obList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile. Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--field',
                        default='accession',
                        help="The field to report. Default is accession.")
    args = parser.parse_args()
    myKey = encodedcc.ENC_Key(args.keyfile, args.key)
    myConnect = encodedcc.ENC_Connection(myKey)
    # Get list of objects we are interested in
    objList = get_experiment_list(args.infile, args.search, myConnect)
    # iterate the list directly instead of indexing with range(len(...));
    # `obj_id` avoids shadowing the builtin `id`
    for identifier in objList:
        field = ''
        if identifier != '':
            ob = encodedcc.get_ENCODE(identifier, myConnect)
            obj_id = ob.get('@id')
            if args.field in ob:
                field = str(ob[args.field])
        else:
            obj_id = identifier
        print('\t'.join([obj_id, field]))
def get_char_summary(lot, connection):
    """Tally characterization statuses for one antibody lot.

    Returns a dict with counts of in-progress, passing and failing
    characterizations.
    """
    in_progress_statuses = {"in progress", "pending dcc review"}
    passing_statuses = {"exempt from standards", "compliant"}
    in_progress = 0
    passing = 0
    failing = 0
    antibody = encodedcc.get_ENCODE(lot, connection, frame="embedded")
    for chara in antibody.get("characterizations", []):
        status = chara["status"]
        if status in in_progress_statuses:
            in_progress += 1
        elif status in passing_statuses:
            passing += 1
        else:
            # anything else counts as failing
            failing += 1
    return {
        "number_chars_in_progress": in_progress,
        "number_chars_passing": passing,
        "number_chars_failing": failing,
    }
def single_rep(self, obj):
    '''one control with one replicate in control, multiple replicates in experiment'''
    control_acc = obj["possible_controls"][0]["accession"]
    control_files = encodedcc.get_ENCODE(
        control_acc, self.connection, frame="embedded").get("files", [])
    if not control_files:
        if self.DEBUG:
            print("Control object {} has no files".format(control_acc),
                  file=sys.stderr)
        return
    for control_file in control_files:
        if control_file.get("file_type", "") != "fastq":
            continue
        # experiment fastqs, optionally restricted to those still
        # missing a controlled_by value
        wanted = [
            f["accession"] for f in obj["files"]
            if f.get("file_type", "") == "fastq"
            and (not self.MISSING or not f.get("controlled_by"))
        ]
        for exp_acc in wanted:
            pairing = {
                "ExpAcc": obj["accession"],
                "Method": "Single",
                "ExpFile": exp_acc,
                "ConFile": control_file["accession"],
            }
            self.dataList.append(pairing)
            if self.update:
                self.updater(exp_acc, control_file["accession"])
            if self.DEBUG:
                print("ExpFile: {}, ConFile: {}".format(
                    pairing["ExpFile"], pairing["ConFile"]))
def multi_control(self, obj):
    '''multiple controls, match on biosample'''
    # control biosample accession -> control fastq accession
    con_data = {}
    val = True  # set False if any control lacks files or replicates
    for con in obj["possible_controls"]:
        c = encodedcc.get_ENCODE(con["accession"], self.connection,
                                 frame="embedded")
        if c.get("replicates"):
            for rep in c["replicates"]:
                if c.get("files"):
                    con_bio_acc = rep["library"]["biosample"]["accession"]
                    con_bio_num = rep["biological_replicate_number"]
                    for f in c["files"]:
                        if f.get("file_type", "") == "fastq":
                            # keep the fastq whose biological_replicates
                            # contains this replicate's number
                            con_file_bio_num = f["biological_replicates"]
                            if con_bio_num in con_file_bio_num:
                                con_file_acc = f["accession"]
                                con_data[con_bio_acc] = con_file_acc
                else:
                    if self.DEBUG:
                        print("No files found for control {}".format(
                            con["accession"]), file=sys.stderr)
                    val = False
        else:
            if self.DEBUG:
                print("No replicates found in control {}".format(
                    con["accession"]), file=sys.stderr)
            val = False
    if val:
        # experiment biosample accession -> experiment fastq accession
        exp_data = {}
        for e in obj["replicates"]:
            exp_bio_acc = e["library"]["biosample"]["accession"]
            exp_bio_num = e["biological_replicate_number"]
            for f in obj["files"]:
                if f.get("file_type", "") == "fastq":
                    if not self.MISSING or (self.MISSING and not f.get("controlled_by")):
                        exp_file_bio_num = f["biological_replicates"]
                        if exp_bio_num in exp_file_bio_num:
                            exp_file_acc = f["accession"]
                            exp_data[exp_bio_acc] = exp_file_acc
        # pair experiment and control files sharing a biosample accession
        for key in exp_data.keys():
            if con_data.get(key):
                temp = {
                    "ExpAcc": obj["accession"],
                    "Method": "Biosample",
                    "ExpFile": exp_data[key],
                    "ConFile": con_data[key]
                }
                self.dataList.append(temp)
                if self.update:
                    self.updater(exp_data[key], con_data[key])
                if self.DEBUG:
                    print("Biosample: {}, ExpFile: {}, ConFile: {}".format(
                        key, temp["ExpFile"], temp["ConFile"]))
def get_antibody_approval (antibody, target):
    """Return the status of the antibody_approval for *antibody* whose
    target name equals *target*, or "UNKNOWN" when none matches."""
    # NOTE(review): `connection` is read from module scope here — confirm
    # a global connection object is defined before this is called.
    myConnect = connection
    search = encodedcc.get_ENCODE('search/?searchTerm='+antibody+'&type=antibody_approval', myConnect)
    for approval in search['@graph']:
        if approval['target']['name'] == target:
            # first matching approval wins
            return approval['status']
    return "UNKNOWN"
def multi_rep(self, obj):
    '''one control, with one replicate in control per replicate in experiment'''
    control_files = encodedcc.get_ENCODE(obj["possible_controls"][0]["accession"], self.connection, frame="embedded").get("files", [])
    control_replicates = obj["possible_controls"][0].get("replicates", [])
    exp_data = {}  # experiment fastq info, filled by pair_dict_maker
    con_data = {}  # control fastq info, filled by pair_dict_maker
    # replicate counts must line up one-to-one for this method
    if len(control_replicates) != len(obj["replicates"]):
        if self.DEBUG:
            print("Control has {} replicates and experiment has {} replicates".format(len(control_replicates), len(obj["replicates"])), file=sys.stderr)
        return
    if len(control_files) == 0:
        if self.DEBUG:
            print("Control {} has no files".format(obj["possible_controls"][0]["accession"]), file=sys.stderr)
        return
    for e in obj["files"]:
        if e.get("file_type", "") == "fastq":
            # with MISSING set, only files lacking controlled_by are paired
            if not self.MISSING or (self.MISSING and not e.get("controlled_by")):
                self.pair_dict_maker(exp_data, e)
    for c in control_files:
        if c.get("file_type", "") == "fastq":
            self.pair_dict_maker(con_data, c)
    # NOTE(review): the argument order passed to mini() swaps depending
    # on ignore_runtype — confirm this asymmetry is intentional.
    if self.ignore_runtype:
        self.mini(exp_data, con_data, obj)
    else:
        self.mini(con_data, exp_data, obj)
def single_rep(self, obj):
    '''one control with one replicate in control, multiple replicates in experiment'''
    # embedded frame so the control's files are available inline
    control_files = encodedcc.get_ENCODE(
        obj["possible_controls"][0]["accession"],
        self.connection, frame="embedded").get("files", [])
    if len(control_files) == 0:
        if self.DEBUG:
            print(
                "Control object {} has no files".format(
                    obj["possible_controls"][0]["accession"]),
                file=sys.stderr)
        return
    for c in control_files:
        if c.get("file_type", "") == "fastq":
            # all experiment fastqs (optionally only those missing
            # controlled_by) pair with this one control fastq
            exp_list = []
            for e in obj["files"]:
                if e.get("file_type", "") == "fastq":
                    if not self.MISSING or (self.MISSING and not e.get("controlled_by")):
                        exp_list.append(e["accession"])
            temp = {
                "Exp Accession": obj["accession"],
                "Check type": "Single",
                "Experiment": exp_list,
                "Control": c["accession"]
            }
            # record only when at least one experiment file matched
            if len(exp_list) > 0:
                self.data.append(temp)
                if self.DEBUG:
                    print("experiment files {}".format(temp["Experiment"]))
                    print("control files {}".format(temp["Control"]))
def multi_rep(self, obj, ignore_runtype=False):
    '''one control, with one replicate in control per replicate in experiment

    Matches experiment and control fastqs on a "biorep-pairedend" key;
    with ignore_runtype the paired_end part is collapsed to None.
    '''
    control_files = encodedcc.get_ENCODE(obj["possible_controls"][0]["accession"], self.connection, frame="embedded").get("files", [])
    control_replicates = obj["possible_controls"][0].get("replicates", [])
    exp_data = {}  # experiment file accession -> "biorep-pairedend" key
    con_data = {}  # control file accession -> "biorep-pairedend" key
    # replicate counts must line up one-to-one for this method
    if len(control_replicates) != len(obj["replicates"]):
        if self.DEBUG:
            print("Control has {} replicates and experiment has {} replicates".format(len(control_replicates), len(obj["replicates"])), file=sys.stderr)
        return
    if len(control_files) == 0:
        if self.DEBUG:
            print("Control {} has no files".format(obj["possible_controls"][0]["accession"]), file=sys.stderr)
        return
    for e in obj["files"]:
        if e.get("file_type", "") == "fastq":
            # with MISSING set, only files lacking controlled_by are paired
            if not self.MISSING or (self.MISSING and not e.get("controlled_by")):
                exp_file_bio_num = e.get("biological_replicates")
                exp_file_paired = e.get("paired_end")
                exp_file_acc = e["accession"]
                if ignore_runtype:
                    # collapse the paired/unpaired distinction
                    exp_file_paired = None
                exp_pair = str(exp_file_bio_num[0]) + "-" + str(exp_file_paired)
                exp_data[exp_file_acc] = exp_pair
    for c in control_files:
        if c.get("file_type", "") == "fastq":
            con_file_bio_num = c.get("biological_replicates")
            con_file_paired = c.get("paired_end")
            con_file_acc = c["accession"]
            if ignore_runtype:
                con_file_paired = None
            con_pair = str(con_file_bio_num[0]) + "-" + str(con_file_paired)
            con_data[con_file_acc] = con_pair
    if ignore_runtype:
        # one row per experiment file, listing all matching control files
        for e_key in exp_data.keys():
            con_list = []
            for c_key in con_data.keys():
                if exp_data[e_key] == con_data[c_key]:
                    con_list.append(c_key)
            temp = {"Exp Accession": obj["accession"],
                    "Check type": "Multi-runtype ignored",
                    "Experiment": e_key,
                    "Control": con_list}
            self.data.append(temp)
            if self.DEBUG:
                print("experiment files", e_key)
                print("control files", con_list)
    else:
        # one row per control file, listing all matching experiment files
        for c_key in con_data.keys():
            exp_list = []
            for e_key in exp_data.keys():
                if con_data[c_key] == exp_data[e_key]:
                    exp_list.append(e_key)
            temp = {"Exp Accession": obj["accession"],
                    "Check type": "Multi",
                    "Experiment": exp_list,
                    "Control": c_key}
            if len(exp_list) > 0:
                self.data.append(temp)
                if self.DEBUG:
                    print("experiment files", exp_list)
                    print("control files", c_key)
def get_experiment_list(path, search, connection):
    """Return a list of object identifiers.

    When search == "NULL" the identifiers are read from *path* (one per
    line); otherwise *search* is run against the server and the
    accessions of the results are returned.
    """
    if search == "NULL":
        # context manager closes the handle (the original leaked it);
        # iterating the file object avoids materializing readlines()
        with open(path) as f:
            experiment_list = [line.strip() for line in f]
    else:
        results = encodedcc.get_ENCODE(search, connection, frame='embedded')
        experiment_list = [r['accession'] for r in results['@graph']]
    return experiment_list
def multi_control(self, obj):
    '''multiple controls, match on biosample'''
    # control biosample accession -> control fastq accession
    con_data = {}
    val = True  # set False if any control lacks files or replicates
    for con in obj["possible_controls"]:
        c = encodedcc.get_ENCODE(
            con["accession"], self.connection, frame="embedded")
        if c.get("replicates"):
            for rep in c["replicates"]:
                if c.get("files"):
                    con_bio_acc = rep["library"]["biosample"]["accession"]
                    con_bio_num = rep["biological_replicate_number"]
                    for f in c["files"]:
                        if f.get("file_type", "") == "fastq":
                            # keep the fastq whose biological_replicates
                            # contains this replicate's number
                            con_file_bio_num = f["biological_replicates"]
                            if con_bio_num in con_file_bio_num:
                                con_file_acc = f["accession"]
                                con_data[con_bio_acc] = con_file_acc
                else:
                    if self.DEBUG:
                        print(
                            "No files found for control {}".format(
                                con["accession"]), file=sys.stderr)
                    val = False
        else:
            if self.DEBUG:
                print(
                    "No replicates found in control {}".format(
                        con["accession"]), file=sys.stderr)
            val = False
    if val:
        # experiment biosample accession -> experiment fastq accession
        exp_data = {}
        for e in obj["replicates"]:
            exp_bio_acc = e["library"]["biosample"]["accession"]
            exp_bio_num = e["biological_replicate_number"]
            for f in obj["files"]:
                if f.get("file_type", "") == "fastq":
                    if not self.MISSING or (self.MISSING and not f.get("controlled_by")):
                        exp_file_bio_num = f["biological_replicates"]
                        if exp_bio_num in exp_file_bio_num:
                            exp_file_acc = f["accession"]
                            exp_data[exp_bio_acc] = exp_file_acc
        # pair experiment and control files sharing a biosample accession
        for key in exp_data.keys():
            if con_data.get(key):
                temp = {
                    "Exp Accession": obj["accession"],
                    "Check type": "Biosample",
                    "Experiment": exp_data[key],
                    "Control": con_data[key]
                }
                self.data.append(temp)
                if self.DEBUG:
                    print("Biosample {}: files {}".format(key, temp))
def get_antibody_approval(antibody, target):
    """Return the status of the antibody_approval for *antibody* whose
    target name equals *target*, or "UNKNOWN" when none matches."""
    # NOTE(review): `connection` is read from module scope here — confirm
    # a global connection object is defined before this is called.
    myConnect = connection
    search = encodedcc.get_ENCODE(
        'search/?searchTerm=' + antibody + '&type=antibody_approval',
        myConnect)
    for approval in search['@graph']:
        if approval['target']['name'] == target:
            # first matching approval wins
            return approval['status']
    return "UNKNOWN"
def get_antibody_approval(antibody, target, connection):
    """Return the status of the first antibody_approval matching
    *antibody* and *target*, or "UNKNOWN" when none matches."""
    url = 'search/?searchTerm=' + antibody + '&type=antibody_approval'
    results = encodedcc.get_ENCODE(url, connection, frame='embedded')
    for hit in results['@graph']:
        if hit['target']['name'] == target:
            return hit['status']
    # no approval for this antibody/target combination
    return "UNKNOWN"
def __init__(self, args, connection):
    """Cache command-line options, fetch /profiles/, and precompute the
    profile bookkeeping used by the release walk."""
    # renaming some things so I can be lazy and not pass them around
    self.infile = args.infile
    self.outfile = args.outfile
    self.QUERY = args.query
    self.LOGALL = args.logall
    self.FORCE = args.force
    self.UPDATE = args.update
    self.keysLink = []
    self.PROFILES = {}
    self.ACCESSIONS = []
    self.statusDict = {}
    self.connection = connection
    temp = encodedcc.get_ENCODE("/profiles/", self.connection)
    # profile types skipped entirely
    ignore = ["Lab", "Award", "AntibodyCharacterization", "Platform",
              "Publication", "Organism", "Reference", "AccessKey",
              "User", "Target"]
    self.profilesJSON = []
    self.dontExpand = []
    self.date_released = []
    for profile in temp.keys():
        # get the names of things we DON'T expand
        # these things usually link to other experiments/objects
        if "AnalysisStep" in profile:
            self.dontExpand.append(self.helper(profile))
        elif "QualityMetric" in profile:
            self.dontExpand.append(self.helper(profile))
        elif "Donor" in profile:
            self.dontExpand.append(self.helper(profile))
        elif profile in ignore:
            pass
        else:
            self.profilesJSON.append(profile)
    self.profiles_ref = []
    #print(self.dontExpand)
    for profile in self.profilesJSON:
        #print(profile)
        self.profiles_ref.append(self.helper(profile))
    for item in self.profilesJSON:
        profile = temp[item]
        self.keysLink = []
        # if a key is in this list, it points to a link and will be embedded in the final product
        self.make_profile(profile)
        self.PROFILES[item] = self.keysLink
        # lets get the list of things that actually get a date released
        for value in profile["properties"].keys():
            if value == "date_released":
                self.date_released.append(item)
    #print(self.date_released)
    self.current = []
    self.finished = []
    # profile types whose status enum includes "current" / "finished"
    # NOTE(review): assumes every profile declares
    # properties.status.enum — confirm, else this raises KeyError.
    for item in temp.keys():
        status = temp[item]["properties"]["status"]["enum"]
        if "current" in status:
            self.current.append(item)
        if "finished" in status:
            self.finished.append(item)
def set_up(self):
    '''do some setup for script

    Prints warnings describing the run mode (UPDATE / FORCE / HELA /
    LOGALL), then fills self.ACCESSIONS from self.infile (file or
    comma-separated string) or self.QUERY.  Exits when no identifiers
    could be collected.
    '''
    if self.UPDATE:
        print("WARNING: This run is an " +
              "UPDATE run objects will be released.")
    else:
        print("Object status will be checked but not changed")
    if self.FORCE:
        print("WARNING: Objects that do not " +
              "pass audit will be FORCE-released")
    if self.HELA:
        print(
            'WARNING: Objects associated with HeLa data will be released')
    if self.LOGALL:
        print("Logging all statuses")
    if self.infile:
        if os.path.isfile(self.infile):
            # context manager closes the handle (the original leaked it)
            with open(self.infile) as fh:
                self.ACCESSIONS = [line.rstrip('\n') for line in fh]
        else:
            # not a file on disk: treat the value as a comma-separated list
            self.ACCESSIONS = self.infile.split(",")
    elif self.QUERY:
        if "search" in self.QUERY:
            temp = encodedcc.get_ENCODE(self.QUERY,
                                        self.connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(self.QUERY, self.connection)]
        if any(temp):
            for obj in temp:
                # prefer accession, then uuid, then @id, then first alias
                if obj.get("accession"):
                    self.ACCESSIONS.append(obj["accession"])
                elif obj.get("uuid"):
                    self.ACCESSIONS.append(obj["uuid"])
                elif obj.get("@id"):
                    self.ACCESSIONS.append(obj["@id"])
                elif obj.get("aliases"):
                    self.ACCESSIONS.append(obj["aliases"][0])
    if len(self.ACCESSIONS) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
def updater(self, exp, con):
    '''helper function runs the update step

    Adds *con* to the controlled_by list of experiment file *exp*,
    unless it is already present.
    '''
    existing = encodedcc.get_ENCODE(exp, self.connection).get("controlled_by", [])
    if con in existing:
        # nothing to do; flag the duplicate instead of patching
        print("ERROR: controlled_by for experiment file {} already contains {}".format(exp, con))
        return
    updated = existing + [con]
    print("patching experiment file {} with controlled_by {}".format(exp, con))
    encodedcc.patch_ENCODE(exp, self.connection, {"controlled_by": updated})
def main():
    """Collect identifiers from --query, --infile or --accession and GET
    each object from the ENCODE server."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        accessions = [obj.get("@id") for obj in temp]
    elif args.infile:
        # context manager closes the handle (the original leaked it)
        with open(args.infile) as fh:
            accessions = [line.strip() for line in fh]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        # fetch each object
        encodedcc.get_ENCODE(acc, connection)
def main():
    """Dump selected biosample fields for a set of accessions as a
    tab-separated table on stdout."""
    headers = [
        "accession", "description", "organism", "age_display",
        "life_stage", "sex", "biosample_term_name", "biosample_type",
        "depleted_in_term_name", "phase",
        "subcellular_fraction_term_name", "post_synchronization_time",
        "post_synchronization_time_units", "synchronization",
        "model_organism_mating_status", "treatments", "donor",
        "transfection_type", "talens", "constructs",
        "model_organism_donor_constructs", "rnais", "part_of",
        "pooled_from", "derived_from", "status", "culture_harvest_date",
        "culture_start_date", "date_obtained", "lab", "source", "note",
        "notes", "health_status", "starting_amount",
        "starting_amount_units"
    ]
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for obj in temp:
            accessions.append(obj.get("accession"))
    elif args.infile:
        # context manager closes the handle (the original leaked it)
        with open(args.infile) as fh:
            accessions = [line.strip() for line in fh]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    data = []
    for acc in accessions:
        temp = {}
        obj = encodedcc.get_ENCODE(acc, connection)
        for h in headers:
            x = obj.get(h, "")
            # BUG FIX: the original used `if any(x)`, which raises
            # TypeError for non-iterable values (e.g. a numeric
            # starting_amount); plain truthiness handles strings,
            # lists and numbers alike.
            temp[h] = x if x else ""
        data.append(temp)
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    writer.writeheader()
    for d in data:
        writer.writerow(d)
def find_ENCODE_extras(self, communityList, consortiumList, connection):
    '''finds any publications in the ENCODE database
    that are not in the files provided

    Returns a 4-tuple:
    (community PMIDs only in ENCODE, community entries without a PMID,
     consortium PMIDs only in ENCODE, consortium entries without a PMID).
    '''
    # BUG FIX: the original built these URLs with a backslash line
    # continuation *inside* the string literal, which embedded the source
    # indentation whitespace into the query URL; implicit string literal
    # concatenation keeps the URL clean.
    community_url = ("/search/?type=publication&status=published"
                     "&published_by=community&field=identifiers&limit=all")
    consortium_url = ("/search/?type=publication&status=published"
                      "&published_by!=community&field=identifiers&limit=all")

    def split_identifiers(results):
        # Partition results into PMIDs and, for entries lacking a PMID,
        # their @id; PMCID entries are alternate ids and are skipped.
        pmids = []
        others = []
        for pub in results:
            for idNum in pub.get("identifiers", []):
                if "PMID:" in idNum:
                    # this is something that has a pubmed ID
                    pmids.append(idNum)
                elif "PMCID:PMC" in idNum:
                    pass  # this is an alternate PMID
                else:
                    # no PMID yet; record the @id so it can be PATCHed later
                    others.append(pub.get("@id"))
        return pmids, others

    communityResult = encodedcc.get_ENCODE(community_url, connection).get("@graph")
    consortiumResult = encodedcc.get_ENCODE(consortium_url, connection).get("@graph")
    communityPMIDfromENCODE, communityOtherID = split_identifiers(communityResult)
    consortiumPMIDfromENCODE, consortiumOtherID = split_identifiers(consortiumResult)
    community_ENCODE_Only = list(set(communityPMIDfromENCODE) - set(communityList))
    consortium_ENCODE_Only = list(set(consortiumPMIDfromENCODE) - set(consortiumList))
    return community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID
def main():
    """Dump selected fields for a set of objects as tab-separated rows.

    Accessions come from --query (search result), --infile (one per
    line), or --accession; one TSV row per object is written to stdout
    with one column per header.
    """
    headers = [
        "accession", "description", "organism", "age_display", "life_stage",
        "sex", "biosample_term_name", "biosample_type", "depleted_in_term_name",
        "phase", "subcellular_fraction_term_name", "post_synchronization_time",
        "post_synchronization_time_units", "synchronization",
        "model_organism_mating_status", "treatments", "donor",
        "transfection_type", "talens", "constructs",
        "model_organism_donor_constructs", "rnais", "part_of", "pooled_from",
        "derived_from", "status", "culture_harvest_date", "culture_start_date",
        "date_obtained", "lab", "source", "note", "notes", "health_status",
        "starting_amount", "starting_amount_units"
    ]
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for obj in temp:
            accessions.append(obj.get("accession"))
    elif args.infile:
        accessions = [line.strip() for line in open(args.infile)]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    data = []
    for acc in accessions:
        temp = {}
        obj = encodedcc.get_ENCODE(acc, connection)
        for h in headers:
            x = obj.get(h, "")
            # BUG FIX: the original used any(x), which raises TypeError for
            # non-iterable values (e.g. a numeric starting_amount); plain
            # truthiness gives the same result for strings/lists and also
            # works for numbers.
            temp[h] = x if x else ""
        data.append(temp)
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    writer.writeheader()
    for d in data:
        writer.writerow(d)
def file_manager(key, value, connection, obj_type):
    """Download a protocol document from URL `key`, upload it as an
    ENCODE Document, and patch it onto the object identified by `value`.

    key -- URL of the document to download
    value -- identifier of the object to patch with the document
    obj_type -- "Biosample" patches protocol_documents; anything else
                patches documents
    """
    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    with open(filename, "wb") as outfile:
        outfile.write(r.content)
    # The target-name portion sits on the other side of the dash for
    # biosample filenames.
    if obj_type == "Biosample":
        filepart = filename.split("-")[0]
    else:
        filepart = filename.split("-")[1]
    attach = attachment(filename)
    # Alias is derived from the last two URL path components.
    temp = "_".join(key.split("/")[-2:])
    aliases = ["brenton-graveley:" + temp]
    # If some object already owns this alias, delete it (freeing the
    # alias) before re-uploading.
    if (encodedcc.get_ENCODE(quote(aliases[0]), connection)['status']) != 'error':
        removing_patch = {'status': 'deleted', 'aliases': []}
        print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
        encodedcc.patch_ENCODE(quote(aliases[0]), connection, removing_patch)
    upload = {
        "aliases": aliases,
        "attachment": attach,
        "award": "U54HG007005",
        "document_type": "general protocol",
        "lab": "/labs/brenton-graveley/",
        "status": "released",
        "description": "{obj_type} protocol for {filepart} shRNA followed by RNA-seq".format(
            obj_type=obj_type, filepart=filepart),
    }
    print("Uploading {} as {}".format(filename, aliases[0]))
    encodedcc.new_ENCODE(connection, "Document", upload)
    print("Patching {} with document {}".format(value, aliases[0]))
    if obj_type == "Biosample":
        docs = {"protocol_documents": aliases}
    else:
        docs = {"documents": aliases}
    encodedcc.patch_ENCODE(quote(value), connection, docs)
    # Clean up the local download.
    print("Removing document {}".format(filename))
    subprocess.run(["rm", filename])
'''
def has_audit(self, accession):
    """Return True (after printing and logging a warning) when the object
    carries an ERROR or NOT_COMPLIANT audit; otherwise False."""
    # GET the page frame just for its audit section.
    audit = encodedcc.get_ENCODE(accession, self.connection, 'page').get('audit', {})
    if audit.get('ERROR') is None and audit.get('NOT_COMPLIANT') is None:
        return False
    details = [entries[0]['category'] for entries in audit.values()]
    message = 'WARNING: AUDIT on object: {}. SKIPPING!'.format(details)
    print(message)
    logger.warning(message)
    return True
def get_status(self, obj):
    """Record an object's status and recurse through its links.

    Populates self.statusDict with {@id: [@type, status]} for `obj` and
    for every linked object: links whose collection is in
    self.profiles_ref are expanded recursively, links in self.dontExpand
    only get their status recorded. self.searched guards against
    revisiting objects.
    """
    name = obj["@type"][0]
    self.searched.append(obj["@id"])
    if self.PROFILES.get(name):
        self.statusDict[obj["@id"]] = [name, obj["status"]]
        for key in obj.keys():
            # loop through object properties
            if key in self.PROFILES[name]:
                # if the key is in profiles it's a link
                if type(obj[key]) is list:
                    for link in obj[key]:
                        # e.g. "/experiments/ENCSR.../" -> "experiments"
                        item = link.split("/")[1].replace("-", "")
                        if item in self.profiles_ref and link not in self.searched:
                            # expand subobject
                            subobj = encodedcc.get_ENCODE(link, self.connection)
                            self.get_status(subobj)
                        else:
                            if item in self.dontExpand and link not in self.searched:
                                # this is not one of the links we expand
                                # is it a link we just get status of
                                tempobj = encodedcc.get_ENCODE(link, self.connection)
                                tempname = tempobj["@type"][0]
                                self.searched.append(tempobj["@id"])
                                self.statusDict[tempobj["@id"]] = [tempname, tempobj["status"]]
                else:
                    # single link; assumes a "/collection/id/" string
                    # -- TODO confirm non-list link values are always strings
                    item = obj[key].split("/")[1].replace("-", "")
                    if item in self.profiles_ref and obj[key] not in self.searched:
                        # expand subobject
                        subobj = encodedcc.get_ENCODE(obj[key], self.connection)
                        self.get_status(subobj)
                    else:
                        if item in self.dontExpand and obj[key] not in self.searched:
                            # this is not one of the links we expand
                            # is it a link we just get status of
                            tempobj = encodedcc.get_ENCODE(obj[key], self.connection)
                            tempname = tempobj["@type"][0]
                            self.searched.append(tempobj["@id"])
                            self.statusDict[tempobj["@id"]] = [tempname, tempobj["status"]]
def get_experiment_list(file, search, connection):
    """Return a list of experiment accessions.

    When search == "NULL", identifiers are read from `file` (one per
    line); otherwise `search` is run against the server (limit=all,
    embedded frame) and accessions are taken from the @graph results.
    """
    objList = []
    if search == "NULL":
        # BUG FIX: the original left the file handle open; use a context
        # manager, and strip each line while reading.
        with open(file) as fh:
            objList = [line.strip() for line in fh]
    else:
        # NOTE: renamed the original local 'set', which shadowed the builtin.
        result = encodedcc.get_ENCODE(search + '&limit=all', connection,
                                      frame='embedded')
        objList = [entry['accession'] for entry in result['@graph']]
    return objList
def main():
    """Report one field (default: accession) for each object from the
    input list or search, printed as '<@id>\\t<field value>'."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        default='obList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile. Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--field',
                        default='accession',
                        help="The field to report. Default is accession.")
    args = parser.parse_args()
    DEBUG_ON = args.debug
    auth = encodedcc.ENC_Key(args.keyfile, args.key)
    server = encodedcc.ENC_Connection(auth)
    # Get list of objects we are interested in
    identifiers = get_experiment_list(args.infile, args.search, server)
    for identifier in identifiers:
        value = ''
        if identifier != '':
            record = encodedcc.get_ENCODE(identifier, server)
            obj_id = record.get('@id')
            if args.field in record:
                value = str(record[args.field])
        else:
            obj_id = identifier
        print('\t'.join([obj_id, value]))
def main():
    """Print the read length (and optionally the header) of every read in
    each experiment's fastq files."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            accessions = [entry.strip() for entry in open(args.infile)]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            experiments = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            experiments = [encodedcc.get_ENCODE(args.query, connection)]
        for exp in experiments:
            for f in exp.get("files", []):
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    else:
        print("No accessions to check")
        sys.exit(1)
    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            if args.header:
                print(acc + "\t" + str(len(sequence)) + "\t" + header.decode("UTF-8"))
            else:
                print(acc + "\t" + str(len(sequence.decode("UTF-8"))))
def updater(self, exp, con):
    """Append control file `con` to an experiment file's controlled_by.

    Fetches the current controlled_by list for the experiment file and,
    when `con` is not already present, PATCHes the extended list back.
    """
    existing = encodedcc.get_ENCODE(exp, self.connection).get("controlled_by", [])
    if con in existing:
        print(
            "ERROR: controlled_by for experiment file {} already contains {}"
            .format(exp, con))
        return
    print("patching experiment file {} with controlled_by {}".format(exp, con))
    encodedcc.patch_ENCODE(exp, self.connection,
                           {"controlled_by": existing + [con]})
def file_manager(key, value, connection, obj_type):
    """Download a protocol document from URL `key`, upload it as an
    ENCODE Document, and patch it onto the object identified by `value`.

    key -- URL of the document to download
    value -- identifier of the object to patch with the document
    obj_type -- "Biosample" patches protocol_documents; anything else
                patches documents
    """
    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    with open(filename, "wb") as outfile:
        outfile.write(r.content)
    # The target-name portion sits on the other side of the dash for
    # biosample filenames.
    if obj_type == "Biosample":
        filepart = filename.split("-")[0]
    else:
        filepart = filename.split("-")[1]
    attach = attachment(filename)
    # Alias is derived from the last two URL path components.
    temp = "_".join(key.split("/")[-2:])
    aliases = ["brenton-graveley:" + temp]
    # If some object already owns this alias, delete it (freeing the
    # alias) before re-uploading.
    if (encodedcc.get_ENCODE(quote(aliases[0]), connection)['status']) != 'error':
        removing_patch = {'status': 'deleted', 'aliases': []}
        print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
        encodedcc.patch_ENCODE(quote(aliases[0]), connection, removing_patch)
    upload = {"aliases": aliases,
              "attachment": attach,
              "award": "U54HG007005",
              "document_type": "general protocol",
              "lab": "/labs/brenton-graveley/",
              "status": "released",
              "description": "{obj_type} protocol for {filepart} shRNA followed by RNA-seq".format(obj_type=obj_type, filepart=filepart),
              }
    print("Uploading {} as {}".format(filename, aliases[0]))
    encodedcc.new_ENCODE(connection, "Document", upload)
    print("Patching {} with document {}".format(value, aliases[0]))
    if obj_type == "Biosample":
        docs = {"protocol_documents": aliases}
    else:
        docs = {"documents": aliases}
    encodedcc.patch_ENCODE(quote(value), connection, docs)
    # Clean up the local download.
    print("Removing document {}".format(filename))
    subprocess.run(["rm", filename])
'''
def get_experiment_list(file, search, connection):
    """Return a list of object @ids.

    When `search` is None, identifiers are read from `file` (one per
    line); otherwise the search is executed (page frame) and the @id of
    each @graph entry is collected.
    """
    objList = []
    if search is None:
        # BUG FIX: the original never closed the file handle; use a
        # context manager and strip lines while reading.
        with open(file) as fh:
            objList = [line.strip() for line in fh]
    else:
        col = get_ENCODE(search, connection, frame='page')
        objList = [entry['@id'] for entry in col['@graph']]
    return objList
def retreive_list_of_replaced(object_to_inspect_acc, connection):
    """Collect this accession plus every accession reachable through
    alternate_accessions chains (recursively).

    NOTE(review): only the first search hit's alternates are followed
    before returning -- confirm the exact-accession search is guaranteed
    to return at most one hit.
    """
    to_return_list = [object_to_inspect_acc]
    objects_to_inspect = encodedcc.get_ENCODE(
        'search/?type=Item&accession=' + object_to_inspect_acc,
        connection)['@graph']
    if objects_to_inspect:
        for object_to_inspect in objects_to_inspect:
            if object_to_inspect.get('alternate_accessions'):
                for acc in object_to_inspect.get('alternate_accessions'):
                    # recurse through the replacement chain
                    to_return_list.extend(
                        retreive_list_of_replaced(acc, connection))
                return to_return_list
            else:
                return to_return_list
    else:
        return to_return_list
def make_matrix(rows, columns, headers, queries, basic_query, connection):
    """Print a tab-separated matrix of search-result counts.

    Each cell is an Excel HYPERLINK formula pointing at the composed
    query URL and displaying the total number of matches.
    """
    matrix = {}
    for row in rows:
        cells = [row]
        for col in headers:
            query = basic_query + queries[row] + columns[col]
            res = get_ENCODE(query, connection, frame='object')
            link = connection.server + query
            total = res['total']
            cells.append('=HYPERLINK(' + '"' + link + '",' + repr(total) + ')')
        matrix[row] = cells
        print('\t'.join(cells))
    print(' ')
    print(' ')
def main():
    """Validate sheet names (or the single --type) against the server's
    /profiles/ listing and run excel_reader on each supported sheet."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.type:
        names = [args.type]
    else:
        book = xlrd.open_workbook(args.infile)
        names = book.sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    # BUG FIX: the original assigned supported_collections twice; only the
    # lower-cased version was ever used, so the first assignment was dead.
    supported_collections = [s.lower() for s in profiles.keys()]
    for n in names:
        if n.lower() in supported_collections:
            excel_reader(args.infile, n, args.update, connection, args.patchall)
        else:
            print("Sheet name '{}' not part of supported object types!".format(n),
                  file=sys.stderr)
def replacer(file, connection, update):
    """Mark an old file object as replaced by its re-submitted version.

    Old files carry an alias ending in "_replaced"; the same alias minus
    that suffix identifies the new file. When `update` is set, the old
    file is PATCHed to status "replaced" with the alias recorded in
    alternate_accessions.
    """
    if file.get("aliases"):
        # this has aliases
        if file["aliases"][0].endswith("_replaced"):
            # this is one of the old ones
            # BUG FIX: rstrip("_replaced") strips any trailing run of the
            # characters {_, r, e, p, l, a, c, d} and can eat legitimate
            # alias characters; slice off the exact suffix instead.
            alias = file["aliases"][0][:-len("_replaced")]
            old_acc = file["accession"]
            old_date = file["date_created"]
            print(old_acc)
            new = encodedcc.get_ENCODE(quote(alias), connection)
            new_acc = new["accession"]
            new_date = new["date_created"]
            patch_dict = {"status": "replaced", "alternate_accessions": [alias]}
            #print("file {} with date {} replaces file {} with date {}".format(new_acc, new_date, old_acc, old_date))
            if update:
                encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
    else:
        print("file {} has no aliases".format(file["@id"]))
def process_link(self, identifier_link, approved_types):
    """Expand a linked object and release-check it when its collection is
    known to our profiles and its type is approved at this level;
    restricted files are never released."""
    item = identifier_link.split("/")[1].replace("-", "")
    subobj = encodedcc.get_ENCODE(identifier_link, self.connection)
    subobjname = subobj["@type"][0]
    restricted_flag = (subobjname == 'File') and (self.is_restricted(subobj) is True)
    if restricted_flag:
        print (subobj['@id'] + ' is restricted, ' +
               'therefore will not be released')
    if (item in self.profiles_ref) and (identifier_link not in self.searched):
        # expand subobject
        if (subobjname in approved_types) and (restricted_flag is False):
            self.get_status(
                subobj,
                hi.dictionary_of_lower_levels.get(
                    hi.levels_mapping.get(subobjname)))
def process_link(self, identifier_link, approved_types):
    """Expand a linked object and release-check it when its collection is
    known to our profiles and its type is approved at this level;
    restricted files are never released."""
    item = identifier_link.split("/")[1].replace("-", "")
    subobj = encodedcc.get_ENCODE(identifier_link, self.connection)
    subobjname = subobj["@type"][0]
    restricted_flag = (subobjname == 'File') and (self.is_restricted(subobj) is True)
    if restricted_flag:
        print(subobj['@id'] + ' is restricted, ' +
              'therefore will not be released')
    if (item in self.profiles_ref) and (identifier_link not in self.searched):
        # expand subobject
        if (subobjname in approved_types) and (restricted_flag is False):
            self.get_status(
                subobj,
                hi.dictionary_of_lower_levels.get(
                    hi.levels_mapping.get(subobjname)))
def replacer(file, connection, update):
    """Mark an old file object as replaced by its re-submitted version.

    Old files carry an alias ending in "_replaced"; the same alias minus
    that suffix identifies the new file. When `update` is set, the old
    file is PATCHed to status "replaced" with the alias recorded in
    alternate_accessions.
    """
    if file.get("aliases"):
        # this has aliases
        if file["aliases"][0].endswith("_replaced"):
            # this is one of the old ones
            # BUG FIX: rstrip("_replaced") strips any trailing run of the
            # characters {_, r, e, p, l, a, c, d} and can eat legitimate
            # alias characters; slice off the exact suffix instead.
            alias = file["aliases"][0][:-len("_replaced")]
            old_acc = file["accession"]
            old_date = file["date_created"]
            print(old_acc)
            new = encodedcc.get_ENCODE(quote(alias), connection)
            new_acc = new["accession"]
            new_date = new["date_created"]
            patch_dict = {
                "status": "replaced",
                "alternate_accessions": [alias]
            }
            #print("file {} with date {} replaces file {} with date {}".format(new_acc, new_date, old_acc, old_date))
            if update:
                encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
    else:
        print("file {} has no aliases".format(file["@id"]))
def main():
    """Validate sheet names (or the single --type) against the server's
    /profiles/ listing and run excel_reader on each supported sheet."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        # BUG FIX: the original message contained no {filename} placeholder,
        # so .format(filename=...) was a no-op and the offending filename
        # was never shown.
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    if args.type:
        names = [args.type]
    else:
        book = xlrd.open_workbook(args.infile)
        names = book.sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    # BUG FIX: dropped the redundant first assignment of
    # supported_collections; only the lower-cased list was ever used.
    supported_collections = [s.lower() for s in profiles.keys()]
    for n in names:
        if n.lower() in supported_collections:
            excel_reader(args.infile, n, args.update, connection, args.patchall)
        else:
            print("Sheet name '{name}' not part of supported object types!".format(name=n),
                  file=sys.stderr)
def _get_associated_term_id(self, data_type, data):
    """
    Find biosample_term_id associated with particular object.

    Returns the biosample_term_id of the embedded linked object
    (File -> dataset, Replicate -> experiment, Library -> biosample),
    the object's own biosample_term_id for other types, or None when
    the link / field is missing.
    """
    obj_id = None
    # BUG FIX: initialize so we never hit an UnboundLocalError when the
    # embedded link (dataset/experiment/biosample) is missing.
    biosample_term_id = None
    if data_type == 'File':
        # Get biosample_term_id in file.dataset.
        obj_id = data.get('dataset')
    elif data_type == 'Replicate':
        # Get biosample_term_id in replicate.experiment.
        obj_id = data.get('experiment')
    elif data_type == 'Library':
        # Get biosample_term_id in library.biosample.
        obj_id = data.get('biosample')
    else:
        # For experiments and biosamples.
        biosample_term_id = data.get('biosample_term_id')
    if obj_id is not None:
        # Return biosample_term_id of embedded object.
        biosample_term_id = encodedcc.get_ENCODE(
            obj_id, self.connection).get('biosample_term_id')
    return biosample_term_id
def multi_rep(self, obj):
    '''one control, with one replicate in control
    per replicate in experiment'''
    # Embedded GET of the (single) possible control to reach its files.
    control_files = encodedcc.get_ENCODE(
        obj["possible_controls"][0]["accession"], self.connection,
        frame="embedded").get("files", [])
    control_replicates = obj["possible_controls"][0].get("replicates", [])
    exp_data = {}  # pairing key -> experiment fastq data
    con_data = {}  # pairing key -> control fastq data
    # This strategy only applies when replicate counts match 1:1.
    if len(control_replicates) != len(obj["replicates"]):
        if self.DEBUG:
            print(
                "Control has {} replicates and experiment has {} replicates"
                .format(len(control_replicates), len(obj["replicates"])),
                file=sys.stderr)
        return
    if len(control_files) == 0:
        if self.DEBUG:
            print("Control {} has no files".format(
                obj["possible_controls"][0]["accession"]), file=sys.stderr)
        return
    # Collect fastq files on both sides; with MISSING set, only experiment
    # fastqs that still lack controlled_by are considered.
    for e in obj["files"]:
        if e.get("file_type", "") == "fastq":
            if not self.MISSING or (self.MISSING and not e.get("controlled_by")):
                self.pair_dict_maker(exp_data, e)
    for c in control_files:
        if c.get("file_type", "") == "fastq":
            self.pair_dict_maker(con_data, c)
    # NOTE(review): the argument order passed to self.mini is swapped
    # depending on ignore_runtype -- confirm this asymmetry is intentional.
    if self.ignore_runtype:
        self.mini(exp_data, con_data, obj)
    else:
        self.mini(con_data, exp_data, obj)
def main():
    """Validate sheet names (or the single --type) against the server's
    /profiles/ listing and run excel_reader on each supported sheet."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        # BUG FIX: the original message contained no {filename} placeholder,
        # so .format(filename=...) was a no-op and the offending filename
        # was never shown.
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    if args.type:
        names = [args.type]
    else:
        book = xlrd.open_workbook(args.infile)
        names = book.sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    # BUG FIX: dropped the redundant first assignment of
    # supported_collections; only the lower-cased list was ever used.
    supported_collections = [s.lower() for s in profiles.keys()]
    for n in names:
        if n.lower() in supported_collections:
            excel_reader(args.infile, n, args.update, connection, args.patchall)
        else:
            print("Sheet name '{name}' not part of supported object types!".
                  format(name=n), file=sys.stderr)
def get_char_summary(lot, connection):
    """Tally an antibody lot's characterizations into in-progress,
    passing, and failing counts, returned as a dict."""
    antibody = encodedcc.get_ENCODE(lot, connection, frame="embedded")
    in_progress_states = ("in progress", "pending dcc review")
    passing_states = ("exempt from standards", "compliant")
    counts = {"number_chars_in_progress": 0,
              "number_chars_passing": 0,
              "number_chars_failing": 0}
    for characterization in antibody.get("characterizations", []):
        status = characterization["status"]
        if status in in_progress_states:
            counts["number_chars_in_progress"] += 1
        elif status in passing_states:
            counts["number_chars_passing"] += 1
        else:
            counts["number_chars_failing"] += 1
    return counts
def files(objList, fileCheckedItems, connection):
    """Print one TSV row (columns = fileCheckedItems) per file of every
    experiment in objList, augmenting each file with submitter, lab,
    replicate, flowcell, and species information."""
    for obj in objList:
        exp = encodedcc.get_ENCODE(obj, connection)
        # BUG FIX: the original tested any(exp.get("files")), which raises
        # TypeError when "files" is absent (any(None)); plain truthiness
        # behaves identically for a present list.
        if exp.get("files"):
            expfiles = exp["files"]
        else:
            expfiles = exp["original_files"]
        for f in expfiles:
            fileob = {}
            file = encodedcc.get_ENCODE(f, connection)
            for field in fileCheckedItems:
                fileob[field] = file.get(field)
            fileob["submitted_by"] = encodedcc.get_ENCODE(
                file["submitted_by"], connection)["title"]
            fileob["experiment"] = exp["accession"]
            fileob["experiment-lab"] = encodedcc.get_ENCODE(
                exp["lab"], connection)["name"]
            fileob["biosample"] = exp.get("biosample_term_name", "")
            fileob["flowcell"] = []
            fileob["lane"] = []
            fileob["Uniquely mapped reads number"] = ""
            fileob["biological_replicate"] = ""
            fileob["technical_replicate"] = ""
            fileob["replicate_id"] = ""
            if file.get("file_format", "") == "bam":
                for q in file.get("quality_metrics", []):
                    if "star-quality-metrics" in q:
                        star = encodedcc.get_ENCODE(q, connection)
                        fileob["Uniquely mapped reads number"] = star[
                            "Uniquely mapped reads number"]
            for fcd in file["flowcell_details"]:
                fileob["flowcell"].append(fcd.get("flowcell", ""))
                fileob["lane"].append(fcd.get("lane"))
            # Best-effort expansion of the platform link.
            # BUG FIX: narrowed the original bare except (which also caught
            # KeyboardInterrupt/SystemExit) to Exception.
            try:
                fileob["platform"] = encodedcc.get_ENCODE(
                    fileob["platform"], connection)["title"]
            except Exception:
                fileob["platform"] = None
            # Walk replicate -> library -> biosample -> donor -> organism
            # to find the species; left unset (as in the original) when the
            # chain breaks partway.
            if "replicates" in exp:
                temp_rep = encodedcc.get_ENCODE(exp["replicates"][0], connection)
                if "library" in temp_rep:
                    temp_lib = encodedcc.get_ENCODE(temp_rep["library"], connection)
                    if "biosample" in temp_lib:
                        temp_bio = encodedcc.get_ENCODE(
                            temp_lib["biosample"], connection)
                        if "donor" in temp_bio:
                            temp_don = encodedcc.get_ENCODE(
                                temp_bio["donor"], connection)
                            if "organism" in temp_don:
                                temp_org = encodedcc.get_ENCODE(
                                    temp_don["organism"], connection)
                                fileob["species"] = temp_org["name"]
            else:
                fileob["species"] = ""
            if "replicate" in file:
                rep = encodedcc.get_ENCODE(file["replicate"], connection)
                fileob["biological_replicate"] = rep[
                    "biological_replicate_number"]
                fileob["technical_replicate"] = rep[
                    "technical_replicate_number"]
                fileob["replicate_id"] = rep["uuid"]
                if "library" in rep:
                    library = encodedcc.get_ENCODE(rep["library"], connection)
                    # BUG FIX: .get with a default replaces the original
                    # bare try/except around this lookup.
                    fileob["library_aliases"] = library.get("aliases", "")
                    if "biosample" in library:
                        bio = encodedcc.get_ENCODE(library["biosample"],
                                                   connection)
                        fileob["biosample_aliases"] = bio["aliases"]
            if exp.get("aliases", []):
                fileob["alias"] = exp["aliases"][0]
            else:
                fileob["alias"] = ""
            row = [repr(fileob[j]) for j in fileCheckedItems]
            print('\t'.join(row))
def main():
    """Summarize per-assembly mapping and peak status for a set of
    experiments, then print reports of experiments with unarchived
    files, mismatched states, and missing/partial processing."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    assemblies = ['hg19', 'GRCh38']
    summary = []
    if args.infile is not None and 'ENCSR' in args.infile:
        objList = [args.infile]
    else:
        objList = get_experiment_list(args.infile, args.query, connection)
    for obj_id in objList:
        results = {}
        obj = get_ENCODE(obj_id, connection, frame='page')
        # Get basic info
        reps = get_replicate_count(obj)
        results['rep_count'] = len(reps)
        results['status'] = obj['status']
        results['internal_status'] = obj['internal_status']
        results['award'] = obj['award'].get('rfa')
        results['peaks'] = {}
        results['mapping'] = {}
        results['unarchived_files'] = []
        results['status issues'] = []
        results['accession'] = obj['accession']
        # Get audit counts per level
        for level in ['WARNING', 'ERROR', 'NOT_COMPLIANT', 'INTERNAL_ACTION']:
            if obj['audit'].get(level):
                results[level] = len(obj['audit'].get(level))
        # Get status issues
        actions = obj['audit'].get('INTERNAL_ACTION')
        if actions:
            status_issues = [i for i in actions if i['category'] in [
                'experiment not submitted to GEO',
                'mismatched file status',
                'mismatched status']]
            results['status issues'] = status_issues
        # Inspect files
        # NOTE: the original also built an identical, never-used 'fastqs'
        # list here; it has been dropped.
        good_files = [f for f in obj['files']
                      if f['status'] in ['released', 'in progress']]
        print("There are files in this experiment:", len(obj['files']))
        print("There are good files in this experiment:", len(good_files))
        # look for unarchived processed files from other labs
        processed_files = [f for f in obj['files']
                           if f['file_format'] != 'fastq']
        external_files = [f for f in processed_files
                          if (f['lab']['name'] != 'encode-processing-pipeline')]
        unarchived_files = [f for f in external_files
                            if (f['status'] != 'archived')]
        results['unarchived_files'] = unarchived_files
        for assembly in assemblies:
            replicates = []
            file_list = [f for f in good_files
                         if f.get('assembly') == assembly]
            for rep in reps:
                rep_obj = {'rep': rep}
                file_list_rep = [f for f in file_list
                                 if rep in f.get('biological_replicates')]
                aligns = [f for f in file_list_rep
                          if f.get('output_type') == 'alignments']
                rep_obj['aligns'] = len(aligns)
                raw_aligns = [f for f in file_list_rep
                              if f.get('output_type') == 'unfiltered alignments']
                rep_obj['raws'] = len(raw_aligns)
                replicates.append(rep_obj)
            failing_replicates = [f for f in replicates if f['aligns'] == 0]
            # BUG FIX: the original compared "len(...) is 0" -- identity
            # comparison against an int literal; use == instead.
            if len(failing_replicates) == 0:
                results['mapping'][assembly] = True
            elif len(replicates) == len(failing_replicates):
                # They all fail
                results['mapping'][assembly] = False
            else:
                # Partial: list the replicates that are missing alignments.
                results['mapping'][assembly] = [
                    rep['rep'] for rep in failing_replicates]
            peaks = [f for f in file_list if f.get('output_type') == 'peaks']
            results['peaks'][assembly] = len(peaks) > 0
        summary.append(results)
    unarchived_list = [r for r in summary if len(r['unarchived_files']) > 0]
    print('These experiments have unarchived files', len(unarchived_list))
    for item in unarchived_list:
        print(item['accession'])
    print('')
    print('')
    exps_mismatched_states = [r for r in summary
                              if len(r['status issues']) > 0]
    print('These experiments have mismatched states',
          len(exps_mismatched_states))
    for item in exps_mismatched_states:
        print(item['accession'])
    print('')
    print('')
    exps_missing_hg38_mapping = [r for r in summary
                                 if r['mapping']['GRCh38'] is False]
    print('These experiments are missing GRCh38 mapping for all replicates',
          len(exps_missing_hg38_mapping))
    for item in exps_missing_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_partial_hg38_mapping = [r for r in summary
                                 if r['mapping']['GRCh38'] is not False
                                 and r['mapping']['GRCh38'] is not True]
    print('These experiments are missing GRCh38 mapping for some replicates',
          len(exps_partial_hg38_mapping))
    for item in exps_partial_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['GRCh38'])
    print('')
    print('')
    exps_missing_hg38_peaks = [r for r in summary
                               if r['peaks']['GRCh38'] is False]
    exps_missing_hg38_peaks_but_have_mapping = [
        f for f in exps_missing_hg38_peaks
        if f['peaks']['GRCh38'] is False
        and f not in exps_missing_hg38_mapping
        and f not in exps_partial_hg38_mapping
    ]
    print('These experiments are missing GRCh38 peaks but having all mappings',
          len(exps_missing_hg38_peaks_but_have_mapping))
    # BUG FIX: the original iterated the unfiltered exps_missing_hg38_peaks
    # list here, so the printed rows did not match the printed count.
    for item in exps_missing_hg38_peaks_but_have_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_missing_hg19_mapping = [r for r in summary
                                 if r['mapping']['hg19'] is False]
    print('These experiments are missing hg19 mapping for all replicates',
          len(exps_missing_hg19_mapping))
    for item in exps_missing_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_partial_hg19_mapping = [r for r in summary
                                 if r['mapping']['hg19'] is not False
                                 and r['mapping']['hg19'] is not True]
    print('These experiments are missing hg19 mapping for some replicates',
          len(exps_partial_hg19_mapping))
    for item in exps_partial_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['hg19'])
    print('')
    print('')
    exps_missing_hg19_peaks = [r for r in summary
                               if r['peaks']['hg19'] is False
                               and r not in exps_missing_hg19_mapping
                               and r not in exps_partial_hg19_mapping]
    print('These experiments are missing hg19 peaks',
          len(exps_missing_hg19_peaks))
    for item in exps_missing_hg19_peaks:
        print(item['accession'], item['status'], item['internal_status'],
              'warnings:', item.get('WARNING'))
    print('')
    print('')
def main():
    """Back-fill controlled_by on experiment fastq files, choosing the
    pairing strategy (single/multi/biosample) per experiment unless one
    is forced via --method."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            accessions = [line.rstrip('\n') for line in open(args.infile)]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get(
                "@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
    if len(accessions) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
    dataList = []
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection, frame="embedded")
        isValid = True
        check = ["replicates", "files"]
        # The experiment itself must have replicates and files.
        for c in check:
            if not obj.get(c):
                if args.debug:
                    print("Missing {} for {}".format(c, acc), file=sys.stderr)
                isValid = False
        # Each possible control must also have replicates and files.
        if obj.get("possible_controls"):
            for p in obj["possible_controls"]:
                for c in check:
                    # BUG FIX: the original tested obj.get(c) here while
                    # reporting p["accession"]; the control object itself
                    # must be checked.
                    if not p.get(c):
                        if args.debug:
                            print("Missing {} for {}".format(
                                c, p["accession"]), file=sys.stderr)
                        isValid = False
        else:
            isValid = False
            if args.debug:
                print("Missing possible_controls for {}".format(acc),
                      file=sys.stderr)
        if isValid:
            b = BackFill(connection, dataList, debug=args.debug,
                         missing=args.missing)
            if args.method == "single":
                b.single_rep(obj)
                if args.debug:
                    print("SINGLE REP {}".format(acc))
            elif args.method == "multi":
                b.multi_rep(obj, args.ignore_runtype)
                if args.debug:
                    print("MULTI REP {}".format(acc))
            elif args.method == "biosample":
                b.multi_control(obj)
                if args.debug:
                    print("BIOSAMPLE {}".format(acc))
            else:
                # No method forced: infer it from the replicate/control
                # counts.
                exp_rep = len(obj["replicates"])
                exp_con = len(obj["possible_controls"])
                if exp_con == 1:
                    # one possible control
                    con_rep = len(obj["possible_controls"][0]["replicates"])
                    if con_rep == exp_rep:
                        # same number of replicates on both sides -> multi
                        b.multi_rep(obj, args.ignore_runtype)
                        if args.debug:
                            print("MULTI REP {}".format(acc))
                    elif con_rep == 1:
                        # one control replicate, several experiment
                        # replicates -> single
                        b.single_rep(obj)
                        if args.debug:
                            print("SINGLE REP {}".format(acc))
                    else:
                        if args.debug:
                            print(
                                "Experiment {} contains {} experiment replicates and {} control replicates and so does not fit the current pattern!"
                                .format(acc, exp_rep, con_rep))
                elif exp_con > 1:
                    # more than one possible control: count controls that
                    # have exactly one replicate
                    con_reps = 0
                    for con in obj["possible_controls"]:
                        if len(con["replicates"]) == 1:
                            con_reps += 1
                    if con_reps == exp_rep:
                        # one single-replicate control per experiment
                        # replicate -> biosample
                        b.multi_control(obj)
                        if args.debug:
                            print("BIOSAMPLE {}".format(acc))
                    else:
                        if args.debug:
                            # BUG FIX: the original formatted with the
                            # undefined name con_rep (a NameError on this
                            # path); con_reps is the value computed above.
                            print(
                                "Experiment {} contains {} experiment replicates and {} control replicates between {} total controls and so does not fit the current pattern!"
                                .format(acc, exp_rep, con_reps, exp_con))
                else:
                    if args.debug:
                        print(
                            "Experiment {} does not fit any of the current patterns!"
                            .format(acc))
    if len(dataList) > 0:
        print("Experiment Accession\tCheck Type\tControl Files\tExperiment Files")
        for d in dataList:
            print("{}\t{}\t{}\t{}".format(d["Exp Accession"], d["Check type"],
                                          d["Control"], d["Experiment"]))