def main():
    """Resolve a list of object identifiers and fetch each from the server.

    Identifiers come from --object (a file with one identifier per line, or
    a comma-separated string) or from --query (a search URL or a single
    object URL); each resolved identifier is then GET-ed once.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            # FIX: close the file deterministically (handle was leaked).
            with open(args.object) as fh:
                accessions = [line.strip() for line in fh]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        # FIX: the old `if any(temp)` guard silently skipped the whole
        # result when the only returned object was falsy (e.g. {}); an
        # empty list already makes this loop a no-op.
        for obj in temp:
            # Prefer the most specific identifier available on the object.
            if obj.get("accession"):
                accessions.append(obj["accession"])
            elif obj.get("uuid"):
                accessions.append(obj["uuid"])
            elif obj.get("@id"):
                accessions.append(obj["@id"])
            elif obj.get("aliases"):
                accessions.append(obj["aliases"][0])
            else:
                print("ERROR: object has no identifier", file=sys.stderr)
    if not accessions:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        encodedcc.get_ENCODE(acc, connection)
def main():
    """Entry point: build an authenticated connection and run patch_set."""
    cli_args = getArgs()
    auth_key = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth_key)
    encodedcc.patch_set(cli_args, conn)
def main():
    """Connect to the ENCODE server and run the data-release workflow."""
    cli_args = getArgs()
    auth_key = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth_key)
    print("Running on", auth_key.server)
    # build the PROFILES reference dictionary
    releaser = Data_Release(cli_args, conn)
    releaser.run_script()
def main():
    """Repair stale ``alternate_accessions`` lists across all object types.

    For every schema profile that defines an ``alternate_accessions``
    property, compare each object's recorded list against the accessions
    reachable through the replaced-object chain
    (``retreive_list_of_replaced``). When they differ, clear the stale
    lists on the objects currently carrying those accessions, then patch
    the corrected list onto the owning object.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles.keys():
        profile_properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have alternate accessions property
        if profile_properties and profile_properties.get(
                'alternate_accessions'):
            uuid_2_alternate_accessions = {}
            objects = encodedcc.get_ENCODE('search/?type=' + object_type,
                                           connection)['@graph']
            for entry in objects:
                if entry.get('alternate_accessions'):
                    # Expand every listed accession through its chain of
                    # replaced objects.
                    replaced_objects_accessions = []
                    for acc in entry.get('alternate_accessions'):
                        replaced_objects_accessions.extend(
                            retreive_list_of_replaced(acc, connection))
                    # Schedule a patch only when the recorded list differs
                    # from the full expansion.
                    if sorted(list(set(
                            replaced_objects_accessions))) != sorted(
                                entry.get('alternate_accessions')):
                        uuid_2_alternate_accessions[entry['uuid']] = \
                            set(replaced_objects_accessions)
            for uuid in uuid_2_alternate_accessions.keys():
                uuid_sets_counter = 0
                # NOTE(review): this loop variable shadows the ENC_Key
                # 'key' bound above; 'key' is not used again afterwards.
                for key in uuid_2_alternate_accessions.keys():
                    if uuid_2_alternate_accessions[uuid] <= \
                            uuid_2_alternate_accessions[key]:
                        uuid_sets_counter += 1
                # Patch only when this uuid's accession set is a subset of
                # no other candidate's set (only of itself), i.e. the
                # ownership of these accessions is unambiguous.
                if uuid_sets_counter == 1:
                    for acc in list(uuid_2_alternate_accessions[uuid]):
                        # Clear the accession from whichever objects
                        # currently carry it.
                        to_clean_objects = encodedcc.get_ENCODE(
                            'search/?type=Item&accession=' + acc,
                            connection)['@graph']
                        for object_to_clean in to_clean_objects:
                            print(object_to_clean['uuid'] +
                                  ' alternate accessions list ' +
                                  str(object_to_clean[
                                      'alternate_accessions']) +
                                  ' is removed')
                            encodedcc.patch_ENCODE(
                                object_to_clean['uuid'], connection,
                                {"alternate_accessions": []})
                    print(uuid + ' is patched with ' +
                          str({"alternate_accessions": list(
                              uuid_2_alternate_accessions[uuid])}))
                    encodedcc.patch_ENCODE(
                        uuid, connection,
                        {"alternate_accessions": list(
                            uuid_2_alternate_accessions[uuid])})
def main():
    """Dispatch to the report builder matching the requested datatype.

    Only 'CHIP' and 'RNA' have report implementations; anything else
    prints a notice.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.datatype == 'CHIP':
        make_chip_report(connection)
    elif args.datatype == 'RNA':
        make_rna_report(connection)
    else:
        # FIX: corrected misspelled user-facing message ("unimplimented").
        print('unimplemented')
def main():
    """Print one chosen field (default: accession) for each listed object.

    Objects come from --infile (one identifier per line) or --search;
    output is one "<@id>\\t<field>" line per object.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        default='obList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument(
        '--key',
        default='default',
        help="The keypair identifier from the keyfile. Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" %
                        (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--field',
                        default='accession',
                        help="The field to report. Default is accession.")
    args = parser.parse_args()
    DEBUG_ON = args.debug  # NOTE(review): local only, never read afterwards
    myKey = encodedcc.ENC_Key(args.keyfile, args.key)
    myConnect = encodedcc.ENC_Connection(myKey)
    # Get list of objects we are interested in
    objList = get_experiment_list(args.infile, args.search, myConnect)
    # FIX: iterate directly instead of range(len(...)); renamed 'id' so it
    # no longer shadows the builtin.
    for identifier in objList:
        field = ''
        if identifier != '':
            ob = encodedcc.get_ENCODE(identifier, myConnect)
            obj_id = ob.get('@id')
            if args.field in ob:
                field = str(ob[args.field])
        else:
            obj_id = identifier
        print('\t'.join([obj_id, field]))
def main():
    """List non-released experiments whose first possible control is
    already released, as a tab-separated table on stdout."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    data = encodedcc.get_ENCODE(args.query, connection).get("@graph")
    print("Experiment\tStatus\tControl\tStatus")
    for exp in data:
        if exp.get("possible_controls"):
            if exp["status"] != "released":
                # Only the first listed control is inspected.
                c = exp["possible_controls"][0]
                control = encodedcc.get_ENCODE(c, connection)
                if control["status"] == "released":
                    print("{}\t{}\t{}\t{}".format(exp["accession"],
                                                  exp["status"],
                                                  control["accession"],
                                                  control["status"]))


# NOTE(review): stray opening triple-quote of a commented-out section
# follows; preserved as-is from the original file.
'''
def main():
    """Dump selected biosample fields for a set of accessions as TSV.

    Accessions come from --query, --infile (one per line), or --accession;
    missing/empty fields are emitted as empty strings.
    """
    headers = [
        "accession", "description", "organism", "age_display", "life_stage",
        "sex", "biosample_term_name", "biosample_type",
        "depleted_in_term_name", "phase", "subcellular_fraction_term_name",
        "post_synchronization_time", "post_synchronization_time_units",
        "synchronization", "model_organism_mating_status", "treatments",
        "donor", "transfection_type", "talens", "constructs",
        "model_organism_donor_constructs", "rnais", "part_of", "pooled_from",
        "derived_from", "status", "culture_harvest_date",
        "culture_start_date", "date_obtained", "lab", "source", "note",
        "notes", "health_status", "starting_amount", "starting_amount_units"
    ]
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.query:
        temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for obj in temp:
            accessions.append(obj.get("accession"))
    elif args.infile:
        # FIX: close the file deterministically (handle was leaked).
        with open(args.infile) as fh:
            accessions = [line.strip() for line in fh]
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    data = []
    for acc in accessions:
        obj = encodedcc.get_ENCODE(acc, connection)
        # BUG FIX: the original tested `any(x)`, which raises TypeError for
        # non-iterable values (e.g. a numeric starting_amount); plain
        # truthiness is what was intended — falsy values become "".
        data.append({h: (obj.get(h, "") or "") for h in headers})
    writer = csv.DictWriter(sys.stdout, delimiter='\t', fieldnames=headers)
    writer.writeheader()
    for d in data:
        writer.writerow(d)
def main():
    """Fetch one field for a set of objects, normalize the values, and
    patch them back.

    Runs encodedcc.get_fields to produce args.outfile, rewrites every
    non-accession column through format_number, then feeds the rewritten
    file to encodedcc.patch_set.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    encodedcc.get_fields(args, connection)
    data = []
    with open(args.outfile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for row in data:
        # FIX: renamed the loop variable (was 'key', shadowing the ENC_Key
        # object bound above).
        for col in row.keys():
            if col != "accession":
                row[col] = format_number(row[col])
    header = ["accession", args.onefield]
    with open(args.outfile, "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=header)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
    # patch_set reads from args.infile, so point it at the rewritten file.
    args.infile = args.outfile
    encodedcc.patch_set(args, connection)
def main():
    """Report the read length (and optionally the header) of each read in
    the fastq files selected by --infile or --query."""
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            accessions = [line.strip() for line in open(args.infile)]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            results = encodedcc.get_ENCODE(args.query,
                                           connection).get("@graph", [])
        else:
            results = [encodedcc.get_ENCODE(args.query, connection)]
        # Collect only fastq files from the selected experiments.
        for experiment in results:
            for file_id in experiment.get("files", []):
                file_obj = encodedcc.get_ENCODE(file_id, connection)
                if file_obj.get("file_format", "") == "fastq":
                    accessions.append(file_obj["accession"])
    else:
        print("No accessions to check")
        sys.exit(1)
    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            if args.header:
                decoded_header = header.decode("UTF-8")
                print(acc + "\t" + str(len(sequence)) + "\t" + decoded_header)
            else:
                decoded_sequence = sequence.decode("UTF-8")
                print(acc + "\t" + str(len(decoded_sequence)))
def main():
    """Register Graveley-lab protocol documents under derived aliases.

    For each URL listed in the --lib / --bio files, derive a
    "brenton-graveley:<basename>" alias from the last path component
    (extension stripped) and hand the pair to file_manager with the
    matching object type.
    e.g. http://graveleylab.cam.uchc.edu/ENCODE/ENCODE_DATA/protocol/
    LV08_library_protocol/L-AKAP1-LV08-3.pdf -> L-AKAP1-LV08-3
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Decomposed: both branches were copy-pasted duplicates of each other.
    if args.lib:
        _register_documents(args.lib, connection, "Library")
    if args.bio:
        _register_documents(args.bio, connection, "Biosample")


def _register_documents(listfile, connection, obj_type):
    """Read URLs from listfile and register each via file_manager with its
    derived brenton-graveley alias."""
    with open(listfile) as fh:
        urls = [line.strip() for line in fh]
    for url in urls:
        alias = "brenton-graveley:" + url.split("/")[-1].split(".")[0]
        file_manager(url, alias, connection, obj_type)
def main():
    """Import objects from an Excel workbook, one sheet per object type.

    Each sheet whose name matches a schema profile (case-insensitive) is
    handed to excel_reader; unknown sheet names are reported on stderr.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {server}".format(server=connection.server))
    if not os.path.isfile(args.infile):
        # BUG FIX: the placeholder was missing from the message, so the
        # .format(filename=...) call never inserted the file name.
        print("File {filename} not found!".format(filename=args.infile))
        sys.exit(1)
    if args.type:
        names = [args.type]
    else:
        book = xlrd.open_workbook(args.infile)
        names = book.sheet_names()
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    # FIX: build the lower-cased list once (the original assigned
    # supported_collections twice; the first assignment was dead code).
    supported_collections = [s.lower() for s in profiles.keys()]
    for n in names:
        if n.lower() in supported_collections:
            excel_reader(args.infile, n, args.update, connection,
                         args.patchall)
        else:
            print("Sheet name '{name}' not part of supported object types!"
                  .format(name=n), file=sys.stderr)
def main():
    """Connect to the ENCODE server and run the GetFields field dump."""
    cli_args = getArgs()
    auth_key = encodedcc.ENC_Key(cli_args.keyfile, cli_args.key)
    conn = encodedcc.ENC_Connection(auth_key)
    fields_job = encodedcc.GetFields(conn, cli_args)
    fields_job.get_fields()
def main():
    """Scan linkable portal objects and repair references that point at
    'replaced' objects, patching corrected links back to the portal.

    Every object of the searched types is checked field-by-field via
    fix_replaced_references, which accumulates corrections into
    patching_data; non-empty corrections are logged and patched.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    keypair = (key.authid, key.authpw)
    server = key.server
    query = args.query
    objects = \
        encoded_get(server + 'search/?type=AntibodyLot' +
                    '&type=Donor&type=Biosample' +
                    '&type=File&type=Library' +
                    '&type=Dataset&type=Pipeline' +
                    '&type=Replicate' +
                    '&type=Treatment&format=json&' +
                    'frame=object&limit=all&' +
                    query, keypair)['@graph']
    print('There are ' + str(len(objects)) +
          ' objects that should be inspected on the portal')
    # Consolidated: one ordered tuple replaces ~27 identical call sites;
    # the original call order is preserved.
    linking_fields = (
        # links of donor
        'parent_strains', 'identical_twin', 'outcrossed_strain',
        'littermates', 'fraternal_twin', 'parents', 'children', 'siblings',
        # links of file/experiment/biosample
        'derived_from', 'paired_with', 'controlled_by', 'possible_controls',
        'supersedes', 'dataset', 'related_files', 'related_datasets',
        # links of biosample
        'host', 'part_of', 'originated_from', 'pooled_from', 'donor',
        # links of library
        'biosample',
        # links of treatment
        'biosamples_used', 'antibodies_used',
        # links of replicate
        'antibody', 'experiment', 'library',
    )
    counter = 0
    for obj in objects:
        counter += 1
        if counter % 1000 == 0:
            print('Script processed ' + str(counter) + ' objects')
        if obj['status'] not in ['replaced']:
            patching_data = {}
            for field in linking_fields:
                fix_replaced_references(obj, field, patching_data,
                                        keypair, server)
            if patching_data:
                print('Patching object ' + obj['@type'][0] + '\t' +
                      obj['uuid'])
                print('OLD DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(obj[k]))
                print('---------')
                print('NEW DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(patching_data[k]))
                print('---------')
                encodedcc.patch_ENCODE(obj['uuid'], connection,
                                       patching_data)
def main():
    """Dump a collection (or query result) as TSV, with columns derived
    from the collection's JSON schema.

    Headings are built from the schema's property types
    (``name``, ``name:array``, ``name:<type>:array`` or ``name:<type>``);
    dotted headings trigger a follow-up GET of the embedded object.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('collection', help="The collection to get")
    parser.add_argument('--es',
                        default=False,
                        action='store_true',
                        help="Use elasticsearch")
    parser.add_argument(
        '--query',
        help="A complete query to run rather than GET the whole collection. \
E.g. \"search/?type=biosample&lab.title=Ross Hardison, PennState\". Implies --es."
    )
    parser.add_argument(
        '--submittable',
        default=False,
        action='store_true',
        help="Show only properties you might want a submitter to submit.")
    parser.add_argument('--key',
                        default='default',
                        help="The keypair identifier from the keyfile. \
Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --\
keyfile=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages. Default is False.")
    args = parser.parse_args()
    keys = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(keys)
    global DEBUG
    DEBUG = args.debug
    supplied_name = args.collection
    # Normalize the supplied name into a search type name and the matching
    # schema file name.
    if supplied_name.endswith('s'):
        search_name = supplied_name.rstrip('s').replace('-', '_')
        schema_name = search_name + '.json'
    elif supplied_name.endswith('.json'):
        schema_name = supplied_name
        search_name = supplied_name.rstrip('.json')
    else:
        search_name = supplied_name.replace('-', '_')
        schema_name = search_name + '.json'
    schema_uri = '/profiles/' + schema_name
    object_schema = encodedcc.get_ENCODE(schema_uri, connection)
    headings = []
    for schema_property in object_schema["properties"]:
        property_type = object_schema["properties"][schema_property]["type"]
        if isinstance(
                property_type, list
        ):  # hack to deal with multi-typed properties, just pick the first one
            property_type = property_type[0]
        if property_type == 'string':
            # if it's a string type, the heading is just the property name
            headings.append(schema_property)
        elif property_type == 'array':
            # format the heading to be property_name:type:array or, if an
            # array of strings, property_name:array
            if 'items' in object_schema["properties"][schema_property].keys():
                whateveritscalled = "items"
            elif 'reference' in object_schema["properties"][
                    schema_property].keys():
                whateveritscalled = "reference"
            elif 'url' in object_schema["properties"][schema_property].keys():
                whateveritscalled = "url"
            else:
                print(object_schema["properties"][schema_property].keys())
                raise NameError("None of these match anything I know")
            if object_schema["properties"][schema_property][whateveritscalled][
                    "type"] == 'string':
                headings.append(schema_property + ':array')
            else:
                try:
                    headings.append(
                        schema_property + ':' + object_schema["properties"]
                        [schema_property][whateveritscalled]["type"] +
                        ':array')
                except:
                    headings.append(schema_property + ':mixed:array')
        else:
            # it isn't a string, and it isn't an array, so make the heading
            # property_name:type
            headings.append(schema_property + ':' + property_type)
    headings.sort()
    if 'file' in supplied_name or 'dataset' in supplied_name or \
            'source' in supplied_name or 'award' in supplied_name:
        pass
    else:
        # headings.append('award.rfa') #need to add a parameter to specify additional properties
        pass
    # Collection-specific extra columns.
    if 'file' in supplied_name:
        headings.append('replicate.biological_replicate_number')
        headings.append('replicate.technical_replicate_number')
    if 'biosample' in supplied_name:
        headings.append('organ_slims')
    if 'access-key' in supplied_name:
        headings.append('user.title')
    if 'user' in supplied_name:
        headings.append('title')
    exclude_unsubmittable = [
        'accession', 'uuid', 'schema_version', 'alternate_accessions',
        'submitted_by'
    ]
    global collection
    if args.query:
        uri = args.query
        collection = encodedcc.get_ENCODE(uri, connection)
    elif args.es:
        uri = '/search/?type=' + search_name
        collection = encodedcc.get_ENCODE(uri, connection)
    else:
        collection = get_without_ESearch(search_name, connection)
    collected_items = collection['@graph']
    headstring = ""
    for heading in headings:
        # --submittable drops server-assigned properties from the output.
        if args.submittable and heading.split(':')[0] in exclude_unsubmittable:
            pass
        else:
            headstring += heading + '\t'
    headstring = headstring.rstrip()
    print(headstring)
    for item in collected_items:
        # obj = encodedcc.get_ENCODE(item['@id'], connection)
        obj = item
        obj = encodedcc.flat_ENCODE(obj)
        rowstring = ""
        for header in headstring.split('\t'):
            prop_key = header.split(':')[0]
            if prop_key in obj:
                tempstring = json.dumps(obj[prop_key]).lstrip('"').rstrip('"')
                if tempstring == '[]':
                    tempstring = ""
                rowstring += tempstring + '\t'
            elif '.' in prop_key:
                # Dotted heading: fetch the embedded object and pull the
                # second path component from it.
                try:
                    embedded_key = obj[prop_key.split('.')[0]]
                    if '/' in embedded_key:
                        embedded_obj = encodedcc.get_ENCODE(
                            embedded_key, connection)
                    else:
                        embedded_obj = encodedcc.get_ENCODE(
                            prop_key.split('.')[0] + '/' +
                            obj[prop_key.split('.')[0]], connection)
                    embedded_value_string = json.dumps(embedded_obj[
                        prop_key.split('.')[1]]).lstrip('"').rstrip('"')
                    if embedded_value_string == '[]':
                        embedded_value_string = ""
                except KeyError:
                    embedded_value_string = ""
                rowstring += embedded_value_string + '\t'
            else:
                rowstring += '\t'
        rowstring = rowstring.rstrip()
        print(rowstring)
def main():
    """Print a per-experiment, per-replicate TSV report.

    The column sets (checkedItems, repCheckedItems, libraryCheckedItems,
    fileCheckedItems) are module-level lists that are pruned in place
    according to the --datatype and the various report flags before the
    header and rows are printed.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        default='objList',
                        help="File containing a list of ENCSRs.")
    parser.add_argument('--search',
                        default='NULL',
                        help="The search parameters.")
    parser.add_argument(
        '--datatype',
        default='OTHER',
        help="The datatype format to print your report. (CHIP,RNA,REPLI,OTHER)"
    )
    parser.add_argument(
        '--key',
        default='default',
        help="The keypair identifier from the keyfile. Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" %
                        (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--debug',
                        default=False,
                        action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--details',
                        default=False,
                        action='store_true',
                        help="Print detailed report. Default off")
    parser.add_argument('--status',
                        default=False,
                        action='store_true',
                        help="Print statuses of each object. Default off")
    parser.add_argument('--mouse',
                        default=False,
                        action='store_true',
                        help="Print mouse specific details. Default off")
    parser.add_argument('--simple',
                        default=False,
                        action='store_true',
                        help="Very simple output. Default off")
    parser.add_argument('--library',
                        default=False,
                        action='store_true',
                        help="Print library details. Default off")
    parser.add_argument('--files',
                        default=False,
                        action='store_true',
                        help="Print a file based report. Default off")
    parser.add_argument(
        '--nhgri',
        default=False,
        action='store_true',
        help="Print a library based report based on standards. Default off")
    parser.add_argument('--encode2',
                        default=False,
                        action='store_true',
                        help="Print dbxrefs for ENCODE2. Default off")
    args = parser.parse_args()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    '''Adjust the checked list by the datatype'''
    # NOTE(review): these .remove calls mutate module-level lists in place,
    # so main() is effectively single-shot per process.
    if args.datatype != 'CHIP':
        checkedItems.remove('theTarget')
        checkedItems.remove('control_exps')
        repCheckedItems.remove('antibody')
        repCheckedItems.remove('antibody_status')
        repCheckedItems.remove('antibody_source')
        repCheckedItems.remove('antibody_product')
        repCheckedItems.remove('antibody_lot')
    if args.datatype != 'REPLI':
        libraryCheckedItems.remove('phase')
    if args.datatype != 'RNA':
        libraryCheckedItems.remove('subcellular_fraction_term_name')
        libraryCheckedItems.remove('library_treatments')
        libraryCheckedItems.remove('depleted_in_term_name')
        libraryCheckedItems.remove('spikeins_used')
    if args.simple:
        if args.datatype == 'CHIP':
            repCheckedItems.remove('antibody_status')
            repCheckedItems.remove('antibody_source')
            repCheckedItems.remove('antibody_product')
            repCheckedItems.remove('antibody_lot')
    if not args.details:
        checkedItems.remove('project')
        checkedItems.remove('submitter')
        checkedItems.remove('grant')
        checkedItems.remove('assay_term_id')
        checkedItems.remove('biosample_term_id')
        libraryCheckedItems.remove('nucleic_acid_term_id')
        libraryCheckedItems.remove('biosample_biosample_term')
        libraryCheckedItems.remove('biosample_biosample_id')
        libraryCheckedItems.remove('biosample_biosample_type')
    if not args.library:
        libraryCheckedItems.remove('lysis_method')
        libraryCheckedItems.remove('fragmentation_method')
        libraryCheckedItems.remove('fragmentation_date')
        libraryCheckedItems.remove('extraction_method')
        libraryCheckedItems.remove('library_size_selection_method')
        libraryCheckedItems.remove('size_range')
        libraryCheckedItems.remove('nucleic_acid_starting_quantity')
        libraryCheckedItems.remove('nucleic_acid_starting_quantity_units')
    if not args.status:
        libraryCheckedItems.remove('library_status')
        libraryCheckedItems.remove('biosample_status')
        libraryCheckedItems.remove('donor_status')
        repCheckedItems.remove('rep_status')
        checkedItems.remove('status')
    if not args.encode2:
        checkedItems.remove('dbxrefs')
    if not args.mouse:
        libraryCheckedItems.remove('strain')
        libraryCheckedItems.remove('strain_background')
    if args.files:
        print('\t'.join(fileCheckedItems))
    else:
        print('\t'.join(checkedItems + repCheckedItems + libraryCheckedItems))
    # Get list of objects we are interested in
    search = args.search
    objList = get_experiment_list(args.infile, search, connection)
    if args.files:
        # File-based report is delegated entirely to reporter_files.
        reporter_files.files(objList, fileCheckedItems, connection)
        return
    else:
        for i in range(0, len(objList)):
            exp = encodedcc.get_ENCODE(objList[i], connection,
                                       frame='embedded')
            ob = {}
            # NOTE(review): this inner loop reuses 'i', shadowing the
            # outer experiment index for the remainder of the iteration.
            for i in checkedItems:
                if i in exp:
                    ob[i] = exp[i]
                else:
                    ob[i] = ''
            '''Get the counts'''
            if 'replicates' in exp:
                ob['replicate_count'] = len(exp['replicates'])
            else:
                ob['replicate_count'] = 0
            if 'documents' in exp:
                ob['document_count'] = len(exp['documents'])
                ob['experiment_documents'] = get_doc_list(exp['documents'])
            else:
                ob['document_count'] = 0
                ob['experiment_documents'] = []
            if 'files' in exp:
                ob['file_count'] = len(exp['files'])
            else:
                ob['file_count'] = 0
            '''Get the experiment level ownership'''
            ob['lab_name'] = exp['lab']['name']
            ob['project'] = exp['award'].get('rfa')
            ob['grant'] = exp['award']['name']
            ob['submitter'] = exp['submitted_by']['title']
            ob['experiment_documents'] = get_doc_list(exp['documents'])
            temp = ''
            for i in range(0, len(exp['dbxrefs'])):
                temp = temp + ' ; ' + exp['dbxrefs'][i]
            ob['dbxrefs'] = temp
            ob['control_exps'] = ''
            if 'possible_controls' in exp:
                for q in exp['possible_controls']:
                    ob['control_exps'] = ob['control_exps'] + \
                        ' ' + q['accession']
            else:
                ob['control_exps'] = []
            if 'target' in exp:
                ob['theTarget'] = exp['target']['label']
            # Bucket file accessions per biological replicate number;
            # files with zero or multiple replicates go under 'no rep'.
            files_count = {}
            files_list = {}
            repIds = []
            for item in exp['files']:
                if item.get('biological_replicates') is None:
                    repId = 'no rep'
                elif len(item['biological_replicates']) == 1:
                    repId = item['biological_replicates'][0]
                else:
                    repId = 'no rep'
                if repId in files_list:
                    files_list[repId].append(item['accession'])
                else:
                    files_list[repId] = [item['accession']]
                if repId in files_count:
                    files_count[repId] = files_count[repId] + 1
                else:
                    files_count[repId] = 1
            libs = []
            for q in range(0, ob['replicate_count']):
                rep = exp['replicates'][q]
                '''Inititalize rep object'''
                repOb = {}
                for field in libraryCheckedItems:
                    repOb[field] = ''
                for field in repCheckedItems:
                    if field in rep:
                        repOb[field] = rep[field]
                    else:
                        repOb[field] = ''
                if rep['biological_replicate_number'] in files_count:
                    repOb['files'] = files_list[
                        rep['biological_replicate_number']]
                    repOb['rep_file_count'] = files_count[
                        rep['biological_replicate_number']]
                else:
                    repOb['rep_file_count'] = 0
                    repOb['files'] = []
                repOb['replicate_aliases'] = rep['aliases']
                repOb['replicate_uuid'] = rep['uuid']
                repOb['rep_status'] = rep['status']
                if 'platform' in rep:
                    repOb['platform'] = rep['platform']['term_name']
                if 'antibody' in rep:
                    repOb['antibody'] = rep['antibody']['accession']
                    summary = get_char_summary(rep['antibody']['accession'],
                                               connection)
                    # Replicates whose antibody has no lot reviews are
                    # skipped entirely.
                    if len(rep['antibody']['lot_reviews']) < 1:
                        continue
                    print('\t'.join([
                        'NHGRI',
                        exp['accession'],
                        rep['antibody']['accession'],
                        rep['antibody']['lot_reviews'][0]['status'],
                        'Characterizations failing:' +
                        repr(summary['number_chars_failing']),
                        'Characterizations passing:' +
                        repr(summary['number_chars_passing']),
                        'Characterizations in progress:' +
                        repr(summary['number_chars_in_progress']),
                    ]))
                    # repOb['antibody_status'] = rep['antibody']['approvals'][0]['status']
                    repOb['antibody_source'] = rep['antibody']['source']
                    repOb['antibody_product'] = rep['antibody']['product_id']
                    repOb['antibody_lot'] = rep['antibody']['lot_id']
                    repOb['antibody_status'] = rep['antibody']['lot_reviews'][
                        0]['status']
                lib = []
                # inititalize the lib with repItems
                for i in repCheckedItems:
                    if i in repOb:
                        lib.append(repr(repOb[i]))
                if 'library' in rep:
                    for field in libraryCheckedItems:
                        if field in rep['library']:
                            repOb[field] = rep['library'][field]
                    repOb['protocols'] = get_doc_list(
                        rep['library']['documents'])
                    repOb['library_treatments'] = get_treatment_list(
                        rep['library']['treatments'])
                    repOb['spikeins_used'] = get_spikeins_list(
                        rep['library'].get('spikeins_used'))
                    repOb['library_status'] = rep['library']['status']
                    if 'biosample' in rep['library']:
                        bs = rep['library']['biosample']
                        repOb['biosample_accession'] = bs['accession']
                        repOb['biosample_status'] = bs['status']
                        try:
                            repOb['biosample_biosample_term'] = bs[
                                'biosample_term_name']
                        except:
                            print("Skipping missing biosample_term_name in %s"
                                  % (bs['accession']),
                                  file=sys.stderr)
                            repOb['biosample_biosample_term'] = ""
                        repOb['biosample_biosample_id'] = bs[
                            'biosample_term_id']
                        repOb['biosample_biosample_type'] = bs[
                            'biosample_type']
                        ob['species'] = bs['organism']['name']
                        if 'subcellular_fraction_term_name' in bs:
                            repOb['subcellular_fraction_term_name'] = bs[
                                'subcellular_fraction_term_name']
                        else:
                            repOb[
                                'subcellular_fraction_term_name'] = 'unfractionated'
                        if bs['treatments'] != []:
                            repOb[
                                'biological_treatments'] = get_treatment_list(
                                    bs['treatments'])
                        if 'donor' in bs:
                            repOb['donor'] = bs['donor']['accession']
                            repOb['donor_status'] = bs['donor']['status']
                            repOb['strain'] = bs['donor'].get('strain')
                            repOb['strain_background'] = bs['donor'].get(
                                'strain_background')
                        for term in ('sex', 'phase', 'age', 'age_units',
                                     'life_stage'):
                            repOb[term] = bs.get(term)
                    temp = ' '.join(rep['library']['aliases'])
                    repOb['aliases'] = temp
                    ob['list_libraries'] = ''
                    ob['list_libraries'] = ob['list_libraries'] + \
                        ' ' + rep['library']['accession']
                # Append library columns for every replicate (values were
                # pre-initialized to '' above when no library is present).
                for i in libraryCheckedItems:
                    if i in repOb:
                        lib.append(repr(repOb[i]))
                    else:
                        lib.append('')
                libs.append(lib)
            row = []
            for j in checkedItems:
                row.append(str(ob[j]))
            # Experiments with no replicates still get a single bare row.
            if len(libs) == 0:
                print('\t'.join(row))
            for k in range(0, len(libs)):
                print('\t'.join(row + libs[k]))
def main():
    """Match Roadmap ChIP-seq experiments missing possible_controls to
    candidate control experiments.

    First pairs missing experiments and controls that share the same
    biosample accession (results.txt), then falls back to matching on
    biosample term name, lab, biosample type and organism
    (missing_control.txt), and finally lists anything still unaccounted
    for (extras.txt).
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    control = "/search/?type=experiment&award.project=Roadmap&status=released&assay_term_name=ChIP-seq&target.investigated_as=control"
    missing_control = "/search/?type=experiment&award.project=Roadmap&status=released&audit.NOT_COMPLIANT.category=missing+possible_controls"
    control_list = encodedcc.get_ENCODE(control, connection,
                                        frame="embedded").get("@graph", [])
    missing_list = encodedcc.get_ENCODE(missing_control, connection,
                                        frame="embedded").get("@graph", [])
    # links: biosample accession -> [controls sharing it,
    #                                missing experiments sharing it]
    links = {}
    missing_accessions = []
    control_accessions = []
    for value in missing_list:
        missing_accessions.append(value["accession"])
    for value in control_list:
        control_accessions.append(value["accession"])
    print("building dictionary")
    # Only the first replicate's biosample is used throughout.
    for obj in control_list:
        if obj.get("replicates"):
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            links[bio_acc] = [[], []]
    print("sorting data")
    for obj in control_list:
        if obj.get("replicates"):
            control_acc = obj["accession"]
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            links[bio_acc][0].append(control_acc)
    for obj in missing_list:
        missing_acc = obj["accession"]
        # print(missing_acc)
        if obj.get("replicates"):
            bio_acc = obj["replicates"][0]["library"]["biosample"]["accession"]
            if links.get(bio_acc):
                links[bio_acc][1].append(missing_acc)
    found_controls = []
    found_missings = []
    print("writing data")
    with open("results.txt", "w") as f:
        f.write("biosample" + "\t" + "possible control" + "\t" +
                "possible experiments" + "\n")
        # NOTE(review): loop variable shadows the ENC_Key 'key' above
        # ('key' is not used again afterwards).
        for key in links.keys():
            c = ";".join(links[key][0])
            for item in links[key][0]:
                found_controls.append(item)
            e = ";".join(links[key][1])
            for item in links[key][1]:
                found_missings.append(item)
            if len(links[key][1]) > 0:
                s = key + "\t" + c + "\t" + e + "\n"
                f.write(s)
    print("checking for left over items")
    # Experiments not matched by shared biosample get the metadata-based
    # fallback matching below.
    diff_missing = set(missing_accessions) - set(found_missings)
    header = ["experiment", "biosample", "description", "controls",
              "control description", "control biosample", "lab",
              "control lab", "bio term name", "control term name",
              "bio type", "control bio type", "age", "control age",
              "organism", "control organism"]
    temp_list = []
    possible = []
    for acc in diff_missing:
        exp = encodedcc.get_ENCODE(acc, connection, frame="embedded")
        temp = {}
        if exp.get("replicates"):
            bio = exp["replicates"][0]["library"]["biosample"]["accession"]
            lab = exp["lab"]["@id"]
            bio_name = exp["replicates"][0]["library"]["biosample"]["biosample_term_name"]
            bio_type = exp["replicates"][0]["library"]["biosample"]["biosample_type"]
            bio_age = exp["replicates"][0]["library"]["biosample"]["age"]
            organism = exp["replicates"][0]["library"]["biosample"]["organism"]["name"]
            des = exp["replicates"][0]["library"]["biosample"].get("description", "NONE")
            for con in control_list:
                if con.get("replicates"):
                    con_id = con["accession"]
                    con_bio = con["replicates"][0]["library"]["biosample"]["accession"]
                    con_lab = con["lab"]["@id"]
                    con_bio_name = con["replicates"][0]["library"]["biosample"]["biosample_term_name"]
                    con_bio_type = con["replicates"][0]["library"]["biosample"]["biosample_type"]
                    con_age = con["replicates"][0]["library"]["biosample"]["age"]
                    con_organism = con["replicates"][0]["library"]["biosample"]["organism"]["name"]
                    con_des = con["replicates"][0]["library"]["biosample"].get("description", "NONE")
                    # Fallback match: term name, lab, biosample type and
                    # organism (age is reported but not matched on). Each
                    # later match overwrites 'temp', so only the last
                    # matching control per experiment is written out.
                    if bio_name == con_bio_name and lab == con_lab and bio_type == con_bio_type and organism == con_organism:
                        possible.append(acc)
                        temp["experiment"] = acc
                        temp["biosample"] = bio
                        temp["description"] = des
                        temp["controls"] = con_id
                        temp["control description"] = con_des
                        temp["control biosample"] = con_bio
                        temp["lab"] = lab
                        temp["control lab"] = con_lab
                        temp["bio term name"] = bio_name
                        temp["control term name"] = con_bio_name
                        temp["bio type"] = bio_type
                        temp["control bio type"] = con_bio_type
                        temp["age"] = bio_age
                        temp["control age"] = con_age
                        temp["organism"] = organism
                        temp["control organism"] = con_organism
        temp_list.append(temp)
    with open("missing_control.txt", "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, delimiter='\t', fieldnames=header)
        writer.writeheader()
        for item in temp_list:
            writer.writerow(item)
    really_missing = set(diff_missing) - set(possible)
    with open("extras.txt", "w") as f:
        f.write(str(len(really_missing)) + " experiments unaccounted for\n")
        for line in really_missing:
            f.write(line + "\n")
    print("output written to results.txt, missing_controls.txt and extras.txt")


# NOTE(review): stray opening of a truncated commented-out section
# follows; preserved as-is from the original file.
'''if args.update:
def main():
    """Build a status report (CHIP/RNA/METHYL/3D/Accessibility/RBP) of ENCODE
    experiments by combining canned search-query fragments into a row/column
    matrix and delegating to the per-datatype make_*_report helpers.

    Side effects: network calls to the ENCODE server via the report helpers.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    # Per-grant lab filters, keyed by the --grant CLI value.
    labs = {
        'stam': '&lab.title=John+Stamatoyannopoulos%2C+UW&lab.title=Job+Dekker%2C+UMass',
        'bernstein': '&lab.title=Bradley+Bernstein%2C+Broad',
        'gingeras': '&lab.title=Yijun+Ruan%2C+GIS&lab.title=Thomas+Gingeras%2C+CSHL&lab.title=Piero+Carninci%2C+RIKEN',
        'snyder': '&lab.title=Michael+Snyder%2C+Stanford&lab.title=Sherman+Weissman%2C+Yale&lab.title=Kevin+White%2C+UChicago&lab.title=Peggy+Farnham%2C+USC'
    }
    # ----------- QUERIES ----------------------------------------------------
    # Each constant is a URL-encoded fragment appended to an ENCODE search.
    # NOTE(review): several fragments below are defined but never used in the
    # visible row/column tables (e.g. not_pipeline_query, read_depth_query,
    # read_length_query); they are kept as reference material.
    unreplicated_query = '&replication_type=unreplicated'
    replicated_query = '&replication_type!=unreplicated'
    not_pipeline_query = '&files.analysis_step_version.analysis_step.pipelines.title%21=Transcription+factor+ChIP-seq'
    no_peaks_query = '&files.file_type!=bigBed+narrowPeak'
    concordance_query = '&searchTerm=IDR%3Afail'  # '&searchTerm=IDR%3Afail'
    unrunnable_query = '&internal_status=unrunnable'
    pipeline_query = '&files.analysis_step_version.analysis_step.pipelines.title=Transcription+factor+ChIP-seq'
    read_depth_query = '&audit.NOT_COMPLIANT.category=insufficient+read+depth'
    read_depth_query_3 = '&audit.WARNING.category=low+read+depth'
    complexity_query = '&audit.NOT_COMPLIANT.category=insufficient+library+complexity'
    read_length_query = '&files.read_length=271272&files.read_length=657265&files.read_length=25&files.read_length=31&files.read_length=30'
    no_concerns_query = '&internal_status%21=requires+lab+review&internal_status%21=unrunnable&internal_status%21=pipeline+error'
    # NOTE(review): 'H**o+sapiens' appears verbatim in the original source —
    # presumably a masked 'Homo+sapiens'; confirm against the live server.
    human_query = '&replicates.library.biosample.donor.organism.scientific_name=H**o+sapiens'
    mouse_query = '&replicates.library.biosample.donor.organism.scientific_name=Mus+musculus'
    unknown_org_query = '&replicates.library.biosample.donor.organism.scientific_name%21=H**o+sapiens&replicates.library.biosample.donor.organism.scientific_name%21=Mus+musculus'
    # Award (grant phase) filters.
    ENCODE2_query = '&award.rfa=ENCODE2&award.rfa=ENCODE2-Mouse'
    ENCODE3_query = '&award.rfa=ENCODE3'
    ROADMAP_query = '&award.rfa=Roadmap'
    # Object-status filters.
    total_query = '&status=released&status=submitted&status=started&status=ready+for+review'
    released_query = '&status=released'
    proposed_query = '&status=proposed'
    unreleased_query = '&status=submitted&status=ready+for+review&status=started'
    concerns_query = '&internal_status=requires+lab+review&internal_status=unrunnable&internal_status=pipeline+error&status!=deleted&status!=revoked'
    # Audit-category filters (orange = NOT_COMPLIANT, red = ERROR).
    antibody_query = '&audit.NOT_COMPLIANT.category=not+characterized+antibody'
    orange_audits_query = '&audit.NOT_COMPLIANT.category=missing+controlled_by&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=control+insufficient+read+depth&audit.NOT_COMPLIANT.category=unreplicated+experiment&audit.NOT_COMPLIANT.category=poor+library+complexity&audit.NOT_COMPLIANT.category=severe+bottlenecking&audit.NOT_COMPLIANT.category=insufficient+replicate+concordance&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+input+control'
    red_audits_query = '&audit.ERROR.category=control+extremely+low+read+depth&audit.ERROR.category=missing+raw+data+in+replicate&audit.ERROR.category=missing+donor&audit.ERROR.category=inconsistent+library+biosample&audit.ERROR.category=inconsistent+replicate&audit.ERROR.category=replicate+with+no+library&audit.ERROR.category=technical+replicates+with+not+identical+biosample&&audit.ERROR.category=missing+paired_with&audit.ERROR.category=missing+possible_controls&audit.ERROR.category=inconsistent+control&audit.ERROR.category=missing+antibody'
    orange_audits_query2 = '&audit.NOT_COMPLIANT.category=insufficient+read+length&audit.NOT_COMPLIANT.category=control+low+read+depth&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=unreplicated+experiment&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+spikeins&audit.NOT_COMPLIANT.category=missing+RNA+fragment+size'
    # File-presence / assembly / annotation filters.
    peaks_query = '&files.file_type=bigBed+narrowPeak'
    missing_signal_query = '&files.file_type!=bigWig&target.investigated_as!=control'
    grch38_query = '&files.assembly=GRCh38'
    v19_query = '&files.genome_annotation=V19'
    not_v19_query = '&files.genome_annotation!=V19'
    hg19_query = '&files.assembly=hg19'
    mm10_query = '&files.assembly=mm10'
    hg19_vis_query = '&assembly=hg19'
    grch38_vis_query = '&assembly=GRCh38'
    not_grch38_vis_query = '&assembly!=GRCh38'
    mm10_vis_query = '&assembly=mm10'
    not_mm10_vis_query = '&assembly!=mm10'
    not_grch38_query = '&files.assembly!=GRCh38'
    not_hg19_query = '&files.assembly!=hg19'
    not_mm10_query = '&files.assembly!=mm10'
    uniform_query = '&files.lab.name=encode-processing-pipeline'
    requires_query = '&internal_status=requires+lab+review'
    submitted_query = '&files.lab.name!=encode-processing-pipeline'
    audits_query = '&audit.NOT_COMPLIANT.category=missing+controlled_by&audit.NOT_COMPLIANT.category=insufficient+read+depth&audit.NOT_COMPLIANT.category=missing+documents&audit.NOT_COMPLIANT.category=unreplicated+experiment&assay_slims=Transcription&audit.NOT_COMPLIANT.category=missing+possible_controls&audit.NOT_COMPLIANT.category=missing+spikeins&audit.NOT_COMPLIANT.category=missing+RNA+fragment+size'
    processing_query = '&internal_status=pipeline+ready&internal_status=processing'
    mismatched_file_query = '&audit.INTERNAL_ACTION.category=mismatched+file+status'
    dnase_pipeline = "&files.analysis_step_version.analysis_step.pipelines.title=DNase-HS+pipeline+%28paired-end%29&files.analysis_step_version.analysis_step.pipelines.title=DNase-HS+pipeline+%28single-end%29&files.file_type=bigBed+broadPeak"
    # lab_query is None when args.grant is unset/unknown; it is only used
    # below when args.grant is truthy.
    lab_query = labs.get(args.grant)
    # args.status selects which status fragment augments most rows.
    filters = {
        'released': released_query,
        'unreleased': total_query
    }
    # Row label -> full query-fragment for that report row.
    row_queries = {
        'Total': total_query,
        'Released': released_query,
        'Released with issues': released_query+audits_query,
        'Released with antibody issues': released_query + antibody_query,
        'Released with NOT COMPLIANT issues': released_query + orange_audits_query,
        'Released with NOT COMPLIANT': released_query + orange_audits_query2,
        'Released with ERROR issues': released_query + red_audits_query,
        'With ERROR issues': red_audits_query + filters[args.status],
        'With NOT COMPLIANT issues': orange_audits_query + filters[args.status],
        'Unreleased': unreleased_query,
        'Proposed': proposed_query,
        'Processed on GRCh38': grch38_query + filters[args.status],
        'Processed on Dnase Grch38 or mm10': dnase_pipeline + grch38_vis_query + mm10_vis_query + filters[args.status],
        'Mapped on GRCh38 or mm10': grch38_query + mm10_query + filters[args.status],
        'Unmapped on GRCh38 or mm10': not_grch38_query + not_mm10_query + filters[args.status],
        'Submitted on GRCh38': grch38_query + filters[args.status],
        'Submitted on GRCh38 or mm10': grch38_query + mm10_query + filters[args.status],
        'Uniformly Processed on hg19-v19': v19_query + filters[args.status],
        'Mapped on hg19': hg19_query + uniform_query + filters[args.status],
        'Unmapped on hg19': not_hg19_query + filters[args.status],
        'Processed on Dnase hg19': dnase_pipeline + hg19_vis_query + filters[args.status],
        'Peaks called on hg19': hg19_vis_query + uniform_query + filters[args.status],
        'Peaks called on GRCh38 or mm10': grch38_vis_query + mm10_vis_query + filters[args.status],
        'Submitted on hg19': hg19_query + filters[args.status],
        'Processed on mm10': mm10_query + filters[args.status],
        'Submitted on mm10': mm10_query + filters[args.status],
        'Cannot be currently processed': concerns_query + filters[args.status],
        'In processing queue': processing_query + filters[args.status],
        'Unreleased files in a released experiment': mismatched_file_query,
        'Missing GRCh38 or mm10 peaks': not_grch38_vis_query + not_mm10_vis_query + replicated_query + filters[args.status],
        'Missing hg19': not_hg19_query + not_mm10_query + filters[args.status],
        'Missing hg19-v19': not_v19_query + not_mm10_query + filters[args.status],
        'Missing signal files': total_query + missing_signal_query,
        'missing fastqs': '&files.file_format!=fastq' + total_query,
        'Unreplicated': unreplicated_query + filters[args.status]
    }
    # Column label -> award/organism query-fragment (ordered for output).
    columns = collections.OrderedDict([
        ('ENCODE3-human', ENCODE3_query + human_query),
        ('ENCODE3-mouse', ENCODE3_query + mouse_query),
        ('ENCODE2-human', ENCODE2_query + human_query),
        ('ENCODE2-mouse', ENCODE2_query + mouse_query),
        # ('Organism Unknown', ENCODE3_query + unknown_org_query),
        ('ROADMAP', ROADMAP_query),
        ('Total', '&award.rfa=ENCODE3' + ROADMAP_query + ENCODE2_query)
    ])
    # When a grant is requested, restrict every column to that grant's labs.
    if args.grant:
        columns = collections.OrderedDict([
            ('ENCODE3-human', ENCODE3_query + human_query + lab_query),
            ('ENCODE3-mouse', ENCODE3_query + mouse_query + lab_query),
            ('ENCODE2-human', ENCODE2_query + human_query + lab_query),
            ('ENCODE2-mouse', ENCODE2_query + mouse_query + lab_query),
            # ('Organism Unknown', ENCODE3_query + unknown_org_query),
            ('ROADMAP', ROADMAP_query + lab_query),
            ('Total', '&award.rfa=ENCODE3' + ROADMAP_query + ENCODE2_query + lab_query)
        ])
    # Dispatch to the datatype-specific report builder.
    # NOTE(review): make_rbp_report takes only (connection, row_queries) —
    # no columns — unlike its siblings; confirm that is intentional.
    if args.datatype == 'CHIP':
        make_chip_report(connection, columns, row_queries)
    elif args.datatype == 'RNA':
        make_rna_report(connection, columns, row_queries)
    elif args.datatype == 'METHYL':
        make_methyl_report(connection, columns, row_queries)
    elif args.datatype == '3D':
        make_3d_report(connection, columns, row_queries)
    elif args.datatype == 'Accessibility':
        make_dna_report(connection, columns, row_queries)
    elif args.datatype == 'RBP':
        make_rbp_report(connection, row_queries)
    else:
        print ('unimplemented')
def main():
    """Validate files listed in a CSV, POST them to ENCODE, and upload to S3.

    Reads rows from args.infile, runs validateFiles on each, POSTs a file
    object, resolves 409 key conflicts interactively, and writes one result
    row per file to args.outfile.  Exits with status 1 on bad credentials.
    """
    args = get_args()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if not test_encode_keys(connection):
        logger.error("Invalid ENCODE server or keys: server=%s auth=%s" %
                     (connection.server, connection.auth))
        sys.exit(1)
    input_csv, output_csv = init_csvs(args.infile, args.outfile)
    for n, row in enumerate(input_csv, start=2):  # row 1 is the header
        # if there is no "file_format_spec" then no point in running get_asfile()
        if row.get("file_format_specifications"):
            as_file = get_asfile(row['file_format_specifications'], connection)
            # validateFiles needs a closed file for -as, otherwise it gives a
            # return code of -11
            as_file.close()
            validated = validate_file(
                row, args.encvaldata, row.get('assembly'), as_file.name)
            os.unlink(as_file.name)
        else:
            validated = validate_file(
                row, args.encvaldata, row.get('assembly'))
        if not validated:
            logger.warning('Skipping row %d: file %s failed validation' % (
                n, row['submitted_file_name']))
            continue
        json_payload = process_row(row, connection)
        if not json_payload:
            logger.warning(
                'Skipping row %d: invalid field format for JSON' % (n))
            continue
        file_object = encodedcc.post_file(
            json_payload, connection, args.update)
        if isinstance(file_object, requests.models.Response):
            if file_object.status_code == 409:
                print("POST Conflict", file_object.json())
                i = input("Upload file to S3? y/n: ")
                if i.lower() == "y":
                    detail = file_object.json()["detail"]
                    # BUG FIX: str.lstrip() strips a *character set*, not a
                    # prefix, so lstrip("Keys conflict: ") could also eat
                    # leading characters of the payload.  Remove the literal
                    # prefix instead.
                    prefix = "Keys conflict: "
                    if detail.startswith(prefix):
                        detail = detail[len(prefix):]
                    # pull out the list with the 'key conflict' and turn it
                    # into a list
                    conflict = ast.literal_eval(detail)
                    # get the first tuple in the list
                    conflict = conflict[0]
                    # the second item in the tuple is the value
                    obj = conflict[1]
                    print(
                        "Getting upload credentials from conflicting identifier {}".format(obj))
                    if ":" in obj:
                        obj = quote(obj)
                    temp_object = encodedcc.get_ENCODE(obj, connection)
                    file_object = encodedcc.ENC_Item(
                        connection, temp_object["@id"])
                    print("Uploading file to S3")
                    aws_return_code = uploader(file_object, args.update)
                else:
                    logger.warning(
                        'Skipping row %d: POST file object failed' % (n))
                    aws_return_code = None
            else:
                if not file_object:
                    logger.warning(
                        'Skipping row %d: POST file object failed' % (n))
                    aws_return_code = None
                # NOTE(review): reconstructed indentation — non-409 Response
                # rows are skipped before the output row is written; confirm
                # against the original script.
                continue
        else:
            aws_return_code = uploader(file_object, args.update)
        # NOTE(review): writeheader() runs once per row here, as in the
        # original; if the header should appear once, hoist it above the loop.
        output_csv.writeheader()
        output_row = {}
        # renamed from 'key' to avoid shadowing the ENC_Key above
        for field in output_csv.fieldnames:
            output_row.update({field: file_object.get(field)})
        output_row.update({'aws_return': aws_return_code})
        output_csv.writerow(output_row)
def test_connection():
    """Smoke test: a default keypair must yield a usable ENCODE connection."""
    default_key = encodedcc.ENC_Key(keypairs, "default")
    conn = encodedcc.ENC_Connection(default_key)
    # Each of these must be truthy for the connection to be usable.
    for required in (conn, conn.auth, conn.server):
        assert required
def main():
    """Backfill control files for experiments that pass validity checks.

    Collects accessions from --infile or --query, verifies each experiment
    (and its possible_controls) has replicates and files, then dispatches to
    BackFill.single_rep / multi_rep / multi_control depending on --method or
    on the replicate/control pattern, printing a TSV of proposed pairings.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    accessions = []
    if args.update:
        print("This is an UPDATE run data will be PATCHed")
    else:
        print("This is a dryrun, no data will be changed")
    if args.infile:
        if os.path.isfile(args.infile):
            accessions = [line.rstrip('\n') for line in open(args.infile)]
        else:
            accessions = args.infile.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # prefer accession, then uuid, @id, first alias
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                elif obj.get("aliases"):
                    accessions.append(obj["aliases"][0])
    if len(accessions) == 0:
        # if something happens and we end up with no accessions stop
        print("ERROR: object has no identifier", file=sys.stderr)
        sys.exit(1)
    else:
        for acc in accessions:
            obj = encodedcc.get_ENCODE(acc, connection, frame="embedded")
            isValid = True
            check = ["replicates", "files"]
            for c in check:
                if not obj.get(c):
                    if args.debug:
                        print("Missing {} for {}".format(c, acc),
                              file=sys.stderr)
                    isValid = False
            if obj.get("possible_controls"):
                for p in obj["possible_controls"]:
                    for c in check:
                        # BUG FIX: original tested obj.get(c) (the experiment)
                        # while reporting p["accession"] (the control); the
                        # control object itself must be checked here.
                        if not p.get(c):
                            if args.debug:
                                print("Missing {} for {}".format(
                                    c, p["accession"]), file=sys.stderr)
                            isValid = False
            else:
                isValid = False
                if args.debug:
                    print("Missing possible_controls for {}".format(acc),
                          file=sys.stderr)
            if isValid:
                backfill = BackFill(connection, debug=args.debug,
                                    missing=args.missing,
                                    update=args.update,
                                    ignore_runtype=args.ignore_runtype)
                if args.method == "single":
                    if args.debug:
                        print("SINGLE REP {}".format(acc))
                    backfill.single_rep(obj)
                elif args.method == "multi":
                    if args.debug:
                        print("MULTI REP {}".format(acc))
                    backfill.multi_rep(obj)
                elif args.method == "biosample":
                    if args.debug:
                        print("BIOSAMPLE {}".format(acc))
                    backfill.multi_control(obj)
                else:
                    # no explicit method: infer from replicate/control counts
                    exp_rep = len(obj["replicates"])
                    exp_con = len(obj["possible_controls"])
                    if exp_con == 1:  # one possible control
                        con_rep = len(
                            obj["possible_controls"][0]["replicates"])
                        if con_rep == exp_rep:
                            # same number experiment replicates as control
                            # replicates -> method is multi
                            if args.debug:
                                print("MULTI REP {}".format(acc))
                            backfill.multi_rep(obj)
                        elif con_rep == 1:
                            # one control replicate and multiple experiment
                            # replicates -> method is single
                            if args.debug:
                                print("SINGLE REP {}".format(acc))
                            backfill.single_rep(obj)
                        else:
                            if args.debug:
                                print(
                                    "Experiment {} contains {} experiment replicates and {} control replicates and so does not fit the current pattern!"
                                    .format(acc, exp_rep, con_rep))
                    elif exp_con > 1:  # more than one possible control
                        con_reps = 0
                        for con in obj["possible_controls"]:
                            if len(con["replicates"]) == 1:
                                con_reps += 1
                        if con_reps == exp_rep:
                            # same number of controls with one replicate as
                            # number of experiment replicates -> biosample
                            if args.debug:
                                print("BIOSAMPLE {}".format(acc))
                            backfill.multi_control(obj)
                        else:
                            if args.debug:
                                # BUG FIX: original formatted con_rep, which
                                # is undefined on this branch (NameError);
                                # con_reps is the count computed above.
                                print(
                                    "Experiment {} contains {} experiment replicates and {} control replicates between {} total controls and so does not fit the current pattern!"
                                    .format(acc, exp_rep, con_reps, exp_con))
                    else:
                        if args.debug:
                            print(
                                "Experiment {} does not fit any of the current patterns!"
                                .format(acc))
                if len(backfill.dataList) > 0:
                    print("Experiment\tMethod\tExperimentFile\tControlFile")
                    for data in backfill.dataList:
                        print(
                            "{ExpAcc}\t{Method}\t{ExpFile}\t{ConFile}".format(
                                ExpAcc=data["ExpAcc"],
                                Method=data["Method"],
                                ExpFile=data["ExpFile"],
                                ConFile=data["ConFile"]))
def main():
    """Synchronize ENCODE publication objects with PubMed via Entrez.

    Default mode: set up the publication lists from files and reconcile them
    against ENCODE (POST/PATCH as needed).  With --updateonly, read a file of
    publication UUIDs, resolve each to a PMID (looking it up by title on
    Entrez when missing), PATCH the identifier in, and compare each record.
    Writes a summary to pub_update.txt in that mode.
    """
    args = getArgs()
    outfile = args.outfile
    CREATE_ONLY = args.createonly
    UPDATE_ONLY = args.updateonly
    Entrez.email = args.email  # NCBI requires a contact email for Entrez
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    print("Running on ", connection.server)
    publication = PublicationUpdate(args)
    if not UPDATE_ONLY:
        publication.setup_publication()
        pmidList = publication.consortium_ids + publication.community_ids
        mergeDicts = publication.consortium_dict.copy()
        mergeDicts.update(publication.community_dict)
        # holds published_by, categories, and data_used
        if not CREATE_ONLY:
            publication.get_entrez(pmidList)
        community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID = publication.find_ENCODE_extras(publication.community_ids, publication.consortium_ids, connection)
        total_ENCODE_only = len(community_ENCODE_Only) + len(consortium_ENCODE_Only)
        allOtherIDs = communityOtherID + consortiumOtherID
        publication.check_ENCODE(pmidList, connection, allOtherIDs, mergeDicts)
        log = str(total_ENCODE_only) + " items in ENCODE but not in files"
        logger.info('%s' % log)
        log = str(publication.PATCH_COUNT) + " publication files PATCHed"
        logger.info('%s' % log)
        log = str(publication.POST_COUNT) + " publication files POSTed"
        logger.info('%s' % log)
        print("Results printed to", outfile)
    else:
        # --updateonly: its value is the path of a file of UUIDs, one per line
        infile = UPDATE_ONLY
        with open(infile, 'r') as readfile:
            uuidList = [x.rstrip('\n') for x in readfile]
        # check each publication to see if it has a PMID, if it does add it
        # to the PMIDlist; if it does not have one look it up on Entrez
        pmid_uuid_dict = {}
        for uuid in uuidList:
            pub = encodedcc.get_ENCODE(uuid, connection)
            title = pub.get("title", "")
            identifiers = pub.get("identifiers", [])
            found = False
            # keeps the *last* PMID identifier if several are present
            for i in identifiers:
                if "PMID:" in i:
                    p = i.split(":")[1]
                    found = True
            if found:
                pmid_uuid_dict[p] = uuid
            else:
                # search Entrez for publication by title
                handle = Entrez.esearch(db="pubmed", term=title)
                record = Entrez.read(handle)
                idlist = record["IdList"]
                if len(idlist) > 1:
                    # ambiguous match: report candidates, patch nothing
                    log = "More than one possible PMID found for " + uuid
                    logger.error('%s' % log)
                    log = str(idlist) + " are possible PMIDs"
                    logger.error('%s' % log)
                elif len(idlist) == 0:
                    log = "No possible PMID found for " + uuid
                    logger.error('%s' % log)
                else:
                    # exactly one match: fetch it, record the PMID on the
                    # ENCODE publication object
                    handle = Entrez.efetch(db="pubmed", id=idlist,
                                           rettype="medline", retmode="text")
                    records = Medline.parse(handle)
                    # save the records, you can convert them to a list
                    records = list(records)
                    for record in records:
                        pm = record.get("PMID")
                        ti = record.get("TI")
                        log = "Publication " + uuid + " with title \"" + title + "\" matches PMID:" + pm + " with title \"" + ti + "\""
                        logger.info('%s' % log)
                        identifiers.append("PMID:" + pm)
                        encodedcc.patch_ENCODE(uuid, connection, {"identifiers": identifiers})
                        pmid_uuid_dict[pm] = uuid
        pmidList = list(pmid_uuid_dict.keys())
        publication.get_entrez(pmidList)
        with open("pub_update.txt", "w") as f:
            for pmid in pmid_uuid_dict.keys():
                publication.compare_entrez_ENCODE(pmid_uuid_dict[pmid], pmid, connection)
            f.write(str(len(pmid_uuid_dict.keys())) + " publications checked " + str(publication.PATCH_COUNT) + " publications PATCHed")
def main():
    """Summarize mapping/peak status of experiments on hg19 and GRCh38.

    For each experiment (from --infile or --query) collects replicate counts,
    audit counts, status issues and per-assembly alignment/peak presence, then
    prints grouped reports of experiments with missing or partial results.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    assemblies = ['hg19', 'GRCh38']
    summary = []
    if args.infile is not None and 'ENCSR' in args.infile:
        objList = [args.infile]  # a single accession was passed directly
    else:
        objList = get_experiment_list(args.infile, args.query, connection)
    for obj_id in objList:
        results = {}
        obj = get_ENCODE(obj_id, connection, frame='page')
        # Get basic info
        reps = get_replicate_count(obj)
        results['rep_count'] = len(reps)
        results['status'] = obj['status']
        results['internal_status'] = obj['internal_status']
        results['award'] = obj['award'].get('rfa')
        results['peaks'] = {}
        results['mapping'] = {}
        results['unarchived_files'] = []
        results['status issues'] = []
        results['accession'] = obj['accession']
        # Get audits
        for level in ['WARNING', 'ERROR', 'NOT_COMPLIANT', 'INTERNAL_ACTION']:
            if obj['audit'].get(level):
                results[level] = len(obj['audit'].get(level))
        # Get status issues
        actions = obj['audit'].get('INTERNAL_ACTION')
        if actions:
            status_issues = [
                i for i in actions if i['category'] in [
                    'experiment not submitted to GEO',
                    'mismatched file status',
                    'mismatched status'
                ]
            ]
            results['status issues'] = status_issues
        # Inspect files
        good_files = [
            f for f in obj['files']
            if f['status'] in ['released', 'in progress']
        ]
        # FIX: removed the unused 'fastqs' list — it duplicated good_files
        # verbatim (the fastq filter had evidently been lost) and was never
        # read.
        print("There are files in this experiment:", len(obj['files']))
        print("There are good files in this experiment:", len(good_files))
        # look for unarchived processed files from other labs
        processed_files = [
            f for f in obj['files'] if f['file_format'] != 'fastq'
        ]
        external_files = [
            f for f in processed_files
            if (f['lab']['name'] != 'encode-processing-pipeline')
        ]
        unarchived_files = [
            f for f in external_files if (f['status'] != 'archived')
        ]
        results['unarchived_files'] = unarchived_files
        for assembly in assemblies:
            replicates = []
            file_list = [
                f for f in good_files if f.get('assembly') == assembly
            ]
            for rep in reps:
                rep_obj = {'rep': rep}
                file_list_rep = [
                    f for f in file_list
                    if rep in f.get('biological_replicates')
                ]
                aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'alignments'
                ]
                rep_obj['aligns'] = len(aligns)
                raw_aligns = [
                    f for f in file_list_rep
                    if f.get('output_type') == 'unfiltered alignments'
                ]
                rep_obj['raws'] = len(raw_aligns)
                replicates.append(rep_obj)
            failing_replicates = [f for f in replicates if f['aligns'] == 0]
            # BUG FIX: was `len(failing_replicates) is 0` — identity
            # comparison with an int literal is implementation-dependent.
            if len(failing_replicates) == 0:
                # mapping value: True = all replicates aligned,
                # False = none did, list = replicates still missing
                results['mapping'][assembly] = True
            elif len(replicates) == len(failing_replicates):
                # They all fail
                results['mapping'][assembly] = False
            else:
                results['mapping'][assembly] = [
                    rep['rep'] for rep in failing_replicates
                ]
            peaks = [f for f in file_list if f.get('output_type') == 'peaks']
            results['peaks'][assembly] = len(peaks) > 0
        summary.append(results)
    # ---------------- reporting ----------------
    unarchived_list = [r for r in summary if len(r['unarchived_files']) > 0]
    print('These experiments have unarchived files', len(unarchived_list))
    for item in unarchived_list:
        print(item['accession'])
    print('')
    print('')
    exps_mismatched_states = [
        r for r in summary if len(r['status issues']) > 0
    ]
    print('These experiments have mismatched states',
          len(exps_mismatched_states))
    for item in exps_mismatched_states:
        print(item['accession'])
    print('')
    print('')
    exps_missing_hg38_mapping = [
        r for r in summary if r['mapping']['GRCh38'] is False
    ]
    print('These experiments are missing GRCh38 mapping for all replicates',
          len(exps_missing_hg38_mapping))
    for item in exps_missing_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_partial_hg38_mapping = [
        r for r in summary
        if r['mapping']['GRCh38'] is not False
        and r['mapping']['GRCh38'] is not True
    ]
    print('These experiments are missing GRCh38 mapping for some replicates',
          len(exps_partial_hg38_mapping))
    for item in exps_partial_hg38_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['GRCh38'])
    print('')
    print('')
    exps_missing_hg38_peaks = [
        r for r in summary if r['peaks']['GRCh38'] is False
    ]
    exps_missing_hg38_peaks_but_have_mapping = [
        f for f in exps_missing_hg38_peaks
        if f not in exps_missing_hg38_mapping
        and f not in exps_partial_hg38_mapping
    ]
    print('These experiments are missing GRCh38 peaks but having all mappings',
          len(exps_missing_hg38_peaks_but_have_mapping))
    # BUG FIX: the original iterated exps_missing_hg38_peaks here, so the
    # printed items did not match the count (and the headline) above.
    for item in exps_missing_hg38_peaks_but_have_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_missing_hg19_mapping = [
        r for r in summary if r['mapping']['hg19'] is False
    ]
    print('These experiments are missing hg19 mapping for all replicates',
          len(exps_missing_hg19_mapping))
    for item in exps_missing_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'])
    print('')
    print('')
    exps_partial_hg19_mapping = [
        r for r in summary
        if r['mapping']['hg19'] is not False
        and r['mapping']['hg19'] is not True
    ]
    print('These experiments are missing hg19 mapping for some replicates',
          len(exps_partial_hg19_mapping))
    for item in exps_partial_hg19_mapping:
        print(item['accession'], item['status'], item['internal_status'],
              item['mapping']['hg19'])
    print('')
    print('')
    exps_missing_hg19_peaks = [
        r for r in summary
        if r['peaks']['hg19'] is False
        and r not in exps_missing_hg19_mapping
        and r not in exps_partial_hg19_mapping
    ]
    print('These experiments are missing hg19 peaks',
          len(exps_missing_hg19_peaks))
    for item in exps_missing_hg19_peaks:
        print(item['accession'], item['status'], item['internal_status'],
              'warnings:', item.get('WARNING'))
    print('')
    print('')
def main():
    """PATCH antibody characterization lane statuses from a submitted TSV.

    Reads a tab-separated file of characterization rows (grouped by @id),
    reconciles lane lists and documents against ENCODE, computes the overall
    characterization status from per-lane statuses, applies Snyder-lab
    special cases, and PATCHes each characterization (with --update).
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on", connection.server)
    if args.update:
        assert args.user, "A user must be provided to run this script!"
        user = encodedcc.get_ENCODE(args.user, connection).get("@id")
        assert user, "{} was not found in the ENCODE database as a registered user. Please try again".format(
            args.user)
    data = []
    idList = []
    with open(args.infile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for item in data:
        # normalize the comma-separated lane list, dropping duplicates
        lanes = item.get("lanes", "")
        lanes = list(set(lanes.split(",")))
        item["lanes"] = lanes
        if not item["notes"]:  # empty notes column -> drop the key entirely
            item.pop("notes")
        if item.get("@id") not in idList:
            idList.append(item["@id"])
    # group the file rows by characterization @id
    objDict = {obj_id: [] for obj_id in idList}
    for item in data:
        objDict[item["@id"]].append(item)
    for idNum in objDict.keys():
        antibody = encodedcc.get_ENCODE(idNum, connection, frame="edit")
        new_antibody = {}
        if antibody.get("primary_characterization_method"):
            reviews = antibody.get("characterization_reviews", [])
            enc_docs = antibody.get("documents", [])
            file_docs = []
            for obj in objDict[idNum]:
                if obj.get("documents"):
                    for doc in obj["documents"].split(","):
                        file_docs.append(doc)
                if obj.get("notes"):
                    new_antibody["notes"] = obj["notes"]
            # resolve each submitted document alias to its @id and merge
            for doc in file_docs:
                if ":" in doc:
                    doc = quote(doc)
                link = encodedcc.get_ENCODE(doc, connection).get("@id")
                if link:
                    if link not in enc_docs:
                        enc_docs.append(link)
            #######################
            # begin lanes checking
            #######################
            enc_lanes_check = []
            file_lanes_check = []
            flag = False
            for r in reviews:
                enc_lanes_check.append(r["lane"])
            for item in objDict[idNum]:
                for l in item["lanes"]:
                    file_lanes_check.append(int(l))
            if len(set(enc_lanes_check)) < len(enc_lanes_check):
                # duplicate lanes in ENCODE
                print("Possible duplicate lanes in ENCODE")
                flag = True
            if len(set(file_lanes_check)) < len(file_lanes_check):
                # duplicate lanes in file
                print("Possible duplicate lanes in file")
                flag = True
            if len(set(enc_lanes_check) - set(file_lanes_check)) > 0:
                # more lanes in ENCODE than in file
                print("Found lanes in ENCODE not in the file")
                flag = True
            if len(set(file_lanes_check) - set(enc_lanes_check)) > 0:
                # more lanes in file than in ENCODE
                print("Found lanes in the file not in ENCODE")
                flag = True
            if flag:
                print(
                    "Some problem was found with the number of lanes in the file as compared to ENCODE"
                )
                print(
                    "Do you want to continue running the program or exit and check the data?"
                )
                i = input("Continue? y/n ")
                assert i.upper() == "Y"  # exit the script
            for r in reviews:
                for line in objDict[idNum]:
                    for lane in line["lanes"]:
                        if int(lane) == r["lane"]:
                            if line["lane_status"].lower(
                            ) == "pending dcc review":
                                print(
                                    "can't set to pending review, need manual override"
                                )
                                fin = input(
                                    "Change the status to 'pending dcc review'? y/n "
                                )
                                if fin.upper() == "Y":
                                    r["lane_status"] = line[
                                        "lane_status"].lower()
                                    # iterate a copy: we remove from enc_docs
                                    # inside the loop
                                    for link in list(enc_docs):
                                        if encodedcc.get_ENCODE(
                                                link, connection
                                        ).get("document_type",
                                              "") == "standards document":
                                            # BUG FIX: was enc_docs.pop(link);
                                            # list.pop takes an index, not a
                                            # value -> TypeError at runtime.
                                            enc_docs.remove(link)
                                else:
                                    pass
                            else:
                                r["lane_status"] = line["lane_status"].lower()
            # now all lanes in reviews should be updated to document
            enc_comp = 0
            enc_ncomp = 0
            other = 0
            for r in reviews:
                if r.get("lane_status", "") == "compliant":
                    enc_comp = enc_comp + 1
                elif r.get("lane_status", "") == "not compliant":
                    enc_ncomp = enc_ncomp + 1
                else:
                    other = other + 1
            # overall status: any unexpected lane status -> not compliant;
            # otherwise compliant wins over not compliant
            if other > 0:
                print(
                    "not all lanes have allowed status, antibody characterization status set to not compliant"
                )
                new_antibody["status"] = "not compliant"
            elif enc_comp > 0:
                new_antibody["status"] = "compliant"
            elif other == 0 and enc_comp == 0 and enc_ncomp > 0:
                new_antibody["status"] = "not compliant"
            ######################
            # end lanes checking
            ######################
            if antibody.get("lab", "") == "/labs/michael-snyder/":
                # make sure special document is added if not in the file
                if "michael-snyder:biorad_protein_standard" not in file_docs:
                    file_docs.append("michael-snyder:biorad_protein_standard")
                if antibody[
                        "primary_characterization_method"] == "immunoprecipitation":
                    if len(reviews) == 1:
                        # fix lane number
                        reviews[0]["lane"] = 3
            new_antibody["characterization_reviews"] = reviews
            new_antibody["documents"] = enc_docs
            if args.update:
                new_antibody["reviewed_by"] = user
        if args.update:
            print("PATCHing antibody characterization", idNum)
            encodedcc.patch_ENCODE(idNum, connection, new_antibody)
        else:
            print("PATCH data:", new_antibody)
def main():
    """Export a TSV of bam files from released Graveley-lab ENCODE experiments.

    Searches for released Experiments from the Brenton Graveley lab that have
    bam files, then for each experiment with possible_controls walks its
    files -> replicate -> library -> biosample chain and writes one row per
    bam file to output.txt.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    query = "/search/?type=Experiment&lab.title=Brenton+Graveley%2C+UConn&award.project=ENCODE&status=released&files.file_type=bam"
    data = encodedcc.get_ENCODE(query, connection).get("@graph", [])
    headers = [
        "File Accession", "Download", "Annotation", "Cell Line", "Assembly",
        "Target", "Experiment Accession", "Experiment Aliases",
        "Control Experiment", "Biosample Accession", "Biosample Aliases",
        "Library Accession", "Library Aliases", "Lab", "Submitted Name"
    ]
    with open("output.txt", "w") as tsvfile:
        writer = csv.DictWriter(tsvfile, fieldnames=headers, delimiter="\t")
        writer.writeheader()
        for exp in data:
            # only experiments that declare controls are reported
            if exp.get("possible_controls"):
                print("Experiment", exp.get("accession"))
                temp = dict.fromkeys(headers)
                temp["Experiment Accession"] = exp.get("accession")
                temp["Experiment Aliases"] = exp.get("aliases")
                temp["Cell Line"] = exp.get("biosample_term_name")
                temp["Target"] = exp.get("target")
                temp["Control Experiment"] = exp["possible_controls"]
                if exp.get("files"):
                    files = exp["files"]
                else:
                    files = exp["original_files"]
                for f in files:
                    file = encodedcc.get_ENCODE(f, connection)
                    if file.get("file_format", "") == "bam":
                        # this is a bam file and we want it
                        temp["Lab"] = file.get("lab")
                        temp["Annotation"] = file.get("genome_annotation")
                        temp["File Accession"] = file.get("accession")
                        temp["Submitted Name"] = file.get(
                            "submitted_file_name")
                        temp[
                            "Download"] = connection.server + "/files/" + file[
                                "accession"] + "/@@download/" + file[
                                    "accession"] + ".bam"
                        temp["Assembly"] = file.get("assembly")
                        print("File", file.get("accession"))
                        if file.get("replicate"):
                            rep = encodedcc.get_ENCODE(file["replicate"],
                                                       connection)
                            if rep.get("library"):
                                lib = encodedcc.get_ENCODE(
                                    rep["library"], connection)
                                temp["Library Accession"] = lib.get(
                                    "accession")
                                temp["Library Aliases"] = lib.get("aliases")
                                print("Library", lib.get("accession"))
                                if lib.get("biosample"):
                                    bio = encodedcc.get_ENCODE(
                                        lib["biosample"], connection)
                                    temp["Biosample Accession"] = bio.get(
                                        "accession")
                                    temp["Biosample Aliases"] = bio.get(
                                        "aliases")
                                    print("Biosample", bio.get("accession"))
                        # NOTE(review): indentation reconstructed from a
                        # whitespace-mangled source — the row write is assumed
                        # to be per bam file (temp accumulates across files);
                        # confirm against the original script.
                        writer.writerow(temp)
def main():
    """Pair newly submitted files with their ``_replaced`` predecessors.

    Collects experiment accessions from ``--object`` (file or comma list) or
    ``--query``, then for each experiment splits its original_files into
    "old" files (first alias ends with ``_replaced``) and "new" files.
    A new file matches an old one when replicate, file_type, run_type and
    paired_end all agree; with ``--update`` the old file is PATCHed to
    status ``replaced`` and the new one released with the old accession as
    an alternate accession.

    Depends on module-level ``getArgs``, ``dict_maker`` and ``encodedcc``.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    accessions = []
    if args.object:
        # --object may be a file of accessions (one per line) or a comma list.
        if os.path.isfile(args.object):
            accessions = [line.strip() for line in open(args.object)]
        else:
            accessions = args.object.split(",")
    elif args.query:
        # A "search" query returns a @graph list; a direct URI returns one object.
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query,
                                        connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                # Prefer accession, then uuid, then @id as the identifier.
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        files = encodedcc.get_ENCODE(acc,
                                     connection).get("original_files", [])
        new_files = {}
        old_files = {}
        for f in files:
            # NOTE(review): `file` shadows the builtin; kept as-is.
            file = encodedcc.get_ENCODE(f, connection)
            #renamer(file, connection, args.update)
            #replacer(file, connection, args.update)
            if any(file.get("aliases", [])):
                # this has aliases
                if file["aliases"][0].endswith("_replaced"):
                    # this is one of the old ones
                    dict_maker(file, old_files)
                else:
                    # this is a new file
                    dict_maker(file, new_files)
            else:
                print("file {} has no aliases".format(file["@id"]))
        # Match each new file against every old file on four attributes;
        # only a full match triggers the replace/release PATCH pair.
        for new in new_files.keys():
            new_temp = new_files[new]
            for old in old_files.keys():
                old_temp = old_files[old]
                if new_temp["replicate"] == old_temp["replicate"]:
                    if new_temp["file_type"] == old_temp["file_type"]:
                        if new_temp["run_type"] == old_temp["run_type"]:
                            if new_temp["paired_end"] == old_temp[
                                    "paired_end"]:
                                print(
                                    "New file {} with date {} replacing old file {} with date {}"
                                    .format(new, new_temp["date"], old,
                                            old_temp["date"]))
                                if args.update:
                                    # replace old file
                                    encodedcc.patch_ENCODE(
                                        old, connection,
                                        {"status": "replaced"})
                                    # release and update new file
                                    patch_dict = {
                                        "status": "released",
                                        "alternate_accessions": [old]
                                    }
                                    encodedcc.patch_ENCODE(
                                        new, connection, patch_dict)
def main():
    """Build a biosample-by-assay audit matrix as a TSV of spreadsheet formulas.

    Queries the portal /matrix/ endpoint (filtered by optional --rfa,
    --species, --status, --lab), then for each biosample/assay cell runs a
    /search/ query, tallies audit categories via ``audit_count`` and writes
    an ``=HYPERLINK(...)`` cell so the TSV can be opened in Google Sheets.
    "RNA-seq" is split into Long/Short columns on
    ``replicates.library.size_range``; row and column TOTALS are appended.

    Depends on module-level ``getArgs``, ``audit_count``, ``encodedcc``,
    ``csv`` and ``quote`` (urllib).
    """
    print("This script outputs a 'No Results Found' error.")
    print("This is due to the Long/Short RNA-seq, it does not affect the final results")
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    search_string = "/matrix/?type=Experiment"
    rfa_string = ""
    species_string = ""
    status_string = ""
    lab_string = ""
    # Each comma-separated CLI filter becomes repeated query-string params.
    if args.rfa:
        rfa_list = args.rfa.split(",")
        for r in rfa_list:
            rfa_string += "&award.rfa=" + r
    if args.species:
        species_list = args.species.split(",")
        for r in species_list:
            species_string += "&replicates.library.biosample.donor.organism.name=" + r
    if args.status:
        status_list = args.status.split(",")
        for r in status_list:
            status_string += "&status=" + r
    if args.lab:
        lab_list = args.lab.split(",")
        for r in lab_list:
            lab_string += "&lab.name=" + r
    full_string = rfa_string + species_string + status_string + lab_string
    search_string += full_string
    # This HYPERLINK formula doubles as the first header cell / row-label key.
    matrix_url = '=HYPERLINK("{}","{}")'.format(connection.server + search_string,
                                                connection.server + search_string)
    matrix = encodedcc.get_ENCODE(search_string, connection).get("matrix")
    x_values = matrix.get("x")
    y_values = matrix.get("y")
    # y: biosample_type -> biosample_term_name buckets; x: assay buckets.
    y_buckets = y_values["replicates.library.biosample.biosample_type"].get("buckets")
    x_buckets = x_values.get("buckets")
    if args.all:
        full_list = []
        for x in x_buckets:
            full_list.append(x["key"])
    else:
        # Default assay whitelist when --all is not given.
        full_list = ["RNA-seq", "microRNA profiling by array assay",
                     "microRNA-seq", "DNase-seq",
                     "whole-genome shotgun bisulfite sequencing",
                     "RAMPAGE", "CAGE"]
    # RNA-seq is replaced by the Long/Short split columns in the header.
    temp_list = list(full_list)
    if "RNA-seq" in temp_list:
        temp_list.remove("RNA-seq")
    headers = [matrix_url] + ["Long RNA-seq", "Short RNA-seq"] + temp_list + ["TOTAL"]
    final_assay_search = ""  # this will be used to total rows
    for name in full_list:
        final_assay_search += "&assay_term_name=" + name
    final_bio_search = ""  # this will be used to total columns
    # col_dict accumulates [total, error, not_compliant, warning, dcc_action]
    # vectors per column for the TOTALS row.
    col_dict = dict.fromkeys(headers)
    for k in col_dict.keys():
        col_dict[k] = []
    with open(args.outfile, "w") as tsvfile:
        dictwriter = csv.DictWriter(tsvfile, delimiter="\t", fieldnames=headers)
        dictwriter.writeheader()
        for y in y_buckets:
            inner_buckets = y["biosample_term_name"].get("buckets")
            # Group header row carrying only the biosample_type label.
            group_dict = dict.fromkeys(headers)
            group_dict[matrix_url] = y["key"]
            dictwriter.writerow(group_dict)
            for item in inner_buckets:
                bio_name = item["key"]
                # NOTE(review): final_bio_search grows inside the loop, so the
                # TOTALS-row links only cover biosamples seen so far when built
                # per-column later — presumably intended to cover all; verify.
                final_bio_search += "&biosample_term_name=" + quote(bio_name)
                # assay_list[i] is the experiment count for x_buckets[i].
                assay_list = item["assay_term_name"]
                row_dict = dict.fromkeys(headers)
                for k in row_dict.keys():
                    row_dict[k] = 0
                row_dict[matrix_url] = bio_name
                row_count = []  # per-cell audit vectors for the row TOTAL
                for x in range(len(assay_list)):
                    assay_name = x_buckets[x]["key"]
                    if assay_name in full_list:
                        if assay_list[x] > 0:
                            search = "/search/?type=Experiment&biosample_term_name=" + quote(bio_name) + "&assay_term_name=" + assay_name + full_string
                            if assay_name == "RNA-seq":
                                # Split on library size_range: <200 is "Short".
                                short_search = search + "&replicates.library.size_range=<200"
                                long_search = search + "&replicates.library.size_range!=<200"
                                short_url = connection.server + short_search
                                long_url = connection.server + long_search
                                short_facets = encodedcc.get_ENCODE(short_search, connection)
                                long_facets = encodedcc.get_ENCODE(long_search, connection)
                                if short_facets.get("total") == 0:
                                    row_dict["Short RNA-seq"] = 0
                                    row_count.append([0, 0, 0, 0, 0])
                                    col_dict["Short RNA-seq"].append([0, 0, 0, 0, 0])
                                else:
                                    total, error, not_compliant, warning, dcc_action = audit_count(short_facets.get("facets", []), short_facets.get("total"), args.allaudits)
                                    if args.allaudits:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(short_url, total, error, not_compliant, warning, dcc_action)
                                    else:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(short_url, total, error, not_compliant)
                                    row_dict["Short RNA-seq"] = string
                                    row_count.append([total, error, not_compliant, warning, dcc_action])
                                    col_dict["Short RNA-seq"].append([total, error, not_compliant, warning, dcc_action])
                                if long_facets.get("total") == 0:
                                    row_dict["Long RNA-seq"] = 0
                                    row_count.append([0, 0, 0, 0, 0])
                                    col_dict["Long RNA-seq"].append([0, 0, 0, 0, 0])
                                else:
                                    total, error, not_compliant, warning, dcc_action = audit_count(long_facets.get("facets", []), long_facets.get("total"), args.allaudits)
                                    if args.allaudits:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(long_url, total, error, not_compliant, warning, dcc_action)
                                    else:
                                        string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(long_url, total, error, not_compliant)
                                    row_dict["Long RNA-seq"] = string
                                    row_count.append([total, error, not_compliant, warning, dcc_action])
                                    col_dict["Long RNA-seq"].append([total, error, not_compliant, warning, dcc_action])
                            else:
                                # Ordinary assay column: one search, one cell.
                                url = connection.server + search
                                facets = encodedcc.get_ENCODE(search, connection).get("facets", [])
                                total, error, not_compliant, warning, dcc_action = audit_count(facets, assay_list[x], args.allaudits)
                                if args.allaudits:
                                    string = '=HYPERLINK("{}","{}, {}E, {}NC, {}W, {}DCC")'.format(url, total, error, not_compliant, warning, dcc_action)
                                else:
                                    string = '=HYPERLINK("{}","{}, {}E, {}NC")'.format(url, total, error, not_compliant)
                                row_count.append([total, error, not_compliant, warning, dcc_action])
                                row_dict[assay_name] = string
                                col_dict[assay_name].append([total, error, not_compliant, warning, dcc_action])
                        else:
                            # Zero experiments for this cell: record zeros so
                            # column totals stay aligned.
                            if assay_name == "RNA-seq":
                                row_dict["Short RNA-seq"] = 0
                                row_dict["Long RNA-seq"] = 0
                                col_dict["Short RNA-seq"].append([0, 0, 0, 0, 0])
                                col_dict["Long RNA-seq"].append([0, 0, 0, 0, 0])
                            else:
                                row_dict[assay_name] = 0
                                col_dict[assay_name].append([0, 0, 0, 0, 0])
                            row_count.append([0, 0, 0, 0, 0])
                # Sum the per-cell vectors into the row TOTAL cell.
                row_total = 0
                row_error = 0
                row_not_compliant = 0
                row_warning = 0
                row_dcc_action = 0
                bio_total = "/search/?type=Experiment&biosample_term_name=" + quote(bio_name) + final_assay_search + full_string
                bio_url = connection.server + bio_total
                for col in row_count:
                    row_total += col[0]
                    row_error += col[1]
                    row_not_compliant += col[2]
                    row_warning += col[3]
                    row_dcc_action += col[4]
                if args.allaudits:
                    row_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(bio_url, row_total, row_error, row_not_compliant, row_warning, row_dcc_action)
                else:
                    row_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(bio_url, row_total, row_error, row_not_compliant)
                dictwriter.writerow(row_dict)
        # Final TOTALS row: sum every column's accumulated vectors.
        total = 0
        error = 0
        not_compliant = 0
        warning = 0
        dcc_action = 0
        total_dict = dict.fromkeys(headers)
        total_dict[matrix_url] = "TOTALS"
        for key in col_dict.keys():
            if key in headers and key != matrix_url:
                col_total = 0
                col_error = 0
                col_not_compliant = 0
                col_warning = 0
                col_dcc_action = 0
                for cell in col_dict[key]:
                    col_total += cell[0]
                    col_error += cell[1]
                    col_not_compliant += cell[2]
                    col_warning += cell[3]
                    col_dcc_action += cell[4]
                total += col_total
                error += col_error
                not_compliant += col_not_compliant
                warning += col_warning
                dcc_action += col_dcc_action
                if key == "Long RNA-seq":
                    assay_total = "/search/?type=Experiment&assay_term_name=RNA-seq&replicates.library.size_range!=<200" + final_bio_search + full_string
                elif key == "Short RNA-seq":
                    assay_total = "/search/?type=Experiment&assay_term_name=RNA-seq&replicates.library.size_range=<200" + final_bio_search + full_string
                else:
                    assay_total = "/search/?type=Experiment&assay_term_name=" + key + final_bio_search + full_string
                assay_url = connection.server + assay_total
                if args.allaudits:
                    total_dict[key] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(assay_url, col_total, col_error, col_not_compliant, col_warning, col_dcc_action)
                else:
                    total_dict[key] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(assay_url, col_total, col_error, col_not_compliant)
        full_search = "/search/?type=Experiment" + final_assay_search + final_bio_search + full_string
        full_url = connection.server + full_search
        if args.allaudits:
            total_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC, {}W, {}DCC")'.format(full_url, total, error, not_compliant, warning, dcc_action)
        else:
            total_dict["TOTAL"] = '=HYPERLINK("{}", "{}, {}E, {}NC")'.format(full_url, total, error, not_compliant)
        dictwriter.writerow(total_dict)
    print("Output saved to {}, open this file with Google Docs Sheets, don't use Excel because it sucks".format(args.outfile))
def test_get(): key = encodedcc.ENC_Key(keypairs, "default") connection = encodedcc.ENC_Connection(key) result = encodedcc.get_ENCODE("/profiles/", connection) assert(type(result) is dict)
def main():
    """Generic GET/PATCH/POST/PUT tool for a single portal JSON object.

    Reads a JSON object from --infile or stdin (or just GETs --id), resolves
    its @id/uuid/accession against the server to decide whether the object
    already exists, validates its @type against /profiles/, inlines any
    file attachment as a base64 data URI, and finally PATCHes, PUTs or POSTs
    depending on --force-put / --update / existence.

    Depends on module-level EPILOG, DEBUG_ON, ``encodedcc``, ``os``, ``sys``,
    ``json``, ``mimetypes`` and ``b64encode``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--infile', '-i',
        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server',
                        help="Full URL of the server.")
    parser.add_argument('--key', default='default',
                        help="The keypair identifier from the keyfile. \
Default is --key=default")
    parser.add_argument('--keyfile',
                        default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile\
=%s" % (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid',
                        help="The HTTP auth ID.")
    parser.add_argument('--authpw',
                        help="The HTTP auth PW.")
    parser.add_argument(
        '--force-put', default=False, action='store_true',
        help="Force the object to be PUT rather than PATCHed. \
Default is False.")
    parser.add_argument('--get-only', default=False, action='store_true',
                        help="Do nothing but get the object and print it. \
Default is False.")
    # NOTE(review): stray trailing comma makes this statement a 1-tuple
    # expression; harmless but presumably unintended.
    parser.add_argument('--id',
                        help="URI for an object"),
    parser.add_argument('--debug', default=False, action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument(
        '--frame',
        help="define a frame to get back the JSON object, for use with --id. Default is frame=object",
        default="object")
    parser.add_argument('--type',
                        help="the object's type")
    parser.add_argument(
        '--update', default=False, action='store_true',
        help="Let the script PATCH/POST the data. Default is False")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    # NOTE(review): GET_ONLY is a plain local (no `global` statement), unlike
    # DEBUG_ON above — if other functions read a module-level GET_ONLY they
    # will not see this value; verify intent.
    if args.get_only:
        GET_ONLY = True
    else:
        GET_ONLY = False

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    # Explicit server/auth on the command line overrides the keyfile entry.
    if args.server and args.authpw and args.authid:
        key.server = args.server
        key.authid = args.authid
        key.authpw = args.authpw
        print("Creating authorization data from command line inputs")
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    if args.update:
        print("This is an UPDATE run! Data will be PATCHed or POSTed accordingly")
    else:
        print("This is a dry run, no data will be changed")

    new_object = False
    if args.id:
        # --id mode: fetch-and-print only; no JSON payload expected.
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id, connection,
                                               frame=args.frame)
        # NOTE(review): bare except swallows every error (including
        # KeyboardInterrupt) and treats the object as new; should catch the
        # specific request/HTTP exception.
        except:
            id_response = {}
            new_object = True
    else:
        # Payload mode: read the JSON object from a file or stdin.
        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin
        new_json_string = infile.read()
        new_json = json.loads(new_json_string)
        if args.debug:
            encodedcc.pprint_ENCODE(new_json)
        # Probe each identifier the payload carries; a 404 (or a missing
        # identifier) marks the object as possibly new.
        if '@id' in new_json:
            id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            if id_response.get("code") == 404:
                id_response = {}
                new_object = True
        else:
            id_response = {}
            new_object = True
        if 'uuid' in new_json:
            uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            if uuid_response.get("code") == 404:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
            new_object = True
        if 'accession' in new_json:
            accession_response = encodedcc.get_ENCODE(new_json['accession'],
                                                      connection)
            if accession_response.get("code") == 404:
                accession_response = {}
                new_object = True
        else:
            accession_response = {}
            new_object = True
    if new_object:
        print("No identifier in new JSON object. Assuming POST or PUT with auto-accessioning.")

    # Report which identifiers resolved and flag any disagreements.
    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)
    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response != accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response != accession_response):
        print("Existing uuid/accession mismatch")
    if new_object and object_exists:
        print("Conflict: At least one identifier already exists and at least one does not exist")

    # Validate the object's @type (or --type override) against /profiles/,
    # case-insensitively, and canonicalize its spelling.
    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")
    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print("Error! JSON object does not contain one of the supported types")
            print("Provided type:", type_list[0])
            print("Please either change the JSON file or define the type with the --type feature")
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        # collection = possible_collections[0] + 's/'
        collection = possible_collections[0]
    else:
        collection = []
    # Build the canonical identifier used for PATCH/PUT.
    # NOTE(review): if the payload has none of @id/uuid/accession,
    # `identifier` is never bound and the PATCH/PUT branches below would
    # raise NameError (POST path does not use it).
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'
    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            # Attachment already inlined as a data URI; nothing to do.
            pass
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except:
                print("Must specify either href or filename for attachment",
                      file=sys.stderr)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    #detected_type = magic.from_file(filename, mime=True)
                    print("Detected mime type %s" % (mime_type))
                except:
                    print("Failed to detect mime type in file %s" % (filename),
                          file=sys.stderr)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    # NOTE(review): under Python 3, b64encode returns bytes, so
                    # this %s embeds "b'...'" into the data URI — needs
                    # .decode('ascii'); confirm the intended Python version.
                    newvalue = {
                        'download': filename,  # Just echoes the given filename as the download name
                        'type': mime_type,
                        'href': 'data:%s;base64,%s' % (mime_type,
                                                       b64encode(stream.read()))
                    }
                    # NOTE(review): this prints the handle's repr and the dict
                    # to stdout — almost certainly meant
                    # print(newvalue, file=f); also 'tmp' is never closed.
                    f = open('tmp', 'w')
                    print(f, newvalue)
                    new_json.update({'attachment': newvalue})  # add
            except:
                print("Cannot open file %s" % (filename), file=sys.stderr)
    # Dispatch: existing object -> PUT (--force-put) or PATCH;
    # new object -> PUT or POST. --update gates all writes; GET_ONLY gates all.
    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("PATCHing existing object")
                if args.update:
                    e = encodedcc.patch_ENCODE(identifier, connection,
                                               new_json)
                    print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection,
                                                 new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not any(collection):
                    print("ERROR: Unable to POST to non-existing collection {}".format(collection))
                    sys.exit(1)
                if args.update:
                    e = encodedcc.new_ENCODE(connection, collection, new_json)
                    print(e)
def main():
    """Produce ChIP-seq audit and run-type matrices (CSV, one row per sample).

    Fetches ENCODE3 ChIP-seq experiments and their controls for a lab /
    organism / target (both frame=page for audits and frame=embedded for
    metadata), evaluates each against a battery of ``is_*`` QC predicates,
    collects per-replicate fastq read-length/run-type summaries, and writes
    two CSVs: --audit_matrix (QC findings per mark x sample) and
    --run_type_matrix (sequencing info per mark x sample).

    Depends on module-level ``getArgs``, ``encoded_get``, ``encodedcc``,
    ``CORE_MARKS``, ``FILE_IGNORE_STATUS`` and the ``is_*`` predicate helpers.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    keypair = (key.authid, key.authpw)
    server = key.server
    connection = encodedcc.ENC_Connection(key)
    lab = '&lab.name=' + args.lab
    organism = '&replicates.library.biosample.donor.organism.scientific_name=' + \
        args.organism
    # Four searches: target/control experiments, each as page and embedded frames.
    histone_experiments_pages = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism +
        '&target.investigated_as=' + args.target + lab +
        '&format=json&frame=' + 'page&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_experiments_pages)) +
          " experiment pages")
    histone_controls_pages = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism +
        '&target.investigated_as=control' + lab +
        '&format=json&frame=' + 'page&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_controls_pages)) + " control pages")
    histone_experiments_objects = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism +
        '&target.investigated_as=' + args.target + lab +
        '&format=json&frame=' + 'embedded&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_experiments_objects)) +
          " experiment objects")
    histone_controls_objects = encoded_get(
        server + 'search/?type=Experiment' + '&assay_term_name=ChIP-seq'
        '&award.rfa=ENCODE3' + organism +
        '&target.investigated_as=control' + lab +
        '&format=json&frame=' + 'embedded&limit=all', keypair)['@graph']
    print("retreived " + str(len(histone_controls_objects)) +
          " control objects")

    # matrix[mark][sample] -> list of (accession, aliases) for experiments;
    # control_matrix does the same under the single mark 'control'.
    matrix = {}
    control_matrix = {}
    sample_types = set()
    marks = set()
    histone_experiments_dict = {}
    for entry in histone_experiments_pages:
        histone_experiments_dict[entry['accession']] = {'page': entry}
    for entry in histone_experiments_objects:
        histone_experiments_dict[entry['accession']]['object'] = entry
        sample = entry['biosample_term_name']
        mark = entry['target']['label']
        if mark not in matrix:
            matrix[mark] = {}
        if sample not in matrix[mark]:
            matrix[mark][sample] = []
        if 'aliases' in entry:
            matrix[mark][sample].append((entry['accession'],
                                         entry['aliases']))
        else:
            matrix[mark][sample].append((entry['accession'], 'NO ALIASES'))
        sample_types.add(sample)
        marks.add(mark)
    histone_controls_dict = {}
    for entry in histone_controls_pages:
        histone_controls_dict[entry['accession']] = {'page': entry}
    for entry in histone_controls_objects:
        histone_controls_dict[entry['accession']]['object'] = entry
        sample = entry['biosample_term_name']
        mark = 'control'
        if mark not in control_matrix:
            control_matrix[mark] = {}
        if sample not in control_matrix[mark]:
            control_matrix[mark][sample] = []
        if 'aliases' in entry:
            control_matrix[mark][sample].append(
                (entry['accession'], entry['aliases']))
        else:
            control_matrix[mark][sample].append(
                (entry['accession'], 'NO ALIASES'))
        sample_types.add(sample)
        marks.add(mark)

    # Pass 1: audit every experiment and record statuses + fastq seq_info.
    # `mone` is a progress counter (Hebrew for "counter").
    mone = 0
    for ac in histone_experiments_dict:
        page = histone_experiments_dict[ac]['page']
        obj = histone_experiments_dict[ac]['object']
        mone += 1
        # check only experiments that are not DELETED/REVOKED/REPLACED
        if is_interesting(obj):
            if mone % 10 == 0:
                print('processed ' + str(mone) + ' out of ' +
                      str(len(histone_experiments_dict.keys())))
            # NOTE(review): 'statuses'/'seq_info' are only set on this branch,
            # but the report loops below read them for every accession in
            # `matrix` — non-"interesting" experiments would raise KeyError
            # there; confirm is_interesting filters match.
            statuses = {
                'replication': [],
                'antibody': [],
                'control': [],
                'files': [],
                'qc': []
            }
            if is_replicated(obj) is False or is_replicated(page) is False:
                statuses['replication'].append('unreplicated')
            if is_antibody_eligible(page) is False:
                statuses['antibody'].append('not eligible antybody')
            if is_not_missing_antibody(page) is False:
                statuses['antibody'].append('missing antybody')
            if is_not_mismatched_control(page) is False:
                statuses['control'].append('mismatched controled_by')
            if is_not_mismatched_control_run_type(page) is False:
                statuses['control'].append('mismatched controled_by run_type')
            if is_not_mismatched_control_read_length(page) is False:
                statuses['control'].append(
                    'mismatched controled_by read_length')
            if is_not_missing_controls(page) is False:
                statuses['control'].append('missing control')
            if is_not_missing_paired_with(page) is False:
                statuses['files'].append('missing paired_with files')
            if is_sufficient_read_depth(page) is False:
                statuses['qc'].append('insufficient read depth')
            if is_not_low_read_depth(page) is False:
                statuses['qc'].append('low read depth')
            if is_compliant_library_complexity(page) is False:
                statuses['qc'].append('insufficient/poor library complexity')
            if is_not_moderate_library_complexity(page) is False:
                statuses['qc'].append('moderate library complexity')
            if is_compliant_library_bottlenecking(page) is False:
                statuses['qc'].append('severe/moderate library bottlenecking')
            if is_not_mild_library_bottlenecking(page) is False:
                statuses['qc'].append('mild library bottlenecking')
            # When controls are present and matched, also audit each control's
            # QC; controls outside the ENCODE3 result set are flagged.
            if is_not_missing_controls(page) is True and \
               is_not_mismatched_control(page) is True:
                not_encode_3_flag = False
                for entry in obj['possible_controls']:
                    control_accession = entry['accession']
                    if control_accession in histone_controls_dict:
                        control_page = histone_controls_dict[
                            control_accession]['page']
                        if is_sufficient_read_depth(control_page) is False:
                            statuses['control'].append('insufficient read '
                                                       'depth in control')
                        if is_not_low_read_depth(control_page) is False:
                            statuses['control'].append('low read '
                                                       'depth in control')
                        if is_compliant_library_complexity(
                                control_page) is False:
                            statuses['control'].append('insufficient/poor '
                                                       'library '
                                                       'complexity in control')
                        if is_not_moderate_library_complexity(
                                control_page) is False:
                            statuses['control'].append('moderate library '
                                                       'complexity in control')
                        if is_compliant_library_bottlenecking(
                                control_page) is False:
                            statuses['control'].append(
                                'severe/moderate library '
                                'bottlenecking in control')
                        if is_not_mild_library_bottlenecking(
                                control_page) is False:
                            statuses['control'].append(
                                'mild library '
                                'bottlenecking in control')
                    else:
                        not_encode_3_flag = True
                if (not_encode_3_flag is True):
                    statuses['control'].append('non ENCODE3 control')
            histone_experiments_dict[ac]['statuses'] = statuses
            # Summarize fastq read length / run type per (bio, tech) replicate.
            rep_dict = {}
            for file_id in obj['original_files']:
                file_object = encodedcc.get_ENCODE(
                    file_id.split('/')[2], connection, 'embedded')
                if file_object['status'] in FILE_IGNORE_STATUS:
                    continue
                if file_object['file_format'] == 'fastq':
                    if 'replicate' in file_object:
                        bio_rep_number = file_object['replicate'][
                            'biological_replicate_number']
                        tec_rep_number = file_object['replicate'][
                            'technical_replicate_number']
                        # NOTE(review): rebinding `key` here shadows the
                        # ENC_Key bound at the top of main(); harmless as the
                        # original key is no longer used, but confusing.
                        key = (bio_rep_number, tec_rep_number)
                        if key not in rep_dict:
                            rep_dict[key] = set()
                        if 'read_length' in file_object and 'run_type' in file_object:
                            if file_object['run_type'] == 'single-ended':
                                record_val = str(
                                    file_object['read_length']) + 'SE'
                            else:
                                record_val = str(
                                    file_object['read_length']) + 'PE'
                            rep_dict[key].add(record_val)
            seq_info_string = ''
            for k in sorted(rep_dict.keys()):
                reps_string = ''
                for member in rep_dict[k]:
                    reps_string += member + ', '
                # [:-2] trims the trailing ", "; '\r' separates replicates
                # inside one spreadsheet cell.
                seq_info_string += 'REP' + str(k[0]) + '.' + str(
                    k[1]) + ' ' + reps_string[:-2] + '\r'
            histone_experiments_dict[ac]['seq_info'] = seq_info_string

    # Pass 2: same audits (minus antibody/control checks) for the controls.
    mone = 0
    for ac in histone_controls_dict:
        mone += 1
        page = histone_controls_dict[ac]['page']
        obj = histone_controls_dict[ac]['object']
        if is_interesting(obj):
            if mone % 10 == 0:
                print('processed ' + str(mone) + ' out of ' +
                      str(len(histone_controls_dict.keys())))
            statuses = {'replication': [], 'files': [], 'qc': []}
            if is_replicated(obj) is False or is_replicated(page) is False:
                statuses['replication'].append('unreplicated')
            if is_not_missing_paired_with(page) is False:
                statuses['files'].append('missing paired_with files')
            if is_sufficient_read_depth(page) is False:
                statuses['qc'].append('insufficient read depth')
            if is_not_low_read_depth(page) is False:
                statuses['qc'].append('low read depth')
            if is_compliant_library_complexity(page) is False:
                statuses['qc'].append('insufficient/poor library complexity')
            if is_not_moderate_library_complexity(page) is False:
                statuses['qc'].append('moderate library complexity')
            if is_compliant_library_bottlenecking(page) is False:
                statuses['qc'].append('severe/moderate library bottlenecking')
            if is_not_mild_library_bottlenecking(page) is False:
                statuses['qc'].append('mild library bottlenecking')
            histone_controls_dict[ac]['statuses'] = statuses
            rep_dict = {}
            for file_id in obj['original_files']:
                file_object = encodedcc.get_ENCODE(
                    file_id.split('/')[2], connection, 'embedded')
                if file_object['status'] in FILE_IGNORE_STATUS:
                    continue
                if file_object['file_format'] == 'fastq':
                    if 'replicate' in file_object:
                        bio_rep_number = file_object['replicate'][
                            'biological_replicate_number']
                        tec_rep_number = file_object['replicate'][
                            'technical_replicate_number']
                        key = (bio_rep_number, tec_rep_number)
                        if key not in rep_dict:
                            rep_dict[key] = set()
                        if 'read_length' in file_object and 'run_type' in file_object:
                            if file_object['run_type'] == 'single-ended':
                                record_val = str(file_object['read_length']) + 'SE'
                            else:
                                record_val = str(file_object['read_length']) + 'PE'
                            rep_dict[key].add(record_val)
            seq_info_string = ''
            for k in sorted(rep_dict.keys()):
                reps_string = ''
                for member in rep_dict[k]:
                    reps_string += member + ', '
                seq_info_string += 'REP' + str(k[0]) + '.' + str(
                    k[1]) + ' ' + reps_string[:-2] + '\r'
            histone_controls_dict[ac]['seq_info'] = seq_info_string

    # Column order: 'control' first; for histone targets the CORE_MARKS come
    # next, then any remaining marks.
    if args.target == "histone":
        marks_to_print = ['control']
        marks_to_print.extend(CORE_MARKS)
        for m in marks:
            if m not in CORE_MARKS and m != 'control':
                marks_to_print.append(m)
    else:
        marks_to_print = ['control']
        for m in marks:
            if m != 'control':
                marks_to_print.append(m)
    # Output 1: audit matrix — each cell lists accession, status and the
    # collected QC findings ('\r' keeps them on separate lines in one cell).
    with open(args.audit_matrix, 'w') as output:
        fields = ['sample'] + marks_to_print
        writer = csv.DictWriter(output, fieldnames=fields)
        writer.writeheader()
        for sample in sample_types:
            row = {'sample': sample}
            for mark in marks_to_print:
                if mark != 'control':
                    if sample in matrix[mark]:
                        total = len(matrix[mark][sample])
                        accessionStatuses = {}
                        aliases = {}
                        for (acc, al) in matrix[mark][sample]:
                            aliases[acc] = al
                            accessionStatuses[acc] = []
                            statuses = histone_experiments_dict[acc][
                                'statuses']
                            for k in statuses:
                                if len(statuses[k]) > 0:
                                    statuses_string = ''
                                    for status in statuses[k]:
                                        statuses_string += '-' + status + '\r'
                                    accessionStatuses[acc].append(
                                        statuses_string)
                        cell_info = ''
                        for acc in accessionStatuses:
                            if len(accessionStatuses[acc]) < 1:
                                cell_info += acc + ' ' + histone_experiments_dict[acc]['object']['status'] + \
                                    '\r' + str(aliases[acc])
                            else:
                                statuses_string = ''
                                for status in accessionStatuses[acc]:
                                    statuses_string += status
                                cell_info += acc + ' ' + histone_experiments_dict[acc]['object']['status'] + \
                                    '\r' + str(aliases[acc]) + '\r' + \
                                    statuses_string
                            cell_info += '\r\n'
                        row.update({
                            mark:
                            'Experiments number : ' + str(total) + '\r' +
                            cell_info
                        })
                    else:
                        row.update({mark: 'NONE'})
                else:
                    # 'control' column draws from the controls data instead.
                    if sample in control_matrix[mark]:
                        total = len(control_matrix[mark][sample])
                        accessionStatuses = {}
                        aliases = {}
                        for (acc, al) in control_matrix[mark][sample]:
                            aliases[acc] = al
                            accessionStatuses[acc] = []
                            statuses = histone_controls_dict[acc]['statuses']
                            for k in statuses:
                                if len(statuses[k]) > 0:
                                    statuses_string = ''
                                    for status in statuses[k]:
                                        statuses_string += '-' + status + '\r'
                                    accessionStatuses[acc].append(
                                        statuses_string)
                        cell_info = ''
                        for acc in accessionStatuses:
                            if len(accessionStatuses[acc]) < 1:
                                cell_info += acc + ' ' + histone_controls_dict[acc]['object']['status'] + \
                                    '\r' + str(aliases[acc])
                            else:
                                statuses_string = ''
                                for status in accessionStatuses[acc]:
                                    statuses_string += status
                                cell_info += acc + ' ' + histone_controls_dict[acc]['object']['status'] + \
                                    '\r' + str(aliases[acc]) + '\r' + \
                                    statuses_string
                            cell_info += '\r\n'
                        row.update({
                            mark:
                            'Experiments number : ' + str(total) + '\r' +
                            cell_info
                        })
                    else:
                        row.update({mark: 'NONE'})
            writer.writerow(row)
    # Output 2: run-type matrix — same layout, but cells carry the per-
    # replicate read-length/run-type summary (seq_info) instead of audits.
    with open(args.run_type_matrix, 'w') as output:
        fields = ['sample'] + marks_to_print
        writer = csv.DictWriter(output, fieldnames=fields)
        writer.writeheader()
        for sample in sample_types:
            row = {'sample': sample}
            for mark in marks_to_print:
                if mark != 'control':
                    if sample in matrix[mark]:
                        total = len(matrix[mark][sample])
                        accessionStatuses = {}
                        aliases = {}
                        for (acc, al) in matrix[mark][sample]:
                            aliases[acc] = al
                            accessionStatuses[acc] = []
                            statuses = histone_experiments_dict[acc][
                                'statuses']
                            for k in statuses:
                                if len(statuses[k]) > 0:
                                    statuses_string = ''
                                    for status in statuses[k]:
                                        statuses_string += '-' + status + '\r'
                                    accessionStatuses[acc].append(
                                        statuses_string)
                        cell_info = ''
                        for acc in accessionStatuses:
                            cell_info += acc + ' ' + \
                                histone_experiments_dict[acc]['object']['status'] + \
                                '\r' + str(aliases[acc]) + \
                                '\r' + \
                                histone_experiments_dict[acc]['seq_info']
                            cell_info += '\r\n'
                        row.update({
                            mark:
                            'Experiments number : ' + str(total) + '\r' +
                            cell_info
                        })
                    else:
                        row.update({mark: 'NONE'})
                else:
                    if sample in control_matrix[mark]:
                        total = len(control_matrix[mark][sample])
                        accessionStatuses = {}
                        aliases = {}
                        for (acc, al) in control_matrix[mark][sample]:
                            aliases[acc] = al
                            accessionStatuses[acc] = []
                            statuses = histone_controls_dict[acc]['statuses']
                            for k in statuses:
                                if len(statuses[k]) > 0:
                                    statuses_string = ''
                                    for status in statuses[k]:
                                        statuses_string += '-' + status + '\r'
                                    accessionStatuses[acc].append(
                                        statuses_string)
                        cell_info = ''
                        for acc in accessionStatuses:
                            cell_info += acc + ' ' + histone_controls_dict[acc]['object']['status'] + \
                                '\r' + str(aliases[acc]) + '\r' + \
                                histone_controls_dict[acc]['seq_info']
                            cell_info += '\r\n'
                        row.update({
                            mark:
                            'Experiments number : ' + str(total) + '\r' +
                            cell_info
                        })
                    else:
                        row.update({mark: 'NONE'})
            writer.writerow(row)