def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument(
        '-s', '--server',
        required=True,
        choices=['www.encodeproject.org', 'test.encodedcc.org'],
        help='DCC Server to upload to')
    parser.add_argument('-m', '--metadata', required=True,
                        help='Metadata spreadsheet to use')
    parser.add_argument('-f', '--flowcell-details', required=True,
                        help='Flowcell metadata details')
    parser.add_argument('-n', '--dry-run', action='store_true', default=False)
    args = parser.parse_args(cmdline)

    logging.basicConfig(level=logging.INFO)
    logging.info('Server: %s', args.server)
    logging.info('Sheetname: %s', args.metadata)

    server = ENCODED(args.server)
    server.load_netrc()

    book = ODFReader(args.metadata)
    process_fastqs(server, book, args.flowcell_details, args.dry_run)
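# Hypothetical invocation sketch (the script and spreadsheet names below are
# illustrative assumptions, not taken from the source): a dry run against the
# test server might look like
#
#   python submit_fastqs.py \
#       --server test.encodedcc.org \
#       --metadata metadata.ods \
#       --flowcell-details flowcell-details.ods \
#       --dry-run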
def encoded_experiment_loader(query_url, experiments=None):
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    if experiments is None:
        experiments = {}

    query = server.get_json(query_url)
    tzero = time.monotonic()
    tnow = tzero
    tprev = tzero
    # report progress roughly every tenth of the result set
    progress = len(query['@graph']) // 10
    for i, record in enumerate(query['@graph']):
        accession = record['@id'][len('/experiments/'):-1]
        if accession not in experiments:
            experiments[accession] = server.get_json(record['@id'])
        if progress != 0 and (i + 1) % progress == 0:
            tnow = time.monotonic()
            print("Reading {} of {} records in {} seconds".format(
                (i + 1), len(query['@graph']), tnow - tprev))
            tprev = tnow

    print("Read {} records in {} seconds".format(
        len(query['@graph']), tnow - tzero))
    return experiments
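# Hedged usage sketch, not part of the original module: the search URL below is
# an illustrative assumption; any ENCODE search query whose JSON response lists
# experiment records under '@graph' should work with encoded_experiment_loader.
if __name__ == '__main__':
    cache = encoded_experiment_loader(
        '/search/?type=Experiment&lab.title=Barbara+Wold%2C+Caltech&limit=all')
    print('cached', len(cache), 'experiments')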
def setUp(self):
    self.encode = ENCODED('www.encodeproject.org')
    self.encode._user = {
        '@context': '/terms/',
        '@id': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
        '@type': ['User', 'Item'],
        'first_name': 'Diane',
        'groups': [],
        'job_title': 'Submitter',
        'lab': {
            '@id': '/labs/barbara-wold/',
            '@type': ['Lab', 'Item'],
            'country': 'USA',
            'institute_label': 'Caltech',
            'institute_name': 'California Institute of Technology',
            'pi': '/users/0598c868-0b4a-4c5b-9112-8f85c5de5374/',
            'schema_version': '4',
            'title': 'Barbara Wold, Caltech',
            'uuid': '72d5666a-a361-4f7b-ab66-a88e11280937'
        },
        'last_name': 'Trout',
        'schema_version': '5',
        'submits_for': ['/labs/barbara-wold/',
                        '/labs/richard-myers/',
                        '/labs/ali-mortazavi/'],
        'uuid': 'bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a',
    }
    logging.disable(logging.WARNING)

    self.validator = DCCValidator(self.encode)
    for schema, filename in [('library', 'library.json'),
                             ('biosample', 'biosample.json')]:
        schema_file = os.path.join(os.path.dirname(__file__), filename)
        with open(schema_file, 'rt') as instream:
            self.validator._schemas[schema] = json.load(instream)
def main():
    data_id = 'ENCSR574CRQ'
    #test_id = 'TSTSR910688'
    server = ENCODED('www.encodeproject.org')
    #server = ENCODED('test.encodedcc.org')
    server.load_netrc()

    experiments = {}
    for e in load_experiments_from_ods(
            'C1-encode3-limb-tranche1-resubmit.ods'):
        experiments['/experiments/{}/'.format(e)] = None
    for e in load_experiments_from_ods(
            'C1-mouse-forlimb-submission-201804.ods'):
        experiments['/experiments/{}/'.format(e)] = None

    df = pandas.read_excel(
        'Mouse embryo samples list library numbers Diane August 21 2017.xlsx',
        sheet_name='Sheet 1',
        usecols=[0, 1, 2, 3],
        dtype={2: str, 3: str})
    df = df.dropna()
    print(df)

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'

    # load bulk list
    for l in df['Library number']:
        graph = server.search_jsonld(searchTerm='barbara-wold:{}'.format(l))
        for result in graph['@graph']:
            if 'Experiment' in result['@type']:
                experiment = server.get_json(result['@id'])
                if experiment['status'] == 'released':
                    experiments[result['@id']] = None

    pub_id = '/publications/e0d01543-9965-4edb-933c-778a40575cd9/'
    pub = server.get_json(pub_id)
    dataset = pub.get('datasets', [])
    dataset = [x['@id'] for x in dataset]
    #for d in dataset:
    #    if d in experiments:
    #        del experiments[d]
    experiments = list(experiments)
    print('posted', len(dataset))
    print('update', len(experiments))
    pprint(list(experiments))
    print(type(experiments))
    print(type(dataset))
    print(set(experiments).difference(set(dataset)))
    print(server.patch_json(pub_id, {'datasets': experiments}))
def main():
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    to_include = [x.replace('_mm10', '') for x in generate_to_include()]
    found = []
    for lib in to_include[1:]:
        alias = 'barbara-wold:' + lib
        try:
            obj = server.get_json(alias)
        except HTTPError as e:
            print(lib, 'not found')
        else:
            found.append((lib, obj['@id']))

    df = pandas.DataFrame(found, columns=['library_id', 'id'])
    print(df.head())
    df.to_csv('library_id_to_accession.tsv', sep='\t', index=False)
def main():
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()
    validator = DCCValidator(server)

    table = pandas.read_csv('publication_files.tsv', sep='\t', skipfooter=1)

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'
    payload = {
        'award': award,
        'lab': lab,
        'related_files': list(table['file']),
        'references': ['/publications/e0d01543-9965-4edb-933c-778a40575cd9/'],
    }
    #validator.validate(payload, 'publication_data')
    print(server.post_json('/publication-data/', payload))
def main():
    data_id = 'ENCSR574CRQ'
    #test_id = 'TSTSR910688'
    #libraries = ['17298', '17299', '15019', '15020', '16930',
    #             '16931', '16110', '16111', '15084', '15085',
    #             '16134', '16135',]
    server = ENCODED('www.encodeproject.org')
    #server = ENCODED('test.encodedcc.org')
    server.load_netrc()

    df = pandas.read_excel(
        'Mouse embryo samples list library numbers Diane August 21 2017.xlsx',
        sheet_name='Sheet 1',
        usecols=[0, 1, 2, 3],
        dtype={2: str, 3: str})
    df = df.dropna()
    print(df)

    award = '/awards/UM1HG009443/'
    lab = '/labs/barbara-wold/'

    files = {}
    for l in df['Library number']:
        graph = server.search_jsonld(searchTerm='barbara-wold:{}'.format(l))
        for result in graph['@graph']:
            if 'Experiment' in result['@type']:
                experiment = server.get_json(result['@id'])
                print(result['@id'], l)
                for f in experiment['files']:
                    assembly = f.get('assembly')
                    genome_annotation = f.get('genome_annotation')
                    if assembly is None and genome_annotation is None:
                        files[f['@id']] = f.get('submitted_file_name')
                        print(f['@id'], f.get('submitted_file_name'))
                    elif assembly == 'mm10' and genome_annotation == 'M4':
                        files[f['@id']] = f.get('submitted_file_name')
                        print(f['@id'], f.get('submitted_file_name'))
                    else:
                        print('  ', assembly, genome_annotation)

    payload = {
        'award': award,
        'lab': lab,
        'related_files': list(files.keys()),
        'references': ['/publications/e0d01543-9965-4edb-933c-778a40575cd9/'],
    }
    #pprint(payload)
    #if data_id is None:
    #    result = server.post_json('/publication-data/', payload)
    #    print(result['@id'])
    results = server.patch_json(data_id, {'related_files': list(files)})
    print(results)
    print(results.get('@id'))
def main(cmdline=None):
    parser = ArgumentParser()
    parser.add_argument('-s', '--sheet', default=0, help='Sheet to use')
    parser.add_argument('--header', default=None, help='header row')
    parser.add_argument('filename', nargs=1, help='spreadsheet to look at')
    args = parser.parse_args(cmdline)

    header = int(args.header) if args.header is not None else None
    book = ODFReader(args.filename[0])
    data = book.parse(args.sheet, header=header)

    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    first_experiments = models.load_experiments(
        to_files(paper_433_experiment_files))
    all_experiments = models.load_experiments(
        to_files(ASOF_RUN17_experiment_files))

    first_libraries = set(parse_replicates(first_experiments['replicates']))
    all_libraries = set(parse_replicates(all_experiments['replicates']))
    #print(first_libraries)
    #print(all_libraries)

    results = []
    for i, library_id in enumerate(data[data.columns[0]]):
        if library_id in first_libraries:
            tranche = 1
        elif library_id in all_libraries:
            tranche = 2
        else:
            tranche = 'C'
        row = find_library_info(server, library_id)
        row['tranche'] = tranche
        results.append(row)
        # print a progress dot every tenth library
        if (i + 1) % 10 == 0:
            print('.', end='', flush=True)

    df = pandas.DataFrame(results)
    df.to_csv('tranche.csv', index=False)
def main(cmdline=None):
    data_id = 'ENCSR574CRQ'
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    book = pandas.ExcelFile(
        'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx')
    files = book.parse('File')

    publication = server.get_json(data_id)
    publication_accessions = [
        x[len('/files/'):-1] for x in publication['files']
    ]

    print('submitted', len(files['accession']))
    print('posted', len(publication_accessions))
    print(
        'Intersect',
        len(
            set(files['accession'].values).intersection(
                publication_accessions)))
def main(cmdline=None):
    data_id = 'ENCSR226XLF'
    server = ENCODED('www.encodeproject.org')
    server.load_netrc()

    book_name = 'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx'
    book = pandas.ExcelFile(book_name)
    file_sheet = book.parse('File')
    files = ['/files/{}/'.format(x) for x in file_sheet['accession']]
    print('Have {} files to update with'.format(len(files)))

    publication = server.get_json(data_id)
    print(publication['description'])
    print('Publication has {} files'.format(len(publication['files'])))

    p = input("Replace with {} files from {}? ".format(len(files), book_name))
    if p.lower().startswith('y'):
        response = server.patch_json(data_id, {'related_files': files})
        print(response)
class TestEncoded(TestCase):
    def setUp(self):
        self.encode = ENCODED('www.encodeproject.org')
        self.encode._user = {
            '@context': '/terms/',
            '@id': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/',
            '@type': ['User', 'Item'],
            'first_name': 'Diane',
            'groups': [],
            'job_title': 'Submitter',
            'lab': {
                '@id': '/labs/barbara-wold/',
                '@type': ['Lab', 'Item'],
                'country': 'USA',
                'institute_label': 'Caltech',
                'institute_name': 'California Institute of Technology',
                'pi': '/users/0598c868-0b4a-4c5b-9112-8f85c5de5374/',
                'schema_version': '4',
                'title': 'Barbara Wold, Caltech',
                'uuid': '72d5666a-a361-4f7b-ab66-a88e11280937'
            },
            'last_name': 'Trout',
            'schema_version': '5',
            'submits_for': ['/labs/barbara-wold/',
                            '/labs/richard-myers/',
                            '/labs/ali-mortazavi/'],
            'uuid': 'bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a',
        }
        logging.disable(logging.WARNING)

        self.validator = DCCValidator(self.encode)
        for schema, filename in [('library', 'library.json'),
                                 ('biosample', 'biosample.json')]:
            schema_file = os.path.join(os.path.dirname(__file__), filename)
            with open(schema_file, 'rt') as instream:
                self.validator._schemas[schema] = json.load(instream)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_prepare_url(self):
        tests = [
            ('/experiments',
             'https://www.encodeproject.org/experiments'),
            ('/experiments/ENCLB045ZZZ',
             'https://www.encodeproject.org/experiments/ENCLB045ZZZ'),
            ('https://www.encodeproject.org/experiments/ENCLB045ZZZ',
             'https://www.encodeproject.org/experiments/ENCLB045ZZZ'),
            ('barbara-wold:11111',
             'https://www.encodeproject.org/barbara-wold:11111'),
        ]
        for url, result in tests:
            self.assertEqual(self.encode.prepare_url(url), result)

    def test_validate_library(self):
        """Test validation of a Library object
        """
        obj = {
            u'@id': u'/libraries/ENCLB045ZZZ/',
            u'@type': [u'Library', u'Item'],
            u'aliases': [],
            u'alternate_accessions': [],
            u'award': u'/awards/U54HG006998/',
            u'biosample': u'/biosamples/ENCBS089RNA/',
            u'date_created': u'2014-01-14T19:44:51.061770+00:00',
            u'documents': [],
            u'extraction_method': u'Ambion mirVana',
            u'fragmentation_method': u'chemical (Nextera tagmentation)',
            u'lab': u'/labs/barbara-wold/',
            u'library_size_selection_method': u'SPRI beads',
            u'lysis_method': u'Ambion mirVana',
            u'nucleic_acid_term_name': u'polyadenylated mRNA',
            u'size_range': u'>200',
            u'status': u'released',
            u'strand_specificity': False,
            u'treatments': [],
        }
        self.validator.validate(obj, 'library')

        # test requestMethod
        obj['schema_version'] = u'2'
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, obj, 'library')
        del obj['schema_version']

        # test calculatedProperty
        obj['nucleic_acid_term_name'] = u'SO:0000871'
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, obj, 'library')
        del obj['nucleic_acid_term_name']

        # test permissionValidator
        obj['uuid'] = u'42c46028-708f-4347-a3df-2c82dfb021c4'
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, obj, 'library')
        del obj['uuid']

    def test_validate_biosample(self):
        bio = {
            'aliases': ['barbara-wold:c1_e12.5_mouse_limb_donor'],
            'award': 'U54HG006998',
            'biosample_term_id': 'UBERON:0002101',
            'biosample_term_name': 'limb',
            'biosample_type': 'tissue',
            'date_obtained': '2017-02-01',
            'description': 'C57Bl6 wild-type embryonic mouse',
            'donor': '/mouse-donors/ENCDO956IXV/',
            'lab': '/labs/barbara-wold',
            'model_organism_age': '12.5',
            'model_organism_age_units': 'day',
            'mouse_life_stage': 'embryonic',
            'organism': '3413218c-3d86-498b-a0a2-9a406638e786',
            'source': '/sources/gems-caltech/',
            'starting_amount': 1,
            'starting_amount_units': 'items',
        }

        # test aliases
        bio['aliases'] = ['barbara-wold:c1-donor']
        self.validator.validate(bio, 'biosample')
        del bio['aliases']

        # test part_of
        bio['part_of'] = 'barbara-wold:c1-donor'
        self.validator.validate(bio, 'biosample')
        del bio['part_of']

        # test linkTo
        self.validator.validate(bio, 'biosample')
        bio['organism'] = '/organisms/mouse/'

        bio['lab'] = '/labs/alkes-price/'
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, bio, 'biosample')
        bio['lab'] = '/labs/barbara-wold'

        bio['organism'] = "7745b647-ff15-4ff3-9ced-b897d4e2983c"
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, bio, 'biosample')

        bio['organism'] = "/organisms/human"
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, bio, 'biosample')
        bio['organism'] = '/organisms/mouse/'

    def test_aliases(self):
        """Test that objects being validated can access previous ones by alias

        Some properties can require that an object exist; since we validate
        before submitting, we need to be able to cache an object by its alias
        and use that to retrieve it.
        """
        donor = {
            'aliases': ['barbara-wold:c1_e12.5_mouse_limb_donor'],
            'award': 'U54HG006998',
            'biosample_term_id': 'UBERON:0002101',
            'biosample_term_name': 'limb',
            'biosample_type': 'tissue',
            'date_obtained': '2017-02-01',
            'description': 'C57Bl6 wild-type embryonic mouse',
            'donor': '/mouse-donors/ENCDO956IXV/',
            'lab': '/labs/barbara-wold',
            'model_organism_age': '12.5',
            'model_organism_age_units': 'day',
            'mouse_life_stage': 'embryonic',
            'organism': '3413218c-3d86-498b-a0a2-9a406638e786',
            'source': '/sources/gems-caltech/',
            'starting_amount': 1,
            'starting_amount_units': 'items',
        }
        part = donor.copy()
        part['aliases'] = ['barbara-wold:A7']
        part['part_of'] = 'barbara-wold:c1_e12.5_mouse_limb_donor'

        # the part references a donor alias that has not been validated yet
        self.assertRaises(jsonschema.ValidationError,
                          self.validator.validate, part, 'biosample')

        self.validator.validate(donor, 'biosample')
        self.validator.validate(part, 'biosample')

    def test_create_context(self):
        linked_id = {'@type': '@id'}
        library = {'@id': '/libraries/1234', '@type': ['Library', 'Item']}

        url = self.encode.prepare_url(library['@id'])
        context = self.encode.create_jsonld_context(library, url)
        self.assertEqual(context['@vocab'],
                         'https://www.encodeproject.org/profiles/library.json#')
        self.assertEqual(context['award'], linked_id)
        self._verify_context(context, 'Library')
        # namespaces not added yet.
        self.assertRaises(AssertionError, self._verify_namespaces, context)
        self.encode.add_jsonld_namespaces(context)
        self._verify_namespaces(context)

    def test_add_context(self):
        """Checking to make sure nested @base and @vocab urls are set correctly
        """
        obj = {
            "nucleic_acid_term_name": "RNA",
            "accession": "ENCLB044ZZZ",
            "@id": "/libraries/ENCLB044ZZZ/",
            "schema_version": "1",
            "@type": [
                "Library",
                "Item"
            ],
            "lysis_method": "Ambion mirVana",
            "nucleic_acid_term_id": "SO:0000356",
            "biosample": {
                "biosample_term_name": "GM12878",
                "description": "B-lymphocyte, lymphoblastoid, International HapMap Project - CEPH/Utah - European Caucasion, Epstein-Barr Virus",
                "accession": "ENCBS090RNA",
                "date_created": "2013-10-29T21:15:29.144260+00:00",
                "@id": "/biosamples/ENCBS090RNA/",
                "aliases": [
                    "brenton-graveley:GM12878-2",
                    "thomas-gingeras:191WC"
                ],
                "organism": "/organisms/human/",
                "@type": [
                    "Biosample",
                    "Item"
                ]
            },
        }

        bio_base = self.encode.prepare_url(obj['biosample']['@id'])

        url = self.encode.prepare_url('/libraries/ENCLB044ZZZ/?format=json&embed=False')
        obj_type = self.encode.get_object_type(obj)
        schema_url = self.encode.get_schema_url(obj_type)
        self.encode.add_jsonld_context(obj, url)

        self.assertEqual(obj['biosample']['@context']['@base'], bio_base)
        self.assertEqual(obj['@context']['@vocab'], schema_url)
        self._verify_context(obj['@context'], 'Library')
        self._verify_namespaces(obj['@context'])
        self._verify_context(obj['biosample']['@context'], 'Biosample')
        self.assertEqual(obj['@context']['rdf'],
                         'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
        self.assertEqual(obj['@context']['OBO'],
                         'http://purl.obolibrary.org/obo/')

    def test_convert_search_to_jsonld(self):
        example = {
            'count': {'biosamples': 2},
            'portal_title': 'ENCODE',
            'title': 'Search',
            'notification': 'Success',
            'filters': [],
            '@id': '/search/?searchTerm=wold',
            '@type': ['search'],
            'facets': [],
            '@graph': [
                {u'@id': u'/biosamples/ENCBS125ENC/',
                 u'@type': [u'Biosample', u'Item'],
                 u'accession': u'ENCBS125ENC',
                 u'award.rfa': u'ENCODE2-Mouse',
                 u'biosample_term_name': u'myocyte',
                 u'biosample_type': u'in vitro differentiated cells',
                 u'characterizations.length': [],
                 u'constructs.length': [],
                 u'lab.title': u'Barbara Wold, Caltech',
                 u'life_stage': u'unknown',
                 u'organism.name': u'mouse',
                 u'source.title': u'Barbara Wold',
                 u'status': u'CURRENT',
                 u'treatments.length': []},
                {u'@id': u'/biosamples/ENCBS126ENC/',
                 u'@type': [u'Biosample', u'Item'],
                 u'accession': u'ENCBS126ENC',
                 u'award.rfa': u'ENCODE2-Mouse',
                 u'biosample_term_name': u'myocyte',
                 u'biosample_type': u'in vitro differentiated cells',
                 u'characterizations.length': [],
                 u'constructs.length': [],
                 u'lab.title': u'Barbara Wold, Caltech',
                 u'life_stage': u'unknown',
                 u'organism.name': u'mouse',
                 u'source.title': u'Barbara Wold',
                 u'status': u'CURRENT',
                 u'treatments.length': []},
            ]}

        result = self.encode.convert_search_to_jsonld(example)
        for obj in result['@graph']:
            self.assertNotIn('award.rfa', obj)

    def _verify_context(self, context, obj_type):
        for context_key in [None, obj_type]:
            for k in ENCODED_CONTEXT[context_key]:
                self.assertIn(k, context)
                self.assertEqual(ENCODED_CONTEXT[context_key][k], context[k])

    def _verify_namespaces(self, context):
        for k in ENCODED_NAMESPACES:
            self.assertIn(k, context)
            self.assertEqual(ENCODED_NAMESPACES[k], context[k])
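# Optional entry point, an addition rather than part of the original test file:
# allows running this module directly with the standard unittest runner.
if __name__ == '__main__':
    import unittest
    unittest.main()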