# follow instructions here to enable API & generate credentials
# https://www.twilio.com/blog/2017/02/an-easy-way-to-read-and-write-to-a-google-spreadsheet-in-python.html
creds = ServiceAccountCredentials.from_json_keyfile_name(
    args.creds, 'https://www.googleapis.com/auth/drive')
client = gspread.authorize(creds)
sheet = client.open_by_key(args.sheet)

# drop any existing tab with this schema's name before recreating it
for tab in sheet.worksheets():
    if tab.title == schema_name:
        sheet.del_worksheet(tab)
tab = sheet.add_worksheet(title=schema_name, rows='100', cols='52')

abcs = string.ascii_uppercase
cell_grid = list(abcs) + ['A' + i for i in abcs]

connection = lattice.Connection(args.mode)
server = connection.server

# grab the OntologyTerm term_name & term_id schemas to put in places that linkTo OntologyTerm
ont_schema_url = urljoin(server, 'profiles/ontology_term/?format=json')
ont_schema = requests.get(ont_schema_url).json()
term_id_props = ont_schema['properties']['term_id']
term_name_props = ont_schema['properties']['term_name']

# grab all of the submittable properties
props = {}
schema_url = urljoin(server, 'profiles/' + schema_name + '/?format=json')
schema = requests.get(schema_url).json()
for p in schema['properties'].keys():
    props[p] = schema['properties'][p]
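# --- Illustrative sketch only (not part of the original script) --------------
# One way the collected `props` could feed the new worksheet: write each
# submittable property name into row 1, one column per property. The layout
# the full script actually produces is assumed here, not shown above.
#
# for col, prop_name in enumerate(props.keys(), start=1):
#     tab.update_cell(1, col, prop_name)  # gspread: (row, column, value)
# ------------------------------------------------------------------------------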
def main():
    summary_report = []
    args = getArgs()
    if not args.mode:
        sys.exit('ERROR: --mode is required')
    connection = lattice.Connection(args.mode)
    server = connection.server
    print('Running on {server}'.format(server=server))
    if not os.path.isfile(args.infile):
        sys.exit('ERROR: file {filename} not found!'.format(filename=args.infile))
    book = load_workbook(args.infile)
    names = {}
    if args.justtype:
        if args.starttype:
            sys.exit('ERROR: cannot specify both --justtype and --starttype')
        else:
            for sheet in book.sheetnames:
                if sheet.lower().replace('_', '') == args.justtype.lower().replace('_', ''):
                    names[sheet.lower().replace('_', '')] = sheet
    else:
        for sheet in book.sheetnames:
            names[sheet.lower().replace('_', '')] = sheet

    # get accepted object types
    profiles = requests.get(server + 'profiles/?format=json').json()
    supported_collections = [s.lower() for s in list(profiles.keys())]
    supported_collections.append('cover sheet')
    # check that each sheet name corresponds to an object type
    for n in names.keys():
        if n not in supported_collections:
            print('ERROR: Sheet name {name} not part of supported object types!'.format(name=n),
                  file=sys.stderr)

    ont_schema_url = urljoin(server, 'profiles/ontology_term/?format=json')
    ont_term_schema = requests.get(ont_schema_url).json()['properties']
    ontology_props = []
    for p in ont_term_schema.keys():
        if not str(ont_term_schema[p].get('comment')).startswith('Do not submit') \
                and ont_term_schema[p].get('notSubmittable') != True:
            ontology_props.append(p)

    # pull in the order used to load test inserts on a local instance
    load_order = ORDER
    if args.starttype:
        st_index = load_order.index(args.starttype)
        load_order = load_order[st_index:]

    all_posts = {}
    # go in order to try and get objects posted before they are referenced by another object
    for schema_to_load in load_order:
        obj_type = schema_to_load.replace('_', '')
        if obj_type in names.keys():
            obj_posts = []
            row_count, rows = reader(book, names[obj_type])
            # remove all columns that do not have any values submitted
            if not args.remove:
                index_to_remove = []
                for i in range(0, len(rows[0])):
                    values = [row[i] for row in rows[1:]]
                    if set(values) == {''}:
                        index_to_remove.append(i)
                index_to_remove.reverse()
                for index in index_to_remove:
                    for row in rows:
                        del row[index]
            headers = rows.pop(0)
            schema_url = urljoin(server, 'profiles/' + schema_to_load + '/?format=json')
            schema_properties = requests.get(schema_url).json()['properties']
            invalid_flag = properties_validator(headers, schema_to_load,
                                                schema_properties, ontology_props)
            if invalid_flag == True:
                print('{}: invalid schema, check the headers'.format(obj_type))
                summary_report.append('{}: invalid schema, check the headers'.format(obj_type))
                continue
            for row in rows:
                row_count += 1
                post_json = dict(zip(headers, row))
                # convert values to the type specified in the schema, including embedded json objects
                if not args.remove:
                    post_json, post_ont = dict_patcher(post_json, schema_properties, ont_term_schema)
                    for k, v in post_ont.items():
                        all_posts.setdefault('ontology_term', []).append((obj_type + '.' + k, v))
                # add attachments here
                if post_json.get('attachment'):
                    attach = attachment(post_json['attachment'])
                    post_json['attachment'] = attach
                obj_posts.append((row_count, post_json))
            all_posts[schema_to_load] = obj_posts

    # --patchall assumes existing objects should be patched without prompting
    if args.patchall:
        patch_req = True
    else:
        patch_req = False

    # go in order to try and get objects posted before they are referenced by another object
    for schema in load_order:
        if all_posts.get(schema):
            total = 0
            error = 0
            success = 0
            patch = 0
            new_accessions_aliases = []
            failed_postings = []
            for row_count, post_json in all_posts[schema]:
                total += 1
                # check for an existing object based on any possible identifier
                temp_id, temp = check_existing_obj(post_json, schema, connection)
                if temp.get('uuid'):  # if there is an existing corresponding object
                    if schema == 'ontology_term':
                        ont_mismatch = False
                        ont_patch = False
                        for k in post_json.keys():
                            if temp.get(k) and post_json[k] != temp.get(k):
                                print('ERROR: {}:{} {} of {} does not match existing {}'.format(
                                    row_count, k, post_json[k], post_json['term_id'], temp.get(k)))
                                ont_mismatch = True
                            elif not temp.get(k):
                                ont_patch = True
                        if ont_mismatch == False and ont_patch == True:
                            print(schema.upper() + ' ' + str(row_count) +
                                  ':Object {} already exists. Would you like to patch it instead?'.format(
                                      post_json['term_id']))
                            i = input('PATCH? y/n: ')
                            if i.lower() == 'y':
                                patch_req = True
                        elif ont_mismatch == True:
                            print('OntologyTerm {} will not be updated'.format(post_json['term_id']))
                            i = input('EXIT SUBMISSION? y/n: ')
                            if i.lower() == 'y':
                                sys.exit('{sheet}: {success} posted, {patch} patched, {error} errors out of {total} total'.format(
                                    sheet=schema.upper(), success=success, total=total, error=error, patch=patch))
                    elif patch_req == False:
                        # patch wasn't specified, see if the user wants to patch
                        print(schema.upper() + ' ROW ' + str(row_count) +
                              ':Object {} already exists. Would you like to patch it instead?'.format(temp_id))
                        i = input('PATCH? y/n: ')
                        if i.lower() == 'y':
                            patch_req = True
                    if patch_req == True and args.remove:
                        existing_json = lattice.get_object(temp['uuid'], connection, frame="edit")
                        for k in post_json.keys():
                            if k not in ['uuid', 'accession', 'alias', '@id']:
                                if k not in existing_json.keys():
                                    print('Cannot remove {}, may be calculated property, or is not submitted'.format(k))
                                else:
                                    existing_json.pop(k)
                                    print('Removing value:', k)
                        if args.update:
                            e = lattice.replace_object(temp['uuid'], connection, existing_json)
                            if e['status'] == 'error':
                                error += 1
                            elif e['status'] == 'success':
                                new_patched_object = e['@graph'][0]
                                # Print now and later
                                print(schema.upper() + ' ROW ' + str(row_count) +
                                      ':identifier: {}'.format(
                                          new_patched_object.get('accession', new_patched_object.get('uuid'))))
                                patch += 1
                    elif patch_req == True and args.update:
                        e = lattice.patch_object(temp['uuid'], connection, post_json)
                        if e['status'] == 'error':
                            error += 1
                        elif e['status'] == 'success':
                            new_patched_object = e['@graph'][0]
                            # Print now and later
                            print(schema.upper() + ' ROW ' + str(row_count) +
                                  ':identifier: {}'.format(
                                      new_patched_object.get('accession', new_patched_object.get('uuid'))))
                            patch += 1
                else:  # we have new object to post
                    if args.patchall:
                        print(schema.upper() + ' ROW ' + str(row_count) +
                              ':Object not found. Check identifier or consider removing --patchall to post a new object')
                        error += 1
                    elif args.update:
                        print(schema.upper() + ' ROW ' + str(row_count) + ':POSTing data!')
                        e = lattice.post_object(schema, connection, post_json)
                        if e['status'] == 'error':
                            error += 1
                            failed_postings.append(
                                schema.upper() + ' ROW ' + str(row_count) + ':' +
                                str(post_json.get('aliases', 'alias not specified')))
                        elif e['status'] == 'success':
                            new_object = e['@graph'][0]
                            # Print now and later
                            print(schema.upper() + ' ROW ' + str(row_count) +
                                  ':New accession/UUID: {}'.format(
                                      new_object.get('accession', new_object.get('uuid'))))
                            new_accessions_aliases.append(
                                ('ROW ' + str(row_count),
                                 new_object.get('accession', new_object.get('uuid')),
                                 new_object.get('aliases', new_object.get('name'))))
                            success += 1
            # Print now and later
            print('{sheet}: {success} posted, {patch} patched, {error} errors out of {total} total'.format(
                sheet=schema.upper(), success=success, total=total, error=error, patch=patch))
            summary_report.append('{sheet}: {success} posted, {patch} patched, {error} errors out of {total} total'.format(
                sheet=schema.upper(), success=success, total=total, error=error, patch=patch))
            if new_accessions_aliases:
                print('New accessions/UUIDs and aliases:')
                for (row, accession, alias) in new_accessions_aliases:
                    if alias == None:
                        alias = 'alias not specified'
                    else:
                        alias = ', '.join(alias) if isinstance(alias, list) else alias
                    print(row, accession, alias)
            if failed_postings:
                print('Posting failed for {} object(s):'.format(len(failed_postings)))
                for alias in failed_postings:
                    print(', '.join(alias) if isinstance(alias, list) else alias)

    print('-------Summary of all objects-------')
    print('\n'.join(summary_report))
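# Example invocations (sketch only; the script name and flag spellings are
# assumed from the argument checks above, not confirmed by getArgs()):
#   python submit_metadata.py --mode local --infile metadata.xlsx --update
#   python submit_metadata.py --mode local --infile metadata.xlsx --justtype <object_type> --patchall --update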
def main():
    logging.basicConfig(filename='checkfiles.log', level=logging.INFO)
    logging.info('Started')
    args = getArgs()
    if (args.query or args.accessions) and not args.mode:
        sys.exit('ERROR: --mode is required with --query/--accessions')
    arg_count = 0
    for arg in [args.query, args.accessions, args.s3_file, args.ext_file]:
        if arg:
            arg_count += 1
    if arg_count != 1:
        sys.exit('ERROR: exactly one of --query, --accessions, --s3-file, --ext-file is required, {} given'.format(arg_count))
    if args.mode:
        connection = lattice.Connection(args.mode)
    else:
        connection = ''

    initiating_run = 'STARTING Checkfiles version {}'.format(checkfiles_version)
    logging.info(initiating_run)
    timestr = datetime.now().strftime('%Y_%m_%d-%H_%M_%S')
    report_out = 'report_{}.tsv'.format(timestr)
    logging.info('Writing results to {}'.format(report_out))
    report_headers = '\t'.join([
        'identifier',
        'uri',
        'errors',
        'results',
        'json_patch',
        'Lattice patched?',
        'S3 tag patched?',
        'download_time',
        'check_time',
        'content_md5sum_time'
    ])
    with open(report_out, 'w') as out:
        out.write(report_headers + '\n')

    jobs = fetch_files(report_out, connection, args.query, args.accessions,
                       args.s3_file, args.ext_file, args.file_format, args.include_validated)
    if jobs:
        logging.info('CHECKING {} files'.format(len(jobs)))
        for job in jobs:
            file_obj = job.get('item')
            logging.info('Starting {}'.format(file_obj.get('@id', 'File not in DB')))
            if file_obj.get('external_uri'):
                local_file, job = download_external(job)
            elif file_obj.get('file_format') == 'mex':
                local_file, job = download_s3_directory(job)
            else:
                local_file, job = download_s3_file(job)
            if os.path.exists(local_file):
                check_file(job)
                if not args.s3_file and not args.ext_file:
                    compare_with_db(job, connection)
                if job['results'].get('flowcell_details') and file_obj.get('derived_from'):
                    dets = job['results']['flowcell_details']
                    sorted_dets = sorted(dets, key=lambda k: (k.get('machine'), k.get('flowcell'), k.get('lane')))
                if job['post_json'] and not job['errors'] and args.update:
                    logging.info('PATCHING {}'.format(file_obj.get('accession')))
                    patch = lattice.patch_object(file_obj.get('accession'), connection, job['post_json'])
                    job['patch_result'] = patch['status']
                    if file_obj.get('s3_uri'):
                        set_s3_tags(job)
            out = open(report_out, 'a')
            out.write(report(job))
            out.close()
        finishing_run = 'FINISHED Checkfiles at {}'.format(datetime.now())
        logging.info(finishing_run)
    else:
        logging.info('FINISHED No files to check, see report*.tsv for details')
    logging.info('Results written to {}'.format(report_out))
    logging.info('Finished')
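# Example invocations (sketch only; script name and flag spellings are assumed
# from the checks above: exactly one of --query/--accessions/--s3-file/--ext-file,
# plus --mode whenever the database is queried; <accession>, <bucket>, and <key>
# are placeholders):
#   python checkfiles.py --mode local --accessions <accession> --update
#   python checkfiles.py --s3-file s3://<bucket>/<key>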