class BatchedSearchGenerator:

    SEARCH_PATH = '/search/'
    DEFAULT_PARAMS = [
        ('limit', 'all'),
    ]

    def __init__(self, request, batch_field='@id', batch_size=5000):
        self.request = request
        self.batch_field = batch_field
        self.batch_size = batch_size
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.batch_param_values = self.param_list.get(batch_field, []).copy()

    def _make_batched_values_from_batch_param_values(self):
        end = len(self.batch_param_values)
        for start in range(0, end, self.batch_size):
            yield self.batch_param_values[start:min(start + self.batch_size, end)]

    def _make_batched_params_from_batched_values(self, batched_values):
        return [
            (self.batch_field, batched_value)
            for batched_value in batched_values
        ]

    def _build_new_request(self, batched_params):
        self.query_string.drop('limit')
        self.query_string.drop(self.batch_field)
        self.query_string.extend(batched_params + self.DEFAULT_PARAMS)
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self.SEARCH_PATH
        request.registry = self.request.registry
        return request

    def results(self):
        # No batch values (e.g. no '@id' params in the request): fall back to
        # a single unbatched search.
        if not self.batch_param_values:
            yield from search_generator(self._build_new_request([]))['@graph']
        for batched_values in self._make_batched_values_from_batch_param_values():
            batched_params = self._make_batched_params_from_batched_values(batched_values)
            request = self._build_new_request(batched_params)
            yield from search_generator(request)['@graph']
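
# Hedged usage sketch (an assumption, not part of the module above): how a
# caller might stream search results through BatchedSearchGenerator. Assumes
# a Pyramid-style `request` carrying repeated '@id' params is already in
# hand; the helper name below is hypothetical.
def _example_stream_batched_results(request):
    generator = BatchedSearchGenerator(request, batch_field='@id', batch_size=5000)
    # Each yielded item is one '@graph' result dict from a batched /search/ call.
    for result in generator.results():
        print(result.get('@id'))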

def batch_download(context, request):
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
    ]
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    file_filters = qs.param_keys_to_list(
        params=qs.get_filters_by_condition(
            key_and_value_condition=lambda k, _: k.startswith('files.')
        )
    )
    # Process PublicationData batch downloads separately.
    type_param = param_list.get('type', [''])[0]
    if type_param and type_param.lower() == 'publicationdata':
        return _batch_download_publicationdata(request)
    file_fields = [
        ('field', k)
        for k in file_filters
    ]
    qs.drop('limit')
    cart_uuids = param_list.get('cart', [])
    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.'
        )
    if not type_param.lower() in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" not a valid type for metadata'.format(type_param)
        )
    # Check for the "visualizable" and/or "raw" options in the query string
    # for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')
    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(
            params=qs.get_key_filters(key='cart')
        )
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            try:
                request.embed(cart_uuid, '@@object')
            except KeyError:
                raise HTTPBadRequest(explanation='Specified cart does not exist.')
            # metadata.tsv link includes a cart UUID.
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string()
            )
        else:
            metadata_link = (
                '{host_url}/metadata/?{search_params} -X GET '
                '-H "Accept: text/tsv" -H "Content-Type: application/json" '
                '--data \'{{"elements": [{elements_json}]}}\''
            ).format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join(
                    '"{0}"'.format(element)
                    for element in elements
                )
            )
        # Because of the potential number of datasets in the cart, break the
        # search into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend(
                [('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]]
            )
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Make sure regular batch download doesn't include a cart parameter;
        # error if it does.
        if cart_uuids:
            raise HTTPBadRequest(
                explanation='You must download cart file manifests from the portal.'
            )
        # Regular batch download has a single simple call to request.embed.
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string()
        )
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']
    exp_files = (
        exp_file
        for exp in experiments
        for exp_file in exp.get('files', [])
    )
    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif visualizable_only and not is_file_visualizable(exp_file):
            continue
        elif raw_only and exp_file.get('assembly'):
            # "raw" option only allows files without an assembly.
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append(
            '{host_url}{href}'.format(
                host_url=request.host_url,
                href=exp_file['href'],
            )
        )
    return Response(
        content_type='text/plain',
        body='\n'.join(files),
        content_disposition='attachment; filename="%s"' % 'files.txt'
    )
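
# Hypothetical client-side sketch showing how the files.txt built above is
# typically consumed. The host URL is made up and the `requests` dependency
# is an assumption; the view itself only guarantees that the first line is
# the metadata.tsv link and the remaining lines are file download URLs.
def _example_consume_files_txt(host_url):
    import requests
    response = requests.get(host_url + '/batch_download/?type=Experiment')
    lines = response.text.splitlines()
    metadata_link, file_urls = lines[0], lines[1:]
    return metadata_link, file_urls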

def metadata_tsv(context, request):
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    if 'referrer' in param_list:
        search_path = '/{}/'.format(param_list.pop('referrer')[0])
    else:
        search_path = '/search/'
    type_param = param_list.get('type', [''])[0]
    cart_uuids = param_list.get('cart', [])
    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.'
        )
    if not type_param.lower() in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" not a valid type for metadata'.format(type_param)
        )
    # Handle special-case metadata.tsv generation.
    if type_param:
        if type_param.lower() == 'annotation':
            return _get_annotation_metadata(request, search_path, param_list)
        if type_param.lower() == 'publicationdata':
            return _get_publicationdata_metadata(request)
    param_list['field'] = []
    header = []
    file_attributes = []
    for prop in _tsv_mapping:
        if prop not in _excluded_columns:
            header.append(prop)
            if _tsv_mapping[prop][0].startswith('files'):
                file_attributes = file_attributes + [_tsv_mapping[prop][0]]
        param_list['field'] = param_list['field'] + _tsv_mapping[prop]
    # Handle metadata.tsv lines from cart-generated files.txt.
    if cart_uuids:
        # metadata.tsv line includes a cart UUID, so load the specified cart
        # and get its "elements" property for a list of items to retrieve.
        cart_uuid = cart_uuids.pop()
        del param_list['cart']
        try:
            cart = request.embed(cart_uuid, '@@object')
        except KeyError:
            raise HTTPBadRequest(explanation='Specified cart does not exist.')
        else:
            if cart.get('elements'):
                param_list['@id'] = cart['elements']
    else:
        # If the metadata.tsv line includes a JSON payload, get its "elements"
        # property for a list of items to retrieve.
        try:
            elements = request.json.get('elements')
        except ValueError:
            pass
        else:
            param_list['@id'] = elements
    default_params = [
        ('field', 'audit'),
        ('limit', 'all'),
    ]
    field_params = [('field', p) for p in param_list.get('field', [])]
    at_id_params = [('@id', p) for p in param_list.get('@id', [])]
    qs.drop('limit')
    # Check for the "visualizable" and/or "raw" options in the query string
    # for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')
    qs.extend(default_params + field_params + at_id_params)
    path = '{}?{}'.format(search_path, str(qs))
    results = request.embed(quote(path), as_user=True)
    rows = []
    for experiment_json in results['@graph']:
        if experiment_json.get('files', []):
            exp_data_row = []
            for column in header:
                if not _tsv_mapping[column][0].startswith('files'):
                    make_cell(column, experiment_json, exp_data_row)
            f_attributes = [
                'files.title',
                'files.file_type',
                'files.file_format',
                'files.file_format_type',
                'files.output_type',
                'files.assembly',
            ]
            for f in experiment_json['files']:
                if not files_prop_param_list(f, param_list):
                    continue
                if visualizable_only and not is_file_visualizable(f):
                    continue
                if raw_only and f.get('assembly'):
                    # "raw" option only allows files without an assembly.
                    continue
                if restricted_files_present(f):
                    continue
                if is_no_file_available(f):
                    continue
                f['href'] = request.host_url + f['href']
                f_row = []
                for attr in f_attributes:
                    f_row.append(f.get(attr[6:], ''))
                data_row = f_row + exp_data_row
                for prop in file_attributes:
                    if prop in f_attributes:
                        continue
                    path = prop[6:]
                    temp = []
                    for value in simple_path_ids(f, path):
                        temp.append(str(value))
                    if prop == 'files.replicate.rbns_protein_concentration':
                        if 'replicate' in f and 'rbns_protein_concentration_units' in f['replicate']:
                            temp[0] = temp[0] + ' ' + f['replicate']['rbns_protein_concentration_units']
                    if prop in ['files.paired_with', 'files.derived_from']:
                        # Chop @id paths like /files/<accession>/ down to
                        # just the accession.
                        if len(temp):
                            new_values = [t[7:-1] for t in temp]
                            temp = new_values
                    data = list(set(temp))
                    data.sort()
                    data_row.append(', '.join(data))
                audit_info = [
                    make_audit_cell(audit_type, experiment_json, f)
                    for audit_type in _audit_mapping
                ]
                data_row.extend(audit_info)
                rows.append(data_row)
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    header.extend([prop for prop in _audit_mapping])
    writer.writerow(header)
    writer.writerows(rows)
    return Response(
        content_type='text/tsv',
        body=fout.getvalue(),
        content_disposition='attachment;filename="%s"' % 'metadata.tsv'
    )
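
# Minimal stdlib-only sketch of the TSV assembly pattern used above: rows are
# buffered in memory and written through csv.writer with a tab delimiter. The
# header and row values here are toy data, not the real _tsv_mapping columns.
def _example_tsv_body():
    import csv
    import io
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    writer.writerow(['File accession', 'File format', 'Experiment accession'])
    writer.writerows([['ENCFF000XXX', 'bam', 'ENCSR000XXX']])
    return fout.getvalue()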

class MetadataReport:

    SEARCH_PATH = '/search/'
    EXCLUDED_COLUMNS = (
        'Restricted',
        'No File Available',
    )
    DEFAULT_PARAMS = [
        ('field', 'audit'),
        ('field', 'files.@id'),
        ('field', 'files.restricted'),
        ('field', 'files.no_file_available'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
        ('limit', 'all'),
    ]
    CONTENT_TYPE = 'text/tsv'
    CONTENT_DISPOSITION = 'attachment; filename="metadata.tsv"'

    def __init__(self, request):
        self.request = request
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.positive_file_param_set = {}
        self.header = []
        self.experiment_column_to_fields_mapping = OrderedDict()
        self.file_column_to_fields_mapping = OrderedDict()
        self.visualizable_only = self.query_string.is_param('option', 'visualizable')
        self.raw_only = self.query_string.is_param('option', 'raw')
        self.csv = CSVGenerator()

    def _get_column_to_fields_mapping(self):
        return METADATA_COLUMN_TO_FIELDS_MAPPING

    def _build_header(self):
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)
        for audit, column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING:
            self.header.append(column)

    def _split_column_and_fields_by_experiment_and_file(self):
        for column, fields in self._get_column_to_fields_mapping().items():
            if fields[0].startswith('files'):
                self.file_column_to_fields_mapping[column] = [
                    field.replace('files.', '')
                    for field in fields
                ]
            else:
                self.experiment_column_to_fields_mapping[column] = fields

    def _set_positive_file_param_set(self):
        self.positive_file_param_set = {
            k.replace('files.', ''): set(map_strings_to_booleans_and_ints(v))
            for k, v in self.param_list.items()
            if k.startswith('files.') and '!' not in k
        }

    def _add_positive_file_filters_as_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        self.param_list['field'].extend(
            (
                k
                for k, v in self.query_string._get_original_params()
                if k.startswith('files.') and '!' not in k
            )
        )

    def _add_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        for column, fields in self._get_column_to_fields_mapping().items():
            self.param_list['field'].extend(fields)
        self._add_positive_file_filters_as_fields_to_param_list()

    def _initialize_at_id_param(self):
        self.param_list['@id'] = self.param_list.get('@id', [])

    def _maybe_add_cart_elements_to_param_list(self):
        # Don't need to limit max_cart_elements here since search is batched.
        cart = CartWithElements(self.request, max_cart_elements=None)
        self.param_list['@id'].extend(cart.elements)
        self.param_list.pop('cart', None)

    def _get_json_elements_or_empty_list(self):
        try:
            return self.request.json.get('elements', [])
        except ValueError:
            return []

    def _maybe_add_json_elements_to_param_list(self):
        self.param_list['@id'].extend(self._get_json_elements_or_empty_list())

    def _get_field_params(self):
        return [('field', p) for p in self.param_list.get('field', [])]

    def _get_at_id_params(self):
        return [('@id', p) for p in self.param_list.get('@id', [])]

    def _get_default_params(self):
        return self.DEFAULT_PARAMS

    def _build_query_string(self):
        self.query_string.drop('limit')
        self.query_string.drop('option')
        self.query_string.extend(
            self._get_default_params()
            + self._get_field_params()
            + self._get_at_id_params()
        )

    def _get_search_path(self):
        return self.SEARCH_PATH

    def _build_new_request(self):
        self._build_query_string()
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_search_results_generator(self):
        return BatchedSearchGenerator(self._build_new_request()).results()

    def _should_not_report_file(self, file_):
        conditions = [
            not file_matches_file_params(file_, self.positive_file_param_set),
            self.visualizable_only and not is_file_visualizable(file_),
            self.raw_only and file_.get('assembly'),
            file_.get('restricted'),
            file_.get('no_file_available'),
        ]
        return any(conditions)

    def _get_experiment_data(self, experiment):
        return {
            column: make_experiment_cell(fields, experiment)
            for column, fields in self.experiment_column_to_fields_mapping.items()
        }

    def _get_file_data(self, file_):
        file_['href'] = self.request.host_url + file_['href']
        return {
            column: make_file_cell(fields, file_)
            for column, fields in self.file_column_to_fields_mapping.items()
        }

    def _get_audit_data(self, grouped_audits_for_file, grouped_other_audits):
        return {
            audit_column: ', '.join(
                set(
                    grouped_audits_for_file.get(audit_type, [])
                    + grouped_other_audits.get(audit_type, [])
                )
            )
            for audit_type, audit_column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING
        }

    def _output_sorted_row(self, experiment_data, file_data):
        row = []
        for column in self.header:
            row.append(file_data.get(column, experiment_data.get(column)))
        return row

    def _generate_rows(self):
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            if not experiment.get('files', []):
                continue
            grouped_file_audits, grouped_other_audits = group_audits_by_files_and_type(
                experiment.get('audit', {})
            )
            experiment_data = self._get_experiment_data(experiment)
            for file_ in experiment.get('files', []):
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                audit_data = self._get_audit_data(
                    grouped_file_audits.get(file_.get('@id'), {}),
                    grouped_other_audits
                )
                file_data.update(audit_data)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data)
                )

    def _validate_request(self):
        type_params = self.param_list.get('type', [])
        if len(type_params) != 1:
            raise HTTPBadRequest(explanation='URL requires one "type" parameter.')
        return True

    def _initialize_report(self):
        self._build_header()
        self._split_column_and_fields_by_experiment_and_file()
        self._set_positive_file_param_set()

    def _build_params(self):
        self._add_fields_to_param_list()
        self._initialize_at_id_param()
        self._maybe_add_cart_elements_to_param_list()
        self._maybe_add_json_elements_to_param_list()

    def generate(self):
        self._validate_request()
        self._initialize_report()
        self._build_params()
        return Response(
            content_type=self.CONTENT_TYPE,
            app_iter=self._generate_rows(),
            content_disposition=self.CONTENT_DISPOSITION,
        )
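
# Hedged sketch of how a view callable might wire up MetadataReport; the
# route name and @view_config registration are assumptions, not taken from
# the class above. Note that generate() returns a streaming Response whose
# app_iter yields one TSV row at a time, so rows are written out as the
# batched searches complete rather than being buffered in memory.
from pyramid.view import view_config

@view_config(route_name='metadata', request_method=('GET', 'POST'))
def _example_metadata_view(context, request):
    return MetadataReport(request).generate()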

def batch_download(context, request):
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted'),
    ]
    qs = QueryString(request)
    file_filters = qs.param_keys_to_list(
        params=qs.get_filters_by_condition(
            key_and_value_condition=lambda k, _: k.startswith('files.')
        )
    )
    file_fields = [
        ('field', k)
        for k in file_filters
    ]
    qs.drop('limit')
    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(
            params=qs.get_key_filters(key='cart')
        )
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            # metadata.tsv link includes a cart UUID.
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string()
            )
        else:
            metadata_link = (
                '{host_url}/metadata/?{search_params} -X GET '
                '-H "Accept: text/tsv" -H "Content-Type: application/json" '
                '--data \'{{"elements": [{elements_json}]}}\''
            ).format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join(
                    '"{0}"'.format(element)
                    for element in elements
                )
            )
        # Because of the potential number of datasets in the cart, break the
        # search into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend(
                [('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]]
            )
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Regular batch download has a single simple call to request.embed.
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string()
        )
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']
    exp_files = (
        exp_file
        for exp in experiments
        for exp_file in exp.get('files', [])
    )
    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append(
            '{host_url}{href}'.format(
                host_url=request.host_url,
                href=exp_file['href'],
            )
        )
    return Response(
        content_type='text/plain',
        body='\n'.join(files),
        content_disposition='attachment; filename="%s"' % 'files.txt'
    )
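
# Isolated sketch of the ELEMENT_CHUNK_SIZE chunking used in the POST branch
# above, with a plain list standing in for the QueryString; the chunk size
# and helper name are illustrative only.
def _example_chunked_at_id_params(elements, chunk_size=100):
    # Yields one group of ('@id', value) params per /search/ request.
    for i in range(0, len(elements), chunk_size):
        yield [('@id', element) for element in elements[i:i + chunk_size]]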