def related_topics(self):
    """Request data from Google's Related Topics section and return a
    dictionary of dataframes.

    If no top and/or rising related topics are found, the value for the
    key "top" and/or "rising" will be None.
    """
    # make the request
    related_payload = dict()
    result_dict = dict()
    for request_json in self.related_topics_widget_list:
        # ensure we know which keyword we are looking at rather than relying on order
        if request_json['request']['restriction'].get(
                "complexKeywordsRestriction") is not None:
            kw = request_json['request']['restriction'][
                'complexKeywordsRestriction']['keyword'][0]['value']
        else:
            kw = ""
        # convert to string as requests will mangle
        related_payload['req'] = json.dumps(request_json['request'])
        related_payload['token'] = request_json['token']
        related_payload['tz'] = self.tz

        # parse the returned json
        req_json = self._get_data(
            url=TrendReq.RELATED_QUERIES_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=related_payload,
        )

        # top topics
        try:
            top_list = req_json['default']['rankedList'][0]['rankedKeyword']
            df_top = pd.DataFrame(
                [nested_to_record(d, sep='_') for d in top_list])
        except KeyError:
            # in case no top topics are found, the lines above will throw a KeyError
            df_top = None

        # rising topics
        try:
            rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
            df_rising = pd.DataFrame(
                [nested_to_record(d, sep='_') for d in rising_list])
        except KeyError:
            # in case no rising topics are found, the lines above will throw a KeyError
            df_rising = None

        result_dict[kw] = {'rising': df_rising, 'top': df_top}
    return result_dict
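# Usage sketch (hedged): related_topics is a method on pytrends' TrendReq
# class; this assumes the pytrends package is installed and Google Trends
# is reachable. The keyword and timeframe are illustrative.
from pytrends.request import TrendReq

pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(kw_list=['pandas'], timeframe='today 12-m')
topics = pytrends.related_topics()
# topics['pandas']['top'] / topics['pandas']['rising'] are DataFrames or None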
def test_nonetype_multiple_levels(self):
    # GH21158: if an inner-level json has a key with a null value,
    # make sure it doesn't call new_d.pop twice and raise
    data = {
        "id": None,
        "location": {
            "id": None,
            "country": {
                "id": None,
                "state": {
                    "id": None,
                    "town.info": {
                        "region": None,
                        "x": 49.151580810546875,
                        "y": -33.148521423339844,
                        "z": 27.572303771972656}}}
        }
    }
    result = nested_to_record(data)
    expected = {
        'id': None,
        'location.id': None,
        'location.country.id': None,
        'location.country.state.id': None,
        'location.country.state.town.info.region': None,
        'location.country.state.town.info.x': 49.151580810546875,
        'location.country.state.town.info.y': -33.148521423339844,
        'location.country.state.town.info.z': 27.572303771972656}
    assert result == expected
def calc_metrics_classification(target, predictions, target_scores=None,
                                jsd_score=None):
    if target_scores is not None:
        assert predictions.squeeze(1).shape == target_scores.shape
        tvdist = tvd(predictions.squeeze(1), target_scores)
    if predictions.shape[-1] == 1:
        predictions = predictions[:, 0]
        predictions = np.array([1 - predictions, predictions]).T
    predict_classes = np.argmax(predictions, axis=-1)
    if len(np.unique(target)) < 4:
        rep = nested_to_record(
            classification_report(target, predict_classes, output_dict=True),
            sep='/')
    else:
        rep = {}
    rep.update({'accuracy': accuracy_score(target, predict_classes)})
    if jsd_score:
        rep.update({'js_divergence': jsd_score})
    if target_scores is not None:
        rep.update({'TVD': tvdist})
    if predictions.shape[-1] == 2:
        rep.update({'roc_auc': roc_auc_score(target, predictions[:, 1])})
        rep.update(
            {"pr_auc": average_precision_score(target, predictions[:, 1])})
    return rep
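# Usage sketch: a dummy binary task with a single-column score matrix, where
# the column is read as P(class 1). Assumes numpy and the sklearn metric
# imports used by calc_metrics_classification above.
import numpy as np

target = np.array([0, 1, 1, 0])
predictions = np.array([[0.1], [0.8], [0.6], [0.3]])
metrics = calc_metrics_classification(target, predictions)
# metrics holds the flattened classification_report ('0/precision', ...)
# plus 'accuracy', 'roc_auc' and 'pr_auc'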
def test_one_level_deep_flattens(self):
    data = dict(flat1=1, dict1=dict(c=1, d=2))
    result = nested_to_record(data)
    expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}
    assert result == expected
def test_donot_drop_nonevalues(self):
    # GH21356
    data = [
        {'info': None,
         'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}},
        {'info': {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
         'author_name': {'first': 'Jane', 'last_name': 'Doe'}},
    ]
    result = nested_to_record(data)
    expected = [
        {'info': None,
         'author_name.first': 'Smith',
         'author_name.last_name': 'Appleseed'},
        {'author_name.first': 'Jane',
         'author_name.last_name': 'Doe',
         'info.created_at': '11/08/1993',
         'info.last_updated': '26/05/2012'},
    ]
    assert result == expected
def test_nonetype_dropping(self):
    # GH20030: Checks that None values are dropped in nested_to_record
    # to prevent additional columns of nans when passed to DataFrame
    data = [
        {'info': None,
         'author_name': {'first': 'Smith', 'last_name': 'Appleseed'}},
        {'info': {'created_at': '11/08/1993', 'last_updated': '26/05/2012'},
         'author_name': {'first': 'Jane', 'last_name': 'Doe'}},
    ]
    result = nested_to_record(data)
    expected = [
        {'author_name.first': 'Smith',
         'author_name.last_name': 'Appleseed'},
        {'author_name.first': 'Jane',
         'author_name.last_name': 'Doe',
         'info.created_at': '11/08/1993',
         'info.last_updated': '26/05/2012'},
    ]
    assert result == expected
def calc_metrics_multilabel(target, predictions):
    rep = {}
    target = np.array(target)
    nlabels = target.shape[1]
    predict_classes = np.where(predictions > 0.5, 1, 0)
    for i in range(nlabels):
        rep_i = nested_to_record(
            classification_report(target[:, i], predict_classes[:, i],
                                  output_dict=True),
            sep='/')
        rep_i.update(
            {'accuracy': accuracy_score(target[:, i], predict_classes[:, i])})
        rep_i.update(
            {'roc_auc': roc_auc_score(target[:, i], predictions[:, i])})
        rep_i.update({
            "pr_auc": average_precision_score(target[:, i], predictions[:, i])
        })
        # prefix every metric key with the label index
        for k in list(rep_i.keys()):
            rep_i['label_' + str(i) + '/' + k] = rep_i[k]
            del rep_i[k]
        rep.update(rep_i)
    macro_roc_auc = np.mean([v for k, v in rep.items() if 'roc_auc' in k])
    macro_pr_auc = np.mean([v for k, v in rep.items() if 'pr_auc' in k])
    rep['macro_roc_auc'] = macro_roc_auc
    rep['macro_pr_auc'] = macro_pr_auc
    return rep
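# Usage sketch: three independent binary labels with random scores; assumes
# numpy and the sklearn metric imports used by calc_metrics_multilabel.
import numpy as np

rng = np.random.default_rng(0)
target = rng.integers(0, 2, size=(100, 3))   # binary indicator matrix
predictions = rng.random(size=(100, 3))      # per-label scores in [0, 1]
rep = calc_metrics_multilabel(target, predictions)
# keys look like 'label_0/1/f1-score', plus 'macro_roc_auc' / 'macro_pr_auc'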
def test_flat_stays_flat(self):
    recs = [dict(flat1=1, flat2=2),
            dict(flat1=3, flat2=4)]
    result = nested_to_record(recs)
    expected = recs
    assert result == expected
def collect_gear_config(gear_id, client):
    '''Collects the gear's configuration and inputs.'''
    gear = client.get_gear(gear_id)
    name = gear['gear']['name']
    label = gear['gear']['label']
    description = gear['gear']['description']
    inputs = nested_to_record(gear.gear.inputs)
    config = nested_to_record(gear.get_default_config())
    return {
        'name': name,
        'inputs': inputs,
        'config': config,
        'label': label,
        'description': description,
    }
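# Usage sketch, assuming Flywheel CLI credentials are configured; the gear
# id below is a made-up placeholder.
import flywheel

fw = flywheel.Client()
cfg = collect_gear_config('<gear-id>', fw)
print(cfg['name'], sorted(cfg['config']))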
def flatten_data(data, json_column='data'):
    json_data = data.pop(json_column)
    # this gets at any nested dicts as well
    flat_data = pandas.DataFrame(nested_to_record(json_data))
    # rename the columns so they can be un-flattened later
    flat_data.columns = [
        '{0}.{1}'.format(json_column, i) for i in flat_data.columns.values
    ]
    other_data = pandas.DataFrame(data)
    return pandas.concat([other_data, flat_data], axis=1)
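# Usage sketch: the 'data' column holds nested dicts that end up as prefixed
# flat columns next to the other fields. Note that flatten_data pops
# json_column from the input dict in place.
import pandas

records = {'id': [1, 2], 'data': [{'a': {'b': 1}}, {'a': {'b': 2}}]}
df = flatten_data(records)
# df.columns: ['id', 'data.a.b']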
def flatten_page(page):
    meta = page['meta']
    players = page['players']
    for player in players:
        try:
            res = nested_to_record(player)
            for k, v in meta.items():
                res['meta.{}'.format(k)] = v
            yield res
        except Exception as err:
            sys.stderr.write(str(err))
            sys.stderr.flush()
            continue
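# Usage sketch: every flattened player record gets the page-level meta
# fields copied onto it under a 'meta.' prefix.
page = {'meta': {'page': 1},
        'players': [{'name': 'A', 'stats': {'goals': 3}}]}
rows = list(flatten_page(page))
# [{'name': 'A', 'stats.goals': 3, 'meta.page': 1}]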
def main():
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fw = flywheel.Client()
    assert fw, "Your Flywheel CLI credentials aren't set!"

    parser = argparse.ArgumentParser(description=(
        "Use this to query Flywheel for the gears available to you, "
        "or get the config file for a gear."))
    # optional: the default 'all' lists every available gear
    parser.add_argument("-name", "--gear-name", dest='name',
                        help="Shorthand name of the gear on Flywheel",
                        required=False, default='all')
    parser.add_argument("-config", "--output-config", dest='config',
                        help="True/False; whether to output a configuration file for running",
                        required=False, default='False')

    args = parser.parse_args()
    config = str2bool(args.config)

    if args.name == 'all':
        gears = fw.gears()
        gears_table = [nested_to_record(g.to_dict(), sep='_') for g in gears]
        df = pd.DataFrame(gears_table)
        df = df.filter(regex=r'gear_label$|gear_name$|^category$', axis=1)
        print(tabulate(df, headers='keys', tablefmt='psql'))
    else:
        gear = find_gear(args.name, fw)
        config_file = collect_gear_config(gear['_id'], fw)
        if config:
            with open('gear_config.json', 'w') as outfile:
                json.dump(config_file, outfile)
            print("Config file written.")
        else:
            print(json.dumps(config_file, indent=4))
def test_nested_flattens(self):
    data = dict(flat1=1,
                dict1=dict(c=1, d=2),
                nested=dict(e=dict(c=1, d=2), d=2))
    result = nested_to_record(data)
    expected = {'dict1.c': 1,
                'dict1.d': 2,
                'flat1': 1,
                'nested.d': 2,
                'nested.e.c': 1,
                'nested.e.d': 2}
    assert result == expected
def get(self, cid):
    """Retrieve card for a single contribution.
    ---
    operationId: get_card
    parameters:
        - name: cid
          in: path
          type: string
          pattern: '^[a-f0-9]{24}$'
          required: true
          description: contribution ID (ObjectId)
    responses:
        200:
            description: contribution card
            schema:
                type: string
    """
    ctx = {'cid': cid}
    mask = ['project', 'identifier', 'content.data']
    contrib = Contributions.objects.only(*mask).get(id=cid)
    info = Projects.objects.get(project=contrib.project)
    ctx['title'] = info.title
    ctx['descriptions'] = info.description.strip().split('.', 1)
    authors = [a.strip() for a in info.authors.split(',') if a]
    ctx['authors'] = {'main': authors[0], 'etal': authors[1:]}
    debug = current_app.config['DEBUG']
    ctx['landing_page'] = f'/{contrib.project}'
    ctx['more'] = f'/explorer/{cid}'
    ctx['urls'] = info.urls.values()
    card_script = get_resource_as_string('templates/linkify.min.js')
    card_script += get_resource_as_string('templates/linkify-element.min.js')
    card_script += get_resource_as_string('templates/card.min.js')
    data = unflatten(dict(
        (k.rsplit('.', 1)[0] if k.endswith('.display') else k, v)
        for k, v in nested_to_record(contrib.content.data, sep='.').items()
        if not k.endswith('.value') and not k.endswith('.unit')
    ))
    browser = get_browser()
    browser.execute_script(card_script, data)
    bs = BeautifulSoup(browser.page_source, 'html.parser')
    ctx['data'] = bs.body.table
    browser.close()
    rendered = html_minify(render_template('card.html', **ctx))
    tree = html.fromstring(rendered)
    inline(tree)
    card = html.tostring(tree.body[0]).decode('utf-8')
    return card
def test_with_large_max_level(self):
    # GH23843: Enhanced JSON normalize
    max_level = 100
    input_data = [{
        "CreatedBy": {
            "user": {
                "name": {"firstname": "Leo", "LastName": "Thomson"},
                "family_tree": {
                    "father": {
                        "name": "Father001",
                        "father": {
                            "Name": "Father002",
                            "father": {
                                "name": "Father003",
                                "father": {"Name": "Father004"},
                            },
                        },
                    }
                },
            }
        }
    }]
    expected = [{
        "CreatedBy.user.name.firstname": "Leo",
        "CreatedBy.user.name.LastName": "Thomson",
        "CreatedBy.user.family_tree.father.name": "Father001",
        "CreatedBy.user.family_tree.father.father.Name": "Father002",
        "CreatedBy.user.family_tree.father.father.father.name": "Father003",
        "CreatedBy.user.family_tree.father.father.father.father.Name":
            "Father004",
    }]
    output = nested_to_record(input_data, max_level=max_level)
    assert output == expected
def make_combinations(config):
    """Generate all possible configurations that a config file specifies
    via 'multi_' parameters.

    If there are no 'multi_' parameters, this generator is empty.
    """
    flat = nested_to_record(config)
    flat = {tuple(key.split('.')): value for key, value in flat.items()}
    multi_config_flat = {
        key[:-1] + (key[-1][6:],): value
        for key, value in flat.items() if key[-1].startswith('multi')
    }
    if len(multi_config_flat) == 0:
        return  # if there are no multi params this generator is empty

    keys, values = zip(*multi_config_flat.items())

    # delete the multi_ params
    # taken from https://stackoverflow.com/a/49723101/4841832
    def delete_keys_from_dict(dictionary, keys):
        """Delete fields in a nested dict."""
        for key in keys:
            with suppress(KeyError):
                del dictionary[key]
        for value in dictionary.values():
            if isinstance(value, MutableMapping):
                delete_keys_from_dict(value, keys)

    to_delete = ['multi_' + key[-1] for key in multi_config_flat]
    delete_keys_from_dict(config, to_delete)

    for values in itertools.product(*values):
        experiment = dict(zip(keys, values))
        for setting, value in experiment.items():
            pointer_to_inner_dict = reduce(operator.getitem, setting[:-1],
                                           config)
            pointer_to_inner_dict[setting[-1]] = value
        yield config
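# Usage sketch: a 'multi_lr' key expands into one configuration per listed
# value. Note the generator yields the same (mutated) config dict each time,
# so consume each configuration before advancing the loop.
config = {'model': {'multi_lr': [0.1, 0.01], 'layers': 2}, 'seed': 1}
for cfg in make_combinations(config):
    print(cfg['model']['lr'], cfg['seed'])
# 0.1 1
# 0.01 1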
def elastic_dsl(client, dsl, index, **kwargs):
    """
    Sends a DSL query to Elasticsearch and returns the results as a
    :class:`pandas.DataFrame`.

    :param client: Configured Elasticsearch client.
        See :func:`create_elastic_client`
    :type client: :class:`elasticsearch.Elasticsearch`
    :param dsl: Elasticsearch DSL query statement
        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html  # noqa: E501
    :type dsl: str
    :param index: Index pattern. Usually the same as the 'from' part of the SQL.
        See https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-index.html  # noqa: E501
    :type index: str
    :param kwargs: Any additional keyword arguments will be passed to the
        initial :meth:`elasticsearch.Elasticsearch.search` call
    :type kwargs: dict
    :return: results as a :class:`pandas.DataFrame`.
    :rtype: :class:`pandas.DataFrame`
    """
    response = helpers.scan(client=client, query=dsl, index=index, **kwargs)
    data = []
    for row in response:
        # Normalize nested dicts in '_source' such as 'config' or 'git'
        source = nested_to_record(row["_source"]) if "_source" in row else {}
        # Squeeze scalar fields returned as arrays in the response by the search API
        fields = row.get("fields", {})
        fields = {k: v[0] if len(v) == 1 else v for k, v in fields.items()}
        data.append({
            "_index": row["_index"],
            "_type": row["_type"],
            **fields,
            **source,
        })
    return DataFrame(data)
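# Usage sketch, assuming an Elasticsearch instance on localhost; the index
# pattern and query are illustrative.
from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")
dsl = {"query": {"match": {"status": "completed"}}}
df = elastic_dsl(client, dsl, index="experiments-*")
print(df.shape)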
def test_with_max_level(self, max_level, expected,
                        max_level_test_input_data):
    # GH23843: Enhanced JSON normalize
    output = nested_to_record(max_level_test_input_data, max_level=max_level)
    assert output == expected
def get_experiment_files(experiment_path: str, files: dict = {}, flag=False) \
        -> Tuple[Dict, pd.DataFrame, pd.DataFrame]:
    # Assumes each directory (/ experiment run) has a unique cfg
    cfg_files = glob.glob(f"{experiment_path}/**/cfg.yaml", recursive=True)
    cfg_files = natsort.natsorted(cfg_files)

    cfg_dfs = []
    data = dict()
    join_dfs = dict()

    # -- Load cfgs
    for run_index, cfg_file in enumerate(cfg_files):
        data[run_index] = dict()

        dir_name = os.path.dirname(cfg_file)
        data[run_index]["dir_name"] = dir_name

        run_name = dir_name.replace(experiment_path, "")
        run_name = run_name[1:] if run_name[0] == "/" else run_name
        data[run_index]["dir_name"] = run_name

        # -- Read cfg
        with open(os.path.join(cfg_file)) as handler:
            config_data = yaml.load(handler, Loader=yaml.SafeLoader)

        put_manual_id = False
        if "experiment_id" in config_data:
            experiment_id = config_data["experiment_id"]
        else:
            put_manual_id = True
            experiment_id = config_data["cfg_id"]
        run_id = config_data.get("run_id", 0)

        data[run_index]["experiment_id"] = experiment_id
        data[run_index]["run_id"] = run_id

        if flag:
            cfg_df = pd.DataFrame(nested_to_record(config_data, sep="."),
                                  index=[0])
        else:
            nc = nested_to_record(config_data)
            for k, v in nc.items():
                if isinstance(v, list):
                    nc[k] = np.array(v).astype(object)
            cfg_df = pd.DataFrame.from_dict(nc, orient="index").transpose()

        cfg_df["run_name"] = run_name
        cfg_df["run_index"] = run_index
        cfg_dfs.append(cfg_df)

        data["cfg"] = cfg_df

        # -- Read logs
        for file_name, file_type in files.items():
            file_path = os.path.join(dir_name, file_name)
            if not os.path.isfile(file_path):
                file_path = None
                continue

            file_data = file_path
            if hasattr(pd, str(file_type)) and file_path is not None:
                # Some bad header for experiments Fix
                file_data = getattr(pd, file_type)(file_path)
                if put_manual_id:
                    file_data["experiment_id"] = experiment_id
                    file_data["run_id"] = run_id
                file_data["run_index"] = run_index

                if file_name not in join_dfs:
                    join_dfs[file_name] = []
                join_dfs[file_name].append(file_data)

            data[file_name] = file_data

    cfgs = pd.concat(cfg_dfs)
    merge_dfs = cfgs.copy()

    for join_df_name, join_df in join_dfs.items():
        other_df = pd.concat(join_df, sort=True)
        try:
            merge_dfs = pd.merge(other_df, merge_dfs, how="left",
                                 on="run_index", sort=True)
        except Exception:
            print(f"Cannot merge {join_df_name}")

    return data, cfgs, merge_dfs
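# Usage sketch, assuming each run directory under ./results wrote a cfg.yaml
# plus a progress.csv log; the files dict maps a log file name to the pandas
# reader used to load it.
data, cfgs, merged = get_experiment_files(
    "./results", files={"progress.csv": "read_csv"})
print(cfgs[["run_name", "run_index"]])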
def process_query(client, acquisitions, target_cols=None):
    '''Extract a list of acquisitions.

    This function extracts acquisition objects and collects all imaging
    files along with important classification/BIDS information. These data
    are processed and returned as a pandas dataframe that can then be
    exported.

    Parameters
    ----------
    client
        A flywheel connection object
    acquisitions
        A list of flywheel acquisition objects
    target_cols
        List of specific columns to return

    Returns
    -------
    return_df
        A dataframe of the result of the query and processing
    '''
    acquisitions_list = []
    for x in tqdm(acquisitions, total=len(acquisitions)):
        try:
            tempacq = client.get(x.id)
            if tempacq is None:
                raise Exception
            d = {
                'acquisition.id': x.id,
                'acquisition.label': x.label,
                'session.id': x.session,
                'session.label': client.get(x.parents.session).label,
                'subject.id': x.parents.subject,
                'subject.label': client.get(x.parents.subject).label,
                'timestamp': x.timestamp
            }
            files = tempacq.files
            files = [f.to_dict() for f in files]
            for f in files:
                f.update(d)
        except Exception as e:
            print(e)
            global NO_DATA
            NO_DATA += 1
            continue
        acquisitions_list.extend(files)

    files_list = [
        nested_to_record(fdict, sep="_") for fdict in acquisitions_list
    ]

    global VERBOSE
    if VERBOSE:
        print("Tidying and returning the results...")

    # filter columns if necessary
    if not target_cols:
        cols = (r'(\.label)|(\.id)|(classification)|(^type$)|(^modality$)'
                r'|(BIDS)|(EchoTime)|(RepetitionTime)'
                r'|(PhaseEncodingDirection)|(SequenceName)'
                r'|(SeriesDescription)|(name)')
        # filter the dict keys for the column names
        files_list = [{k: v for k, v in my_dict.items() if re.search(cols, k)}
                      for my_dict in files_list]
        return_df = pd.DataFrame(files_list)
    else:
        required_cols = [r'\.id', r'\.label', 'name']
        target_cols.extend(required_cols)
        target_cols = "|".join(["({})".format(x) for x in target_cols])
        files_list = [{k: v for k, v in my_dict.items()
                       if re.search(target_cols, k)}
                      for my_dict in files_list]
        return_df = pd.DataFrame(files_list)

    if 'type' in return_df.columns:
        return_df = return_df[return_df.type.str.contains(
            r'nifti|dicom', na=False)].reset_index(drop=True)
    return return_df
def get(self, project):
    """Retrieve overview graph for a project.
    ---
    operationId: get_graph
    parameters:
        - name: project
          in: path
          type: string
          pattern: '^[a-zA-Z0-9_]{3,30}$'
          required: true
          description: project name/slug
        - name: columns
          in: query
          type: array
          items:
              type: string
          required: true
          description: comma-separated list of column names to plot (in MongoDB dot notation)
        - name: filters
          in: query
          type: array
          items:
              type: string
          description: list of `column__operator:value` filters \
                  with `column` in dot notation and `operator` in mongoengine format \
                  (http://docs.mongoengine.org/guide/querying.html#query-operators). \
                  `column` needs to be a valid field in `content.data`.
        - name: page
          in: query
          type: integer
          default: 1
          description: page to retrieve (in batches of `per_page`)
        - name: per_page
          in: query
          type: integer
          default: 200
          minimum: 2
          maximum: 200
          description: number of results to return per page
    responses:
        200:
            description: x-y-data in plotly format
            schema:
                type: array
                items:
                    type: object
                    properties:
                        x:
                            type: array
                            items:
                                type: number
                        y:
                            type: array
                            items:
                                type: number
    """
    mask = ['content.data', 'identifier']
    columns = request.args.get('columns').split(',')
    filters = request.args.get('filters', '').split(',')
    page = int(request.args.get('page', 1))
    PER_PAGE_MAX = 200
    per_page = int(request.args.get('per_page', PER_PAGE_MAX))
    per_page = PER_PAGE_MAX if per_page > PER_PAGE_MAX else per_page

    with no_dereference(Contributions) as ContributionsDeref:
        objects = ContributionsDeref.objects(project=project).only(*mask)
        data = [{'x': [], 'y': [], 'text': []} for col in columns]
        # C__gte:0.42,C__lte:2.10,ΔE-QP.direct__lte:11.3 -> content__data__C__value__lte
        if filters:
            query = {}
            for f in filters:
                if '__' in f and ':' in f:
                    k, v = f.split(':')
                    col, op = k.rsplit('__', 1)
                    col = col.replace(".", "__")
                    key = f'content__data__{col}__value__{op}'
                    query[key] = float(v)
            objects = objects(**query)
        for obj in objects.paginate(page=page, per_page=per_page).items:
            d = nested_to_record(obj['content']['data'], sep='.')
            if all(f'{c}.display' in d.keys() for c in columns):
                for idx, col in enumerate(columns):
                    val = d.get(f'{col}.display')
                    if val:
                        data[idx]['x'].append(obj.identifier)
                        data[idx]['y'].append(val.split(' ')[0])
                        data[idx]['text'].append(str(obj.id))
        return data
def get(self, project):
    """Retrieve a table of contributions for a project.
    ---
    operationId: get_table
    parameters:
        - name: project
          in: path
          type: string
          pattern: '^[a-zA-Z0-9_]{3,30}$'
          required: true
          description: project name/slug
        - name: columns
          in: query
          type: array
          items:
              type: string
          description: comma-separated list of column names to tabulate
        - name: page
          in: query
          type: integer
          default: 1
          description: page to retrieve (in batches of `per_page`)
        - name: per_page
          in: query
          type: integer
          default: 20
          minimum: 2
          maximum: 20
          description: number of results to return per page
        - name: q
          in: query
          type: string
          description: substring to search for in first non-id column
        - name: order
          in: query
          type: string
          description: sort ascending or descending
          enum: [asc, desc]
        - name: sort_by
          in: query
          type: string
          description: column name to sort by
    responses:
        200:
            description: Paginated table response in backgrid format (items = rows of table)
            schema:
                type: object
                properties:
                    total_count:
                        type: integer
                    total_pages:
                        type: integer
                    page:
                        type: integer
                    last_page:
                        type: integer
                    per_page:
                        type: integer
                    items:
                        type: array
                        items:
                            type: object
    """
    # config and parameters
    explorer = 'http://localhost:8080/explorer' if current_app.config['DEBUG'] \
        else 'https://portal.mpcontribs.org/explorer'
    mp_site = 'https://materialsproject.org/materials'
    mask = ['content.data', 'content.structures', 'identifier']
    search = request.args.get('q')
    page = int(request.args.get('page', 1))
    PER_PAGE_MAX = current_app.config['PER_PAGE_MAX']
    per_page = int(request.args.get('per_page', PER_PAGE_MAX))
    per_page = PER_PAGE_MAX if per_page > PER_PAGE_MAX else per_page
    order = request.args.get('order')
    sort_by = request.args.get('sort_by', 'identifier')
    general_columns = ['identifier', 'id']
    user_columns = request.args.get('columns', '').split(',')
    objects = Contributions.objects(project=project).only(*mask)

    # default user_columns
    sample = objects.first()['content']['data']
    data_keys = sorted(list(
        k.rsplit('.', 1)[0] if k.endswith('.display') else k
        for k, v in nested_to_record(sample, sep='.').items()
        if not k.endswith('.value') and not k.endswith('.unit')))
    if not data_keys:
        return {
            'total_count': 0, 'total_pages': 0, 'page': 1,
            'last_page': 1, 'per_page': per_page, 'items': []
        }
    formula_key_exists = bool('formula' in data_keys)
    if formula_key_exists:
        general_columns.append('formula')
    else:
        # test whether search key exists in all docs and is not a number/object
        search_key = data_keys[0].replace('.', '__')
        q1 = {f'content__data__{search_key}__exists': False}
        q2 = {f'content__data__{search_key}__type': 'object'}
        if objects(Q(**q1) | Q(**q2)).count() < 1:
            general_columns.append(data_keys[0])
        else:
            general_columns.append('formula')
    if not user_columns[0]:
        if formula_key_exists:
            data_keys.remove('formula')
        user_columns = data_keys if 'formula' in general_columns \
            else data_keys[1:]

    # add units to column names
    units = [objects.distinct(f'content.data.{col}.unit')
             for col in user_columns]
    columns = general_columns + [
        '{} [{}]'.format(col, units[idx][0]) if units[idx] else col
        for idx, col in enumerate(user_columns)
    ]

    # search and sort
    if search is not None:
        kwargs = {
            f'content__data__{general_columns[-1]}__exists': True,
            f'content__data__{general_columns[-1]}__contains': search
        }
        objects = objects(Q(identifier__contains=search) | Q(**kwargs))
    sort_by_key = sort_by
    if ' ' in sort_by and sort_by[-1] == ']':
        sort_by = sort_by.split(' ')[0]  # remove unit
        sort_by_key = f'content.data.{sort_by}.value'
    elif sort_by in columns[2:]:
        sort_by_key = f'content.data.{sort_by}'
    order_sign = '-' if order == 'desc' else '+'
    order_by = f"{order_sign}{sort_by_key}"
    objects = objects.order_by(order_by)

    # generate table page
    items = []
    for doc in objects.paginate(page=page, per_page=per_page).items:
        mp_id = doc['identifier']
        contrib = nested_to_record(doc['content']['data'], sep='.')
        search_value = contrib.get(general_columns[-1], mp_id).replace(' ', '')
        row = [f"{mp_site}/{mp_id}", f"{explorer}/{doc['id']}", search_value]
        for idx, col in enumerate(user_columns):
            cell = ''
            if 'CIF' in col:
                structures = doc['content']['structures']
                if '.' in col:  # grouped columns
                    # remove CIF string from field name
                    sname = '.'.join(col.split('.')[:-1])
                    for d in structures:
                        if d['name'] == sname:
                            cell = f"{explorer}/{d['id']}.cif"
                            break
                elif structures:
                    cell = f"{explorer}/{structures[0]['id']}.cif"
            else:
                cell = contrib.get(col + '.value', contrib.get(col, ''))
            row.append(str(cell))
        items.append(dict(zip(columns, row)))

    total_count = objects.count()
    total_pages = int(total_count / per_page)
    # round up to a full last page (the original modulo check used
    # total_pages by mistake)
    if total_count % per_page:
        total_pages += 1

    return {
        'total_count': total_count, 'total_pages': total_pages, 'page': page,
        'last_page': total_pages, 'per_page': per_page, 'items': items
    }