def test_temporal_coverage_validate(self):
    coverage = {}
    self.assertEqual(Utils.temporal_coverage_validate(coverage), {
        "start": None,
        "end": None
    })
    coverage = {"start": None}
    self.assertEqual(Utils.temporal_coverage_validate(coverage), {
        "start": None,
        "end": None
    })
    coverage = {"end": None}
    self.assertEqual(Utils.temporal_coverage_validate(coverage), {
        "start": None,
        "end": None
    })
    coverage = {"start": "2018-09-23T00:00:00", "end": "2018-10-10"}
    self.assertEqual(Utils.temporal_coverage_validate(coverage), {
        'end': '2018-10-10T00:00:00',
        'start': '2018-09-23T00:00:00'
    })
    coverage = {"start": "2018-00", "end": "2018-10-10"}
    self.assertEqual(Utils.temporal_coverage_validate(coverage), {
        'end': '2018-10-10T00:00:00',
        'start': None
    })
def upload(meta_list: typing.List[dict],
           es_index: str = PRODUCTION_ES_INDEX,
           deduplicate: bool = True,
           index_builder: IndexBuilder = None) -> typing.List[dict]:
    ib = index_builder or IndexBuilder()
    succeeded = []
    for meta in meta_list:
        try:
            Utils.validate_schema(meta)
            meta['datamart_status'] = 'not_profiled'
            if deduplicate:
                exist_id = check_existence(meta['materialization'],
                                           es_index=es_index)
                if exist_id:
                    success = ib.updating_send_trusted_metadata(
                        metadata=meta, es_index=es_index, datamart_id=exist_id)
                else:
                    success = ib.indexing_send_to_es(metadata=meta,
                                                     es_index=es_index)
            else:
                success = ib.indexing_send_to_es(metadata=meta,
                                                 es_index=es_index)
            if success:
                succeeded.append(success)
        except Exception as e:
            print('UPLOAD FAILED: ', str(e))
            continue
    return succeeded
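# Hedged usage sketch for upload(): the index name "datamart_test", the URL and
# the metadata dict below are illustrative only; a real metadata dict must pass
# Utils.validate_schema, and a reachable Elasticsearch instance is assumed.
meta_list = [{
    "title": "example dataset",
    "materialization": {
        "python_path": "general_materializer",
        "arguments": {"url": "https://example.com/data.csv", "file_type": "csv"}
    }
}]
uploaded = upload(meta_list, es_index="datamart_test", deduplicate=False)
print("%d document(s) indexed" % len(uploaded))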
def match_temporal_coverage(cls, start: str = None, end: str = None) -> typing.Optional[dict]:
    """Generate query body for query by temporal_coverage.

    Args:
        start: dataset should cover date time earlier than the start date.
        end: dataset should cover date time later than the end date.

    Returns:
        dict of query body
    """

    start = Utils.date_validate(date_text=start) if start else None
    end = Utils.date_validate(date_text=end) if end else None
    if not start and not end:
        warnings.warn("Neither start nor end is a valid datetime")
        return None

    body = {
        "nested": {
            "path": "variables",
            "inner_hits": {
                "_source": ["temporal_coverage"]
            },
            "query": {
                "bool": {
                    "must": []
                }
            }
        }
    }
    if start:
        body["nested"]["query"]["bool"]["must"].append({
            "range": {
                "variables.temporal_coverage.start": {
                    "lte": start,
                    "format": "yyyy-MM-dd'T'HH:mm:ss"
                }
            }
        })
    if end:
        body["nested"]["query"]["bool"]["must"].append({
            "range": {
                "variables.temporal_coverage.end": {
                    "gte": end,
                    "format": "yyyy-MM-dd'T'HH:mm:ss"
                }
            }
        })
    return body
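# For reference, a sketch of the body match_temporal_coverage produces when only
# a start value is supplied (the date below is illustrative, assuming
# Utils.date_validate normalizes it to ISO format):
expected_body = {
    "nested": {
        "path": "variables",
        "inner_hits": {"_source": ["temporal_coverage"]},
        "query": {"bool": {"must": [{
            "range": {"variables.temporal_coverage.start": {
                "lte": "2018-09-23T00:00:00",
                "format": "yyyy-MM-dd'T'HH:mm:ss"
            }}
        }]}}
    }
}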
def updating(self,
             description_path: str,
             es_index: str,
             document_id: int,
             data_path: str = None,
             query_data_for_updating: bool = False) -> dict:
    """Update a document in Elasticsearch.

    Given a description file, the index builder processes it, creates the
    metadata json for the dataset, and updates the corresponding document in
    Elasticsearch.

    Note: this method has not been kept up to date for a while and may not work
    well, but updating is rarely needed.

    Args:
        description_path: Path to description json file.
        es_index: str, es index for this dataset.
        document_id: int, document id of the document which needs to be updated.
        data_path: Path to data csv file.
        query_data_for_updating: Bool. If no data is presented and
            query_data_for_updating is False, will only create metadata according
            to the description json. If query_data_for_updating is True and no
            data is presented, will use Materializer to query data for profiling
            and indexing.

    Returns:
        metadata dictionary
    """

    self._check_es_index(es_index=es_index)
    description, data = self._read_data(description_path, data_path)
    if data is None and query_data_for_updating:
        try:
            data = Utils.materialize(metadata=description).infer_objects()
        except Exception:
            traceback.print_exc()
            warnings.warn(
                "Materialization Failed, index based on schema json only. (%s)"
                % description_path)
    metadata = self.construct_global_metadata(
        description=description, data=data, overwrite_datamart_id=document_id)
    Utils.validate_schema(metadata)
    self.im.update_doc(index=es_index,
                       doc_type='document',
                       body={"doc": metadata},
                       id=metadata['datamart_id'])
    return metadata
def updating_send_trusted_metadata(self, metadata: dict, datamart_id: int, es_index: str):
    self.update_datamart_id(metadata=metadata, datamart_id=datamart_id)
    Utils.validate_schema(metadata)
    try:
        self.im.update_doc(index=es_index,
                           doc_type='_doc',
                           body={"doc": metadata},
                           id=metadata['datamart_id'])
        return metadata
    except Exception as e:
        if isinstance(e, TransportError):
            print(e.info)
def join(self,
         left_df: pd.DataFrame,
         right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict = None,
         right_metadata: dict = None,
         joiner: str = "default") -> typing.Optional[pd.DataFrame]:
    """Join two dataframes based on different joiner.

    Args:
        left_df: pandas Dataframe
        right_df: pandas Dataframe
        left_metadata: metadata of left dataframe
        right_metadata: metadata of right dataframe
        left_columns: list of integers from left df for join
        right_columns: list of integers from right df for join
        joiner: string of joiner, default to be "default"

    Returns:
        Dataframe
    """

    if joiner not in self.joiners:
        self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)
    if not self.joiners[joiner]:
        warnings.warn("No suitable joiner, return original dataframe")
        return left_df

    if not left_metadata:
        # Left df is the user provided one.
        # We will generate metadata just based on the data itself, profiling and so on.
        left_metadata = Utils.generate_metadata_from_dataframe(data=left_df)

    left_metadata = Utils.calculate_dsbox_features(data=left_df,
                                                   metadata=left_metadata)
    right_metadata = Utils.calculate_dsbox_features(data=right_df,
                                                    metadata=right_metadata)
    return self.joiners[joiner].join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_metadata,
        right_metadata=right_metadata,
    )
def default_search_by_csv(self, request, old_df):
    query_string = request.args.get("query_string", None)
    minimum_should_match = request.args.get("minimum_should_match_for_column")
    ret = {
        "message": "Created Dataframe and finding datasets for augmenting",
        "result": []
    }
    for idx in range(old_df.shape[1]):
        if Utils.is_column_able_to_query(old_df.iloc[:, idx]):
            this_column_result = self.augment.query(
                col=old_df.iloc[:, idx],
                minimum_should_match_ratio_for_col=minimum_should_match,
                query_string=query_string)
            if this_column_result:
                ret["result"].append({
                    "column_idx": idx,
                    "datasets_metadata": this_column_result[:10]
                })
    return ret
def test_validate_schema(self):
    with open(
            os.path.join(os.path.dirname(__file__),
                         "resources/sample_schema.json"), "r") as f:
        description = json.load(f)
    self.assertEqual(Utils.validate_schema(description["description"]), True)
def test_get_dataset(self):
    fake_metadata = {
        "materialization": {
            "python_path": "noaa_materializer",
            "arguments": {
                "type": "TAVG"
            }
        }
    }
    fake_constrains = {
        "named_entity": {
            2: ["new york", "sdasds"]
        },
        "date_range": {
            "start": "2018-09-23T00:00:00",
            "end": "2018-09-30T00:00:00"
        }
    }
    df = Utils.get_dataset(metadata=fake_metadata,
                           variables=[0, 2, 3],
                           constrains=fake_constrains)
    ground_truth = pd.read_csv(
        os.path.join(os.path.dirname(__file__), "./resources",
                     "test_augment.csv"))
    self.dataframe_equal(ground_truth, df)
def test_materialize(self):
    fake_metadata = {
        "materialization": {
            "python_path": "noaa_materializer",
            "arguments": {
                "type": 'PRCP'
            }
        }
    }
    fake_constrains = {
        "date_range": {
            "start": "2016-09-23",
            "end": "2016-09-23"
        },
        "named_entity": {
            2: ["los angeles"]
        }
    }
    result = Utils.materialize(metadata=fake_metadata,
                               constrains=fake_constrains).infer_objects()
    print(result)
    expected = pd.read_csv(
        os.path.join(os.path.dirname(__file__), "resources/noaa_result.csv"))
    self.dataframe_equal(result, expected)
def search(url: str,
           query: dict,
           data: pd.DataFrame or str or d3m_ds.Dataset = None,
           send_data=True,
           max_return_docs: int = 20,
           return_named_entity: bool = False) -> typing.List[Dataset]:
    """
    Follow the API defined by
    https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        url: str - the datamart server (for ISI's datamart it is meaningless, just a flag)
        query: JSON object describing the query
            (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file
        send_data: (for ISI's datamart it is meaningless)
        max_return_docs: maximum number of documents to return
        return_named_entity: whether to include named-entity matches in the results

    Returns:
        a list of datamart.Dataset objects
    """

    if not url.startswith(SEARCH_URL):
        return []

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    es_results = []
    if (query and ('required_variables' in query)) or (loaded_data is None):
        # if "required_variables" exists or there is no data:
        es_results = augmenter.query_by_json(
            query,
            loaded_data,
            size=max_return_docs,
            return_named_entity=return_named_entity) or []
    else:
        # if there is no "required_variables" in the query JSON, but the dataset exists,
        # try each named entity column as "required_variables" and concat the results:
        query = query or {}
        exist = set()
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'] = [{
                    "type": "dataframe_columns",
                    "names": [col]
                }]
                cur_results = augmenter.query_by_json(
                    query,
                    loaded_data,
                    size=max_return_docs,
                    return_named_entity=return_named_entity)
                if not cur_results:
                    continue
                for res in cur_results:
                    if res['_id'] not in exist:
                        # TODO: how about the score ??
                        exist.add(res['_id'])
                        es_results.append(res)

    return [
        Dataset(es_result, original_data=loaded_data, query_json=query)
        for es_result in es_results
    ]
def _read_data(description_path: str,
               data_path: str = None) -> typing.Tuple[dict, pd.DataFrame]:
    """Read dataset description json and dataset if present.

    Args:
        description_path: Path to description json file.
        data_path: Path to data csv file.

    Returns:
        Tuple of (description json, dataframe of data)
    """

    description = json.load(open(description_path, 'r'))
    Utils.validate_schema(description)
    if data_path:
        data = pd.read_csv(open(data_path, 'r'))
    else:
        data = None
    return description, data
def _read_data(description_path: str or dict,
               data_path: str = None) -> typing.Tuple[dict, pd.DataFrame]:
    """Read dataset description json and dataset if present.

    Args:
        description_path: Path to description json file, or the description JSON
            as a Python dict.
        data_path: Path to data csv file.

    Returns:
        Tuple of (description json, dataframe of data)
    """

    if isinstance(description_path, str):
        description = json.load(open(description_path, 'r'))
    else:
        description = description_path
    Utils.validate_schema(description)
    if data_path:
        data = pd.read_csv(open(data_path, 'r'))
    else:
        data = None
    return description, data
def indexing_generate_metadata(self,
                               description_path: str or dict,
                               data_path: str or pd.DataFrame = None,
                               query_data_for_indexing: bool = False,
                               save_to_file: str = None,
                               save_to_file_mode: str = "a+",
                               cache_dataset_path: str = None,
                               enable_two_ravens_profiler: bool = False) -> dict:
    description, data = self._read_data(description_path, data_path)

    if data is None and query_data_for_indexing:
        try:
            data = Utils.materialize(metadata=description).infer_objects()
            if cache_dataset_path:
                data.to_csv(cache_dataset_path, index=False)
        except Exception:
            traceback.print_exc()
            warnings.warn(
                "Materialization Failed, index based on schema json only. (%s)"
                % description_path)

    # construct global metadata without generating valid datamart_id
    metadata = self.construct_global_metadata(description=description,
                                              data=data,
                                              overwrite_datamart_id=0)
    if data is not None:
        metadata = self.profile(
            data=data,
            metadata=metadata,
            enable_two_ravens_profiler=enable_two_ravens_profiler)
    Utils.validate_schema(metadata)

    if save_to_file:
        self._save_data(save_to_file=save_to_file,
                        save_mode=save_to_file_mode,
                        metadata=metadata)
    return metadata
def test_calculate_dsbox_features(self):
    expected = {
        'variables': [{
            'dsbox_profiled': {
                'ratio_of_numeric_values': 1.0,
                'number_of_outlier_numeric_values': 0
            }
        }, {
            'dsbox_profiled': {
                'most_common_tokens': [{
                    'name': '2014-02-23',
                    'count': 1
                }, {
                    'name': '2018-10-05',
                    'count': 1
                }, {
                    'name': '2020-09-23',
                    'count': 1
                }, {
                    'name': '2023-02-13',
                    'count': 1
                }],
                'number_of_tokens_containing_numeric_char': 4,
                'ratio_of_tokens_containing_numeric_char': 1.0,
                'number_of_values_containing_numeric_char': 4,
                'ratio_of_values_containing_numeric_char': 1.0
            }
        }, {
            'dsbox_profiled': {
                'most_common_tokens': [{
                    'name': 'Jack',
                    'count': 1
                }, {
                    'name': 'Ricky',
                    'count': 1
                }, {
                    'name': 'Steve',
                    'count': 1
                }, {
                    'name': 'Tom',
                    'count': 1
                }]
            }
        }]
    }
    self.assertDictEqual(
        Utils.calculate_dsbox_features(data=self.df,
                                       metadata={"variables": [{}, {}, {}]}),
        expected)
def load_meta_and_data_by_id(datamart_id: int,
                             first_n_rows: int = None,
                             constrains=None):
    qm = QueryManager(es_host=ES_HOST,
                      es_port=ES_PORT,
                      es_index=PRODUCTION_ES_INDEX)
    res = qm.get_by_id(datamart_id)
    if res and res.get('_source'):
        df = Utils.get_dataset(res['_source'], constrains=constrains)
        if first_n_rows:
            df = df.head(first_n_rows)
        return res['_source'], df
    return None, None
def test_two_ravens_profiler(self):
    data = pd.DataFrame({
        'Name': ['Tom', 'Jack', 'Steve'],
        'Age': [28, 34, 29],
        'Date': ["2018-10-05", "2014-02-23", "2020-09-23"]
    })
    meta = Utils.generate_metadata_from_dataframe(data)
    res = TwoRavensProfiler().profile(data, meta)
    if meta == res:
        # TwoRavensProfiler is probably down
        print('TwoRavensProfiler is probably down. Skipping test.')
    else:
        expected_file = os.path.join(resources_path, "two_ravens.json")
        with open(expected_file) as f:
            expected = json.load(f)
        self.assertEqual(res, expected)
def test_get_named_entity_constrain_from_inner_hits(self):
    expected = {2: ['new york'], 1: ['united states']}
    self.assertDictEqual(
        Utils.get_named_entity_constrain_from_inner_hits(matches=[{
            'offset': 2,
            'matched_queries': ['new york'],
            'highlight': {
                'variables.named_entity': ['new york']
            }
        }, {
            'offset': 1,
            'matched_queries': ['nunited states of american'],
            'highlight': {
                'variables.named_entity': ['united states']
            }
        }]), expected)
def test_get_inner_hits_info(self):
    fake_es_result = {
        "inner_hits": {
            "variables": {
                "hits": {
                    "hits": [{
                        "_nested": {
                            "field": "variables",
                            "offset": 2
                        },
                        "highlight": {
                            "variables.named_entity": ["new york"]
                        },
                        "matched_queries": ["new york"]
                    }, {
                        "_nested": {
                            "field": "variables",
                            "offset": 1
                        },
                        "highlight": {
                            "variables.named_entity": ["united states"]
                        },
                        "matched_queries": ["united states of american"]
                    }]
                }
            }
        }
    }
    expected = [{
        'offset': 2,
        'matched_queries': ['new york'],
        'highlight': {
            'variables.named_entity': ['new york']
        }
    }, {
        'offset': 1,
        'matched_queries': ['united states of american'],
        'highlight': {
            'variables.named_entity': ['united states']
        }
    }]
    self.assertListEqual(
        Utils.get_inner_hits_info(hitted_es_result=fake_es_result), expected)
def test_get_metadata_intersection(self):
    metadata_lst1 = [{
        "_source": {
            "datamart_id": 0
        }
    }, {
        "_source": {
            "datamart_id": 1
        }
    }, {
        "_source": {
            "datamart_id": 2
        }
    }]
    metadata_lst2 = [{
        "_source": {
            "datamart_id": 0
        }
    }, {
        "_source": {
            "datamart_id": 2
        }
    }, {
        "_source": {
            "datamart_id": 3
        }
    }]
    metadata_lst3 = [{
        "_source": {
            "datamart_id": 0
        }
    }, {
        "_source": {
            "datamart_id": 3
        }
    }]
    expect = [{'_source': {'datamart_id': 0}}]
    self.assertListEqual(
        Utils.get_metadata_intersection(metadata_lst1, metadata_lst2,
                                        metadata_lst3), expect)
def test_append_columns_for_implicit_variables(self):
    implicit_variables = [{
        "name": "indicator",
        "value": "born"
    }, {
        "name": "city",
        "value": "shanghai"
    }]
    data = {
        'Age': [28, 34, 29, 42],
        'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
        'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
        'indicator': ["born", "born", "born", "born"],
        'city': ['shanghai', 'shanghai', 'shanghai', 'shanghai']
    }
    expected = pd.DataFrame(data, columns=data.keys())
    self.dataframe_equal(
        Utils.append_columns_for_implicit_variables(
            implicit_variables=implicit_variables, df=self.df), expected)
def bulk_generate_metadata(
        html_page: str,
        description: dict = None,
        enable_two_ravens_profiler=False) -> typing.List[typing.List[dict]]:
    """Generate metadata for each valid dataset link found on an HTML page.

    :param html_page: the HTML page to scan for dataset links.
    :param description: optional base description dict reused for every link.
    :param enable_two_ravens_profiler: whether to run the TwoRavens profiler.
    :return: list of generated metadata, one entry per successfully processed link.
    """

    succeeded = []
    hp = HTMLProcesser(html_page)
    html_meta = hp.extract_description_from_meta()
    for text, href in hp.generate_a_tags_from_html():
        try:
            cur_description = copy.deepcopy(description) or {}
            if not Utils.validate_url(href):
                continue
            if not cur_description.get('title'):
                black_list = set(
                    text.lower().split()).intersection(TITLE_BLACK_LIST)
                if not black_list:
                    cur_description['title'] = text.strip()
            if not cur_description.get('description'):
                cur_description['description'] = html_meta
            cur_description['materialization_arguments'] = {'url': href}
            # Not to extract html tables, otherwise there will be too many FPs:
            cur_metadata = generate_metadata(
                cur_description,
                ignore_html=True,
                enable_two_ravens_profiler=enable_two_ravens_profiler)
            if cur_metadata:
                succeeded.append(cur_metadata)
        except Exception as e:
            print(' - FAILED GENERATE METADATA ON \n\ttext = %s, \n\thref = %s \n%s'
                  % (text, href, str(e)))
    return succeeded
def __init__(self, description: dict, datamart_id: typing.Union[int, None] = None) -> None:
    """Init method of VariableMetadata.

    Args:
        description: description dict.
        datamart_id: unique datamart_id.

    Returns:
    """

    super().__init__()
    self._metadata["datamart_id"] = datamart_id

    if "name" in description:
        self._metadata["name"] = description["name"]

    if "description" in description:
        self._metadata["description"] = description["description"]

    self._metadata["semantic_type"] = description.get("semantic_type", [])

    if "named_entity" in description:
        self._metadata["named_entity"] = description["named_entity"]

    if "temporal_coverage" in description:
        self._metadata["temporal_coverage"] = description["temporal_coverage"]

    if self.temporal_coverage is not False:
        self.temporal_coverage = Utils.temporal_coverage_validate(
            self.temporal_coverage)

    if "spatial_coverage" in description:
        self._metadata["spatial_coverage"] = description["spatial_coverage"]
def search(query: dict,
           data: pd.DataFrame or str or d3m_ds.Dataset = None) -> typing.List[Dataset]:
    """
    Follow the API defined by
    https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        query: JSON object describing the query
            (https://datadrivendiscovery.org/wiki/display/work/Query+results+schema)
        data: the data you are trying to augment. It can be provided as one of:
            - a pandas.DataFrame object
            - a D3M Dataset object
            - the path to a D3M datasetDoc.json file
            - the path to a CSV file

    Returns:
        a list of datamart.Dataset objects.
    """

    loaded_data = DataLoader.load_data(data)
    augmenter = Augment(es_index=DEFAULT_ES)

    if not (query and ('required_variables' in query)) and (loaded_data is not None):
        query = query or {}
        query['required_variables'] = []
        for col in loaded_data:
            if Utils.is_column_able_to_query(loaded_data[col]):
                query['required_variables'].append({
                    "type": "dataframe_columns",
                    "names": [col]
                })

    es_results = augmenter.query_by_json(query, loaded_data)
    if es_results:
        return [
            Dataset(es_result, original_data=loaded_data, query_json=query)
            for es_result in es_results
        ]
    return []
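# Hedged usage sketch of this search() entry point: the dataframe and query text
# are illustrative only, and the results depend on what the configured
# Elasticsearch index contains.
import pandas as pd

df = pd.DataFrame({"Name": ["Tom", "Jack", "Steve"]})
query = {"dataset": {"about": "people, names"}}
results = search(query, df)  # list of datamart.Dataset objects
for dataset in results[:5]:
    print(dataset)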
def test_generate_metadata_from_dataframe(self):
    expected = {
        'datamart_id': None,
        'materialization': {
            'python_path': 'default_materializer',
            'arguments': None
        },
        'variables': [{
            'datamart_id': None,
            'semantic_type': [],
            'name': 'Age',
            'description': 'column name: Age, dtype: int64'
        }, {
            'datamart_id': None,
            'semantic_type': [],
            'name': 'Date',
            'description': 'column name: Date, dtype: object',
            'temporal_coverage': {
                'start': '2014-02-23T00:00:00',
                'end': '2023-02-13T00:00:00'
            }
        }, {
            'datamart_id': None,
            'semantic_type': [],
            'name': 'Name',
            'description': 'column name: Name, dtype: object',
            'named_entity': ['Tom', 'Jack', 'Steve', 'Ricky']
        }],
        'title': 'Age Date Name',
        'description': 'Age : int64, Date : object, Name : object',
        'keywords': ['Age', 'Date', 'Name']
    }
    self.assertEqual(Utils.generate_metadata_from_dataframe(data=self.df),
                     expected)
from datamart.utilities.utils import Utils
from datamart import search, augment

schema = {
    "materialization": {
        "python_path": "general_materializer",
        "arguments": {
            "url": "https://en.wikipedia.org/wiki/List_of_Rock_and_Roll_Hall_of_Fame_inductees",
            "file_type": "html"
        }
    }
}
hof_df = Utils.get_dataset(schema)
print(hof_df)

query = {
    "dataset": {
        "about": "rock and roll, music, rock music, rock artist, rock band, music award, artist award, hall of fame, singer"
    },
    "required_variables": [
        {
            "type": "dataframe_columns",
            "index": [2]
        }
    ]
}
candidates = search(query, hof_df)
res = []
def join(self,
         left_df: pd.DataFrame,
         right_df: pd.DataFrame,
         left_columns: typing.List[typing.List[int]],
         right_columns: typing.List[typing.List[int]],
         left_metadata: dict = None,
         right_metadata: dict = None,
         joiner: JoinerType = JoinerType.DEFAULT) -> JoinResult:
    """Join two dataframes based on different joiner.

    Args:
        left_df: pandas Dataframe
        right_df: pandas Dataframe
        left_metadata: metadata of left dataframe
        right_metadata: metadata of right dataframe
        left_columns: list of integers from left df for join
        right_columns: list of integers from right df for join
        joiner: joiner type, defaults to JoinerType.DEFAULT

    Returns:
        JoinResult
    """

    if joiner not in self.joiners:
        self.joiners[joiner] = JoinerPrepare.prepare_joiner(joiner=joiner)
    if not self.joiners[joiner]:
        warnings.warn("No suitable joiner, return original dataframe")
        return JoinResult(left_df, [])

    print(" - start profiling")
    if not (left_metadata and left_metadata.get("variables")):
        # Left df is the user provided one.
        # We will generate metadata just based on the data itself, profiling and so on.
        left_metadata = Utils.generate_metadata_from_dataframe(
            data=left_df, original_meta=left_metadata)
    if not right_metadata:
        right_metadata = Utils.generate_metadata_from_dataframe(data=right_df)

    # Only profile the joining columns, otherwise it will be too slow:
    left_metadata = Utils.calculate_dsbox_features(
        data=left_df,
        metadata=left_metadata,
        selected_columns=set(chain.from_iterable(left_columns)))
    right_metadata = Utils.calculate_dsbox_features(
        data=right_df,
        metadata=right_metadata,
        selected_columns=set(chain.from_iterable(right_columns)))

    # update with implicit_variable on the user supplied dataset
    if left_metadata.get('implicit_variables'):
        Utils.append_columns_for_implicit_variables_and_add_meta(
            left_metadata, left_df)

    print(" - start joining tables")
    res = self.joiners[joiner].join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_metadata,
        right_metadata=right_metadata,
    )
    return res
def test_is_column_able_to_query(self):
    self.assertTrue(Utils.is_column_able_to_query(col=self.df['Name']))
    self.assertFalse(Utils.is_column_able_to_query(col=self.df['Date']))
    self.assertFalse(Utils.is_column_able_to_query(col=self.df['Age']))
def test_load_materializer(self):
    materializer = Utils.load_materializer("noaa_materializer")
    self.assertTrue(issubclass(type(materializer), MaterializerBase))
    self.assertEqual(type(materializer).__name__, NoaaMaterializer.__name__)
def test_date_validate(self):
    self.assertEqual(Utils.date_validate("2018-10-10"), "2018-10-10T00:00:00")