def indexing(self,
             description_path: str,
             es_index: str,
             data_path: str = None,
             query_data_for_indexing: bool = False,
             save_to_file: str = None,
             save_to_file_mode: str = "a+",
             delete_old_es_index: bool = False) -> dict:
    """API for the index builder.

    Given a description file, the index builder processes it, creates the
    metadata json for the dataset, and creates the document in our index
    store.

    Args:
        description_path: Path to description json file.
        es_index: str, es index for this dataset.
        data_path: Path to data csv file.
        query_data_for_indexing: bool. If no data is provided and
            query_data_for_indexing is False, only create metadata from the
            description json. If query_data_for_indexing is True and no data
            is provided, use Materializer to query data for profiling and
            indexing.
        save_to_file: str, path to a json line file.
        save_to_file_mode: str, file mode for saving, default "a+".
        delete_old_es_index: bool, whether to delete the original es index
            if it exists.

    Returns:
        metadata dictionary
    """

    self._check_es_index(es_index=es_index,
                         delete_old_es_index=delete_old_es_index)

    if not self.current_global_index or delete_old_es_index:
        self.current_global_index = self.im.current_global_datamart_id(
            index=es_index)

    description, data = self._read_data(description_path, data_path)

    # `data` is a DataFrame or None; compare against None explicitly, since
    # the truth value of a DataFrame is ambiguous.
    if data is None and query_data_for_indexing:
        try:
            data = Utils.materialize(metadata=description)
        except Exception:
            traceback.print_exc()
            warnings.warn(
                "Materialization failed, index based on schema json only")

    metadata = self.construct_global_metadata(description=description,
                                              data=data)
    Utils.validate_schema(metadata.value)

    if save_to_file:
        self._save_data(save_to_file=save_to_file,
                        save_mode=save_to_file_mode,
                        metadata=metadata)

    self.im.create_doc(index=es_index,
                       doc_type='_doc',
                       body=metadata.value,
                       id=metadata.value['datamart_id'])

    return metadata.value
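A minimal usage sketch for `indexing`. The `IndexBuilder` class name, the
elasticsearch index name, and the file paths below are illustrative
assumptions, not taken from the source:

    builder = IndexBuilder()  # hypothetical owning class
    metadata = builder.indexing(
        description_path="resources/trading_economic.json",
        es_index="datamart_all",        # hypothetical index name
        query_data_for_indexing=True,   # no csv given, so materialize the data
        save_to_file="metadata.jsonl",  # also append the metadata as a json line
    )
    print(metadata["datamart_id"])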
def updating(self,
             description_path: str,
             es_index: str,
             document_id: int,
             data_path: str = None,
             query_data_for_updating: bool = False) -> dict:
    """Update a document in elastic search.

    Given a description file, the index builder processes it, creates the
    metadata json for the dataset, and updates the corresponding document in
    elastic search.

    Args:
        description_path: Path to description json file.
        es_index: str, es index for this dataset.
        document_id: int, id of the document to update.
        data_path: Path to data csv file.
        query_data_for_updating: bool. If no data is provided and
            query_data_for_updating is False, only create metadata from the
            description json. If query_data_for_updating is True and no data
            is provided, use Materializer to query data for profiling and
            indexing.

    Returns:
        metadata dictionary
    """

    self._check_es_index(es_index=es_index)

    description, data = self._read_data(description_path, data_path)

    if data is None and query_data_for_updating:
        try:
            materializer_module = description["materialization"]["python_path"]
            materializer = Utils.load_materializer(materializer_module)
            data = materializer.get(metadata=description)
        except Exception:
            warnings.warn(
                "Materialization failed, index based on schema json only")

    metadata = self.construct_global_metadata(
        description=description,
        data=data,
        overwrite_datamart_id=document_id)
    Utils.validate_schema(metadata.value)

    self.im.update_doc(index=es_index,
                       doc_type='document',
                       body={"doc": metadata.value},
                       id=metadata.value['datamart_id'])

    return metadata.value
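A matching sketch for `updating`, again with a hypothetical `IndexBuilder`
and illustrative index name, document id, and path:

    builder = IndexBuilder()  # hypothetical owning class
    updated = builder.updating(
        description_path="resources/trading_economic.json",
        es_index="datamart_all",        # hypothetical index name
        document_id=127860000,          # illustrative id of the doc to overwrite
        query_data_for_updating=False,  # rebuild metadata from the json only
    )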
def test_validate_schema(self):
    with open(os.path.join(os.path.dirname(__file__),
                           "resources/trading_economic.json"), "r") as f:
        description = json.load(f)
    self.assertEqual(Utils.validate_schema(description["description"]), True)
def test_validate_schema(self):
    print("[Test]{}/test_validate_schema".format(self.__class__.__name__))
    with open(os.path.join(os.path.dirname(__file__),
                           "resources/trading_economic.json"), "r") as f:
        description = json.load(f)
    self.assertEqual(Utils.validate_schema(description["description"]), True)
    print(colored('.Done', 'red'))
@staticmethod
def _read_data(description_path: str,
               data_path: str = None) -> typing.Tuple[dict, pd.DataFrame]:
    """Read the dataset description json, and the dataset if present.

    Args:
        description_path: Path to description json file.
        data_path: Path to data csv file.

    Returns:
        Tuple of (description json, dataframe of data)
    """

    with open(description_path, 'r') as f:
        description = json.load(f)
    Utils.validate_schema(description)

    if data_path:
        data = pd.read_csv(data_path)
    else:
        data = None

    return description, data
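A call sketch for `_read_data`; the class name and paths are illustrative:

    description, data = IndexBuilder._read_data(
        description_path="resources/trading_economic.json",
        data_path="resources/trading_economic.csv",  # optional; omit for data=None
    )
    # description is the validated dict; data is a pandas DataFrame (or None)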
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

import argparse
import json

from datamart.utils import Utils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Util functions')
    parser.add_argument(
        '--validate_json',
        help='Validate json against schema. Provide a path to a json file',
        default=None)

    args = parser.parse_args()

    if args.validate_json:
        with open(args.validate_json, 'r') as f:
            description = json.load(f)
        try:
            Utils.validate_schema(description)
            print("Valid json")
        except Exception:
            print("Invalid json")
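An invocation sketch for the script above; the script filename and json path
are illustrative assumptions:

    $ python utils_cli.py --validate_json resources/trading_economic.json
    Valid json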