コード例 #1
0
    def indexing(self,
                 description_path: str,
                 es_index: str,
                 data_path: str = None,
                 query_data_for_indexing: bool = False,
                 save_to_file: str = None,
                 save_to_file_mode: str = "a+",
                 delete_old_es_index: bool = False) -> dict:
        """API for the index builder.

        Reads the description file (and the data file, if given), builds the
        global metadata for the dataset, validates it against the schema,
        optionally saves it to a file, and creates a document for it in the
        index store.

        Args:
            description_path: Path to description json file.
            es_index: str, es index for this dataset
            data_path: Path to data csv file.
            query_data_for_indexing: Bool. If no data is presented, and query_data_for_indexing is False, will only
                create metadata according to the description json. If query_data_for_indexing is True and no data is
                presented, will use Materialize to query data for profiling and indexing
            save_to_file: str, a path to the json line file
            save_to_file_mode: str, mode for saving, default "a+"
            delete_old_es_index: bool, whether to delete the original es index if it exists

        Returns:
            metadata dictionary

        """

        self._check_es_index(es_index=es_index,
                             delete_old_es_index=delete_old_es_index)

        # Refresh the id counter when it is unset or the index is being rebuilt.
        if not self.current_global_index or delete_old_es_index:
            self.current_global_index = self.im.current_global_datamart_id(
                index=es_index)

        description, data = self._read_data(description_path, data_path)
        # Compare against None explicitly: `not data` raises ValueError when
        # `data` is a pandas DataFrame (its truth value is ambiguous).
        if data is None and query_data_for_indexing:
            try:
                data = Utils.materialize(metadata=description)
            except Exception:
                # Best effort: fall back to indexing from the description only.
                traceback.print_exc()
                warnings.warn(
                    "Materialization Failed, index based on schema json only")

        metadata = self.construct_global_metadata(description=description,
                                                  data=data)
        Utils.validate_schema(metadata.value)

        if save_to_file:
            self._save_data(save_to_file=save_to_file,
                            save_mode=save_to_file_mode,
                            metadata=metadata)

        self.im.create_doc(index=es_index,
                           doc_type='_doc',
                           body=metadata.value,
                           id=metadata.value['datamart_id'])

        return metadata.value
コード例 #2
0
ファイル: test_utils.py プロジェクト: linqyd/datamart
 def test_temporal_coverage_validate(self):
     """Check ``Utils.temporal_coverage_validate`` on a table of inputs.

     Each case pairs a coverage dict with the dict the validator is
     expected to return.
     """
     print("[Test]{}/test_temporal_coverage_validate".format(
         self.__class__.__name__))
     cases = [
         # Missing or None bounds come back as None.
         ({}, {"start": None, "end": None}),
         ({"start": None}, {"start": None, "end": None}),
         ({"end": None}, {"start": None, "end": None}),
         # Valid dates are expected in full date-time form.
         ({"start": "2018-09-23T00:00:00", "end": "2018-10-10"},
          {'end': '2018-10-10T00:00:00', 'start': '2018-09-23T00:00:00'}),
         # "2018-00" is expected to be rejected and reported as None.
         ({"start": "2018-00", "end": "2018-10-10"},
          {'end': '2018-10-10T00:00:00', 'start': None}),
     ]
     for coverage, expected in cases:
         actual = Utils.temporal_coverage_validate(coverage)
         self.assertEqual(actual, expected)
     print(colored('.Done', 'red'))
コード例 #3
0
    def match_temporal_coverage(cls, start: str = None, end: str = None) -> str:
        """Build the query body for a search by temporal coverage.

        Both bounds are first passed through ``Utils.date_validate``. If
        neither is truthy afterwards, a warning is issued and the result of
        ``cls.match_all()`` is returned. Otherwise a nested query on
        ``variables`` is built with one range clause per usable bound.

        Args:
            start: matches variables whose temporal_coverage.start is less
                than or equal to this date.
            end: matches variables whose temporal_coverage.end is greater
                than or equal to this date.

        Returns:
            string of query body
        """
        start = Utils.date_validate(date_text=start)
        end = Utils.date_validate(date_text=end)
        if not (start or end):
            warnings.warn("Start and end are None, match all")
            return cls.match_all()

        # Collect the range clauses first, then embed them in the nested query.
        range_clauses = []

        if start:
            range_clauses.append({
                "range": {
                    "variables.temporal_coverage.start": {
                        "lte": start,
                        "format": "yyyy-MM-dd'T'HH:mm:ss"
                    }
                }
            })

        if end:
            range_clauses.append({
                "range": {
                    "variables.temporal_coverage.end": {
                        "gte": end,
                        "format": "yyyy-MM-dd'T'HH:mm:ss"
                    }
                }
            })

        nested_query = {
            "path": "variables",
            "inner_hits": {"_source": ["temporal_coverage"]},
            "query": {"bool": {"must": range_clauses}},
        }
        return json.dumps({"query": {"nested": nested_query}})
コード例 #4
0
 def test_temporal_coverage_validate(self):
     """Check ``Utils.temporal_coverage_validate`` on a table of inputs.

     Each case pairs a coverage dict with the dict the validator is
     expected to return.
     """
     cases = [
         # Missing or None bounds come back as None.
         ({}, {"start": None, "end": None}),
         ({"start": None}, {"start": None, "end": None}),
         ({"end": None}, {"start": None, "end": None}),
         # Valid dates are expected in full date-time form.
         ({"start": "2018-09-23T00:00:00", "end": "2018-10-10"},
          {'end': '2018-10-10T00:00:00', 'start': '2018-09-23T00:00:00'}),
         # "2018-00" is expected to be rejected and reported as None.
         ({"start": "2018-00", "end": "2018-10-10"},
          {'end': '2018-10-10T00:00:00', 'start': None}),
     ]
     for coverage, expected in cases:
         actual = Utils.temporal_coverage_validate(coverage)
         self.assertEqual(actual, expected)
コード例 #5
0
ファイル: global_metadata.py プロジェクト: linqyd/datamart
    def __init__(self, description: dict, datamart_id: int):
        """Init method of GlobalMetadata.

        Copies the known optional fields from ``description`` into the
        metadata, validates the date fields, and requires a
        "materialization" entry that contains a "python_path".

        Args:
            description: description dict.
            datamart_id: unique datamart_id.

        Raises:
            ValueError: if ``description`` has no "materialization" entry, or
                that entry has no "python_path".

        """

        super().__init__()

        self._metadata["datamart_id"] = datamart_id

        # Optional fields copied verbatim when present.
        for key in ("title", "description", "url", "keywords"):
            if key in description:
                self._metadata[key] = description[key]

        if "date_published" in description:
            self._metadata["date_published"] = description["date_published"]
        if self.date_published:
            self.date_published = Utils.date_validate(self.date_published)

        if "date_updated" in description:
            self._metadata["date_updated"] = description["date_updated"]
        if self.date_updated:
            self.date_updated = Utils.date_validate(self.date_updated)

        for key in ("provenance", "original_identifier"):
            if key in description:
                self._metadata[key] = description[key]

        # Only a missing key means "no materialization"; any other error
        # should propagate unchanged instead of being masked.
        try:
            self._metadata["materialization"] = description["materialization"]
        except KeyError as err:
            raise ValueError("No materialization found") from err

        if "python_path" not in self.materialization:
            raise ValueError("No python path found in materialization")

        if "arguments" not in self.materialization:
            self._metadata["materialization"]["arguments"] = None

        self._metadata["variables"] = list()
        self._variables = list()

        if "license" in description:
            self._metadata["license"] = description["license"]
コード例 #6
0
    def updating(self,
                 description_path: str,
                 es_index: str,
                 document_id: int,
                 data_path: str = None,
                 query_data_for_updating: bool = False) -> dict:
        """Update document in elastic search.

        Reads the description file (and the data file, if given), builds the
        global metadata using ``document_id`` as the datamart id, validates
        it against the schema, and updates the matching document.

        Args:
            description_path: Path to description json file.
            es_index: str, es index for this dataset
            document_id: int, document id of document which need to be updated
            data_path: Path to data csv file.
            query_data_for_updating: Bool. If no data is presented, and query_data_for_updating is False, will only
                create metadata according to the description json. If query_data_for_updating is True and no data is
                presented, will use the materializer named in the description to query data for profiling and
                indexing

        Returns:
            metadata dictionary

        """

        self._check_es_index(es_index=es_index)

        description, data = self._read_data(description_path, data_path)
        # Compare against None explicitly: `not data` raises ValueError when
        # `data` is a pandas DataFrame (its truth value is ambiguous).
        if data is None and query_data_for_updating:
            try:
                materializer_module = description["materialization"][
                    "python_path"]
                materializer = Utils.load_materializer(materializer_module)
                data = materializer.get(metadata=description)
            except Exception:
                # Best effort: fall back to the description only.
                warnings.warn(
                    "Materialization Failed, index based on schema json only")

        metadata = self.construct_global_metadata(
            description=description,
            data=data,
            overwrite_datamart_id=document_id)
        Utils.validate_schema(metadata.value)

        self.im.update_doc(index=es_index,
                           doc_type='document',
                           body={"doc": metadata.value},
                           id=metadata.value['datamart_id'])

        return metadata.value
コード例 #7
0
 def test_validate_schema(self):
     """Check that the bundled trading_economic description passes schema validation."""
     resource_path = os.path.join(os.path.dirname(__file__),
                                  "resources/trading_economic.json")
     with open(resource_path, "r") as resource_file:
         loaded = json.load(resource_file)
     is_valid = Utils.validate_schema(loaded["description"])
     self.assertEqual(is_valid, True)
コード例 #8
0
ファイル: test_utils.py プロジェクト: linqyd/datamart
 def test_load_materializer(self):
     """Check the object returned by ``Utils.load_materializer("noaa_materializer")``.

     Its type must be a subclass of ``MaterializerBase`` and its class name
     must be contained in ``NoaaMaterializer.__name__``.
     """
     print("[Test]{}/test_load_materializer".format(
         self.__class__.__name__))
     loaded = Utils.load_materializer("noaa_materializer")
     loaded_cls = type(loaded)
     derives_from_base = issubclass(loaded_cls, MaterializerBase)
     self.assertEqual(derives_from_base, True)
     self.assertIn(loaded_cls.__name__, NoaaMaterializer.__name__)
     print(colored('.Done', 'red'))
コード例 #9
0
ファイル: test_utils.py プロジェクト: linqyd/datamart
 def test_validate_schema(self):
     """Check that the bundled trading_economic description passes schema validation."""
     print("[Test]{}/test_validate_schema".format(self.__class__.__name__))
     resource_path = os.path.join(os.path.dirname(__file__),
                                  "resources/trading_economic.json")
     # Use a context manager so the resource file is closed deterministically.
     with open(resource_path, "r") as resource_file:
         description = json.load(resource_file)
     self.assertEqual(Utils.validate_schema(description["description"]),
                      True)
     print(colored('.Done', 'red'))
コード例 #10
0
    def _read_data(description_path: str, data_path: str = None) -> typing.Tuple[dict, typing.Optional[pd.DataFrame]]:
        """Read dataset description json and dataset if present.

        The description is validated with ``Utils.validate_schema`` before
        the data file is read.

        Args:
            description_path: Path to description json file.
            data_path: Path to data csv file.

        Returns:
            Tuple of (description json, dataframe of data). The dataframe is
            None when no ``data_path`` is given.
        """

        # Context manager so the description file handle is always closed.
        with open(description_path, 'r') as description_file:
            description = json.load(description_file)
        Utils.validate_schema(description)

        if data_path:
            # Pass the path directly: the former second positional argument
            # 'r' was taken by read_csv as the separator, not a file mode.
            data = pd.read_csv(data_path)
        else:
            data = None
        return description, data
コード例 #11
0
    def get_dataset(metadata: dict, variables: list = None, constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Fetch the dataset by delegating to ``Utils.materialize``.

        Args:
            metadata: metadata dict.
            variables: forwarded unchanged to ``Utils.materialize``.
            constrains: forwarded unchanged to ``Utils.materialize``.

        Returns:
            whatever ``Utils.materialize`` returns (annotated as an optional
            pandas dataframe)
        """

        dataset = Utils.materialize(
            metadata=metadata,
            variables=variables,
            constrains=constrains,
        )
        return dataset
コード例 #12
0
ファイル: test_utils.py プロジェクト: JiFeRe/datamart
 def test_materialize(self):
     """Compare ``Utils.materialize`` output with the stored NOAA result csv."""
     expected_csv = os.path.join(os.path.dirname(__file__), "resources/noaa_result.csv")
     materialization = {
         "python_path": "noaa_materializer",
         "arguments": {"type": 'PRCP'},
     }
     metadata = {"materialization": materialization}
     constrains = {
         "date_range": {"start": "2016-09-23", "end": "2016-09-23"},
         "named_entity": {2: ["los angeles"]},
     }
     actual = Utils.materialize(metadata=metadata, constrains=constrains)
     expected = pd.read_csv(expected_csv)
     assert_frame_equal(actual, expected)
コード例 #13
0
    def get_dataset(metadata: dict,
                    variables: list = None,
                    constrains: dict = None) -> typing.Optional[pd.DataFrame]:
        """Get the dataset with materializer.

        When ``constrains`` has a "date_range" entry, a missing or empty
        "start" is set to ``Augment.DEFAULT_START_DATE`` and a missing or
        empty "end" is set to the current time. These defaults are written
        into the passed-in dict.

        Args:
            metadata: metadata dict.
            variables: positional column indices to select from the result;
                all columns are returned when empty or None.
            constrains: constraints forwarded to ``Utils.materialize``; may
                be None.

        Returns:
            pandas dataframe, or None if materialization returns None
        """
        # `constrains` defaults to None, so guard before the membership test.
        if constrains is not None and "date_range" in constrains:
            date_range = constrains["date_range"]
            if not date_range.get("start", None):
                date_range["start"] = Augment.DEFAULT_START_DATE
            if not date_range.get("end", None):
                date_range["end"] = datetime.now().strftime(
                    '%Y-%m-%dT%H:%M:%S')
        df = Utils.materialize(metadata=metadata, constrains=constrains)
        # The return is Optional: only slice when there is a dataframe.
        if variables and df is not None:
            return df.iloc[:, variables]
        return df
コード例 #14
0
    def __init__(self, description: dict, datamart_id: int):
        """Populate variable metadata from a description dict.

        Optional fields are copied only when present in ``description``;
        "semantic_type" always gets a value (an empty list by default), and a
        truthy temporal coverage is passed through
        ``Utils.temporal_coverage_validate``.

        Args:
            description: description dict.
            datamart_id: unique datamart_id.

        Returns:

        """

        super().__init__()

        self._metadata["datamart_id"] = datamart_id

        for optional_key in ("name", "description"):
            if optional_key in description:
                self._metadata[optional_key] = description[optional_key]

        self._metadata["semantic_type"] = description.get("semantic_type", [])

        for optional_key in ("named_entity", "temporal_coverage"):
            if optional_key in description:
                self._metadata[optional_key] = description[optional_key]

        if self.temporal_coverage:
            validated_coverage = Utils.temporal_coverage_validate(
                self.temporal_coverage)
            self.temporal_coverage = validated_coverage

        if "spatial_coverage" in description:
            self._metadata["spatial_coverage"] = description[
                "spatial_coverage"]
コード例 #15
0
ファイル: join_datasets.py プロジェクト: JiFeRe/datamart
    def default_join(self, request):
        """Join an uploaded csv with the dataset selected in the request.

        Reads the query from ``request.form['data']`` (JSON) and the csv from
        ``request.files['file']``. If no matched queries are found for the
        selected metadata, the uploaded csv is returned unchanged. Otherwise
        the matched queries are added to the constraints as "named_entity",
        the dataset is fetched and joined with the uploaded data.

        Args:
            request: request object exposing ``form`` and ``files`` mappings.

        Returns:
            csv string of the joined (or original) dataframe
        """

        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]

        old_df = pd.read_csv(request.files['file']).infer_objects()

        offset_and_matched_queries = Utils.get_offset_and_matched_queries_from_variable_metadata(
            metadata=selected_metadata)

        if not offset_and_matched_queries:
            return old_df.to_csv()

        # A missing or null "constrains" entry both fall back to an empty
        # dict, so the item assignment below cannot hit None.
        constrains = query_data.get("constrains") or {}

        constrains["named_entity"] = {
            offset: matched_queries
            for offset, matched_queries in offset_and_matched_queries
        }

        new_df = self.augument.get_dataset(
            metadata=selected_metadata["_source"], constrains=constrains)

        df = self.augument.join(
            left_df=old_df,
            right_df=new_df,
            left_columns=[int(x) for x in query_data["old_df_column_ids"]],
            right_columns=[offset for offset, _ in offset_and_matched_queries])

        return df.to_csv()
コード例 #16
0
ファイル: test_utils.py プロジェクト: linqyd/datamart
 def test_date_validate(self):
     """Check that a date-only string is validated to a full date-time string."""
     print("[Test]{}/test_date_validate".format(self.__class__.__name__))
     validated = Utils.date_validate("2018-10-10")
     self.assertEqual(validated, "2018-10-10T00:00:00")
     print(colored('.Done', 'red'))
コード例 #17
0
 def get_dataset(metadata, variables=None, constrains=None):
     """Load the materializer named in ``metadata`` and return what its ``get`` returns.

     The materializer is looked up via
     ``metadata["materialization"]["python_path"]``; ``variables`` and
     ``constrains`` are forwarded unchanged.
     """
     python_path = metadata["materialization"]["python_path"]
     materializer = Utils.load_materializer(python_path)
     dataset = materializer.get(metadata=metadata,
                                variables=variables,
                                constrains=constrains)
     return dataset
コード例 #18
0
import os
import sys
# Make the parent directory importable before importing the project package.
# (Previously the nested append also pushed `None` onto sys.path.)
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import argparse
from datamart.utils import Utils
import json

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Util functions')
    parser.add_argument(
        '--validate_json',
        help='Validate json against schema. Provide a path to json file',
        default=None)

    args = parser.parse_args()
    if args.validate_json:
        # Context manager so the json file handle is always closed.
        with open(args.validate_json, 'r') as json_file:
            description = json.load(json_file)
        try:
            Utils.validate_schema(description)
        except Exception:
            # Any validation failure is reported as invalid; interpreter
            # exits such as KeyboardInterrupt are no longer swallowed.
            print("Invalid json")
        else:
            print("Valid json")
コード例 #19
0
 def test_load_materializer(self):
     """Check the object returned by ``Utils.load_materializer("noaa_materializer")``.

     Its type must be a subclass of ``MaterializerBase`` and its class name
     must be contained in ``NoaaMaterializer.__name__``.
     """
     loaded = Utils.load_materializer("noaa_materializer")
     loaded_cls = type(loaded)
     derives_from_base = issubclass(loaded_cls, MaterializerBase)
     self.assertEqual(derives_from_base, True)
     self.assertIn(loaded_cls.__name__, NoaaMaterializer.__name__)
コード例 #20
0
 def test_date_validate(self):
     """Check that a date-only string is validated to a full date-time string."""
     validated = Utils.date_validate("2018-10-10")
     self.assertEqual(validated, "2018-10-10T00:00:00")