Ejemplo n.º 1
0
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset) -> pd.DataFrame:
    """
    Join the caller's data with a datamart dataset on its matched columns.

    Follows the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to augment, in any form accepted by
            ``DataLoader.load_data``.
        augment_data: the datamart dataset to join in. Its ``matched_cols``
            provides the ``(left columns, right columns)`` pair used as
            join keys, and its ``metadata`` is passed along as the
            right-hand metadata.

    Returns:
        The loaded original data, untouched, when ``augment_data`` has no
        matched columns; otherwise the result of ``Augment.join``.

    """

    base_frame = DataLoader.load_data(original_data)

    # Nothing to join on: hand back the loaded data as-is.
    if not augment_data.matched_cols:
        return base_frame

    left_keys, right_keys = augment_data.matched_cols
    engine = Augment(es_index=DEFAULT_ES)

    # The right-hand table is only materialized once a join is certain.
    right_frame = augment_data.materialize()
    right_meta = augment_data.metadata

    return engine.join(left_df=base_frame,
                       right_df=right_frame,
                       left_columns=left_keys,
                       right_columns=right_keys,
                       left_metadata=None,
                       right_metadata=right_meta,
                       joiner='rltk')
Ejemplo n.º 2
0
class TestAugment(unittest.TestCase):
    """Exercises ``Augment.join`` against a small in-memory dataframe."""

    def setUp(self):
        # Shared fixtures: an Augment instance and a four-row people table.
        self.augment = Augment(es_index="fake")
        self.assertDataframeEqual = assert_frame_equal

        people = {
            'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Age': [28, 34, 29, 42],
            'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        }
        self.df = pd.DataFrame(people).infer_objects()

    @Utils.test_print
    def test_joiner(self):
        # Joining the full table with its own first two rows on column 0:
        # the expected frame keeps all four left rows, with NaN in the
        # right-hand (_y) columns for the two rows that have no match.
        expected_columns = {
            'Age': [28, 34, 29, 42],
            'Date_x': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
            'Name_x': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Date_y': ["2018-10-05", "2014-02-23", np.nan, np.nan],
            'Name_y': ['Tom', 'Jack', np.nan, np.nan]
        }
        expected = pd.DataFrame(expected_columns,
                                columns=expected_columns.keys())

        first_two_rows = self.df.iloc[:2, :]
        join_result = self.augment.join(left_df=self.df,
                                        right_df=first_two_rows,
                                        left_columns=[[0]],
                                        right_columns=[[0]],
                                        joiner="default")

        self.assertDataframeEqual(join_result.df, expected)
Ejemplo n.º 3
0
def join(left_data: pd.DataFrame or str or d3m_ds.Dataset,
         right_data: Dataset or int or pd.DataFrame or str or d3m_ds.Dataset,
         left_columns: typing.List[typing.List[int or str]],
         right_columns: typing.List[typing.List[int or str]],
         left_meta: dict=None,
         joiner=JoinerType.RLTK
         ) -> JoinResult:
    """
    Join a tabular dataset with another table or with a datamart dataset.

    :param left_data: a tabular data
    :param right_data: a tabular data or the datamart.Dataset(metadata with materialize info)
                        or an int for the datamart_id - Recommend to use datamart.Dataset or ID
    :param left_columns: list of index(indices)/header(headers) for each "key" for joining
    :param right_columns: list of index(indices)/header(headers) for each "key" for joining(same length as left_columns)
    :param left_meta: metadata describing left_data, forwarded to ``Augment.join``.
                        It is NOT used when right_data is a datamart.Dataset, because
                        that case is delegated to ``augment``.
    :param joiner: which joiner to use (defaults to ``JoinerType.RLTK``)
    :return: a JoinResult. When either side does not load as a pandas.DataFrame, or
             either column list is empty, the result is ``JoinResult(left_df, [])``,
             i.e. the left data with no join applied.
    """

    # A datamart Dataset knows how to materialize itself; let augment() drive it.
    if isinstance(right_data, Dataset):
        return augment(left_data, right_data, (left_columns, right_columns), joiner)

    print(" - start loading data")
    left_df = DataLoader.load_data(left_data)
    right_metadata = None
    if isinstance(right_data, int):
        # An int is a datamart_id: fetch both the metadata and the table.
        right_metadata, right_df = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_df = DataLoader.load_data(right_data)

    # Without two dataframes and a non-empty key list on each side there is
    # nothing to join; return the left data untouched.
    if not (isinstance(left_df, pd.DataFrame) and isinstance(right_df, pd.DataFrame) and left_columns and right_columns):
        return JoinResult(left_df, [])

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)

    print(" - start augmenting")
    augmented_data = augmenter.join(
            left_df=left_df,
            right_df=right_df,
            left_columns=left_columns,
            right_columns=right_columns,
            left_metadata=left_meta,
            right_metadata=right_metadata,
            joiner=joiner
    )
    return augmented_data
Ejemplo n.º 4
0
def augment(original_data: pd.DataFrame or str or d3m_ds.Dataset,
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[int or str]], typing.List[typing.List[int or str]]]=None,
            joiner=JoinerType.RLTK
            ) -> JoinResult:
    """
    Join the caller's data with a datamart dataset.

    Follows the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to augment, in any form accepted by
            ``DataLoader.load_data``.
        augment_data: the datamart dataset to join in; its ``join_columns``
            provides the ``(left columns, right columns)`` key pair.
        joining_columns: optional user-chosen ``(left columns, right columns)``
            pair. When given, it is applied to ``augment_data`` through
            ``set_join_columns`` before the join.
        joiner: which joiner to pass to ``Augment.join``.

    Returns:
        ``JoinResult(loaded_data, [])`` when ``augment_data`` ends up with no
        join columns, otherwise the result of ``Augment.join``.

    """

    base_frame = DataLoader.load_data(original_data)

    if joining_columns:
        # Best effort: a failure here is only reported, and the join falls
        # back to whatever join columns the dataset already carries.
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as err:
            print("FAILED SET JOINING COLUMNS:", err)

    if not augment_data.join_columns:
        return JoinResult(base_frame, [])

    left_keys, right_keys = augment_data.join_columns
    engine = Augment(es_index=PRODUCTION_ES_INDEX)

    # The right-hand table is only materialized once a join is certain.
    right_frame = augment_data.materialize()
    right_meta = augment_data.metadata

    return engine.join(
        left_df=base_frame,
        right_df=right_frame,
        left_columns=left_keys,
        right_columns=right_keys,
        left_metadata=None,
        right_metadata=right_meta,
        joiner=joiner,
    )
Ejemplo n.º 5
0
class JoinDatasets(object):
    """Joins an uploaded CSV with a datamart dataset selected by the caller."""

    def __init__(self, es_index="datamart"):
        # NOTE: the attribute name is misspelled ("augument"); it is kept
        # as-is because code outside this class may already rely on it.
        self.augument = Augment(es_index=es_index)

    def default_join(self, request):
        """
        Join the uploaded CSV with the dataset described in the request.

        Args:
            request: an object exposing ``form['data']`` (a JSON string with
                ``selected_metadata``, ``old_df_column_ids`` and, optionally,
                ``constrains``) and ``files['file']`` (the CSV to augment).
                Presumably a Flask request - confirm against the caller.

        Returns:
            CSV text. When the selected metadata yields no offset/matched
            query pairs, this is the uploaded CSV re-serialized without any
            join; otherwise it is the ``to_csv()`` output of the join result.
        """
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]

        old_df = pd.read_csv(request.files['file']).infer_objects()

        offset_and_matched_queries = Utils.get_offset_and_matched_queries_from_variable_metadata(
            metadata=selected_metadata)

        if not offset_and_matched_queries:
            return old_df.to_csv()

        # "constrains" may be absent, JSON null, or malformed; in all of
        # those cases start from an empty dict so the assignment below
        # cannot fail on a non-dict value.
        constrains = query_data.get("constrains")
        if not isinstance(constrains, dict):
            constrains = {}

        constrains["named_entity"] = {
            offset: matched_queries
            for offset, matched_queries in offset_and_matched_queries
        }

        new_df = self.augument.get_dataset(
            metadata=selected_metadata["_source"], constrains=constrains)

        df = self.augument.join(
            left_df=old_df,
            right_df=new_df,
            left_columns=[int(x) for x in query_data["old_df_column_ids"]],
            right_columns=[offset for offset, _ in offset_and_matched_queries])

        return df.to_csv()
Ejemplo n.º 6
0
class JoinDatasets(object):
    """Joins a caller-supplied dataframe with a dataset picked from a search result."""

    def __init__(self, es_index="datamart"):
        self.augment = Augment(es_index=es_index)

    def default_join(self, request, old_df):
        """
        Join ``old_df`` with the dataset described in the request.

        Args:
            request: an object exposing ``form['data']``, a JSON string with
                ``selected_metadata``, ``columns_mapping`` (entries carrying
                ``old_cols`` / ``new_cols``) and, optionally, ``constrains``.
                Presumably a Flask request - confirm against the caller.
            old_df: the dataframe to augment.

        Returns:
            The ``to_csv()`` output of the join result on success. Otherwise
            a JSON string with a single ``message`` key, returned when the
            selected metadata has no inner hits, when the complementary
            dataset cannot be fetched, or when the join itself fails.
        """
        left_metadata = Utils.generate_metadata_from_dataframe(data=old_df)

        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        columns_mapping = query_data["columns_mapping"]

        # "constrains" may be absent, JSON null, or malformed; in all of
        # those cases start from an empty dict so the assignments below
        # cannot fail on a non-dict value.
        constrains = query_data.get("constrains")
        if not isinstance(constrains, dict):
            constrains = {}

        matches = Utils.get_inner_hits_info(hitted_es_result=selected_metadata)

        if not matches:
            return json.dumps({
                "message":
                "Default join should perform after default search using default search result"
            })

        constrains["named_entity"] = (
            Utils.get_named_entity_constrain_from_inner_hits(matches))

        # Use the temporal coverage of the first variable in the provided
        # dataframe that has both a start and an end as the date range.
        for variable in left_metadata.get("variables") or []:
            coverage = variable.get("temporal_coverage")
            if coverage and coverage.get("start") and coverage.get("end"):
                constrains["date_range"] = {
                    "start": coverage["start"],
                    "end": coverage["end"]
                }
                break

        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not turned into an error response.
        try:
            new_df = Utils.get_dataset(metadata=selected_metadata["_source"],
                                       constrains=constrains)
        except Exception:
            return json.dumps({
                "message":
                "Failed to join, not getting complementary dataset"
            })

        try:
            df = self.augment.join(
                left_df=old_df,
                right_df=new_df,
                left_columns=[x["old_cols"] for x in columns_mapping],
                right_columns=[x["new_cols"] for x in columns_mapping],
                left_metadata=left_metadata,
                right_metadata=selected_metadata["_source"],
                joiner="default")
        except Exception:
            return json.dumps(
                {"message": "Failed to join, con not join two dataframes"})

        return df.to_csv()