def augment(original_data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset],
            augment_data: Dataset) -> pd.DataFrame:
    """Perform the augmentation (either join or union).

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to be augmented, as a pandas.DataFrame, a file path, or a d3m Dataset
        augment_data: the datamart.Dataset to augment with

    Returns:
        the augmented pandas.DataFrame, or the original data if no columns matched
    """
    loaded_data = DataLoader.load_data(original_data)
    if not augment_data.matched_cols:
        return loaded_data

    left_cols, right_cols = augment_data.matched_cols
    default_joiner = 'rltk'
    augmenter = Augment(es_index=DEFAULT_ES)
    augmented_data = augmenter.join(left_df=loaded_data,
                                    right_df=augment_data.materialize(),
                                    left_columns=left_cols,
                                    right_columns=right_cols,
                                    left_metadata=None,
                                    right_metadata=augment_data.metadata,
                                    joiner=default_joiner)
    return augmented_data
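
# A minimal usage sketch for this augment() variant (assumption: `candidate` is
# a datamart.Dataset whose matched_cols were filled in elsewhere, e.g. by a
# search; the CSV path is a hypothetical placeholder).
augmented_df = augment(original_data="my_table.csv", augment_data=candidate)
# when candidate.matched_cols is empty, the original data comes back unchanged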
class TestAugment(unittest.TestCase):

    def setUp(self):
        self.augment = Augment(es_index="fake")
        self.assertDataframeEqual = assert_frame_equal
        data = {
            'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Age': [28, 34, 29, 42],
            'Date': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"]
        }
        self.df = pd.DataFrame(data).infer_objects()

    @Utils.test_print
    def test_joiner(self):
        # expected result of left-joining the first two rows of self.df back
        # onto self.df on column 0 ("Name"): rows without a match get NaN
        data = {
            'Age': [28, 34, 29, 42],
            'Date_x': ["2018-10-05", "2014-02-23", "2020-09-23", "2023-02-13"],
            'Name_x': ['Tom', 'Jack', 'Steve', 'Ricky'],
            'Date_y': ["2018-10-05", "2014-02-23", np.nan, np.nan],
            'Name_y': ['Tom', 'Jack', np.nan, np.nan]
        }
        expected = pd.DataFrame(data, columns=data.keys())
        self.assertDataframeEqual(
            self.augment.join(left_df=self.df,
                              right_df=self.df.iloc[:2, :],
                              left_columns=[[0]],
                              right_columns=[[0]],
                              joiner="default").df,
            expected)
def join(left_data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset],
         right_data: typing.Union[Dataset, int, pd.DataFrame, str, d3m_ds.Dataset],
         left_columns: typing.List[typing.List[typing.Union[int, str]]],
         right_columns: typing.List[typing.List[typing.Union[int, str]]],
         left_meta: dict = None,
         joiner: JoinerType = JoinerType.RLTK
         ) -> JoinResult:
    """
    :param left_data: a tabular dataset
    :param right_data: a tabular dataset, a datamart.Dataset (metadata with materialize info),
                       or an int datamart_id - using datamart.Dataset or the ID is recommended
    :param left_columns: list of index/header groups, one group per join "key"
    :param right_columns: list of index/header groups, one group per join "key"
                          (same length as left_columns)
    :param left_meta: optional metadata describing left_data
    :param joiner: the join strategy to use
    :return: a JoinResult wrapping the joined table
    """
    if isinstance(right_data, Dataset):
        return augment(left_data, right_data, (left_columns, right_columns), joiner)

    print(" - start loading data")
    left_df = DataLoader.load_data(left_data)

    right_metadata = None
    if isinstance(right_data, int):
        right_metadata, right_df = DataLoader.load_meta_and_data_by_id(right_data)
    else:
        right_df = DataLoader.load_data(right_data)

    if not (isinstance(left_df, pd.DataFrame) and isinstance(right_df, pd.DataFrame)
            and left_columns and right_columns):
        return JoinResult(left_df, [])

    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)
    print(" - start augmenting")
    augmented_data = augmenter.join(
        left_df=left_df,
        right_df=right_df,
        left_columns=left_columns,
        right_columns=right_columns,
        left_metadata=left_meta,
        right_metadata=right_metadata,
        joiner=joiner
    )
    return augmented_data
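
# A minimal usage sketch for join() (assumptions: a reachable datamart index;
# the ID and file path below are hypothetical placeholders).
left = pd.read_csv("my_table.csv")
result = join(left_data=left,
              right_data=12345,        # a datamart_id; a datamart.Dataset also works
              left_columns=[[0]],      # join key: first column of the left table
              right_columns=[[0]])     # join key: first column of the right table
joined_df = result.df                  # JoinResult exposes the joined DataFrame, as in the test above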
def augment(original_data: typing.Union[pd.DataFrame, str, d3m_ds.Dataset],
            augment_data: Dataset,
            joining_columns: typing.Tuple[typing.List[typing.List[typing.Union[int, str]]],
                                          typing.List[typing.List[typing.Union[int, str]]]] = None,
            joiner: JoinerType = JoinerType.RLTK
            ) -> JoinResult:
    """Perform the augmentation (either join or union).

    Follow the API defined by https://datadrivendiscovery.org/wiki/display/work/Python+API

    Args:
        original_data: the data to be augmented, as a pandas.DataFrame, a file path, or a d3m Dataset
        augment_data: the datamart.Dataset to augment with
        joining_columns: user-defined (left_columns, right_columns) pair specifying which columns to join on

    Returns:
        a JoinResult wrapping the augmented data
    """
    loaded_data = DataLoader.load_data(original_data)
    if joining_columns:
        try:
            augment_data.set_join_columns(*joining_columns)
        except Exception as e:
            print("FAILED TO SET JOINING COLUMNS:", e)

    if not augment_data.join_columns:
        return JoinResult(loaded_data, [])

    left_cols, right_cols = augment_data.join_columns
    augmenter = Augment(es_index=PRODUCTION_ES_INDEX)
    augmented_data = augmenter.join(
        left_df=loaded_data,
        right_df=augment_data.materialize(),
        left_columns=left_cols,
        right_columns=right_cols,
        left_metadata=None,
        right_metadata=augment_data.metadata,
        joiner=joiner
    )
    return augmented_data
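
# A minimal usage sketch for this newer augment() (assumptions: `candidate` is
# a datamart.Dataset obtained elsewhere, e.g. from a search; the file path and
# column indices are hypothetical placeholders).
result = augment(original_data="my_table.csv",
                 augment_data=candidate,
                 joining_columns=([[0]], [[1]]))  # left column 0 joins right column 1
joined_df = result.df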
class JoinDatasets(object):

    def __init__(self, es_index="datamart"):
        self.augment = Augment(es_index=es_index)

    def default_join(self, request):
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        old_df = pd.read_csv(request.files['file']).infer_objects()
        offset_and_matched_queries = Utils.get_offset_and_matched_queries_from_variable_metadata(
            metadata=selected_metadata)
        if not offset_and_matched_queries:
            return old_df.to_csv()

        # "constrains" is the key used by the existing API; fall back to an
        # empty dict so it can be extended below
        constrains = query_data.get("constrains") or {}

        constrains["named_entity"] = {}
        for offset, matched_queries in offset_and_matched_queries:
            constrains["named_entity"][offset] = matched_queries

        new_df = self.augment.get_dataset(
            metadata=selected_metadata["_source"], constrains=constrains)

        df = self.augment.join(
            left_df=old_df,
            right_df=new_df,
            left_columns=[int(x) for x in query_data["old_df_column_ids"]],
            right_columns=[offset for offset, _ in offset_and_matched_queries])

        return df.to_csv()
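
# A hedged client-side sketch of the form payload this default_join parses;
# the endpoint URL is hypothetical, the field names come from the code above,
# and "constrains" is deliberately spelled as the API spells it.
import json
import requests

payload = {
    "selected_metadata": {"_source": {}},  # an Elasticsearch hit for the chosen dataset
    "old_df_column_ids": ["0"],            # join keys in the uploaded CSV, by column index
    "constrains": {},                      # optional extra constraints
}
response = requests.post("http://localhost:9000/join",  # hypothetical endpoint
                         data={"data": json.dumps(payload)},
                         files={"file": open("my_table.csv", "rb")})
print(response.text)  # the joined table, serialized as CSV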
class JoinDatasets(object):

    def __init__(self, es_index="datamart"):
        self.augment = Augment(es_index=es_index)

    def default_join(self, request, old_df):
        left_metadata = Utils.generate_metadata_from_dataframe(data=old_df)
        query_data = json.loads(request.form['data'])
        selected_metadata = query_data["selected_metadata"]
        columns_mapping = query_data["columns_mapping"]

        # "constrains" is the key used by the existing API
        constrains = query_data.get("constrains") or {}

        matches = Utils.get_inner_hits_info(hitted_es_result=selected_metadata)
        if not matches:
            return json.dumps({
                "message": "Default join should be performed after a default search, using the default search result"
            })

        constrains["named_entity"] = Utils.get_named_entity_constrain_from_inner_hits(matches)

        # get the temporal coverage from the provided dataframe
        for variable in left_metadata.get("variables", []):
            temporal_coverage = variable.get("temporal_coverage")
            if temporal_coverage and temporal_coverage.get("start") and temporal_coverage.get("end"):
                constrains["date_range"] = {
                    "start": temporal_coverage["start"],
                    "end": temporal_coverage["end"]
                }
                break

        try:
            new_df = Utils.get_dataset(metadata=selected_metadata["_source"],
                                       constrains=constrains)
        except Exception:
            return json.dumps({
                "message": "Failed to join, could not get the complementary dataset"
            })

        try:
            df = self.augment.join(
                left_df=old_df,
                right_df=new_df,
                left_columns=[x["old_cols"] for x in columns_mapping],
                right_columns=[x["new_cols"] for x in columns_mapping],
                left_metadata=left_metadata,
                right_metadata=selected_metadata["_source"],
                joiner="default")
        except Exception:
            return json.dumps(
                {"message": "Failed to join, cannot join the two dataframes"})

        return df.to_csv()
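
# A sketch of the "columns_mapping" structure this newer default_join expects
# (the "old_cols"/"new_cols" keys come from the code above; the indices are
# hypothetical placeholders):
columns_mapping_example = [
    {"old_cols": [0], "new_cols": [1]},  # join left column 0 against right column 1
    {"old_cols": [2], "new_cols": [0]},  # a second join key
]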