def ingest_data(self, filepath, dataset):
    """
    Load data from a file into a dataframe and store it in the DB.

    Parameters
    ----------
    filepath : String
        File path of the .csv file for the dataset
    dataset : DataSet
        The DataSet object that holds the Session ID for HoloClean

    Returns
    -------
    No Return
    """
    # Spawn new reader and load data into dataframe
    file_reader = Reader(self.holoEnv.spark_session)
    df = file_reader.read(filepath)

    # Store dataframe to DB table
    schema = df.schema.names
    name_table = self._add_info_to_meta('Init', schema, dataset)
    self._dataframe_to_table(name_table, df)
    table_attribute_string = self.get_schema(dataset, "Init")

    # Build 1-based (index, attribute) pairs, skipping the synthetic
    # "index" column that is not part of the user's data.
    attributes = table_attribute_string.split(',')
    map_schema = [
        [position, attribute]
        for position, attribute in enumerate(
            (attr for attr in attributes if attr != "index"), start=1)
    ]

    dataframe_map_schema = self.holoEnv.spark_session.createDataFrame(
        map_schema,
        StructType([
            StructField("index", IntegerType(), False),
            StructField("attribute", StringType(), True)
        ]))
    self.add_db_table('Map_schema', dataframe_map_schema, dataset)

    # Cache attribute -> column-index lookups on the session.
    for position, attribute in map_schema:
        self.attribute_map[attribute] = position
    return
def ingest_data(self, filepath, dataset):
    """
    Load data from a file into a dataframe and store it in the DB.

    Parameters
    ----------
    filepath : String
        File path of the .csv file for the dataset
    dataset : DataSet
        The DataSet object that holds the Session ID for HoloClean

    Returns
    -------
    df : the ingested Spark dataframe (read with an index column)
    attribute_map : dict
        Maps each non-index attribute name to its 1-based column index.
    """
    # Spawn new reader and load data into dataframe.
    file_reader = Reader(self.holo_env.spark_session)
    # Read with an index column.
    df = file_reader.read(filepath, 1)

    # Store dataframe to DB table
    schema = df.schema.names
    name_table = dataset.table_specific_name('Init')
    self.dataframe_to_table(name_table, df)
    dataset.attributes['Init'] = schema

    # Build 1-based (index, attribute) pairs, skipping the synthetic
    # index column added by the reader.
    data_attributes = [
        attr for attr in schema if attr != GlobalVariables.index_name
    ]
    map_schema = []
    attribute_map = {}
    for position, attribute in enumerate(data_attributes, start=1):
        map_schema.append([position, attribute])
        attribute_map[attribute] = position

    dataframe_map_schema = self.holo_env.spark_session.createDataFrame(
        map_schema, dataset.attributes['Map_schema'])
    self.add_db_table('Map_schema', dataframe_map_schema, dataset)

    # Also cache the mapping on the session for later lookups.
    for position, attribute in map_schema:
        self.attribute_map[attribute] = position
    return df, attribute_map
def get_correct_array(self, current_file_path):
    """
    Read the file at *current_file_path* with a fresh Reader and
    return the resulting initial array.
    """
    reader = Reader()
    return reader.read(current_file_path)