def extract(self):
    """Read the article, clustering, and embeddings Delta tables."""
    df_article = function.read_delta(self.spark, self.path_source_article)
    df_clustering = function.read_delta(self.spark, self.path_source_clustering)
    df_embeddings = function.read_delta(self.spark, self.path_source_embeddings)
    return df_article, df_clustering, df_embeddings
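# Every extract method in this section delegates to function.read_delta. A
# minimal sketch of that helper is shown below, assuming it is a thin wrapper
# around Spark's Delta reader; the actual implementation may differ.
from pyspark.sql import DataFrame, SparkSession


def read_delta(spark: SparkSession, path: str) -> DataFrame:
    # Load the Delta table stored at `path` as a DataFrame.
    return spark.read.format('delta').load(path)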
def extract(self, start_state):
    """Read rows updated after start_state and return the new watermark."""
    # Assumes module-level imports: from datetime import datetime;
    # from pyspark.sql.functions import col, lit.
    # Convert the start_state timestamp string into a coarse period bucket so
    # the first filter can prune whole files before the exact row-level filter.
    start_state_parsed = datetime.strptime(start_state, '%Y-%m-%d %H:%M:%S.%f')
    start_state_period = int(start_state_parsed.timestamp()) // self.period_seconds
    df = function.read_delta(self.spark, self.path_source) \
        .filter(col('_time_updated_period') >= lit(start_state_period)) \
        .filter(col('_time_updated') > lit(start_state))
    # The new watermark is the latest update time in this batch; if the batch
    # is empty, max() returns None and the previous state is kept.
    stop_state = df \
        .agg({'_time_updated': 'max'}) \
        .collect()[0][0]
    stop_state = stop_state or start_state
    df = df.drop('_time_updated_period', '_time_updated')
    return df, stop_state
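# The coarse filter above only works if every row is written with a matching
# _time_updated_period bucket. A hypothetical write-side counterpart (not
# shown in the source) could stamp both watermark columns like this; df and
# period_seconds stand in for the caller's values.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def stamp_update_columns(df: DataFrame, period_seconds: int) -> DataFrame:
    # _time_updated records when the row was written; _time_updated_period is
    # the epoch second floor-divided into fixed-size buckets, mirroring
    # int(start_state_parsed.timestamp()) // self.period_seconds above.
    ts = F.current_timestamp()
    return (df
            .withColumn('_time_updated', ts)
            .withColumn('_time_updated_period',
                        F.floor(F.unix_timestamp(ts) / period_seconds)))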
def extract(self):
    """Read the source Delta table."""
    return function.read_delta(self.spark, self.path_source)
def extract(self):
    """Read the clustering and topic-words Delta tables."""
    df_clustering = function.read_delta(self.spark, self.path_source_clustering)
    df_topicwords = function.read_delta(self.spark, self.path_source_topicwords)
    return df_clustering, df_topicwords
def extract(self):
    """Read the documents and topic-id Delta tables."""
    df_topic_id = function.read_delta(self.spark, self.path_source_topic_ids)
    df_documents = function.read_delta(self.spark, self.path_source_documents)
    return df_documents, df_topic_id
def extract(self):
    """Read the embeddings Delta table."""
    df_embeddings = function.read_delta(self.spark, self.path_source)
    return df_embeddings
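# A hypothetical driver for the stateful extract further above, sketching how
# the returned stop_state would be persisted and fed back in as start_state on
# the next run; task, load_state, and save_state are illustrative names only.
start_state = load_state() or '1970-01-01 00:00:00.000000'
df, stop_state = task.extract(start_state)
# ... transform and load df ...
save_state(stop_state)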