Example #1
0
 def extract(self):
     """Load the three source Delta tables used by this job.

     Returns:
         Tuple of (article, clustering, embeddings) Spark DataFrames,
         each read from its configured source path.
     """
     source_paths = (
         self.path_source_article,
         self.path_source_clustering,
         self.path_source_embeddings,
     )
     # Read each table in the same order the caller unpacks them.
     return tuple(
         function.read_delta(self.spark, path) for path in source_paths
     )
Example #2
0
    def extract(self, start_state):
        """Incrementally load rows updated after *start_state*.

        Args:
            start_state: Timestamp string, '%Y-%m-%d %H:%M:%S.%f' format,
                marking the high-water mark of the previous run.

        Returns:
            Tuple of (DataFrame of new rows with the bookkeeping columns
            dropped, stop_state string to persist for the next run).
        """
        # Convert the textual watermark into its coarse period bucket so the
        # partition-level filter can prune before the fine-grained one runs.
        parsed = datetime.strptime(start_state, '%Y-%m-%d %H:%M:%S.%f')
        epoch_seconds = datetime.timestamp(parsed)
        watermark_period = int(epoch_seconds) // self.period_seconds

        df = function.read_delta(self.spark, self.path_source)
        df = df.filter(col('_time_updated_period') >= lit(watermark_period))
        df = df.filter(col('_time_updated') > lit(start_state))

        # New high-water mark: the latest update time among the rows read.
        newest = df.agg({'_time_updated': 'max'}).collect()[0][0]
        # No new rows (max is None) -> keep the previous watermark.
        stop_state = newest if newest else start_state

        df = df.drop('_time_updated_period', '_time_updated')

        return df, stop_state
Example #3
0
 def extract(self):
     """Read the configured source Delta table.

     Returns:
         Spark DataFrame loaded from ``self.path_source``.
     """
     df = function.read_delta(self.spark, self.path_source)
     return df
Example #4
0
 def extract(self):
     """Load the clustering and topic-word source Delta tables.

     Returns:
         Tuple of (clustering, topic words) Spark DataFrames.
     """
     read_table = function.read_delta
     clusters = read_table(self.spark, self.path_source_clustering)
     words = read_table(self.spark, self.path_source_topicwords)
     return clusters, words
Example #5
0
 def extract(self):
     """Load the document and topic-id source Delta tables.

     Returns:
         Tuple of (documents, topic ids) Spark DataFrames — note the
         documents table comes first in the returned pair.
     """
     documents = function.read_delta(self.spark,
                                     self.path_source_documents)
     topic_ids = function.read_delta(self.spark,
                                     self.path_source_topic_ids)
     return documents, topic_ids
Example #6
0
 def extract(self):
     """Read the embeddings Delta table from the configured source path.

     Returns:
         Spark DataFrame loaded from ``self.path_source``.
     """
     return function.read_delta(self.spark, self.path_source)