def tokenize_text_rdd(self, rdd_input):
    """Parse and tokenize a TEXT-format keyed RDD.

    Each RDD element, when parsed, yields a tuple of strings to be
    tokenized; each string yields a list of tokens, one per input
    column_path.

    :param rdd_input: keyed RDD whose values are raw text records
    :returns: the tokenized RDD produced by ``self._tokenize_rdd``
    :raises ValueError: if no parser can be built from the config/options
    """
    input_parser = ParserFactory.get_parser(self.config, **self.options)
    if input_parser:
        rdd_parsed = rdd_input.mapValues(
            lambda x: input_parser.parse_values(x))
        # Option flags are stored as strings; 'False' requests that
        # empty lines be filtered out of the parsed RDD.
        if self.options.get("emptylines") == 'False':
            rdd_parsed = rdd_parsed.filter(
                lambda x: self.filter_emptylines(x))
        return self._tokenize_rdd(rdd_parsed)
    else:
        # Fail loudly instead of silently returning None — consistent
        # with tokenize_seq_rdd. (Also removed a leftover debug print.)
        raise ValueError("No input_parser")
def tokenize_seq_rdd(self, rdd_input):
    """Parse and tokenize a SEQUENCE-format keyed RDD.

    Each RDD element, when parsed, yields a tuple of strings to be
    tokenized; each string yields a list of tokens, one per input
    column_path.

    :param rdd_input: keyed RDD whose values are raw sequence records
    :returns: the tokenized RDD produced by ``self._tokenize_rdd``
    :raises ValueError: if no parser can be built from the config/options
    """
    parser = ParserFactory.get_parser(self.config, **self.options)
    # Guard clause: without a parser there is nothing we can do.
    if not parser:
        raise ValueError("No input_parser")
    parsed = rdd_input.mapValues(lambda record: parser.parse_values(record))
    # Option flags are stored as strings; 'False' requests that empty
    # lines be dropped from the parsed RDD.
    if self.options.get("emptylines") == 'False':
        parsed = parsed.filter(lambda record: self.filter_emptylines(record))
    return self._tokenize_rdd(parsed)
def tokenize_text_rdd(self, rdd_input):
    """Parse and tokenize a TEXT-format keyed RDD.

    Each RDD element, when parsed, yields a tuple of strings to be
    tokenized; each string yields a list of tokens, one per input
    column_path.

    :param rdd_input: keyed RDD whose values are raw text records
    :returns: the tokenized RDD produced by ``self._tokenize_rdd``
    :raises ValueError: if no parser can be built from the config/options
    """
    input_parser = ParserFactory.get_parser(self.config, **self.options)
    if input_parser:
        rdd_parsed = rdd_input.mapValues(
            lambda x: input_parser.parse_values(x))
        # Option flags are stored as strings; 'False' requests that
        # empty lines be filtered out of the parsed RDD.
        if (self.options.get("emptylines") == 'False'):
            rdd_parsed = rdd_parsed.filter(
                lambda x: self.filter_emptylines(x))
        return self._tokenize_rdd(rdd_parsed)
    else:
        # Fail loudly instead of silently returning None — consistent
        # with tokenize_seq_rdd. (Also removed a leftover debug print.)
        raise ValueError("No input_parser")
def tokenize_seq_rdd(self, rdd_input):
    """Parse and tokenize a SEQUENCE-format keyed RDD.

    Each RDD element, when parsed, yields a tuple of strings to be
    tokenized; each string yields a list of tokens, one per input
    column_path.

    :param rdd_input: keyed RDD whose values are raw sequence records
    :returns: the tokenized RDD produced by ``self._tokenize_rdd``
    :raises ValueError: if no parser can be built from the config/options
    """
    input_parser = ParserFactory.get_parser(self.config, **self.options)
    if input_parser is None or not input_parser:
        # No usable parser: surface the problem immediately.
        raise ValueError("No input_parser")
    parse = input_parser.parse_values
    rdd_parsed = rdd_input.mapValues(lambda value: parse(value))
    # Option flags are stored as strings; 'False' means filter out
    # empty lines from the parsed RDD before tokenizing.
    drop_empty = self.options.get("emptylines") == 'False'
    if drop_empty:
        rdd_parsed = rdd_parsed.filter(
            lambda value: self.filter_emptylines(value))
    return self._tokenize_rdd(rdd_parsed)
def tokenize_text_file(self, spark_context, filename, data_type):
    """Load a text file into an RDD, parse each line, and tokenize it.

    :param spark_context: SparkContext used to read ``filename``
    :param filename: path/URI handed to ``spark_context.textFile``
    :param data_type: parser selector forwarded to ``ParserFactory``
    :returns: the tokenized RDD produced by ``self.tokenize_rdd``
    :raises ValueError: if no parser can be built for ``data_type``
    """
    raw_data = spark_context.textFile(filename)
    # NOTE(review): this get_parser call shape (positional data_type and
    # options) differs from the RDD methods' (config, **options) — confirm
    # both overloads exist on ParserFactory.
    input_parser = ParserFactory.get_parser(data_type, self.config,
                                            self.options)
    if input_parser:
        data = raw_data.map(lambda x: input_parser.parse(x))
        # NOTE(review): siblings call self._tokenize_rdd; verify
        # tokenize_rdd (no underscore) is the intended entry point here.
        return self.tokenize_rdd(data)
    # Fail loudly instead of silently returning None — consistent with
    # tokenize_seq_rdd.
    raise ValueError("No input_parser")