def get_batch_results(self, batch):
    """Package a batch for publishing without modifying its contents.

    The target file name is derived from the first row's id column so
    downstream consumers can associate the results with the source batch.
    """
    df = get_df_from_csv_string(batch)
    first_id = df[self.id_col_name].iloc[0]
    return {
        "target_results_file_name": f"batch{first_id}.csv",
        "file_content": batch,
    }
def test_get_batch_results(TEXT_CSV_DATA):
    """Each row's tokenized output must match the expected 'Tokens' column.

    A missing expected value corresponds to an empty token list ("[]").
    """
    tokenizer = BatchTokenizer(
        source=None,
        dest=None,
        dest_col_name="Actual Tokens",
        include_cols=["Tokens"],
    )
    results = tokenizer.get_batch_results(TEXT_CSV_DATA)
    result_df = get_df_from_csv_string(results["file_content"])
    for _, row in result_df.iterrows():
        expected = row["Tokens"]
        actual = row["Actual Tokens"]
        if pd.isnull(expected):
            assert actual == "[]"
        else:
            assert expected == actual
def get_batch_results(self, batch):
    """Tokenize the batch's source column and package the results as CSV.

    Returns a dict with the target file name (derived from the first
    row's id) and CSV content holding the id, token, and included columns.
    """
    df = get_df_from_csv_string(batch)
    df[self.dest_col_name] = self.get_tokenized_column(df, self.source_col_name)
    columns = [self.id_col_name, self.dest_col_name, *self.include_cols]
    return {
        "target_results_file_name": f"batch{df[self.id_col_name].iloc[0]}.csv",
        "file_content": get_csv_string_from_df(df[columns]),
    }
def get_batch_results(self, batch):
    """Preprocess the batch's source column and package the results as CSV.

    Applies URL removal followed by spelling autocorrection, then emits
    a CSV with the id, preprocessed, and included columns.
    """
    df = get_df_from_csv_string(batch)
    without_urls = df[self.source_col_name].apply(self.remove_url)
    df[self.dest_col_name] = without_urls.apply(self.autocorrect_spelling)
    columns = [self.id_col_name, self.dest_col_name, *self.include_cols]
    return {
        "target_results_file_name": f"batch{df[self.id_col_name].iloc[0]}.csv",
        "file_content": get_csv_string_from_df(df[columns]),
    }
def test_get_batch_results(TEXT_CSV_DATA):
    """Preprocessed output must match the expected column row by row.

    A missing expected value must correspond to a missing actual value.
    """
    preprocessor = BatchPreprocessor(
        source=None,
        dest=None,
        dest_col_name="Actual Preprocessed Text",
        include_cols=["Preprocessed Text"],
    )
    results = preprocessor.get_batch_results(TEXT_CSV_DATA)
    result_df = get_df_from_csv_string(results["file_content"])
    for _, row in result_df.iterrows():
        expected = row["Preprocessed Text"]
        actual = row["Actual Preprocessed Text"]
        if pd.isnull(expected):
            assert pd.isnull(actual)
        else:
            assert expected == actual
def test_get_tokenized_column(TEXT_CSV_DATA):
    """Tokenizing the preprocessed text column must reproduce 'Tokens'.

    Rows with a missing expected value must yield an empty token list.
    """
    tokenizer = BatchTokenizer(
        source=None,
        dest=None,
        dest_col_name="Actual Tokens",
        include_cols=["Tokens"],
    )
    df = get_df_from_csv_string(TEXT_CSV_DATA)
    df["Actual Tokens"] = tokenizer.get_tokenized_column(
        df, "Preprocessed Text")
    for _, row in df.iterrows():
        expected = row["Tokens"]
        actual = str(row["Actual Tokens"])
        if pd.isnull(expected):
            assert actual == "[]"
        else:
            assert expected == actual
def run(self):
    """Consolidate every batch from the source into one CSV and publish it.

    Reads batches until the source raises StopIteration, then publishes
    the concatenated result as "consolidated_batches.csv". Any other
    exception aborts the run after being logged with its traceback.
    """
    # Collect frames in a list and concatenate once at the end:
    # pd.concat inside the loop is quadratic in the number of batches.
    batch_frames = []
    while True:
        try:
            batch = self.source.get_next_batch()
            batch_frames.append(get_df_from_csv_string(batch))
            self.source.mark_batch_as_complete()
        except StopIteration:
            # Source exhausted: publish whatever was collected (an empty
            # frame when no batches were read, matching prior behavior).
            logger.info("Finished reading batches from source.")
            df = pd.concat(batch_frames) if batch_frames else pd.DataFrame()
            results_file_content = get_csv_string_from_df(df)
            target_file_name = "consolidated_batches.csv"
            self.dest.publish_batch_results(results_file_content,
                                            target_file_name)
            break
        except Exception:
            # logger.exception preserves the traceback; logger.error(e)
            # only recorded the message text.
            logger.exception("Error while consolidating batches.")
            break
def get_batch_results(self, batch):
    """Tokenize each question-word column of the batch and package as CSV.

    The output columns are the id, the included columns, then each
    question-word column paired with its "<word> tokens" column.
    """
    df = get_df_from_csv_string(batch)
    columns = [self.id_col_name, *self.include_cols]
    for question_type in QUESTION_WORDS:
        tokens_col = question_type + " tokens"
        df[tokens_col] = self.get_tokenized_column(df, question_type)
        columns += [question_type, tokens_col]
    return {
        "target_results_file_name": f"batch{df[self.id_col_name].iloc[0]}.csv",
        "file_content": get_csv_string_from_df(df[columns]),
    }
def get_batch_results(self, batch):
    """Extract the top wh-phrase per question word for every row.

    Each question word becomes a column holding the top phrase found in
    that row's source text (None when absent); the result is packaged as
    a CSV string alongside a batch-derived target file name.
    """
    df = get_df_from_csv_string(batch)
    for question_type in QUESTION_WORDS:
        df[question_type] = None
    for idx, row in df.iterrows():
        phrases = self.get_top_wh_phrases(row[self.source_col_name])
        for question_type in QUESTION_WORDS:
            df.at[idx, question_type] = phrases.get(question_type)
    columns = [self.id_col_name, *QUESTION_WORDS, *self.include_cols]
    return {
        "target_results_file_name": f"batch{df[self.id_col_name].iloc[0]}.csv",
        "file_content": get_csv_string_from_df(df[columns]),
    }