Beispiel #1
0
 def get_batch_results(self, batch):
     """Package the incoming batch, unchanged, as a results payload.

     The batch is parsed with ``get_df_from_csv_string`` only to read the
     first value of the ``self.id_col_name`` column, which names the
     target file.

     Returns:
         A dict with ``"target_results_file_name"`` (``batch<first id>.csv``)
         and ``"file_content"`` (the original ``batch`` argument).
     """
     first_id = get_df_from_csv_string(batch)[self.id_col_name].iloc[0]
     return {
         "target_results_file_name": f"batch{first_id}.csv",
         "file_content": batch,
     }
Beispiel #2
0
def test_get_batch_results(TEXT_CSV_DATA):
    """Compare the tokens produced by ``get_batch_results`` with the fixture.

    For every row of the returned CSV, the ``"Actual Tokens"`` value must
    equal the ``"Tokens"`` value; a null expectation is also satisfied by
    the literal string ``"[]"``.
    """
    tokenizer = BatchTokenizer(
        source=None,
        dest=None,
        dest_col_name="Actual Tokens",
        include_cols=["Tokens"],
    )
    output = tokenizer.get_batch_results(TEXT_CSV_DATA)
    result_df = get_df_from_csv_string(output["file_content"])

    for _, row in result_df.iterrows():
        expected, actual = row["Tokens"], row["Actual Tokens"]
        # A missing expectation is matched by an empty token list.
        if pd.isnull(expected) and actual == "[]":
            continue
        assert expected == actual
Beispiel #3
0
    def get_batch_results(self, batch):
        """Tokenize the source column of a CSV batch and package the result.

        The batch is parsed into a DataFrame, the output of
        ``self.get_tokenized_column`` for ``self.source_col_name`` is stored
        under ``self.dest_col_name``, and the id column, the new column and
        ``self.include_cols`` (in that order) are serialized back to CSV.

        Returns:
            A dict with ``"target_results_file_name"`` (``batch<first id>.csv``)
            and ``"file_content"`` (the CSV string of the selected columns).
        """
        frame = get_df_from_csv_string(batch)
        frame[self.dest_col_name] = self.get_tokenized_column(
            frame, self.source_col_name)

        wanted_cols = [self.id_col_name, self.dest_col_name, *self.include_cols]
        csv_text = get_csv_string_from_df(frame[wanted_cols])

        # The file is named after the id found in the batch's first row.
        first_id = frame[self.id_col_name].iloc[0]
        return {
            "target_results_file_name": f"batch{first_id}.csv",
            "file_content": csv_text,
        }
Beispiel #4
0
    def get_batch_results(self, batch):
        """Preprocess the source column of a CSV batch and package the result.

        Each value of ``self.source_col_name`` is passed through
        ``self.remove_url`` and then ``self.autocorrect_spelling``; the outcome
        is stored under ``self.dest_col_name``. The id column, the new column
        and ``self.include_cols`` (in that order) are serialized back to CSV.

        Returns:
            A dict with ``"target_results_file_name"`` (``batch<first id>.csv``)
            and ``"file_content"`` (the CSV string of the selected columns).
        """
        frame = get_df_from_csv_string(batch)

        # Two passes over the column: strip URLs first, then fix spelling.
        without_urls = frame[self.source_col_name].apply(self.remove_url)
        frame[self.dest_col_name] = without_urls.apply(self.autocorrect_spelling)

        wanted_cols = [self.id_col_name, self.dest_col_name, *self.include_cols]
        csv_text = get_csv_string_from_df(frame[wanted_cols])

        first_id = frame[self.id_col_name].iloc[0]
        return {
            "target_results_file_name": f"batch{first_id}.csv",
            "file_content": csv_text,
        }
Beispiel #5
0
def test_get_batch_results(TEXT_CSV_DATA):
    """Compare the text produced by ``get_batch_results`` with the fixture.

    For every row of the returned CSV, ``"Actual Preprocessed Text"`` must
    equal ``"Preprocessed Text"``; two null values also count as a match.
    """
    preprocessor = BatchPreprocessor(
        source=None,
        dest=None,
        dest_col_name="Actual Preprocessed Text",
        include_cols=["Preprocessed Text"],
    )
    output = preprocessor.get_batch_results(TEXT_CSV_DATA)
    result_df = get_df_from_csv_string(output["file_content"])

    for _, row in result_df.iterrows():
        expected = row["Preprocessed Text"]
        actual = row["Actual Preprocessed Text"]
        # Nulls never compare equal to each other, so accept them explicitly.
        if pd.isnull(expected) and pd.isnull(actual):
            continue
        assert expected == actual
Beispiel #6
0
def test_get_tokenized_column(TEXT_CSV_DATA):
    """Compare ``get_tokenized_column`` output with the fixture's tokens.

    The ``"Preprocessed Text"`` column is tokenized into ``"Actual Tokens"``.
    For every row, the string form of the actual value must equal the
    ``"Tokens"`` value; a null expectation is also satisfied by ``"[]"``.
    """
    tokenizer = BatchTokenizer(
        source=None,
        dest=None,
        dest_col_name="Actual Tokens",
        include_cols=["Tokens"],
    )
    frame = get_df_from_csv_string(TEXT_CSV_DATA)
    frame["Actual Tokens"] = tokenizer.get_tokenized_column(
        frame, "Preprocessed Text")

    for _, row in frame.iterrows():
        expected = row["Tokens"]
        # The expectation comes from CSV text, so compare against str(actual).
        actual_text = str(row["Actual Tokens"])
        if pd.isnull(expected) and actual_text == "[]":
            continue
        assert expected == actual_text
Beispiel #7
0
 def run(self):
     """Read every batch from the source and publish them as one CSV.

     Batches are pulled with ``self.source.get_next_batch()`` until it
     raises ``StopIteration``. Each batch is parsed into a DataFrame,
     collected, and then marked complete on the source. Once the source is
     exhausted, the collected frames are concatenated and published via
     ``self.dest.publish_batch_results`` as ``consolidated_batches.csv``.

     Any other exception raised while reading a batch is logged together
     with its traceback and ends the run without publishing anything.
     """
     frames = []
     while True:
         try:
             batch = self.source.get_next_batch()
             frames.append(get_df_from_csv_string(batch))
             self.source.mark_batch_as_complete()
         except StopIteration:
             logger.info("Finished reading batches from source.")
             # Concatenate once at the end: calling pd.concat inside the
             # loop re-copies every previously read row on each batch.
             # With no batches, publish an empty frame as before.
             df = pd.concat(frames) if frames else pd.DataFrame()
             results_file_content = get_csv_string_from_df(df)
             target_file_name = "consolidated_batches.csv"
             self.dest.publish_batch_results(results_file_content,
                                             target_file_name)
             break
         except Exception as e:
             # logger.exception keeps the traceback that logger.error drops.
             logger.exception(e)
             break
Beispiel #8
0
    def get_batch_results(self, batch):
        """Tokenize every question-word column of a CSV batch.

        For each entry of ``QUESTION_WORDS`` the column of that name is passed
        to ``self.get_tokenized_column`` and the result is stored in a new
        ``"<question word> tokens"`` column. The output keeps the id column,
        ``self.include_cols``, and then each question-word column followed by
        its tokens column.

        Returns:
            A dict with ``"target_results_file_name"`` (``batch<first id>.csv``)
            and ``"file_content"`` (the CSV string of the selected columns).
        """
        frame = get_df_from_csv_string(batch)
        wanted_cols = [self.id_col_name, *self.include_cols]

        for wh_word in QUESTION_WORDS:
            tokens_col = wh_word + " tokens"
            wanted_cols += [wh_word, tokens_col]
            frame[tokens_col] = self.get_tokenized_column(frame, wh_word)

        csv_text = get_csv_string_from_df(frame[wanted_cols])

        first_id = frame[self.id_col_name].iloc[0]
        return {
            "target_results_file_name": f"batch{first_id}.csv",
            "file_content": csv_text,
        }
Beispiel #9
0
    def get_batch_results(self, batch):
        """Extract the top phrase per question word for each row of a batch.

        A column is added for every entry of ``QUESTION_WORDS``. For each row,
        ``self.get_top_wh_phrases`` is called on the ``self.source_col_name``
        value and its result is queried with ``.get`` for every question word.
        The output keeps the id column, the question-word columns and
        ``self.include_cols``, in that order.

        Returns:
            A dict with ``"target_results_file_name"`` (``batch<first id>.csv``)
            and ``"file_content"`` (the CSV string of the selected columns).
        """
        frame = get_df_from_csv_string(batch)

        # Create the question-word columns up front, defaulting to None.
        for wh_word in QUESTION_WORDS:
            frame[wh_word] = None

        for row_label, row in frame.iterrows():
            phrases = self.get_top_wh_phrases(row[self.source_col_name])
            for wh_word in QUESTION_WORDS:
                frame.at[row_label, wh_word] = phrases.get(wh_word)

        wanted_cols = [self.id_col_name, *QUESTION_WORDS, *self.include_cols]
        csv_text = get_csv_string_from_df(frame[wanted_cols])

        first_id = frame[self.id_col_name].iloc[0]
        return {
            "target_results_file_name": f"batch{first_id}.csv",
            "file_content": csv_text,
        }