def __init__(self,
              input_node: WillumpGraphNode,
              input_name: str,
              output_name: str,
              input_vocab_dict: Mapping[str, int],
              input_idf_vector,
              aux_data: List[Tuple[int, WeldType]],
              ngram_range: Tuple[int, int],
              analyzer: str = "char",
              cost: float = 0) -> None:
     """
     Set up the node and register its auxiliary data in aux_data.

     Two auxiliary input names are derived from the current length of
     aux_data: AUX_DATA_<n> for the vocabulary and AUX_DATA_<n + 1> for the
     IDF vector.  Every entry produced by self._process_aux_data is then
     appended to aux_data, so the caller's list is mutated in place.

     input_node: node wired in as the first input of this node.
     input_name: name of the input string array; listed first in the input names.
     output_name: name recorded as this node's output.
     input_vocab_dict: maps each vocabulary term to its index; terms are
         ordered by that index before being handed to _process_aux_data.
     input_idf_vector: passed through to _process_aux_data unchanged.
     aux_data: shared auxiliary-data list, extended by this constructor.
     ngram_range: (min, max) pair stored as _min_gram and _max_gram.
     analyzer: stored unchanged on the node.
     cost: stored unchanged on the node.
     """
     # Aux-data names are positional: they index into aux_data as it stands now.
     next_slot = len(aux_data)
     self._input_array_string_name = input_name
     self._output_name = output_name
     # Vocabulary terms ordered by the index the mapping assigns to them.
     vocabulary_list = sorted(input_vocab_dict,
                              key=input_vocab_dict.__getitem__)
     self._vocab_size = len(vocabulary_list)
     self.output_width = self._vocab_size
     self._vocab_dict_name = "AUX_DATA_{0}".format(next_slot)
     self._idf_vector_name = "AUX_DATA_{0}".format(next_slot + 1)
     self._input_nodes = [
         input_node,
         WillumpInputNode(self._vocab_dict_name),
         WillumpInputNode(self._idf_vector_name),
     ]
     self._input_names = [
         input_name, self._vocab_dict_name, self._idf_vector_name
     ]
     min_gram, max_gram = ngram_range
     self._min_gram = min_gram
     self._max_gram = max_gram
     self._analyzer = analyzer
     self._cost = cost
     # NOTE(review): presumably yields exactly two entries matching the two
     # names reserved above -- confirm against _process_aux_data.
     aux_data.extend(self._process_aux_data(vocabulary_list, input_idf_vector))
Beispiel #2
0
 def test_basic_cv(self):
     """
     Run an ArrayCountVectorizerNode over three strings with the vocabulary
     from simple_vocabulary.txt and check the first three output arrays.
     """
     print("\ntest_basic_cv")
     # Each vocabulary line becomes a term whose index is its line number.
     with open("tests/test_resources/simple_vocabulary.txt") as vocab_file:
         vocab_lines = vocab_file.read().splitlines()
     simple_vocab_dict = {}
     for position, word in enumerate(vocab_lines):
         simple_vocab_dict[word] = position

     input_str = ["catcatcat", "dogdogdog", "elephantcat"]
     aux_data = []
     source_node = WillumpInputNode("input_str")
     vectorizer_node = ArrayCountVectorizerNode(
         source_node, "input_str", output_name='lowered_output_words',
         input_vocab_dict=simple_vocab_dict,
         aux_data=aux_data, ngram_range=(2, 5))
     sink_node = WillumpOutputNode(vectorizer_node, ["lowered_output_words"])
     graph = WillumpGraph(sink_node)
     type_map = {
         "input_str": WeldVec(WeldStr()),
         "lowered_output_words": WeldCSR((WeldLong())),
     }
     weld_output = wexec.execute_from_basics(
         graph, type_map, (input_str,),
         ["input_str"], ["lowered_output_words"], aux_data)

     # Expected contents of output components 0, 1 and 2, in that order.
     # NOTE(review): given the WeldCSR output type these are presumably row
     # indices, column indices and counts -- confirm.
     expected_components = ([0, 1, 2], [3, 4, 3], [3, 3, 1])
     for position, values in enumerate(expected_components):
         numpy.testing.assert_equal(
             weld_output[position], numpy.array(values, dtype=numpy.int64))
 def test_basic_hash_join(self):
     """
     Join the toy data table against the toy metadata table on join_column
     through a WillumpHashJoinNode and check three of the output columns.
     """
     print("\ntest_basic_hash_join")

     def make_left_schema():
         # Built fresh on every call so no type object is shared between uses.
         return WeldPandas(
             [WeldVec(WeldLong()), WeldVec(WeldLong()), WeldVec(WeldLong())],
             ["join_column", "data1", "data2"])

     left_table = pd.read_csv("tests/test_resources/toy_data_csv.csv")
     right_table = pd.read_csv("tests/test_resources/toy_metadata_csv.csv")
     aux_data = []
     source_node = WillumpInputNode("input_table")
     join_node = WillumpHashJoinNode(
         input_node=source_node, input_name="input_table", output_name="output",
         join_col_names=["join_column"],
         right_dataframe=right_table, aux_data=aux_data,
         left_input_type=make_left_schema())
     sink_node = WillumpOutputNode(join_node, ["output"])
     graph = WillumpGraph(sink_node)

     joined_schema = WeldPandas(
         [WeldVec(WeldLong()), WeldVec(WeldLong()), WeldVec(WeldLong()),
          WeldVec(WeldDouble()), WeldVec(WeldDouble())],
         ["join_column", "data1", "data2", "metadata1", "metadata2"])
     type_map = {"input_table": make_left_schema(), "output": joined_schema}

     # The single input is a tuple with one array per left-table column.
     left_columns = tuple(
         left_table[column].values
         for column in ("join_column", "data1", "data2"))
     weld_output = wexec.execute_from_basics(graph=graph,
                                             type_map=type_map,
                                             inputs=(left_columns,),
                                             input_names=["input_table"],
                                             output_names=["output"],
                                             aux_data=aux_data)

     # (output position, expected values, dtype) for each checked column.
     expectations = [
         (1, [4, 5, 2, 5, 3], numpy.int64),
         (3, [1.2, 2.2, 2.2, 3.2, 1.2], numpy.float64),
         (4, [1.3, 2.3, 2.3, 3.3, 1.3], numpy.float64),
     ]
     for position, values, dtype in expectations:
         numpy.testing.assert_equal(
             weld_output[position], numpy.array(values, dtype=dtype))
 def __init__(self, input_node: WillumpGraphNode, input_name: str,
              left_input_type: WeldType, output_name: str,
              join_col_names: List[str], right_dataframe,
              aux_data: List[Tuple[int, WeldType]]) -> None:
     """
     Set up the join node and register the right-hand table in aux_data.

     The right dataframe is exposed as an auxiliary input named
     AUX_DATA_<n>, where n is the length of aux_data on entry.  Every entry
     produced by self._process_aux_data is appended to aux_data in place.

     input_node: node wired in as the first input of this node.
     input_name: name of the left input table.
     left_input_type: type of the left table; must be a WeldPandas.
     output_name: name recorded as this node's output.
     join_col_names: stored unchanged as the join column names.
     right_dataframe: right-hand table, stored and passed to _process_aux_data.
     aux_data: shared auxiliary-data list, extended by this constructor.
     """
     right_table_name = "AUX_DATA_{0}".format(len(aux_data))
     self.left_input_name = input_name
     self._output_name = output_name
     self._right_dataframe = right_dataframe
     self._right_dataframe_name = right_table_name
     self.join_col_names = join_col_names
     self._input_nodes = [input_node, WillumpInputNode(right_table_name)]
     self._input_names = [input_name, right_table_name]
     assert isinstance(left_input_type, WeldPandas)
     self.left_df_type = left_input_type
     aux_data.extend(self._process_aux_data(right_dataframe))
     # NOTE(review): right_df_type is not assigned in this method; presumably
     # _process_aux_data sets it as a side effect -- confirm.
     left_schema = self.left_df_type
     right_schema = self.right_df_type
     # The output schema is the left columns followed by the right columns.
     self._output_type = WeldPandas(
         field_types=left_schema.field_types + right_schema.field_types,
         column_names=left_schema.column_names + right_schema.column_names)
Beispiel #5
0
 def test_mixed_string_lower(self):
     """
     Lower-case the target_col column through a StringLowerNode and check
     that the output holds the lowered strings followed by the originals.
     """
     print("\ntest_mixed_string_lower")

     def make_output_schema():
         # Built fresh on every call so no type object is shared between uses.
         return WeldPandas([WeldVec(WeldStr()), WeldVec(WeldStr())],
                           ["new_col", "target_col"])

     input_df = pd.DataFrame({"target_col": ["aA,.,.a", "B,,b", "c34234C"]})
     input_str = list(input_df["target_col"].values)
     source_node = WillumpInputNode("input_str")
     # NOTE(review): input_type below lists WeldStr() for target_col while
     # type_map lists WeldVec(WeldStr()) for the same column -- confirm that
     # the difference is intended.
     lower_node = StringLowerNode(
         input_node=source_node, input_name="input_str",
         input_type=WeldPandas([WeldStr()], ["target_col"]),
         input_col="target_col",
         output_name="lowered_output_words",
         output_col="new_col",
         output_type=make_output_schema())
     sink_node = WillumpOutputNode(lower_node, ["lowered_output_words"])
     graph = WillumpGraph(sink_node)
     type_map = {
         "input_str": WeldPandas([WeldVec(WeldStr())], ["target_col"]),
         "lowered_output_words": make_output_schema(),
     }
     weld_output = wexec.execute_from_basics(
         graph, type_map, ((input_str,),),
         ["input_str"], ["lowered_output_words"], [])
     expected = (["aa,.,.a", "b,,b", "c34234c"], ["aA,.,.a", "B,,b", "c34234C"])
     self.assertEqual(weld_output, expected)
 def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
     """
     Process a function definition.

     Every argument gets a WillumpInputNode (typed from self._type_map),
     which is recorded in self._node_dict, and its name is appended to
     self.arg_list.  Each body statement is then dispatched: assignments go
     to analyze_Assign, returns go to analyze_Return, and anything else goes
     to _create_py_node.  The node produced for a statement is recorded
     under each of the names it defines.
     """
     for parameter in node.args.args:
         name = self.get_store_name(parameter.arg, node.lineno)
         self._node_dict[name] = WillumpInputNode(
             name, arg_type=self._type_map[name])
         self.arg_list.append(name)

     for statement in node.body:
         if isinstance(statement, ast.Return):
             # Returns define no new names, so there is nothing to record.
             self.analyze_Return(statement)
             continue
         if isinstance(statement, ast.Assign):
             defined_names, producer = self.analyze_Assign(statement)
         else:
             defined_names, producer = self._create_py_node(statement)
         for defined_name in defined_names:
             self._node_dict[defined_name] = producer
def process_weld_block(weld_block_input_set, weld_block_aux_input_set, weld_block_output_set, weld_block_node_list,
                       future_nodes, typing_map, num_workers, eval_cascades, batch) \
        -> List[typing.Union[ast.AST, Tuple[List[str], List[str], List[List[str]]]]]:
    """
    Helper for graph_to_weld: turn one block of Weld nodes into Willump statements.

    Steps, in order:
      1. Drop from weld_block_output_set (in place) every output that no
         future node reads before some future node produces it again.
      2. Run the CSR, pandas and pandas-series marshalling passes to obtain
         pre- and post-processing nodes.
      3. If num_workers + 1 > 1, let the multithreading pass split the
         block; otherwise keep it as a single entry.
      4. Prepend input nodes (regular and auxiliary) and append a
         multi-output node to every node list, mutating those lists in place.
      5. Concatenate each node list's Weld code into one program string.

    Returns the Python statements of the preprocessing nodes, followed by
    one (weld program strings, input names, output name collections) tuple
    per entry, followed by the Python statements of the postprocessing
    nodes.  eval_cascades is accepted but not used in this function's body.
    """

    def read_before_redefined(name):
        # True when a future node consumes `name` before one re-produces it.
        for later_node in future_nodes:
            if any(name == consumed for consumed in later_node.get_in_names()):
                return True
            if any(name == produced for produced in later_node.get_output_names()):
                return False
        return False

    def wrap_with_io_nodes(nodes, input_names, output_names):
        # Inserting at the front one at a time reproduces the original
        # ordering: auxiliary inputs end up ahead of the regular inputs.
        for name_group in (input_names, weld_block_aux_input_set):
            for name in name_group:
                nodes.insert(0, WillumpInputNode(name))
        nodes.append(WillumpMultiOutputNode(list(output_names)))

    def render_weld(nodes):
        # One Weld program: the nodes' code concatenated in list order.
        return "".join(weld_node.get_node_weld() for weld_node in nodes)

    # Do not emit any outputs that are not needed in later blocks.
    for candidate in weld_block_output_set.copy():
        if not read_before_redefined(candidate):
            weld_block_output_set.remove(candidate)

    # Marshalling passes over the block's inputs and outputs.
    csr_pre, csr_post = wg_passes.weld_csr_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map)
    pandas_pre, pandas_post = wg_passes.weld_pandas_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map, batch)
    series_pre, series_post = wg_passes.weld_pandas_series_marshalling_pass(
        weld_block_input_set, weld_block_output_set, typing_map)
    preprocess_nodes = csr_pre + pandas_pre + series_pre
    postprocess_nodes = csr_post + pandas_post + series_post

    # Split the block across threads; the main thread also does work.
    num_threads = num_workers + 1
    if num_threads > 1:
        threaded_statements_list = wg_passes.multithreading_weld_blocks_pass(
            weld_block_node_list, weld_block_input_set, weld_block_output_set, num_threads)
    else:
        threaded_statements_list = [
            (weld_block_node_list, weld_block_input_set, weld_block_output_set)
        ]

    # Entries come in two shapes: (inputs, [(nodes, outputs), ...]) for a
    # threaded group, or (nodes, inputs, outputs) for a single block.
    for plan in threaded_statements_list:
        if len(plan) == 2:
            shared_inputs, per_thread = plan
            for thread_nodes, thread_outputs in per_thread:
                wrap_with_io_nodes(thread_nodes, shared_inputs, thread_outputs)
        else:
            block_nodes, block_inputs, block_outputs = plan
            wrap_with_io_nodes(block_nodes, block_inputs, block_outputs)

    # Build the statement tuples only once every node list has been wrapped.
    weld_string_nodes = []
    for plan in threaded_statements_list:
        if len(plan) == 2:
            shared_inputs, per_thread = plan
            rendered = [(render_weld(thread_nodes), thread_outputs)
                        for thread_nodes, thread_outputs in per_thread]
            programs = [program for program, _ in rendered]
            output_groups = [outputs for _, outputs in rendered]
            weld_string_nodes.append(
                (programs, list(shared_inputs), output_groups))
        else:
            block_nodes, block_inputs, block_outputs = plan
            weld_string_nodes.append(
                ([render_weld(block_nodes)], list(block_inputs), [list(block_outputs)]))

    preprocess_python = [pre_node.get_python() for pre_node in preprocess_nodes]
    postprocess_python = [post_node.get_python() for post_node in postprocess_nodes]
    return preprocess_python + weld_string_nodes + postprocess_python