def predict(self, domains):
    """Classify each given domain name and return its learned ``is_dga`` label.

    The domains are converted with ``du.str2ascii``, fed through
    ``self.model``, and the index of the highest-scoring class is taken as
    the label for each domain. Results are returned in the order the
    domains were supplied.

    :param domains: List of domains.
    :type domains: cudf.Series
    :return: Predicted results with respect to given domains.
    :rtype: cudf.Series

    Examples
    --------
    >>> dd.predict(['nvidia.com', 'dgadomain'])
    0    0
    1    1
    Name: is_dga, dtype: int64
    """
    frame = cudf.DataFrame({"domain": domains})
    n_domains = frame["domain"].count()
    encoded = du.str2ascii(frame, n_domains)

    # Adopt the index (and domain column) of the converted frame so that the
    # labels, which come back in the converted frame's row order, can be
    # mapped back to the caller's input order by the sort_index() below.
    frame.index = encoded.index
    frame["domain"] = encoded["domain"]

    # NOTE(review): positional drop("domain") relies on the installed cudf
    # treating the label as a column name — confirm against the cudf version.
    encoded = encoded.drop("domain")
    model_input, lengths = self.__create_variables(encoded)
    del encoded

    scores = self.model(model_input, lengths)
    # max(...) yields (values, indices); the indices are the predicted classes.
    best = scores.data.max(1, keepdim=True)
    predicted = best[1]
    frame["is_dga"] = predicted.view(-1).tolist()

    ordered = frame.sort_index()
    return ordered["is_dga"]
def __get_partitioned_dfs(self, df, batch_size):
    """Partition one dataframe into multiple smaller dataframes of a given batch size.

    The dataframe is first converted with ``du.str2ascii`` and the converted
    frame is then sliced into consecutive row ranges of ``batch_size`` rows
    (the last chunk may be shorter).

    :param df: Contains domains and their types.
    :type df: cudf.DataFrame
    :param batch_size: Number of records in each partitioned dataframe.
        Must be a positive number.
    :type batch_size: int
    :return: Tuple of (list of partitioned dataframes, dataset length), where
        the dataset length is the value of ``df["domain"].count()``.
    :rtype: tuple(list(cudf.DataFrame), int)
    :raises ValueError: If ``batch_size`` is not positive.
    """
    # A non-positive batch size would never advance past the end of the
    # dataset, so reject it up front instead of looping forever.
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got {!r}".format(batch_size)
        )

    dataset_len = df["domain"].count()
    df = du.str2ascii(df, dataset_len)

    # Consecutive [start, start + batch_size) row windows over the converted frame.
    partitioned_dfs = [
        df.iloc[start:start + batch_size]
        for start in range(0, dataset_len, batch_size)
    ]
    return partitioned_dfs, dataset_len
def test_str2ascii():
    """Check that ``du.str2ascii`` produces the expected dataframe for the test input."""
    result_df = du.str2ascii(test_input_df, test_domains_len)
    matches_expected = result_df.equals(expected_output_df)
    assert matches_expected