Code Example #1
    def fit_transform(self, docs_df):

        self._warn_for_unused_params()
        self._validate_params()
        topic_series = docs_df["Topic"]
        topic_df = topic_series.to_frame(name="Topic_ID")
        topic_df["doc_id"] = cp.arange(len(topic_df))

        docs = self.preprocess_text_gpu(docs_df["Document"])
        # Each topic is treated as one "document" when building the count matrix.
        n_doc = len(topic_df["Topic_ID"].unique())

        tokenized_df = self._create_tokenized_df(docs)
        self.vocabulary_ = tokenized_df["token"].unique()

        # With no `on` given, the merge joins on the columns shared by the two
        # frames; the topic assignment then becomes the document axis used for
        # counting.
        merged_count_df = (cudf.merge(
            tokenized_df, topic_df,
            how="left").sort_values("Topic_ID").rename({"Topic_ID": "doc_id"},
                                                       axis=1))

        count_df = self._count_vocab(merged_count_df)

        # TODO: handle empty docids case later
        empty_doc_ids = cp.empty(shape=0, dtype=cp.int32)
        X = create_csr_matrix_from_count_df(count_df,
                                            empty_doc_ids,
                                            n_doc,
                                            len(self.vocabulary_),
                                            dtype=self.dtype)
        if self.binary:
            X.data.fill(1)

        return X
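
For reference, a minimal standalone sketch (not from the original project) of the merge pattern above: when cudf.merge is called without an `on` argument, it joins on the columns the two frames share, here an assumed `doc_id` column linking tokens to topic assignments.

import cudf

# Invented toy frames; the column names are assumptions for illustration only.
tokens = cudf.DataFrame({"doc_id": [0, 0, 1], "token": ["gpu", "merge", "cudf"]})
topics = cudf.DataFrame({"doc_id": [0, 1], "Topic_ID": [5, 7]})

# With no `on` argument, cudf.merge joins on the shared "doc_id" column.
merged = cudf.merge(tokens, topics, how="left")
print(merged.sort_values("doc_id"))
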
Code Example #2
def test_dataframe_merge_on(on):
    np.random.seed(0)

    # Make cuDF
    df_left = DataFrame()
    nelem = 500
    df_left['key1'] = np.random.randint(0, 40, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['left_val'] = np.arange(nelem)

    df_right = DataFrame()
    nelem = 500
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['right_val'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how='left')

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how='left')
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how='left')

    join_result['right_val'] = (join_result['right_val'].astype(
        np.float64).fillna(np.nan))

    join_result_cudf['right_val'] = (join_result_cudf['right_val'].astype(
        np.float64).fillna(np.nan))

    for col in list(pddf_joined.columns):
        if (col.count('_y') > 0):
            join_result[col] = (join_result[col].astype(np.float64).fillna(
                np.nan))
            join_result_cudf[col] = (join_result_cudf[col].astype(
                np.float64).fillna(np.nan))

    # Test dataframe equality (ignore order of rows and columns)
    cdf_result = join_result.to_pandas() \
                            .sort_values(list(pddf_joined.columns)) \
                            .reset_index(drop=True)

    pdf_result = pddf_joined.sort_values(list(pddf_joined.columns)) \
                            .reset_index(drop=True)

    pd.util.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = join_result_cudf.to_pandas() \
                                            .sort_values(
                                                list(pddf_joined.columns)) \
                                            .reset_index(drop=True)

    pd.util.testing.assert_frame_equal(merge_func_result_cdf,
                                       cdf_result,
                                       check_like=True)
Code Example #3
File: test_joining.py  Project: wphicks/cudf
def test_dataframe_merge_on(on):
    np.random.seed(0)

    # Make cuDF
    df_left = DataFrame()
    nelem = 500
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    df_right = DataFrame()
    nelem = 500
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how="left")

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how="left")
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left")

    join_result["right_val"] = (join_result["right_val"].astype(
        np.float64).fillna(np.nan))

    join_result_cudf["right_val"] = (join_result_cudf["right_val"].astype(
        np.float64).fillna(np.nan))

    for col in list(pddf_joined.columns):
        if col.count("_y") > 0:
            join_result[col] = (join_result[col].astype(np.float64).fillna(
                np.nan))
            join_result_cudf[col] = (join_result_cudf[col].astype(
                np.float64).fillna(np.nan))

    # Test dataframe equality (ignore order of rows and columns)
    cdf_result = (join_result.to_pandas().sort_values(list(
        pddf_joined.columns)).reset_index(drop=True))

    pdf_result = pddf_joined.sort_values(list(
        pddf_joined.columns)).reset_index(drop=True)

    assert_eq(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = (join_result_cudf.to_pandas().sort_values(
        list(pddf_joined.columns)).reset_index(drop=True))

    assert_eq(merge_func_result_cdf, cdf_result, check_like=True)
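
The sorting and reset_index boilerplate in the two tests above exists because neither cudf.merge nor pandas guarantees row order after a join. A minimal sketch of the same comparison idea, with invented data:

import cudf
from pandas.testing import assert_frame_equal

left = cudf.DataFrame({"key": [1, 2, 2, 3], "lval": [10, 20, 21, 30]})
right = cudf.DataFrame({"key": [2, 3, 4], "rval": [200, 300, 400]})

got = cudf.merge(left, right, on="key", how="left")
expect = left.to_pandas().merge(right.to_pandas(), on="key", how="left")

# Unmatched keys leave nulls in "rval"; cast the cudf result to float64 so its
# dtype agrees with pandas, which upcasts to float when it inserts NaN.
got["rval"] = got["rval"].astype("float64")

# Row order is not guaranteed by either backend, so sort before comparing.
cols = list(expect.columns)
got = got.to_pandas().sort_values(cols).reset_index(drop=True)
expect = expect.sort_values(cols).reset_index(drop=True)
assert_frame_equal(got, expect, check_like=True)
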
Code Example #4
    def process(self, inputs):
        df = inputs[self.INPUT_PORT_NAME]
        output = {}

        col = list(df.columns)
        col.remove('year')
        col.remove('month')

        mdf = df[col].groupby('sample_id').mean()
        mdf.columns = [c + "_mean" for c in mdf.columns]

        sdf = df[col].groupby('sample_id').std()
        sdf.columns = [c + "_std" for c in sdf.columns]

        out = cudf.merge(mdf, sdf, left_index=True,
                         right_index=True).reset_index()
        output.update({self.OUTPUT_PORT_NAME: out})
        return output
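
A self-contained sketch of the index-to-index merge used above, with invented sample data and without the node plumbing:

import cudf

df = cudf.DataFrame({"sample_id": [0, 0, 1, 1], "val": [1.0, 3.0, 2.0, 4.0]})

mdf = df.groupby("sample_id").mean()
mdf.columns = [c + "_mean" for c in mdf.columns]
sdf = df.groupby("sample_id").std()
sdf.columns = [c + "_std" for c in sdf.columns]

# Join the two aggregates on their shared "sample_id" index, then flatten the
# index back into a regular column.
out = cudf.merge(mdf, sdf, left_index=True, right_index=True).reset_index()
print(out)
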
Code Example #5
    def process(self, inputs):
        """
        Merge the two input dataframes on the key defined in the `column`
        field of the node's conf.

        Arguments
        ---------
        inputs: list
            list of input dataframes.

        Returns
        -------
        dataframe
        """
        df1 = inputs[self.INPUT_PORT_LEFT_NAME]
        df2 = inputs[self.INPUT_PORT_RIGHT_NAME]
        return {self.OUTPUT_PORT_NAME: cudf.merge(df1, df2,
                                                  on=self.conf['column'],
                                                  how='inner')}
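
For context, a minimal sketch of the merge this node performs, using a hypothetical conf ({'column': 'asset'}) and toy frames:

import cudf

conf = {"column": "asset"}  # hypothetical node configuration
left = cudf.DataFrame({"asset": [1, 2, 3], "price": [10.0, 20.0, 30.0]})
right = cudf.DataFrame({"asset": [2, 3, 4], "volume": [200, 300, 400]})

# Inner join keeps only the keys present in both frames (assets 2 and 3).
out = cudf.merge(left, right, on=conf["column"], how="inner")
print(out.sort_values("asset"))
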
Code Example #6
def get_nearest_polygons_from_selected_point(
    point_lat, point_lon, average_speed, trip_time,
    nodes_df, edges_df, census_data
):
    times = [time.time()]

    lat, lon = createCircleAroundWithRadius(
        point_lat, point_lon, distanceInMiles
    )
    nodes = get_updated_df(lat, lon, nodes_df)

    edges = get_updated_edges(nodes, edges_df)
    times.append(time.time())

    # km per hour to m per minute
    meters_per_minute = (average_speed * 1000) / 60
    edges['time'] = edges['length'] / meters_per_minute

    point_of_interest = get_nearest_node(
        nodes, point=(point_lat, point_lon), x='x', y='y', osmid='vertex'
    )
    times.append(time.time())
    shortest_paths = get_shortest_paths(edges, point_of_interest)
    results = cudf.merge(
        shortest_paths, nodes[['vertex', 'y', 'x']],
        on='vertex', how='inner'
    )
    times.append(time.time())
    polygons = get_polygons_for_travel_time(results, trip_time)
    d = gpd.geodataframe.from_shapely(polygons)
    polygon = gpd.GeoDataFrame(
        index=[i for i in range(len(d))], geometry=d).reset_index()
    times.append(time.time())

    times = np.diff(times)
    times = np.round(times, 4)

    del results, shortest_paths, edges, nodes
    return (
        polygon,
        delayed(query_census_dataset)(polygons, census_data).compute(),
        times
    )
Code Example #7
    def _merge(self, dataframe, merge=True):
        with timer("merge"):
            for param_dict, features in tqdm(zip(self.param_dict,
                                                 self.features),
                                             total=len(self.features)):
                key, var, agg, on = self._get_params(param_dict)
                if merge:
                    if is_cudf(dataframe):
                        dataframe = cudf.merge(dataframe,
                                               features,
                                               how="left",
                                               on=on)
                    else:
                        dataframe = dataframe.merge(features,
                                                    how="left",
                                                    on=on)
                else:
                    new_features = self._get_feature_names(key, var, agg)
                    dataframe = pd.concat([dataframe, features[new_features]],
                                          axis=1)
        return dataframe
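
A sketch of the dispatch idea above. `is_cudf` is the project's own helper, so it is approximated here with a plain isinstance check; the data and names are invented.

import cudf

def left_join(dataframe, features, on):
    # Rough stand-in for the is_cudf() helper used above.
    if isinstance(dataframe, cudf.DataFrame):
        return cudf.merge(dataframe, features, how="left", on=on)
    return dataframe.merge(features, how="left", on=on)

gdf = cudf.DataFrame({"id": [1, 2], "x": [0.1, 0.2]})
feat = cudf.DataFrame({"id": [1, 2], "x_mean": [0.15, 0.15]})
print(left_join(gdf, feat, on="id"))
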
Code Example #8
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):

        with timer("load data"):
            train = train_df.copy()
            len_train = len(train)
            org_cols = train.columns.tolist()
            test = test_df.copy()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("GroupbyTransformer"):
            groupby = GroupbyTransformer(groupby_dict)
            total = groupby.transform(total)
            total["diff_Year_of_Release_groupby_Platform"] = (
                total["max_Year_of_Release_groupby_Platform"]
                - total["min_Year_of_Release_groupby_Platform"]
            )
            groupby = DiffGroupbyTransformer(groupby_dict)
            total = groupby.transform(total)
            total = reduce_mem_usage(total)
            groupby = RatioGroupbyTransformer(groupby_dict)
            total = groupby.transform(total)
            total = reduce_mem_usage(total)

        with timer("pivot_tables"):
            with timer("Publisher"):
                count_publishers_groupby_platform = cudf.from_pandas(
                    total.to_pandas()
                    .pivot_table(
                        index="Platform",
                        columns="Publisher",
                        values="Name",
                        aggfunc="count",
                    )
                    .reset_index()
                ).fillna(0.0)
                count_publishers_groupby_platform.columns = ["Platform"] + [
                    "count_publisher_" + str(col) + "_groupby_platform"
                    for col in count_publishers_groupby_platform.columns
                    if str(col) != "Platform"
                ]
                total = cudf.merge(
                    total, count_publishers_groupby_platform, how="left", on="Platform"
                )

            with timer("Genre"):
                count_genres_groupby_platform = cudf.from_pandas(
                    total.to_pandas()
                    .pivot_table(
                        index="Platform",
                        columns="Genre",
                        values="Name",
                        aggfunc="count",
                    )
                    .reset_index()
                ).fillna(0.0)
                count_genres_groupby_platform.columns = ["Platform"] + [
                    "count_genre_" + str(col) + "_groupby_platform"
                    for col in count_genres_groupby_platform.columns
                    if str(col) != "Platform"
                ]
                total = cudf.merge(
                    total, count_genres_groupby_platform, how="left", on="Platform"
                )

            with timer("Year_of_Release"):
                count_year_of_releases_groupby_platform = cudf.from_pandas(
                    total.to_pandas()
                    .pivot_table(
                        index="Platform",
                        columns="Year_of_Release",
                        values="Name",
                        aggfunc="count",
                    )
                    .reset_index()
                ).fillna(0.0)
                count_year_of_releases_groupby_platform.columns = ["Platform"] + [
                    "count_year_of_release_" + str(col) + "_groupby_platform"
                    for col in count_year_of_releases_groupby_platform.columns
                    if str(col) != "Platform"
                ]
                total = cudf.merge(
                    total,
                    count_year_of_releases_groupby_platform,
                    how="left",
                    on="Platform",
                )

            with timer("Rating"):
                count_ratings_groupby_platform = cudf.from_pandas(
                    total.to_pandas()
                    .pivot_table(
                        index="Platform",
                        columns="Rating",
                        values="Name",
                        aggfunc="count",
                    )
                    .reset_index()
                ).fillna(0.0)
                count_ratings_groupby_platform.columns = ["Platform"] + [
                    "count_rating_" + str(col) + "_groupby_platform"
                    for col in count_ratings_groupby_platform.columns
                    if str(col) != "Platform"
                ]
                total = cudf.merge(
                    total, count_ratings_groupby_platform, how="left", on="Platform"
                )

        with timer("end"):
            total = total.sort_values("index")
            new_cols = [col for col in total.columns if col not in org_cols + ["index"]]

            self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
            self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
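
All four pivot-table blocks above follow the same shape: build a per-Platform count table on the CPU, prefix its columns, and left-merge it back onto `total` on "Platform". A minimal sketch of that shape with invented data:

import cudf

# Toy frame; column names mirror the example above but the data is invented.
total = cudf.DataFrame({
    "Platform": ["PS4", "PS4", "X360"],
    "Genre": ["Action", "Sports", "Action"],
    "Name": ["a", "b", "c"],
})

# Pivot on the CPU (as the original does), fill missing combinations with 0,
# then bring the counts back to the GPU.
pivot = (
    total.to_pandas()
    .pivot_table(index="Platform", columns="Genre", values="Name", aggfunc="count")
    .fillna(0.0)
    .reset_index()
)
counts = cudf.from_pandas(pivot)
counts.columns = ["Platform"] + [
    "count_genre_" + str(c) + "_groupby_platform"
    for c in counts.columns if str(c) != "Platform"
]

# Left-merge the per-Platform counts back onto every row.
total = cudf.merge(total, counts, how="left", on="Platform")
print(total.sort_values("Platform"))
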
Code Example #9
def main():
    parser = argparse.ArgumentParser("RAPIDS_DBSCAN")
    parser.add_argument("--data_dir", type=str, help="Location of data")
    parser.add_argument('-f', type=str,
                        default='')  # added for notebook execution scenarios

    args = parser.parse_args()
    data_dir = args.data_dir

    run = Run.get_context()

    # specify the location of the data files
    DATA_PATH = data_dir

    # the sample PCAP file used for explanation
    DATA_PCAP = DATA_PATH + "/small_sample.pcap"

    # the flow connection log (conn.log) file
    DATA_SOURCE = DATA_PATH + "/conn.log"

    # the data label file (matches IP addresses with MAC addresses)
    DATA_LABELS = DATA_PATH + "/lab_mac_labels_cats.csv"

    print("Running NETWORK FLOW on GPU...")
    t1 = datetime.now()

    # ### Background

    ##### Types of Network Data
    # The most detailed type of data that is typically collected on a network is full Packet CAPture (PCAP) data. This information is detailed and contains everything about the communication, including: source address, destination address, protocols used, bytes transferred, and even the raw data (e.g., image, audio file, executable). PCAP data is fine-grained, meaning that there is a record for each frame being transmitted. A typical communication is composed of many individual packets/frames.
    #
    # If we aggregate PCAP data so that there is one row of data per communication session, we call that flow level data. A simplified example of this relationship is shown in the figure below.
    #
    # ![PCAP_flow_relationship](images/pcap_vs_flow.png "PCAP vs FLOW")
    #
    # For this tutorial, we use data from the University of New South Wales. In a lab environment, they [collected nearly three weeks of IoT data from 21 IoT devices](http://149.171.189.1). They also kept a detailed [list of devices by MAC address](http://149.171.189.1/resources/List_Of_Devices.txt), so we have ground-truth with respect to each IoT device's behavior on the network.
    #
    # **Our goal is to utilize the behavior exhibited in the network data to classify IoT devices.**

    ##### The Internet of Things and Data at a Massive Scale
    # Gartner estimates there are currently over 8.4 billion Internet of Things (IoT) devices. By 2020, that number is [estimated to surpass 20 billion](https://www.zdnet.com/article/iot-devices-will-outnumber-the-worlds-population-this-year-for-the-first-time/). These types of devices range from consumer devices (e.g., Amazon Echo, smart TVs, smart cameras, door bells) to commercial devices (e.g., building automation systems, keycard entry). All of these devices exhibit behavior on the Internet as they communicate back with their own clouds and user-specified integrations.

    ### Data Investigation

    # Let's first see some of the data. We'll load a PCAP file in using Scapy. If you don't want to or can't install Scapy, feel free to skip this section.
    cap = rdpcap(DATA_PCAP)

    # get the frames
    eth_frame = cap[3]
    ip_pkt = eth_frame.payload
    segment = ip_pkt.payload
    data = segment.payload

    print(eth_frame.show())

    # There's really a lot of features there. In addition to having multiple layers (which may differ between packets), there are a number of other issues with working directly with PCAP. Often the payload (the `Raw` section above) is encrypted, rendering it useless. The lack of aggregation also makes it difficult to differentiate between packets. What we really care about for this application is what a *session* looks like. In other words, how a Roku interacts with the network is likely quite different than how a Google Home interacts.
    #
    # To save time for the tutorial, all three weeks of PCAP data have already been transformed to flow data, and we can load that into a typical Pandas dataframe. Due to how the data was created, we have a header row (with column names) as well as a footer row. We've already removed those rows, so nothing to do here.
    #
    # For this application, we used [Zeek](https://www.zeek.org) (formerly known as Bro) to construct the flow data. To include MAC addresses in the conn log, we used the [mac-logging.zeek script](https://github.com/bro/bro/blob/master/scripts/policy/protocols/conn/mac-logging.zeek).
    #
    #     # If you've skipped installing Scapy, you can pick up here.
    #     pdf = pd.read_csv(DATA_SOURCE, sep='\t')
    #     print("==> pdf shape: ", pdf.shape)

    #     # We can look at what this new aggregated data looks like, and get a better sense of the columns and their data types. Let's do this the way we're familiar with, using Pandas.
    #     print(pdf.head())
    #     pdf.dtypes

    # That's Pandas, and we could continue the analysis there if we wanted. But what about [cuDF](https://github.com/rapidsai/cudf)? Let's pivot to that for the majority of this tutorial.
    #
    # One thing cuDF needs is for us to specify the data types. We'll write a function to make this easier. As of version 0.6, [strings are supported in cuDF](https://rapidsai.github.io/projects/cudf/en/latest/10min.html?highlight=string#String-Methods). We'll make use of that here.
    def get_dtypes(fn, delim, floats, strings):
        with open(fn, errors='replace') as fp:
            header = fp.readline().strip()

        types = []
        for col in header.split(delim):
            if 'date' in col:
                types.append((col, 'date'))
            elif col in floats:
                types.append((col, 'float64'))
            elif col in strings:
                types.append((col, 'str'))
            else:
                types.append((col, 'int64'))

        return OrderedDict(types)

    dtypes_data_processed = get_dtypes(DATA_SOURCE,
                                       '\t',
                                       floats=['ts', 'duration'],
                                       strings=[
                                           'uid', 'id.orig_h', 'id.resp_h',
                                           'proto', 'service', 'conn_state',
                                           'local_orig', 'local_resp',
                                           'history', 'tunnel_parents',
                                           'orig_l2_addr', 'resp_l2_addr'
                                       ])

    raw_cdf = cd.io.csv.read_csv(DATA_SOURCE,
                                 delimiter='\t',
                                 names=list(dtypes_data_processed),
                                 dtype=list(dtypes_data_processed.values()),
                                 skiprows=1)

    # Those data types seem right. Let's see what this data looks like now that it's in cuDF.
    # ### Adding ground truth labels back to the data

    # We'll need some labels for our classification task, so we've already prepared a file with those labels.
    dtypes_labels_processed = get_dtypes(
        DATA_LABELS,
        ',',
        floats=[],
        strings=['device', 'mac', 'connection', 'category'])

    labels_cdf = cd.io.csv.read_csv(DATA_LABELS,
                                    delimiter=',',
                                    names=list(dtypes_labels_processed),
                                    dtype=list(
                                        dtypes_labels_processed.values()),
                                    skiprows=1)

    print('Labels...')
    print(labels_cdf.head())

    # We now perform a series of merges to add the ground truth data (device name, connection, category, and categoryID) back to the dataset. Since each row of netflow has two participants, we'll have to do this twice - once for the originator (source) and once for the responder (destination).
    labels_cdf.columns = [
        'orig_device', 'orig_l2_addr', 'orig_connection', 'orig_category',
        'orig_category_id'
    ]
    merged_cdf = cd.merge(raw_cdf, labels_cdf, how='left', on='orig_l2_addr')
    labels_cdf.columns = [
        'resp_device', 'resp_l2_addr', 'resp_connection', 'resp_category',
        'resp_category_id'
    ]
    merged_cdf = cd.merge(merged_cdf, labels_cdf, how='left')
    labels_cdf.columns = [
        'device', 'mac', 'connection', 'category', 'category_id'
    ]

    # Let's just look at our new dataset to make sure everything's okay.
    print('Merged...')
    print(merged_cdf.head())

    # ### Exploding the Netflow Data into Originator and Responder Rows

    # We now have netflow that has one row per (sessionized) communication between an originator and responder. However, in order to classify an individual device, we need to explode data. Instead of one row that contains both originator and responder, we'll explode to one row for originator information (orig_bytes, orig_pkts, orig_ip_bytes) and one for responder information (resp_bytes, resp_pkts, resp_ip_bytes).
    #
    # The easiest way to do this is to create two new dataframes, rename all of the columns, then `concat` them back together. Just for sanity, we'll also check the new shape of our exploded data frame.
    orig_comms_cdf = merged_cdf[[
        'ts', 'id.orig_h', 'id.orig_p', 'proto', 'service', 'duration',
        'orig_bytes', 'orig_pkts', 'orig_ip_bytes', 'orig_device',
        'orig_l2_addr', 'orig_category', 'orig_category_id'
    ]]
    orig_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    resp_comms_cdf = merged_cdf[[
        'ts', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration',
        'resp_bytes', 'resp_pkts', 'resp_ip_bytes', 'resp_device',
        'resp_l2_addr', 'resp_category', 'resp_category_id'
    ]]
    resp_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    exploded_cdf = cd.multi.concat([orig_comms_cdf, resp_comms_cdf])
    print("==> shape (original) =", merged_cdf.shape)
    print("==> shape =", exploded_cdf.shape)

    num_categories = labels_cdf['category_id'].unique().shape[0]
    print("==> number of IoT categories =", num_categories)

    # We currently need to remove null values before we proceed. Although `dropna` doesn't exist in cuDF yet, we can use a workaround to get us there. Also, due to what's available currently, we can't have any nulls in any place in the DF.
    print('Check if any missing...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    exploded_cdf['category_id'] = exploded_cdf['category_id'].fillna(-999)
    exploded_cdf['device'] = exploded_cdf['device'].str.fillna("none")
    exploded_cdf['category'] = exploded_cdf['category'].str.fillna("none")

    print('After missing observations imputation...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    # Looks like all the null values are gone, so now we can proceed. If an IP doesn't have a category ID, we can't use it. So we'll filter those out.
    exploded_cdf = exploded_cdf[exploded_cdf['category_id'] != -999]

    # ### Binning the Data and Aggregating the Features
    #

    # But wait, there's still more data wrangling to be done! While we've exploded the flows into rows for orig/resp, we may want to bin the data further by time. The rationale is that any single communication may not be an accurate representation of how a device typically reacts in its environment. Imagine the simple case of how a streaming camera typically operates (most of its data will be uploaded from the device to a destination) versus how it operates during a firmware update (most of the data will be pushed down to the device, after which a brief disruption in connectivity will occur).
    #
    # There's a lot of different time binning we could do. It would also be useful to investigate how the average connection duration relates to the number of connections per time bin across various time granularities. With that said, we'll just choose a time bin of 1 hour to begin with. In order to bin, we'll use the following formula:
    #
    # $$\text{hour_time_bin}=\left\lfloor{\frac{ts}{60*60}}\right\rfloor$$
    exploded_cdf['hour_time_bin'] = exploded_cdf['ts'].applymap(
        lambda x: math.floor(x / (60 * 60))).astype(int)

    # We also have to make a choice about how we'll aggregate the binned data. One of the simplest ways is to sum the bytes and packets. There are really two choices for bytes, `bytes` and `ip_bytes`. With Bro, `bytes` is taken from the TCP sequence numbers and is potentially inaccurate, so we select `ip_bytes` instead for both originator and responder. We'll also use the sum of the number of packets.
    one_hour_time_bin_cdf = (exploded_cdf[[
        'bytes', 'pkts', 'ip_bytes', 'mac', 'category_id', 'hour_time_bin'
    ]].groupby(['mac', 'category_id', 'hour_time_bin']).agg({
        'category_id': 'min',
        'bytes': 'sum',
        'pkts': 'sum',
        'ip_bytes': 'sum'
    })[['min_category_id', 'sum_bytes', 'sum_pkts', 'sum_ip_bytes']])

    one_hour_time_bin_cdf.columns = [
        'category_id', 'bytes', 'pkts', 'ip_bytes'
    ]

    # ### Creating the Training and Testing Datasets

    # We'll take a traditional 70/30 train/test split, and we'll randomly sample into a train and test data frame.
    cdf_msk = np.random.rand(len(one_hour_time_bin_cdf)) < 0.7
    train_cdf = one_hour_time_bin_cdf[cdf_msk]
    test_cdf = one_hour_time_bin_cdf[~cdf_msk]

    print("==> train length =", len(train_cdf))
    print("==> test length =", len(test_cdf))

    run.log('Train length', len(train_cdf))
    run.log('Test length', len(test_cdf))

    # Prepare the training input (`train_X`), training target (`train_Y`), test input (`test_X`) and test target (`test_Y`) datasets.
    train_X = train_cdf[['pkts', 'ip_bytes']]
    train_Y = train_cdf[['category_id']]

    test_X = test_cdf[['pkts', 'ip_bytes']]
    test_Y = test_cdf[['category_id']]

    # ### Configure XGBoost

    # We choose a classification algorithm that utilizes the GPU - [XGBoost](https://xgboost.readthedocs.io/en/latest/). The package provides support for gradient boosted trees and can leverage distributed GPU compute environments.

    # Getting data into a format for XGBoost is really easy. Just make a `DMatrix` for both training and testing.
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_test = xgb.DMatrix(test_X, label=test_Y)

    # Like any good ML package, there's quite a few parameters to set. We're going to start with the softmax objective function. This will let us get a predicted category out of our model. We'll also set other parameters like the maximum depth and number of threads. You can read more about the parameters [here](https://xgboost.readthedocs.io/en/latest/parameter.html). Experiment with them!

    param = {}
    param['objective'] = 'multi:softmax'
    param['eta'] = 0.1
    param['max_depth'] = 8
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = num_categories
    param['max_features'] = 'auto'
    param['n_gpus'] = 1
    param['tree_method'] = 'gpu_hist'

    # XGBoost allows us to define a watchlist so that we can keep track of performance as the algorithm trains. We'll configure a simple watchlist that is watching `xg_train` and `xg_test` error rates.
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 20

    # ### Training our First XGBoost Model

    # Now it's time to train
    bst = xgb.train(param, xg_train, num_round, watchlist)

    # Prediction is also easy (and fast).
    pred = bst.predict(xg_test)

    # We might want to get a sense of how our model is doing by calculating the error rate.
    pred_cdf = cd.from_pandas(pd.DataFrame(pred, columns=['pred']))
    pred_cdf.add_column('category_id', test_Y['category_id'])
    error_rate = (pred_cdf[pred_cdf['pred'] != pred_cdf['category_id']]
                  ['pred'].count()) / test_Y.shape[0]
    run.log('Error rate', error_rate)
    t2 = datetime.now()

    run.log('Runtime', t2 - t1)
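
The label merges in the script above work by renaming labels_cdf's columns so its MAC column matches the join key on each side of the flow. A stripped-down sketch of that pattern with invented data (`cd` is the script's alias for cudf):

import cudf as cd

flows = cd.DataFrame({
    "orig_l2_addr": ["aa:01", "aa:02"],
    "resp_l2_addr": ["aa:02", "aa:03"],
    "ip_bytes": [100, 250],
})
labels = cd.DataFrame({"mac": ["aa:01", "aa:02", "aa:03"],
                       "device": ["camera", "echo", "tv"]})

# Rename the label columns so the key matches the originator side, merge,
# then repeat for the responder side.
labels.columns = ["orig_l2_addr", "orig_device"]
merged = cd.merge(flows, labels, how="left", on="orig_l2_addr")
labels.columns = ["resp_l2_addr", "resp_device"]
merged = cd.merge(merged, labels, how="left", on="resp_l2_addr")
print(merged)
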
Code Example #10
File: create_train_test.py  Project: karunru/atmaCup8
def merge_all(
    df: cudf.DataFrame,
    campaign: cudf.DataFrame,
    map_game_feed_native_video_assets: cudf.DataFrame,
    advertiser_video: cudf.DataFrame,
    advertiser_converted_video: cudf.DataFrame,
) -> cudf.DataFrame:
    # merge df and campaign
    res = cudf.merge(
        df, campaign, left_on="campaign_id", right_on="id", how="left"
    ).drop(
        columns=["id", "mst_advertiser_id"]
    )  # remove campaign keys

    # merge res and map_game_feed_native_video_assets
    res = cudf.merge(
        res,
        map_game_feed_native_video_assets,
        left_on="game_feed_id",
        right_on="mst_game_feed_id",
        how="left",
    ).drop(
        columns=["mst_game_feed_id"]
    )  # remove map_game_feed_native_video_assets keys

    # merge res and advertiser_video (horizontal case)
    horizontal = advertiser_video.copy()
    left_keys = ["horizontal_mst_advertiser_video_id", "advertiser_id"]
    right_keys = ["id", "mst_advertiser_id"]
    horizontal.columns = [
        f"horizontal_{c}" if c not in right_keys else c for c in horizontal.columns
    ]
    res = cudf.merge(
        res, horizontal, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_video keys

    # merge res and advertiser_video (vertical case)
    vertical = advertiser_video.copy()
    left_keys = ["vertical_mst_advertiser_video_id", "advertiser_id"]
    right_keys = ["id", "mst_advertiser_id"]
    vertical.columns = [
        f"vertical_{c}" if c not in right_keys else c for c in vertical.columns
    ]
    res = cudf.merge(
        res, vertical, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_video keys

    # merge res and advertiser_converted_video (horizontal case)
    left_keys = [
        "horizontal_mst_advertiser_video_id",
        "game_feed_id",
        "video_template_id",
    ]
    right_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    horizontal = advertiser_converted_video.copy()
    horizontal.columns = [
        f"horizontal_converted_{c}" if c not in right_keys else c
        for c in horizontal.columns
    ]
    res = cudf.merge(
        res, horizontal, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_converted_video keys

    # merge res and advertiser_converted_video (vertical case)
    left_keys = [
        "vertical_mst_advertiser_video_id",
        "game_feed_id",
        "video_template_id",
    ]
    right_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    vertical = advertiser_converted_video.copy()
    vertical.columns = [
        f"vertical_converted_{c}" if c not in right_keys else c
        for c in vertical.columns
    ]
    res = cudf.merge(
        res, vertical, left_on=left_keys, right_on=right_keys, how="left"
    ).drop(
        columns=right_keys
    )  # remove advertiser_converted_video keys

    return res
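
For reference, a minimal sketch of the left_on/right_on pattern merge_all repeats: join on differently named keys, then drop the right-hand key columns that cudf.merge (like pandas) keeps in the result. The frame and column names here are invented.

import cudf

df = cudf.DataFrame({"campaign_id": [10, 11], "clicks": [3, 5]})
campaign = cudf.DataFrame({"id": [10, 11], "budget": [1000, 2000]})

res = cudf.merge(
    df, campaign, left_on="campaign_id", right_on="id", how="left"
).drop(columns=["id"])  # drop the right-side key that the merge keeps
print(res)
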