Beispiel #1
0
def compute_features(data, output_table, connection_string):
    results_sql = revo.RxSqlServerData(table=output_table,
                                       connection_string=connection_string)
    revo.rx_data_step(input_data=data,
                      output_file=results_sql,
                      overwrite=True,
                      transform_function=featurize_transform)
Beispiel #2
0
def gather_image_paths(images_folder,
                       label_to_number_dict,
                       connection_string,
                       mode="train"):
    table = None
    if mode == "train":
        table = "TrainData"
    elif mode == "val":
        table = "ValData"
    elif mode == "test":
        table = "TestData"
    else:
        raise ValueError("'mode' not recognized")

    query = 'SELECT ([file_stream].GetFileNamespacePath()) as image FROM [{}] WHERE [is_directory] = 0 AND file_type = \'png\''.format(
        table)
    filetable_sql = revo.RxSqlServerData(sql_query=query,
                                         connection_string=connection_string)
    data = revo.rx_import(filetable_sql)

    data["image"] = data["image"].apply(lambda x: os.path.join(
        images_folder, x[1:]))  # TODO: assert to confirm paths exist
    data["class"] = data["image"].map(
        lambda x: os.path.basename(os.path.dirname(x)))
    data["label"] = data["class"].map(lambda x: label_to_number_dict[x])
    print(data.iloc[0, 0])
    return data
Beispiel #3
0
def DoInMemoryDivision_SQLPlayground():
    start_time = time.time()
    #define the strings for connection
    conn_str = 'Driver=SQL Server;Server=SANDY;Database=S19SQLPlayground;Trusted_Connection=True;'
    customer_query = '''SELECT * FROM dbo.Customer;'''
    product_query = '''SELECT * FROM dbo.Product;'''
    purchase_query = '''SELECT * FROM dbo.Purchase;'''

    input_query = [customer_query, product_query, purchase_query]
    table_name = ['customer_data', 'product_data', 'purchase_data']

    #try connection and write query to build the dataframe
    connected = True
    try:
        for i in range(len(input_query)):
            data_source = revoscale.RxSqlServerData(sql_query=input_query[i],
                                                    connection_string=conn_str)
            revoscale.RxInSqlServer(connection_string=conn_str,
                                    num_tasks=1,
                                    auto_cleanup=False)
            #globals()[name] create new variable
            globals()[table_name[i]] = pd.DataFrame(
                revoscale.rx_import(data_source))
    except:
        print(
            "Cannot connect to DataBase and create the data table, existing..."
        )
        connected = False
    if connected == False:
        exit(0)

    #iterate 3 tables, either add a new entry to the custExistence dictionary or update the count
    custExistence = {}
    for cu_index, cu_row in customer_data.iterrows():
        for pr_index, pr_row in product_data.iterrows():
            for pu_index, pu_row in purchase_data.iterrows():
                if pu_row["CustomerId"] == cu_row["CustomerId"] and pu_row[
                        "ProductId"] == pr_row["ProductId"]:
                    currentcustomer = pu_row["CustomerId"]
                    if currentcustomer in custExistence:
                        custExistence[currentcustomer] += 1
                    else:
                        custExistence[currentcustomer] = 1
    #print(custExistence)

    #Count all the product (count the row)
    nbProducts = product_data.shape[0]
    #print(nbProducts)

    #print out the customer who has bought all the products
    for customerid in custExistence:
        if custExistence[customerid] == nbProducts:
            print("CustomerId : " + str(customerid))

    end_time = time.time()
    print("This took " + str(end_time - start_time) + " s")
Beispiel #4
0
def perform_clustering():
    conn_str = 'Driver=SQL Server;Server=DESKTOP-NLR1KT5;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        COUNT(ss_item_sk) AS orders_items,
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        count(distinct(sr_ticket_number)) as returns_count,
        COUNT(sr_item_sk) as returns_items,
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "integer"
        },
        "itemsRatio": {
            "type": "integer"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_Info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))
    cdata = customer_data
    K = range(1, 20)
    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
    centroids = (k.cluster_centers_ for k in KM)

    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
    dist = (np.min(D, axis=1) for D in D_k)
    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
    plt.plot(K, avgWithinSS, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    print(customer_data.groupby(['cluster']).mean())
Beispiel #5
0
from scipy.spatial import distance as sci_distance
from sklearn import cluster as sk_cluster

# Connection string to connect to SQL Server named instance.
conn_str = 'Driver=SQL Server;Server=MS;Database=ML;Trusted_Connection=True;'
input_query = '''SELECT * FROM [OnlineSales].[RFM]'''

# Define the columns we wish to import.
column_info = {
    "customer": {"type": "integer"},
    "orderRatio": {"type": "integer"},
    "itemsRatio": {"type": "integer"},
    "frequency": {"type": "integer"}
    }

data_source = revoscalepy.RxSqlServerData(sql_query=input_query, column_Info=column_info, connection_string=conn_str)
revoscalepy.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)
# import data source and convert to pandas dataframe.
customer_data = pd.DataFrame(revoscalepy.rx_import(data_source))
print("Data frame:", customer_data.head(n=5))
print(customer_data.describe())

cdata = customer_data
K = range(1, 20)
KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
centroids = (k.cluster_centers_ for k in KM)

D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
dist = (np.min(D, axis=1) for D in D_k)
avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
plt.plot(K, avgWithinSS, 'b*-')
Beispiel #6
0
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=DESKTOP-CF2TNBV;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "integer"
        },
        "itemsRatio": {
            "type": "integer"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_Info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))

    ################################################################################################

    ##	Determine number of clusters using the Elbow method

    ################################################################################################

    cdata = customer_data
    K = range(1, 20)
    KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
    centroids = (k.cluster_centers_ for k in KM)

    D_k = (sci_distance.cdist(cdata, cent, 'euclidean') for cent in centroids)
    dist = (np.min(D, axis=1) for D in D_k)
    avgWithinSS = [sum(d) / cdata.shape[0] for d in dist]
    plt.plot(K, avgWithinSS, 'b*-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for KMeans clustering')
    plt.show()

    ################################################################################################
    ##	Perform clustering using Kmeans
    ################################################################################################

    # It looks like k=4 is a good number to use based on the elbow graph.
    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())
Beispiel #7
0
def DoInMemoryDivision():
    # Connection string to connect to local DB
    conn_str = 'Driver=SQL Server;Server=DESKTOP-M98I69D;Database=S19SQLPlayground_Seb;Trusted_Connection=True;'
    # Queries to retrieve the Customers, Products, and purchases data respectively
    query_customers = '''SELECT * FROM dbo.Customer'''
    query_products = '''SELECT * FROM dbo.Product'''
    query_purchases = '''SELECT * FROM dbo.Purchase'''

    # Define the columns we wish to import for Customers, Products, and purchases data respectively
    col_cust_info = {
        "Customer": {
            "type": "integer"
        },
        "FirstName": {
            "type": "str"
        },
        "LastName": {
            "type": "str"
        }
    }
    col_prod_info = {
        "Product": {
            "type": "integer"
        },
        "Description": {
            "type": "str"
        },
        "UnitPrice": {
            "type": "integer"
        }
    }
    col_pur_info = {
        "Customer": {
            "type": "integer"
        },
        "Product": {
            "type": "integer"
        },
        "Quantity": {
            "type": "integer"
        }
    }

    # Start Timer to measure execution time
    start = timer()

    data_source_cust = revoscale.RxSqlServerData(sql_query=query_customers,
                                                 column_Info=col_cust_info,
                                                 connection_string=conn_str)
    data_source_prod = revoscale.RxSqlServerData(sql_query=query_products,
                                                 column_Info=col_prod_info,
                                                 connection_string=conn_str)
    data_source_pur = revoscale.RxSqlServerData(sql_query=query_purchases,
                                                column_Info=col_pur_info,
                                                connection_string=conn_str)
    # retrieve the data sources
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=3,
                            auto_cleanup=False)
    # import data sources and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source_cust))
    product_data = pd.DataFrame(revoscale.rx_import(data_source_prod))
    purchase_data = pd.DataFrame(revoscale.rx_import(data_source_pur))

    # Create a DataFrame to store customer purchase count
    cust_dict = {'CustId': [], 'PurchaseCount': []}
    cust_Purc_Count = pd.DataFrame(cust_dict)

    # Nb of products
    nb_products = len(product_data.index)
    # Division process
    # Iterate over all customers by df rows
    for cindex, crow in customer_data.iterrows():
        # Iterate over all products by df rows
        for pindex, prow in product_data.iterrows():
            # Iterate over all purchases by df rows
            for hindex, hrow in purchase_data.iterrows():
                # Check whether the currentCustomer has bought the current product
                # print (crow['CustomerId'], hrow['CustomerId'], prow['ProductId'] )
                if (crow['CustomerId']
                        == hrow['CustomerId']) and (prow['ProductId']
                                                    == hrow['ProductId']):
                    # Either add a new DataRow in custPurchasesCount
                    # if no existing for the current customer or update the count
                    custExistence = 0
                    for index, cust_count in cust_Purc_Count.iterrows():
                        if cust_count['CustId'] == crow['CustomerId']:
                            custExistence += 1

                    if (custExistence == 0):
                        # First product for this customer
                        td = {'CustId': crow['CustomerId'], 'PurchaseCount': 1}
                        # Make a dataframe to be able to append to cust_Purc_Count
                        tdf = pd.DataFrame(td, index=[0])
                        # Append to cust_Purc_Count
                        # We must use the syntax below because for df append doesn't occur in place but in the object returned by the method
                        cust_Purc_Count = cust_Purc_Count.append(tdf)
                    else:
                        # The product is already referenced cust_Purc_Count => Just increase the PurchaseCount
                        cust_count['PurchaseCount'] += 1

    for dindex, drow in cust_Purc_Count.iterrows():
        # Check the customers that bought all the products
        if drow['PurchaseCount'] == nb_products:
            print('\nDivision CustomerId: ' + str(drow['CustId']))

    end = timer()
    print("The In Memory Division in Python took (S) : ", end - start)
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=DESKTOP-INU1SHI;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''


    # Define the columns we wish to import.
    column_info = {
        "customer": {"type": "integer"},
        "orderRatio": {"type": "integer"},
        "itemsRatio": {"type": "integer"},
        "frequency": {"type": "integer"}
    }

   
    data_source = revoscale.RxSqlServerData(sql_query=input_query, column_Info=column_info,
                                              connection_string=conn_str)
    ComputeContext = revoscale.RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    print("Data frame:", customer_data.head(n=5))




    ################################################################################################

        ##	Determine number of clusters using the Elbow method

    ################################################################################################
 
     cdata = customer_data
     K = range(1,20)
     KM = (sk_cluster.KMeans(n_clusters=k).fit(cdata) for k in K)
     centroids = (k.cluster_centers_ for k in KM)

     D_K = (sci_distance.cdist(cdata,cent,'euclidean')for cent in centroids)
     dist = (np.min(D, axis=1)for D in D_K)
     avgWithinSS = [sum(d)/cdata.shape[0] for d in dist]
     plt.plot(K, avgWithinSS, 'b*-')
     plt.grid(True)
     plt.xlabel('Number of Clusters')
     plt.ylabel('Average Within Clusters Sum of Squares')
     plt.title('Kmeans Clustering')
     plt.show()
Beispiel #9
0
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=VC5-SOPHIA;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "float"
        },
        "itemsRatio": {
            "type": "float"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_Info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    #print("Data frame:", customer_data.head(n=5))

    cdata = customer_data
    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())