# predict.py (forked from nikhitmago/lookalike-modelling)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import col
from pyspark.ml.feature import StandardScalerModel, ElementwiseProduct
from pyspark.ml.linalg import Vectors
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
APP_NAME = "predict"
# Find the 'num_neighbours' nearest neighbours of each seed (train) row among
# the client (test) rows using scikit-learn's NearestNeighbors with the given
# 'tree_type' ('ball_tree', 'kd_tree', etc.).
# Returns the 'num_indices' most frequently matched client rows as a frequency table.
def NN_helper_client(knn_train, knn_test, test_pd, num_neighbours, num_indices, tree_type):
    print("NN Helper start")
    # Fit the nearest-neighbour index on the client (test) vectors
    neighbors = NearestNeighbors(n_neighbors=num_neighbours, algorithm=tree_type).fit(knn_test)
    # Generate distances and indices of the k nearest client rows for every train row
    knn_distances, knn_indices = neighbors.kneighbors(knn_train)
    knn_indices_pd = pd.DataFrame(knn_indices.flatten(), columns=["indices"])
    # Count how often each client row appears in the KNN matrix
    freq_table = pd.crosstab(index=knn_indices_pd["indices"], columns="freq")
    freq_table['ind'] = freq_table.index
    freq_table = freq_table.merge(test_pd[['cust_id', 'indices']], left_on="ind", right_on="indices", how="left")
    freq_table = freq_table.drop(['ind', 'indices'], axis=1)
    # DataFrame.sort() was removed from pandas; sort_values() is the replacement
    freq_table = freq_table.sort_values(["freq"], ascending=False)
    freq_table = freq_table.head(num_indices)
    return freq_table
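# Illustrative sketch of the frequency logic above (toy values, not real data):
# if the flattened KNN index array is [4, 7, 4, 4], pd.crosstab counts
#   indices   freq
#   4         3
#   7         1
# i.e. client row 4 was a nearest neighbour of three seed rows, so it ranks
# highest in the lookalike table.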
def findNearestNeighbour_client(train_all, test_all, num_neighbours, num_indices, tree_type):
    # Record each row's position so the frequency table can be joined back to cust_id
    train_all['indices'] = train_all.index
    test_all['indices'] = test_all.index
    # Densify the Spark ML vectors into plain numpy arrays for scikit-learn
    knn_train = np.array([v.toArray() for v in train_all.scaled_weighted_features])
    knn_test = np.array([v.toArray() for v in test_all.scaled_weighted_features])
    # Call the helper function to get the frequency table
    return NN_helper_client(knn_train, knn_test, test_all, num_neighbours, num_indices, tree_type)
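# Sketch of the densification above (toy vector, illustration only):
#   Vectors.dense([0.0, 1.5]).toArray() -> array([0. , 1.5])
# so knn_train and knn_test are plain (n_rows, n_features) float matrices,
# which is the input format scikit-learn's NearestNeighbors expects.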
def main(sc):
    sqlContext = SQLContext(sc)
    input_path = ''
    output_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''
    # Import the client data
    client_data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(input_path)
    # Load the model info, scaler, and train set saved by the training interface
    model_info = sc.pickleFile(model_info_path).flatMap(lambda x: x.items()).collectAsMap()
    scalerModel = StandardScalerModel.load(model_scaler_path)
    df_master_new = sc.pickleFile(model_train_set_path).toDF()
    col_names = model_info['col_names']
    sorted_top_varimp = model_info['varimp']
    # Keep the unique ID plus the six most important features selected at training time
    client_data = client_data.select(client_data.u_msisdn.cast('string'), *(col(c).cast("double").alias(c) for c in col_names))
    # The first column is the unique ID; the remaining columns are the features
    client_master = client_data.rdd.map(lambda r: Row(cust_id=r[0], features=Vectors.dense(r[1:]))).toDF()
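    # e.g. a raw row ('27721234567', 1.0, 0.0, 3.5) (toy values) becomes
    # Row(cust_id='27721234567', features=DenseVector([1.0, 0.0, 3.5]))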
    # Scale and normalize the features so that all of them are comparable,
    # writing the result to a new 'scaled_features' column. The scaler fitted
    # at training time is reused here so client features are standardized with
    # the training-set statistics rather than refit on the client data.
    client_master = scalerModel.transform(client_master)
    # The unscaled features are no longer needed once the scaled versions exist
    client_master = client_master.drop('features')
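    # Standardization sketch: the scaler maps each feature x to
    # (x - mean) / stddev using the training-set statistics, so all features
    # contribute on a comparable scale to the nearest-neighbour distances.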
    sqlContext.registerDataFrameAsTable(df_master_new, "df_master_train_table")
    # Keep only the positive (label = 1) rows: these are the seed customers
    train_all_client = sqlContext.sql('select * from df_master_train_table where label = 1')
    # Multiply feature values by their corresponding variable importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp), inputCol="scaled_features", outputCol="scaled_weighted_features")
    train_all_client = m.transform(train_all_client)
    client_master = m.transform(client_master)
    sqlContext.dropTempTable("df_master_train_table")
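    # ElementwiseProduct is a Hadamard (element-wise) product: with a toy
    # scalingVec [2.0, 0.5], the vector [1.0, 4.0] becomes [2.0, 2.0], so
    # features with larger importances dominate the KNN distance computation.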
    # KNN parameters: neighbours per seed row, share of the client population
    # to return, and the search-tree type
    nn = 1000
    popshared = 0.30
    num_indices = int(popshared * client_master.count())
    tree_type = "kd_tree"
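    # e.g. with 1,000,000 client rows and popshared = 0.30 (illustrative
    # numbers), num_indices = int(0.30 * 1000000) = 300000 rows are returned.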
    # Collect both sides to pandas for the scikit-learn KNN search
    train_pd = train_all_client.toPandas()
    test_pd = client_master.toPandas()
    freq_table = findNearestNeighbour_client(train_pd, test_pd, nn, num_indices, tree_type)
    # Write the top lookalikes (cust_id plus match frequency) as a single CSV
    sqlContext.createDataFrame(freq_table[['cust_id', 'freq']]).repartition(1).write.format("com.databricks.spark.csv").save(output_path)
if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("yarn")
    sc = SparkContext(conf=conf)
    main(sc)
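# Typical invocation (assumed, not from the original repo; fill in the paths
# above first and match the spark-csv coordinates to your Spark/Scala build):
#   spark-submit --master yarn --packages com.databricks:spark-csv_2.11:1.5.0 predict.py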