def test_fit_predict_with_intermediate_fit_params():
    """Check fit-param routing when ``Pipeline.fit_predict`` is invoked.

    ``transf__``-prefixed parameters are expected on the intermediate
    transformer, ``clf__``-prefixed ones on the final step only.
    """
    steps = [("transf", TransfFitParams()), ("clf", FitParamT())]
    pipe = Pipeline(steps)
    pipe.fit_predict(
        X=None,
        y=None,
        transf__should_get_this=True,
        clf__should_succeed=True,
    )

    transformer = pipe.named_steps["transf"]
    # The intermediate step must have received its own parameter ...
    assert transformer.fit_params["should_get_this"]
    assert pipe.named_steps["clf"].successful
    # ... and must not have been handed the final step's parameter.
    assert "should_succeed" not in transformer.fit_params
def test_fit_predict_with_intermediate_fit_params():
    """Check fit-param routing when ``Pipeline.fit_predict`` is invoked.

    Uses plain ``assert`` statements instead of the nose-style
    ``assert_true`` / ``assert_false`` helpers, so the test does not
    depend on those helpers being importable.
    """
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())])
    pipe.fit_predict(X=None,
                     y=None,
                     transf__should_get_this=True,
                     clf__should_succeed=True)
    assert pipe.named_steps['transf'].fit_params['should_get_this']
    assert pipe.named_steps['clf'].successful
    assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
def test_fit_predict_with_intermediate_fit_params():
    """Verify that ``fit_predict`` forwards step-prefixed fit params."""
    transf_step = ('transf', TransfFitParams())
    clf_step = ('clf', FitParamT())
    pipe = Pipeline([transf_step, clf_step])

    fit_kwargs = {
        'transf__should_get_this': True,
        'clf__should_succeed': True,
    }
    pipe.fit_predict(X=None, y=None, **fit_kwargs)

    received = pipe.named_steps['transf'].fit_params
    assert received['should_get_this']
    assert pipe.named_steps['clf'].successful
    assert 'should_succeed' not in received
def cluster_analysis(df_test):
    """Label each row of ``df_test`` by its DBSCAN cluster membership.

    The frame is modified in place: ``dates`` is converted to datetime,
    a ``day`` column (day of month) is derived from it, and a
    ``prediction`` column is added.  Rows whose DBSCAN label is negative
    get ``'Discretionary'``; all other rows get ``"Regular-Irregular"``.

    :param df_test: DataFrame providing at least the ``amount``,
        ``dates`` and ``account_name`` columns read below.
    :returns: the same ``df_test`` object, with the added columns.
    """
    numeric_features = ['amount', 'day']
    # account name used as dummy feature
    categorical_features = ['account_name']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('cluster', DBSCAN(eps=0.2)),
    ])

    # Explicit column indexing rather than attribute access, so a column
    # is always what gets read and written.
    df_test['dates'] = pd.to_datetime(df_test['dates'])
    df_test['day'] = df_test['dates'].dt.day

    prediction = clf.fit_predict(df_test)
    df_test['prediction'] = "Regular-Irregular"
    df_test.loc[prediction < 0, 'prediction'] = 'Discretionary'
    return df_test
def bestClassify(X, Y, tfidf=True):
    """Cluster documents with 2-means and score the clusters against ``Y``.

    Each cluster is relabelled with the most common true label among its
    members; a confusion matrix and several clustering scores are then
    printed.  Nothing is returned.

    :param X: documents, passed through the ``identity`` preprocessor and
        tokenizer (presumably already tokenized - confirm with caller).
    :param Y: true labels, aligned with ``X``.
    :param tfidf: use ``TfidfVectorizer`` when true (the default, and the
        previous hard-coded behaviour), otherwise ``CountVectorizer``.
    """
    if tfidf:
        vec = TfidfVectorizer(preprocessor=identity,
                              tokenizer=identity,
                              sublinear_tf=True)
    else:
        vec = CountVectorizer(preprocessor=identity, tokenizer=identity)
    km = KMeans(n_clusters=2, n_init=100, verbose=1)
    clusterer = Pipeline([('vec', vec), ('cls', km)])
    prediction = clusterer.fit_predict(X, Y)

    # Collect the true labels that landed in each predicted cluster.
    checker = defaultdict(list)
    for pred, truth in zip(prediction, Y):
        checker[pred].append(truth)

    # Map every cluster id to its majority true label.
    labeldict = {
        pred: Counter(truths).most_common(1)[0][0]
        for pred, truths in checker.items()
    }
    prediction = [labeldict[p] for p in prediction]

    # Two clusters can share a majority label; de-duplicate (keeping
    # first-seen order) so confusion_matrix gets each label once.
    labels = list(dict.fromkeys(labeldict.values()))
    print(labels)
    print(confusion_matrix(Y, prediction, labels=labels))
    print("Homogeneity:", homogeneity_score(Y, prediction))
    print("Completeness:", completeness_score(Y, prediction))
    print("V-measure:", v_measure_score(Y, prediction))
    print("Rand-Index:", adjusted_rand_score(Y, prediction))
def train_pipeline():
    """Build the training set, fit the clustering pipeline and return it.

    The cached 'train' dataset is loaded when available; otherwise it is
    prepared from the audio files under ``config.train_path`` and saved.
    After fitting, the accuracy against ``config.targets`` is printed and,
    when ``config.verbose`` is set, the clusters are plotted.
    """
    try:
        train_df = load_data('train')
    except FileNotFoundError:
        audio_files = list_audiofiles(config.train_path)
        train_df = prepare_data(audio_files)
        save_data(train_df, 'train')
    train = np.array(handle_wrong_rows(train_df, 'train'))

    scaler = StandardScaler()
    mixture = RelabeledBayesianGaussianMixture(
        config=config,
        n_components=config.n_classes,
        tol=0.00001,
        covariance_type='tied',
        max_iter=10000,
        random_state=18,
    )
    pipeline = Pipeline([('scaler', scaler), ('clusterization', mixture)])
    tr_predictions = pipeline.fit_predict(train)

    targets = pd.read_csv(config.targets).target
    accuracy = metrics.accuracy_score(targets, tr_predictions)
    print('\nModel accuracy: %.3f' % accuracy)

    if config.verbose:
        # Plot in the scaled feature space the mixture was fitted on.
        scaled_train = pipeline.steps[0][1].transform(train)
        plot_clusters(scaled_train, tr_predictions, targets)
    return pipeline
def test_case_1(self):
    """Fit a preprocessing + ``CustomClassifier`` pipeline and print its output."""
    numeric_steps = [
        ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scale', MinMaxScaler()),
        ('transform', QuantileTransformer(output_distribution='normal')),
    ]
    num_features_pipeline = Pipeline(numeric_steps)

    categorical_steps = [
        ('impute', SimpleImputer(missing_values=np.nan,
                                 strategy='constant',
                                 fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
    cat_features_pipeline = Pipeline(categorical_steps)

    preprocessor = ColumnTransformer(transformers=[
        ('num', num_features_pipeline, numerical_features),
        ('cat', cat_features_pipeline, categorical_features),
    ])
    classifier_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('classify', CustomClassifier(base=LogisticRegression())),
    ])

    y_pred = classifier_pipeline.fit_predict(X_train, y_train)
    print(y_pred)
class ClusteringPipelineHandler:
    """HTTP POST handler that clusters the texts of a JSON payload."""

    def __init__(self, config: ClusteringConfig):
        steps = [
            ("prep", Preprocessor()),
            ("extractor", FasttextExtractor(config)),
            ("cluster", Clustering(config.min_cluster_size,
                                   config.cosine_thrsh)),
        ]
        self.pipe = Pipeline(steps)

    def _get_clusters(self, inp: Dict[str, str]) -> Dict[str, List[str]]:
        """Group the keys of ``inp`` by the cluster label of their text.

        Returns a mapping from the stringified label to the list of
        input keys assigned to it, in input order.
        """
        texts: List[str] = list(inp.values())
        labels = self.pipe.fit_predict(texts)

        labels_buckets: Dict[str, List[str]] = dict()
        for key, label in zip(inp, labels):
            labels_buckets.setdefault(str(label), []).append(key)
        return labels_buckets

    def on_post(self, req, resp) -> None:
        """Answer with the cluster buckets, or a JSON ``Error`` body."""
        if not req.content_length:
            resp.body = ujson.dumps({'Error': 'data payload is mandatory'})
            resp.status = falcon.HTTP_400
            return

        try:
            inp = ujson.loads(req.stream.read())
            answer = self._get_clusters(inp)
            resp.body = ujson.dumps(answer)
            resp.status = falcon.HTTP_200
        except Exception:
            # Any failure is reported to the client as a 500 whose body
            # carries the formatted traceback.
            resp.body = ujson.dumps({'Error': traceback.format_exc()})
            resp.status = falcon.HTTP_500
def test_pipeline():
    """Smoke-test a dihedral featurizer feeding a 4-state ``VonMisesHMM``."""
    trajs = AlanineDipeptide().get_cached().trajectories
    steps = [
        ('diheds', DihedralFeaturizer(['phi', 'psi'], sincos=False)),
        ('hmm', VonMisesHMM(n_states=4)),
    ]
    p = Pipeline(steps)
    # Only successful execution is checked; the labels are not inspected.
    p.fit_predict(trajs)
    p.named_steps['hmm'].summarize()
def test_pipeline():
    """Smoke-test a superpose featurizer feeding a 4-state ``GaussianHMM``."""
    trajs = AlanineDipeptide().get_cached().trajectories
    first_traj = trajs[0]
    indices = first_traj.topology.select('backbone')

    steps = [
        ('diheds', SuperposeFeaturizer(indices, first_traj[0])),
        ('hmm', GaussianHMM(n_states=4)),
    ]
    p = Pipeline(steps)
    # Only successful execution is checked; the labels are not inspected.
    p.fit_predict(trajs)
    p.named_steps['hmm'].summarize()
def test_pipeline():
    """Run featurization + HMM fitting end to end on alanine dipeptide."""
    trajs = AlanineDipeptide().get_cached().trajectories
    reference = trajs[0]
    backbone = reference.topology.select('backbone')

    featurizer = SuperposeFeaturizer(backbone, reference[0])
    hmm = GaussianHMM(n_states=4)
    p = Pipeline([('diheds', featurizer), ('hmm', hmm)])

    # The returned state sequences are not asserted on; the test only
    # requires fitting and summarizing to complete.
    p.fit_predict(trajs)
    p.named_steps['hmm'].summarize()
def clusters(self, k=None, method='kmeans', ret_clusterizer=False, **kwargs):
    """Cluster ``self.pivot_table`` with an impute-then-KMeans pipeline.

    :param k: number of clusters; 4 is used when ``k`` is falsy.
    :param method: accepted but not read by this implementation.
    :param ret_clusterizer: when true, also return the fitted pipeline.
    :param kwargs: forwarded to ``KMeans``.
    :returns: the labels, or ``(labels, pipeline)`` if ``ret_clusterizer``.
    """
    n_clusters = k if k else 4
    steps = [
        ('fill', Imputer()),
        ('cluster', KMeans(n_clusters, **kwargs)),
    ]
    pipeline = Pipeline(steps)
    labels = pipeline.fit_predict(self.pivot_table)
    return (labels, pipeline) if ret_clusterizer else labels
def train(self, argv):
    """Cluster the training corpus with 2-means and print quality scores.

    Each cluster is relabelled with the most common true label among its
    members before the scores and the confusion matrix are printed.

    :param argv: command-line style list; ``argv[1]`` is the training
        file.  Any other length terminates with a usage message.
    :raises SystemExit: when ``argv`` does not have exactly two items.
    """
    # ``raise SystemExit`` is what the interactive ``exit()`` helper does,
    # without relying on the ``site`` module having installed it.
    if len(argv) != 2:
        raise SystemExit("Use kmeansBinary.py <trainfile>")
    trainfile = argv[1]

    # X and Y are the result of the read corpus function. X is a list of
    # all documents that are tokenized and Y is a list of all labels.
    # The use_sentiment boolean can be changed to use the
    # categories (False) or the polarity (True).
    X, Y = self.read_corpus(trainfile, use_sentiment=True)

    # we use a dummy function as tokenizer and preprocessor,
    # since the texts are already preprocessed and tokenized.
    vec = TfidfVectorizer(preprocessor=self.identity,
                          tokenizer=self.identity,
                          sublinear_tf=True)
    km = Pipeline([
        ('vec', vec),
        ('cls', cluster.KMeans(n_clusters=2, n_init=10, verbose=1)),
    ])
    labels_pred = km.fit_predict(X, Y)
    labels_true = Y

    # Group the true labels by the cluster they were assigned to.
    c = defaultdict(list)
    for pred, true in zip(labels_pred, labels_true):
        c[pred].append(true)

    # Name each cluster after its majority true label.
    label = {}
    for key, truths in c.items():
        count = Counter(truths)
        label[key] = count.most_common(1)[0][0]
        print(key, count.most_common(6))
    labels_pred = [label[l] for l in labels_pred]
    labels = list(set(label.values()))
    print(labels)

    # Newer scikit-learn releases only provide get_feature_names_out();
    # prefer the original accessor when it exists.
    feature_names = (getattr(vec, "get_feature_names", None)
                     or vec.get_feature_names_out)
    print(list(feature_names()))

    print("Homogeneity: %0.3f" % homogeneity_score(labels_true, labels_pred))
    print("Completeness: %0.3f" % completeness_score(labels_true, labels_pred))
    print("V-measure: %0.3f" % v_measure_score(labels_true, labels_pred))
    print("Adjusted Rand-Index: %.3f"
          % adjusted_rand_score(labels_true, labels_pred))
    # calculate confusion matrix
    print(confusion_matrix(labels_true, labels_pred, labels=labels))
def test_fit_predict_on_pipeline():
    """Check that ``Pipeline.fit_predict`` matches running the steps by hand."""
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # The pipeline is given its own estimator instances so that fitting
    # it cannot refit the objects that produced the reference result.
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler_for_pipeline),
                     ('Kmeans', km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def test_fit_predict_on_pipeline():
    """Check that ``Pipeline.fit_predict`` matches running the steps by hand."""
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()

    # Reference estimators, fitted step by step.
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # Separate, identically configured instances for the pipeline, so
    # the two computations never share fitted state.
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler_for_pipeline),
                     ('Kmeans', km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def parse_image(args, root_img: np.ndarray, image_name: str):
    """Parse a single image, get the hough lines and find the vanishing points.

    Two pipelines are run on ``root_img``: the first predicts the hough
    lines, the second draws the reference and the vanishing points on a
    resized, padded copy.  The ``PlotTransformer`` steps are handed
    ``args.output_folder`` (per the original note, intermediate images
    are saved there).  The vanishing points are printed as coordinates
    relative to the image size; nothing is returned.
    """
    # Pipeline that transforms the image and finally predicts the lines.
    hough_steps = [
        ('image_resizer', ResizeTransformer(args.shape)),
        ('canny_image', CannyTransformer()),
        ('segment_image',
         SegmentTransformer() if args.segment_canny else None),
        ('plot_canny_image',
         PlotTransformer(image_name, suffix="canny",
                         folder=args.output_folder)),
        ('hough_transform',
         HoughLinesEstimator(threshold=args.hough_threshold,
                             weight_decay=args.weight_decay,
                             vertical_degrees_filter=args.degrees_filter)),
    ]
    lines = Pipeline(steps=hough_steps).fit_predict(root_img)

    # Filter the predicted lines according to the argument settings.
    lines.filter_horizontal_lines(degrees=args.degrees_filter)
    lines.limit_lines(args.hough_limit)
    lines.group_lines(r=args.hough_group_radius)
    if args.cluster_hough_lines:
        lines.cluster_lines()

    # Add padding to the hough transform
    lines.add_padding(args.padding)

    # Get the vanishing points with the chosen method
    vps, reference_transformer = METHODS[args.method](args, lines)

    # Print the vanishing points with the padding removed, as fractions
    # of the image width and height.
    relative_points = [
        ((x - args.padding) / args.img_width,
         (y - args.padding) / args.img_height)
        for x, y in vps
    ]
    print(relative_points)

    # Plot and print on the original image
    overlay_steps = [
        ('image_resizer', ResizeTransformer(args.shape)),
        ('pad_image', PadTransformer(args.padding)),
        ('add_reference', reference_transformer),
        ('add_vanishing_points',
         DrawPointsTransformer(vps, colour=(255, 255, 0))),
        ('plot_final_image',
         PlotTransformer(image_name, suffix="final",
                         folder=args.output_folder)),
    ]
    Pipeline(steps=overlay_steps).fit_transform(root_img)
def test_fit_predict_on_pipeline():
    """Check that ``Pipeline.fit_predict`` matches running the steps by hand.

    Reads the ``iris`` dataset from the enclosing module scope.
    """
    # Reference result: scale, then cluster, as two explicit steps.
    reference_scaler = StandardScaler()
    reference_km = KMeans(random_state=0)

    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    pipeline_steps = [
        ("scaler", StandardScaler()),
        ("Kmeans", KMeans(random_state=0)),
    ]

    separate_pred = reference_km.fit_predict(
        reference_scaler.fit_transform(iris.data))

    # Same computation in one call through the pipeline.
    pipeline_pred = Pipeline(pipeline_steps).fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def cluster_analysis_v3(df_test):
    """Label transactions by DBSCAN membership using timing features.

    Two gap features are derived and passed through
    ``convert_to_mins_v3``: the gap to the previous row, and the gap to
    the previous row with the same ``other_account_name``.  Rows whose
    DBSCAN label is negative get ``prediction == 'Discretionary'``; all
    other rows get ``"Regular-Irregular"``.

    Note that ``time_since_last_transaction`` is added to the caller's
    frame in place, while everything after ``reset_index()`` is applied
    to (and returned as) a new frame.

    :param df_test: DataFrame providing at least ``datetime``,
        ``other_account_name``, ``amount``, ``dates`` and
        ``account_name``.
    :returns: the re-indexed frame with the derived columns.
    """
    gap_to_previous = df_test['datetime'] - df_test['datetime'].shift()
    df_test['time_since_last_transaction'] = gap_to_previous
    df_test['time_since_last_transaction'] = (
        df_test['time_since_last_transaction'].apply(convert_to_mins_v3))
    # Keep the original index as an 'index' column; it keys the per-party
    # gaps collected below.
    df_test = df_test.reset_index()

    main_dict = {}
    for party in df_test['other_account_name'].value_counts().index:
        temp_df = df_test[df_test['other_account_name'] == party]
        party_gap = temp_df['datetime'] - temp_df['datetime'].shift()
        pos_dict = pd.Series(party_gap.values, index=temp_df['index'])
        main_dict.update(pos_dict)
    df_test['time_since_last_transaction_party'] = (
        df_test['index'].map(main_dict))
    df_test['time_since_last_transaction_party'] = (
        df_test['time_since_last_transaction_party']
        .apply(convert_to_mins_v3))

    numeric_features = [
        'amount',
        'day',
        'time_since_last_transaction_party',
        'time_since_last_transaction',
    ]
    # account name used as dummy feature
    categorical_features = ['account_name']

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('cluster', DBSCAN(eps=0.2)),
    ])

    # Explicit column indexing rather than attribute access, so a column
    # is always what gets read and written.
    df_test['dates'] = pd.to_datetime(df_test['dates'])
    df_test['day'] = df_test['dates'].dt.day

    prediction = clf.fit_predict(df_test)
    df_test['prediction'] = "Regular-Irregular"
    df_test.loc[prediction < 0, 'prediction'] = 'Discretionary'
    return df_test
def test_fit_predict_on_pipeline():
    """Check that ``Pipeline.fit_predict`` matches running the steps by hand."""
    iris = load_iris()

    # Reference result: scale, then cluster, as two explicit steps.
    reference_scaler = StandardScaler()
    reference_km = KMeans(random_state=0)

    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    pipeline_steps = [
        ("scaler", StandardScaler()),
        ("Kmeans", KMeans(random_state=0)),
    ]

    separate_pred = reference_km.fit_predict(
        reference_scaler.fit_transform(iris.data))

    # Same computation in one call through the pipeline.
    pipeline_pred = Pipeline(pipeline_steps).fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
class ClusteringAlgorithm(object):
    """Common scaffolding for clustering algorithms run on scaled features.

    ``fit`` reads ``self.algo``, which this class never assigns; it has
    to be provided before ``fit`` is called (presumably by subclasses -
    confirm).

    NOTE(review): the ``abc.abstractmethod`` markers are not enforced
    because the class does not use ``abc.ABCMeta``; as written, the two
    marked methods simply return ``None``.
    """

    def __init__(self, instances, conf):
        self.instances, self.conf = instances, conf
        self.num_clusters = conf.num_clusters
        # Populated by generate().
        self.clustering = None

    @abc.abstractmethod
    def get_distortion(self):
        return None

    @abc.abstractmethod
    def get_centroids(self):
        return None

    def get_predicted_proba(self):
        return None

    def get_all_proba(self):
        return None

    def fit(self):
        """Scale the instance features and store the cluster assignments."""
        steps = [('scaler', StandardScaler()), ('clustering', self.algo)]
        self.pipeline = Pipeline(steps)
        features = self.instances.features.get_values()
        self.assigned_clusters = self.pipeline.fit_predict(features)

    def generate(self, drop_annotated_instances=False):
        """Build ``self.clustering`` from the assignments made by ``fit``."""
        self.clustering = Clusters(self.instances,
                                   self.assigned_clusters,
                                   clustering_algo=self)
        centroids = self.get_centroids()
        self.clustering.generate(
            centroids, drop_annotated_instances=drop_annotated_instances)

    def export(self, output_dir, quick=False):
        """Export the generated clustering and its evaluation."""
        clustering = self.clustering
        clustering.export(output_dir)
        clustering.gen_eval(output_dir, quick=quick)
def cluster(list_of_texts: List[str], num_clusters: int=3) -> List[int]:
    """
    Group texts into clusters using k-means on tf-idf-weighted
    bag-of-words vectors.

    If fitting raises ``ValueError``, every text is given its own
    cluster id instead.

    Args:
        list_of_texts: a list of untokenized texts
        num_clusters: the target number of clusters

    Returns:
        the cluster id for each text, e.g. [0,1,0,0,2,2,1]
    """
    steps = [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clust", KMeans(n_clusters=num_clusters)),
    ]
    pipeline = Pipeline(steps)

    try:
        return pipeline.fit_predict(list_of_texts)
    except ValueError:
        # Fallback: one distinct id per text.
        return list(range(len(list_of_texts)))
('logistic', LogisticRegression(C=0.1))]) logistic_genre_cert_PCA.fit(data, binary12) km4 = Pipeline([('genre_cert', genre_cert), ('svd', TruncatedSVD(n_components=2)), ('kmeans', KMeans(n_clusters=4))]) km4.fit(data) data.loc[:, 'binary12'] = data['num_months_wait'].apply(makeBinary) data_check = data[data['genre_name'].notnull() == True].reset_index() # get groups km4_check = pd.DataFrame(data = km4.fit_predict(data_check))\ .rename(columns = {0: 'cluster4'}) km4 = Pipeline([('genre_cert', genre_cert), ('svd', TruncatedSVD(n_components=2)), ('kmeans', KMeans(n_clusters=4))]) km4.fit(data) data_all = pd.merge(data_check, km4_check, left_index=True, right_index=True) cluster = [ data_all[data_all['cluster4'] == i]['keyword_name'] for i in range(4) ] comment_words = [' '.join(cluster[i]).replace('based', '').replace('novel', '')\ .replace('young adult', '').replace(',', '') for i in range(4)]
orders_numb_top = dt.orders[['user_id', 'order_number', 'order_id']].\ sort_values(['user_id', 'order_number'], ascending=[1, 0]).\ groupby('user_id').head(LAST_N_PRIORS)['order_id'].\ values priors_filtered = dt.priors[dt.priors.order_id.isin(orders_numb_top)] users_prior['all_products'] = priors_filtered.groupby( 'user_id')['product_id'].apply(set) else: users_prior['all_products'] = dt.priors.groupby( 'user_id')['product_id'].apply(set) user_products = users_prior.all_products.apply( lambda x: " ".join([str(prod_id) for prod_id in x])) clusters = pipeline.fit_predict(user_products) ar_clust, ar_cnt = np.unique(clusters, return_counts=True) max_clust = np.argmax(ar_cnt) for cl, cnt in zip(ar_clust, ar_cnt): if cnt < 500: clusters[clusters == cl] = max_clust # test the GAAC clusterer with 4 clusters #clusterer = GAAClusterer(N_CLUSTERS, normalise=False) #clusters = clusterer.cluster(X_svd, True) pd.DataFrame({ 'user_id': user_products.index, 'cluster': clusters }).to_csv('../tmp/user_by_cluster.csv', index=False) print('Done clustering', np.unique(clusters, return_counts=True))
""" import matplotlib.pyplot as plt import numpy as np import pandas as pd from hdbscan import HDBSCAN from sklearn.cluster import DBSCAN from sklearn.pipeline import Pipeline from analysis.data import GeographicArea, features from analysis.scaler import SpatialWaterVapourScaler file_pattern = 'data/input/METOPAB_20160801_global_evening.nc' area = GeographicArea(lat=(-25, 50), lon=(-45, 60)) df = area.import_dataset(file_pattern) X = df[features].values # create estimators scaler = SpatialWaterVapourScaler(km=60, H2O=0.1, delD=10) # cluster = DBSCAN(eps=2.4, min_samples=14) cluster = HDBSCAN(min_cluster_size=14, gen_min_span_tree=True) # create pipeline pipeline = Pipeline([('scaler', scaler), ('cluster', cluster)]) y = pipeline.fit_predict(X) subarea = GeographicArea(lat=(-20, 0), lon=(22, 50)) area.subarea_plot(X, y, subarea=subarea, include_noise=True) # print('dbcv score: ', cluster.relative_validity_)
('svd', TruncatedSVD(100, random_state=appconfig['random_state'])), ('normalizer', Normalizer(copy=False)), ('clustering', MiniBatchKMeans(random_state=appconfig['random_state'])) ]) ## # clustering routine print('clustering') for index, classe in enumerate(appconfig['classification']['allowed_classes']): corpus = [contrato['corpo'] for contrato in classes_contratos[classe]] pipeline.set_params(clustering__n_clusters=appconfig['clustering']['num_clusters'][index]) predictions = pipeline.fit_predict(corpus) for index, prediction in enumerate(predictions): classes_contratos[classe][index]['_cluster'] = np.asscalar(prediction) ## # persisting # flatten classes_contratos values clusterized_contratos = reduce(lambda x,y: x+y, classes_contratos.values()) print('persisting results') with dbi.opensession() as session: predicoes = Predicao_Contrato.__table__
__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'

# Column index handed to the sampler as its scaling column.
ref_dim = 0
s = DTWSampler(scaling_col_idx=ref_dim, reference_idx=0, interp_kind="linear")
km = KMeans(n_clusters=3)

data = [
    numpy.loadtxt(path)
    for path in ("data/Xi_ref.txt", "data/Xi_0.txt", "data/Xi_1.txt")
]

d = data[0].shape[1]
max_sz = max(ts.shape[0] for ts in data)
n_rep = 5

# One row per (repetition, series) pair; entries beyond a series' own
# length stay NaN.
npy_arr = numpy.full((len(data) * n_rep, max_sz, d), numpy.nan)
std_per_d = None

for idx_rep in range(n_rep):
    offset = idx_rep * len(data)
    for idx, ts in enumerate(data):
        sz = ts.shape[0]
        # Gaussian noise scaled to 10% of each column's standard deviation.
        noise = 0.1 * numpy.random.randn(sz, d) * ts.std(axis=0)
        npy_arr[offset + idx, :sz] = ts + noise

# Flatten each (max_sz, d) series into a single feature row.
npy_arr = npy_arr.reshape(-1, max_sz * d)

dtw_kmeans = Pipeline([('dtw_sampler', s), ('l2-kmeans', km)])
print(dtw_kmeans.fit_predict(npy_arr))
def clustering_captcha(self, image_path, check=False):
    """Cluster a captcha image's pixels to separate its characters.

    The de-noised image's pixels are standardized and clustered with
    k-means into ``self.n_chars + 2`` clusters.  The cluster with the
    largest pixel count is treated as the background and dropped.

    Parameters
    ----
    image_path: str
        Absolute path of a single captcha image.

    check: bool
        When true, save each remaining cluster as an image together
        with a plot of its per-column pixel counts, for visual
        inspection of the clustering.

    Returns
    ----
    (image_vectors, col_npixs): tuple [2]
        A tuple of length 2: the pixel masks of every cluster except the
        background, and the per-column pixel count of each of those masks.

    image_vectors: {array-like} [self.n_chars + 1, self.width * self.height]

    col_npixs: {array-like} [self.n_chars + 1, self.width]
    """
    image = self.de_noise(image_path)
    # The builtin ``float`` is what the removed ``np.float`` alias was.
    image_pixs = np.array(image.getdata()).astype(float)

    sc = StandardScaler()
    km = KMeans(n_clusters=(self.n_chars + 2))
    clu = Pipeline(steps=[('sc', sc), ('km', km)])
    clusters = clu.fit_predict(image_pixs)

    image_vectors = np.zeros((self.n_chars + 2, self.width * self.height))
    col_npixs = np.zeros((self.n_chars + 2, self.width))
    for i in np.unique(clusters):
        # Binary mask of the pixels in cluster i, cleaned by de_line().
        image_vectors[i, clusters == i] = 1
        image_vectors[i, :] = self.de_line(image_vectors[i, :])
        col_npixs[i, :] = image_vectors[i, :].reshape(
            (self.height, self.width)).sum(axis=0)

    # The cluster holding the most pixels is taken to be the background.
    cluster_bkg = np.argmax(col_npixs.sum(axis=1))
    image_vectors = np.delete(image_vectors, cluster_bkg, axis=0)
    col_npixs = np.delete(col_npixs, cluster_bkg, axis=0)

    if check:
        if not self.checking_path:
            self.checking_path = os.path.join(self.training_images_path,
                                              'checking')
        clusters_path = os.path.join(self.checking_path, 'clusters')
        # Creates the checking directory as well when it is missing, and
        # does not fail if either directory already exists.
        os.makedirs(clusters_path, exist_ok=True)

        img_name = os.path.split(image_path)[1].split('.')[0]
        for i in range(col_npixs.shape[0]):
            prefix = os.path.join(clusters_path,
                                  img_name + '_cluster' + str(i))
            new_img_name = prefix + '_img' + '.jpg'
            new_fig_name = prefix + '_fig' + '.jpg'

            im_new = Image.new('1', (self.width, self.height))
            im_new.putdata(image_vectors[i, :])
            im_new.save(new_img_name)

            plt.plot(col_npixs[i, :])
            plt.savefig(new_fig_name)
            plt.close('all')

    return (image_vectors, col_npixs)
def main():
    """Cluster the PAN-15 texts with 2-means and print a classification report.

    The train and test spreadsheets are loaded and transformed, a
    TF-IDF + KMeans pipeline is fitted, the test clusters are plotted in
    a 2-D PCA projection (best effort), and the cluster ids predicted
    for the test set are scored against the test labels.

    NOTE(review): ``vec.fit_transform(Xtest)`` and
    ``classifier.fit_predict(Xtest)`` refit the vectorizer and the
    pipeline on the test texts, so whenever the plotting block runs the
    final report describes a model fitted on the test set, not on
    ``Xtrain``.  Behaviour is left unchanged; confirm whether
    ``transform`` / ``predict`` were intended.
    """
    columns = ["Folder", "labels", "Text1", "Text2"]

    datatrain = pd.read_excel(
        "../PAN-15/logregexcel_PAN-15trainlargeconcatenated.xlsx",
        names=columns)
    dataframetrain = transform_data(datatrain)
    Xtrain = dataframetrain['text'].tolist()
    Ytrain = dataframetrain['labels'].tolist()

    datatest = pd.read_excel(
        "../PAN-15/logregexcel_PAN-15testlargeconcatenated.xlsx",
        names=columns)
    dataframetest = transform_data(datatest)
    Xtest = dataframetest['text'].tolist()
    Ytest = dataframetest['labels'].tolist()

    vec = TfidfVectorizer(preprocessor=preprocessor)
    classifier = Pipeline([('vec', vec), ('cls', KMeans(n_clusters=2))])
    classifier.fit(Xtrain, Ytrain)

    # Best-effort visualisation.  ``Exception`` (not a bare ``except``)
    # lets KeyboardInterrupt/SystemExit through, and the failure is
    # reported instead of being silently dropped.
    try:
        X_prep = vec.fit_transform(Xtest).toarray()
        labels = classifier.fit_predict(Xtest)
        pca = PCA(n_components=2).fit(X_prep)
        coords = pca.transform(X_prep)
        label_colors = [
            "red", "blue", "green", "yellow", "black", "purple", "cyan"
        ]
        colors = [label_colors[i] for i in labels]
        plt.scatter(coords[:, 0], coords[:, 1], c=colors)
        centroids = classifier.named_steps['cls'].cluster_centers_
        centroid_coords = pca.transform(centroids)
        plt.scatter(centroid_coords[:, 0],
                    centroid_coords[:, 1],
                    marker="X",
                    s=200,
                    linewidth=2,
                    c="#444d61")
        plt.show()
    except Exception as exc:
        print("Skipping cluster plot:", repr(exc))

    # Best-effort feature report; skipped when the final step exposes no
    # ``coef_`` attribute.
    try:
        coefs = classifier.named_steps['cls'].coef_
        print(coefs)
        features = classifier.named_steps['vec'].get_feature_names()
        print_n_most_informative_features(coefs, features, 10)
        print()
    except Exception as exc:
        print("Skipping informative-features report:", repr(exc))

    Yguess = classifier.predict(Xtest)
    # Threshold the predictions into the 0/1 labels used by Ytest.
    Ylist = [0 if i < 0.5 else 1 for i in Yguess]
    print(classification_report(Ytest, Ylist))
    print(accuracy_score(Ytest, Ylist))
def compare_clustering(data, storetofile):
    """Compare KMeans, spectral and Gaussian-mixture clusterings of ``data``.

    Each model is fitted on ``data`` without its ``IstKunde`` column.
    For each model a copy of ``data`` is made whose ``IstKunde`` column
    holds the predicted cluster; the copies are shown as pair plots,
    projected onto two PCA components for a side-by-side scatter
    comparison, and summarised by group counts.

    :param data: DataFrame containing ``IstKunde`` and the plotted columns.
    :param storetofile: when true, each labelled copy is written to CSV.
    """
    preprocessor = get_preprocessor()

    def with_preprocessing(estimator):
        # Every model shares the same preprocessor instance.
        return Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', estimator)])

    kmeans = with_preprocessing(
        KMeans(n_clusters=2, init='random', algorithm='full',
               random_state=42))
    spectral = with_preprocessing(
        SpectralClustering(n_clusters=2, assign_labels='discretize',
                           random_state=42))
    gaussian = with_preprocessing(
        GaussianMixture(n_components=2, n_init=10, init_params='random',
                        random_state=42))

    X = data.drop(['IstKunde'], axis=1)
    y_kmeans = kmeans.fit_predict(X)
    y_spectral = spectral.fit_predict(X)
    y_gaussian = gaussian.fit_predict(X)
    assignments = (y_kmeans, y_spectral, y_gaussian)

    # One copy of the data per model, relabelled with its clusters.
    labelled = []
    for predicted in assignments:
        frame = data.copy()
        frame['IstKunde'] = predicted
        labelled.append(frame)

    sns.set(style="ticks")
    pair_vars = ('Land_ID', 'Branche_ID', 'Mitarbeiteranzahl', 'Umsatz',
                 'Wachstum')
    window_titles = (
        'Kmeans-Clustering Scattermatrix',
        'Spectral-Clustering Scattermatrix',
        'Gaussian-Mixture-Clustering Scattermatrix',
    )
    for frame, window_title in zip(labelled, window_titles):
        grid = sns.pairplot(frame, vars=pair_vars, hue="IstKunde")
        grid.fig.canvas.set_window_title(window_title)

    pca = PCA(n_components=2)
    arr_2d = pca.fit_transform(X)

    plt.figure(figsize=(15, 8))
    colors = ['red', 'navy']
    target_names = ['KeinKunde', 'IstKunde']
    lw = 2
    plt.title('PCA of Customer dataset: Cluster Comparison')

    panel_titles = ('Kmeans', 'Spectral', 'Gaussian')
    panels = zip(panel_titles, assignments)
    for position, (panel_title, predicted) in enumerate(panels, start=1):
        plt.subplot(2, 2, position, title=panel_title)
        for color, i, target_name in zip(colors, [0, 1], target_names):
            members = predicted == i
            plt.scatter(arr_2d[members, 0], arr_2d[members, 1],
                        color=color, alpha=.8, lw=lw, label=target_name)
    # Called once, after the last panel has been drawn.
    plt.legend(loc='best', shadow=False, scatterpoints=1)

    report_headers = (
        "Kmeans-Clustering Information grouped IstKunde:",
        "Spectral-Clustering Information grouped IstKunde:",
        "Gaussian-Mixture-Clustering Information grouped IstKunde:",
    )
    for header, frame in zip(report_headers, labelled):
        print(header)
        print(frame.groupby(['IstKunde', 'Land_ID', 'Branche_ID']).count())

    if storetofile:
        csv_paths = (
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_kmeans.csv",
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_spectral.csv",
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_gaussian.csv",
        )
        for frame, csv_path in zip(labelled, csv_paths):
            frame.to_csv(csv_path, float_format="%.2f")

    plt.show()
class ClusteringModel:
    """
    ClusteringModel encapsulates all the components needed to encode a list of images
    according to the extracted words and the layout of words, and use these as features
    for an unsupervised clustering using DBSCAN. Other clustering methods may be more
    suitable for your dataset, e.g. k-means or agglomerative clustering or HDBSCAN.

    This model primarily interacts with the data via a Pandas Dataframe that contains
    location of the image files. This assumes that the OCR results have been fetched,
    and stored in the same directory according to the <image_name.jpg>.json convention.
    """

    def __init__(self,
                 layout_shape: (int, int),
                 vocabulary_size: int,
                 ocr_provider: object,
                 n_pca_components: int = 200,
                 vocabulary: List[str] = None,
                 stopwords: List[str] = None,
                 pipeline: "Pipeline" = None):
        """
        Constructor for a clustering model

        :param layout_shape: The dimensions for the layout encoding.
            (50, 79) works well for credit cards sized images.
        :param vocabulary_size: The size of the vocabulary for the word encoding.
            2000-3000 works well for a large number of unrecognized cards.
        :param ocr_provider: An instance of an OCR provider class used to get the
            words from the image
        :param n_pca_components: The number of desired components for PCA on the word
            encoding and the layout encoding. The actual number of components is
            limited by the number of rows of data and by the width of each encoding.
        :param vocabulary: A pre-defined vocabulary if available
        :param stopwords: A list of stopwords to filter out if the vocabulary is regenerated
        :param pipeline: An sklearn pipeline containing the PCAs for the word and
            layout encoding
        """
        self.layout_shape = layout_shape
        self.vocabulary_size = vocabulary_size
        self.ocr_provider = ocr_provider
        self.n_pca_components = n_pca_components
        self.stopwords = stopwords
        # Kept so find_clusters can return it even when no vocabulary is generated.
        self.vocabulary = vocabulary
        if vocabulary is not None:
            self.encoder = WordAndLayoutEncoder(vocabulary, layout_shape)
        else:
            self.encoder = None
        self.pipeline = pipeline

    def _generate_vocabulary(self, data: pd.DataFrame, image_name_column: str):
        """
        Adapted from plan_agnostic_vocabulary_vector in RoutingClassifier.ipynb

        :param data: Pandas DataFrame containing all the images to be clustered
        :param image_name_column: Column in the dataframe with the filename
        :returns: a list containing the most frequent words in the OCR text for
            these images, at most ``vocabulary_size`` entries long
        :raises Exception: whatever the OCR provider raises, after logging the file name
        """
        logging.info(
            "Counting extracted words across all images to generate the encoding vocabulary"
        )
        # Lower-cased words are tested against the stopwords; the original
        # casing is what gets counted.
        stopwords = frozenset(self.stopwords or ())
        counter = Counter()
        for count, (_, row) in enumerate(data.iterrows(), start=1):
            # Read from the row itself: data.loc[index, column] yields a
            # Series instead of a file name when index labels repeat.
            filename = row[image_name_column]
            try:
                ocr_results = self.ocr_provider.get_ocr_results(filename)
            except Exception:
                logging.error(
                    "Could not locate image file: {}".format(filename))
                raise
            for word in ocr_results:
                if word.text.lower() not in stopwords:
                    counter[word.text] += 1
            if count % 5000 == 0:
                logging.info(
                    "Processed %d images for vocabulary generation", count)

        # Create the vocabulary vector based on the most common words
        return [word for word, _ in counter.most_common(self.vocabulary_size)]

    def _encode_dataset(self, data: pd.DataFrame, image_name_column: str):
        """
        Encode all the images designated in the data DataFrame into the word+layout
        encoding by running OCR API (with local caching via the ocr_results utility
        function)

        :param data: a pandas DataFrame containing a list of images and their metadata
        :param image_name_column: column in the dataframe that has the file paths in
            the blob storage container
        :returns: a 2D numpy array and an array mask. The 2D numpy array contains the
            concatenated word and layout encoding for each encoded image. The mask is
            an array of the same length as the original data. A zero entry denotes an
            unsuccessfully encoded image. A one denotes a successfully encoded image
        :raises Exception: whatever the OCR provider raises, after logging the blob name
        """
        n_rows = len(data)
        width = (self.vocabulary_size +
                 self.layout_shape[0] * self.layout_shape[1])
        mask = np.zeros(n_rows)
        encoded_data = np.zeros((n_rows, width))
        empty_ocr_count = 0
        # Rows are addressed by position, so the frame's index labels do not matter.
        for position, (_, row) in enumerate(data.iterrows()):
            filename = row[image_name_column]
            try:
                ocr_results = self.ocr_provider.get_ocr_results(filename)
            except Exception:
                logging.error("Could not locate blob: {}".format(filename))
                raise
            if len(ocr_results) == 0:
                # Leave the row zeroed and unmasked.
                empty_ocr_count += 1
                continue
            encoded_data[position, :] = self.encoder.encode_ocr_results(
                ocr_results)
            mask[position] = 1

        if empty_ocr_count > 0:
            logging.warning(
                "Empty OCR results resulting in null entries for {} images".
                format(empty_ocr_count))
        return encoded_data, mask

    def find_clusters(self,
                      data: pd.DataFrame,
                      image_name_column: str,
                      min_samples: int = 10,
                      epsilon: float = 3):
        """
        Encode the dataset and perform clustering via the following steps:
        1) constructing a vocabulary if it is not already supplied,
        2) encode the images based on the presence of the vocabulary words and the
           bounding boxes of the detected text on the image grid. This is accomplished
           via the `WordAndLayoutEncoder` available in the `Routing_Forms` example.
        3) apply PCA to each encoding component independently then run clustering on
           the dataset

        The final number of components from applying PCA is determined by the min of
        the specified `n_pca_components`, the number of encoded rows and the width of
        the respective encoding. The resulting encoding is expected to be an array of
        size 2 * number of components when neither width is the limiting factor.

        :param data: a Pandas Dataframe containing the image metadata / filename
        :param image_name_column: the column name in the dataframe with the filename
        :param min_samples: DBSCAN parameter controlling the number of samples in a
            neighborhood for a point to be considered as a core point.
        :param epsilon: DBSCAN parameter controlling the maximum distance between two
            samples for one to be considered as in the neighborhood of the other.
        :returns: a copy of the data with the "cluster" column added or overwritten,
            a dataframe containing the encoding with PCA applied (for further data
            visualization, for example), and the vocabulary used for the word encoding
        :raises ValueError: if not a single image could be encoded
        """
        # Produce word and layout encoding from the images; there may be empty rows
        # due to failed OCR on an image
        if self.encoder is None:
            self.vocabulary = self._generate_vocabulary(
                data, image_name_column)
            self.encoder = WordAndLayoutEncoder(self.vocabulary,
                                                self.layout_shape)

        encoding, mask = self._encode_dataset(data, image_name_column)
        encoded_rows = mask == 1
        n_encoded = int(encoded_rows.sum())
        if n_encoded == data.shape[0]:
            logging.info(f"All {n_encoded} images are successfully encoded")
        else:
            logging.error(
                f"{data.shape[0] - n_encoded} images failed encoding")
        if n_encoded == 0:
            raise ValueError("No image could be encoded; nothing to cluster")

        # Remove the empty rows before applying PCA
        encoding = encoding[encoded_rows, :]

        # PCA cannot extract more components than it has samples or features.
        self.n_pca_components = min(self.n_pca_components, n_encoded)
        layout_size = self.layout_shape[0] * self.layout_shape[1]
        word_components = min(self.n_pca_components, self.vocabulary_size)
        layout_components = min(self.n_pca_components, layout_size)

        transformer = ColumnTransformer([
            ("word_pca", PCA(n_components=word_components),
             list(range(0, self.vocabulary_size))),
            ("layout_pca", PCA(n_components=layout_components),
             list(range(self.vocabulary_size,
                        self.vocabulary_size + layout_size)))
        ])
        dbscan = DBSCAN(eps=epsilon,
                        min_samples=min_samples,
                        metric="euclidean",
                        leaf_size=40)
        self.pipeline = Pipeline([("pca", transformer), ("dbscan", dbscan)])
        Y = self.pipeline.fit_predict(encoding)

        # Drop any previous labels so rows that failed encoding end up empty
        # instead of keeping stale cluster ids (drop is not in-place).
        data_copy = data.drop(["cluster"], axis=1, errors="ignore").copy()
        data_copy.loc[encoded_rows, "cluster"] = Y

        # A bit of extra work to return the encodings with PCA applied to help with
        # data visualization
        encoded_data = pd.DataFrame(self.pipeline["pca"].transform(encoding))
        return (data_copy, encoded_data, self.vocabulary)
param_grid=parms, scoring="v_measure_score", cv=[(range(0,len(data)), range(0,len(data)))]) # do not need CV parms_result=gs_cluster.fit(data,text_data.labels_true()) print(parms_result.best_score_) print(parms_result.best_params_) ''' result = [] for g in list(model_selection.ParameterGrid(params)): print() print(g) texf_cluster.set_params(**g) labels_pred = texf_cluster.fit_predict(data) print(labels_pred) count_table = score_data.count_table(text_data.init_num_by_cls, labels_pred, g['KMeans__n_clusters']) print(count_table) #total_entropy=score_data.total_entropy(count_table) #print("Total Entropy:",total_entropy) print( "homogeneity score, completeness score, v score:", metrics.homogeneity_completeness_v_measure(text_data.labels_true(), labels_pred)) print( "Adjusted Mutual Information:", metrics.adjusted_mutual_info_score(text_data.labels_true(), labels_pred))
print('Cluster ' + str(i) + ': ') print('Number Of Purchases: ' + str(pipelineClustering['kmeans'].cluster_centers_[i][0])) print('Days From Last Purchase: ' + str(pipelineClustering['kmeans'].cluster_centers_[i][1])) print('Days From First Purchase: ' + str(pipelineClustering['kmeans'].cluster_centers_[i][2])) print('Total Revenue: ' + str(pipelineClustering['kmeans'].cluster_centers_[i][3])) # Find which cluster each customer belongs to in out dataset # Again, not including the customer id column because its numerical value has no say in the customers habits # Creates new column for each customer noting the cluster they belong to. # We now have our dataset where each customer has a column for the cluster they belong to data['Cluster Category'] = pd.Series(pipelineClustering.fit_predict(data[[ 'Number Of Purchases', 'Days From Last Purchase', 'Days From First Purchase', 'Total Revenue' ]]._get_numeric_data().dropna(axis=1)), index=data.index) data['Cluster Category'].replace( { 0: 'New Customer', 1: 'Loyal Customer', 2: 'Non-frequent Customer', 3: 'High Spender/Loyal Customer' }, inplace=True) data = data[[ 'CustomerID', 'Number Of Purchases', 'Days From Last Purchase', 'Days From First Purchase', 'Total Revenue', 'Cluster Category'
#################################################
############# Pipeline #############
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

############# 1. impute + glm
# Chain an imputer and a logistic regression; both objects are created
# elsewhere in this script.
steps = [('imputation', imp), ('logistic_regression', logreg)]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
pipe.score(X_test, y_test)

############# 2. scale + KMeans
# fit_predict runs the scaler's fit_transform and then KMeans.fit_predict.
steps = [('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters = 4))]
pipe = Pipeline(steps)
y_clus = pipe.fit_predict(X)

############# 3. scale + knn
# Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
# Create the pipeline: pipeline
pipe = Pipeline(steps)
# Specify the hyperparameter space
# Keys follow the 'step name'__'parameter name' convention; a dict entry
# needs ':' - the former '=' was a SyntaxError.
param_grid = {'knn__n_neighbors': np.arange(1, 50)}
# Create the GridSearchCV object: cv
cv = GridSearchCV(pipe, param_grid, cv = 5)
# Fit to the training set
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

############# 4. scale + SVM
class Sekitei:
    """Feature-based URL sampler.

    URLs are turned into binary feature vectors (segment count, segment
    names, extensions, query parameters, ...).  ``fit_model`` clusters the
    vectors and trains a classifier separating the first URL list from the
    second; ``predict_fetch`` then answers, per URL, whether it should be
    taken, subject to a per-cluster probability and quota.
    """

    # Ordered rules used by init_one: (compiled pattern, checker method name,
    # ((parameter key, converter or None), ...)).  One converter per regex
    # group; the first matching rule wins.
    _FEATURE_RULES = (
        (re.compile(r'segments:([0-9]+)$'), '_segments', (('n', int),)),
        (re.compile(r'param:(.*)$'), '_param', (('p', None),)),
        (re.compile(r'param_name:(.*)$'), '_param_name', (('p', None),)),
        (re.compile(r'segment_name_([0-9]+):(.*)$'), '_segment_name',
         (('i', int), ('s', None))),
        (re.compile(r'segment_\[0\-9\]_([0-9]+):1$'), '_segment_09',
         (('i', int),)),
        (re.compile(r'segment_substr\[0\-9\]_([0-9]+):1$'),
         '_segment_substr_09', (('i', int),)),
        (re.compile(r'segment_ext_([0-9]+):(.*)$'), '_segment_ext',
         (('i', int), ('ext', None))),
        (re.compile(r'segment_ext_substr\[0\-9\]_([0-9]+):(.*)$'),
         '_segment_ext_substr_09', (('i', int), ('ext', None))),
        (re.compile(r'segment_len_([0-9]+):([0-9]+)$'), '_segment_len',
         (('i', int), ('L', int))),
        (re.compile(r'segment_2points_([0-9]+):1$'), '_segment_2points',
         (('i', int),)),
        (re.compile(r'segment_strix_([0-9]+):([0-9]+)$'), '_segment_strix',
         (('i', int), ('strix', int))),
        (re.compile(r'segment_strix_quote_([0-9]+):1$'),
         '_segment_strix_quote', (('i', int),)),
        (re.compile(r'segment_smile_([0-9]+):1$'), '_segment_smile',
         (('i', int),)),
        (re.compile(r'segment_ru_([0-9]+):1$'), '_segment_ru',
         (('i', int),)),
        (re.compile(r'segment_in_br_([0-9]+):(.*)$'), '_segment_in_br',
         (('i', int), ('data', int))),
        (re.compile(r'segment_defis_([0-9]+):1$'), '_segment_defis',
         (('i', int),)),
        (re.compile(r'segment_start_dig_([0-9]+):1$'), '_segment_start_dig',
         (('i', int),)),
        (re.compile(r'segment_more_([0-9]+):1$'), '_segment_more',
         (('i', int),)),
    )

    # Non-digits, digits, non-digits up to the end of the segment.
    _INNER_DIGITS = re.compile(r'[^\d]+\d+[^\d]+$')
    # Text enclosed in parentheses.
    _BRACKETS = re.compile(r'\((.*)\)')
    # Any Cyrillic letter in the basic А-я range.
    _CYRILLIC = re.compile('[А-Яа-я]')

    def __init__(self):
        """Reset all learned state and build fresh, unfitted pipelines."""
        self.proba = {}
        self.quota = {}
        self.is_taken = {}
        self.keys = []
        self.cluster_expressions = {}
        self.delta = {}
        self.part = {}
        self.bad_part = {}
        self.i = 0
        self.j = 0
        self.cluster_expressions_help = {}
        # Alternatives tried for the clustering step:
        # Birch(n_clusters=20, threshold=0.1),
        # AgglomerativeClustering(n_clusters=20),
        # DBSCAN(eps=3, min_samples=5).
        self.model = Pipeline([
            ('scaler', StandardScaler()),
            ('clustering', KMeans(n_clusters=20)),
        ])
        # Alternatives tried for the classification step:
        # LogisticRegression(C=10000) (eps = -0.05; k = 10), LinearSVC(),
        # KNeighborsClassifier(), BernoulliNB(alpha=0.5) (eps = 0.5; k = 5).
        self.classifier = Pipeline([
            ('scaler', StandardScaler()),
            ('classification', DecisionTreeClassifier(criterion='entropy')),
        ])
        self.check_functions = []
        self.parameters = []
        self.T = time.time()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _split_url(url):
        """Return the path segments of ``url`` (everything after the host).

        A trailing newline and a trailing empty segment are removed.  URLs
        without a path yield an empty list.
        """
        segments = url.split('/')[3:]
        if segments:
            if segments[-1] == '\n' or segments[-1] == '':
                del segments[-1]
            elif segments[-1][-1] == '\n':
                segments[-1] = segments[-1][:-1]
        return segments

    @staticmethod
    def _unquote(segment):
        """Percent-decode ``segment``, trying UTF-8 and then cp1251.

        The segment is returned unchanged when neither encoding fits.
        """
        try:
            from urllib.parse import unquote_to_bytes  # Python 3
        except ImportError:
            from urllib import unquote as unquote_to_bytes  # Python 2
        raw = unquote_to_bytes(segment)
        for encoding in ('utf8', 'cp1251'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return segment

    @staticmethod
    def _segment_at(segments, index):
        """Return segment ``index`` without its query string, or None.

        None means the URL has no such segment.  The list is not modified,
        so the query string stays visible to the ``param`` checks.
        """
        if len(segments) <= index:
            return None
        return segments[index].split('?', 1)[0]

    # ------------------------------------------------------------------
    # feature checkers: each takes the URL segments and the parameter
    # dict produced by init_one and returns True or False
    # ------------------------------------------------------------------
    def _segments(self, segments, param):
        """True if the URL has exactly ``param['n']`` segments."""
        return len(segments) == param['n']

    def _param(self, segments, param):
        """True if the URL carries the query parameter text ``param['p']``.

        The parameter must follow ``?`` or ``&`` and be followed by ``&``,
        ``/`` or the end of the URL.
        """
        pattern = '[?&]' + re.escape(param['p']) + r'([&/].*)?$'
        return re.search(pattern, '/'.join(segments)) is not None

    def _param_name(self, segments, param):
        """True if the URL has a query parameter named ``param['p']``."""
        pattern = '[?&]' + re.escape(param['p']) + '='
        return re.search(pattern, '/'.join(segments)) is not None

    def _segment_name(self, segments, param):
        """True if segment ``i`` equals ``param['s']``."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and segment == param['s']

    def _segment_09(self, segments, param):
        """True if segment ``i`` consists of digits only."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and segment.isdigit()

    def _segment_substr_09(self, segments, param):
        """True if segment ``i`` ends with non-digits, digits, non-digits."""
        segment = self._segment_at(segments, param['i'])
        return (segment is not None
                and self._INNER_DIGITS.search(segment) is not None)

    def _segment_ext(self, segments, param):
        """True if segment ``i`` has the extension ``param['ext']``."""
        segment = self._segment_at(segments, param['i'])
        if segment is None:
            return False
        ext = segment.split('.')
        return len(ext) > 1 and ext[-1].lower() == param['ext']

    def _segment_ext_substr_09(self, segments, param):
        """True if segment ``i`` has the extension and embedded digits."""
        segment = self._segment_at(segments, param['i'])
        if segment is None:
            return False
        ext = segment.split('.')
        return (len(ext) > 1 and ext[-1].lower() == param['ext']
                and self._INNER_DIGITS.search(segment) is not None)

    def _segment_len(self, segments, param):
        """True if segment ``i`` is exactly ``param['L']`` characters long."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and len(segment) == param['L']

    def _segment_2points(self, segments, param):
        """True if segment ``i`` contains a colon."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and ':' in segment

    def _segment_strix(self, segments, param):
        """True if segment ``i`` splits on ``_`` into ``param['strix']`` parts."""
        segment = self._segment_at(segments, param['i'])
        return (segment is not None
                and param['strix'] == len(segment.split('_')))

    def _segment_strix_quote(self, segments, param):
        """True if the part of segment ``i`` before the first ``_`` has a comma."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and ',' in segment.split('_')[0]

    def _segment_smile(self, segments, param):
        """True if segment ``i`` contains an opening parenthesis."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and '(' in segment

    def _segment_ru(self, segments, param):
        """True if segment ``i`` contains a Cyrillic letter."""
        segment = self._segment_at(segments, param['i'])
        return (segment is not None
                and self._CYRILLIC.search(segment) is not None)

    def _segment_in_br(self, segments, param):
        """True if the parenthesised text of segment ``i`` has ``param['data']``
        underscore-separated parts."""
        segment = self._segment_at(segments, param['i'])
        if segment is None:
            return False
        m = self._BRACKETS.search(segment)
        return (m is not None
                and len(m.groups()[0].split('_')) == param['data'])

    def _segment_defis(self, segments, param):
        """True if segment ``i`` contains a hyphen."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and '-' in segment

    def _segment_start_dig(self, segments, param):
        """True if segment ``i`` starts with a digit."""
        segment = self._segment_at(segments, param['i'])
        return (segment is not None
                and re.match('[0-9]+', segment) is not None)

    def _segment_more(self, segments, param):
        """True if segment ``i`` is longer than 15 characters."""
        segment = self._segment_at(segments, param['i'])
        return segment is not None and len(segment) > 15

    # ------------------------------------------------------------------
    # feature <-> checker mapping
    # ------------------------------------------------------------------
    def init_one(self, feature):
        """Map a feature key to ``(checker, parameters)``.

        :param feature: key as produced by ``extract_features``.
        :returns: the bound checker method and its parameter dict, or
            ``(False, False)`` when the key matches no known feature.
        """
        for pattern, method_name, fields in self._FEATURE_RULES:
            m = pattern.match(feature)
            if m is None:
                continue
            parameters = {}
            for (name, convert), value in zip(fields, m.groups()):
                parameters[name] = value if convert is None else convert(value)
            return getattr(self, method_name), parameters
        return False, False

    def init_functions(self, keys):
        """Append the checker and parameters of every key in ``keys``."""
        for key in keys:
            f, p = self.init_one(key)
            self.check_functions.append(f)
            self.parameters.append(p)

    def check_url(self, url):
        """Return the ``(1, len(self.keys))`` feature row for ``url``."""
        N = len(self.keys)
        X = np.zeros((1, N))
        segments = [self._unquote(s) for s in self._split_url(url)]
        for i in range(N):
            X[0, i] = self.check_functions[i](segments, self.parameters[i])
        return X

    def _segment_features(self, i, segment):
        """Yield the feature keys describing decoded segment number ``i``."""
        idx = str(i)
        if '?' in segment:
            mb_par = segment.split('?')
            for p in mb_par[1].split('&'):
                yield 'param:' + p
                yield 'param_name:' + p.split('=')[0]
            segment = mb_par[0]
        yield 'segment_name_' + idx + ':' + segment
        if segment.isdigit():
            yield 'segment_[0-9]_' + idx + ':1'
        inner_digits = self._INNER_DIGITS.search(segment) is not None
        if inner_digits:
            yield 'segment_substr[0-9]_' + idx + ':1'
        ext = segment.split('.')
        if len(ext) > 1:
            yield 'segment_ext_' + idx + ':' + ext[-1].lower()
            if inner_digits:
                yield 'segment_ext_substr[0-9]_' + idx + ':' + ext[-1].lower()
        if ':' in segment:
            yield 'segment_2points_' + idx + ':1'
        strix = segment.split('_')
        if len(strix) > 1:
            yield 'segment_strix_' + idx + ':' + str(len(strix))
        if ',' in strix[0]:
            yield 'segment_strix_quote_' + idx + ':1'
        yield 'segment_len_' + idx + ':' + str(len(segment))
        if '(' in segment:
            yield 'segment_smile_' + idx + ':1'
        m = self._BRACKETS.search(segment)
        if m is not None:
            yield 'segment_in_br_' + idx + ':' + str(
                len(m.groups()[0].split('_')))
        if self._CYRILLIC.search(segment) is not None:
            yield 'segment_ru_' + idx + ':1'
        # segment_defis / segment_start_dig are deliberately not generated.
        if len(segment) > 15:
            yield 'segment_more_' + idx + ':1'

    def extract_features(self, URLS):
        """Select frequent features from ``URLS`` and encode every URL.

        Every feature seen in more than 100 URLs is appended to
        ``self.keys`` (and its checker registered); the return value is the
        ``(len(URLS), len(self.keys))`` matrix of checker results.
        """
        result = Counter()
        for line in URLS:
            segments = self._split_url(line)
            result['segments:' + str(len(segments))] += 1
            for i, segment in enumerate(segments):
                for feature in self._segment_features(
                        i, self._unquote(segment)):
                    result[feature] += 1

        for key in result.keys():
            if result[key] > 100:
                self.keys.append(key)
        self.init_functions(self.keys)

        X = np.zeros((len(URLS), len(self.keys)))
        for j, url in enumerate(URLS):
            X[j, :] = self.check_url(url)
        return X

    def fit_model(self, QLINK_URLS, UNKNOWN_URLS, QUOTA):
        """Fit the clustering and the classifier and derive cluster quotas.

        :param QLINK_URLS: URLs labelled 1 for the classifier.
        :param UNKNOWN_URLS: URLs labelled 0 for the classifier.
        :param QUOTA: scales the number of URLs each cluster may yield.
        """
        self.__init__()
        URLS = QLINK_URLS + UNKNOWN_URLS
        X = self.extract_features(URLS)
        y = np.zeros(len(URLS))
        y[:len(QLINK_URLS)] = 1
        clusters = self.model.fit_predict(X)
        self.classifier.fit(X, y)
        self.un_clusters, counts = np.unique(clusters, return_counts=True)

        eps = -0.09
        Delta = 20
        k = 10
        # Different tuning when the classifier fits its own training data badly.
        if self.classifier.score(X, y) < 0.7:
            k = 1.7
            eps = 0.27
        # NOTE(review): under Python 2 this is integer division when QUOTA
        # is an int - confirm that is intended.
        min_quota = QUOTA / len(QLINK_URLS) * k
        total_good = np.sum(y)
        total_bad = np.sum(1 - y)

        zero = 0  # number of URLs sitting in clusters without any positive
        for cluster, count in zip(self.un_clusters, counts):
            members = clusters == cluster
            good = np.sum(y[members])
            mean_features = np.mean(X[members], axis=0)

            self.proba[cluster] = np.clip(good / count - eps, 0, 1)
            self.is_taken[cluster] = 0
            self.quota[cluster] = min_quota * good + 100
            # Cluster signature: features present in more than half the members.
            self.cluster_expressions_help[cluster] = mean_features > 0.5
            self.cluster_expressions[cluster] = (
                self.cluster_expressions_help[cluster].astype(float))
            self.delta[cluster] = np.ceil(
                np.sum(np.abs(mean_features -
                              self.cluster_expressions[cluster]))) + Delta
            self.part[cluster] = good / total_good
            self.bad_part[cluster] = np.sum(1 - y[members]) / total_bad
            if good == 0:
                zero += count

        if zero > 300:
            for cluster in self.un_clusters:
                self.quota[cluster] = self.quota[cluster] + 1400
                self.proba[cluster] = np.clip(self.proba[cluster] + 0.35,
                                              0, 1)

        self.T = 0
        # Signatures stacked in un_clusters order for predict_cluster.
        self.cluster_expressions_ = np.zeros(
            (len(self.un_clusters), len(self.keys)))
        for i, cluster in enumerate(self.un_clusters):
            self.cluster_expressions_[i, :] = self.cluster_expressions[cluster]

    def predict_cluster(self, X):
        """Return the cluster whose signature is closest to ``X``.

        Distance is the number of differing features; -1 is returned when
        the closest cluster is further away than its ``delta``.
        """
        nums = np.sum(X != self.cluster_expressions_, axis=1)
        ind = np.argmin(nums)
        cl = self.un_clusters[ind]
        if nums[ind] <= self.delta[cl]:
            return cl
        return -1

    def predict_fetch(self, url):
        """Return True if ``url`` should be taken.

        NOTE(review): the nesting of the first ``return True`` could not be
        recovered from the source with certainty; it is assumed that a
        positive classifier answer is always accepted - confirm.
        """
        X = self.check_url(url)
        fetch = self.classifier.predict(X)
        y = self.predict_cluster(X)
        if fetch:
            if y != -1:
                self.is_taken[y] += 1
            return True
        if y == -1:
            return False
        # Otherwise sample with the cluster's probability, bounded by its quota.
        fetch = np.random.choice((True, False),
                                 p=(self.proba[y], 1 - self.proba[y]))
        if fetch and self.is_taken[y] < self.quota[y]:
            self.is_taken[y] += 1
            return True
        return False