Example #1
def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())])
    pipe.fit_predict(
        X=None, y=None, transf__should_get_this=True, clf__should_succeed=True
    )
    assert pipe.named_steps["transf"].fit_params["should_get_this"]
    assert pipe.named_steps["clf"].successful
    assert "should_succeed" not in pipe.named_steps["transf"].fit_params
Example #2
def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())])
    pipe.fit_predict(X=None,
                     y=None,
                     transf__should_get_this=True,
                     clf__should_succeed=True)
    assert_true(pipe.named_steps['transf'].fit_params['should_get_this'])
    assert_true(pipe.named_steps['clf'].successful)
    assert_false('should_succeed' in pipe.named_steps['transf'].fit_params)
Example #3
def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())])
    pipe.fit_predict(X=None,
                     y=None,
                     transf__should_get_this=True,
                     clf__should_succeed=True)
    assert pipe.named_steps['transf'].fit_params['should_get_this']
    assert pipe.named_steps['clf'].successful
    assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
Example #4
def cluster_analysis(df_test):
    numeric_features = ['amount', 'day']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    # account name used as dummy feature
    categorical_features = ['account_name']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('cluster', DBSCAN(eps=0.2))])

    df_test.dates = pd.to_datetime(df_test.dates)
    df_test['day'] = df_test.dates.dt.day
    df_test.head()

    prediction = clf.fit_predict(df_test)

    df_test['prediction'] = "Regular-Irregular"
    df_test.loc[prediction < 0, 'prediction'] = 'Discretionary'

    return df_test
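The relabelling at the end of cluster_analysis relies on a DBSCAN convention: fit_predict marks noise points with the label -1, so the prediction < 0 mask selects exactly the outlying transactions. A minimal sketch on made-up one-column data (not from the original function) illustrates this:

import numpy as np
from sklearn.cluster import DBSCAN

X = np.array([[1.0], [1.1], [0.9], [1.0], [25.0]])  # one obvious outlier
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(X)
print(labels)       # e.g. [ 0  0  0  0 -1]; -1 means "noise"
print(labels < 0)   # the mask used above to flag 'Discretionary' rows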
Example #5
def bestClassify(X,Y):
	"Best classifier function"
	tfidf = True

	if tfidf:
		vec = TfidfVectorizer(preprocessor = identity,
							tokenizer = identity, sublinear_tf = True)
	else:
		vec = CountVectorizer(preprocessor = identity,
							tokenizer = identity)

	km = KMeans(n_clusters=2, n_init=100, verbose=1)
	clusterer = Pipeline( [('vec', vec),
								('cls', km)] )

	prediction = clusterer.fit_predict(X,Y)

	checker = defaultdict(list)
	for pred,truth in zip(prediction,Y):
		checker[pred].append(truth)

	labeldict = {}
	for pred, label in checker.items():
		labeldict[pred] = Counter(label).most_common(1)[0][0]
		#print(pred, Counter(label).most_common(1)[0][0])

	prediction = [labeldict[p] for p in prediction]
	labels = list(labeldict.values())
	print(labels)
	print(confusion_matrix(Y, prediction, labels=labels))

	print("Homogeneity:", homogeneity_score(Y,prediction))
	print("Completeness:", completeness_score(Y,prediction))
	print("V-measure:", v_measure_score(Y,prediction))
	print("Rand-Index:", adjusted_rand_score(Y,prediction))
Example #6
def train_pipeline():
    """
    Load or create dataset, then create and fit pipeline, show its results, and return it trained.
    """
    try:
        train_df = load_data('train')
    except FileNotFoundError:
        train_df = prepare_data(list_audiofiles(config.train_path))
        save_data(train_df, 'train')

    train = np.array(handle_wrong_rows(train_df, 'train'))
    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('clusterization', RelabeledBayesianGaussianMixture(config=config,
                                                                             n_components=config.n_classes,
                                                                             tol=0.00001,
                                                                             covariance_type='tied',
                                                                             max_iter=10000,
                                                                             random_state=18))])

    tr_predictions = pipeline.fit_predict(train)
    targets = pd.read_csv(config.targets).target
    print('\nModel accuracy: %.3f' % (metrics.accuracy_score(targets, tr_predictions)))

    if config.verbose:
        plot_clusters(pipeline.steps[0][1].transform(train), tr_predictions, targets)

    return pipeline
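RelabeledBayesianGaussianMixture is a project-specific wrapper; a minimal sketch with the plain scikit-learn BayesianGaussianMixture and synthetic data (both assumptions, not from the original project) shows why fit_predict works here: Pipeline fits the scaler, transforms the input, and calls fit_predict on the final step.

import numpy as np
from sklearn.mixture import BayesianGaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(18)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])

pipeline = Pipeline([('scaler', StandardScaler()),
                     ('clusterization', BayesianGaussianMixture(n_components=2,
                                                                covariance_type='tied',
                                                                max_iter=1000,
                                                                random_state=18))])
labels = pipeline.fit_predict(X)   # fits scaler + mixture, returns component labels
print(np.bincount(labels))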
Example #7
    def test_case_1(self):
        num_features_pipeline = Pipeline([
            ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
            ('scale', MinMaxScaler()),
            ('transform', QuantileTransformer(output_distribution='normal'))
        ])

        cat_features_pipeline = Pipeline([
            ('impute',
             SimpleImputer(missing_values=np.nan,
                           strategy='constant',
                           fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('num', num_features_pipeline, numerical_features),
            ('cat', cat_features_pipeline, categorical_features)])

        classifier_pipeline = Pipeline(steps=[
            ('preprocessing', preprocessor),
            ('classify', CustomClassifier(base=LogisticRegression()))])

        y_pred = classifier_pipeline.fit_predict(X_train, y_train)

        print(y_pred)
Example #8
class ClusteringPipelineHandler:
    def __init__(self, config: ClusteringConfig):
        self.pipe = Pipeline([("prep", Preprocessor()),
                              ("extractor", FasttextExtractor(config)),
                              ("cluster",
                               Clustering(config.min_cluster_size,
                                          config.cosine_thrsh))])

    def _get_clusters(self, inp: Dict[str, str]) -> Dict[str, List[str]]:
        texts: List[str] = [w for w in inp.values()]
        labels = self.pipe.fit_predict(texts)
        labels_buckets: Dict[str, List[str]] = dict()
        for key, c in zip(list(inp.keys()), labels):
            cc = str(c)
            if cc not in labels_buckets:
                labels_buckets[cc] = []
            labels_buckets[cc].append(key)
        return labels_buckets

    def on_post(self, req, resp) -> None:
        if req.content_length:
            try:
                inp = ujson.loads(req.stream.read())
                answer = self._get_clusters(inp)
                resp.body = ujson.dumps(answer)
                resp.status = falcon.HTTP_200
            except Exception as e:
                resp.body = ujson.dumps({'Error': traceback.format_exc()})
                resp.status = falcon.HTTP_500
        else:
            resp.body = ujson.dumps({'Error': 'data payload is mandatory'})
            resp.status = falcon.HTTP_400
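A hedged sketch of how such a Falcon resource is typically mounted; the ClusteringConfig arguments are guesses based on the attributes used above, and falcon.App assumes Falcon 3.x (older versions expose falcon.API instead):

import falcon

config = ClusteringConfig(min_cluster_size=5, cosine_thrsh=0.3)  # hypothetical values
app = falcon.App()  # falcon.API() on Falcon < 3
app.add_route('/clusters', ClusteringPipelineHandler(config))
# Serve with any WSGI server, e.g.:  gunicorn module_name:app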
Example #9
def test_pipeline():
    trajs = AlanineDipeptide().get_cached().trajectories
    p = Pipeline([('diheds', DihedralFeaturizer(['phi', 'psi'], sincos=False)),
                  ('hmm', VonMisesHMM(n_states=4))])

    predict = p.fit_predict(trajs)
    p.named_steps['hmm'].summarize()
Example #10
def test_pipeline():
    trajs = AlanineDipeptide().get_cached().trajectories
    topology = trajs[0].topology

    indices = topology.select('backbone')
    p = Pipeline([('diheds', SuperposeFeaturizer(indices, trajs[0][0])),
                  ('hmm', GaussianHMM(n_states=4))])

    predict = p.fit_predict(trajs)
    p.named_steps['hmm'].summarize()
Example #12
    def clusters(self, k=None, method='kmeans', ret_clusterizer=False, **kwargs):
        """
        Extract clusters from input data.
        """
        pipeline = Pipeline([
            ('fill', Imputer()),
            ('cluster', KMeans(k or 4, **kwargs))
        ])
        labels = pipeline.fit_predict(self.pivot_table)
        if ret_clusterizer:
            return labels, pipeline
        else:
            return labels
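The Imputer class used above comes from sklearn.preprocessing and was removed in scikit-learn 0.22; a minimal standalone sketch of the same impute-then-cluster idea with the current SimpleImputer API (the pivot-table values are made up) looks like this:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

pivot_table = np.array([[1.0, np.nan],
                        [1.2, 0.9],
                        [8.0, 8.1],
                        [np.nan, 7.9]])
pipeline = Pipeline([('fill', SimpleImputer(strategy='mean')),
                     ('cluster', KMeans(n_clusters=2, n_init=10, random_state=0))])
labels = pipeline.fit_predict(pivot_table)  # NaNs are mean-imputed before clustering
print(labels)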
Example #13
	def train(self,argv):
		testmode = False  # separate test file or do cross-validation

		if len(argv) == 2:
		    trainfile = argv[1]
		else:
		    exit("Use kmeansBinary.py <trainfile>")


		# X and Y are the result of the read corpus function. X is a list of all documents that are tokenized and Y is a list of all labels
		# The use_sentiment boolean can be changed to use the categories(False) or the polarity(True)
		X, Y = self.read_corpus(trainfile, use_sentiment=True)

		# we use a dummy function as tokenizer and preprocessor,
		# since the texts are already preprocessed and tokenized.
		vec = TfidfVectorizer(preprocessor = self.identity, tokenizer = self.identity,sublinear_tf=True)
		#vec = CountVectorizer(preprocessor = self.identity, tokenizer = self.identity)
		#vec = DictVectorizer()

		km = Pipeline( [('vec', vec),
                            ('cls', cluster.KMeans(n_clusters=2, n_init=10, verbose=1))] )
		
		labels_pred = km.fit_predict(X,Y)
		labels_true = Y

		c = defaultdict(list)
		#calculate confusion matrix
		for pred,true in zip(labels_pred,labels_true):
			c[pred].append(true)

		label = {}
		for key in c:
			count = Counter(c[key])
			label[key] = count.most_common(1)[0][0]
			print(key, count.most_common(6))

		labels_pred = [label[l] for l in labels_pred]
		labels = list(set(label.values()))
		print(labels)
		
		print(vec.get_feature_names())
		print("Homogeneity: %0.3f" % homogeneity_score(labels_true, labels_pred))
		print("Completeness: %0.3f" % completeness_score(labels_true, labels_pred))
		print("V-measure: %0.3f" % v_measure_score(labels_true, labels_pred))
		print("Adjusted Rand-Index: %.3f" % adjusted_rand_score(labels_true, labels_pred))
		print(confusion_matrix(labels_true, labels_pred, labels=labels))
Example #14
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler), ('Kmeans', km)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #16
def parse_image(args, root_img: np.ndarray, image_name: str):
    """Parse a single image, get the hough lines and find the vanishing points"""
    # Create hough pipeline that transforms the image and finally predicts the hough lines
    # Note: This pipeline is saving intermediate steps as image files in the output folder
    hough_pipeline = Pipeline(steps=[
        ('image_resizer', ResizeTransformer(args.shape)),
        # ('colour_masker', ColourSegmentTransformer(3)),
        # ('plot_colour_mask_image', PlotTransformer(image_name, suffix="colour", folder=args.output_folder)),
        ('canny_image', CannyTransformer()),
        ('segment_image', SegmentTransformer() if args.segment_canny else None),
        ('plot_canny_image', PlotTransformer(image_name, suffix="canny", folder=args.output_folder)),
        ('hough_transform', HoughLinesEstimator(
            threshold=args.hough_threshold,
            weight_decay=args.weight_decay,
            vertical_degrees_filter=args.degrees_filter
        )),
    ])
    # Get the pipeline results and filter them according to argument settings
    hough_transform = hough_pipeline.fit_predict(root_img)
    hough_transform.filter_horizontal_lines(degrees=args.degrees_filter)
    hough_transform.limit_lines(args.hough_limit)
    hough_transform.group_lines(r=args.hough_group_radius)
    if args.cluster_hough_lines:
        hough_transform.cluster_lines()

    # Add padding to the hough transform
    hough_transform.add_padding(args.padding)

    # Get the vanishing points with the chosen method
    vps, reference_transformer = METHODS[args.method](args, hough_transform)

    # Print the vanishing point coordinates as fractions of the image size
    print([((x - args.padding) / args.img_width, (y - args.padding) / args.img_height) for x, y in vps])

    # Plot and print on the original image
    Pipeline(steps=[
        ('image_resizer', ResizeTransformer(args.shape)),
        ('pad_image', PadTransformer(args.padding)),
        # ('plot_pad_image', PlotTransformer(image_name, suffix="padded_orig"))
        ('add_reference', reference_transformer),
        ('add_vanishing_points', DrawPointsTransformer(vps, colour=(255, 255, 0))),
        ('plot_final_image', PlotTransformer(image_name, suffix="final", folder=args.output_folder)),
    ]).fit_transform(root_img)
Example #17
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #18
def cluster_analysis_v3(df_test):
    df_test['time_since_last_transaction'] = df_test['datetime'] - df_test['datetime'].shift()
    df_test['time_since_last_transaction']= df_test['time_since_last_transaction'].apply(lambda x: convert_to_mins_v3(x))
    df_test = df_test.reset_index()
    
    main_dict = {}
    for party in list(df_test['other_account_name'].value_counts().index):
        temp_df = df_test[df_test['other_account_name'] == party].copy()
        temp_df['time_since_last_trans_party'] = temp_df['datetime'] - temp_df['datetime'].shift()
        pos_dict = pd.Series(temp_df['time_since_last_trans_party'].values, index = temp_df['index'])
        main_dict.update(pos_dict)
    df_test['time_since_last_transaction_party'] = df_test['index'].map(main_dict)
    df_test['time_since_last_transaction_party']= df_test['time_since_last_transaction_party'].apply(lambda x: convert_to_mins_v3(x))
    
    
    numeric_features = ['amount', 'day', 'time_since_last_transaction_party', 'time_since_last_transaction']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    # account name used as dummy feature
    categorical_features = ['account_name']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('cluster', DBSCAN(0.2))])

    df_test.dates = pd.to_datetime(df_test.dates)
    df_test['day'] = df_test.dates.dt.day
    df_test.head()
    #df = df[[]]
    prediction = clf.fit_predict(df_test)

    df_test['prediction'] = "Regular-Irregular"
    df_test.loc[prediction < 0, 'prediction'] = 'Discretionary'

    return df_test
Example #19
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    iris = load_iris()
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
Example #20
class ClusteringAlgorithm(object):

    def __init__(self, instances, conf):
        self.instances = instances
        self.conf = conf
        self.num_clusters = self.conf.num_clusters
        self.clustering = None

    @abc.abstractmethod
    def get_distortion(self):
        return

    @abc.abstractmethod
    def get_centroids(self):
        return

    def get_predicted_proba(self):
        return None

    def get_all_proba(self):
        return None

    def fit(self):
        self.pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('clustering', self.algo)])
        self.assigned_clusters = self.pipeline.fit_predict(
                                        self.instances.features.get_values())

    def generate(self, drop_annotated_instances=False):
        self.clustering = Clusters(self.instances,
                                   self.assigned_clusters,
                                   clustering_algo=self)
        self.clustering.generate(
                            self.get_centroids(),
                            drop_annotated_instances=drop_annotated_instances)

    def export(self, output_dir, quick=False):
        self.clustering.export(output_dir)
        self.clustering.gen_eval(output_dir, quick=quick)
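The base class leaves self.algo to concrete subclasses; a hypothetical KMeans-backed subclass (the name KMeansAlgorithm and its method bodies are illustrative, not from the original project) might look like this:

# Hypothetical concrete subclass; only self.algo and the two abstract
# methods are needed by the fit/generate flow above.
from sklearn.cluster import KMeans


class KMeansAlgorithm(ClusteringAlgorithm):

    def __init__(self, instances, conf):
        super(KMeansAlgorithm, self).__init__(instances, conf)
        # self.algo is what fit() wires into the scaler + clustering pipeline
        self.algo = KMeans(n_clusters=self.num_clusters)

    def get_distortion(self):
        return self.pipeline.named_steps['clustering'].inertia_

    def get_centroids(self):
        return self.pipeline.named_steps['clustering'].cluster_centers_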
Example #21
def cluster(list_of_texts: List[str], num_clusters: int=3) -> List[int]:
    """
    Cluster a list of texts into a given number of clusters,
    based on their tf-idf-weighted bag-of-word vectors.

    Args:
        list_of_texts: a list of untokenized texts
        num_clusters: the target number of clusters

    Returns: a list with the cluster id for each text, e.g. [0,1,0,0,2,2,1]
    """
    pipeline = Pipeline([
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clust", KMeans(n_clusters=num_clusters))
    ])

    try:
        clusters = pipeline.fit_predict(list_of_texts)
    except ValueError:
        clusters = list(range(len(list_of_texts)))

    return clusters
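A toy call of the cluster function defined above (the texts are made up); with three target clusters the result is one integer id per text, up to arbitrary label permutation:

texts = ["the cat sat on the mat",
         "a cat and a kitten",
         "stock markets fell sharply",
         "the recipe needs two eggs"]
print(cluster(texts, num_clusters=3))   # e.g. [0, 0, 1, 2]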
Example #22
                                    ('logistic', LogisticRegression(C=0.1))])

logistic_genre_cert_PCA.fit(data, binary12)

km4 = Pipeline([('genre_cert', genre_cert),
                ('svd', TruncatedSVD(n_components=2)),
                ('kmeans', KMeans(n_clusters=4))])

km4.fit(data)

data.loc[:, 'binary12'] = data['num_months_wait'].apply(makeBinary)

data_check = data[data['genre_name'].notnull() == True].reset_index()

# get groups
km4_check = pd.DataFrame(data = km4.fit_predict(data_check))\
.rename(columns = {0: 'cluster4'})

km4 = Pipeline([('genre_cert', genre_cert),
                ('svd', TruncatedSVD(n_components=2)),
                ('kmeans', KMeans(n_clusters=4))])

km4.fit(data)

data_all = pd.merge(data_check, km4_check, left_index=True, right_index=True)

cluster = [
    data_all[data_all['cluster4'] == i]['keyword_name'] for i in range(4)
]
comment_words = [' '.join(cluster[i]).replace('based', '').replace('novel', '')\
                 .replace('young adult', '').replace(',', '') for i in range(4)]
Example #23
    orders_numb_top = dt.orders[['user_id', 'order_number', 'order_id']].\
        sort_values(['user_id', 'order_number'], ascending=[1, 0]).\
        groupby('user_id').head(LAST_N_PRIORS)['order_id'].\
        values

    priors_filtered = dt.priors[dt.priors.order_id.isin(orders_numb_top)]
    users_prior['all_products'] = priors_filtered.groupby(
        'user_id')['product_id'].apply(set)
else:
    users_prior['all_products'] = dt.priors.groupby(
        'user_id')['product_id'].apply(set)

user_products = users_prior.all_products.apply(
    lambda x: " ".join([str(prod_id) for prod_id in x]))

clusters = pipeline.fit_predict(user_products)

ar_clust, ar_cnt = np.unique(clusters, return_counts=True)
max_clust = np.argmax(ar_cnt)
for cl, cnt in zip(ar_clust, ar_cnt):
    if cnt < 500:
        clusters[clusters == cl] = max_clust

# test the GAAC clusterer with 4 clusters
#clusterer = GAAClusterer(N_CLUSTERS, normalise=False)
#clusters = clusterer.cluster(X_svd, True)
pd.DataFrame({
    'user_id': user_products.index,
    'cluster': clusters
}).to_csv('../tmp/user_by_cluster.csv', index=False)
print('Done clustering', np.unique(clusters, return_counts=True))
Example #24
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn.pipeline import Pipeline

from analysis.data import GeographicArea, features
from analysis.scaler import SpatialWaterVapourScaler

file_pattern = 'data/input/METOPAB_20160801_global_evening.nc'
area = GeographicArea(lat=(-25, 50), lon=(-45, 60))
df = area.import_dataset(file_pattern)
X = df[features].values

# create estimators
scaler = SpatialWaterVapourScaler(km=60, H2O=0.1, delD=10)
# cluster = DBSCAN(eps=2.4, min_samples=14)
cluster = HDBSCAN(min_cluster_size=14, gen_min_span_tree=True)

# create pipeline
pipeline = Pipeline([('scaler', scaler), ('cluster', cluster)])

y = pipeline.fit_predict(X)

subarea = GeographicArea(lat=(-20, 0), lon=(22, 50))
area.subarea_plot(X, y, subarea=subarea, include_noise=True)

# print('dbcv score: ', cluster.relative_validity_)
Example #25
    ('svd', TruncatedSVD(100, random_state=appconfig['random_state'])),
    ('normalizer', Normalizer(copy=False)),
    ('clustering', MiniBatchKMeans(random_state=appconfig['random_state']))
])


##
# clustering routine

print('clustering')
for index, classe in enumerate(appconfig['classification']['allowed_classes']):

    corpus = [contrato['corpo'] for contrato in classes_contratos[classe]]

    pipeline.set_params(clustering__n_clusters=appconfig['clustering']['num_clusters'][index])
    predictions = pipeline.fit_predict(corpus)

    for index, prediction in enumerate(predictions):
        classes_contratos[classe][index]['_cluster'] = np.asscalar(prediction)


##
# persisting

# flatten classes_contratos values
clusterized_contratos = reduce(lambda x,y: x+y, classes_contratos.values())

print('persisting results')
with dbi.opensession() as session:

    predicoes = Predicao_Contrato.__table__
Example #26
__author__ = 'Romain Tavenard romain.tavenard[at]univ-rennes2.fr'

ref_dim = 0
s = DTWSampler(scaling_col_idx=ref_dim, reference_idx=0, interp_kind="linear")
km = KMeans(n_clusters=3)

data = []
data.append(numpy.loadtxt("data/Xi_ref.txt"))
data.append(numpy.loadtxt("data/Xi_0.txt"))
data.append(numpy.loadtxt("data/Xi_1.txt"))

d = data[0].shape[1]

max_sz = max([ts.shape[0] for ts in data])
n_rep = 5

npy_arr = numpy.zeros((len(data) * n_rep, max_sz, d)) + numpy.nan
std_per_d = None
for idx_rep in range(n_rep):
    for idx, ts in enumerate(data):
        sz = ts.shape[0]
        npy_arr[idx + idx_rep * len(data), :sz] = (
            ts + 0.1 * numpy.random.randn(sz, d) * ts.std(axis=0))
npy_arr = npy_arr.reshape(-1, max_sz * d)

dtw_kmeans = Pipeline([('dtw_sampler', s), ('l2-kmeans', km)])

print(dtw_kmeans.fit_predict(npy_arr))
Example #27
	def clustering_captcha(self, image_path, check=False):
		"""对验证码图像进行聚类操作以分离出验证码图片中的各个字符

		参数
		----
		image_path: str
			单个验证码图片的绝对路径
		check: bool
			是否对聚类后的验证码图片检查聚类效果及基于列的像素点分布图

		返回值
		----
		(image_vectors, col_npixs): tuple [2]
			长度为2的tuple,其中tuple的第一个对象为根据聚类得到的除背景以外的
			所有类的像素矩阵,tuple的第二个对象为第一个对象所形成图像的每一列
			的非背景像素个数
			image_vectors: {array-like} [self.width * self.height, self.n_chars + 1]
			col_npixs: {array-like} [self.width, self.n_chars + 1]
		"""
		image = self.de_noise(image_path)
		image_pixs = np.array(image.getdata())
		image_pixs = image_pixs.astype(np.float64)  # the bare np.float alias was removed in NumPy 1.24

		sc = StandardScaler()
		km = KMeans(n_clusters=(self.n_chars + 2))
		clu = Pipeline(steps=[('sc', sc), ('km', km)])
		clusters = clu.fit_predict(image_pixs)

		image_vectors = np.zeros((self.n_chars+2, self.width*self.height))
		col_npixs = np.zeros((self.n_chars+2, self.width))

		for i in np.unique(clusters):
			image_vectors[i, clusters == i] = 1
			image_vectors[i, :] = self.de_line(image_vectors[i, :])
			col_npixs[i, :] = image_vectors[i, :].reshape((
				self.height, self.width)).sum(axis=0)
		cluster_bkg = np.argmax(col_npixs.sum(axis=1))
		image_vectors = np.delete(image_vectors, (cluster_bkg), axis=0)
		col_npixs = np.delete(col_npixs, (cluster_bkg), axis=0)

		if check:
			if not self.checking_path:
				self.checking_path = os.path.join(self.training_images_path, 'checking')

			if not os.path.isdir(self.checking_path):
				os.mkdir(self.checking_path)

			clusters_path = os.path.join(self.checking_path, 'clusters')
			if not os.path.isdir(clusters_path):
				os.mkdir(clusters_path)

			n_clusters = col_npixs.shape[0]
			img_name = os.path.split(image_path)[1].split('.')[0]
			for i in range(n_clusters):
				new_img_name = os.path.join(clusters_path, 
					img_name + '_cluster' + str(i) + '_img' + '.jpg')
				new_fig_name = os.path.join(clusters_path, 
					img_name + '_cluster' + str(i) + '_fig' + '.jpg')

				im_new = Image.new('1', (self.width, self.height))
				im_new.putdata(image_vectors[i, :])
				im_new.save(new_img_name)
				plt.plot(col_npixs[i, :])
				plt.savefig(new_fig_name)
				plt.close('all')

		return (image_vectors, col_npixs)
Example #28
def main():

    datatrain = pd.read_excel(
        "../PAN-15/logregexcel_PAN-15trainlargeconcatenated.xlsx",
        names=["Folder", "labels", "Text1", "Text2"])
    dataframetrain = transform_data(datatrain)
    #dataframetrain = dataframetrain.sample(frac=1)
    Xtrain = dataframetrain['text'].tolist()
    Ytrain = dataframetrain['labels'].tolist()

    datatest = pd.read_excel(
        "../PAN-15/logregexcel_PAN-15testlargeconcatenated.xlsx",
        names=["Folder", "labels", "Text1", "Text2"])
    dataframetest = transform_data(datatest)
    #dataframetest = dataframetest.sample(frac=1)
    Xtest = dataframetest['text'].tolist()
    Ytest = dataframetest['labels'].tolist()

    vec = TfidfVectorizer(preprocessor=preprocessor)

    classifier = Pipeline([('vec', vec), ('cls', KMeans(n_clusters=2))])

    classifier.fit(Xtrain, Ytrain)

    try:
        X_prep = vec.fit_transform(Xtest).toarray()
        labels = classifier.fit_predict(Xtest)
        pca = PCA(n_components=2).fit(X_prep)
        coords = pca.transform(X_prep)
        label_colors = [
            "red", "blue", "green", "yellow", "black", "purple", "cyan"
        ]
        colors = [label_colors[i] for i in labels]
        plt.scatter(coords[:, 0], coords[:, 1], c=colors)
        centroids = classifier.named_steps['cls'].cluster_centers_
        centroid_coords = pca.transform(centroids)
        plt.scatter(centroid_coords[:, 0],
                    centroid_coords[:, 1],
                    marker="X",
                    s=200,
                    linewidth=2,
                    c="#444d61")
        plt.show()
    except:
        pass

    try:
        coefs = classifier.named_steps['cls'].coef_
        print(coefs)
        features = classifier.named_steps['vec'].get_feature_names()
        print_n_most_informative_features(coefs, features, 10)
        print()
    except:
        pass
    Yguess = classifier.predict(Xtest)
    Ylist = []
    for i in Yguess:
        if i < 0.5:
            Ylist.append(0)
        else:
            Ylist.append(1)

    print(classification_report(Ytest, Ylist))
    print(accuracy_score(Ytest, Ylist))
Example #29
def compare_clustering(data, storetofile):
    preprocessor = get_preprocessor()
    kmeans = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KMeans(n_clusters=2, init='random',
                              algorithm='full', random_state=42))])

    spectral = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SpectralClustering(n_clusters=2,
                                          assign_labels='discretize',
                                          random_state=42))])

    gaussian = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianMixture(n_components=2, n_init=10,
                                       init_params='random',
                                       random_state=42))])

    X = data.drop(['IstKunde'], axis=1)
    y_kmeans = kmeans.fit_predict(X)
    y_spectral = spectral.fit_predict(X)
    y_gaussian = gaussian.fit_predict(X)

    data_kmeans = data.copy()
    data_kmeans['IstKunde'] = y_kmeans
    data_spectral = data.copy()
    data_spectral['IstKunde'] = y_spectral
    data_gaussian = data.copy()
    data_gaussian['IstKunde'] = y_gaussian

    sns.set(style="ticks")
    f1 = sns.pairplot(data_kmeans,
                      vars=('Land_ID', 'Branche_ID', 'Mitarbeiteranzahl',
                            'Umsatz', 'Wachstum'),
                      hue="IstKunde")
    f1.fig.canvas.set_window_title('Kmeans-Clustering Scattermatrix')
    f2 = sns.pairplot(data_spectral,
                      vars=('Land_ID', 'Branche_ID', 'Mitarbeiteranzahl',
                            'Umsatz', 'Wachstum'),
                      hue="IstKunde")
    f2.fig.canvas.set_window_title('Spectral-Clustering Scattermatrix')
    f3 = sns.pairplot(data_gaussian,
                      vars=('Land_ID', 'Branche_ID', 'Mitarbeiteranzahl',
                            'Umsatz', 'Wachstum'),
                      hue="IstKunde")
    f3.fig.canvas.set_window_title('Gaussian-Mixture-Clustering Scattermatrix')

    #    pca = Pipeline(steps=[('preprocessor', preprocessor), ('pca', PCA(n_components=2))])
    pca = PCA(n_components=2)
    arr_2d = pca.fit_transform(X)
    plt.figure(figsize=(15, 8))
    colors = ['red', 'navy']
    target_names = ['KeinKunde', 'IstKunde']
    lw = 2
    plt.title('PCA of Customer dataset: Cluster Comparison')
    plt.subplot(2, 2, 1, title='Kmeans')
    for color, i, target_name in zip(colors, [0, 1], target_names):
        plt.scatter(arr_2d[y_kmeans == i, 0],
                    arr_2d[y_kmeans == i, 1],
                    color=color,
                    alpha=.8,
                    lw=lw,
                    label=target_name)

    plt.subplot(2, 2, 2, title='Spectral')
    for color, i, target_name in zip(colors, [0, 1], target_names):
        plt.scatter(arr_2d[y_spectral == i, 0],
                    arr_2d[y_spectral == i, 1],
                    color=color,
                    alpha=.8,
                    lw=lw,
                    label=target_name)

    plt.subplot(2, 2, 3, title='Gaussian')
    for color, i, target_name in zip(colors, [0, 1], target_names):
        plt.scatter(arr_2d[y_gaussian == i, 0],
                    arr_2d[y_gaussian == i, 1],
                    color=color,
                    alpha=.8,
                    lw=lw,
                    label=target_name)

    plt.legend(loc='best', shadow=False, scatterpoints=1)

    print("Kmeans-Clustering Information grouped IstKunde:")
    print(data_kmeans.groupby(['IstKunde', 'Land_ID', 'Branche_ID']).count())
    print("Spectral-Clustering Information grouped IstKunde:")
    print(data_spectral.groupby(['IstKunde', 'Land_ID', 'Branche_ID']).count())
    print("Gaussian-Mixture-Clustering Information grouped IstKunde:")
    print(data_gaussian.groupby(['IstKunde', 'Land_ID', 'Branche_ID']).count())

    if storetofile:
        pd.DataFrame.to_csv(
            data_kmeans,
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_kmeans.csv",
            float_format="%.2f")
        pd.DataFrame.to_csv(
            data_spectral,
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_spectral.csv",
            float_format="%.2f")
        pd.DataFrame.to_csv(
            data_gaussian,
            "C:/Users/dakoch/Downloads/CustomerClustering/customer_cluster_gaussian.csv",
            float_format="%.2f")

    plt.show()
Example #30
class ClusteringModel:
    """
    ClusteringModel encapsulates all the components needed to encode a list of images
    according to the extracted words and the layout of words, and use these as features
    for an unsupervised clustering using DBSCAN. Other clustering methods may be more
    suitable for your dataset, e.g. k-means or agglomerative clustering or HDBSCAN.

    This model primarily interacts with the data via a Pandas Dataframe that contains
    location of the image files. This assumes that the OCR results have been fetched, 
    and stored in the same directory according to the <image_name.jpg>.json convention.
    """
    def __init__(self,
                 layout_shape: (int, int),
                 vocabulary_size: int,
                 ocr_provider: object,
                 n_pca_components: int = 200,
                 vocabulary: List[str] = None,
                 stopwords: List[str] = None,
                 pipeline: Pipeline = None):
        """
        Constructor for a clustering model

        :param layout_shape: The dimensions for the layout encoding. (50, 79) works well for credit cards sized images.
        :param vocabulary_size: The size of the vocabulary for the word encoding. 2000-3000 works well for a large number of unrecognized cards.
        :param ocr_provider: An instance of an OCR provider class used to get the words from the image
        :param n_pca_components: The number of desired components for PCA on the word encoding 
        and the layout encoding. The actual number of components is limited by the number of rows of data.
        :param vocabulary: A pre-defined vocabulary if available
        :param stopwords: A list of stopwords to filter out if the vocabulary is regenerated
        :param pipeline: An sklearn pipeline containing the PCAs for the word and layout encoding
        """
        self.layout_shape = layout_shape
        self.vocabulary_size = vocabulary_size
        self.ocr_provider = ocr_provider
        self.n_pca_components = n_pca_components
        self.stopwords = stopwords
        if vocabulary is not None:
            self.encoder = WordAndLayoutEncoder(vocabulary, layout_shape)
        else:
            self.encoder = None
        self.pipeline = pipeline

    def _generate_vocabulary(self, data: pd.DataFrame, image_name_column: str):
        """
        Adapted from plan_agnostic_vocabulary_vector in RoutingClassifier.ipynb

        :param data: Pandas DataFrame containing all the images to be clustered
        :param image_name_column: Column in the dataframe with the filename
        :returns: a list containing the most frequent words in the OCR text for these images
        """

        logging.info(
            f"Counting extracted words across all images to generate the encoding vocabulary"
        )

        # Finds the most popular words out of a bag comprised of all plans
        # Guarantees a length based on vocabulary_size
        count = 0
        counter = Counter()

        for index, row in data.iterrows():
            try:
                filename = data.loc[index, image_name_column]

                ocr_results = self.ocr_provider.get_ocr_results(filename)

                for word in ocr_results:
                    if not self.stopwords or (word.text.lower()
                                              not in self.stopwords):
                        counter.update({word.text: 1})

                count += 1
                if count % 5000 == 0:
                    logging.info(
                        f"Processed {count} images for vocabulary generation")
            except:
                logging.error("Could not locate image file: {}".format(
                    row[image_name_column]))
                raise

        # Create the vocabulary vector based on the most common words
        vocabulary_vector = []
        for word in counter.most_common(self.vocabulary_size):
            vocabulary_vector.append(word[0])

        return vocabulary_vector

    def _encode_dataset(self, data: pd.DataFrame, image_name_column: str):
        """
        Encode all the images designated in the data DataFrame into the word+layout encoding
        by running OCR API (with local caching via the ocr_results utility function)

        :param data: a pandas DataFrame containing a list of images and their metadata
        :param image_name_column: column in the dataframe that has the file paths in the blob storage container
        :returns: a 2D numpy array and an array mask.
        The 2D numpy arrays contains the concatenated word and layout encoding for each encoded image.
        The mask is an array of the same length as the original data.
        A zero entry denotes an image that failed to encode; a one denotes a successfully encoded image.
        """

        empty_ocr_count = 0
        mask = np.zeros(len(data))
        encoded_data = np.zeros((len(data), self.vocabulary_size +
                                 self.layout_shape[0] * self.layout_shape[1]))

        counter = 0
        for index, row in data.iterrows():
            try:
                filename = data.loc[index, image_name_column]
                ocr_results = self.ocr_provider.get_ocr_results(filename)

                if len(ocr_results) == 0:
                    empty_ocr_count += 1
                else:
                    mask[counter] = 1
                    encodings = self.encoder.encode_ocr_results(ocr_results)
                    encoded_data[counter, :] = encodings

            except:
                logging.error("Could not locate blob: {}".format(
                    row[image_name_column]))
                raise

            counter += 1

        if empty_ocr_count > 0:
            logging.warning(
                "Empty OCR results resulting in null entries for {} images".
                format(empty_ocr_count))

        return encoded_data, mask

    def find_clusters(self,
                      data: pd.DataFrame,
                      image_name_column: str,
                      min_samples: int = 10,
                      epsilon: float = 3):
        """
        Encode the dataset and perform clustering via the following steps:
        1) constructing a vocabulary if it is not already supplied, 
        2) encode the images based on the presence of the vocabulary words and the bounding boxes 
        of the detected text on the image grid. This is accomplished via the `WordAndLayoutEncoder` 
        available in the `Routing_Forms` example.
        3) apply PCA to each encoding component independently then run clustering on the dataset

        The final number of components from applying PCA is determined by the min of the specified
        `n_pca_components` and the size of the data. The resulting encoding is expected to be 
        an array of size 2 * number of components.

        :param data: a Pandas Dataframe containing the image metadata / filename
        :param image_name_column: the column name in the dataframe with the filename
        :param min_samples: DBSCAN parameter controlling the number of samples 
        in a neighborhood for a point to be considered as a core point.
        :param epsilon: DBSCAN parameter controlling the maximum distance between two samples
        for one to be considered as in the neighborhood of the other.
        :returns: a copy of the data with the "cluster" column added or overwritten, 
        a dataframe containing the encoding with PCA applied (for further data visualization, for example), 
        and the vocabulary used for the word encoding

        """

        # Produce word and layout encoding from the images; there may be empty rows due to failed OCR on an image
        vocabulary = None
        if self.encoder is None:
            vocabulary = self._generate_vocabulary(data, image_name_column)
            self.encoder = WordAndLayoutEncoder(vocabulary, self.layout_shape)

        (encoding, mask) = self._encode_dataset(data, image_name_column)

        if sum(mask) == data.shape[0]:
            logging.info(f"All {sum(mask)} images are successfully encoded")
        else:
            logging.error(
                f"{data.shape[0] - sum(mask)} images failed encoding")

        # Remove the empty rows before applying PCA
        encoding = encoding[mask == 1, :]
        self.n_pca_components = min(self.n_pca_components, encoding.shape[0])

        transformer = ColumnTransformer([
            ("word_pca", PCA(n_components=self.n_pca_components),
             list(range(0, self.vocabulary_size))),
            ("layout_pca", PCA(n_components=self.n_pca_components),
             list(
                 range(
                     self.vocabulary_size, self.vocabulary_size +
                     self.layout_shape[0] * self.layout_shape[1])))
        ])
        dbscan = DBSCAN(eps=epsilon,
                        min_samples=min_samples,
                        metric="euclidean",
                        leaf_size=40)

        self.pipeline = Pipeline([("pca", transformer), ("dbscan", dbscan)])
        Y = self.pipeline.fit_predict(encoding)

        data_copy = data.copy()
        data_copy = data_copy.drop(["cluster"], axis=1, errors="ignore")
        data_copy.loc[mask == 1, "cluster"] = Y

        # A bit of extra work to return the encodings with PCA applied to help with data visualization
        encoded_data = pd.DataFrame(self.pipeline["pca"].transform(encoding))

        return (data_copy, encoded_data, vocabulary)
Example #31
                            param_grid=parms,
                            scoring="v_measure_score",
                            cv=[(range(0,len(data)), range(0,len(data)))]) # do not need CV

    parms_result=gs_cluster.fit(data,text_data.labels_true())
    print(parms_result.best_score_)
    print(parms_result.best_params_)
    '''

    result = []

    for g in list(model_selection.ParameterGrid(params)):
        print()
        print(g)
        texf_cluster.set_params(**g)
        labels_pred = texf_cluster.fit_predict(data)
        print(labels_pred)
        count_table = score_data.count_table(text_data.init_num_by_cls,
                                             labels_pred,
                                             g['KMeans__n_clusters'])
        print(count_table)
        #total_entropy=score_data.total_entropy(count_table)
        #print("Total Entropy:",total_entropy)
        print(
            "homogeneity score, completeness score, v score:",
            metrics.homogeneity_completeness_v_measure(text_data.labels_true(),
                                                       labels_pred))
        print(
            "Adjusted Mutual Information:",
            metrics.adjusted_mutual_info_score(text_data.labels_true(),
                                               labels_pred))
Example #32
    print('Cluster ' + str(i) + ': ')
    print('Number Of Purchases: ' +
          str(pipelineClustering['kmeans'].cluster_centers_[i][0]))
    print('Days From Last Purchase: ' +
          str(pipelineClustering['kmeans'].cluster_centers_[i][1]))
    print('Days From First Purchase: ' +
          str(pipelineClustering['kmeans'].cluster_centers_[i][2]))
    print('Total Revenue: ' +
          str(pipelineClustering['kmeans'].cluster_centers_[i][3]))

# Find which cluster each customer belongs to in our dataset
# Again, not including the customer id column because its numerical value says nothing about the customer's habits
# Creates new column for each customer noting the cluster they belong to.
# We now have our dataset where each customer has a column for the cluster they belong to
data['Cluster Category'] = pd.Series(pipelineClustering.fit_predict(data[[
    'Number Of Purchases', 'Days From Last Purchase',
    'Days From First Purchase', 'Total Revenue'
]]._get_numeric_data().dropna(axis=1)),
                                     index=data.index)

data['Cluster Category'].replace(
    {
        0: 'New Customer',
        1: 'Loyal Customer',
        2: 'Non-frequent Customer',
        3: 'High Spender/Loyal Customer'
    },
    inplace=True)

data = data[[
    'CustomerID', 'Number Of Purchases', 'Days From Last Purchase',
    'Days From First Purchase', 'Total Revenue', 'Cluster Category'
Example #33
#################################################
############# Pipeline #############
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
############# 1. impute + glm
steps = [('imputation', imp), ('logistic_regression', logreg)]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
pipe.score(X_test, y_test)

############# 2. scale + KMeans
steps = [('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters = 4))]
pipe = Pipeline(steps)
y_clus = pipe.fit_predict(X)

############# 3. scale + knn
# Setup the pipeline steps: steps
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
# Create the pipeline: pipeline
pipe = Pipeline(steps)
# Specify the hyperparameter space
param_grid = {'knn__n_neighbors': np.arange(1, 50)}              # 'step name'__'parameter name'
# Create the GridSearchCV object: cv
cv = GridSearchCV(pipe, param_grid, cv = 5)
# Fit to the training set
cv.fit(X_train, y_train)

y_pred = cv.predict(X_test)

############# 4. scale + SVM
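The snippet is cut off after this last heading; a minimal sketch of the scale + SVM pipeline it announces (assuming the same X_train/y_train/X_test/y_test and imports as the earlier steps) could look like this:

from sklearn.svm import SVC

steps = [('scaler', StandardScaler()), ('svm', SVC(kernel='rbf', C=1.0))]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))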
Example #34
class Sekitei:
    def __init__(self):
        self.proba = {}
        self.quota = {}
        self.is_taken = {}
        self.keys = []
        self.cluster_expressions = {}
        self.delta = {}
        self.part = {}
        self.bad_part = {}
        self.i = 0
        self.j = 0
        self.cluster_expressions_help = {}
        self.model = Pipeline([
            ('scaler', StandardScaler()),
            # ('clustering', Birch(n_clusters=20, threshold=0.1))])
            # ('clustering', AgglomerativeClustering(n_clusters=20))])
            # ('clustering', DBSCAN(eps=3, min_samples=5))])
            ('clustering', KMeans(n_clusters=20))
        ])
        self.classifier = Pipeline([
            ('scaler', StandardScaler()),
            # ('classification', LogisticRegression(C=10000))]) # eps = -0.05; k = 10
            # ('classification', LinearSVC())])
            # ('classification', KNeighborsClassifier())])
            # ('classification', BernoulliNB(alpha=0.5))]) # eps = 0.5; k = 5
            ('classification', DecisionTreeClassifier(criterion='entropy'))
        ])
        self.check_functions = []
        self.parameters = []
        self.T = time.time()

    def _segments(self, segments, param):
        if len(segments) == param['n']:
            return True
        else:
            return False

    def _param(self, segments, param):
        if re.search('[\?&]' + param['p'] + '([\&\/].*)?$', url) is not None:
            return True
        else:
            return False

    def _param_name(self, segments, param):
        if re.search('[\?&]' + param['p'] + '=', url) is not None:
            return True
        else:
            return False

    def _segment_name(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if segments[param['i']] == param['s']:
            return True
        else:
            return False

    def _segment_09(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if segments[param['i']].isdigit():
            return True
        else:
            return False

    def _segment_substr_09(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if re.search('[^\d]+\d+[^\d]+$', segments[param['i']]) is not None:
            return True
        else:
            return False

    def _segment_ext(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        ext = segments[param['i']].split('.')
        if len(ext) > 1:
            if ext[-1].lower() == param['ext']:
                # if re.search('\.' + param['ext'] + '$', segments[param['i']]) is not None:
                return True
        return False

    def _segment_ext_substr_09(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        ext = segments[param['i']].split('.')
        if len(ext) > 1:
            if ext[-1].lower() == param['ext'] and re.search(
                    '[^\d]+\d+[^\d]+$', segments[param['i']]) is not None:
                # if re.search('\.' + param['ext'] + '$', segments[param['i']]) is not None and re.search('[^\d]+\d+[^\d]+$', segments[param['i']]) is not None:
                return True
        return False

    def _segment_len(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if len(segments[param['i']]) == param['L']:
            return True
        else:
            return False

    def _segment_2points(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        wik = segments[param['i']].split(':')
        if len(wik) != 1:  # and wik[0] == param['wik']:
            return True
        else:
            return False

    def _segment_strix(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        strix = segments[param['i']].split('_')
        if param['strix'] == len(strix):
            return True
        else:
            return False

    def _segment_strix_quote(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        strix = segments[param['i']].split('_')
        if len(strix) == 0:
            return False
        if ',' in strix[0]:
            return True
        else:
            return False

    def _segment_smile(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if '(' in segments[param['i']]:
            return True
        else:
            return False

    def _segment_ru(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if re.search('[А-Яа-я]', segments[param['i']]) is not None:
            return True
        else:
            return False

    def _segment_in_br(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        m = re.search('\((.*)\)', segments[param['i']])
        if m is not None:
            data = m.groups()[0].split('_')
            if len(data) == param['data']:
                return True
        return False

    def _segment_defis(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if '-' in segments[param['i']]:
            return True
        else:
            return False

    def _segment_start_dig(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if re.match('[0-9]+', segments[param['i']]):
            return True
        else:
            return False

    def _segment_more(self, segments, param):
        if len(segments) <= param['i']:
            return False
        pos = segments[param['i']].find('?')
        if pos != -1:
            segments[param['i']] = segments[param['i']][:pos]
        if len(segments[param['i']]) > 15:
            return True
        else:
            return False

    def init_one(self, feature):
        m = re.match('segments:([0-9]+)$', feature)
        if m is not None:
            return self._segments, {'n': int(m.groups()[0])}
        m = re.match('param:(.*)$', feature)
        if m is not None:
            return self._param, {'p': m.groups()[0]}
        m = re.match('param_name:(.*)$', feature)
        if m is not None:
            return self._param_name, {'p': m.groups()[0]}
        m = re.match('segment_name_([0-9]+):(.*)$', feature)
        if m is not None:
            return self._segment_name, {
                'i': int(m.groups()[0]),
                's': m.groups()[1]
            }
        m = re.match('segment_\[0\-9\]_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_09, {'i': int(m.groups()[0])}
        m = re.match('segment_substr\[0\-9\]_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_substr_09, {'i': int(m.groups()[0])}
        m = re.match('segment_ext_([0-9]+):(.*)$', feature)
        if m is not None:
            return self._segment_ext, {
                'i': int(m.groups()[0]),
                'ext': m.groups()[1]
            }
        m = re.match('segment_ext_substr\[0\-9\]_([0-9]+):(.*)$', feature)
        if m is not None:
            return self._segment_ext_substr_09, {
                'i': int(m.groups()[0]),
                'ext': m.groups()[1]
            }
        m = re.match('segment_len_([0-9]+):([0-9]+)$', feature)
        if m is not None:
            return self._segment_len, {
                'i': int(m.groups()[0]),
                'L': int(m.groups()[1])
            }
        m = re.match('segment_2points_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_2points, {'i': int(m.groups()[0])}
        m = re.match('segment_strix_([0-9]+):([0-9]+)$', feature)
        if m is not None:
            return self._segment_strix, {
                'i': int(m.groups()[0]),
                'strix': int(m.groups()[1])
            }
        m = re.match('segment_strix_quote_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_strix_quote, {'i': int(m.groups()[0])}
        m = re.match('segment_smile_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_smile, {'i': int(m.groups()[0])}
        m = re.match('segment_ru_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_ru, {'i': int(m.groups()[0])}
        m = re.match('segment_in_br_([0-9]+):(.*)$', feature)
        if m is not None:
            return self._segment_in_br, {
                'i': int(m.groups()[0]),
                'data': int(m.groups()[1])
            }
        m = re.match('segment_defis_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_defis, {'i': int(m.groups()[0])}
        m = re.match('segment_start_dig_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_start_dig, {'i': int(m.groups()[0])}
        m = re.match('segment_more_([0-9]+):1$', feature)
        if m is not None:
            return self._segment_more, {'i': int(m.groups()[0])}
        #print('ooops', feature)
        return False, False

    def init_functions(self, keys):
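        # Build parallel lists of check functions and parameter dicts,
        # one pair per selected feature key.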
        for key in keys:
            f, p = self.init_one(key)
            self.check_functions.append(f)
            self.parameters.append(p)

    def check_url(self, url):
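        # Turn a single URL into a 1 x N binary feature vector by running
        # every check function over its percent-decoded path segments
        # (Python 2 urllib.unquote, decoded as UTF-8 with a cp1251 fallback).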
        N = len(self.keys)
        X = np.zeros((1, N))
        segments = url.split('/')[3:]
        if segments:
            if segments[-1] == '\n' or segments[-1] == '':
                del segments[-1]
            elif segments[-1][-1] == '\n':
                segments[-1] = segments[-1][:-1]
        for i in range(len(segments)):
            try:
                segments[i] = urllib.unquote(segments[i]).decode('utf8')
            except UnicodeDecodeError:
                try:
                    segments[i] = urllib.unquote(segments[i]).decode('cp1251')
                except UnicodeDecodeError:
                    pass
        for i in range(N):
            X[0, i] = self.check_functions[i](segments, self.parameters[i])
        return X

    def extract_features(self, URLS):
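        # First pass over the training URLs: count how often each candidate
        # feature (segment count, query params, per-segment properties)
        # occurs, keep the frequent ones as self.keys, then build the binary
        # design matrix X with check_url.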
        result = Counter()
        X_ = {}
        for line in URLS:
            X_[line] = []
            segments = line.split('/')[3:]
            if segments:
                if segments[-1] == '\n' or segments[-1] == '':
                    del segments[-1]
                elif segments[-1][-1] == '\n':
                    segments[-1] = segments[-1][:-1]
            result['segments:' + str(len(segments))] += 1
            X_[line].append('segments:' + str(len(segments)))
            if (len(segments) == 0):
                continue
            for i in range(len(segments)):
                segment = segments[i]
                try:
                    segment = urllib.unquote(segment).decode('utf8')
                except UnicodeDecodeError:
                    try:
                        segment = urllib.unquote(segment).decode('cp1251')
                    except UnicodeDecodeError:
                        pass
                if '?' in segment:
                    mb_par = segment.split('?')
                    params = mb_par[1].split('&')
                    for p in params:
                        result['param:' + p] += 1
                        X_[line].append('param:' + p)
                        result['param_name:' + p.split('=')[0]] += 1
                        X_[line].append('param_name:' + p.split('=')[0])
                    segment = mb_par[0]
                result['segment_name_' + str(i) + ':' + segment] += 1
                X_[line].append('segment_name_' + str(i) + ':' + segment)
                if segment.isdigit():
                    result['segment_[0-9]_' + str(i) + ':1'] += 1
                    X_[line].append('segment_[0-9]_' + str(i) + ':1')
                if re.search(r'[^\d]+\d+[^\d]+$', segment) is not None:
                    result['segment_substr[0-9]_' + str(i) + ':1'] += 1
                    X_[line].append('segment_substr[0-9]_' + str(i) + ':1')
                ext = segment.split('.')
                if len(ext) > 1:
                    result['segment_ext_' + str(i) + ':' +
                           ext[-1].lower()] += 1
                    X_[line].append('segment_ext_' + str(i) + ':' +
                                    ext[-1].lower())
                if len(ext) > 1 and re.search(r'[^\d]+\d+[^\d]+$',
                                              segment) is not None:
                    result['segment_ext_substr[0-9]_' + str(i) + ':' +
                           ext[-1].lower()] += 1
                    X_[line].append('segment_ext_substr[0-9]_' + str(i) + ':' +
                                    ext[-1].lower())
                wik = segment.split(':')
                if len(wik) != 1:
                    #result['segment_2points_' + str(i) + ':' + wik[0]] += 1
                    #X_[line].append('segment_2points_' + str(i) + ':' + wik[0])
                    result['segment_2points_' + str(i) + ':1'] += 1
                    X_[line].append('segment_2points_' + str(i) + ':1')
                strix = segment.split('_')
                if len(strix) > 1:
                    result['segment_strix_' + str(i) + ':' +
                           str(len(strix))] += 1
                    X_[line].append('segment_strix_' + str(i) + ':' +
                                    str(len(strix)))
                if len(strix) > 0:
                    if ',' in strix[0]:
                        result['segment_strix_quote_' + str(i) + ':1'] += 1
                        X_[line].append('segment_strix_quote_' + str(i) + ':1')
                result['segment_len_' + str(i) + ':' + str(len(segment))] += 1
                X_[line].append('segment_len_' + str(i) + ':' +
                                str(len(segment)))
                if '(' in segment:
                    result['segment_smile_' + str(i) + ':1'] += 1
                    X_[line].append('segment_smile_' + str(i) + ':1')
                m = re.search(r'\((.*)\)', segment)
                if m is not None:
                    data = m.groups()[0].split('_')
                    result['segment_in_br_' + str(i) + ':' +
                           str(len(data))] += 1
                    X_[line].append('segment_in_br_' + str(i) + ':' +
                                    str(len(data)))
                if re.search('[А-Яа-я]', segment) is not None:
                    result['segment_ru_' + str(i) + ':1'] += 1
                    X_[line].append('segment_ru_' + str(i) + ':1')
        #      if '-' in segment:
        #          result['segment_defis_' + str(i) + ':1'] += 1
        #          X_[line].append('segment_defis_' + str(i) + ':1')
        #      if re.match('[0-9]+', segment):
        #          result['segment_start_dig_' + str(i) + ':1'] += 1
        #          X_[line].append('segment_start_dig_' + str(i) + ':1')
                if len(segment) > 15:
                    result['segment_more_' + str(i) + ':1'] += 1
                    X_[line].append('segment_more_' + str(i) + ':1')

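        # Keep only features seen in more than 100 URLs; these become the
        # model's binary feature set (self.keys).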
        for key in result.keys():
            if result[key] > 100:
                self.keys.append(key)
        self.init_functions(self.keys)
        # print self.keys
        X = np.zeros((len(URLS), len(self.keys)))
        for j, url in enumerate(URLS):
            X[j, :] = self.check_url(url)
            '''
            for i, key in enumerate(self.keys):
                if (key in X_[url]) != X[j, i]:
                    print('f**k', key, url, X[j, i], key in X_[url])
            '''
        return X

    def fit_model(self, QLINK_URLS, UNKNOWN_URLS, QUOTA):
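        # Fit the feature extractor, the clustering model and the classifier
        # on known quality links (QLINK_URLS, y=1) plus unknown URLs (y=0),
        # then derive per-cluster sampling probabilities, quotas and distance
        # thresholds used at fetch time.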
        self.__init__()
        URLS = QLINK_URLS + UNKNOWN_URLS
        X = self.extract_features(URLS)
        y = np.zeros((len(QLINK_URLS) + len(UNKNOWN_URLS)))
        y[:len(QLINK_URLS)] = 1
        clusters = self.model.fit_predict(X)
        self.classifier.fit(X, y)
        self.un_clusters, counts = np.unique(clusters, return_counts=True)
        #print counts, self.keys
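        # Hand-tuned heuristics: eps shifts the per-cluster sampling
        # probability, Delta pads the cluster distance threshold, and k
        # scales the per-cluster quota (adjusted when the classifier
        # separates the training set poorly).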
        eps = -0.09
        Delta = 20
        dupl = 1
        a = 0
        b = 0
        k = 10
        zero = 0
        if self.classifier.score(X, y) < 0.7:
            k = 1.7
            eps = 0.27
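        # Per-cluster statistics: sampling probability, quota, a binary
        # "expression" vector of the features typical for the cluster, a
        # distance threshold (delta), and the share of quality / non-quality
        # training URLs that fell into it.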
        for cluster, count in np.dstack((self.un_clusters, counts))[0]:
            self.proba[cluster] = np.min((np.max(
                (np.sum(y[clusters == cluster]) / count - eps, 0)), 1))
            self.is_taken[cluster] = 0
            #self.quota[cluster] = np.ceil(QUOTA * np.sum(y[clusters == cluster]) / len(QLINK_URLS))
            min_quota = QUOTA / len(QLINK_URLS) * k
            #self.quota[cluster] = np.ceil(k * np.sum(y[clusters == cluster]) + (QUOTA - k * np.sum(y)) * np.sum(1 - y[clusters == cluster]) / np.sum(1 - y))
            self.quota[cluster] = min_quota * np.sum(
                y[clusters == cluster]) + 100
            self.cluster_expressions_help[cluster] = np.mean(
                X[clusters == cluster], axis=0) > 0.5
            self.cluster_expressions[cluster] = np.zeros(
                len(self.cluster_expressions_help[cluster]))
            self.cluster_expressions[cluster][
                self.cluster_expressions_help[cluster]] = 1
            self.delta[cluster] = np.ceil(
                np.sum(
                    np.abs(
                        np.mean(X[clusters == cluster], axis=0) -
                        self.cluster_expressions[cluster]))) + Delta
            self.part[cluster] = np.sum(y[clusters == cluster]) / np.sum(y)
            self.bad_part[cluster] = np.sum(
                1 - y[clusters == cluster]) / np.sum(1 - y)
            if self.proba[cluster] < 0.01:
                a += 1
            if np.sum(y[clusters == cluster]) == 0:
                zero += count
        #print a
        #print self.classifier.score(X, y)
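        # If many training URLs landed in clusters with no quality links at
        # all, loosen every cluster's quota and sampling probability
        # (hand-tuned constants).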
        if zero > 300:
            #print "here"
            for cluster in self.un_clusters:
                self.quota[cluster] = self.quota[cluster] + 1400
                self.proba[cluster] = np.min((np.max(
                    (self.proba[cluster] + 0.35, 0)), 1))
        '''
        elif zero > 200:
            #print "here"
            for cluster in self.un_clusters:
                self.quota[cluster] = self.quota[cluster] + 1000
                self.proba[cluster] = np.min((np.max((self.proba[cluster] + 1, 0)), 1))
        elif a > 300:
            for cluster in self.un_clusters:
                self.quota[cluster] = self.quota[cluster] + 1000
                self.proba[cluster] = np.min((np.max((self.proba[cluster] + 1, 0)), 1))
        '''
        self.T = 0
        self.cluster_expressions_ = np.zeros(
            (len(self.un_clusters), len(self.keys)))
        for i in range(len(self.un_clusters)):
            self.cluster_expressions_[i, :] = self.cluster_expressions[
                self.un_clusters[i]]
        #print self.proba

    def predict_cluster(self, X):
        # Distance from X to each cluster's binary expression vector is the
        # number of differing features; return the nearest cluster if it is
        # within that cluster's delta, otherwise -1.
        nums = np.sum(X != self.cluster_expressions_, axis=1)
        ind = np.argmin(nums)
        cl = self.un_clusters[ind]
        if nums[ind] <= self.delta[cl]:
            return cl
        return -1

    def predict_fetch(self, url):
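        # Decide whether to fetch a URL: always fetch if the classifier says
        # so; otherwise, if the URL falls into a known cluster, fetch with
        # that cluster's probability as long as its quota is not exhausted.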
        X = self.check_url(url)  # ~1-3-4s
        fetch = self.classifier.predict(X)  # ~3-7-7s
        y = self.predict_cluster(X)  # ~1-2-2
        if fetch:
            if y != -1:
                self.is_taken[y] += 1
            return True
        if y == -1:
            return False
        fetch = np.random.choice((True, False),
                                 p=(self.proba[y], 1 - self.proba[y]))
        if fetch and self.is_taken[y] < self.quota[y]:
            self.is_taken[y] += 1
            return True
        return False