import copy
import time

import numpy as np
import pandas as pd
import sqlalchemy
import tensorflow as tf
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import FeatureUnion, Pipeline

# NOTE: RRuntimeError lives at rpy2.rinterface_lib.embedded in rpy2 3.x;
# adjust this import to the installed rpy2 version
from rpy2.rinterface import RRuntimeError

# Project-local modules (import paths assumed). The ORM tables (Points,
# Customers, Clustering, ClusteringHyperparameter), labeling helpers
# (ExtractLabels, Labeling, get_records), serialization helpers
# (serialize_context_from_dictionary, serialize_examples_from_dictionary,
# _bytes_feature), and the vocabulary file-name constants are assumed to be
# importable from elsewhere in this repository
import extract
import transform_pipeline
import unsupervised_cluster


def test_get_text_vocabulary():
    """Generate data to find vocabulary"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)
    df_clean = clean_pipe.fit_transform(dataset_raw)

    # Get vocabulary for the DESCRIPTOR feature - a text feature
    VocabularyText = transform_pipeline.VocabularyText()
    vocabulary = VocabularyText.get_text_vocabulary(X=df_clean,
                                                    col_name='DESCRIPTOR',
                                                    remove_suffix=False,
                                                    max_features=80)

    # Save vocabulary
    file_name = r'../data/vocab_descriptor.txt'
    transform_pipeline.VocabularyText.save_vocabulary(vocabulary, file_name)

    return None
def test_categorical_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        categories_file=r'../data/categorical_categories.dat')

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()

    # Inspect the fitted categorical encoder
    ohe = categorical_pipe.named_steps['catEncoder']
    ohe.categories  # ohe.categories_ when categories='auto'

    return ohe_array
def test_cleaning_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df = clean_pipe.fit_transform(dataset_raw)

    return df
def test_get_database_features():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    database = Insert.pandas_select_execute(sel)

    sel = sqlalchemy.select([Customers.name]).where(
        Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    database_features = ExtractLabels.get_database_features(
        database, full_pipeline, instance_name=customer_name)

    return database_features
def test_time():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    RemoveAttribute = transform_pipeline.RemoveAttribute(
        Transform.drop_attributes)
    RemoveNan = transform_pipeline.RemoveNan(Transform.nan_replace_dict)
    SetDtypes = transform_pipeline.SetDtypes(Transform.type_dict)
    TextCleaner = transform_pipeline.TextCleaner(Transform._text_clean_attrs,
                                                 replace_numbers=True)
    UnitCleaner = transform_pipeline.UnitCleaner(Transform.unit_dict)
    DuplicateRemover = transform_pipeline.DuplicateRemover(Transform.dupe_cols,
                                                           remove_dupe=True)
    VirtualRemover = transform_pipeline.VirtualRemover(remove_virtual=True)

    t0 = time.time()
    df0 = RemoveAttribute.fit_transform(dataset_raw)
    t1 = time.time()
    df1 = RemoveNan.fit_transform(df0)
    t2 = time.time()
    df2 = SetDtypes.fit_transform(df1)
    t3 = time.time()
    df3 = TextCleaner.fit_transform(df2)
    t4 = time.time()
    df4 = UnitCleaner.fit_transform(df3)
    t5 = time.time()
    indicies = DuplicateRemover.get_duplicate_indicies(df4, 'NAME')
    print('Duplicate names')
    print(df4['NAME'].iloc[indicies[:50]])
    df5 = DuplicateRemover.fit_transform(df4)
    t6 = time.time()
    df6 = VirtualRemover.fit_transform(df5)
    t7 = time.time()

    print('RemoveAttribute : {}'.format(t1 - t0))
    print('RemoveNan : {}'.format(t2 - t1))
    print('SetDtypes : {}'.format(t3 - t2))
    print('TextCleaner : {}'.format(t4 - t3))
    print('UnitCleaner : {}'.format(t5 - t4))
    print('DuplicateRemover : {}'.format(t6 - t5))
    print('VirtualRemover : {}'.format(t7 - t6))

    return None
def test_cluster_with_hyperparameters():
    """Test clustering with hyperparameters"""
    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
    # df_text = pd.DataFrame(X, columns=_word_vocab)

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }
    result = UnsupervisedCluster.cluster_with_hyperparameters(
        hyperparameters, X)
    best_nc_df = result.best_nc_dataframe

    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
    correct_k = res.correct_k

    return result
def test_unsupervised_cluster():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    _word_vocab = text_pipe.named_steps[
        'WordDictToSparseTransformer'].vocabulary
    df_text = pd.DataFrame(X, columns=_word_vocab)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
    correct_k = res.correct_k

    if X.shape[0] <= 3 or correct_k == 1:
        # Don't cluster - just pass one cluster total
        prediction_agglo = np.ones((X.shape[0]))
    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_k,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return df_clean, prediction_agglo
def test_calc_categories_dict():
    """Generate data to find categories"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    string_pipe = transform_pipeline.SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })
    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    """Calculate and save categories to be used later"""
    Encoding = transform_pipeline.EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict = Encoding.calc_categories_dict(df_clean, columns)
    save_path = r'../data/categorical_categories.dat'

    Encoding.save_categories_to_disc(categories_dict, save_path)
    categories_dict1 = Encoding.read_categories_from_disc(save_path)

    # Round-trip check: saved and re-read categories must match
    for key in set((*categories_dict.keys(), *categories_dict1.keys())):
        assert np.array_equal(categories_dict[key], categories_dict1[key])

    return None
def legacy_numeric_transform_pipeline_MIL():
    # Transform pipeline
    TransformLegacy = transform_pipeline.Transform()
    # Legacy categorization dictionary...

    # Cleaning pipeline
    clean_pipe = TransformLegacy.cleaning_pipeline(drop_attributes=None,
                                                   nan_replace_dict=None,
                                                   dtype_dict=None,
                                                   unit_dict=None,
                                                   remove_dupe=True,
                                                   replace_numbers=True,
                                                   remove_virtual=True)

    # Text feature encoders
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        POINTNAME_VOCABULARY_FILENAME)
    name_text_pipe = TransformLegacy.text_pipeline_label(
        attributes=['NAME'], vocabulary=name_vocabulary)
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        DESCRIPTOR_VOCABULARY_FILENAME)
    descriptor_text_pipe = TransformLegacy.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical features
    categorical_pipe = TransformLegacy.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories_old.dat')

    # Numeric features
    numeric_pipe = TransformLegacy.numeric_pipeline(numeric_attributes=None)

    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedCategorical', combined_features),
    ])

    return full_pipeline
def test_read_categories():
    # Initialize
    Transform = transform_pipeline.Transform()
    categories_file = r'../data/categorical_categories.dat'
    categories = Transform._read_categories(Transform.cat_attributes,
                                            categories_file)
    categorical_attributes = Transform.cat_attributes

    ReplaceNone = transform_pipeline.ReplaceNone(categorical_attributes)
    DataFrameSelector = transform_pipeline.DataFrameSelector(
        categorical_attributes)
    OneHotEncoder = transform_pipeline.OneHotEncoder(categories=categories)

    # Get raw database
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = ReplaceNone.fit_transform(df_clean1)
    df1_array = DataFrameSelector.fit_transform(df0)
    ohearray = OneHotEncoder.fit_transform(df1_array).toarray()

    # Examine the transformers
    print(df0[categorical_attributes].iloc[:5])
    print(df1_array[:5])
    OneHotEncoder.categories

    return None
def test_text_pipe():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])
    dataset = full_pipeline.fit_transform(dataset_raw)

    return dataset
def test_full_pipeline():
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # group_id = 4
    group_id = 15
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Cleaning pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Text feature encoders
    name_file = r'../data/vocab_name.txt'
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        name_file)
    name_text_pipe = Transform.text_pipeline_label(attributes=['NAME'],
                                                   vocabulary=name_vocabulary)
    descriptor_file = r'../data/vocab_descriptor.txt'
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        descriptor_file)
    descriptor_text_pipe = Transform.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical features
    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories.dat')

    # Numeric features
    numeric_pipe = Transform.numeric_pipeline(numeric_attributes=None)

    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedFeatures', combined_features),
    ])

    combined_csr = full_pipeline.fit_transform(dfraw)
    combined_csr.shape

    # Step through the cleaning pipeline one transformer at a time
    CleaningPipe = full_pipeline.steps[0][1]  # CleaningPipe
    RemoveAttribute = full_pipeline.steps[0][1][0]  # RemoveAttribute
    RemoveNan = full_pipeline.steps[0][1][1]
    SetDtypes = full_pipeline.steps[0][1][2]
    TextCleaner = full_pipeline.steps[0][1][3]
    UnitCleaner = full_pipeline.steps[0][1][4]
    DuplicateRemover = full_pipeline.steps[0][1][5]
    VirtualRemover = full_pipeline.steps[0][1][6]

    df0 = RemoveAttribute.fit_transform(copy.deepcopy(dfraw))
    df1 = RemoveNan.fit_transform(copy.deepcopy(df0))
    df2 = SetDtypes.fit_transform(copy.deepcopy(df1))
    df3 = TextCleaner.fit_transform(copy.deepcopy(df2))
    df4 = UnitCleaner.fit_transform(copy.deepcopy(df3))
    df5 = DuplicateRemover.fit_transform(copy.deepcopy(df4))
    df6 = VirtualRemover.fit_transform(copy.deepcopy(df5))

    return None
def main():
    # Hyperparameters
    hyperparams = {
        'by_size': False,
        'n_components': 8,
        'reduce': 'MDS',
        'clusterer': 'ward.D',
        'distance': 'euclidean',
        'index': 'all'
    }

    # Instantiate local classes
    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Clustering class
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Save hyperparameters to SQL
    # See if this hyperparameter set is already inserted
    sel = sqlalchemy.select([ClusteringHyperparameter]).where(
        sqlalchemy.sql.and_(
            ClusteringHyperparameter.by_size == hyperparams['by_size'],
            ClusteringHyperparameter.clusterer == hyperparams['clusterer'],
            ClusteringHyperparameter.distance == hyperparams['distance'],
            ClusteringHyperparameter.reduce == hyperparams['reduce'],
            ClusteringHyperparameter.n_components == hyperparams['n_components']))
    with Insert.engine.connect() as connection:
        res = connection.execute(sel).fetchall()

    if len(res):
        # Get hyperparameters id of existing hyperparameter set
        hyperparameter_id = res[0].id
    else:
        # Insert new object
        res = Insert.core_insert_instance(ClusteringHyperparameter,
                                          hyperparams)
        hyperparameter_id = res.inserted_primary_key[0]

    # Get customer list from SQL
    sel = sqlalchemy.select([Customers])
    customers = Insert.core_select_execute(sel)

    # Iterate through customers and cluster
    for customer in customers:
        # Get points from SQL
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer.id))
        database = Insert.pandas_select_execute(sel)

        if database.shape[0] == 0:
            print('Customer ID {} Skipped, points shape {}'.format(
                customer.id, database.shape[0]))
            continue
        else:
            df_clean = clean_pipe.fit_transform(database)
            X = text_pipe.fit_transform(df_clean).toarray()
            # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
            # df_text = pd.DataFrame(X, columns=_word_vocab)

        # NbClust clustering
        print('Customer ID {}\nDB Size : {}'.format(customer.id, X.shape))
        try:
            print('Starting NbClust')
            # Perform clustering with NbClust package
            result = UnsupervisedCluster.cluster_with_hyperparameters(
                hyperparams, X)
            best_nc_df = result.best_nc_dataframe
        except RRuntimeError as e:
            if 'computationally singular' in str(e):
                # The eigenvalue matrix is singular; reduce the number of
                # dimensions and retry (copy the dict so the shared
                # hyperparams are not mutated)
                _hyperparams = dict(hyperparams)
                _hyperparams['n_components'] = int(
                    _hyperparams['n_components'] / 2)
                result = UnsupervisedCluster.cluster_with_hyperparameters(
                    _hyperparams, X)
                best_nc_df = result.best_nc_dataframe
            else:
                print(e)
                continue

        # Build dictionary for SQL
        sel = sqlalchemy.select([Customers]).where(
            Customers.id.__eq__(customer.id))
        with Insert.engine.connect() as connection:
            res = connection.execute(sel).fetchone()
            correct_k = res.correct_k

        values = best_nc_df.loc['Number_clusters'].to_dict()
        values['correct_k'] = correct_k
        values['customer_id'] = customer.id
        values['hyperparameter_id'] = hyperparameter_id

        n_lens = Clustering.get_n_len_features(X)
        for key, val in n_lens.items():
            values[key] = int(val)

        # Save results to SQL
        res = Insert.core_insert_instance(Clustering, values)
        print("Inserted {}".format(res.inserted_primary_key))

    return None
def cluster_agglomerative(X, correct_n_clusters):
    # NOTE: the original header of this function was lost; the name and
    # signature are reconstructed from the body
    if correct_n_clusters == 1 or X.shape[0] <= 3:
        # Don't cluster if there is only one system
        # Don't cluster - just pass one cluster total
        prediction_agglo = np.ones((X.shape[0]))
    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_n_clusters,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return prediction_agglo


# Module-level setup, used by save_tfrecord_sql below
# Instantiate local classes
Transform = transform_pipeline.Transform()

# Create 'clean' data processing pipeline
clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                         replace_numbers=False,
                                         remove_virtual=True)

# Create pipeline specifically for clustering text features
text_pipe = Transform.text_pipeline(vocab_size='all',
                                    attributes='NAME',
                                    seperator='.',
                                    heirarchial_weight_word_pattern=True)

# Set up connection to SQL
Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')
def test_cluster_with_hyperparameters2():
    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points]).where(
        Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    # _word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }

    # Clean hyperparameters
    hyperparams = UnsupervisedCluster._parse_hyperparameter_dictionary(
        hyperparameters)

    # Perform dimensionality reduction on data
    X_dim_reduced = UnsupervisedCluster._dimensionality_reduction(
        X,
        method=hyperparams['reduce'],
        n_components=hyperparams['n_components'])

    # Conditionally call nbclust package or optimalk package
    # based on input clustering hyperparameters
    if hyperparams['index'] in UnsupervisedCluster.nbclust_indicies:
        # Cluster with nbclust and clustering algorithm
        min_nc = 3  # Static
        max_nc = UnsupervisedCluster._get_max_nc(X)  # Based on actual data
        best_nc_df = UnsupervisedCluster._nbclust_calc(
            X_dim_reduced,
            index=hyperparams['index'],
            clusterer=hyperparams['clusterer'],
            distance=hyperparams['distance'],
            min_nc=min_nc,
            max_nc=max_nc)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
    correct_k = res.correct_k
    print(correct_k)
def save_tfrecord_sql(customer_ids, peritem_keys, label_key, reciprocal,
                      n_bins, shuffle_peritem, tfrecord_writer):
    """Save TFRecord EIE format to files for ranking
    See rank_write_record.py for how Mongo database documents are converted
    to tf.train.Example objects
    Here the tf.train.Example objects are nested into the Example-in-example
    format recommended by the tensorflow ranking library
    EIE Examples are of the form
    {'serialized_context':tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])),
     'serialized_examples':tf.train.Feature(bytes_list=tf.train.BytesList(value=value))}
    for 'serialized_context' value is a serialized tf.train.Example
    for 'serialized_examples' value is a list of serialized tf.train.Example
    objects that will be ranked according to their relevance to the context
    features

    Inputs
    -------
    customer_ids : (list) of customer_ids in SQL database to save
    peritem_keys : (list) of string keys that exist in peritem_features.
        Should be ['by_size','n_components','clusterer','reduce','index']
    label_key : (str) key under which each example's relevance label is
        stored
    reciprocal : (bool) Set up how I want to assign labels to objects
        Reciprocal will cause labels to be the inverse of the loss metric
        Set to True if I do not want labels to be binned
    n_bins : (int) number of bins for relevance label if reciprocal is False
    shuffle_peritem : (bool) whether to shuffle the order of peritem
        examples (this parameter was missing from the original signature
        even though it is used below)
    tfrecord_writer : (tf.io.TFRecordWriter) To serialize EIE TFRecord
    """

    """Create a pipeline for transforming points databases"""
    assert hasattr(customer_ids, '__iter__'), "customer_ids must be iterable"
    msg = "Each ID in customer_ids must be int type, not {}"
    for _id in customer_ids:
        assert isinstance(_id, int), msg.format(type(_id))

    Transform = transform_pipeline.Transform()

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    for customer_id in customer_ids:
        print("Saving TFRecord for Customer ID : {}".format(customer_id))

        """Serialize context features -> serialized_context
        This is a serialized tf.train.Example object"""
        # Get Points databases related to customer_id
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name]).where(
            Customers.id.__eq__(customer_id))
        customer_name = Insert.core_select_execute(sel)[0].name

        if database.shape[0] == 0:
            print(database.shape)
            print(customer_name)
            # Null databases should be skipped
            continue

        # Extract database features from Points database
        try:
            database_features = ExtractLabels.get_database_features(
                database, full_pipeline, instance_name=customer_name)
        except Labeling.PipelineError:
            print("An error occurred while getting database features")
            print("Customer name : {}".format(customer_name))
            print("Customer ID : {}".format(customer_id))
            print(database)
            continue
        context_features = database_features.to_dict(orient='records')[0]
        context_features.pop('instance')

        # Create serialized TFRecord proto
        context_proto_str = serialize_context_from_dictionary(context_features)

        """Serialize peritem features, AKA examples or instances that will
        be ranked. This is a list of serialized tf.train.Example objects"""
        # Get a list of Clustering primary keys related to customer_id
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        if len(res) == 0:
            # No clustering examples were found with the database
            print("Skipped {} No results".format(customer_name))
            continue
        primary_keys = [x.id for x in res]
        correct_k = res[0].correct_k

        # From primary keys create records. Records are used to find
        # example features and labels
        records = get_records(primary_keys)

        # best_labels.hyperparameter_dict values are the peritem_features
        # The loss metric related to each hyperparameter_dict is the label
        # for each example
        best_labels = ExtractLabels.calc_labels(records, correct_k,
                                                error_scale=0.8,
                                                var_scale=0.2)

        example_features = []
        for label in best_labels:
            feature_dict = {}
            for key in peritem_keys:
                feature_dict[key] = label.hyperparameter_dict[key]
            feature_dict[label_key] = label.loss
            example_features.append(feature_dict)

        peritem_list = serialize_examples_from_dictionary(
            example_features,
            label_key=label_key,
            peritem_keys=peritem_keys,
            reciprocal=reciprocal,
            n_bins=n_bins,
            shuffle_peritem=shuffle_peritem)

        """Prepare serialized feature spec for EIE format"""
        serialized_dict = {
            'serialized_context': _bytes_feature([context_proto_str]),
            'serialized_examples': _bytes_feature(peritem_list)
        }

        # Convert dictionary to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_str = serialized_proto.SerializeToString()

        tfrecord_writer.write(serialized_str)

    tfrecord_writer.close()

    return None
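
# A minimal sketch of the EIE (Example-in-example) nesting described in the
# docstring of save_tfrecord_sql, built with raw tf.train protos. The feature
# names and values here are made up for illustration; the real context and
# per-item features come from serialize_context_from_dictionary and
# serialize_examples_from_dictionary.
def _eie_format_sketch():
    # Context: one tf.train.Example describing the whole database,
    # serialized to bytes
    context = tf.train.Example(features=tf.train.Features(feature={
        'n_instance': tf.train.Feature(
            float_list=tf.train.FloatList(value=[1500.0])),
    }))
    serialized_context = context.SerializeToString()

    # Per-item examples: one serialized tf.train.Example per candidate
    # hyperparameter set, each carrying a relevance label
    peritem = []
    for clusterer, relevance in ((b'ward.D', 0.9), (b'kmeans', 0.4)):
        example = tf.train.Example(features=tf.train.Features(feature={
            'clusterer': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[clusterer])),
            'relevance': tf.train.Feature(
                float_list=tf.train.FloatList(value=[relevance])),
        }))
        peritem.append(example.SerializeToString())

    # Outer EIE proto: the serialized context and the list of serialized
    # examples are nested as bytes features, mirroring the structure the
    # docstring above describes
    eie = tf.train.Example(features=tf.train.Features(feature={
        'serialized_context': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[serialized_context])),
        'serialized_examples': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=peritem)),
    }))
    return eie.SerializeToString()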