class LSASearchEngine(SmartSearchEngine):
    #
    # Registry implementation using LSA (TF-IDF + truncated SVD)

    def __init__(self):
        super(LSASearchEngine, self).__init__()
        self._service_array = []
        self._lsi_index = None
        self._tfidf_matrix = None
        self._svd_matrix = None

    def load_configuration(self, configuration_file):
        super(LSASearchEngine, self).load_configuration(configuration_file)
        config = configparser.ConfigParser()
        config.read(configuration_file)
        number_of_topics = config.getint('RegistryConfigurations',
                                         'number_of_topics')
        self._metric = config.get('RegistryConfigurations', 'metric').lower()
        self._svd = TruncatedSVD(n_components=number_of_topics)
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter('english.long'))

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        # Project the TF-IDF matrix into the latent topic space and index it
        # with a brute-force nearest-neighbor search.
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._svd_matrix = self._svd.fit_transform(self._tfidf_matrix.toarray())
        self._lsi_index = NearestNeighbors(n_neighbors=len(self._service_array),
                                           algorithm='brute',
                                           metric=self._metric)
        self._lsi_index.fit(self._svd_matrix)

    def publish(self, service):
        pass

    def find(self, query):
        # Vectorize the query, project it with the fitted SVD and rank every
        # registered service by distance in the latent space.
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()])
        query_array = self._svd.transform(query_array.toarray())
        result = self._lsi_index.kneighbors(query_array,
                                            return_distance=False)[0]
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
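
# Minimal, self-contained sketch of the LSA retrieval flow used by
# LSASearchEngine above, written against the stock scikit-learn
# TfidfVectorizer instead of the project's BM25-capable variant (an
# assumption for illustration only; the corpus and parameters are made up).
def _lsa_pipeline_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.neighbors import NearestNeighbors

    documents = ['weather forecast service',
                 'currency exchange rate service',
                 'weather alert notification']

    # TF-IDF term space -> low-rank latent topic space.
    vectorizer = SkTfidfVectorizer(sublinear_tf=False, analyzer='word')
    tfidf_matrix = vectorizer.fit_transform(documents)
    svd = TruncatedSVD(n_components=2)
    svd_matrix = svd.fit_transform(tfidf_matrix)

    # Brute-force neighbor index over the latent vectors, ranking all documents.
    index = NearestNeighbors(n_neighbors=len(documents),
                             algorithm='brute', metric='cosine')
    index.fit(svd_matrix)

    query_vector = svd.transform(vectorizer.transform(['weather service']))
    ranking = index.kneighbors(query_vector, return_distance=False)[0]
    return [documents[i] for i in ranking]
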
class BallTreeSearchEngine(SmartSearchEngine):
    #
    # Registry implementation using ball-tree

    def __init__(self):
        super(BallTreeSearchEngine, self).__init__()
        self._service_array = []
        self._balltree_index = None
        self._tfidf_matrix = None

    def load_configuration(self, configuration_file):
        super(BallTreeSearchEngine, self).load_configuration(configuration_file)
        config = configparser.ConfigParser()
        config.read(configuration_file)
        self._metric = config.get('RegistryConfigurations', 'metric').lower()
        self._leaf_size = config.getint('RegistryConfigurations', 'leaf_size')
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter())

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        # Build the ball tree over the dense TF-IDF vectors of all documents.
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._balltree_index = BallTree(self._tfidf_matrix.toarray(),
                                        leaf_size=self._leaf_size,
                                        metric=self._metric)

    def publish(self, service):
        pass

    def find(self, query):
        # Vectorize the query and rank every registered service by distance.
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()]).toarray()
        result = self._balltree_index.query(query_array,
                                            k=len(self._service_array),
                                            return_distance=False)
        result_list = []
        for index in result[0]:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
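
# Minimal sketch of the ball-tree retrieval flow used by BallTreeSearchEngine
# above, built on the stock scikit-learn TfidfVectorizer rather than the
# project's BM25 variant (an assumption; corpus and parameter values are
# illustrative only).
def _balltree_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer
    from sklearn.neighbors import BallTree

    documents = ['weather forecast service',
                 'currency exchange rate service',
                 'weather alert notification']

    vectorizer = SkTfidfVectorizer(sublinear_tf=False, analyzer='word')
    # BallTree requires dense input, hence the toarray() call.
    tfidf_matrix = vectorizer.fit_transform(documents).toarray()

    # The ball tree partitions the TF-IDF vectors into nested hyperspheres,
    # which speeds up exact nearest-neighbor queries.
    tree = BallTree(tfidf_matrix, leaf_size=2, metric='euclidean')

    query_vector = vectorizer.transform(['weather service']).toarray()
    ranking = tree.query(query_vector, k=len(documents),
                         return_distance=False)[0]
    return [documents[i] for i in ranking]
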
class BernoulliRBMSearchEngine(SmartSearchEngine):
    #
    # Registry implementation using a Bernoulli restricted Boltzmann machine

    def __init__(self):
        super(BernoulliRBMSearchEngine, self).__init__()
        self._service_array = []
        self._bernoulliRBM_index = None
        self._tfidf_matrix = None

    def load_configuration(self, configuration_file):
        super(BernoulliRBMSearchEngine, self).load_configuration(configuration_file)
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter())

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        # Learn RBM hidden-unit activations from the TF-IDF matrix and index
        # them with a brute-force nearest-neighbor search.
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._bernoulliRBM = BernoulliRBM(learning_rate=1)
        self._rbm_matrix = self._bernoulliRBM.fit_transform(self._tfidf_matrix)
        self._bernoulliRBM_index = NearestNeighbors(
            n_neighbors=len(self._service_array),
            algorithm='brute',
            metric='euclidean')
        self._bernoulliRBM_index.fit(self._rbm_matrix)

    def publish(self, service):
        pass

    def find(self, query):
        # Project the query through the fitted RBM and rank every registered
        # service by Euclidean distance in the hidden-unit space.
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()])
        query_array = self._bernoulliRBM.transform(query_array.toarray())
        result = self._bernoulliRBM_index.kneighbors(query_array,
                                                     return_distance=False)[0]
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
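
# Minimal sketch of the RBM-based retrieval flow used by
# BernoulliRBMSearchEngine above: TF-IDF vectors are passed through a
# BernoulliRBM to obtain hidden-unit activations, which are then indexed with
# a brute-force nearest-neighbor search. Uses the stock scikit-learn
# TfidfVectorizer (an assumption) and illustrative hyperparameters.
def _bernoulli_rbm_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer
    from sklearn.neural_network import BernoulliRBM
    from sklearn.neighbors import NearestNeighbors

    documents = ['weather forecast service',
                 'currency exchange rate service',
                 'weather alert notification']

    vectorizer = SkTfidfVectorizer(sublinear_tf=False, analyzer='word')
    tfidf_matrix = vectorizer.fit_transform(documents)

    # RBM hidden activations serve as a learned, non-linear document representation.
    rbm = BernoulliRBM(n_components=4, learning_rate=1, random_state=0)
    rbm_matrix = rbm.fit_transform(tfidf_matrix)

    index = NearestNeighbors(n_neighbors=len(documents),
                             algorithm='brute', metric='euclidean')
    index.fit(rbm_matrix)

    query_vector = rbm.transform(vectorizer.transform(['weather service']))
    ranking = index.kneighbors(query_vector, return_distance=False)[0]
    return [documents[i] for i in ranking]
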
class KMeansSearchEngine(SmartSearchEngine):
    #
    # Registry implementation using k-means clustering

    def __init__(self):
        super(KMeansSearchEngine, self).__init__()
        self._service_array = []
        self._cluster_index = None
        self._cluster = {}
        self._document_cluster = {}
        self._tfidf_matrix = None

    def load_configuration(self, configuration_file):
        super(KMeansSearchEngine, self).load_configuration(configuration_file)
        config = configparser.ConfigParser()
        config.read(configuration_file)
        self._n_clusters = config.getint('RegistryConfigurations', 'n_clusters')
        self._vectorizer = TfidfVectorizer(
            sublinear_tf=False,
            analyzer='word',
            lowercase=False,
            use_bm25idf=self._use_bm25idf,
            bm25_tf=self._use_bm25tf,
            k=self._bm25_k,
            preprocessor=StringPreprocessorAdapter('english.long'))

    def unpublish(self, service):
        pass

    def _preprocess(self, bag_of_words):
        return bag_of_words.get_words_str()

    def _after_publish(self, documents):
        # Cluster the TF-IDF vectors with k-means, then build one brute-force
        # nearest-neighbor index per cluster.
        self._tfidf_matrix = self._vectorizer.fit_transform(documents)
        self._cluster_index = KMeans(n_clusters=self._n_clusters)
        self._cluster_index.fit(self._tfidf_matrix.toarray())
        self._document_cluster = {}
        for i, document in enumerate(documents):
            label = self._cluster_index.labels_[i]
            if label not in self._document_cluster:
                self._document_cluster[label] = []
            self._document_cluster[label].append((document, i))
        print('Number of clusters: ' + str(len(self._document_cluster)))
        for label in self._document_cluster:
            print('Label elements: ' + str(len(self._document_cluster[label])))
        for label in self._document_cluster:
            self._cluster[label] = NearestNeighbors(
                n_neighbors=len(self._document_cluster[label]),
                algorithm='brute',
                metric='euclidean')
            tfidf_matrix = self._vectorizer.transform(
                [doc[0] for doc in self._document_cluster[label]])
            self._cluster[label].fit(tfidf_matrix.toarray())

    def publish(self, service):
        pass

    def find(self, query):
        # Route the query to its predicted cluster, rank within that cluster,
        # then map cluster-local indexes back to global service indexes.
        query = StringTransformer().transform(query)
        query_array = self._vectorizer.transform(
            [self._query_transformer.transform(query).get_words_str()]).toarray()
        target_label = self._cluster_index.predict(query_array)[0]
        target_indexes = self._cluster[target_label].kneighbors(
            query_array, return_distance=False)[0]
        result = []
        for target in target_indexes:
            result.append(self._document_cluster[target_label][target][1])
        result_list = []
        for index in result:
            result_list.append(self._service_array[index])
        return result_list

    def number_of_services(self):
        pass
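
# Minimal sketch of the cluster-then-search flow used by KMeansSearchEngine
# above: k-means groups the TF-IDF vectors, a per-cluster nearest-neighbor
# index is built, and a query is first routed to its predicted cluster and
# then ranked inside it. Built on the stock scikit-learn TfidfVectorizer
# (an assumption); the corpus and number of clusters are illustrative.
def _kmeans_sketch():
    from sklearn.feature_extraction.text import TfidfVectorizer as SkTfidfVectorizer
    from sklearn.cluster import KMeans
    from sklearn.neighbors import NearestNeighbors

    documents = ['weather forecast service',
                 'currency exchange rate service',
                 'weather alert notification',
                 'stock market price service']

    vectorizer = SkTfidfVectorizer(sublinear_tf=False, analyzer='word')
    tfidf_matrix = vectorizer.fit_transform(documents).toarray()

    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
    kmeans.fit(tfidf_matrix)

    # Group document indexes by cluster label and index each cluster separately.
    clusters = {}
    for doc_index, label in enumerate(kmeans.labels_):
        clusters.setdefault(label, []).append(doc_index)
    indexes = {}
    for label, doc_indexes in clusters.items():
        index = NearestNeighbors(n_neighbors=len(doc_indexes),
                                 algorithm='brute', metric='euclidean')
        index.fit(tfidf_matrix[doc_indexes])
        indexes[label] = index

    # Route the query to its cluster, then rank only within that cluster.
    query_vector = vectorizer.transform(['weather service']).toarray()
    target_label = kmeans.predict(query_vector)[0]
    ranking = indexes[target_label].kneighbors(query_vector,
                                               return_distance=False)[0]
    return [documents[clusters[target_label][i]] for i in ranking]
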