def test_similarity_two_texts(self):
    # First preprocess documents
    doc1 = "This is test. Aeroplane flight"
    doc2 = "This is test"
    doc1 = preprocess(doc1)
    doc2 = preprocess(doc2)
    similarity = self.similarity_model.documents_similarity(doc1, doc2)
    assert isinstance(similarity, float)
def assign_cluster_to_doc(doc_id):
    doc = ClassifiedDocument.objects.get(id=doc_id)
    grp_id = doc.group_id
    cluster_model = ClusteringModel.objects.get(group_id=grp_id)
    model = cluster_model.model  # instance of KMeansDocs class
    processed = preprocess(doc.text)
    X = model.vectorizer.transform([processed]).toarray()[0]
    label = int(model.model.predict([X])[0])
    docs_labels = [(doc_id, label)]
    feature = compress_sparse_vector(list(map(lambda x: float(x), X)))
    features = {doc_id: feature}
    # update labels data
    write_cluster_labels_data(cluster_model, docs_labels, features, update=True)
    # calculate new silhouette score
    silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.silhouette_score = silhouette_score
    cluster_model.save()
    # update size vs silhouette scores
    update_cluster_score_vs_size(cluster_model, 1)  # increased size is 1
    if silhouette_score < SILHOUETTE_THRESHOLD:
        logger.warning(
            "Cluster score for cluster {} fell below threshold {}. "
            "Re-clustering.".format(cluster_model.id, SILHOUETTE_THRESHOLD))
        update_cluster(cluster_model.id)
    return True
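# A minimal sketch of the compress_sparse_vector helper used above, assuming
# it keeps only the non-zero entries of a feature vector as (index, value)
# pairs along with the original length. This is an illustration of the idea,
# not the project's confirmed implementation; the _sketch names are
# hypothetical.
def compress_sparse_vector_sketch(vector):
    return {
        'size': len(vector),
        'entries': [(i, v) for i, v in enumerate(vector) if v != 0.0],
    }


def decompress_sparse_vector_sketch(compressed):
    vector = [0.0] * compressed['size']
    for i, v in compressed['entries']:
        vector[i] = v
    return vector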
def update_cluster(cluster_id):
    try:
        cluster_model = ClusteringModel.objects.get(id=cluster_id)
    except ClusteringModel.DoesNotExist:
        logger.warning(
            "Clustering Model with id {} does not exist".format(cluster_id))
        return  # nothing to update without a model
    if cluster_model.all_clustered:
        # if all clustered, no need to recluster
        return
    docs = get_unclustered_docs(cluster_model)
    increased_size = docs.count()
    texts = list(
        map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
    docids = list(map(lambda x: x['id'], docs))
    kmeans_model = cluster_model.model
    CLUSTER_CLASS = type(kmeans_model)
    # TODO: add update criteria for doc2vec
    # update status
    # cluster_model.ready = False
    cluster_model.last_clustering_started = timezone.now()
    cluster_model.save()
    kmeans_model.update_cluster(texts)
    docs_labels = list(
        zip(
            docids,
            # convert from np.int64 to int
            list(map(lambda x: int(x), kmeans_model.model.labels_))))
    if CLUSTER_CLASS == KMeansDocs:
        features = []
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)
    # write/update to file
    docids_features = dict(zip(docids, features))
    write_clustered_data_to_files(cluster_model, docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features, update=True)
    # relevant terms can be calculated only after writing other data
    relevant_terms = cluster_model.compute_relevant_terms()
    write_relevant_terms_data(cluster_model, relevant_terms)
    # update status
    cluster_model.last_clustered_on = timezone.now()
    cluster_model.silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.ready = True
    cluster_model.save()
    # Update size vs silhouette scores
    update_cluster_score_vs_size(cluster_model, increased_size)
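# A minimal sketch of the get_unclustered_docs helper called above, assuming
# "unclustered" means documents created after the model's last clustering run.
# The field names (created_on, last_clustered_on) are assumptions about the
# schema, not confirmed by the source.
def get_unclustered_docs_sketch(cluster_model):
    return ClassifiedDocument.objects.filter(
        group_id=cluster_model.group_id,
        created_on__gt=cluster_model.last_clustered_on,
    ).values('id', 'text')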
def text_rank(sentences):
    """
    @sentences: list of sentences
    """
    # first calculate the transition matrix using sentence similarity
    size = len(sentences)
    processed = [preprocess(x) for x in sentences]
    matrix = [[1.0 for _ in range(size)] for _ in range(size)]  # later updated
    for i in range(size):
        for j in range(i + 1, size):
            # no need to handle the i == j case as it is already set to 1.0
            similarity = sentences_similarity(processed[i], processed[j])
            matrix[i][j] = similarity
            matrix[j][i] = similarity
    return page_rank(np.asarray(matrix))
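# A minimal power-iteration sketch of the page_rank helper that text_rank
# feeds its similarity matrix into. The damping factor and convergence
# tolerance are conventional defaults; the whole function is an assumed
# stand-in, not the project's actual implementation.
import numpy as np


def page_rank_sketch(matrix, damping=0.85, max_iter=100, tol=1e-6):
    size = matrix.shape[0]
    # normalize rows so each row sums to 1, like a transition matrix
    row_sums = matrix.sum(axis=1, keepdims=True)
    transition = matrix / np.where(row_sums == 0, 1.0, row_sums)
    scores = np.full(size, 1.0 / size)
    for _ in range(max_iter):
        updated = (1 - damping) / size + damping * transition.T.dot(scores)
        if np.abs(updated - scores).sum() < tol:
            return updated
        scores = updated
    return scores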
def get_text_vector(self, text):
    processed = preprocess(text, ignore_numbers=True)
    text_terms = {}
    # fill in text_terms first
    for x in processed.split():
        text_terms[x] = text_terms.get(x, 0.0) + 1
    vector = [0.0] * self.terms_len
    for k, v in text_terms.items():
        termid = self.terms_indices.get(k)
        # only if the term is already in our index; compare against None
        # so that a term whose index is 0 is not skipped
        if termid is not None:
            inv_freq = self.inverse_freqs.get(termid, 0)
            idf = 0 if not inv_freq else \
                math.log(self.total_docs / float(inv_freq))
            vector[termid] = v * idf
    return vector
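# Illustrative use of get_text_vector: each position in the returned vector is
# term_count * log(total_docs / inverse_frequency), i.e. a TF-IDF weight. The
# index contents and numbers below are made up for the example.
# >>> model.terms_indices          # e.g. {'test': 0, 'flight': 1, ...}
# >>> model.get_text_vector("test flight test")
# [2 * log(N / df('test')), 1 * log(N / df('flight')), 0.0, ...]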
# Note: the original body was `return preprocess(inp)`, which calls itself and
# recurses forever. The intent appears to be a thin wrapper around the shared
# preprocessing helper, so delegate under an alias. The import path below is
# an assumption, not the project's confirmed layout.
from core.preprocess import preprocess as base_preprocess  # assumed path


def preprocess(inp):
    return base_preprocess(inp)
def create_new_clusters(name, group_id, n_clusters,
                        CLUSTER_CLASS=KMeansDocs, doc2vec_group_id=None):
    """If a model already exists for the group, override it."""
    try:
        cluster_model = ClusteringModel.objects.get(group_id=group_id)
    except ClusteringModel.DoesNotExist:
        cluster_model = ClusteringModel.objects.create(
            name=name, group_id=group_id, n_clusters=n_clusters)
    options = ClusteringOptions(n_clusters=n_clusters)
    if CLUSTER_CLASS == KMeansDocs:
        docs = ClassifiedDocument.objects.filter(group_id=group_id).\
            values('id', 'text')
        if not docs or docs.count() < n_clusters:
            logger.warning(
                "Too few documents for clustering for group_id {}".format(
                    group_id))
            raise Exception("Too few documents for given number of clusters")
        texts = list(
            map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
        docids = list(map(lambda x: x['id'], docs))
        cluster_params = texts
    elif CLUSTER_CLASS == KMeansDoc2Vec:
        # get Doc2VecModel
        doc2vecmodel = Doc2VecModel.objects.get(group_id=doc2vec_group_id)
        cluster_params = [x for x in doc2vecmodel.model.docvecs]
        docids = doc2vecmodel.model.docvecs.doctags.keys()
        features = cluster_params
        # relevant terms are only computed for the TF-IDF pipeline below;
        # default to an empty list so the write call does not fail
        relevant_terms = []
    else:
        raise Exception("Invalid cluster class")
    k_means = CLUSTER_CLASS(options)
    logger.info("Creating clustering model for group_id {}".format(group_id))
    kmeans_model = k_means.perform_cluster(cluster_params)
    docs_labels = zip(
        docids,
        # convert from np.int64 to int
        list(map(lambda x: int(x), kmeans_model.model.labels_)))
    # Save to database
    cluster_model.model = kmeans_model
    cluster_model.name = name
    cluster_model.group_id = group_id
    cluster_model.n_clusters = n_clusters
    logger.info("Saving model to database. Group id: {}".format(group_id))
    cluster_model.silhouette_score = kmeans_model.get_silhouette_score()
    cluster_model.save()
    # create features for KMeansDocs
    if CLUSTER_CLASS == KMeansDocs:
        features = []
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            # use transform, not fit_transform: the vectorizer is already
            # fitted, and refitting per document would reset its vocabulary
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)
        relevant_terms = get_relevant_terms(list(map(tokenize, texts)))
    docids_features = dict(zip(docids, features))
    # Now write to files
    logger.info(
        "Writing clustering results to files. Group id: {}".format(group_id))
    write_clustered_data_to_files(cluster_model, docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features, relevant_terms)
    # mark clustering complete as true
    cluster_model.ready = True
    cluster_model.save()
    return cluster_model
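# Illustrative invocation of create_new_clusters; the model name, group id
# and cluster count here are made up for the example.
# >>> cluster_model = create_new_clusters("reports", group_id=42, n_clusters=5)
# >>> cluster_model.ready  # True once results are written to files
# True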
def perform_clustering(cluster_model, CLUSTER_CLASS=KMeansDocs,
                       doc2vec_group_id=None):
    n_clusters = cluster_model.n_clusters
    group_id = cluster_model.group_id
    name = cluster_model.name
    cluster_model.last_clustering_started = timezone.now()
    cluster_model.save()
    options = ClusteringOptions(n_clusters=n_clusters)
    if CLUSTER_CLASS == KMeansDocs:
        docs = ClassifiedDocument.objects.filter(group_id=group_id).\
            values('id', 'text')
        if not docs or docs.count() < n_clusters:
            logger.warning(
                "Too few documents for clustering for group_id {}".format(
                    group_id))
            raise Exception("Too few documents for given number of clusters")
        texts = list(
            map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
        docids = list(map(lambda x: x['id'], docs))
        cluster_params = texts
    elif CLUSTER_CLASS == KMeansDoc2Vec:
        # get Doc2VecModel
        doc2vecmodel = Doc2VecModel.objects.get(group_id=doc2vec_group_id)
        cluster_params = [x for x in doc2vecmodel.model.docvecs]
        docids = doc2vecmodel.model.docvecs.doctags.keys()
        features = cluster_params
    else:
        raise Exception("Invalid cluster class")
    k_means = CLUSTER_CLASS(options)
    logger.info("Creating clustering model for group_id {}".format(group_id))
    kmeans_model = k_means.perform_cluster(cluster_params)
    docs_labels = zip(
        docids,
        # convert from np.int64 to int
        list(map(lambda x: int(x), kmeans_model.model.labels_)))
    # Save to database
    cluster_model.model = kmeans_model
    cluster_model.name = name
    cluster_model.group_id = group_id
    cluster_model.n_clusters = n_clusters
    logger.info("Saving model to database. Group id: {}".format(group_id))
    cluster_model.save()
    # create features for KMeansDocs
    if CLUSTER_CLASS == KMeansDocs:
        features = []
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)
    docids_features = dict(zip(docids, features))
    # Now write to files
    logger.info(
        "Writing clustering results to files. Group id: {}".format(group_id))
    write_clustered_data_to_files(cluster_model, docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features)
    # relevant terms can be calculated only after writing other data
    relevant_terms = cluster_model.compute_relevant_terms()
    write_relevant_terms_data(cluster_model, relevant_terms)
    # mark clustering complete and update the clustered date
    cluster_model.ready = True
    cluster_model.silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.last_clustered_on = timezone.now()
    cluster_model.save()
    return cluster_model
def classify_text(classifier, text):
    text = preprocess(text)
    classified = classifier.classify_text(text)
    classified.sort(key=lambda x: x[1], reverse=True)
    return classified
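# Illustrative usage of classify_text, assuming classifier.classify_text
# returns (label, confidence) pairs; the labels and scores here are made up.
# >>> classify_text(classifier, "Flights were delayed by the storm")
# [('transport', 0.81), ('weather', 0.64), ('other', 0.12)]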