Example 1
def test_similarity_two_texts(self):
    # First preprocess the documents
    doc1 = "This is test. Aeroplane flight"
    doc2 = "This is test"
    doc1 = preprocess(doc1)
    doc2 = preprocess(doc2)
    similarity = self.similarity_model.documents_similarity(doc1, doc2)
    assert isinstance(similarity, float)
Example 2
def assign_cluster_to_doc(doc_id):
    """Assign the nearest cluster to a single document and update stored data."""
    doc = ClassifiedDocument.objects.get(id=doc_id)
    grp_id = doc.group_id
    cluster_model = ClusteringModel.objects.get(group_id=grp_id)
    model = cluster_model.model  # instance of KMeansDocs class
    processed = preprocess(doc.text)
    X = model.vectorizer.transform([processed]).toarray()[0]
    label = int(model.model.predict([X])[0])
    docs_labels = [(doc_id, label)]
    feature = compress_sparse_vector(list(map(lambda x: float(x), X)))
    features = {doc_id: feature}
    # update labels data
    write_cluster_labels_data(cluster_model,
                              docs_labels,
                              features,
                              update=True)
    # calculate new silhouette score
    silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.silhouette_score = silhouette_score
    cluster_model.save()
    # update size vs silhouette scores
    update_cluster_score_vs_size(cluster_model, 1)  # increased size is 1
    if silhouette_score < SILHOUETTE_THRESHOLD:
        logger.warning(
            "Silhouette score for cluster {} fell below threshold {}. "
            "Re-clustering.".format(cluster_model.id, SILHOUETTE_THRESHOLD))
        update_cluster(cluster_model.id)
    return True
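This and several later examples call a compress_sparse_vector helper that is not shown. A minimal sketch of what such a helper might look like, assuming it simply keeps the non-zero entries of a dense vector; the storage format here is an assumption, not the project's actual implementation:

def compress_sparse_vector(vector):
    # hypothetical sketch: keep only the non-zero entries of a dense
    # vector as {index: value}, together with the original length
    nonzero = {i: v for i, v in enumerate(vector) if v != 0.0}
    return {'length': len(vector), 'values': nonzero}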
Example 3
def update_cluster(cluster_id):
    try:
        cluster_model = ClusteringModel.objects.get(id=cluster_id)
    except ClusteringModel.DoesNotExist:
        logger.warning(
            "Clustering Model with id {} does not exist".format(cluster_id))
        return
    if cluster_model.all_clustered:
        # if all docs are clustered, there is no need to recluster
        return
    docs = get_unclustered_docs(cluster_model)
    increased_size = docs.count()
    texts = list(
        map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
    docids = list(map(lambda x: x['id'], docs))
    kmeans_model = cluster_model.model
    CLUSTER_CLASS = type(kmeans_model)
    # TODO: add update criteria for doc2vec

    # update status
    # cluster_model.ready = False
    cluster_model.last_clustering_started = timezone.now()
    cluster_model.save()

    kmeans_model.update_cluster(texts)
    docs_labels = list(
        zip(
            docids,
            # convert labels from np.int64 to plain int
            list(map(lambda x: int(x), kmeans_model.model.labels_))))
    features = []  # stays empty for other classes (see doc2vec TODO above)
    if CLUSTER_CLASS == KMeansDocs:
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)
    # write/update to file
    docids_features = dict(zip(docids, features))
    write_clustered_data_to_files(cluster_model,
                                  docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features,
                                  update=True)
    # relevant terms can be calculated only after writing other data
    relevant_terms = cluster_model.compute_relevant_terms()
    write_relevant_terms_data(cluster_model, relevant_terms)

    # update status
    cluster_model.last_clustered_on = timezone.now()
    cluster_model.silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.ready = True
    cluster_model.save()
    # Update size vs silhouette scores
    update_cluster_score_vs_size(cluster_model, increased_size)
Example 4
def text_rank(sentences):
    """
    @sentences: list of sentences
    """
    # first calculate the transition matrix using sentence similarity
    size = len(sentences)
    processed = [preprocess(x) for x in sentences]
    matrix = [[1.0 for _ in range(size)] for _ in range(size)]  # later updated
    for i in range(size):
        for j in range(i + 1, size):
            # no need to check for i==j case as it is already set 1.0
            similarity = sentences_similarity(processed[i], processed[j])
            matrix[i][j] = similarity
            matrix[j][i] = similarity
    return page_rank(np.asarray(matrix))
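text_rank delegates to a page_rank helper that is not included here. A minimal power-iteration sketch of such a helper, assuming a dense similarity matrix as input; the damping factor and tolerance are illustrative defaults, not values from the source:

import numpy as np

def page_rank(matrix, damping=0.85, max_iter=100, tol=1.0e-6):
    # hypothetical sketch: power iteration over a row-normalized
    # similarity matrix, returning one score per sentence
    size = matrix.shape[0]
    # row-normalize so each row is a probability distribution
    transition = matrix / matrix.sum(axis=1, keepdims=True)
    scores = np.ones(size) / size
    for _ in range(max_iter):
        updated = (1 - damping) / size + damping * transition.T.dot(scores)
        if np.abs(updated - scores).sum() < tol:
            break
        scores = updated
    return scores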
Example 5
def get_text_vector(self, text):
    processed = preprocess(text, ignore_numbers=True)
    text_terms = {}
    # fill in term frequencies first
    for x in processed.split():
        text_terms[x] = text_terms.get(x, 0.0) + 1
    vector = [0.0] * self.terms_len
    for k, v in text_terms.items():
        termid = self.terms_indices.get(k)
        # only if the term is already in our index
        # (compare against None: a term id of 0 is valid)
        if termid is not None:
            inv_freq = self.inverse_freqs.get(termid, 0)
            idf = 0 if not inv_freq else \
                math.log(self.total_docs / float(inv_freq))
            vector[termid] = v * idf
    return vector
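get_text_vector weights each known term t as tf(t) * log(total_docs / inverse_freq(t)) and leaves unseen terms at 0.0. As an illustration: a term appearing twice in the text and recorded in 10 of 1000 indexed documents gets weight 2 * log(1000 / 10) ≈ 9.21 (natural log, since math.log is used).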
Example 6
def preprocess(self, inp):
    # delegate to the module-level preprocess() helper
    return preprocess(inp)
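The module-level preprocess used throughout these examples is not shown. A minimal sketch of what such a step often looks like (lowercasing, tokenizing, optional number removal); nothing beyond the signature preprocess(text, ignore_numbers=False) is taken from the source:

import re

def preprocess(text, ignore_numbers=False):
    # hypothetical sketch: lowercase, strip punctuation, and optionally
    # drop numeric tokens; returns a whitespace-joined string
    tokens = re.findall(r'[a-z0-9]+', text.lower())
    if ignore_numbers:
        tokens = [t for t in tokens if not t.isdigit()]
    return ' '.join(tokens)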
Example 7
def create_new_clusters(name,
                        group_id,
                        n_clusters,
                        CLUSTER_CLASS=KMeansDocs,
                        doc2vec_group_id=None):
    """If already exists, override it"""
    try:
        cluster_model = ClusteringModel.objects.get(group_id=group_id)
    except ClusteringModel.DoesNotExist:
        cluster_model = ClusteringModel.objects.create(name=name,
                                                       group_id=group_id,
                                                       n_clusters=n_clusters)
    options = ClusteringOptions(n_clusters=n_clusters)

    if CLUSTER_CLASS == KMeansDocs:
        docs = ClassifiedDocument.objects.filter(group_id=group_id).\
                values('id', 'text')
        if not docs or docs.count() < n_clusters:
            logger.warning(
                "Too few documents for clustering for group_id {}".format(
                    group_id))
            raise Exception(
                "Too few documents for the given number of clusters")
        texts = list(
            map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
        docids = list(map(lambda x: x['id'], docs))
        cluster_params = texts
    elif CLUSTER_CLASS == KMeansDoc2Vec:
        # get Doc2VecModel
        doc2vecmodel = Doc2VecModel.objects.get(group_id=doc2vec_group_id)
        cluster_params = [x for x in doc2vecmodel.model.docvecs]
        docids = doc2vecmodel.model.docvecs.doctags.keys()
        features = cluster_params
    else:
        raise Exception("Invalid class")

    k_means = CLUSTER_CLASS(options)
    logger.info("Creating clustering model for group_id {}".format(group_id))
    kmeans_model = k_means.perform_cluster(cluster_params)
    docs_labels = zip(
        docids,
        # convert labels from np.int64 to plain int
        list(map(lambda x: int(x), kmeans_model.model.labels_)))

    # Save to database
    cluster_model.model = kmeans_model
    cluster_model.name = name
    cluster_model.group_id = group_id
    cluster_model.n_clusters = n_clusters
    logger.info("Saving model to database. Group_id".format(group_id))
    cluster_model.silhouette_score = kmeans_model.get_silhouette_score()
    cluster_model.save()

    # create features and relevant terms for KMeansDocs
    relevant_terms = None  # only computed for KMeansDocs
    if CLUSTER_CLASS == KMeansDocs:
        features = []
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            # use transform, not fit_transform: the vectorizer is already fitted
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)
        relevant_terms = get_relevant_terms(list(map(tokenize, texts)))

    docids_features = dict(zip(docids, features))
    # Now write to files
    logger.info(
        "Writing clustering results to files. Group id: {}".format(group_id))
    write_clustered_data_to_files(cluster_model, docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features, relevant_terms)
    # mark clustering complete as true
    cluster_model.ready = True
    cluster_model.save()
    return cluster_model
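A hedged usage sketch of create_new_clusters; the name, group id, and cluster count are placeholders, and KMeansDocs is the TF-IDF-based branch shown above:

cluster_model = create_new_clusters(
    name='docs-cluster',   # placeholder
    group_id=42,           # placeholder
    n_clusters=5,
    CLUSTER_CLASS=KMeansDocs)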
Example 8
def perform_clustering(cluster_model,
                       CLUSTER_CLASS=KMeansDocs,
                       doc2vec_group_id=None):
    n_clusters = cluster_model.n_clusters
    group_id = cluster_model.group_id
    name = cluster_model.name

    cluster_model.last_clustering_started = timezone.now()
    cluster_model.save()

    options = ClusteringOptions(n_clusters=n_clusters)
    if CLUSTER_CLASS == KMeansDocs:
        docs = ClassifiedDocument.objects.filter(group_id=group_id).\
                values('id', 'text')
        if not docs or docs.count() < n_clusters:
            logger.warning(
                "Too few documents for clustering for group_id {}".format(
                    group_id))
            raise Exception(
                "Too few documents for the given number of clusters")
        texts = list(
            map(lambda x: preprocess(x['text'], ignore_numbers=True), docs))
        docids = list(map(lambda x: x['id'], docs))
        cluster_params = texts
    elif CLUSTER_CLASS == KMeansDoc2Vec:
        # get Doc2VecModel
        doc2vecmodel = Doc2VecModel.objects.get(group_id=doc2vec_group_id)
        cluster_params = [x for x in doc2vecmodel.model.docvecs]
        docids = doc2vecmodel.model.docvecs.doctags.keys()
        features = cluster_params
    else:
        raise Exception("Invalid class")

    k_means = CLUSTER_CLASS(options)
    logger.info("Creating clustering model for group_id {}".format(group_id))
    kmeans_model = k_means.perform_cluster(cluster_params)
    docs_labels = zip(
        docids,
        # convert labels from np.int64 to plain int
        list(map(lambda x: int(x), kmeans_model.model.labels_)))

    # Save to database
    cluster_model.model = kmeans_model
    cluster_model.name = name
    cluster_model.group_id = group_id
    cluster_model.n_clusters = n_clusters
    logger.info("Saving model to database. Group_id".format(group_id))
    cluster_model.save()

    # create features for KMeansDocs
    if CLUSTER_CLASS == KMeansDocs:
        features = []
        vectorizer = kmeans_model.vectorizer
        for txt in texts:
            arr = vectorizer.transform([txt]).toarray()[0]
            compressed = compress_sparse_vector(arr)
            features.append(compressed)

    docids_features = dict(zip(docids, features))
    # Now write to files
    logger.info(
        "Writing clustering results to files. Group id: {}".format(group_id))
    write_clustered_data_to_files(cluster_model, docs_labels,
                                  kmeans_model.model.cluster_centers_,
                                  docids_features)
    # relevant terms can be calculated only after writing other data
    relevant_terms = cluster_model.compute_relevant_terms()
    write_relevant_terms_data(cluster_model, relevant_terms)

    # mark clustering complete as true, and update clustered date
    cluster_model.ready = True
    cluster_model.silhouette_score = cluster_model.calculate_silhouette_score()
    cluster_model.last_clustered_on = timezone.now()
    cluster_model.save()
    return cluster_model
Example 9
def classify_text(classifier, text):
    text = preprocess(text)
    classified = classifier.classify_text(text)
    # sort (label, confidence) pairs by confidence, highest first
    classified.sort(key=lambda x: x[1], reverse=True)
    return classified
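A hedged usage sketch; my_classifier and the input string are illustrative, and classifier.classify_text is assumed to return (label, confidence) pairs, as the sort key above implies:

results = classify_text(my_classifier, "Flight delayed due to weather")
top_label, top_confidence = results[0]  # most confident prediction first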