def DBSCAN_then_BERT_agglomerative(corpus, k=5):
    """Cluster with DBSCAN, then split the DBSCAN outliers into ``k`` groups.

    The corpus is labelled by ``cosine_with_DBSCAN``. The entries stored under
    key ``-1`` (the label scikit-learn's DBSCAN gives to noise samples) are
    embedded with ``embed_corpus`` and grouped by agglomerative clustering.

    Args:
        corpus: Documents passed to ``cosine_with_DBSCAN`` and
            ``get_cluster_dict``. Presumably a sequence of strings --
            confirm against callers.
        k: Number of agglomerative clusters to form from the outliers.

    Returns:
        A two-element list ``[clusters, outlier_clusters]``. ``clusters`` is
        the result of ``cluster_by_index`` on the DBSCAN labels.
        ``outlier_clusters`` is the result of ``re_index_clusters`` applied to
        a dict that maps each agglomerative label ``0..k-1`` to the entries of
        ``clusters[-1]`` assigned to that label, together with the largest
        DBSCAN label.
    """
    cluster_assignment = cosine_with_DBSCAN(corpus, output="list")
    clusters = cluster_by_index(cluster_assignment)
    outliers = clusters[-1]
    cluster_sentences = get_cluster_dict(cluster_assignment, corpus)
    outlier_sentences = cluster_sentences[-1]

    outliers_embeddings = embed_corpus(outlier_sentences)
    clustering_model = AgglomerativeClustering(n_clusters=k)
    clustering_model.fit(outliers_embeddings)
    outlier_assignment = clustering_model.labels_

    # Pre-seed every label so clusters that receive no member still appear.
    outlier_clusters = {label: [] for label in range(k)}

    # The i-th agglomerative label belongs to the i-th entry of ``outliers``;
    # this relies on ``outliers`` and ``outlier_sentences`` sharing an order
    # (NOTE(review): confirm against cluster_by_index / get_cluster_dict).
    for position, label in enumerate(outlier_assignment):
        outlier_clusters[label].append(outliers[position])

    outlier_clusters_re_indexed = re_index_clusters(outlier_clusters,
                                                    max(cluster_assignment))

    return [clusters, outlier_clusters_re_indexed]
def DBSCAN_then_BERT_SVM(corpus):
    """Cluster with DBSCAN, then assign each outlier to a cluster via an SVM.

    An ``svm.SVC`` is trained on the embeddings of every document that DBSCAN
    placed in a cluster (label other than ``-1``), using the DBSCAN labels as
    classes. The documents labelled ``-1`` are then classified with it.

    Args:
        corpus: Documents passed to ``cosine_with_DBSCAN`` and
            ``embed_corpus``. Presumably a sequence of strings -- confirm
            against callers.

    Returns:
        A two-element list ``[original_clusters, outlier_classifications]``.
        ``original_clusters`` is the result of ``cluster_by_index`` on the
        DBSCAN labels. ``outlier_classifications`` is a dict with one key for
        every integer from ``-1`` to the largest DBSCAN label, mapping to the
        list of corpus positions of outliers predicted to belong to that
        label. The ``-1`` key is kept for compatibility with callers; the
        classifier is never trained on that label.
    """
    cluster_assignment = cosine_with_DBSCAN(corpus, output="list")
    corpus_embeddings = embed_corpus(corpus)
    original_clusters = cluster_by_index(cluster_assignment)

    # One pass over the labels: collect training data and outlier positions.
    outlier_indices = []
    non_outlier_assignments = []
    non_outlier_embeddings = []
    for i, label in enumerate(cluster_assignment):
        if label == -1:
            outlier_indices.append(i)
        else:
            non_outlier_assignments.append(label)
            non_outlier_embeddings.append(corpus_embeddings[i])

    clf = svm.SVC()
    clf.fit(non_outlier_embeddings, non_outlier_assignments)

    max_cluster = max(cluster_assignment)
    outlier_classifications = {
        label: [] for label in range(-1, max_cluster + 1)
    }

    # Classify all outliers in a single call instead of one call per sample.
    # Skipped when there are no outliers, since predict rejects empty input.
    if outlier_indices:
        outlier_embeddings = [corpus_embeddings[i] for i in outlier_indices]
        predictions = clf.predict(outlier_embeddings)
        for i, cluster in zip(outlier_indices, predictions):
            outlier_classifications[cluster].append(i)

    return [original_clusters, outlier_classifications]
def BERT_with_agglomerative(corpus, k=10):
    """Embed the corpus and group it into ``k`` agglomerative clusters.

    Args:
        corpus: Documents passed to ``embed_corpus``.
        k: Number of clusters requested from ``AgglomerativeClustering``.

    Returns:
        A one-element list holding the result of ``cluster_by_index`` applied
        to the fitted model's labels.
    """
    embeddings = embed_corpus(corpus)
    labels = AgglomerativeClustering(n_clusters=k).fit(embeddings).labels_
    return [cluster_by_index(labels)]
def BERT_with_kmeans(corpus, k=10):
    """Embed the corpus and partition it into ``k`` clusters with k-means.

    Args:
        corpus: Documents passed to ``embed_corpus``.
        k: Number of clusters requested from ``KMeans``.

    Returns:
        A one-element list holding the result of ``cluster_by_index`` applied
        to the fitted model's labels.
    """
    embeddings = embed_corpus(corpus)
    fitted = KMeans(n_clusters=k).fit(embeddings)
    return [cluster_by_index(fitted.labels_)]
def DBSCAN_then_BERT_KNN(corpus, k=5):
    """Cluster with DBSCAN, then reassign outliers via ``classify_outliers``.

    Args:
        corpus: Documents passed to ``cosine_with_DBSCAN`` and
            ``embed_corpus``.
        k: Forwarded unchanged to ``classify_outliers``; presumably the
            number of neighbours to consult -- confirm against that helper.

    Returns:
        A two-element list: the result of ``cluster_by_index`` on the DBSCAN
        labels, followed by whatever ``classify_outliers`` returns.
    """
    labels = cosine_with_DBSCAN(corpus, output="list")
    indexed_clusters = cluster_by_index(labels)
    # Key -1 is the label DBSCAN uses for samples it left unclustered.
    outlier_members = indexed_clusters[-1]

    embeddings = embed_corpus(corpus)
    reassigned = classify_outliers(labels, embeddings, outlier_members, k)

    return [indexed_clusters, reassigned]
def cosine_with_DBSCAN(corpus, eps=0.6, min_samples=3, output="dict"):
    """Run cosine-distance DBSCAN over TF-IDF vectors of the corpus.

    Args:
        corpus: Documents handed to ``TfidfVectorizer.fit_transform``.
        eps: Forwarded to ``DBSCAN`` as ``eps``.
        min_samples: Forwarded to ``DBSCAN`` as ``min_samples``.
        output: Selects the return format:
            ``"list"`` returns the raw labels from the fitted model;
            ``"sentences"`` returns ``get_cluster_dict(labels, corpus)``;
            any other value (including the default ``"dict"``) returns
            ``cluster_by_index(labels)``.

    Returns:
        The cluster labels in the format selected by ``output``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)

    dbscan = DBSCAN(metric="cosine",
                    algorithm="brute",
                    eps=eps,
                    min_samples=min_samples)
    labels = dbscan.fit_predict(tfidf_matrix)

    if output == "list":
        return labels
    if output == "sentences":
        return get_cluster_dict(labels, corpus)
    return cluster_by_index(labels)