def q2(content):
    """Cluster question-2 responses by the day/time mentions they contain.

    Splits the corpus into reviews mentioning a day, reviews mentioning a
    specific time, empty reviews, and the rest.  Day reviews are grouped by
    their extracted day label (one cluster per unique day); all specific-time
    reviews form a single 'specific_time' cluster.

    Returns:
        (documents, cluster_info_all) where documents is the re-concatenated
        review list and each cluster_info_all entry is
        [question number, label, freq, centroid_sentence].
    """
    pos_tags = []
    doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus(
        content, pos_tags, question=2)
    doc_day, doc_time = doc_extracted[0], doc_extracted[1]

    label_day = []
    label_time = []
    cluster_info_all = []

    # Extract each label exactly once (the original called
    # day_extract/time_extract twice per review) and record it both in the
    # review tuple and in the label list used for grouping below.
    for idx, sing_review in enumerate(doc_day):
        day_label = day_extract(sing_review)
        doc_day[idx] = (sing_review[0], sing_review[1], day_label)
        label_day.append(day_label)
    for idx, sing_review in enumerate(doc_time):
        time_label = time_extract(sing_review)
        doc_time[idx] = (sing_review[0], sing_review[1], time_label)
        label_time.append(time_label)

    # One cluster per unique day label; unique_day_indices maps each review
    # back to its label's position in unique_day.
    unique_day, unique_day_indices = np.unique(label_day, return_inverse=True)
    unique_day_indices = np.array(unique_day_indices)
    for cluster_idx in range(len(unique_day)):
        label = unique_day[cluster_idx]
        idx_set = np.where(unique_day_indices == cluster_idx)[0]
        # Distinct comprehension variable: the original reused the outer loop
        # index, which leaks out of list comprehensions in Python 2.
        sent = get_Cluster_Centroid([doc_day[member][0] for member in idx_set])
        cluster_info_all.append([2, label, len(idx_set), sent])

    # All specific-time reviews form a single cluster.
    doc_time_cluster = [doc[0] for doc in doc_time]
    sent = get_Cluster_Centroid(doc_time_cluster)
    cluster_info_all.append([2, 'specific_time', len(doc_time), sent])

    return doc_day + doc_time + doc_noimprove + doc_other, cluster_info_all
def q6(content): pos_tags = ['NN', 'NNS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=6) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other), "\n" # Rule-based clustering unclustered_index, nn_extracted, cluster_info = rule_based_clustering.clustering( nn_extracted, doc_nn, question=6) # cluster_info_all format: [[question number, label, freq, centroid_sentence],...] cluster_info_all = [[6, info[0], info[1], info[2]] for info in cluster_info] # LSI + Spectral Clustering nn_extracted_unclustered = [nn_extracted[i][0] for i in unclustered_index] doc_nn_unclustered = [doc_nn[i] for i in unclustered_index] similarity_matrix = auto_clustering.lsi(nn_extracted_unclustered) label_auto, cluster_info = auto_clustering.spectral_clustering( similarity_matrix, nn_extracted_unclustered, doc_nn_unclustered) for i, idx in enumerate(unclustered_index): nn_extracted[idx] = nn_extracted[idx] + (label_auto[i], ) for info in cluster_info: cluster_info_all.append([6, info[0], info[1], info[2]]) return nn_extracted + doc_noimprove + doc_other, cluster_info_all
def q9(content): pos_tags = ['NN', 'NNS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=9) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other) df = post_processing.df_count_tuple(nn_extracted) nn_clean = post_processing.filter_ne(nn_extracted, doc_nn, df, question=9) df = post_processing.df_count(nn_clean) nn_extracted, cluster_info = post_processing.main_category_clustering( df, nn_extracted, nn_clean, doc_nn) # cluster_info_all format: [[question number, label, freq, centroid_sentence],...] cluster_info_all = [[9, info[0], info[1], info[2]] for info in cluster_info] return nn_extracted + doc_noimprove + doc_other, cluster_info_all
def q5(content): pos_tags = ['NN', 'NNS', 'JJ', 'JJR', 'JJS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=5) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other), "\n" # LSI + Spectral Clustering nn_extracted_corpus = [nn_single[0] for nn_single in nn_extracted] similarity_matrix = auto_clustering.lsi(nn_extracted_corpus) label_auto, cluster_info = auto_clustering.spectral_clustering( similarity_matrix, nn_extracted_corpus, doc_nn, cluster_num=10) for idx in range(len(nn_extracted_corpus)): nn_extracted[idx] = nn_extracted[idx] + (label_auto[idx], ) cluster_info_all = [[5, info[0], info[1], info[2]] for info in cluster_info] return nn_extracted + doc_noimprove + doc_other, cluster_info_all