Code Example #1
def search(corpus):
    """Search Demo function"""

    normalized_corpus = normalize_corpus(corpus)
    vectorizer, feature_matrix = build_feature_matrix(
        normalized_corpus, 'tfidf')

    q = input('Enter search query. Press "Enter" to stop: \n')

    while q != '':
        q = normalize_document(q)
        q_tfidf = vectorizer.transform([q])
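        # Score the query against every document: a sparse dot product of tf-idf vectors
        # (this equals cosine similarity when the vectors are L2-normalized, sklearn's default)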
        ans_mat = q_tfidf.dot(feature_matrix.transpose())
        ans_list = []
        for j in range(ans_mat.shape[1]):
            if ans_mat[0, j] > 0:
                ans_list.append((j, ans_mat[0, j]))
        ans_list.sort(key=lambda x: x[1], reverse=True)

        print()
        print('************ {} ************'.format(q))

        for item in ans_list[:5]:
            print()
            print('Document no. {}, rank: {}'.format(item[0], item[1]))
            print(corpus[item[0]])
            print()
            print()

        q = input('Enter search query. Press "Enter" to stop: \n')
    print()
Code Example #2
def main():
    """
    使用KMeans和AP算法进行聚类后发现,如果没有对数据做去重处理的话,
    使用AP算法会得到非常多类,这时可能还是KMeans比较合适
    :return:
    """
    book_data = pd.read_csv('./data/data.csv')  # 读取数据集
    print(book_data.head())

    # Get the book titles and contents
    book_titles = book_data['title'].tolist()
    book_content = book_data['content'].tolist()
    print("Title:", book_titles[0])
    print("Content:", book_content[0])

    # Normalize the book contents: strip punctuation and whitespace so each book becomes one continuous string
    norm_book_content = normalize_corpus(book_content)

    # Extract a tf-idf feature matrix from the normalized book contents
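    # min_df/max_df are meant to drop terms appearing in too small or too large a fraction of the books;
    # ngram_range=(1, 2) also includes bigrams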
    vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                      feature_type='tfidf',
                                                      min_df=0.2, max_df=0.9,
                                                      ngram_range=(1, 2))
    print(feature_matrix)
    print(feature_matrix.shape)

    feature_names = vectorizer.get_feature_names()  # the name of each extracted feature
    print(feature_names[:10])

    num_clusters = 10  # desired number of clusters

    KM(feature_matrix, feature_names, book_data, num_clusters)  # KMeans clustering
    AP(feature_matrix, feature_names, book_data, num_clusters)  # Affinity Propagation clustering
Code Example #3
def updated_topic_extraction(corpus, tm_obj, cluster_num):
    '''
    Main function of topic modeling when a new document is assigned to 
    the nearest cluster
    '''

    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    print("Cluster #{}:".format(cluster_num))
    norm_corpus = normalize_corpus(corpus)
    vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                    feature_type='tfidf')
    feature_names = vectorizer.get_feature_names()

    # Update the model object
    tm_obj.fit_transform(tfidf_matrix)
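    # components_ holds one row of term weights per topic (NMF or LDA)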
    weights = tm_obj.components_

    topics = get_topics_terms_weights(weights, feature_names)
    print_topics_udf(topics=topics,
                     total_topics=n_topics,
                     num_terms=10,
                     display_weights=True)

    # Return the updated model object
    return tm_obj
Code Example #4
def train_lsi_model_gensim(corpus, total_topics=2):
    
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
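    # Build a gensim dictionary (token -> integer id) and convert each document to a bag-of-words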
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
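    # Fit LSI on the tf-idf-weighted corpus; each topic is a weighted combination of dictionary terms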
    lsi = models.LsiModel(corpus_tfidf, 
                          id2word=dictionary,
                          num_topics=total_topics)
    return lsi
Code Example #6
def topic_extraction(documents, labels):
    '''
    Main function of topic modeling
    '''

    num_clusters = len(set(labels))
    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    matched = False
    tm_obj = []
    for c in range(num_clusters):
        print("=" * 70)
        print("Cluster #{}:".format(c))
        corpus = [
            document for i, document in enumerate(documents) if labels[i] == c
        ]
        norm_corpus = normalize_corpus(corpus)
        vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf')
        feature_names = vectorizer.get_feature_names()
        if os.getenv('TOPIC_MODELING') == "lda":
            # Use Latent Dirichlet Allocation for topic modeling
            lda = LatentDirichletAllocation(n_components=n_topics,
                                            max_iter=1000,
                                            learning_method='online',
                                            learning_offset=10.,
                                            random_state=42)
            lda.fit(tfidf_matrix)
            weights = lda.components_
            matched = True
            tm_obj.append(lda)

        if os.getenv('TOPIC_MODELING') == "nmf":
            # Use Nonnegative Matrix Factorization for topic modeling
            nmf = NMF(n_components=n_topics,
                      random_state=42,
                      alpha=.1,
                      l1_ratio=.5)
            nmf.fit(tfidf_matrix)
            weights = nmf.components_
            matched = True
            tm_obj.append(nmf)

        if not matched:
            raise ValueError("Unknown topic modeling algorithm!")

        topics = get_topics_terms_weights(weights, feature_names)
        print_topics_udf(topics=topics,
                         total_topics=n_topics,
                         num_terms=10,
                         display_weights=True)

    return tm_obj
Code Example #7
File: cluster.py Project: michaelliu03/py-project
def process():
    title, content, book_data = load_book_data()
    norm_book_content = normalize_corpus(content)
    # print(norm_book_content)
    # Extract tf-idf features
    vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                      feature_type='tfidf',
                                                      min_df=0.2, max_df=0.90,
                                                      ngram_range=(1, 2))
    print(feature_matrix.shape)
    # Get the feature names
    feature_names = vectorizer.get_feature_names()
    # print(feature_names)

    # Print a few sample features
    print(feature_names[:10])

    num_clusters = 10
    km_obj, clusters = k_means(feature_matrix=feature_matrix,
                               num_clusters=num_clusters)
    book_data['Cluster'] = clusters

    # Count how many books fall into each cluster
    c = Counter(clusters)
    print(c.items())

    # Get per-cluster data
    cluster_data = get_cluster_data(clustering_obj=km_obj,
                                    book_data=book_data,
                                    feature_names=feature_names,
                                    num_clusters=num_clusters,
                                    topn_features=5)

    print_cluster_data(cluster_data)

    plot_clusters(num_clusters=num_clusters,
                  feature_matrix=feature_matrix,
                  cluster_data=cluster_data,
                  book_data=book_data,
                  plot_size=(16, 8))
Code Example #8
for lab in raw_label:
    text = str(lab).strip("['\()]")
    raw_label_1.append(text)

print(raw_data_1[0])
print(raw_label_1[0])

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
    raw_data_1, raw_label_1)

print(train_corpus[0])
print(train_labels[0])

from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

tokenized_train = [nltk.word_tokenize(text) for text in norm_train_corpus]

tokenized_test = [nltk.word_tokenize(text) for text in norm_test_corpus]

# build word2vec model
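# size, window and min_count set the vector dimensionality, context window and minimum word frequency
# (gensim 3.x parameter names)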
model = gensim.models.Word2Vec(tokenized_train,
                               size=700,
                               window=200,
                               min_count=30,
Code Example #9

description = list(df['Description'])
category = list(df['Category'])
# #print category
problem = 'Internet nahi chal raha hai aur kutte raat ko bahut bhokte hai'
temp_post = problem.split('.')
query_post = []
for t in temp_post:
    if t == '':
        continue
    query_post.append(t)
print(query_post)

# Normalize and extract features from the corpus
norm_corpus = normalize_corpus(description, lemmatize=True)
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus, feature_type='tfidf',
                                                        ngram_range=(1, 1), min_df=0.0, max_df=1.0)

features = tfidf_features.todense()
feature_name = tfidf_vectorizer.get_feature_names()

def display_features(features, feature_name):
    dff = pd.DataFrame(data=features, columns=feature_name)
    print(dff)

# display_features(features, feature_name)
#Normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_post, lemmatize=True)
#print norm_corpus
#print norm_query_docs
Code Example #10
def main():
    no_list = 5000
    dataframe = pd.read_csv(
        './data/final_questions_data.csv',
        names=['user', 'college', 'category', 'problems', 'problem_link'])

    answers = list(dataframe['problems'][1:no_list])
    answers = answers[1:no_list]

    if os.path.isfile('norm_corpus.csv'):
        read_df = pd.read_csv('norm_corpus.csv',
                              names=['norm'],
                              index_col=False)
        norm_corpus = read_df['norm'][1:].values.astype('U').tolist()
    else:
        norm_corpus = normalize_corpus(answers, lemmatize=True)
        write_df = pd.DataFrame(norm_corpus)
        write_df.to_csv('norm_corpus.csv', index=False, header=None)
    vectorizer, corpus_features = build_feature_matrix(norm_corpus,
                                                       feature_type='tfidf')

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)

    for answer in answers:
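        # Compare this answer with all the others: BM25 similarity combined with a symmetric sentence_similarity score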
        answers = list(dataframe['problems'][1:no_list])

        answers.remove(convert(answer))
        model_answer = convert(answer)
        print(model_answer)
        # normalize answers
        norm_corpus = normalize_corpus(answers, lemmatize=True)

        # normalize model_answer
        norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

        # extract features from model_answer
        model_answer_features = vectorizer.transform(norm_model_answer)

        for index, doc in enumerate(model_answer):
            doc_features = model_answer_features[index]
            bm25_scores = compute_bm25_similarity(doc_features,
                                                  corpus_features,
                                                  doc_lengths,
                                                  avg_dl,
                                                  corpus_term_idfs,
                                                  k1=1.5,
                                                  b=0.75)
            semantic_similarity_scores = []
            for sentence in answers:
                score = (sentence_similarity(sentence, model_answer[0]) +
                         sentence_similarity(model_answer[0], sentence)) / 2
                semantic_similarity_scores.append(score)
            print('Model Answer', ':', doc)
            print('-' * 40)
            doc_index = 0
            sim_scores = []
            for score_tuple in zip(semantic_similarity_scores, bm25_scores):
                sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
                sim_scores.append(sim_score)
            #print (sim_scores)
            print(
                sorted(range(len(sim_scores)), key=lambda i: sim_scores[i])[-5:])
            break
            print('Ans num: {} Score: {}\nAnswer: {}'.format(
                doc_index + 1, sim_score, answers[doc_index]))
            print('-' * 40)
            doc_index = doc_index + 1

        break
Code Example #11
from gensim import corpora, models
from normalization import normalize_corpus
import numpy as np

toy_corpus = ["The fox jumps over the dog",
              "The fox is very clever and quick",
              "The dog is slow and lazy",
              "The cat is smarter than the fox and the dog",
              "Python is an excellent programming language",
              "Java and Ruby are other programming languages",
              "Python and Java are very popular programming languages",
              "Python programs are smaller than Java programs"]

# LSI topic model
norm_tokenized_corpus = normalize_corpus(toy_corpus, tokenize=True)
norm_tokenized_corpus

dictionary = corpora.Dictionary(norm_tokenized_corpus)
print(dictionary.token2id)

corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
corpus

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

total_topics = 2

lsi = models.LsiModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics=total_topics)
Code Example #12
toy_corpus = [
'The sky is blue and beautiful',
'Look at the bright blue sky!',
'Python is a great Programming language',
'Python and Java are popular Programming languages',
'Among Programming languages, both Python and Java are the most used in Analytics',
'The fox is quicker than the lazy dog',
'The dog is smarter than the fox',
'The dog, fox and cat are good friends']

query_docs = ['The fox is definitely smarter than the dog',
            'Java is a static typed programming language unlike Python',
            'I love to relax under the beautiful blue sky!']  


# normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1), 
                                                        min_df=0.0, max_df=1.0)
                                                        
# normalize and extract features from the query corpus
norm_query_docs =  normalize_corpus(query_docs, lemmatize=True)            
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

def compute_cosine_similarity(doc_features, corpus_features,
                              top_n=3):
    # get document vectors
    doc_features = doc_features[0]
    # compute similarities
    similarity = np.dot(doc_features, 
Code Example #13
movie_data = pd.read_csv('movie_data.csv')

print(movie_data.head())

movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()

print('Movie:', movie_titles[0])
print('Movie Synopsis:', movie_synopses[0][:1000])

from normalization import normalize_corpus
from utils import build_feature_matrix

# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24,
                                                  max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
print(feature_matrix.shape)

# get feature names
feature_names = vectorizer.get_feature_names()

# print sample features
Code Example #14
def main():
    corpus, labels = get_data()
    print("total data size:", len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print("sample:", corpus[10])
    print("label of sample:", labels[10])
    label_name_map = ['spam', 'normal']  # 0 means spam, 1 means normal
    print("actual type:", label_name_map[int(labels[10])])

    # Split the dataset into train and test sets
    train_corpus, train_labels, test_corpus, test_labels = prepare_datasets(
        corpus, labels)
    # Preprocess (normalize) the corpora
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # Bag-of-words model
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf model
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # Tokenize the normalized corpora
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # Word2Vec word vectors
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    # Train classifiers with multinomial Naive Bayes, SVM, and logistic regression, and evaluate each
    mnb = MultinomialNB()  # Naive Bayes
    svm = SGDClassifier()  # linear SVM (hinge loss by default)
    lr = LogisticRegression()  # logistic regression

    print("\nNaive Bayes based on BOW")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nLogistic Regression based on BOW")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nSVM based on BOW")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    print("\nNavie Bayes based on tfidf")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    print("\nLogistic Regression based on tfidf")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    print("\nSVM based on tfidf")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
Code Example #15
File: execute.py Project: bspong/WeCP_Task
def run():
    answers = [
        'Functions are used as one-time processing snippet for inling and jumbling the code.',
        'Functions are used for reusing, inlining and jumbling the code.',
        'Functions are used as one-time processing snippet for inlining and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
        'Functions are used for reusing, inling and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and organizing the code.',
        'Functions are used for reusing, modularizing and jumbling the code.',
        'Functions are used for reusing, modularizing and organizing the code.'
    ]

    model_answer = [
        "Functions are used for reusing, modularizing and organizing the code."
    ]

    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)

    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

    vectorizer, corpus_features = build_feature_matrix(
        norm_corpus, feature_type='frequency')

    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)

    for index, doc in enumerate(model_answer):

        doc_features = model_answer_features[index]
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=1.5,
                                              b=0.75)
        semantic_similarity_scores = []
        for sentence in answers:
            score = (sentence_similarity(sentence, model_answer[0]) +
                     sentence_similarity(model_answer[0], sentence)) / 2
            semantic_similarity_scores.append(score)
        print('Model Answer', ':', doc)
        print('-' * 40)
        doc_index = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
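            # Blend the semantic score (scaled by 10) with the BM25 score, then bucket the blend into a 0-5 grade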
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if (sim_score < 1):
                sim_score = 0
            elif (1 <= sim_score <= 2):
                sim_score = 1
            elif (2 < sim_score <= 4):
                sim_score = 2
            elif (4 < sim_score <= 6):
                sim_score = 3
            elif (6 < sim_score <= 8):
                sim_score = 4
            elif (8 < sim_score <= 10):
                sim_score = 5
            print('Ans num: {} Score: {}\nAnswer: {}'.format(
                doc_index + 1, sim_score, answers[doc_index]))
            print('-' * 40)
            doc_index = doc_index + 1
        print()
Code Example #16
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()

print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print get_top_ngrams(corpus, ngram_val=2, limit=10)
from feature_extractors import build_feature_matrix
import networkx
import numpy as np
import matplotlib
from normalization import normalize_corpus

norm = normalize_corpus(corpus)
# construct a weighted document-term matrix
vec, dt_matrix = build_feature_matrix(norm, feature_type='tfidf')

similarity_matrix = (dt_matrix * dt_matrix.T)
# view document similarity matrix
print(np.round(similarity_matrix.todense(), 2))

# build similarity graph
similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
# networkx.draw_networkx(similarity_graph)
from gensim.summarization import summarize, keywords

def text_summarization_gensim(text, summary_ratio=0.5):
    
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        print(sentence)

docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)

sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences,lemmatize=False) 

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

num_sentences = 3
num_topics = 2

vec, dt_matrix = build_feature_matrix(sentences, 
                                      feature_type='frequency')

td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
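# Low-rank SVD of the term-document matrix: the latent topic space used for LSA-style sentence scoring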
                                         
Code Example #18
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 03 19:33:32 2016

@author: DIP
"""

from nltk.corpus import gutenberg
from normalization import normalize_corpus
import nltk
from operator import itemgetter

alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = list(filter(None, normalize_corpus(alice, lemmatize=False)))

# print first line
print(norm_alice[0])

def flatten_corpus(corpus):
    return ' '.join([document.strip() 
                     for document in corpus])
                         
def compute_ngrams(sequence, n):
    return zip(*[sequence[index:] 
                 for index in range(n)])


def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
Code Example #19
toy_corpus = [
    'The sky is blue and beautiful',
    'Look at the bright blue sky!', 'Python is a great Programming language',
    'Python and Java are popular Programming languages',
    'Among Programming languages, both Python and Java are the most used in Analytics',
    'The fox is quicker than the lazy dog', 'The dog is smarter than the fox',
    'The dog, fox and cat are good friends'
]

# Documents that we will be measuring similarities for
query_docs = [
    'The fox is definitely smarter than the dog',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!'
]

# We normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
# NB: As before it returns the particular 'vectorizer' used as well as the extracted feature matrix
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1),
                                                        min_df=0.0,
                                                        max_df=1.0)

# Similarly, we normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_docs, lemmatize=True)
# We use the same vectorizer that built the corpus feature matrix to transform the query docs
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)


def compute_cosine_similarity(doc_features, corpus_features, top_n=3):
    # Get document vectors
Code Example #20
print(dataset.head())

# Divide data into training and testing sets
train_data = dataset[:25000]
test_data = dataset[25000:]
# Check size (len) and first few elements (head()) of test_data (sub)frame

# Divide the data into the data (review) and the label (sentiment) in both training and testing sets
train_reviews = np.array(train_data['review'])
train_sentiments = np.array(train_data['sentiment'])
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])

# Normalize the training review data using the normalization.py module
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=False,
                                      only_text_chars=True)

# Extract features from these normalized training reviews
# - which features? Try other features using parameters provided in utils.py
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1),
                                                  min_df=0.0,
                                                  max_df=1.0)

from sklearn.linear_model import SGDClassifier

# Build/train an SVM classifier model with the train features extracted from reviews
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features, train_sentiments)
Code Example #21
train_reviews = np.array(train_data['review'])
train_sentiments = np.array(train_data['sentiment'])
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])


sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]
sample_data = [(test_reviews[index],
                test_sentiments[index])
                  for index in sample_docs]

sample_data    

# normalization
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=True,
                                      only_text_chars=True)
# feature extraction                                                                            
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1), 
                                                  min_df=0.0, max_df=1.0)

from sklearn.linear_model import SGDClassifier
# build the model
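# with hinge loss, SGDClassifier is a linear SVM trained by stochastic gradient descent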
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features, train_sentiments)

Code Example #22
print(dataset.target_names)

corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)

print('Sample document:', corpus[10])
print('Class label:', labels[10])
print('Actual class label:', dataset.target_names[labels[10]])

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
    corpus, labels, test_data_proportion=0.3)

from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)

''.strip()

from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

# tfidf features
Code Example #23
File: classifier.py Project: sun830910/NLP_Beginner
def main():
    corpus, labels = get_data()  # load the dataset

    print("Total number of samples:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    print('Sample document:', corpus[10])
    print('Sample label:', labels[10])
    label_name_map = ["spam email", "normal email"]
    print('Actual label:', label_name_map[int(labels[10])],
          label_name_map[int(labels[5900])])

    # Split the data into train and test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)

    from normalization import normalize_corpus

    # Normalize the corpora
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    ''.strip()

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # Bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]
    # build the word2vec model
    model = gensim.models.Word2Vec(tokenized_train,
                                   size=500,
                                   window=100,
                                   min_count=30,
                                   sample=1e-3)

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    svm = SGDClassifier(loss='hinge', n_iter=100)
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes classifier on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # Logistic regression on bag-of-words features
    print("Logistic regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # Multinomial Naive Bayes on tf-idf features
    print("Naive Bayes on tf-idf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # Logistic regression on tf-idf features
    print("Logistic regression on tf-idf features")
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    # SVM on tf-idf features
    print("SVM on tf-idf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    import re

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break

    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
Code Example #24
def main(No_ch=0):
    # Corpus acquisition >>>>>>>> START
    print("Acquiring the depression corpus")
    dic.chunks_paths = []
    dic.loadchunkXML('dpp')
    dic.analyzeChunk('dpp', No_ch)
    dic.chunks_paths = []
    dic.loadchunkXML('dpn')
    dic.analyzeChunk('dpn', No_ch)
    print('Number of chunks in types ', len(dic.types['dpp']))
    print('Number of chunks in types ', len(dic.types['dpn']))
    dic.initialize_class_types('dp')
    dic.appendPost('dpp', 'dp')
    dic.appendPost('dpn', 'dp')
    print('Number of depression instances', len(dic.types['dp']['rows']))
    dic.types['dp']['cols'] = dic.fillOnesZeros('dp')
    print('Y matrix', len(dic.types['dp']['cols']))
    dic.types['dp']['names'] = ['Negative', 'Positive']
    # Corpus acquisition >>>>>>>> END
    # Corpus normalization >>>>>>>>>> START
    norm_train_corpus = norm.normalize_corpus(dic.types['dp']['rows'])
    # Corpus normalization >>>>>>>>>> END
    from feature_extractor import bow_extractor, tfidf_extractor, bow_extractor_maxdf
    from sklearn.feature_selection import mutual_info_classif
    import nltk
    import gensim
    # BOW features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    feature_names = bow_vectorizer.get_feature_names()
    print('Number of features considered', len(feature_names))
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    from sklearn.model_selection import cross_val_predict
    y_predicted = cross_val_predict(nb,
                                    bow_train_features,
                                    dic.types['dp']['cols'],
                                    cv=10)
    evaluator.get_metrics(dic.types['dp']['cols'], y_predicted)
    bow_vectorizer, bow_train_features = bow_extractor_maxdf(norm_train_corpus)
    res = dict(
        zip(
            feature_names,
            mutual_info_classif(bow_train_features,
                                dic.types['dp']['cols'],
                                discrete_features=True)))
    for feat in res.keys():
        print(feat, str(res[feat]), '\n')
    # y_predicted = cross_val_predict( nb, bow_train_features, dic.types['dp']['cols'], cv=10)
    # evaluator.get_metrics(dic.types['dp']['cols'], y_predicted)
    # Corpus acquisition >>>>>>>> START
    print("Acquiring the anorexia corpus")
    dic.chunks_paths = []
    dic.loadchunkXML('axp')
    dic.analyzeChunk('axp', No_ch)
    dic.chunks_paths = []
    dic.loadchunkXML('axn')
    dic.analyzeChunk('axn', No_ch)
    print('Number of chunks in types ', len(dic.types['axp']))
    print('Number of chunks in types ', len(dic.types['axn']))
    dic.initialize_class_types('ax')
    dic.appendPost('axp', 'ax')
    dic.appendPost('axn', 'ax')
    print('Number of anorexia instances', len(dic.types['ax']['rows']))
    dic.types['ax']['cols'] = dic.fillOnesZeros('ax')
    print('Y matrix', len(dic.types['ax']['cols']))
    dic.types['ax']['names'] = ['Negative', 'Positive']
    # Corpus acquisition >>>>>>>> END
    # Corpus normalization >>>>>>>>>> START
    norm_train_corpus = norm.normalize_corpus(dic.types['ax']['rows'])
    # Corpus normalization >>>>>>>>>> END
    # BOW features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    feature_names = bow_vectorizer.get_feature_names()
    print('Number of features considered', len(feature_names))
    nb = MultinomialNB()
    y_predicted = cross_val_predict(nb,
                                    bow_train_features,
                                    dic.types['ax']['cols'],
                                    cv=10)
    evaluator.get_metrics(dic.types['ax']['cols'], y_predicted)
    bow_vectorizer, bow_train_features = bow_extractor_maxdf(norm_train_corpus)
    res = dict(
        zip(
            feature_names,
            mutual_info_classif(bow_train_features,
                                dic.types['ax']['cols'],
                                discrete_features=True)))
    for feat in res.keys():
        print(feat, str(res[feat]), '\n')
Code Example #25
from normalization import normalize_corpus

#print "Step 1::::********************************************Parsing CSV file and converting into array*********************"
results = []
with open("Req_BM2.csv") as csvfile:
    reader = csv.reader(csvfile)  # change contents to floats
    for row in reader:  # each row is a list
        results.append(row[1])

# print results

sentences = results

# normalize corpus
norm_req_synopses = normalize_corpus(sentences,
                                     lemmatize=True,
                                     only_text_chars=False)

# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_req_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.1,
                                                  max_df=0.9,
                                                  ngram_range=(1, 2))

# view number of features
#print feature_matrix.shape

# get feature names
feature_names = vectorizer.get_feature_names()
Code Example #26
File: classifier.py Project: vincent775/test_nlp
def main():
    corpus, labels = get_data()  # load the dataset
    print('Total data size:', len(corpus))
    print('Number of labels:', len(labels))
    corpus, labels = remove_empty_docs(corpus, labels)
    print('Sample document:', corpus[0])
    print('Sample label:', labels[243])
    label_name_map = ['spam email', 'normal email']
    print('Actual:', label_name_map[int(labels[10])],
          label_name_map[int(labels[8908])])
    # Split the data into train and test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)
    print('Training set size:', len(train_corpus))
    print('Test set size:', len(test_corpus))

    from normalization import normalize_corpus
    # Normalize the data
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    ''.strip()

    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # Bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    print(tokenized_train[2:10])
    tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # Train the classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()
    # svm = SGDClassifier(loss='hinge', n_iter=100)
    svm = SGDClassifier(loss='hinge')
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print('Naive Bayes classifier on bag-of-words features')
    mnb_bow_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)
    # Logistic regression on bag-of-words features
    print("Logistic regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)
    # SVM on bag-of-words features
    print('SVM on bag-of-words features')
    svm_bow_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=bow_train_features,
        train_labels=train_labels,
        test_features=bow_test_features,
        test_labels=test_labels)

    # Multinomial Naive Bayes on tf-idf features
    print('Multinomial Naive Bayes on tf-idf features')
    mnb_tfidf_predictions = train_predict_evaluate_model(
        classifier=mnb,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # Logistic regression on tf-idf features
    print('Logistic regression on tf-idf features')
    lr_tfidf_predictions = train_predict_evaluate_model(
        classifier=lr,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    # SVM on tf-idf features
    print('SVM on tf-idf features')
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)

    # Show a few correctly classified and a few misclassified samples
    import re
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels,
                                                svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))
            num += 1
            if num == 4:
                break
Code Example #27
def main():
    label_name_map = ["spam email", "normal email"]

    # Split the data into train and test sets
    train_corpus, test_corpus, train_labels, test_labels = data_preprocess()

    # Normalize: remove special characters
    norm_train_corpus = normalize_corpus(train_corpus)
    norm_test_corpus = normalize_corpus(test_corpus)

    # Bag-of-words features
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # tokenize documents
    # tokenized_train = [jieba.lcut(text) for text in norm_train_corpus]
    # tokenized_test = [jieba.lcut(text) for text in norm_test_corpus]

    # build the word2vec model
    # logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s",level=logging.INFO)
    # model = gensim.models.Word2Vec(tokenized_train,
    #                                size=500,
    #                                window=100,
    #                                min_count=30,
    #                                sample=1e-3)
    # model.save("./vector.model")
    # model=gensim.models.Word2Vec.load("./vector.model")
    # print("已加载词向量模型....")

    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression

    mnb = MultinomialNB()
    svm = SGDClassifier(loss='hinge', n_iter=100)
    lr = LogisticRegression()

    # Multinomial Naive Bayes on bag-of-words features
    print("Naive Bayes classifier on bag-of-words features")
    mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)

    # Logistic regression on bag-of-words features
    print("Logistic regression on bag-of-words features")
    lr_bow_predictions = train_predict_evaluate_model(classifier=lr,
                                                      train_features=bow_train_features,
                                                      train_labels=train_labels,
                                                      test_features=bow_test_features,
                                                      test_labels=test_labels)

    # SVM on bag-of-words features
    print("SVM on bag-of-words features")
    svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                       train_features=bow_train_features,
                                                       train_labels=train_labels,
                                                       test_features=bow_test_features,
                                                       test_labels=test_labels)


    # Multinomial Naive Bayes on tf-idf features
    print("Naive Bayes on tf-idf features")
    mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)
    # Logistic regression on tf-idf features
    print("Logistic regression on tf-idf features")
    lr_tfidf_predictions=train_predict_evaluate_model(classifier=lr,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)


    # SVM on tf-idf features
    print("SVM on tf-idf features")
    svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                         train_features=tfidf_train_features,
                                                         train_labels=train_labels,
                                                         test_features=tfidf_test_features,
                                                         test_labels=test_labels)

    # Take out a few correctly classified and a few misclassified samples
    import re
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 0 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
    num = 0
    for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
        if label == 1 and predicted_label == 0:
            print('Email type:', label_name_map[int(label)])
            print('Predicted email type:', label_name_map[int(predicted_label)])
            print('Text:-')
            print(re.sub('\n', ' ', document))

            num += 1
            if num == 4:
                break
Code Example #28
File: execute.py Project: qguo96/nearest_neighbour
def run():
    """
    answers=['Functions are used as one-time processing snippet for inling and jumbling the code.',
    'Functions are used for reusing, inlining and jumbling the code.',
    'Functions are used as one-time processing snippet for inlining and organizing the code.',
    'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
    'Functions are used for reusing, inling and organizing the code.',
    'Functions are used as one-time processing snippet for modularizing and organizing the code.',
    'Functions are used for reusing, modularizing and jumbling the code.',
    'Functions are used for reusing, modularizing and organizing the code.']

    model_answer = ["Functions are used for reusing, modularizing and organizing the code."]
    """
    dev_questions = []
    dev_question_answers = []
    train_questions = []
    train_question_answers = []
    filep = os.path.dirname(os.path.abspath(__file__))
    #train_file = os.path.join(filep, "NQ-open.train.jsonl")
    #dev_file = os.path.join(filep, "NQ-open.efficientqa.dev.1.1.jsonl")
    train_file = os.path.join(filep, "test_train.jsonl")
    dev_file = os.path.join(filep, "test_dev.jsonl")

    with open(train_file, "r") as f:
        for line in f:
            d = json.loads(line)
            train_questions.append((d["question"]))
            if "answer" not in d:
                d["answer"] = "random"
            train_question_answers.append(d["answer"])

    len_train = len(train_questions)

    with open(dev_file, "r") as f:
        for line in f:
            d = json.loads(line)
            dev_questions.append((d["question"]))
            if "answer" not in d:
                d["answer"] = "random"
            dev_question_answers.append(d["answer"])

    len_dev = len(dev_questions)

    answers = train_questions
    model_answer = dev_questions

    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)
    print(sys.getsizeof(norm_corpus))
    print(len(norm_corpus))
    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)

    vectorizer, corpus_features = build_feature_matrix(
        norm_corpus, feature_type='frequency')

    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)

    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)
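    # doc_lengths, avg_dl and corpus_term_idfs are the corpus statistics that BM25 scoring needs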

    train_predict = [None] * len_dev
    dev_predict = [None] * len_dev
    for index, doc in enumerate(model_answer):
        print(index)
        doc_features = model_answer_features[index]
        #bm25_scores = compute_bm25_similarity(model_answer_features,corpus_features,doc_lengths,avg_dl,corpus_term_idfs,k1=0.82, b=0.68)
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=0.82,
                                              b=0.68)
        exit()
        semantic_similarity_scores = []
        for sentence in answers:
            score = (sentence_similarity(sentence, model_answer[0]) +
                     sentence_similarity(model_answer[0], sentence)) / 2
            semantic_similarity_scores.append(score)
        doc_index = 0
        max_index = 0
        max_score = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if sim_score > max_score:
                max_score = sim_score
                max_index = doc_index
            doc_index = doc_index + 1
        dev_predict[index] = train_question_answers[max_index][0]
    predict_output = [None] * len_dev
    for i in range(len_dev):
        output_dict = {
            'question': dev_questions[i],
            'prediction': dev_predict[i]
        }
        predict_output[i] = output_dict

    pred_file = os.path.join(filep, 'ef_dev_predict.json')
    with open(pred_file, 'w') as output:
        output.write(json.dumps(predict_output, indent=4) + '\n')
Code Example #29
def main():
    corpus, labels = get_data()  # load the dataset

    print("总的数据量:", len(labels))

    corpus, labels = remove_empty_docs(corpus, labels)

    # print('Sample document:', corpus[10])
    # print('Sample label:', labels[10])
    # label_name_map = ["spam email", "normal email"]
    # print('Actual type:', label_name_map[int(labels[10])], label_name_map[int(labels[5900])])  # labels[0:4999] are 1.0, labels[5000:10001] are 0.0
    # print('Actual type:', label_name_map[1], label_name_map[0])

    # Split the data into train and test sets
    train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
        corpus, labels, test_data_proportion=0.3)
    # Normalize and preprocess the data
    from normalization import normalize_corpus

    # Normalize
    norm_train_corpus = normalize_corpus(train_corpus)
    # print(norm_train_corpus[:3])
    norm_test_corpus = [
        '中信(国际)电子科技有限公司推出新产品:升职步步高、做生意发大财、连找情人都用的上,详情进入网址httpwwwusa5588comccc电话:02033770208服务热线:013650852999',
        '向专利局递交申请需要将文件转为PDF格式。我已经将说明书、说明书附图、权利要求书、摘要转化为PDF格式。由于WORED文档转化为PDF文档时公式和变量容易变形,而这种错误在申请递交给专利局之后将无法弥补,所以,请你逐字对照检查,确保PDF文件中没有变形错误,尤其是变量的上标、下标、运算符。'
    ]

    # norm_test_corpus = normalize_corpus(test_corpus)
    # print(norm_test_corpus)
    from feature_extractors import bow_extractor, tfidf_extractor
    import gensim
    import jieba

    # 词袋模型特征
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    """
    bow_train_features:
    (0, 173)	1  第0个列表元素,**词典中索引为173的元素**, 词频
    (0, 54)	1
    (0, 4)	1

    """
    # bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tf-idf features
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # Train the classifiers
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import LogisticRegression
    mnb = MultinomialNB()  # Naive Bayes
    svm = SGDClassifier(loss='hinge', n_iter=100)  # linear SVM
    lr = LogisticRegression()  # logistic regression
    print("SVM on tf-idf features")
    svm_tfidf_predictions = train_predict_evaluate_model(
        classifier=svm,
        train_features=tfidf_train_features,
        train_labels=train_labels,
        test_features=tfidf_test_features,
        test_labels=test_labels)
    print(svm_tfidf_predictions)
Code Example #30
print(dataset.target_names)

corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)

print('Sample document:', corpus[10])
print('Class label:', labels[10])
print('Actual class label:', dataset.target_names[labels[10]])

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                        labels,
                                                                        test_data_proportion=0.3)
                                                                        
from normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)  

''.strip()

from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim

# bag of words features
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)  
bow_test_features = bow_vectorizer.transform(norm_test_corpus) 

# tfidf features
Code Example #31
File: Preprocessing.py Project: yashtatia/Pycharm
from feature_extractors import build_feature_matrix
import pandas as pd
import numpy as np

file_loc = "/media/ytatia/New Volume/Grievances related to Summer Term (Responses).xlsx"
file_loc1 = "/media/ytatia/New Volume/grievances.xlsx"
file_loc2 = "/media/ytatia/New Volume/Hostel related grievances.xlsx"
df = pd.read_excel(file_loc, index_col=0, skiprows=1, header=0, parse_cols="I")
df1 = pd.read_excel(file_loc1,
                    index_col=0,
                    skiprows=1,
                    header=0,
                    parse_cols="I")
df2 = pd.read_excel(file_loc2,
                    index_col=0,
                    skiprows=1,
                    header=0,
                    parse_cols="H")

frames = [df, df1, df2]

result = pd.concat(frames)

doxyDonkeyPosts = result.index
#print doxyDonkeyPosts

query_post = ['The food quality in the mess sucks']

# Normalize and extract features from the corpus
norm_corpus = normalize_corpus(doxyDonkeyPosts, lemmatize=True)
Code Example #32
def normalize_text(text):
    return normalize_corpus(text, lemmatize=True)
Code Example #33
# -*- coding: utf-8 -*-
"""

"""

from nltk.corpus import gutenberg
from normalization import normalize_corpus
import nltk
from operator import itemgetter

alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = list(filter(None, normalize_corpus(alice, lemmatize=False)))

# print first line
print(norm_alice[0])


def flatten_corpus(corpus):
    return ' '.join([document.strip() for document in corpus])


def compute_ngrams(sequence, n):
    return zip(*[sequence[index:] for index in range(n)])


def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)
Code Example #34
print(movie_data.head())

movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()

print('Movie:', movie_titles[0])
print('Movie Synopsis:', movie_synopses[0][:1000])


from normalization import normalize_corpus
from utils import build_feature_matrix

# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)

# extract tf-idf features
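# min_df=0.24 and max_df=0.85 drop terms that appear in too small or too large a fraction of the synopses;
# ngram_range=(1, 2) keeps unigrams and bigrams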
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
print(feature_matrix.shape)

# get feature names
feature_names = vectorizer.get_feature_names()

# print sample features
print(feature_names[:20])
Code Example #35
toy_corpus = ["The fox jumps over the dog",
              "The fox is very clever and quick",
              "The dog is slow and lazy",
              "The cat is smarter than the fox and the dog",
              "Python is an excellent programming language",
              "Java and Ruby are other programming languages",
              "Python and Java are very popular programming languages",
              "Python programs are smaller than Java programs"]


# ### LSI topic model

norm_tokenized_corpus = normalize_corpus(toy_corpus, tokenize=True)
norm_tokenized_corpus

dictionary = corpora.Dictionary(norm_tokenized_corpus)
print(dictionary.token2id)

corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
corpus
Code Example #36
        raw = open(path, encoding='utf-8').read()
        dataset.data = np.append(dataset.data, raw)
    dataset.target = targets.ravel()

    return dataset


print('reading training data corpus ...')
dataset = get_data(whichData='train')
corpus, labels = dataset.data, dataset.target

print('normalizing corpus ...')
norm_corpus = normalize_corpus(corpus)

print('creating BOW features ...')
bow_vectorizer, bow_features = bow_extractor(norm_corpus)
# print(bow_features.shape)

print('creating tfidf features ...')
tfidf_vectorizer, tfidf_features = tfidf_extractor(norm_corpus)  
# print(tfidf_features.shape)

print('creating averaged word vector features ...')
tokenized_corpus = [nltk.word_tokenize(text) for text in norm_corpus]
model = gensim.models.Word2Vec(tokenized_corpus, size=500, window=100, min_count=30, sample=1e-3)
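# Average each document's word vectors into a single fixed-length feature vector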
avg_wv_features = averaged_word_vectorizer(corpus=tokenized_corpus, model=model, num_features=500) 
# print(avg_wv_features.shape)
Code Example #37
File: cluster.py Project: vincent775/test_nlp
                                     max_df=max_df,
                                     ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise Exception(
            "Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'"
        )
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix


# Load the data-normalization module
from normalization import normalize_corpus

norm_book_content = normalize_corpus(book_content)

# Extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_book_content,
                                                  feature_type='tfidf',
                                                  min_df=0.2,
                                                  max_df=0.90,
                                                  ngram_range=(1, 2))

# View the number of features
print('Number of features:', feature_matrix.shape)
# Get the feature names
feature_names = vectorizer.get_feature_names()
# Print a few sample features
print('Sample features:', feature_names[:10])
Code Example #38
from gensim.summarization import summarize, keywords

def text_summarization_gensim(text, summary_ratio=0.5):
    
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        print(sentence)

docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)

sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences,lemmatize=False) 

total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)

num_sentences = 3
num_topics = 2

vec, dt_matrix = build_feature_matrix(sentences, 
                                      feature_type='frequency')

td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)