Example #1
def load_dict(dic_path):
    """Load pre-trained word2vec/char2vec vector dictionary."""
    log.info("Start loading dict")
    with open(dic_path, "r") as f:
        word_dic = json.load(f)
    log.info("End loading dict")
    return word_dic
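The loaded JSON is expected to map each word to its embedding vector. A minimal sketch of producing such a file (the file name and the 64-dimensional vector size are assumptions taken from Example #4):

import json

# Hypothetical dictionary: one 64-float vector per word (the size is assumed
# from BaselineModel(64) in Example #4).
word_dic = {
    "hello": [0.12, -0.03] + [0.0] * 62,
    "world": [0.05, 0.44] + [0.0] * 62,
}
with open("word_dic_64.json", "w") as f:
    json.dump(word_dic, f)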
Example #2
def read_origin_data_file(data_file_path):
    """This is used to read the data file"""
    log.info("Start reading the origin data file")
    file_origin_list = []
    with open(data_file_path, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.replace("\n", "")
            file_origin_list.append(line)
    return file_origin_list
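For reference, the readline loop can be written more compactly; a behavior-equivalent sketch (dropping the log call for brevity):

def read_origin_data_file(data_file_path):
    """Read the file into a list of lines without trailing newlines."""
    with open(data_file_path, "r") as f:
        return [line.rstrip("\n") for line in f]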
Example #3
def calculate_tf_idf_matrix(file_origin_list):
    """This is used to calculate the tf-idf matrix value"""
    log.info("Start calculating the tf idf matrix")
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()

    # The first fit_transform computes tf-idf; the second fit_transform
    # converts the text into a term-frequency (count) matrix
    tfidf = transformer.fit_transform(
        vectorizer.fit_transform(file_origin_list))

    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    word_list = vectorizer.get_feature_names_out()

    # convert tfidf to csr_matrix
    tfidf_sparse_matrix = csr_matrix(tfidf)
    return word_list, tfidf_sparse_matrix
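A quick usage sketch on a toy two-document corpus (documents are whitespace-separated tokens, matching how the data files are read in Example #2; this assumes the scikit-learn imports and module-level log that these excerpts omit):

docs = ["12 34 34 56", "34 78 90"]
word_list, tfidf_sparse_matrix = calculate_tf_idf_matrix(docs)
print(word_list)                  # the five distinct tokens, e.g. '12', '34', ...
print(tfidf_sparse_matrix.shape)  # (2, 5): one row per document, one column per token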
Example #4
def main():
    train_word_file = "/home/chenyu/daguan/data/train_word"
    test_word_file = "/home/chenyu/daguan/data/test_word"
    train_label_file = "/home/chenyu/daguan/data/train_label"
    dict_file = "/home/chenyu/daguan/output/word_dic_64.json"
    #model_save_path = "/home/chenyu/daguan/model/basic_svm"
    basic_model = BaselineModel(64)

    train_vector, train_label, train_cv_vector, train_cv_label, test_vector = generate_model_input_data(
        train_word_file, train_label_file, test_word_file, dict_file,
        basic_model.embedding_size)

    c_value = [1, 10, 50, 100]
    for c in c_value:
        model_save_path = "/home/chenyu/daguan/model/basic_svm_" + str(c)
        basic_model.fit(c, model_save_path, train_vector, train_label)
        train_predict_result = basic_model.predict(model_save_path,
                                                   train_vector)
        cv_predict_result = basic_model.predict(model_save_path,
                                                train_cv_vector)

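        # Note: micro-averaged precision equals accuracy for single-label multiclass data.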
        accuracy_score = metrics.precision_score(train_label,
                                                 train_predict_result,
                                                 average='micro')
        F1_score = metrics.f1_score(train_label,
                                    train_predict_result,
                                    average='weighted')
        log.info("The following is the result for c value " + str(c))
        log.info("The accuracy for train data is " + str(accuracy_score))
        log.info("The f1 score for train data is " + str(F1_score))

        accuracy_score = metrics.precision_score(train_cv_label,
                                                 cv_predict_result,
                                                 average='micro')
        F1_score = metrics.f1_score(train_cv_label,
                                    cv_predict_result,
                                    average='weighted')
        log.info("The accuracy for cv data is " + str(accuracy_score))
        log.info("The f1 score for cv data is " + str(F1_score))
Example #5
def create_file_vector(train_file_path, test_file_path, dic_path,
                       embedding_size):
    """This is used to create the file vector represent
    This baseline method is as follow:
        1. train word2vec
        2. calculate tf-idf
        3. sort tf-idf value, get Top 128 word
        4. get average vector value of Top 128 word as file vector represent
    """
    # Read train file
    train_origin_list = read_origin_data_file(train_file_path)
    test_origin_list = read_origin_data_file(test_file_path)
    file_origin_list = train_origin_list + test_origin_list

    word_list, tf_idf_sparse_matrix = calculate_tf_idf_matrix(file_origin_list)
    word_dic = load_dict(dic_path)
    file_vector_list = []

    # Create word vector array
    word_vector = []
    for word in word_list:
        if word in word_dic:  # dict.has_key() was removed in Python 3
            word_vector.append(word_dic[word])
        else:
            # use embedding_size instead of a hard-coded 64
            word_vector.append([0.0] * embedding_size)
    word_vector = np.array(word_vector)

    # Loop over all files to create the file vector representations
    log.info("The number of files is %d" % (len(file_origin_list)))
    log.info("The number of words is %d" % (len(word_list)))
    for i in range(len(file_origin_list)):
        if i % 1000 == 0:
            log.info("Processed %d files so far" % (i))

        normal_array = tf_idf_sparse_matrix[i].toarray()
        cur_file_vector = calculate_file_vector(word_vector, normal_array)
        file_vector_list.append(cur_file_vector)

    return (file_vector_list[:len(train_origin_list)],
            file_vector_list[len(train_origin_list):])
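calculate_file_vector is not shown in these excerpts; a minimal sketch consistent with the docstring (average the embeddings of the top-128 words by tf-idf; the top_k parameter and the handling of absent words are assumptions):

import numpy as np

def calculate_file_vector(word_vector, normal_array, top_k=128):
    """Hypothetical helper: average the embeddings of the words with the
    highest tf-idf values in this file (steps 3-4 of the docstring above)."""
    weights = normal_array.reshape(-1)             # dense tf-idf row, shape (n_words,)
    top_idx = np.argsort(weights)[::-1][:top_k]    # indices of the top-k tf-idf words
    top_idx = top_idx[weights[top_idx] > 0]        # ignore words absent from the file
    if top_idx.size == 0:
        return np.zeros(word_vector.shape[1])
    return word_vector[top_idx].mean(axis=0)       # average of their embeddings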
Example #6
    def predict(self, model_save_path, file_vector_list):
        log.info("Start predicting the model")
        classifier = joblib.load(model_save_path)
        predict_result = classifier.predict(file_vector_list)
        return predict_result
Example #7
    def fit(self, value_c, model_save_path, train_vector, train_label):
        log.info("Start fitting the model")
        classifier = svm.LinearSVC(random_state=0, C=value_c)
        classifier.fit(train_vector, train_label)
        joblib.dump(classifier, model_save_path, compress=3)
        log.info("End fitting the model")