Example 1
    def text_mining(self, x_train, x_test, y_train, y_test):
        x_train_msg = []
        x_test_msg = []
        crf = CRFWordSegment()
        # Segment each message into words and re-join with spaces so the
        # downstream vectorizer can tokenize on whitespace.
        for x_msg in x_train:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_train_msg.append(data_msg)

        for x_msg in x_test:
            data_lst = crf.crfpp(x_msg.message)
            data_msg = ' '.join(data_lst)
            x_test_msg.append(data_msg)

        text_clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', RandomForestClassifier())])
        start_time = time.time()
        text_clf = text_clf.fit(x_train_msg, y_train)
        total_time = time.time() - start_time
        # Record the average training time per sample.
        self.time_train_text.append(total_time / len(y_train))

        start_time = time.time()
        y_pred = text_clf.predict(x_test_msg)
        total_time = time.time() - start_time
        # Record the average prediction time per sample.
        self.time_predict_text.append(total_time / len(y_pred))

        f1 = f1_score(y_test, y_pred)
        return f1
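
CRFWordSegment itself is not shown on this page; the snippets only rely on its crfpp(text) method returning a list of word tokens (it presumably wraps a trained CRF segmentation model for unspaced text such as Thai). A minimal hypothetical stand-in with the same interface, useful only for dry-running the examples, could look like:

class CRFWordSegmentStub:
    # Hypothetical stand-in: the real class runs a trained CRF model;
    # this stub just splits on whitespace so the snippets can execute.
    def crfpp(self, message):
        return message.split()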
Example 2
def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    x_corpus = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)

    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)

    x_corpus.extend(x_train_msg)
    x_corpus.extend(x_test_msg)
    vectorizer = TfidfVectorizer()
    tfidf_corpus = vectorizer.fit_transform(x_corpus).toarray()

    tfidf_train = tfidf_corpus[0:len(x_train_msg)]
    tfidf_test = tfidf_corpus[len(x_train_msg):len(tfidf_corpus)]
    y_pred = cls_cos_sim(tfidf_test, tfidf_train, y_train)

    f1 = f1_score(y_test, y_pred)
    return f1
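
cls_cos_sim is called here but not defined on this page. Judging from the call site (test vectors, train vectors and train labels in, predicted labels out), a plausible reconstruction is a 1-nearest-neighbour classifier over cosine similarity; the version below is a sketch under that assumption, not the original helper:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cls_cos_sim(x_test, x_train, y_train):
    # Label each test vector with the label of its most cosine-similar
    # training vector (1-NN on cosine similarity).
    sims = cosine_similarity(x_test, x_train)
    nearest = np.argmax(sims, axis=1)
    return [y_train[i] for i in nearest]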
Example 3
 def process(self, word_list):
     ret = []
     lst = self.remove_dup_sentense(word_list)
     crf = CRFWordSegment()
     for l in lst:
         ret.append(crf.crfpp(unicode(l, 'utf8'))[0])
     return ret
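
unicode() exists only in Python 2, so this method fails on Python 3. A Python 3 equivalent, assuming the input lines may arrive as bytes, decodes explicitly:

def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        # Decode bytes explicitly instead of relying on Python 2's unicode().
        text = l.decode('utf8') if isinstance(l, bytes) else l
        ret.append(crf.crfpp(text)[0])
    return ret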
Example 4
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with codecs.open('data/db/filterel4000.json', 'r', 'utf-8') as f:
        lines = f.readlines()
        data_obj = []
        for data in lines:
            json_data = json.loads(data)
            # Skip ambiguous labels and records without a usable tag_with.
            if (json_data['cred_value'] == 'maybe'
                    or json_data['tag_with'] == 'NaN'):
                continue

            mapping = NewDataMapping()
            message = json_data['message']
            mapping.message = message
            if json_data['cred_value'] == 'no':
                mapping.prediction_result = 0
            else:
                mapping.prediction_result = 1
            # Social/engagement features, collected in a fixed order.
            social_features = [int(json_data[key]) for key in (
                'likes', 'shares', 'comments', 'url', 'hashtag', 'images',
                'vdo', 'location', 'non_location', 'share_only_friend',
                'is_public', 'feeling_status', 'tag_with')]
            mapping.social_features = social_features

            # Text statistics: length, punctuation, and dictionary coverage.
            text_features = []
            text_features.append(len(message))
            text_features.append(message.count('?'))
            text_features.append(message.count('!'))
            message_lst = nlp.crfpp(message)
            number_in_dict = dict_list & set(message_lst)
            out_side_dict = len(message_lst) - len(number_in_dict)
            text_features.append(len(message_lst))
            text_features.append(len(number_in_dict))
            text_features.append(out_side_dict)
            mapping.text_features = text_features

            social_and_text_features = []
            social_and_text_features.extend(social_features)
            social_and_text_features.extend(text_features)
            mapping.social_and_text_features = social_and_text_features

            data_obj.append(mapping)

    # Close the output file deterministically via a context manager.
    with open('data/newresult/data/data_obj.obj', 'wb') as out:
        pickle.dump(data_obj, out)
    return data_obj
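
dict_list is a module-level set of known dictionary words used for the in/out-of-dictionary counts; it is not built on this page. A hypothetical loader, assuming a one-word-per-line lexicon file (the path is illustrative):

# Hypothetical: load the reference lexicon into the dict_list set used above.
with codecs.open('data/dict/lexicon.txt', 'r', 'utf-8') as f:
    dict_list = set(line.strip() for line in f if line.strip())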
Example 5
def topic_text_social(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_cropus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)

    x_cropus.extend(x_train_msg)

    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)

    x_cropus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_cropus)

    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)

    tf_id = tf_id.toarray()
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].social_features)
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)

    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]

    y_pred = cls_cos_sim(x_tf_and_feature_test, x_tf_and_feature_train,
                         y_train)

    f1 = f1_score(y_test, y_pred)
    return f1
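
A minimal driver for these evaluation functions, assuming data_obj comes from the load_data() in Example 4 and using a simple holdout split (a sketch, not part of the original code):

from sklearn.model_selection import train_test_split

data = load_data()
labels = [d.prediction_result for d in data]
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42)
print('F1:', topic_text_social(x_train, x_test, y_train, y_test))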
Example 6
 def process(self, word_list):
     ret = []
     lst = self.remove_dup_sentense(word_list)
     crf = CRFWordSegment()
     for l in lst:
         try:
             ret.append(crf.crfpp(unicode(l, 'utf8')))
         except Exception:
             # Skip sentences the segmenter cannot handle.
             pass
     return ret
Example 7
 def process(self, word_list):
     ret = []
     lst = self.remove_dup_sentense(word_list)
     crf = CRFWordSegment()
     for l in lst:
         try:
             ret.append(crf.crfpp(l))
         except Exception:
             pass
     return ret
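
Swallowing every exception hides segmentation failures. A variant that keeps the same contract but logs the failing sentence (a sketch, assuming the standard logging module is acceptable here):

import logging

def process(self, word_list):
    ret = []
    lst = self.remove_dup_sentense(word_list)
    crf = CRFWordSegment()
    for l in lst:
        try:
            ret.append(crf.crfpp(l))
        except Exception:
            # Record the failure instead of discarding it silently.
            logging.warning('crfpp failed on sentence: %r', l)
    return ret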
Example 8
def topic_and_text(x_train, x_test, y_train, y_test):
    from sklearn.feature_extraction.text import TfidfVectorizer
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    x_cropus = []
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)

    x_cropus.extend(x_train_msg)

    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)

    x_cropus.extend(x_test_msg)
    tf = TfidfVectorizer()
    tf_id = tf.fit_transform(x_cropus)

    x_all = []
    x_all.extend(x_train)
    x_all.extend(x_test)

    tf_id = tf_id.toarray()
    tf_and_feature = []
    for i in range(0, len(tf_id)):
        all_data = []
        all_data.extend(tf_id[i])
        all_data.extend(x_all[i].text_features)
        tf_and_feature.append(all_data)

    x_tf_and_feature_train = tf_and_feature[0:len(x_train_msg)]
    x_tf_and_feature_test = tf_and_feature[len(x_train_msg):len(tf_id)]

    clf = RandomForestClassifier()
    clf.fit(x_tf_and_feature_train, y_train)

    y_pred = clf.predict(x_tf_and_feature_test)
    return get_result(y_test, y_pred)
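
get_result is not defined on this page. Given that the sibling text-mining function returns an F1 score, a plausible reconstruction that reports the usual binary-classification metrics is sketched below; the real helper may differ:

from sklearn.metrics import precision_score, recall_score, f1_score

def get_result(y_test, y_pred):
    # Hypothetical reconstruction: report precision, recall and F1.
    return (precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            f1_score(y_test, y_pred))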
Example 9
def topic_feature_process(x_train, x_test, y_train, y_test):
    x_train_msg = []
    x_test_msg = []
    crf = CRFWordSegment()
    for x_msg in x_train:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_train_msg.append(data_msg)

    for x_msg in x_test:
        data_lst = crf.crfpp(x_msg.message)
        data_msg = ' '.join(data_lst)
        x_test_msg.append(data_msg)

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier())])

    text_clf = text_clf.fit(x_train_msg, y_train)
    y_pred = text_clf.predict(x_test_msg)
    return get_result(y_test, y_pred)
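
Once fitted, the pipeline can score new messages as long as they are segmented the same way as the training data. An illustrative fragment (new_message is a hypothetical raw string; crf and text_clf are as in the function above):

# Segment the new message, then predict on the space-joined tokens.
segmented = ' '.join(crf.crfpp(new_message))
label = text_clf.predict([segmented])[0]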
Example 10
def load_data():
    print('start...')
    nlp = CRFWordSegment()
    with open('data/db/filterel4000.json') as f:
        lines = f.readlines()
        data_obj = []
        for data in lines:
            json_data = json.loads(data)
            if (json_data['cred_value'] == 'maybe'
                    or json_data['tag_with'] == 'NaN'):
                continue

            mapping = MappingData()
            message = json_data['message']
            mapping.message = message
            if json_data['cred_value'] == 'no':
                mapping.prediction_result = 0
            else:
                mapping.prediction_result = 1
            # Same fixed-order social features as in the loader above.
            feature_data = [int(json_data[key]) for key in (
                'likes', 'shares', 'comments', 'url', 'hashtag', 'images',
                'vdo', 'location', 'non_location', 'share_only_friend',
                'is_public', 'feeling_status', 'tag_with')]
            mapping.feature_list = feature_data

            feature_and_word_data = feature_data[:]

            feature_and_word_data.append(len(message))
            feature_and_word_data.append(message.count('?'))
            feature_and_word_data.append(message.count('!'))

            message_lst = nlp.crfpp(message)
            number_in_dict = dict_list & set(message_lst)
            out_side_dict = len(message_lst) - len(number_in_dict)
            feature_and_word_data.append(len(message_lst))
            feature_and_word_data.append(len(number_in_dict))
            feature_and_word_data.append(out_side_dict)
            mapping.feature_and_word_list = feature_and_word_data
            data_obj.append(mapping)

        with open('data/data/data4000.data', 'wb') as out:
            pickle.dump(data_obj, out)
        print('end load...')
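
The pickled list written above can be reloaded later without re-running the segmentation step:

import pickle

with open('data/data/data4000.data', 'rb') as f:
    data_obj = pickle.load(f)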