Example #1
def main():
    # Parameters
    use_stopwords = True
    character_level = False  # whether to use character-level (single-character) tokens

    raw_set_name = 'THUCNews'
    stopwords_name = 'cn_stopwords_punctuations.csv'
    new_set_name = 'THUCNews.txt'
    cut_set_name = 'THUCNews_jieba.txt'

    # Paths
    init_path = os.getcwd()

    data_dir = os.path.join(init_path, 'datasets')
    stopwords_path = os.path.join(data_dir, stopwords_name)
    raw_set_path = os.path.join(data_dir, raw_set_name)
    new_set_path = os.path.join(data_dir, new_set_name)
    cut_set_path = os.path.join(data_dir, cut_set_name)

    # Transforming and Dumping
    x_texts, y_labels = get_texts_from_source(raw_set_path)
    pd.DataFrame({'label': y_labels, 'text': x_texts})\
        .to_csv(new_set_path, sep='\t', index=False, header=True)

    if use_stopwords:
        stopwords = pp.get_stopwords(stopwords_path)
        x_texts = pp.tokenize_texts(x_texts, stopwords, character_level)
    else:
        x_texts = pp.tokenize_texts(x_texts, character_level=character_level)

    pd.DataFrame({'label': y_labels, 'text': x_texts})\
        .to_csv(cut_set_path, sep='\t', index=False, header=True)
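All of the examples on this page rely on a project-local get_stopwords helper whose implementation is not shown, and its signature varies: most take a stopword-file path, Examples #3 and #7 call it with no argument, and Example #8 passes it a token list to filter. A minimal sketch of the path-based variant, assuming a plain text file with one stopword per line, might look like this (illustrative only, not any one project's actual helper):

def get_stopwords(sw_path, encoding='utf-8'):
    """Load a stopword file (one token per line) into a set. Sketch only."""
    with open(sw_path, encoding=encoding) as f:
        return {line.strip() for line in f if line.strip()}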
Example #2
def titile_tokens():
    cnt = 0
    try:
        sw_path = os.path.join(cwd, "text/news_stopwords.txt")
        sw_list = prep.get_stopwords(sw_path)
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.token_war.find({})

        for i in target:
            if "title_keywords" not in i.keys() or not i['title_keywords']:
                cnt += 1
                title_keywords = []
                if len(i['keywords']) > 1:  # the article has keywords
                    title_keywords = i['title'] + '\t\t' + ' '.join(
                        i['keywords'])
                else:
                    title_keywords = i['title']
                tokens_string = ' '.join(
                    prep.tokenize(title_keywords,
                                  sw_list=sw_list,
                                  language='CN'))
                db.token_war.update_one(
                    {'number': i['number']},
                    {'$set': {
                        'title_keywords': tokens_string
                    }})
        print "%d title tokenize completed!" % cnt

        conn.close()
    except Exception as e:
        print "From:art_tokenize:\t\nUnexpect Error: {}".format(e)
Example #3
def testwebhook():
    if request.method == 'GET':
        return verify_webhook(request)
    elif request.method == 'POST':
        return do_conversational_flow(bot, request, utterances, model, encoder,
                                      vector, stemmer, get_stopwords())
    else:
        return 'Error de solicitud'  # "Request error"
Example #4
def get_new_vec(model, docs, sw_path=default_sw_path, language="CN"):
    sw_list = get_stopwords(sw_path)
    new_doc_vec = []
    try:
        for doc in docs:
            tokens = tokenize(doc, sw_list, language=language)
            doc_vec = model.infer_vector(tokens)
            new_doc_vec.append(doc_vec)
        return new_doc_vec
    except Exception as e:
        print "From get_new_vec:\n\tUnexpect Error:{}".format(e)
Example #5
def __iter__(self):
    sw_list = get_stopwords(self.sw_path)
    f = open(self.file_path)
    csv_reader = csv.reader(f, delimiter='\t')
    for i, line in enumerate(csv_reader):
        if i + 1 > self.lines:
            self.lines = i + 1
            self.label_list.append(line[0])  # get the doc label
        tag = "%s_%s" % (self.file_name, str(i))
        yield doc2vec.TaggedDocument(tokenize(line[1], sw_list, self.t),
                                     tags=[tag])
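This __iter__ belongs to a corpus class that is not shown; the attribute names below are inferred from the method body and should be treated as assumptions. A minimal sketch of how such a class could be assembled and fed to gensim's Doc2Vec (file paths are illustrative):

import csv
import os
from gensim.models import doc2vec

class TaggedCorpus(object):
    """Hypothetical wrapper around the __iter__ shown in Example #5."""
    def __init__(self, file_path, sw_path, language='CN'):
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)
        self.sw_path = sw_path
        self.t = language        # forwarded to tokenize()
        self.lines = 0
        self.label_list = []

    # __iter__ from Example #5 goes here

corpus = TaggedCorpus('datasets/train.tsv', 'text/news_stopwords.txt')
model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)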
Example #6
def main():
    crawl_state = crawl.main()
    sp = os.path.join(cwd, "text/news_stopwords.txt")
    sw_list = prep.get_stopwords(sp)
    a = art_tokenize()
    b = rm_sp_tokens(sw_list)
    c = pos_clean_tokens()
    titile_tokens()
    title_clean_tokens()
    crawl_state = crawl_state + '\n' + a + '\n' + b + '\n' + c
    print "From the mongodb_io.py:"
    print crawl_state
    return crawl_state
Example #7
def extract_bag_of_words(df, use_cached=True):
    if use_cached and os.path.isfile('./cache/_cached_bag_of_words.pkl'):
        with open('./cache/_cached_bag_of_words.pkl', 'rb') as infile:
            tokens = pickle.load(infile)
            return tokens
    stopwords = get_stopwords()
    tokens = {}
    for n in range(df.shape[0]):
        if not n % 1000:
            print(n)
        tokens[df.iloc[n]['status_id']] = pipe(df.iloc[n]['message'], stopwords)
    with open('./cache/_cached_bag_of_words.pkl', 'wb') as outfile:
        pickle.dump(tokens, outfile)
    return tokens
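A usage sketch for the cache-backed extractor above, assuming the pipe and get_stopwords helpers it calls are importable; the DataFrame must carry status_id and message columns (column names taken from the code, the data is illustrative), and the ./cache/ directory must exist:

import os
import pandas as pd

os.makedirs('./cache', exist_ok=True)
df = pd.DataFrame({
    'status_id': [101, 102],
    'message': ['first example post', 'second example post'],
})
tokens = extract_bag_of_words(df, use_cached=False)  # dict keyed by status_id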
Example #8
def get_word_count(x):
    """
    return normal word count, stop word count, unusual word count
    """
    wc = x.apply(lambda text: len(word_tokenize(text)))
    unique_wc = x.apply(lambda text: len(np.unique(tokenizer(text))))
    stop_wc = x.apply(lambda text: len(get_stopwords(tokenizer(text))))
    unusual_wc = x.apply(lambda text: len(get_unusual_words(tokenizer(text))))
    return pd.DataFrame(
        {
            '{}_word_count'.format(x.name): wc,
            '{}_unique_word_count'.format(x.name): unique_wc,
            '{}_stopword_count'.format(x.name): stop_wc,
            '{}_unusual_word_count'.format(x.name): unusual_wc,
            '{}_total_word_count'.format(x.name): wc + stop_wc + unusual_wc
        }
    )
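A usage sketch: x is expected to be a named pandas Series of raw text, and the helpers tokenizer, get_stopwords, and get_unusual_words, plus NLTK's word_tokenize, are assumed to be importable.

import pandas as pd

texts = pd.Series(['The quick brown fox.', 'It jumps over the lazy dog.'], name='body')
counts = get_word_count(texts)
# columns: body_word_count, body_unique_word_count, body_stopword_count,
#          body_unusual_word_count, body_total_word_count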
Example #9
def tokenize():
    content = request.args.get('content', None, type=str)

    print "From view.py tokenize:%s" % content
    sw_list = prep.get_stopwords(
        "/home/skipper/study/python/project/text/news_stopwords.txt")
    tokens = prep.tokenize(content, sw_list, language="CN")
    for i in tokens:
        print i
    tokens_string = '/'.join(tokens)
    print tokens_string
    pos_tag, pos_string = prep.pos_test(tokens)

    print "From view.py tokenize: %s\n%s\n%s\n" % (tokens_string, pos_tag,
                                                   pos_string)
    detail = {}
    detail['tokens_string'] = tokens_string
    detail['pos_tag'] = pos_tag
    detail['pos_string'] = pos_string

    return jsonify(detail)
Example #10
# Paths
init_path = os.getcwd()

data_dir = os.path.join(init_path, 'datasets')
stopwords_path = os.path.join(data_dir, stopwords_name)

save_dir = os.path.join(init_path, 'saved_models')
model_path = os.path.join(save_dir, model_name)
tokenizer_path = os.path.join(save_dir, tokenizer_name)
labels_path = os.path.join(save_dir, labels_index)

graph = get_default_graph()

# stopwords, label names
stopwords = pp.get_stopwords(stopwords_path)
with open(labels_path) as f:
    labels_name = f.readlines()
labels_name = [x.split('\t') for x in labels_name]
labels_name = {int(x[0]): x[1] for x in labels_name}

# model, tokenizer
model = load_model(model_path, compile=True)
with open(tokenizer_path, 'rb') as f:
    tokenizer = pickle.load(f)


def get_pred(texts):
    """Predict the category of each input text."""
Example #11
def main(date_start=None,
         date_end=None,
         n_clusters=0,
         days=5,
         saving=0,
         show=0):
    if saving == 1:
        import myvisual as visual
    else:
        import myvisual_show as visual
    sw_path = os.path.join(cwd, "text/news_stopwords.txt")
    sw_list = prep.get_stopwords(sw_path)

    if date_start and date_end:
        s_eles = date_start.split('/')
        s_eles = [int(i) for i in s_eles]
        s_eles = [str(i) for i in s_eles]
        d_st = datetime(int(s_eles[0]), int(s_eles[1]), int(s_eles[2]))
        e_eles = date_end.split('/')
        e_eles = [int(i) for i in e_eles]
        e_eles = [str(i) for i in e_eles]
        d_end = datetime(int(e_eles[0]), int(e_eles[1]), int(e_eles[2]), 23,
                         59, 59)
        if n_clusters == 0:
            base_path = '_'.join(s_eles) + '-' + '_'.join(e_eles) + '-orign'
        else:
            base_path = '_'.join(s_eles) + '-' + '_'.join(
                e_eles) + '-Clusters_%s' % str(n_clusters)
        print "The store paht: %s" % base_path
    else:
        c_t = time.localtime(time.time())
        tag_time = crawl.get_point_time(
            days)  # 0 means today at 00:00, -1 means tomorrow
        b = str(tag_time).split()[0].split('-')
        b = [int(i) for i in b]
        b = [str(i) for i in b]
        # modify 9/19/2017
        # add
        date_start = '/'.join(i for i in b)
        date_end = '/'.join([str(c_t[0]), str(c_t[1]), str(c_t[2])])
        # print "-------"
        # print "%s---%s"%(date_start, date_end)
        d_st = tag_time
        d_end = datetime(c_t[0], c_t[1], c_t[2], c_t[3], c_t[4], c_t[5])
        # print d_st, d_end
        # add done

        if n_clusters == 0:
            base_path = '_'.join(b) + '-' + '_'.join(
                [str(c_t[0]), str(c_t[1]),
                 str(c_t[2])]) + '-orign'
        else:
            base_path = '_'.join(b) + '-' + '_'.join(
                [str(c_t[0]), str(c_t[1]),
                 str(c_t[2])]) + '-Clusters_%s' % str(n_clusters)
        print "The store path: %s" % base_path
    flask_path = '/static/img/' + base_path
    print flask_path

    if os.path.exists(os.path.join(cwd, 'app') + flask_path) and show == 0:
        p_list = ['mds.jpg', 'pca.jpg', 'bar.jpg', 'pie.jpg']
        p_list = [flask_path + '/' + i for i in p_list]
        p_list.append(base_path)
        print "The main function return file path list:"
        print p_list
        return p_list
    else:
        path = os.path.join(cwd, 'app') + flask_path
        if saving == 1:
            os.mkdir(path)

    # changing
        print "Loading data from the MongoDB......"
        # if date_start and date_end:
        # 	target = mgio.get_target_1(d_st, d_end)
        # else:
        # 	target = mgio.get_target(tag_time)
        target = mgio.get_target_1(d_st, d_end)
        # changed

        # modify 9/19/2017
        # MongoDB may have no news for the specified time period
        # add
        if not target.count():
            print "Oh Oh Oh   No news between %s and %s" % (d_st, d_end)
            print "Crawling the news form the website..."
            # crawl.main()
            mgio.main()
            target = mgio.get_target_1(d_st, d_end)
        # return
    # Runs on the first execution of each test; when using Jupyter, the notebook file needs to be recreated
    # add done

        data = prep.convert_to_dataframe(target)
        cnt = len(data)
        doc_id = list(data.id)

        print "The number of article: %d" % cnt
        print "Detail as follows:"
        df = pd.DataFrame(data, columns=['id', 'date', 'title', 'url'])
        print df

        tokens_string = list(data.im_tokens)
        d = {'id': data['id'], 'date': data['date'], 'keywords': tokens_string}
        df = pd.DataFrame(d)
        print df

        if cnt > 0:
            print "Creating TF TDM......"
            tf_matrix, dictionary = mycluster.get_tf_matrix(tokens_string)
            print "The length of dictionary: %d" % len(dictionary)
            print "Creating TF-IDF TDM......"
            tfidf_matrix = mycluster.get_tfidf_matrix(tf_matrix)
            print "Creating the words and weight of every doc......"
            docs_keywords_string, docs_words_weight = mycluster.get_first_n_words(
                tfidf_matrix, 10, dictionary)

            k_w = {}
            for i, ele in enumerate(docs_words_weight):
                keys = ele.keys()
                weight = ele.values()
                k_w['%d_word' % i] = pd.Series(keys)
                k_w['%d_weight' % i] = pd.Series(weight)

            df = pd.DataFrame(k_w)
            print df

            if n_clusters == 0:
                n_clusters = int(sqrt(cnt / 2))

            print "The n_clusters is: %d" % n_clusters
            print "Clustering......"
            labels = mycluster.get_label_list(n_clusters, tfidf_matrix)
            data['cluster'] = labels
            index_list = list(data['cluster'].value_counts().index)

            hot_sort = data['cluster'].value_counts()
            df = pd.DataFrame.from_dict({
                'Cluster Label': hot_sort.index,
                'Article Count': hot_sort.values
            })
            df = pd.DataFrame(df, columns=['Cluster Label', 'Article Count'])
            print df

            for i in index_list:
                print "Cluster Lable: %d" % index_list[i]
                print "# The count of article: %d" % hot_sort[i]
                df = pd.DataFrame.from_dict({'id': list(pd.DataFrame(data.loc[data['cluster'] == i])['id']),\
                                            'title':list(pd.DataFrame(data.loc[data['cluster'] == i])['title']),\
                                            'url': list(pd.DataFrame(data.loc[data['cluster'] == i])['url'])})
                df = pd.DataFrame(df, columns=['id', 'title', 'url'])
                print df

            clusters_topic_dict = {}
            clusters_art_number = {}
            clusters_title_keywords = mycluster.get_clusters_detail(
                n_clusters, data, 'im_title')
            for i, art_list in enumerate(clusters_title_keywords):
                texts = [ele.split() for ele in art_list]
                t = topic.get_topic_string(
                    texts)  # get the topic of each cluster via the LDA model
                clusters_topic_dict[index_list[i]] = t
                clusters_art_number[index_list[i]] = len(art_list)

            topic_list = []
            art_number_list = []
            c_classes_keywords_sort = []
            for idx in index_list:
                topic_list.append(clusters_topic_dict[idx])
                art_number_list.append(clusters_art_number[idx])

            df = pd.DataFrame.from_dict({'Cluster Label': index_list, 'Article Count': art_number_list,\
                                        'Topic Based on Title': topic_list})
            df = pd.DataFrame(df,
                              columns=[
                                  'Cluster Label', 'Article Count',
                                  'Topic Based on Title'
                              ])
            print df

            visual.mds_show(tfidf_matrix, clusters_topic_dict, n_clusters,
                            data, path, saving)
            visual.pca_show(tfidf_matrix, clusters_topic_dict, n_clusters,
                            data, path, saving)
            clusters_cmt_number = {}
            clusters_cmt_detail = []
            clusters_id_list = mycluster.get_clusters_detail(
                n_clusters, data, 'id')
            for i, cluster_id_list in enumerate(clusters_id_list):
                cmt_detail = mgio.id_get_cmts(
                    cluster_id_list)  # return DataFrame format
                clusters_cmt_detail.append(cmt_detail)
                clusters_cmt_number[index_list[i]] = len(cmt_detail)

            visual.cluster_barh_new(n_clusters, index_list, clusters_topic_dict,\
                    clusters_art_number, clusters_cmt_number, path, saving)

            cmt_number_list = []
            for idx in index_list:
                cmt_number_list.append(clusters_cmt_number[idx])
            hot_sort = data['cluster'].value_counts()
            df = pd.DataFrame.from_dict({
                'Cluster Label': hot_sort.index,
                'Comment Count': cmt_number_list
            })
            df = pd.DataFrame(df, columns=['Cluster Label', 'Comment Count'])
            print df

            file_path = os.path.join(cwd, 'cmt_stit.txt')
            # file_path = '/home/skipper/study/python/project_v2/cmt_stit.txt'
            stopwords_path = os.path.join(cwd, "text/stopsign.txt")
            # stopwords_path = "/home/skipper/nltk_data/Other_data/stopwords/stopsign.txt"

            d_stit_prop_dict = {}
            s_stit_prop_dict = {}

            for i, cmt_detail in enumerate(clusters_cmt_detail):
                cmt_list = list(cmt_detail['content'])
                svm_label = sentiment.get_stit_label(cmt_list, file_path = file_path, \
                        sw_path = stopwords_path, name = "svm", language = "CN")
                snow_label = sentiment.get_snow_label(cmt_list)
                # Add to DataFrame
                cmt_detail['d_label'] = svm_label
                cmt_detail['s_label'] = snow_label
                d_stit_prop_dict[index_list[i]] = sentiment.get_stit_prop(
                    svm_label)
                s_stit_prop_dict[index_list[i]] = sentiment.get_stit_prop(
                    snow_label)

            for i, cmt_detail in enumerate(clusters_cmt_detail):
                print "# The Cluster label: %d" % index_list[i]
                print "# The count of article: %d" % cmt_number_list[i]
                df = pd.DataFrame(
                    cmt_detail,
                    columns=['vote', 'against', 'd_label', 'content'])
                print df

            visual.cluster_stit_pie(n_clusters, index_list, d_stit_prop_dict,
                                    clusters_topic_dict, path, saving)

            print "Creating the sub class Topic......"
            sub_cluster, sub_weight = mycluster.get_classes(
                n_clusters, data, docs_words_weight, docs_keywords_string)
            classes_keywords = mycluster.get_class_keywords(
                n_clusters, 3, sub_cluster, sub_weight)

            print "For K-means clustering "
            mycluster.all_output(n_clusters, classes_keywords, clusters_topic_dict, \
                  sub_cluster, data, d_stit_prop_dict, s_stit_prop_dict)

        p_list = ['mds.jpg', 'pca.jpg', 'bar.jpg', 'pie.jpg']
        p_list = [flask_path + '/' + i for i in p_list]
        p_list.append(base_path)
        print "The main function return file path list:"
        print p_list
        return p_list
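A call sketch based on the signature and date parsing above (the argument values are illustrative): date_start/date_end use the 'YYYY/M/D' form, n_clusters=0 lets the function pick sqrt(cnt / 2) clusters, and saving=1 writes the four charts under app/static/img/<base_path>/.

paths = main(date_start='2017/9/1', date_end='2017/9/19',
             n_clusters=5, saving=1, show=0)
# paths -> ['<flask_path>/mds.jpg', '<flask_path>/pca.jpg',
#           '<flask_path>/bar.jpg', '<flask_path>/pie.jpg', '<base_path>']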