Example #1
def train():
    pro_dir = Path(__file__).absolute().parent.parent
    train_file = pro_dir / 'input' / 'processed' / 'train_second.csv'
    test_file = pro_dir / 'input' / 'processed' / 'predict_second.csv'
    test_a_file = pro_dir / 'input' / 'processed' / 'predict_first.csv'
    columns = ['jieba']

    stop_words = get_stop_words()
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    test_a = pd.read_csv(test_a_file)

    corpus = []
    for df in [train, test, test_a]:
        for col in columns:
            ser = df[col]
            ser = ser.dropna()  # because the samples contain NaN
            v = ser.apply(
                lambda x: [w for w in x.split(" ")
                           if w not in stop_words]).values.tolist()
            corpus.extend(v)

    model = Word2Vec(corpus,
                     size=FLAGS.hidden_dim,
                     window=FLAGS.window,
                     min_count=len(columns) * FLAGS.min_count,
                     sg=1,
                     iter=FLAGS.iter,
                     workers=multiprocessing.cpu_count())

    vector_file = pro_dir / 'input' / 'word2vec' / "my_w2v_{dim}_{iter}_{wd}.txt".format(
        dim=FLAGS.hidden_dim, iter=FLAGS.iter, wd=FLAGS.window)
    vector_file.parent.mkdir(mode=0o755, exist_ok=True)

    model.wv.save_word2vec_format(str(vector_file), binary=False)
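Note: the `Word2Vec` call above uses the pre-4.0 gensim keyword names (`size`, `iter`). If this snippet is run against gensim 4.x (an assumption about the environment), those arguments were renamed; a minimal equivalent sketch reusing `corpus`, `columns` and `FLAGS` from the example:

from gensim.models import Word2Vec
import multiprocessing

# gensim >= 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`;
# the remaining keyword arguments keep their old names.
model = Word2Vec(corpus,
                 vector_size=FLAGS.hidden_dim,
                 window=FLAGS.window,
                 min_count=len(columns) * FLAGS.min_count,
                 sg=1,
                 epochs=FLAGS.iter,
                 workers=multiprocessing.cpu_count())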
Example #2
def remove_stop_words(words):
    stop_words = get_stop_words()
    final_words = []
    for word in words:
        if len(word) > 1 and word not in stop_words:
            final_words.append(word)
    return final_words
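A minimal usage sketch, assuming `get_stop_words()` returns a collection of tokens (the input list below is made up for illustration):

tokens = ['natural', 'language', 'processing', 'a', 'the']
# single-character tokens and anything returned by get_stop_words() are dropped
print(remove_stop_words(tokens))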
Example #3
def get_tf_vectorizer_data(posts):
    tf_vectorizer = utils.get_model(os.path.join(ROOT, "outputs", "tf.pkl"))
    if tf_vectorizer is None:
        tf_vectorizer = CountVectorizer(max_df=0.6, min_df=0.01, stop_words=utils.get_stop_words())
        tf_vectorizer.fit(posts)
        utils.save_model(tf_vectorizer, os.path.join(ROOT, 'outputs', 'tf.pkl'))

    return tf_vectorizer.transform(posts)
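`utils.get_model` / `utils.save_model` are project helpers that cache the fitted vectorizer on disk so it is only fit once. A plausible pickle-based sketch of that pair (an assumption, since their real implementation is not shown here):

import os
import pickle

def get_model(path):
    # return a previously pickled model, or None when no cache exists yet
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_model(model, path):
    # pickle the fitted model so later runs can skip fitting
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(model, f)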
Example #4
def create_word_cloud(no_topics, lda, feature_names):
    for i in range(0, no_topics):
        d = dict(zip(utils.traverse(feature_names), lda.components_[i]))
        wc = wordcloud.WordCloud(background_color='white',
                                 max_words=50,
                                 stopwords=utils.get_stop_words())
        image = wc.generate_from_frequencies(d)
        image.to_file(str(WHERE_OUTPUTS / 'outputs' / 'Topic{}.png'.format(i + 1)))
        plt.figure()
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        plt.show()
Example #5
def extract_tf_idf(df):
    posts = df['text'].tolist()

    tf_idf_model = utils.get_model(os.path.join(ROOT / "outputs", "tfidf.pkl"))
    if tf_idf_model is None:
        tf_idf_model = TfidfVectorizer(stop_words=utils.get_stop_words(),
                                       ngram_range=(1, 2))
        tf_idf_model.fit(posts)
        utils.save_model(tf_idf_model,
                         os.path.join(ROOT / 'outputs', 'tfidf.pkl'))

    tf_idf_matrix = tf_idf_model.transform(posts)

    tf_idf_dataframe = pd.DataFrame(columns=['id', 'tfidf'])
    tf_idf_dataframe['id'] = df['id'].tolist()
    tf_idf_dataframe['tfidf'] = helpers.reduce_damnation(tf_idf_matrix)
    return tf_idf_dataframe
Example #6
    def display(label, parser, summarizer):
        label.delete(1.0, END)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        #USED TO COUNT THE WORDS IN A SUMMARY
        global displayCount
        counter = 0
        displayCount += 1
        '''for item in summarizer(parser.document,SENTENCES_COUNT):
           for word in parser.tokenize_words(item):
               counter+=1
       print(" wordcount ", counter)
       print("--------------------")
       if displayCount%3==0:
           print("--------ARTICLE ",runCount+26,"------------")
       #END SUMMARY COUNT CODE'''

        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            label.insert(END, sentence)
Example #7
def get_meaningful_words_tf_idf_difference(df):
    path = ROOT + '/outputs/MeaningfulWords.pkl'
    path_object = pathlib.Path(path)
    if path_object.exists():
        return pd.read_pickle(path)
    df_neg = utils.get_abusive_df(df)
    df_pos = utils.get_no_abusive_df(df)
    posts = [' '.join(df_neg['text'].tolist()), ' '.join(df_pos['text'].tolist())]

    tfidf = utils.get_model(os.path.join(ROOT, "outputs", "tfidf.pkl"))
    if tfidf is None:
        tfidf = TfidfVectorizer(stop_words=utils.get_stop_words(), ngram_range=(1, 2))
        tfidf.fit(posts)
        utils.save_model(tfidf, os.path.join(ROOT, 'outputs', 'tfidf.pkl'))

    x = tfidf.transform(posts)
    x = x[0, :] - x[1, :]
    df_tf_idf = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names())
    df_tf_idf = df_tf_idf.sort_values(by=0, axis=1, ascending=False)
    df_tf_idf.to_pickle(path)
    return df_tf_idf
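`TfidfVectorizer.get_feature_names()` was removed in scikit-learn 1.2; on recent versions the same DataFrame would be built with `get_feature_names_out()` (a drop-in replacement in this spot):

# scikit-learn >= 1.0 exposes get_feature_names_out(); get_feature_names() is gone in 1.2+
df_tf_idf = pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names_out())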
Example #8
#!/usr/bin/env python
# coding: utf8

import os
import re
from tqdm import tqdm
import pandas as pd

# word segmentation tool
import jieba

from utils import get_stop_words

stop_words = get_stop_words()
fill_value = "CSFxe"
# user_dict = './yan_word.txt'

def clean_str(stri):
    stri = re.sub(r'[a-zA-Z0-9]+', '', stri)
    if stri == '':
        return fill_value
    return stri.strip()


def _filter_stop_words(word_list):
    _filter_words = [w for w in word_list if w not in stop_words and len(w) > 0]
    x = " ".join(_filter_words)
    return x


data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "input")
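The module imports `jieba`, but the segmentation step itself lies outside the quoted fragment. A hedged sketch of how `clean_str` and `_filter_stop_words` would typically be chained with jieba (the function and column names below are illustrative, not taken from the original module):

def segment(text):
    # strip Latin letters and digits, cut with jieba, then drop stop words
    cleaned = clean_str(text)
    return _filter_stop_words(jieba.lcut(cleaned))

# e.g. applied to a DataFrame column (column name is illustrative only):
# df['jieba'] = df['content'].fillna(fill_value).apply(segment)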
Example #9
from nlp.stemmers import Stemmer
from utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10

if __name__ == "__main__":
    #url = "http://www.zsstritezuct.estranky.cz/clanky/predmety/cteni/jak-naucit-dite-spravne-cist.html"
    url = "https://www.npr.org/2018/10/21/658921379/futuristic-dreams-turn-to-nightmare-in-electric-state"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = lexSum(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

    print('')
    print('')

    summarizer = luhnSum(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

    print('')
    print('')
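Example #9 uses HtmlParser, Tokenizer, lexSum and luhnSum without showing their imports. If the snippet is adapted from the sumy library (an assumption; the local `nlp.stemmers` and `utils` modules suggest a vendored copy), the missing names would map to:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as lexSum
from sumy.summarizers.luhn import LuhnSummarizer as luhnSum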
Example #10
            np.save(utils.get_y_train_path(save_directory), y_train)
            np.save(utils.get_y_test_path(save_directory), y_test)

        return embeddings


if __name__ == '__main__':
    input_train = pnd.read_csv(params.INPUT_TRAIN_FILENAME, sep=';')
    # input_test = pnd.read_csv(params.INPUT_TEST_FILENAME, sep=';')
    y = pnd.read_csv(utils.get_labels_path(), sep=';')

    drug_names_path = utils.get_drug_names_path()
    drug_names_df = pnd.read_csv(drug_names_path)
    drug_names_set = set(drug_names_df[params.DRUG_NAME_COL])

    stop_words = utils.compute_stop_words(input_train.question, max_df=STOP_WORDS_TFIDF_MAX_DF) if COMPUTE_STOP_WORDS \
        else utils.get_stop_words(STOP_WORDS_FILEPATH)
    if COMPUTE_STOP_WORDS:
        print("stop words: %s" % ', '.join(stop_words))

    fast_text_embedding = FastTextEmbedding(input_train.question,
                                            y.intention,
                                            drug_description_embedding=False,
                                            drug_names_set=drug_names_set,
                                            stop_words=stop_words,
                                            model_path=MODEL_PATH,
                                            do_correction=True,
                                            verbose=True)
    utils.create_dir(EMBEDDING_DIRPATH)
    fast_text_embedding.run(save_directory=EMBEDDING_DIRPATH)
Example #11
def get_chi_wordlist(datas, num, key):
    # file_name_all = '/home/ren/law_crawler/data_law_tf/data_law_tf_all.pkl'
    # in_file = open(file_name_all, 'rb')
    # datas = pickle.load(in_file)
    # in_file.close()
    # a = ['我的','的','我的' ]
    # print a.count('我')
    # print '---'
    # datas = [{'ob_content_seg': ['我的','我的','阿噗'], 'ob_label':1}, {'ob_content_seg':['我的','我的','阿噗'], 'ob_label':0},
    #          {'ob_content_seg':['我','我他','阿噗大'], 'ob_label': 1}, {'ob_content_seg':['我的','我的哦哦','阿大噗'], 'ob_label': 0}]
    word_dict = {}
    # label_num={label1:num1, label2:num2, ...}
    label_num = {}
    for i in range(len(datas)):
        data = datas[i]
        if not label_num.get(data['ob_label']):
            label_num[data['ob_label']] = 1
        else:
            label_num[data['ob_label']] += 1
        ob_content_seg = data[key]
        if ob_content_seg:
            for word in set(ob_content_seg):
                if word.strip() != '':
                    if not word_dict.get(word.strip()):
                        word_dict[word.strip()] = {}
                        word_dict[word.strip()][data['ob_label']] = 1
                    else:
                        if not word_dict[word.strip()].get(data['ob_label']):
                            word_dict[word.strip()][data['ob_label']] = 1
                        else:
                            word_dict[word.strip()][data['ob_label']] += 1
    # for a in word_dict:
    #     print a, word_dict[a]

    print(len(word_dict))
    print(label_num)
    label_list = list(label_num.keys())
    print(label_list)

    # word_dict={word1:{label1: , label2: ,...'chi': {label1: , label2: ,...}}, word2:{}...}
    for word in word_dict:
        for label in label_list:
            if not word_dict[word].get(label):
                word_dict[word][label] = 0
        # compute the word's CHI (chi-square) score
        word_dict[word]['chi'] = {}
        for label in label_list:
            a = word_dict[word][label]
            b = 0
            for i in label_list:
                if i != label:
                    b += word_dict[word][i]
            c = label_num[label] - a
            tmp = 0
            for i in label_num:
                if i != label:
                    tmp += label_num[i]

            d = tmp - b
            word_dict[word]['chi'][label] = float((a * c - b * d)**2) / float(
                (a + b) * (c + d))

    word_list = list(word_dict.items())
    # remove stop words
    stop_words_list = utils.get_stop_words()
    i = 0
    while i < len(word_list):
        if len(word_list[i][0]) < 2 or word_list[i][0] in stop_words_list:
            del word_list[i]
        else:
            i += 1
    sort_lists = []
    for label in label_list:
        lst = sorted(word_list, key=lambda x: x[1]['chi'][label], reverse=True)
        words = [word[0] for word in lst]
        sort_lists.extend(words[:num])
    sort_lists = set(sort_lists)
    return list(sort_lists)
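A short usage sketch under the data layout hinted at by the commented-out sample (a list of dicts holding a token list and a label); the documents and counts below are illustrative only, and `utils.get_stop_words()` is assumed importable as in the snippet:

datas = [
    {'ob_content_seg': ['合同', '违约'], 'ob_label': 1},
    {'ob_content_seg': ['盗窃', '判决'], 'ob_label': 0},
    {'ob_content_seg': ['合同', '赔偿'], 'ob_label': 1},
    {'ob_content_seg': ['判决', '赔偿'], 'ob_label': 0},
]
# keep the 2 highest-CHI words per label, after stop-word and length filtering
vocab = get_chi_wordlist(datas, num=2, key='ob_content_seg')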
Example #12
def main(*args):

    # load stop words
    stop_words = get_stop_words()

    plot = const.PLOT_DEFAULT
    print_ = const.PRINT_DEFAULT
    max_features = None
    random_state = const.RANDOM_STATE_DEFAULT
    order = -1  # default descending order
    wordcloud_n = None
    wordcloud_ = False
    cos_sim = False
    even_distrib = const.EVEN_DISTRIB_DEFAULT
    plt.rcParams.update({'font.size': const.FONT_SIZE_DEFAULT})
    pre_vec = False
    limit_size = False
    min_df = 1
    max_df = 1.0
    param_compare = False

    # print command line arguments
    for arg in args:
        k = arg.split("=")[0]
        v = arg.split("=")[1]
        if k == 'plot':
            plot = utils.str_to_bool(v)
        elif k == 'print':
            print_ = utils.str_to_bool(v)
        elif k == 'max_features':
            max_features = int(v)
        elif k == 'stop_words':
            if utils.str_to_bool(v) == False:
                stop_words = None
        elif k == 'random_state':
            random_state = int(v)
        elif k == 'order':
            order = int(v)
        elif k == 'wordcloud':
            wordcloud_ = utils.str_to_bool(v)
        elif k == 'wordcloud_n':
            wordcloud_n = int(v)
        elif k == 'cos_sim':
            cos_sim = utils.str_to_bool(v)
        elif k == 'font_size':
            plt.rcParams.update({'font.size': int(v)})
        elif k == 'even_distrib':
            even_distrib = utils.str_to_bool(v)
        elif k == 'pre_vec':
            pre_vec = utils.str_to_bool(v)
        elif k == 'limit_size':
            limit_size = utils.str_to_bool(v)
        elif k == 'min_df':
            min_df = int(v)
        elif k == 'max_df':
            max_df = float(v)
            if max_df > 1:
                max_df = int(max_df)
        elif k == 'param_compare':
            param_compare = utils.str_to_bool(v)
        else:
            print("Unknown param: {}".format(k))

    if print_:
        print()
        print("-- Analysis config --")
        print("even_distrib: {}".format(even_distrib))
        print("stop_words: {}".format(stop_words != None))
        print("max_features: {}".format(max_features))
        print("random_state: {}".format(random_state))
        print("wordcloud: {}".format(wordcloud_))
        print("wordcloud_n: {}".format(wordcloud_n))
        print("order: {}".format(order))
        print("cos_sim: {}".format(cos_sim))
        print("param_compare: {}".format(param_compare))
        print("pre_vec: {}".format(pre_vec))
        print("limit_size: {}".format(limit_size))
        print("min_df: {}".format(min_df))
        print("max_df: {}".format(max_df))
        print("plot: {}".format(plot))
        print("--------------------")
        print()

    gen_spotify_df = pd.read_csv(const.GEN_SPOTIFY)
    clean_spotify_df = pd.read_csv(const.CLEAN_SPOTIFY)
    if even_distrib == False:
        clean_spotify_df = pd.read_csv(const.CLEAN_UNEVEN_SPOTIFY)

    gen_deezer_df = pd.read_csv(const.GEN_DEEZER)
    clean_deezer_df = pd.read_csv(const.CLEAN_DEEZER)
    if even_distrib == False:
        clean_deezer_df = pd.read_csv(const.CLEAN_UNEVEN_DEEZER)

    datasets = [
        (const.SPOTIFY, clean_spotify_df),
        (const.DEEZER, clean_deezer_df),
    ]
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(1, 1),
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        binary=True,
    )

    # word clouds
    if wordcloud_:
        top_n = gen_word_cloud_grid(
            const.SPOTIFY,
            clean_spotify_df,
            vectorizer=vectorizer,
            n=wordcloud_n,
            order=order,
            random_state=random_state,
            print_=print_
        )
        spotify_shared, spotify_unique = get_shared_words(top_n)

        top_n = gen_word_cloud_grid(
            const.DEEZER,
            clean_deezer_df,
            vectorizer=vectorizer,
            n=wordcloud_n,
            order=order,
            random_state=random_state,
            print_=print_
        )
        deezer_shared, deezer_unique = get_shared_words(top_n)

        if print_:
            print()
            print("Spotify: count shared={}".format(
                len(spotify_shared)/len(spotify_unique)))
            print("Deezer: count shared={}".format(
                len(deezer_shared)/len(deezer_unique)))
            print()

    # cosine similarity
    if cos_sim: 
        for name, dataset in datasets:
            if pre_vec:
                dataset = utils.get_vectorized_df(dataset, vectorizer)

            print("{} class data similarity analysis...".format(name))
            for i in dataset.y.unique():
                class_df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                    print_=True,
                )
                if pre_vec == False:
                    class_df = utils.get_vectorized_df(class_df, vectorizer)
                pos_df = utils.get_class_based_data(class_df, 1)
                pos_df.pop('y')
                ave_pos = utils.get_average_cos_sim(pos_df.values)
                neg_df = utils.get_class_based_data(class_df, -1.0)
                neg_df.pop('y')
                ave_neg = utils.get_average_cos_sim(neg_df.values)
                ave_between = utils.get_average_cos_sim(
                    pos_df.values, neg_df.values)
                print("class {}".format(i))
                print("data shape: {}".format(class_df.shape))
                print("average positive cosine similarity: {}".format(ave_pos))
                print("average negative cosine similarity: {}".format(ave_neg))
                print("average between cosine similarity: {}".format(ave_between))
                print("(pos - between )+ (neg - between) percentage = {} ".format(
                    (ave_pos - ave_between) / ave_pos + (ave_neg - ave_between)  / ave_neg
                ))
                print()

    if param_compare:
        # min_df vs pos_sim, neg_sim, between_sim
        params_grid = {
            'min_df': [i for i in range(1, 15)],
            'max_df': np.arange(0.1, 1.0, 0.1),
        }

        for name, dataset in datasets:    
            for i in dataset.y.unique():
                df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                )
                for p, v in params_grid.items():
                    print("Comparing cosine similarity vs {} for {} Class {} data...".format(p, name, i))
                    vectorizer = CountVectorizer(
                        stop_words=stop_words,
                        ngram_range=(1, 1),
                        min_df=min_df,
                        max_df=max_df,
                        max_features=max_features,
                        binary=True,
                    )
                    pos_sim = []
                    neg_sim = []
                    between_sim = []
                    diff = []
                    for j in range(len(v)):
                        vectorizer.set_params(**{p: v[j]})
                        class_df = utils.get_vectorized_df(df, vectorizer)
                        pos_df = utils.get_class_based_data(class_df, 1)
                        pos_df.pop('y')
                        ave_pos = utils.get_average_cos_sim(pos_df.values)
                        neg_df = utils.get_class_based_data(class_df, -1.0)
                        neg_df.pop('y')
                        ave_neg = utils.get_average_cos_sim(neg_df.values)
                        ave_between = utils.get_average_cos_sim(
                            pos_df.values, neg_df.values)
                        pos_sim.append(ave_pos)
                        neg_sim.append(ave_neg)
                        between_sim.append(ave_between)
                        diff.append((ave_pos - ave_between)/ave_pos + (ave_neg - ave_between)/ave_neg)
                    
                    plt.figure()
                    plt.title("{} Class {}: {} vs cosine similarity".format(name,i, p))
                    pos_sim = np.array(list(zip(v, pos_sim)))
                    neg_sim = np.array(list(zip(v, neg_sim)))
                    between_sim = np.array(list(zip(v, between_sim)))
                    diff = np.array(list(zip(v, diff)))
                    plt.plot(pos_sim[:, 0], pos_sim[:, 1], label='pos sim')
                    plt.plot(neg_sim[:, 0], neg_sim[:, 1], label='neg sim')
                    plt.plot(between_sim[:, 0], between_sim[:, 1], label='between sim')
                    plt.plot(diff[:, 0], diff[:, 1], label='sim difference (%)')
                    plt.xlabel(p)
                    plt.legend()            

    # grid search eval
    if plot:
        plt.draw()
        plt.show()
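`main(*args)` expects `key=value` strings, so the analysis is presumably driven from the command line; a hedged sketch of that wiring (the entry point is not shown in the fragment):

import sys

if __name__ == '__main__':
    # forward CLI arguments such as: print=True wordcloud=True min_df=2 max_df=0.8 cos_sim=True
    main(*sys.argv[1:])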