def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt'
    )

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
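# --- Hedged usage sketch (not part of the original project) ---
# Assuming idx_data holds padded word-index sequences and W is the embedding
# matrix built above, the returned pair could be wired into a Keras model
# roughly like this; the layer sizes and optimizer are illustrative only.
def keras_regression_sketch():
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    (idx_data, valence, arousal), W = build_keras_input()
    model = Sequential()
    model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W]))
    model.add(LSTM(64))
    model.add(Dense(1))  # regress a single affect score, e.g. valence
    model.compile(loss='mse', optimizer='adam')
    return model, idx_data, valence, arousal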
def train_doc2vec():
    # def isEnglish(s):
    #     try:
    #         s.encode('ascii')
    #     except UnicodeEncodeError:
    #         return False
    #     else:
    #         return True

    labeled_data, _ = load_vader('./resource/tweets.txt')
    # for i,d in enumerate(labeled_data):
    #     print(i)
    #     if not isEnglish(d):
    #         print('*'*111)
    #         print(i,d)
    # exit()
    unlabeled_data, _ = load_sentiment140('/home/hs/Data/Corpus/training.csv')
    labeled_data = preprocess(labeled_data, replace=True)
    dump_picle(labeled_data, './data/acc/labeled_data.p')
    unlabeled_data = preprocess(unlabeled_data, replace=True)
    dump_picle(unlabeled_data, './data/acc/unlabeled_data.p')
    # labeled_data = load_pickle('./data/acc/labeled_data.p')
    # unlabeled_data = load_pickle('./data/acc/unlabeled_data.p')
    sentence = TaggedLineSentence(labeled_data, unlabeled_data)
    train_docvecs(sentence)
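# --- Hedged sketch of the doc2vec training step (assumption: train_docvecs and
# TaggedLineSentence are project helpers whose internals are not shown here).
# A typical gensim loop over a stream of TaggedDocument objects, with tags
# presumably of the form 'L_SENT_%d', could look like this:
def train_docvecs_sketch(tagged_docs, vec_size=50, epochs=10):
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec(vector_size=vec_size, min_count=1, workers=4)
    model.build_vocab(tagged_docs)
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=epochs)
    model.save('./data/acc/docvecs.model')  # hypothetical output path
    return model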
def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings('zh',
                                '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
Example #4
def train_doc2vec():
    # def isEnglish(s):
    #     try:
    #         s.encode('ascii')
    #     except UnicodeEncodeError:
    #         return False
    #     else:
    #         return True

    labeled_data, _ = load_vader('./resource/tweets.txt')
    # for i,d in enumerate(labeled_data):
    #     print(i)
    #     if not isEnglish(d):
    #         print('*'*111)
    #         print(i,d)
    # exit()
    unlabeled_data, _ = load_sentiment140('/home/hs/Data/Corpus/training.csv')
    labeled_data = preprocess(labeled_data, replace=True)
    dump_picle(labeled_data, './data/acc/labeled_data.p')
    unlabeled_data = preprocess(unlabeled_data, replace=True)
    dump_picle(unlabeled_data, './data/acc/unlabeled_data.p')
    # labeled_data = load_pickle('./data/acc/labeled_data.p')
    # unlabeled_data = load_pickle('./data/acc/unlabeled_data.p')
    sentence = TaggedLineSentence(labeled_data, unlabeled_data)
    train_docvecs(sentence)
def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    return
def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    return
def svm_classify(train_data, train_labels, test):
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    return
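# --- Hedged evaluation sketch (not part of the original project) ---
# The three classifiers above all pickle their predictions to the same path,
# so given the gold labels of the test split the accuracy can be checked like this:
def evaluate_saved_predictions(test_labels, path='./data/predict_labels/predict_labels.p'):
    from sklearn.metrics import accuracy_score
    predict_labels = load_pickle(path)
    print('Accuracy: %.4f' % accuracy_score(test_labels, predict_labels))
    return predict_labels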
def build_docvecs(model, ratings):
    nb_text = len(ratings)  # 4200
    size = len(model.docvecs['L_SENT_0'])  # 50
    vecs = [
        model.docvecs['L_SENT_%s' % id].reshape((1, size))
        for id in range(nb_text)
    ]
    dump_picle((np.concatenate(vecs), ratings), './data/acc/twitter_docvecs.p')
Example #9
def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'MaxEnt classifier training complete, saved predict labels to pickle')
    return
Example #10
def svm_classify(train_data, train_labels, test):
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'SVM classifier training complete, saved predict labels to pickle')
    return
Example #11
def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info(
        'kNN classifier training complete, saved predict labels to pickle')
    return
Example #12
def build_ori_anew_vectors(words):
    filename = "./tmp/anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
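# --- Hedged variant (assumption, not the original behaviour) ---
# model[w] above raises KeyError for ANEW words that are missing from the
# embedding vocabulary; a defensive version could keep only the words present.
def build_anew_vectors_safe(words, model):
    vecs, kept = [], []
    for w in words:
        if w in model:  # gensim KeyedVectors supports membership tests
            vecs.append(model[w])
            kept.append(w)
    return np.array(vecs), kept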
def gNB(train_data, train_labels, test, save_result=False):
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    if save_result == True:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
    logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
Example #14
def gNB(train_data, train_labels, test, save_result=False):
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    if save_result == True:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
    logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #16
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news',
                            '/home/hs/Data/Word_Embeddings/google_news.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #17
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        'google_news',
        'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #18
def build_amended_vectors(arg='word2vec'):
    prefix = '' if arg == 'word2vec' else 'GloVe_'  # empty string (not None) so the path concatenation below works
    pos_vectors = load_pickle('./tmp/'+prefix+'common_positive_words.p')
    neg_vectors = load_pickle('./tmp/'+prefix+'common_negative_words.p')
    size = len(pos_vectors[list(pos_vectors.keys())[0]])
    print('The dimension of word vectors: %s.' % size)
    for k in pos_vectors:
        pos_vectors[k]=np.array(pos_vectors[k]).reshape((1, size))
    for k in neg_vectors:
        neg_vectors[k]=np.array(neg_vectors[k]).reshape((1, size))
    amended_pos, amended_neg = amend(pos_vectors, neg_vectors)
    dump_picle(amended_pos, './tmp/amended_'+prefix+'pos.p')
    dump_picle(amended_neg, './tmp/amended_'+prefix+'neg.p')
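# Hedged usage sketch: with the empty word2vec prefix the outputs land at
# ./tmp/amended_pos.p and ./tmp/amended_neg.p, which is what the
# build_amended_anew_vectors() examples further below load.
if __name__ == '__main__':
    build_amended_vectors('word2vec')
    build_amended_vectors('GloVe')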
Example #19
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def common_words(word_vectors, words_list):
    ####################################
    # for word2vec model:
    # full_words = word_vectors.vocab.keys()
    # for GloVe model:
    full_words = word_vectors.keys()
    ####################################
    same_words = set(words_list).intersection(full_words)
    print('Total Number: %s, same word number: %s.'%(len(words_list), len(same_words)))
    vector_dict=dict()
    for w in same_words:
        vector_dict[w]=word_vectors[w]
    dump_picle(vector_dict, './tmp/GloVe_common_negative_words.p')
Example #21
def common_words(word_vectors, words_list):
    ####################################
    # for word2vec model:
    # full_words = word_vectors.vocab.keys()
    # for GloVe model:
    full_words = word_vectors.keys()
    ####################################
    same_words = set(words_list).intersection(full_words)
    print('Total Number: %s, same word number: %s.' %
          (len(words_list), len(same_words)))
    vector_dict = dict()
    for w in same_words:
        vector_dict[w] = word_vectors[w]
    dump_picle(vector_dict, './tmp/GloVe_common_negative_words.p')
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #24
def build_amended_anew_vectors(words):
    filename = "./tmp/amended_anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    amended_pos = load_pickle("./tmp/amended_pos.p")
    amended_neg = load_pickle("./tmp/amended_neg.p")
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #25
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
Example #27
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news',
                            '/home/hs/Data/Word_Embeddings/google_news.bin')
    amended_pos = load_pickle('./tmp/amended_pos.p')
    amended_neg = load_pickle('./tmp/amended_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #28
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #29
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type, key: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and new == False:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)

        test_data = load_pickle(test_filename)

        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle

    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)

    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)

    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))

    return (data, W, test_data)
Example #31
def load_sst(path=None, level=None):
    filename = './tmp/SST.p'
    if os.path.isfile(filename):
        print('Load OK.')
        return load_pickle(filename)

    def cleanStr(string):
        string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string)  # strip characters outside this whitelist
        string = re.sub(r'\s{2,}', ' ', string)
        # repair common UTF-8-read-as-Latin-1 mojibake (keys written with explicit escapes)
        string = string.replace('\xc3\xa1', 'á').replace('\xc3\xa9', 'é').replace('\xc3\xb1', 'ñ').replace('\xc2', '').replace('\xc3\xaf', 'ï')
        string = string.replace('\xc3\xbc', 'ü').replace('\xc3\xa2', 'â').replace('\xc3\xa8', 'è').replace('\xc3\xb6', 'ö').replace('\xc3\xa6', 'æ')
        string = string.replace('\xc3\xb3', 'ó').replace('\xc3\xbb', 'û').replace('\xc3\xb4', 'ô').replace('\xc3\xa3', 'ã').replace('\xc3\xa7', 'ç')
        string = string.replace('Ã ', 'à ').replace('\xc3\xad', 'í').replace('í\xad', 'í')
        return string

    # sentiment label
    sentiment_file = open(path + 'sentiment_labels.txt', 'r')
    sentiment_label = {}
    n = 0
    for line in sentiment_file:
        lines = line.strip().split('|')
        if n > 0:
            sentiment_label[int(lines[0])] = float(lines[1])
        n += 1
    sentiment_file.close()

    # phrase dict
    dict_file = open(path + 'dictionary.txt', 'r')
    phrase_dict = {}
    for line in dict_file:
        # line = line.decode('utf-8')
        lines = line.strip().split('|')
        phrase_dict[lines[0]] = int(lines[1])
    dict_file.close()

    # sentence dict
    sentence_file = open(path + 'datasetSentences.txt', 'r')
    sentence_dict = {}
    n = 0
    for line in sentence_file:
        # line = line.decode('utf-8')
        line = line.replace('-LRB-', '(')
        line = line.replace('-RRB-', ')')
        lines = line.strip().split('\t')
        if n > 0:
            sentence_dict[int(lines[0])] = lines[1]
        n += 1
    sentence_file.close()

    # datasplit
    datasplit_file = open(path + 'datasetSplit.txt', 'r')
    split_dict = {}
    n = 0
    for line in datasplit_file:
        lines = line.strip().split(',')
        if n > 0:
            split_dict[int(lines[0])] = int(lines[1])
        n += 1
    datasplit_file.close()

    size = len(sentence_dict)  # size = 11855
    # for i in range(1000):
    #     senti = sentiment_label[phrase_dict[cleanStr(sentence_dict[i + 1])]]
    #     print(i, senti, cleanStr(sentence_dict[i + 1]))
    # exit()
    x_train, y_train_valence, y_train_labels = [], [], []
    x_test, y_test_valence, y_test_labels = [], [], []
    x_valid, y_valid_valence, y_valid_labels = [], [], []

    x_train_polarity, y_train_polarity = [], []
    x_test_polarity, y_test_polarity = [], []
    x_valid_polarity, y_valid_polarity = [], []

    for i in range(size):
        # print sentence_dict[i+1].encode('utf-8')
        sentence = cleanStr(sentence_dict[i + 1])
        senti = sentiment_label[phrase_dict[sentence]]

        # print(senti, sentence)
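        # Bin the continuous score into five classes; the neutral band
        # (0.4, 0.6] gets no polarity, so those sentences are left out of
        # the binary polarity subsets below.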
        labels, polarity = None, None
        if 0 <= senti <= 0.2:
            labels = 1
            polarity = 0
        if 0.2 < senti <= 0.4:
            labels = 2
            polarity = 0
        if 0.4 < senti <= 0.6:
            labels = 3
        if 0.6 < senti <= 0.8:
            labels = 4
            polarity = 1
        if 0.8 < senti <= 1:
            labels = 5
            polarity = 1
        if labels is None:
            raise Exception('Sentiment Error !')

        if split_dict[i + 1] == 1:
            x_train.append(sentence)
            y_train_valence.append(senti)
            y_train_labels.append(labels)
            if polarity is not None:
                x_train_polarity.append(sentence)
                y_train_polarity.append(polarity)
        elif split_dict[i + 1] == 2:
            x_test.append(sentence)
            y_test_valence.append(senti)
            y_test_labels.append(labels)
            if polarity is not None:
                x_test_polarity.append(sentence)
                y_test_polarity.append(polarity)
        else:
            x_valid.append(sentence)
            y_valid_valence.append(senti)
            y_valid_labels.append(labels)
            if polarity is not None:
                x_valid_polarity.append(sentence)
                y_valid_polarity.append(polarity)

    print("Fine-grained: #training: %s, #valid: %s, #test: %s" % (len(x_train), len(x_valid), len(x_test)))
    print("Binary classification: #train: %s, #valid: %s, #test: %s" % (
        len(x_train_polarity), len(x_valid_polarity), len(x_test_polarity)))

    # t = zip(x_train, y_train)
    # random.shuffle(t)
    # x_train, y_train = zip(*t)
    output = (x_train, y_train_valence, y_train_labels,
              x_test, y_test_valence, y_test_labels,
              x_valid, y_valid_valence, y_valid_labels,
              x_train_polarity, y_train_polarity,
              x_test_polarity, y_test_polarity,
              x_valid_polarity, y_valid_polarity)
    dump_picle(output, filename)
    print('Data saved and load successfully.')
    return output
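# --- Hedged usage sketch (illustrative only) ---
# Unpack just the fine-grained training split; the remaining twelve items of
# the returned tuple are collected into _rest.
if __name__ == '__main__':
    x_train, y_train_valence, y_train_labels, *_rest = load_sst(
        path='./resources/stanfordSentimentTreebank/')
    print(len(x_train), y_train_labels[:5])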
def build_docvecs(model, ratings):
    nb_text = len(ratings)  # 4200
    size = len(model.docvecs['L_SENT_0'])  # 50
    vecs = [model.docvecs['L_SENT_%s' % id].reshape((1, size)) for id in range(nb_text)]
    dump_picle((np.concatenate(vecs), ratings), './data/acc/twitter_docvecs.p')
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
from load_data import load_CVAT_2
filename = './resources/CVAT (utf-8).csv'
texts, valence, arousal = load_CVAT_2(filename, categorical="all")
len_text = []
from CKIP_tokenizer import segsentence
out = []
for idx, i in enumerate(texts):
    # print(list(i))
    print(idx)
    out.append(" ".join(segsentence(i)))
    # len_text.append(len(.split()))
from save_data import dump_picle
dump_picle(out, "tokenized_texts_(newest3.31).p")
print("The tokenized text is saved.")
Example #35
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])
    texts, _ = load_train_data('Sentiment140')

    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
    neg_idx_data, neg_length = load_data(file_dir + 'neg/')
    print(neg_idx_data.shape, neg_length)
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ##########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
Example #37
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')

    elif word_vectors_model == 'retrofitted_GloVe':
        filename_data, filename_w = './tmp/retrofitted_GloVe_indexed_data.p', './tmp/retrofitted_GloVe_Weight.p'
    elif word_vectors_model == 'retrofitted_word2vec':
        filename_data, filename_w = './tmp/retrofitted_word2vec_indexed_data.p', './tmp/retrofitted_word2vec_Weight.p'

    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v

    elif word_vectors_model == 'retrofitted_word2vec':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\word2vec_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v

    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
    return (data, W)
Example #38
def convert(source_file):
    s = load_pickle(source_file)
    dump_picle(s, str(source_file)[:-2] + '_v2.7.p', protocol=2)
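# Hedged note: convert() re-dumps with protocol=2 so that the pickle stays
# readable from Python 2.7 (pickle protocols 3+ are Python-3-only). A quick
# round-trip check of the converted file could look like this:
def check_converted(source_file):
    import pickle
    converted = str(source_file)[:-2] + '_v2.7.p'
    with open(converted, 'rb') as f:
        return pickle.load(f)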
Example #39
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


# Note: this file builds the CNN input data for the CVAT corpus
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'),
                                         vocab,
                                         k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
    neg_idx_data, neg_length = load_data(file_dir + 'neg/')
    print(neg_idx_data.shape, neg_length)
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ##########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                             vocab,
                                             k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
Example #41
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings],
           './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
Example #42
def load_sst(path=None, level=None):
    filename = './tmp/SST.p'
    if os.path.isfile(filename):
        print('Load OK.')
        return load_pickle(filename)

    def cleanStr(string):
        string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string)  # strip characters outside this whitelist
        string = re.sub(r'\s{2,}', ' ', string)
        # repair common UTF-8-read-as-Latin-1 mojibake (keys written with explicit escapes)
        string = string.replace('\xc3\xa1', 'á').replace('\xc3\xa9', 'é').replace('\xc3\xb1', 'ñ').replace('\xc2', '').replace('\xc3\xaf', 'ï')
        string = string.replace('\xc3\xbc', 'ü').replace('\xc3\xa2', 'â').replace('\xc3\xa8', 'è').replace('\xc3\xb6', 'ö').replace('\xc3\xa6', 'æ')
        string = string.replace('\xc3\xb3', 'ó').replace('\xc3\xbb', 'û').replace('\xc3\xb4', 'ô').replace('\xc3\xa3', 'ã').replace('\xc3\xa7', 'ç')
        string = string.replace('Ã ', 'à ').replace('\xc3\xad', 'í').replace('í\xad', 'í')
        return string

    # sentiment label
    sentiment_file = open(path + 'sentiment_labels.txt', 'r')
    sentiment_label = {}
    n = 0
    for line in sentiment_file:
        lines = line.strip().split('|')
        if n > 0:
            sentiment_label[int(lines[0])] = float(lines[1])
        n += 1
    sentiment_file.close()

    # phrase dict
    dict_file = open(path + 'dictionary.txt', 'r')
    phrase_dict = {}
    for line in dict_file:
        # line = line.decode('utf-8')
        lines = line.strip().split('|')
        phrase_dict[lines[0]] = int(lines[1])
    dict_file.close()

    # sentence dict
    sentence_file = open(path + 'datasetSentences.txt', 'r')
    sentence_dict = {}
    n = 0
    for line in sentence_file:
        # line = line.decode('utf-8')
        line = line.replace('-LRB-', '(')
        line = line.replace('-RRB-', ')')
        lines = line.strip().split('\t')
        if n > 0:
            sentence_dict[int(lines[0])] = lines[1]
        n += 1
    sentence_file.close()

    # datasplit
    datasplit_file = open(path + 'datasetSplit.txt', 'r')
    split_dict = {}
    n = 0
    for line in datasplit_file:
        lines = line.strip().split(',')
        if n > 0:
            split_dict[int(lines[0])] = int(lines[1])
        n += 1
    datasplit_file.close()

    size = len(sentence_dict)  # size = 11855
    # for i in range(1000):
    #     senti = sentiment_label[phrase_dict[cleanStr(sentence_dict[i + 1])]]
    #     print(i, senti, cleanStr(sentence_dict[i + 1]))
    # exit()
    x_train, y_train_valence, y_train_labels = [], [], []
    x_test, y_test_valence, y_test_labels = [], [], []
    x_valid, y_valid_valence, y_valid_labels = [], [], []

    x_train_polarity, y_train_polarity = [], []
    x_test_polarity, y_test_polarity = [], []
    x_valid_polarity, y_valid_polarity = [], []

    for i in range(size):
        # print sentence_dict[i+1].encode('utf-8')
        sentence = cleanStr(sentence_dict[i + 1])
        senti = sentiment_label[phrase_dict[sentence]]

        # print(senti, sentence)
        labels, polarity = None, None
        if 0 <= senti <= 0.2:
            labels = 1
            polarity = 0
        if 0.2 < senti <= 0.4:
            labels = 2
            polarity = 0
        if 0.4 < senti <= 0.6:
            labels = 3
        if 0.6 < senti <= 0.8:
            labels = 4
            polarity = 1
        if 0.8 < senti <= 1:
            labels = 5
            polarity = 1
        if labels is None:
            raise Exception('Sentiment Error !')

        if split_dict[i + 1] == 1:
            x_train.append(sentence)
            y_train_valence.append(senti)
            y_train_labels.append(labels)
            if polarity is not None:
                x_train_polarity.append(sentence)
                y_train_polarity.append(polarity)
        elif split_dict[i + 1] == 2:
            x_test.append(sentence)
            y_test_valence.append(senti)
            y_test_labels.append(labels)
            if polarity is not None:
                x_test_polarity.append(sentence)
                y_test_polarity.append(polarity)
        else:
            x_valid.append(sentence)
            y_valid_valence.append(senti)
            y_valid_labels.append(labels)
            if polarity is not None:
                x_valid_polarity.append(sentence)
                y_valid_polarity.append(polarity)

    print("Fine-grained: #training: %s, #valid: %s, #test: %s" %
          (len(x_train), len(x_valid), len(x_test)))
    print("Binary classification: #train: %s, #valid: %s, #test: %s" %
          (len(x_train_polarity), len(x_valid_polarity), len(x_test_polarity)))

    # t = zip(x_train, y_train)
    # random.shuffle(t)
    # x_train, y_train = zip(*t)
    output = (x_train, y_train_valence, y_train_labels, x_test, y_test_valence,
              y_test_labels, x_valid, y_valid_valence, y_valid_labels,
              x_train_polarity, y_train_polarity, x_test_polarity,
              y_test_polarity, x_valid_polarity, y_valid_polarity)
    dump_picle(output, filename)
    print('Data saved and load successfully.')
    return output
Example #43
from vectorizers import TFIDF_estimator, anew_estimator

# class StemmedTfidfVectorizer(TfidfVectorizer):
#     def build_analyzer(self):
#         # use NLTK to stem the analyzer's tokens
#         english_stemmer = nltk.stem.SnowballStemmer('english')
#         analyzer = super(TfidfVectorizer, self).build_analyzer()
#         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
#
#
# vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
#                     'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
#                     'binary': parameters['TF_binary'], 'norm': parameters['norm'],
#                     'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

# log_state('Start generating features')
# log_state((sorted(list(vectorizer_param.items()))))
# log_state('Training data size: ' + str(parameters['test_data_size']))
if __name__ == "__main__":
    vectorizer = anew_estimator()
    train_type = 'Sentiment140'
    texts, _ = load_train_data(train_type)
    transformed_train = vectorizer.fit_transform(texts)
    testdata, _ = load_test_data()
    transformed_test = vectorizer.transform(testdata)
    dump_picle(vectorizer.get_feature_names(),
               './data/features/feature_names.p')
    dump_picle(transformed_train,
               "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
Example #44
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)

print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])

from save_data import dump_picle

dump_picle((sentence_embedding_matrix, valence),
           get_file_path('CVAT_sentence_matrix_400'))

exit()
'''
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(sentence_embedding_matrix, valence, test_size=0.2,
                                                                     random_state=0)
print(X_train.shape)
print(len(Y_test))

maxlen = 200
size = 50
X_train = X_train.reshape(X_train.shape[0], 1, maxlen, size)
X_test = X_test.reshape(X_test.shape[0], 1, maxlen, size)
print(X_train.shape)

batch_size = 128
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
    return (data, W)
Example #46
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)

print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])

from save_data import dump_picle

dump_picle((sentence_embedding_matrix, valence), get_file_path('CVAT_sentence_matrix_400'))

exit()

'''
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(sentence_embedding_matrix, valence, test_size=0.2,
                                                                     random_state=0)
print(X_train.shape)
print(len(Y_test))

maxlen = 200
size = 50
X_train = X_train.reshape(X_train.shape[0], 1, maxlen, size)
X_test = X_test.reshape(X_test.shape[0], 1, maxlen, size)
print(X_train.shape)
    lexicon_name = get_file_path('anew')
    logger.info(r"loading lexicon form : " + lexicon_name)
    words, valences, _ = load_anew(lexicon_name)
    corpus, ratings = screen_data(corpus, ratings, words)
    ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
    print(len(corpus))
    # for i in corpus[:100]:
    #     print(i)

    lexicon = dict()
    for i, word in enumerate(words):
        lexicon[word] = valences[i]
    mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos = calculate_ratings(
        corpus, ratings, lexicon)
    dump_picle([
        mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings
    ], './data/vader_out.p')
    exit()
    from collections import defaultdict
    # idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))

    # vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')

    # length = len(vocab)
    ##################################### IDF ####################################################
    # idf=dict()
    # for i, word in enumerate(words):
    #     denominator = sum(1 for doc in corpus if word in doc.split())
    #     if denominator != 0:
    #         idf[word] = sp.log(float(len(corpus)) / denominator)
    #         if i%50 == 0:
Example #48
    for i in top_n_ind:
        print('Parameter setting: %s, acc: %s' % (str(list(grid)[i]), param_fitness[i]))


if __name__=='__main__':
    result_analysis('./tmp/grid_search_result.p')
    print('OK')
    exit()

    scope = [0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    param_grid = {'a':scope, 'b': scope, 'c': scope}
    param_fitness = []

    grid = ParameterGrid(param_grid)

    for params in grid:
        print('calculating... parameter: %s' % str(params))
        score = my_function(params['a'], params['b'], params['c'])
        print('Score: %s' % score)
        param_fitness.append(score)

    print('grid search complete.')
    # return the best fitness value and its settings
    best_fitness = np.min(np.array(param_fitness))
    best_ind = int(np.argmin(np.array(param_fitness)))  # scalar index so list(grid)[best_ind] works
    print('best fitness: %s' % best_fitness)
    print('best setting: %s' % str(list(grid)[best_ind]))

    dump_picle((param_grid, param_fitness), './tmp/grid_search_result.p')
Example #49
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab

# Note: this file builds the CNN input data for the CVAT corpus
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
    return [clean_str(sent) for sent in corpus]


vec_dim = 300

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
dump_picle(vocab, './data/corpus/vader/vocab_moview_tweets.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print('Vocabulary size: %s' % str(len(vocab)))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_tweets.p')
print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_tweets.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_tweets.p')
print(idx_data[0])
print(ratings[0])

print('OK')