Example 1
0
def merge_ham_spam():
    """Load ham and spam corpora, tokenize them, and return one labeled DataFrame.

    Returns:
        pandas.DataFrame with columns:
            sent  -- tokenized sentence (output of ``cut``)
            label -- 0 for ham, 1 for spam
    """
    ham_file = "./data/ham.txt"
    spam_file = "./data/spam.txt"

    # Load raw text and tokenize each corpus.
    import load
    ham_sent_list = cut(load.load_text(ham_file))
    spam_sent_list = cut(load.load_text(spam_file))

    # Build one single-column DataFrame per class.
    # ham label 0, spam label 1
    ham_df = pd.DataFrame({"sent": pd.Series(ham_sent_list)})
    ham_df["label"] = 0
    spam_df = pd.DataFrame({"sent": pd.Series(spam_sent_list)})
    spam_df["label"] = 1

    # Merge. ignore_index=True renumbers rows 0..n-1; the original concat
    # kept both source index ranges, producing duplicate labels that break
    # positional .loc lookups on the combined frame.
    df = pd.concat([ham_df, spam_df], ignore_index=True)
    # df.to_pickle("./data/sent_with_label.pkl")
    return df
Example 2
0
def main():
    """Tokenize the content corpus, map each word to its id, and pickle it."""
    import load

    # Read the raw text set and segment every sentence.
    content_file = "./data/content.txt"
    tokenized = cut(load.load_text(content_file))

    # Wrap the tokenized sentences in a single-column DataFrame.
    df = pd.DataFrame(pd.Series(tokenized))
    df.columns = ["sent"]

    # Translate tokens to vocabulary ids and persist the result.
    word2id = load.load_word_dict()
    df = sent_list_to_id(df, word2id)
    df.to_pickle("./data/content.pkl")
Example 3
0
# -*- coding: utf-8 -*-
import re
# import 20
import load

title = "イギリス"
fn = "jawiki-country.json.gz"


def getAllCategory(text):
    """Return the category names from every ``[[Category:...]]`` line of *text*.

    Group 1 captures only the name; a trailing sort key (``|...``) is dropped.
    """
    # Raw string: the original plain string relied on invalid escape
    # sequences ("\[") that raise a DeprecationWarning on modern Python.
    pattern = re.compile(r"^\[\[Category:(.*?)(\|.*\]\]|\]\])")
    # Match each line exactly once (the original called p.match twice per line).
    return [
        m.group(1)
        for m in map(pattern.match, text.splitlines())
        if m
    ]


# Fetch the article body for *title* and print every category it belongs to.
text = load.load_text(fn, title)
print(getAllCategory(text))
Example 4
0
from pair_extract import pair
from apply_to_hotels import load_hotel


def y_nn():
    """Read whitespace-separated integer labels from ``y_nn.txt``."""
    with open('y_nn.txt') as f:
        return [int(token) for token in f.read().split()]


if __name__ == '__main__':

    # y_result = y_nn()
    # print(y_result)

    # NOTE(review): load_text and svm are not imported in this file's visible
    # imports (only pair and load_hotel are) -- presumably they come from a
    # sibling module; confirm before running, otherwise this raises NameError.
    # Train split: restaurant-domain markup with gold aspect annotations.
    X_pure_train, X_sentences_train, aspects_list_train, _ = load_text(
        'SentiRuEval_rest_markup_train.xml')
    #X_pure_test, X_sentences_test, aspects_list_test, X_p = load_text('SentiRuEval_rest_markup_test.xml')
    # Test split: hotel-domain data loaded via apply_to_hotels.load_hotel.
    X_pure_test, X_sentences_test, X_p = load_hotel()

    # Train on restaurant markup and predict on the hotel sentences;
    # svm returns three result sets (semantics defined where svm lives).
    y_result1, y_result2, y_result3 = svm(X_pure_train, X_sentences_train,
                                          aspects_list_train, X_pure_test,
                                          X_sentences_test)

    # Feed the third prediction set into pair_extract.pair along with X_p
    # (presumably candidate aspect pairs -- verify against pair_extract).
    pair(X_p, y_result3)

    #y_result = lingvistic(X_pure_test)
    #y_result = freq(X_pure_train, X_pure_test)

    #print (len(X_pure_test), len(y_result1))

    #save_result(X_pure_test, y_result, 'SentiRuEval_result_rest_test_on_rest_2LSTM.xml')