Beispiel #1
0
def remove_stopped_terms(term_list):
    """Filter terms and return the distinct, lower-cased survivors.

    The input is first passed through ``remove_terms_with_number``. A term
    is then kept only when all of the following hold:

    * its lower-cased form is not in the contents of ``stopped_terms.txt``,
    * the term is at least 5 characters long,
    * its lower-cased form is in the contents of ``used_terms.txt``.

    Both files are loaded with ``ftools.read_txt`` using these bare file
    names on every call.

    Args:
        term_list: List of term strings.

    Returns:
        A list of the distinct lower-cased terms that passed the filter.
        Duplicates are dropped through a set, so the order is unspecified.
    """
    term_list = remove_terms_with_number(term_list)
    stopped_terms = ftools.read_txt("stopped_terms.txt")
    used_terms = ftools.read_txt("used_terms.txt")
    # NOTE(review): presumably read_txt returns a list of lines; if that is
    # confirmed, wrapping both in set() would make the lookups below O(1).

    # Collect survivors in a single pass rather than deleting rejected items
    # one by one: list.remove is O(n) per call, and filtering this way leaves
    # the list handed back by remove_terms_with_number untouched.
    kept = set()
    for term in term_list:
        lowered = term.lower()
        if (
            lowered not in stopped_terms
            and len(term) >= 5
            and lowered in used_terms
        ):
            kept.add(lowered)
    return list(kept)
def read_20_newsgroup_weight(root_path):
    """Compute term weights for every file under ``mini_newsgroups/``.

    ``root_path`` is concatenated directly with the relative names, so it
    has to end with a path separator already.

    Args:
        root_path: Directory prefix that contains the ``mini_newsgroups/``
            folder (one sub-directory per group) and ``dictionary.txt``.

    Returns:
        The result of ``data_preprocess.get_txt_weight`` applied to the
        per-file term lists and the loaded dictionary.
    """
    corpus_dir = root_path + "mini_newsgroups/"
    group_names = os.listdir(corpus_dir)
    dictionary = ftools.read_txt(root_path + "dictionary.txt")

    # Enumerate every file path first, in os.listdir order, before any file
    # is actually read.
    document_paths = [
        corpus_dir + group + "/" + file_name
        for group in group_names
        for file_name in os.listdir(corpus_dir + group + "/")
    ]

    terms_per_document = [
        read_20_newsgroup_file_term(path) for path in document_paths
    ]
    return data_preprocess.get_txt_weight(terms_per_document, dictionary)
def read_20_newsgroup_file_term(text_path):
    """Return the filtered, de-duplicated terms of one newsgroup file.

    Every line read by ``ftools.read_txt`` is split into terms with
    ``data_preprocess.get_txt_terms``. The first line that has at least two
    terms and whose first term is ``"Lines"`` supplies a line count; only
    that many trailing lines of the file contribute terms.

    Args:
        text_path: Path of the file to read.

    Returns:
        The result of ``data_preprocess.remove_stopped_terms`` applied to
        the distinct terms of the selected lines.

    Raises:
        ValueError: Propagated from ``int()`` when the term following
            ``"Lines"`` is not a valid integer.
    """
    lines = ftools.read_txt(text_path)
    terms_per_line = [data_preprocess.get_txt_terms(line) for line in lines]

    # None means "no Lines header found"; a numeric sentinel would be
    # indistinguishable from a real count once it is used in a slice.
    body_line_count = None
    for line_terms in terms_per_line:
        if len(line_terms) > 1 and line_terms[0] == "Lines":
            body_line_count = int(line_terms[1])
            break

    if body_line_count is None:
        # Without a header there is nothing to separate, so use every line.
        body = terms_per_line
    elif body_line_count > 0:
        body = terms_per_line[-body_line_count:]
    else:
        # A count of 0 must select nothing: seq[-0:] is seq[0:], which would
        # otherwise return the whole file, headers included.
        body = []

    unique_terms = list({term for line_terms in body for term in line_terms})
    return data_preprocess.remove_stopped_terms(unique_terms)
Beispiel #4
0
def read_NYSK_weight(root_path):
    """Compute term weights for the NYSK texts found under ``root_path``.

    Args:
        root_path: Directory prefix; it is concatenated directly with
            ``"dictionary.txt"`` and also handed to
            ``read_NYSK_file_text_term_list``.

    Returns:
        The result of ``data_preprocess.get_txt_weight`` for the per-text
        term lists and the loaded dictionary.
    """
    return data_preprocess.get_txt_weight(
        read_NYSK_file_text_term_list(root_path),
        ftools.read_txt(root_path + "dictionary.txt"),
    )
Beispiel #5
0
def read_NYSK_feature_vector(root_path):
    """Build feature vectors for the NYSK texts found under ``root_path``.

    Args:
        root_path: Directory prefix; it is concatenated directly with
            ``"dictionary.txt"`` and also handed to
            ``read_NYSK_file_text_term_list``.

    Returns:
        The result of ``data_preprocess.get_txt_feature_vector`` for the
        per-text term lists and the loaded dictionary.
    """
    return data_preprocess.get_txt_feature_vector(
        read_NYSK_file_text_term_list(root_path),
        ftools.read_txt(root_path + "dictionary.txt"),
    )