Example #1
import os
import pickle

# Parse and ReadFile are assumed to be project-local modules exposing the
# parsing and corpus-reading helpers used below.
import Parse
import ReadFile


def start_read(corpus_path, posting_path, term_dictionary, stemmer):
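    """Read the corpus in batches, parse each batch, and pickle the city dictionary.

    The corpus directory is expected to contain a stop_words.txt file alongside the
    document sub-directories. Returns the batch counter ``idx`` used to number the
    posting files.
    """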
    Parse.set_stop_words_file(corpus_path + "/stop_words.txt")
    Parse.copy_stop_words_file(corpus_path + "/stop_words.txt", posting_path + "/stop_words.txt")
    directory_corpus = os.fsdecode(corpus_path)
    sub_dir_list = os.listdir(directory_corpus)
    size = int((len(sub_dir_list) - 1) / 10)  # batch size: split the corpus (excluding stop_words.txt) into roughly 11 parts
    if size == 0:  # very small corpus: fall back to a fixed batch size
        size = 10
    idx = 1
    counter = 1
    ReadFile.__reset()
    ReadFile.creat_world_city_dictionary()
    # first pass: collect city information from every document sub-directory
    for sub_dir in sub_dir_list:
        if sub_dir != "stop_words.txt":
            ReadFile.creat_corpus_city_dictionary(corpus_path, sub_dir)
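    # sort the collected cities and pair each city's data with an empty dict,
    # presumably filled with per-document information during parsing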
    corpus_city_dictionary = {}
    city_dic = ReadFile.corpus_city_dictionary
    city_list = sorted(city_dic)
    for key in city_list:
        corpus_city_dictionary[key] = [city_dic[key], {}]
    ReadFile.__reset()
    sub_dir_list = os.listdir(directory_corpus)
    # second pass: split the documents and parse them batch by batch
    for sub_dir in sub_dir_list:
        if sub_dir != "stop_words.txt":
            if counter % size == 0:
                # a full batch has been read: parse it, write its postings, and reset the reader
                Parse.parse_docs({}, term_dictionary, corpus_city_dictionary, ReadFile.textDic,
                                 ReadFile.docDictionary, idx, stemmer, posting_path)
                ReadFile.__reset()
                idx += 1
            ReadFile.split_doc(corpus_path, sub_dir)
            counter += 1
    # flush whatever is left over from the last, possibly partial, batch
    if counter % size != 0:
        Parse.parse_docs({}, term_dictionary, corpus_city_dictionary, ReadFile.textDic,
                         ReadFile.docDictionary, idx, stemmer, posting_path)
        ReadFile.__reset()
    # persist the city dictionary; the file name records whether stemming was used
    if stemmer:
        file_name = "city_dictionary_stemmer.pkl"
    else:
        file_name = "city_dictionary.pkl"
    with open(posting_path + "/" + file_name, "wb+") as file:
        pickle.dump(corpus_city_dictionary, file, pickle.HIGHEST_PROTOCOL)
    return idx
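

# A minimal usage sketch (assumptions: start_read is importable from this module, the
# corpus directory contains stop_words.txt plus per-directory document batches, stemmer
# is a boolean flag, term_dictionary is filled in place by Parse.parse_docs, and the
# paths below are placeholders, not the project's real layout):
if __name__ == "__main__":
    term_dictionary = {}
    last_batch = start_read("/path/to/corpus", "/path/to/postings", term_dictionary, stemmer=True)
    print("last batch index:", last_batch, "- terms collected:", len(term_dictionary))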