def start_read(corpus_path, posting_path, term_dictionary, stemmer):
    """Read the corpus in batches, parse each batch, and persist the city dictionary.

    Two passes over the corpus directory:
      1. Build a city dictionary from every sub-directory (skipping stop_words.txt).
      2. Re-list the directory and feed documents to ``ReadFile.split_doc``,
         flushing an accumulated batch to ``Parse.parse_docs`` every ``size`` docs.

    Finally pickles ``corpus_city_dictionary`` into ``posting_path`` (file name
    depends on ``stemmer``).

    :param corpus_path: root directory of the corpus (also holds stop_words.txt)
    :param posting_path: output directory for posting files and the pickled dict
    :param term_dictionary: shared term dictionary mutated by Parse.parse_docs
    :param stemmer: truthy -> stemmed pipeline and "_stemmer" pickle file name
    :return: the 1-based index of the last batch flushed (``idx``)
    """
    Parse.set_stop_words_file(corpus_path + "/stop_words.txt")
    Parse.copy_stop_words_file(corpus_path + "/stop_words.txt", posting_path + "/stop_words.txt")
    directory_corpus = os.fsdecode(corpus_path)
    sub_dir_list = os.listdir(directory_corpus)
    # Split the corpus into ~11 parts; clamp to a sane minimum for tiny corpora.
    size = int((len(sub_dir_list) - 1) / 10)
    if size == 0:
        size = 10
    idx = 1
    counter = 1
    ReadFile.__reset()
    ReadFile.creat_world_city_dictionary()
    # Pass 1: collect city metadata from every corpus sub-directory.
    for sub_dir in sub_dir_list:
        if sub_dir != "stop_words.txt":
            ReadFile.creat_corpus_city_dictionary(corpus_path, sub_dir)
    # Snapshot the city data, keyed in sorted order, each entry paired with an
    # empty dict that parse_docs fills in later.
    corpus_city_dictionary = {}
    city_dic = ReadFile.corpus_city_dictionary
    for key in sorted(city_dic):
        corpus_city_dictionary[key] = [city_dic[key], {}]
    ReadFile.__reset()
    # Pass 2: read documents, flushing a parsed batch every `size` documents.
    sub_dir_list = os.listdir(directory_corpus)
    pending = 0  # documents accumulated in ReadFile since the last flush
    for sub_dir in sub_dir_list:
        if sub_dir != "stop_words.txt":
            if counter % size == 0:
                # Flush the batch accumulated so far, then reset the reader
                # state before splitting the current document.
                Parse.parse_docs({}, term_dictionary, corpus_city_dictionary, ReadFile.textDic,
                                 ReadFile.docDictionary, idx, stemmer, posting_path)
                ReadFile.__reset()
                idx += 1
                pending = 0
            ReadFile.split_doc(corpus_path, sub_dir)
            counter += 1
            pending += 1
    # BUG FIX: the old guard `if not counter % size == 0` skipped this final
    # flush exactly when `counter` ended on a multiple of `size`, silently
    # dropping every document read since the last in-loop flush. A document is
    # always split after each in-loop flush, so data is pending whenever any
    # document was read at all — flush iff `pending` is non-zero. (Side
    # effect: an empty corpus no longer triggers a spurious empty flush.)
    if pending > 0:
        Parse.parse_docs({}, term_dictionary, corpus_city_dictionary, ReadFile.textDic,
                         ReadFile.docDictionary, idx, stemmer, posting_path)
        ReadFile.__reset()
    # Persist the city dictionary; file name encodes whether stemming was on.
    if stemmer:
        pickle_path = posting_path + "/city_dictionary_stemmer" + ".pkl"
    else:
        pickle_path = posting_path + "/city_dictionary" + ".pkl"
    # `with` guarantees the handle closes even if pickle.dump raises.
    with open(pickle_path, "wb+") as out_file:
        pickle.dump(corpus_city_dictionary, out_file, pickle.HIGHEST_PROTOCOL)
    return idx