# Base name shared by the word-database file and the failed-words file.
path_sname = 'wgre2'
path_addb = './{}.addb'.format(path_sname)
path_wf = './{}.wf'.format(path_sname)

# How many GetWordProcess workers the script launches.
num_gwp = 1

if __name__ == '__main__':
    # Use the stored database when one loads; otherwise start from an
    # empty AdapDictDB.
    addb = load_addb(path_addb) or AdapDictDB()

    # Bounded queues and a lock that are handed to every worker.
    iqueue = mp.Queue(maxsize=3000)
    oqueue = mp.Queue(maxsize=3000)
    mlock = mp.Lock()

    # Read the candidate words from the CSV file and report how long it took.
    # NOTE(review): path_csv is not assigned in the visible code -- confirm
    # it is defined elsewhere in the file.
    parse_started = time.time()
    csvparser = CSVParser(path_csv)
    cands_word = csvparser.process_file()
    time_parsecsv = time.time() - parse_started
    print("{:.3f} seconds passed for parsing {} words from {}".format(
        time_parsecsv, len(cands_word), path_csv))

    # Create every worker first, then start them all.
    parsers = [
        GetWordProcess(mlock, iqueue, oqueue) for _ in range(num_gwp)
    ]
    for parser in parsers:
        parser.start()

    # Bookkeeping for the word-processing phase.
    time_getwords = time.time()
    words_in_addb = addb.words.keys()
    words_failed = []
def parse_csv(self, name_csv='default'):
    """Process the words of ``./csvs/<name_csv>.csv`` into ``self.addb``.

    The candidate words returned by ``CSVParser.process_file()`` are pushed
    to ``self.num_gwp`` ``GetWordProcess`` workers through ``self.iqueue``
    (candidates for which ``self.addb.exist_word`` is true are skipped).
    Results are read back from ``self.oqueue`` and stored in ``self.addb``.
    When every worker has reported ``'FINISHED'`` the database is saved to
    ``self.path_addb`` and the failed words to ``self.path_wf``.

    Args:
        name_csv: Base name (without ``.csv``) of the file in ``./csvs``.

    Returns:
        The number of results for which ``self.addb.add_word`` returned a
        falsy value, i.e. the words counted as finished by this call.
    """
    path_csv = './csvs/{}.csv'.format(name_csv)

    time_parsecsv = time.time()
    cands_word = CSVParser(path_csv).process_file()
    time_parsecsv = time.time() - time_parsecsv
    print("{:.3f} seconds passed for parsing {} words from {}".format(
        time_parsecsv, len(cands_word), path_csv))

    # Create every worker first, then start them all.
    parsers = [
        GetWordProcess(self.mlock, self.iqueue, self.oqueue)
        for _ in range(self.num_gwp)
    ]
    for parser in parsers:
        parser.start()

    time_getwords = time.time()
    words_failed = []
    num_finished_words = 0
    num_words_existed = 0
    num_twords = len(cands_word)
    # Report progress roughly every 2% of the candidates (at least every word).
    num_to_print = max(num_twords // 50, 1)
    num_words_per_iteration = 100
    cws = list(cands_word)
    num_finished = 0
    # The end-of-input markers must be queued exactly once: self.iqueue
    # outlives this call, so surplus 'FINISHED' items would stay queued and
    # could be picked up by the workers of a later call.
    sentinels_sent = False

    # NOTE(review): if the queues are multiprocessing.Queue objects, qsize()
    # is only approximate and is not implemented on macOS -- confirm the
    # target platforms.
    while num_finished < self.num_gwp:
        # Feed the next batch only while the backlog of unread results is
        # small.
        if cws and self.oqueue.qsize() < num_words_per_iteration:
            for cw in cws[:num_words_per_iteration]:
                if not self.addb.exist_word(cw):
                    self.iqueue.put(cw)
                else:
                    num_words_existed += 1
                time.sleep(0.001)
            cws = cws[num_words_per_iteration:]

        # Everything has been handed out and the input queue looks drained:
        # tell each worker to stop.
        if not sentinels_sent and not cws and self.iqueue.qsize() == 0:
            for _ in parsers:
                self.iqueue.put('FINISHED')
            sentinels_sent = True

        if self.oqueue.qsize() > 0:
            rw = self.oqueue.get()
            if rw:
                if isinstance(rw, list) and rw[0] == 'FINISHED':
                    # A worker is done; rw[1] carries the words it failed on.
                    num_finished += 1
                    words_failed += rw[1]
                else:
                    try:
                        if not self.addb.add_word(rw):
                            entry = self.addb.words[rw.name]
                            if rw.name not in cands_word:
                                # A result that was not among the CSV
                                # candidates gets the fixed value 4 and
                                # enlarges the total.
                                entry.set_impf(4)
                                num_twords += 1
                            else:
                                entry.set_impf(cands_word[rw.name][0])
                                entry.add_smean(cands_word[rw.name][1])
                            num_finished_words += 1
                        else:
                            print("{} is already in addb?".format(rw.name))
                    except Exception as e:
                        # Best effort: one bad result must not abort the run.
                        print(
                            "[ERROR | ParsingManager/parse_csv] {}".format(
                                str(e)))
                    if num_finished_words % num_to_print == 0:
                        # Guard the percentage against an empty total.
                        percent = (num_finished_words / num_twords * 100
                                   if num_twords else 0.0)
                        print(
                            "num_finished_words : {:5d} / {:5d} ({:.2f}%)".
                            format(num_finished_words, num_twords, percent))
        else:
            # Nothing to read yet; avoid spinning.
            time.sleep(0.05)

    for parser in parsers:
        parser.terminate()
        parser.join()

    time_getwords = time.time() - time_getwords
    print("\n\n\n{:.3f} seconds passed for processing {} words".format(
        time_getwords, num_twords))
    print("{} of them already existed".format(num_words_existed))
    print("words_failed: {}".format(words_failed))

    save_addb(self.addb, self.path_addb)
    self.save_wordsfailed(self.path_wf, words_failed)
    print("\n")

    # Report every candidate that did not end up in the database.
    for cw in cands_word:
        if cw not in self.addb.words:
            print("\"{}\" is not in addb!".format(cw))
    return num_finished_words