class Group(object):
    # Represents a group of data.
    # Groups can be compared to each other.
    # A group tries to combine common data stored in more than one data file.

    def __init__(self, name, fileTypes):
        self._name = name
        self._fileTypes = fileTypes
        # Instance attributes: class-level lists would be shared by every Group.
        self._headings = []
        self._headingsType = []
        self._currRow = 0
        self._allData = []  # rows accumulated from the parsed files; sorted by sort()

    # @param filenameList: List of filenames that contain data.
    # @summary:
    #     Uses the file parsers to parse each data file and combine common data.
    def createGroup(self, filenameList):
        # Collect every distinct heading (and its type) across all the files.
        for filename in filenameList:
            if filename.endswith('.csv'):
                self._parser = CSVParser(filename)
                self._parser.parse()
                for heading in self._parser.getHeadings():
                    if heading not in self._headings:
                        self._headings.append(heading)
                        self._headingsType.append(self._parser.getHeadingType(heading))
        # Persist the combined schema as a single table in the group's database.
        db = SQL("C:\\PerGraph\\" + self._name + ".db")
        db.connect()
        db.createTable("data", self._headings, self._headingsType)
        db.commit()
        db.close()

    def sort(self, heading):
        print('BEFORE')
        for item in self._allData:
            print(item)
        try:
            idx = self._headings.index(heading)
            # key= replaces the Python 2 cmp()-based sort, which no longer exists.
            self._allData.sort(key=lambda row: row[idx])
        except ValueError:
            print('Header %s does not exist' % heading)
        print('AFTER')
        for item in self._allData:
            print(item)
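# A minimal usage sketch for Group. The group name, file names, and heading
# below are hypothetical placeholders; CSVParser and SQL are this project's
# own classes and must be importable here, and C:\PerGraph must exist.
if __name__ == '__main__':
    group = Group('cpu_stats', ['csv'])
    # Merge the headings of both files and create the combined "data" table.
    group.createGroup(['perf_run1.csv', 'perf_run2.csv'])
    # Sort the in-memory rows by one of the collected headings.
    group.sort('timestamp')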
import multiprocessing as mp
import time

path_csv = './csvs/words_gre.csv'
path_sname = 'wgre2'
path_addb = './{}.addb'.format(path_sname)
path_wf = './{}.wf'.format(path_sname)
num_gwp = 1  # number of GetWordProcess workers

if __name__ == '__main__':
    # Load the persisted dictionary DB, or start a fresh one on the first run.
    addb = load_addb(path_addb)
    if not addb:
        addb = AdapDictDB()

    # Bounded queues: iqueue carries candidate words to the workers,
    # oqueue carries parsed results back to this process.
    iqueue = mp.Queue(maxsize=3000)
    oqueue = mp.Queue(maxsize=3000)
    mlock = mp.Lock()

    time_parsecsv = time.time()
    csvparser = CSVParser(path_csv)
    cands_word = csvparser.process_file()
    time_parsecsv = time.time() - time_parsecsv
    print("{:.3f} seconds passed for parsing {} words from {}".format(
        time_parsecsv, len(cands_word), path_csv))

    parsers = [GetWordProcess(mlock, iqueue, oqueue) for _ in range(num_gwp)]
    for parser in parsers:
        parser.start()

    time_getwords = time.time()
    words_in_addb = addb.words.keys()
    words_failed = list()
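# load_addb/save_addb are defined elsewhere in this project. As context only,
# here is a minimal sketch of what pickle-based implementations could look
# like; this is an assumption, not the project's actual code.
import os
import pickle

def load_addb(path):
    # Return the persisted AdapDictDB, or None on a first run (the caller
    # above falls back to a fresh AdapDictDB() in that case).
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_addb(addb, path):
    with open(path, 'wb') as f:
        pickle.dump(addb, f)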
    def parse_csv(self, name_csv='default'):
        path_csv = './csvs/{}.csv'.format(name_csv)

        time_parsecsv = time.time()
        csvparser = CSVParser(path_csv)
        cands_word = csvparser.process_file()
        time_parsecsv = time.time() - time_parsecsv
        print("{:.3f} seconds passed for parsing {} words from {}".format(
            time_parsecsv, len(cands_word), path_csv))

        # Spawn the worker processes that look words up and reply on oqueue.
        parsers = [GetWordProcess(self.mlock, self.iqueue, self.oqueue)
                   for _ in range(self.num_gwp)]
        for parser in parsers:
            parser.start()

        time_getwords = time.time()
        words_failed = list()
        num_finished_words = 0
        num_words_existed = 0
        num_twords = len(cands_word)
        num_to_print = max(int(num_twords / 50), 1)  # progress line every ~2%
        num_words_per_iteration = 100
        cws = list(cands_word.keys())  # candidate words still to enqueue
        num_finished = 0

        # Main loop: feed candidates to the workers in batches and drain their
        # results until every worker has echoed the 'FINISHED' sentinel back.
        while num_finished < self.num_gwp:
            # Enqueue the next batch, skipping words already in the DB.
            if len(cws) > 0 and self.oqueue.qsize() < num_words_per_iteration:
                for cw in cws[:num_words_per_iteration]:
                    if not self.addb.exist_word(cw):
                        self.iqueue.put(cw)
                    else:
                        num_words_existed += 1
                    # cws.remove(cw)
                    time.sleep(0.001)
                cws = cws[num_words_per_iteration:]

            # All candidates enqueued: tell each worker to shut down.
            if len(cws) == 0 and self.iqueue.qsize() == 0:
                for i in range(len(parsers)):
                    self.iqueue.put('FINISHED')

            if self.oqueue.qsize() > 0:
                rw = self.oqueue.get()
                if rw:
                    if isinstance(rw, list) and rw[0] == 'FINISHED':
                        # ['FINISHED', failed_words] marks one worker as done.
                        num_finished += 1
                        words_failed += rw[1]
                    else:
                        try:
                            # add_word() returns a falsy value on success.
                            if not self.addb.add_word(rw):
                                if rw.name not in cands_word:
                                    # A word outside the CSV: default importance.
                                    self.addb.words[rw.name].set_impf(4)
                                    num_twords += 1
                                else:
                                    self.addb.words[rw.name].set_impf(
                                        cands_word[rw.name][0])
                                    self.addb.words[rw.name].add_smean(
                                        cands_word[rw.name][1])
                                # print("new word {} added with impf {}".format(rw.name, addb.words[rw.name].get_impf()))
                                num_finished_words += 1
                                # rw.print_part()
                            else:
                                print("{} is already in addb?".format(rw.name))
                        except Exception as e:
                            print("[ERROR | ParsingManager/parse_csv] {}".format(
                                str(e)))
                        if num_finished_words % num_to_print == 0:
                            print("num_finished_words : {:5d} / {:5d} ({:.2f}%)".format(
                                num_finished_words, num_twords,
                                num_finished_words / num_twords * 100))
            else:
                time.sleep(0.05)

        for parser in parsers:
            parser.terminate()
            parser.join()

        time_getwords = time.time() - time_getwords
        print("\n\n\n{:.3f} seconds passed for processing {} words".format(
            time_getwords, num_twords))
        print("{} of them already existed".format(num_words_existed))
        print("words_failed: {}".format(words_failed))

        save_addb(self.addb, self.path_addb)
        self.save_wordsfailed(self.path_wf, words_failed)

        print("\n")
        for cw in cands_word:
            if cw not in self.addb.words.keys():
                print("\"{}\" is not in addb!".format(cw))
        # self.addb.view_words()
        return num_finished_words
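# Self-contained sketch of the queue handshake parse_csv relies on: the
# manager feeds candidate words into iqueue, each worker replies on oqueue,
# and once a worker reads the 'FINISHED' sentinel it echoes back a
# ['FINISHED', failed_words] list so the manager knows it is done.
# EchoWorker is a hypothetical stand-in for the project's GetWordProcess.
import multiprocessing as mp

class EchoWorker(mp.Process):
    def __init__(self, iqueue, oqueue):
        super().__init__()
        self.iqueue = iqueue
        self.oqueue = oqueue

    def run(self):
        failed = []
        while True:
            word = self.iqueue.get()
            if word == 'FINISHED':
                # Echo the sentinel (plus any failures) and exit cleanly.
                self.oqueue.put(['FINISHED', failed])
                return
            self.oqueue.put(word.upper())  # stand-in for the real word lookup

if __name__ == '__main__':
    iqueue, oqueue = mp.Queue(), mp.Queue()
    worker = EchoWorker(iqueue, oqueue)
    worker.start()
    for w in ['ephemeral', 'laconic', 'FINISHED']:
        iqueue.put(w)
    num_finished = 0
    while num_finished < 1:
        rw = oqueue.get()
        if isinstance(rw, list) and rw[0] == 'FINISHED':
            num_finished += 1
        else:
            print(rw)
    worker.join()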