class FlatFileSaveSet(object):
    """Save the rows of a CSV source into a database, one chunk at a time.

    The CSV source is wrapped in a ``CsvFile`` and split into chunks of
    ``lines_per_chunk`` lines.  Rows whose ``_core_id`` or ``id`` value is
    the string ``"prev"`` are treated as belonging to the record before
    them, so the leading ``"prev"`` rows of a chunk are saved together with
    the preceding chunk rather than with their own.
    """

    def __init__(self, database, path=None, table=None, buffer=None,
                 lines_per_chunk=10000000000):
        """Prepare the CSV source and split it into chunks.

        :param database: passed through to ``MultipleSaveSet`` on save.
        :param path: forwarded to ``CsvFile`` as ``path``.
        :param table: passed through to ``MultipleSaveSet`` on save.
        :param buffer: forwarded to ``CsvFile`` as ``buffer``.
        :param lines_per_chunk: chunk size handed to ``CsvFile.chunk``.
        """
        self.database = database
        self.table = table
        self.file = None
        # NOTE(review): "strinhg" looks like a typo for "string"; the name
        # is left untouched because it is a public attribute.
        self.strinhg = None

        self.csv_file = CsvFile(path=path, buffer=buffer)
        self.csv_file.get_dialect()
        self.csv_file.get_headings()
        self.csv_file.parse_headings()
        self.csv_file.guess_types()

        self.lines_per_chunk = lines_per_chunk
        # Value returned by CsvFile.chunk; used by load_chunk as the end of
        # the range when there is no following chunk.
        self.lines = self.csv_file.chunk(lines_per_chunk)
        self.chunks = self.csv_file.chunks
        self.chunk_status = []

    @staticmethod
    def _continues_previous(line):
        """Return True when *line* is marked ``"prev"`` in ``_core_id`` or ``id``."""
        return "prev" in (line.get("_core_id"), line.get("id"))

    @staticmethod
    def _finish(rows):
        """Send ``1`` into the *rows* generator and advance it once more.

        NOTE(review): presumably this tells ``iterate_csv`` to stop early and
        release its source -- confirm against ``CsvFile.iterate_csv``.
        """
        try:
            rows.send(1)
            next(rows)
        except StopIteration:
            pass

    def get_first_generator(self, chunk):
        """Return the rows of *chunk* with its leading ``"prev"`` rows skipped.

        :returns: ``(skipped, rows)`` where ``skipped`` is the number of
            leading ``"prev"`` rows and ``rows`` iterates the remainder of
            the chunk.  Returns an empty tuple when the chunk has no rows or
            only ``"prev"`` rows; callers unpacking the result get a
            ``ValueError`` in that case.
        """
        save_data = self.csv_file.iterate_csv(chunk, as_dict=True)
        for num, line in enumerate(save_data):
            if not self._continues_previous(line):
                break
        else:
            return ()
        self._finish(save_data)

        # A fresh iteration is started so the rows consumed while scanning
        # are not lost.
        generator = self.csv_file.iterate_csv(chunk, as_dict=True)
        return (num, islice(generator, num, None))

    def get_end_generator(self, chunk):
        """Return the leading ``"prev"`` rows of the chunk after *chunk*.

        :returns: ``(count, rows)`` where ``count`` is the number of leading
            ``"prev"`` rows in chunk ``chunk + 1`` and ``rows`` iterates
            exactly those rows.  Returns an empty list when there is no
            chunk ``chunk + 1``; callers unpacking the result get a
            ``ValueError`` in that case.
        """
        following = chunk + 1
        if following not in self.csv_file.chunks:
            return []

        save_data = self.csv_file.iterate_csv(following, as_dict=True,
                                              no_end=True)
        # Count directly instead of deriving the count from the last index,
        # so a following chunk that yields no rows gives 0 rather than 1.
        num = 0
        for line in save_data:
            if not self._continues_previous(line):
                break
            num += 1
        self._finish(save_data)

        generator = self.csv_file.iterate_csv(following, as_dict=True,
                                              no_end=True)
        return (num, islice(generator, 0, num))

    def load_chunk(self, chunk, validate=True):
        """Save one chunk, plus the ``"prev"`` rows that spill into the next.

        :param chunk: index of the chunk to save.
        :param validate: forwarded to ``MultipleSaveSet.save``.
        :returns: a ``ChunkStatus`` for an empty chunk or a locking error.
            The code visible here returns nothing explicitly after a
            successful save.
        :raises Exception: anything other than
            ``ConcurrentModificationError`` raised by the save propagates.
        """
        try:
            start, first_generator = self.get_first_generator(chunk)
        except ValueError:
            # Unpacking the empty tuple: nothing to save in this chunk.
            return ChunkStatus((0, 0), "empty chunk")

        range_start = chunk * self.lines_per_chunk + start + 1

        try:
            end, end_generator = self.get_end_generator(chunk)
        except ValueError:
            # Unpacking the empty list: there is no following chunk.
            range_end, end_generator = self.lines, []
        else:
            range_end = (chunk + 1) * self.lines_per_chunk + end

        save_data = chain(first_generator, end_generator)
        save_set = MultipleSaveSet(self.database, save_data, table=self.table)

        # NOTE(review): shadows the builtin ``range`` inside this method;
        # the local name is kept as it was.
        range = (range_start, range_end)

        try:
            # The result is not inspected in the code visible here.
            save_set_errors = save_set.save(validate=validate)
        except sa.orm.exc.ConcurrentModificationError as e:
            return ChunkStatus(range, "locking error", error=e)
        # Any other exception propagates to the caller unchanged.