Esempio n. 1
0
class FlatFileSaveSet(object):

    def __init__(self, database, path=None, table=None, buffer=None,
                 lines_per_chunk=10000000000):
        """Prepare a save set that reads its rows from a CSV source.

        database -- stored on the instance; handed to MultipleSaveSet when a
                    chunk is loaded.
        path, buffer -- forwarded unchanged to CsvFile as the data source.
        table -- stored on the instance; handed to MultipleSaveSet when a
                 chunk is loaded.
        lines_per_chunk -- chunk size passed to CsvFile.chunk().
        """
        self.database, self.table = database, table
        self.lines_per_chunk = lines_per_chunk

        # Placeholders: nothing in this constructor fills them in.
        self.file = None
        # NOTE(review): looks like a typo for "string"; the attribute name is
        # kept as-is in case other code reads it.
        self.strinhg = None

        csv_file = self.csv_file = CsvFile(path=path, buffer=buffer)

        # The inspection steps run in this fixed order before chunking;
        # presumably each one relies on the result of the previous step.
        csv_file.get_dialect()
        csv_file.get_headings()
        csv_file.parse_headings()
        csv_file.guess_types()

        # chunk() must run before .chunks is read below.
        self.lines = csv_file.chunk(lines_per_chunk)
        self.chunks = csv_file.chunks
        self.chunk_status = []

    def get_first_generator(self, chunk):
        save_data = self.csv_file.iterate_csv(chunk, as_dict = True)
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            return ()

        try:
            save_data.send(1)
            save_data.next()
        except StopIteration:
            pass

        generator = self.csv_file.iterate_csv(chunk, as_dict = True)

        return (num, islice(generator, num, None))


    def get_end_generator(self, chunk):

        if chunk + 1 not in self.csv_file.chunks:
            return []
        save_data = self.csv_file.iterate_csv(chunk + 1, as_dict = True,
                                              no_end = True)

        num = 0
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            num = num + 1

        try:
            save_data.send(1)
            save_data.next()
        except StopIteration:
            pass

        generator = self.csv_file.iterate_csv(chunk + 1, as_dict = True,
                                                  no_end = True)

        return (num, islice(generator, 0, num))

    def load_chunk(self, chunk, validate = True):
        """Save the rows that belong to one chunk of the CSV file.

        The rows are taken from get_first_generator(chunk) followed by
        get_end_generator(chunk), chained together and handed to a
        MultipleSaveSet built with self.database and self.table.

        chunk -- chunk number; combined with self.lines_per_chunk to work
                 out the line range reported in the ChunkStatus.
        validate -- passed through to MultipleSaveSet.save().

        Failure handling: returns ChunkStatus((0,0), "empty chunk") when
        get_first_generator() gives nothing to unpack, and
        ChunkStatus(range, "locking error", error = e) when save() raises
        sa.orm.exc.ConcurrentModificationError.  Any other exception raised
        by save() propagates to the caller.
        """

        try:
            start, first_generator = self.get_first_generator(chunk)
        except ValueError:
            # get_first_generator() returns an empty tuple when it finds no
            # row to start from; unpacking that raises ValueError.
            return ChunkStatus((0,0), "empty chunk")

        # NOTE(review): the "+ 1" presumably turns the zero-based row index
        # into a one-based line number -- confirm.
        range_start = chunk * self.lines_per_chunk + start + 1

        try:
            end, end_generator = self.get_end_generator(chunk)
            range_end = (chunk + 1) * self.lines_per_chunk + end
        except ValueError:
            # get_end_generator() returns an empty list when there is no
            # following chunk; unpacking that raises ValueError.  In that
            # case the range ends at self.lines and no extra rows are added.
            range_end, end_generator = self.lines, []

        save_data = chain(first_generator, end_generator)

        save_set = MultipleSaveSet(self.database, save_data,
                                   table = self.table)
        # Note: this local name shadows the builtin range() inside the method.
        range = (range_start, range_end)

        try:
            save_set_errors = save_set.save(validate = validate)
        except sa.orm.exc.ConcurrentModificationError, e:
            return ChunkStatus(range, "locking error", error = e)
        except Exception, e:
            # The bare raise re-raises the exception, so the return below is
            # never reached.
            raise
            return ChunkStatus(range, "unknown error", error = e)