-
Notifications
You must be signed in to change notification settings - Fork 0
/
bundle.py
116 lines (98 loc) · 3.64 KB
/
bundle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import avro.schema
from avro.datafile import DataFileWriter
import os
import time
import avro.io
import hashlib
import math
# Nominal size budget, in bytes, for one Avro part file (800 * 10**6).
# create_archive starts a new part once the current one would grow past
# 1.1x this value, and get_file_chunks reads source files in pieces of
# one tenth of it.
maxfilesize = 800 * 1000 * 1000
def makedir(name, relative_path):
    """Build the archive record describing a directory.

    name: the directory's own name.
    relative_path: the directory's path as it should be stored in the
        record.

    Returns a dict with the same keys as a file record; the sibling
    counters are zero and the content fields are None.
    """
    entry = {
        "FSType": "DIRECTORY",
        "Name": name,
        "RelativePath": relative_path,
    }
    # Directories are never split into parts and carry no payload.
    entry["NumberSiblings"] = 0
    entry["SiblingPartNumber"] = 0
    entry["ContentMD5"] = None
    entry["Content"] = None
    return entry
def makefile(name, relative_path, number_siblings, sibling_part_number, bytes):
    """Build the archive record for one part of a file.

    name: the file's own name.
    relative_path: the file's path as it should be stored in the record.
    number_siblings: value stored under "NumberSiblings".
    sibling_part_number: value stored under "SiblingPartNumber".
    bytes: the raw content of this part; stored as "Content" and hashed
        for "ContentMD5".

    Returns a dict with the same keys as a directory record.
    """
    # The checksum covers only this part's payload, not the whole file.
    digest = hashlib.md5(bytes).hexdigest()
    fields = (
        ("FSType", "FILE"),
        ("Name", name),
        ("RelativePath", relative_path),
        ("NumberSiblings", number_siblings),
        ("SiblingPartNumber", sibling_part_number),
        ("ContentMD5", digest),
        ("Content", bytes),
    )
    return dict(fields)
def get_file_chunks(file, chunk_size=None):
    """Yield the content of ``file`` in consecutive fixed-size pieces.

    file: path of the file to read (opened in binary mode).
    chunk_size: maximum number of bytes per piece.  Defaults to one
        tenth of the module-level ``maxfilesize``.

    Yields ``(sibling_number, numsiblings, data)`` tuples where
    ``sibling_number`` counts the pieces from 0, ``numsiblings`` is the
    index of the last piece (total pieces minus one) and ``data`` is the
    raw bytes of the piece.  An empty file yields exactly one piece with
    empty ``data`` so that it is still represented in the archive.

    Raises ValueError if ``chunk_size`` is not positive.
    """
    if chunk_size is None:
        # Floor division keeps the read size an int on Python 3 as well.
        chunk_size = maxfilesize // 10
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    size = os.path.getsize(file)
    # Ceiling division by the size actually used for reading, so the
    # reported sibling count matches the pieces that are yielded.
    total_parts = max(1, -(-size // chunk_size))
    numsiblings = total_parts - 1

    sibling_number = 0
    with open(file, 'rb') as fd:
        while True:
            data = fd.read(chunk_size)
            if not data:
                if sibling_number == 0:
                    # Empty file: emit a single empty piece instead of
                    # silently dropping the file from the archive.
                    yield sibling_number, numsiblings, data
                break
            yield sibling_number, numsiblings, data
            sibling_number += 1
def rotate_avro_file(fd, writer, iteration, fileprefix, destdir, datum, schema):
    """Finish the current Avro part file and start the next one.

    fd, writer: the open output file and the DataFileWriter wrapping it;
        both are closed here.
    iteration: number of the part being finished.
    fileprefix, destdir: the new part is created in ``destdir`` as
        ``<fileprefix>-part-NNNN.avro`` with NNNN = ``iteration + 1``,
        zero-padded to four digits.
    datum, schema: passed through to the new DataFileWriter.

    Returns ``(fd, writer, iteration)`` for the newly opened part.
    """
    next_iteration = iteration + 1
    part_suffix = "-part-{0:04d}.avro".format(next_iteration)
    part_path = os.path.join(destdir, fileprefix + part_suffix)

    # Shut down the finished part before the replacement is created.
    writer.close()
    fd.close()

    next_fd = open(part_path, 'wb')
    next_writer = DataFileWriter(next_fd, datum, schema, codec='deflate')
    return next_fd, next_writer, next_iteration
def create_archive(basedir, destdir):
    """Bundle the tree under ``basedir`` into Avro part files, then delete it.

    Every directory and file below ``basedir`` is written as a record to
    ``<timestamp>-part-NNNN.avro`` files in ``destdir``; a new part is
    started whenever the current one would grow past 1.1x
    ``maxfilesize``.  The record schema is read from
    ``avro-schemas.json`` next to this module.

    WARNING: after all parts have been written and closed successfully,
    the bundled files and directories are removed from disk.  ``basedir``
    itself is kept, as are ``destdir`` and its parent directories when
    ``destdir`` lies inside ``basedir``.

    basedir: root of the tree to bundle.
    destdir: existing directory that receives the part files.

    Raises whatever the underlying file or Avro operations raise (for
    example OSError); in that case nothing is deleted.
    """
    # Snapshot the tree before any output exists, so part files written
    # into a destdir inside basedir are never bundled themselves.
    all_files = []
    all_dirs = []
    for path, dirs, files in os.walk(basedir):
        all_dirs.extend(os.path.join(path, d) for d in dirs)
        all_files.extend(os.path.join(path, f) for f in files)

    schema_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "avro-schemas.json")
    with open(schema_path) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    iteration = 1
    avrofile = fileprefix + "-part-{0:04d}.avro".format(iteration)
    datum = avro.io.DatumWriter()
    fd = open(os.path.join(destdir, avrofile), 'wb')
    try:
        writer = DataFileWriter(fd, datum, schema, codec='deflate')
    except Exception:
        # Do not leak the output handle if the writer cannot be set up.
        fd.close()
        raise

    try:
        for d in all_dirs:
            writer.append(makedir(os.path.basename(os.path.normpath(d)),
                                  os.path.relpath(d, basedir)))
        for f in all_files:
            name = os.path.basename(os.path.normpath(f))
            relative_path = os.path.relpath(f, basedir)
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(fd,
                                                             writer,
                                                             iteration,
                                                             fileprefix,
                                                             destdir,
                                                             datum,
                                                             schema)
                writer.append(makefile(name,
                                       relative_path,
                                       numsiblings,
                                       sibling,
                                       chunk))
                # Flush per record so fd.tell() reflects what is on disk
                # when the rotation check runs for the next chunk.
                writer.flush()
    finally:
        writer.close()
        fd.close()

    # Only reached when writing and closing succeeded: the sources are
    # removed after the archive is safely on disk, never before.
    for f in all_files:
        os.remove(f)

    dest_real = os.path.realpath(destdir)
    # os.walk lists parents before their children; remove in reverse so
    # each directory is already empty when os.rmdir reaches it.
    for d in reversed(all_dirs):
        d_real = os.path.realpath(d)
        if dest_real == d_real or dest_real.startswith(d_real + os.sep):
            # Holds the freshly written archive parts (or is a parent of
            # the directory that does), so it cannot be removed.
            continue
        os.rmdir(d)