Beispiel #1
0
    def _write_file(self, uuid, md5, size, path, full_path):
        """Validate one file on disk and queue a mutation describing it.

        The file at ``full_path`` is read in full, then ``uuid``, ``md5``,
        ``size`` and the content hash are checked.  ``path`` is rewritten to
        start with ``uuid + '/'`` when it starts with ``latest/`` or
        ``content`` instead.

        The mutation is built from ``md5`` and always records the project
        uuid and the path.  Size, content and the ``twosix``/``tags`` column
        are only written the first time an md5 is seen (tracked in
        ``self.hashes``).  The mutation is handed to ``self.wr``.

        Raises:
            ValueError: if ``size`` does not match the number of bytes read,
                or ``path`` has none of the accepted prefixes.  The ``util``
                checks may raise as well.
        """
        with open(full_path, 'rb') as handle:
            content = handle.read()

        util.check_uuid(uuid)
        util.check_md5(md5)
        actual_size = len(content)
        if size != actual_size:
            raise ValueError(
                'size of file {}: {} != {}'.format(path, actual_size, size))
        util.check_md5_hash(content, md5)

        # Make sure the stored path is always rooted at the project uuid.
        if not path.startswith(uuid + '/'):
            if not path.startswith(('latest/', 'content')):
                raise ValueError('path does not start with uuid or svn/git')
            path = uuid + '/' + path

        mutation = pyaccumulo.Mutation(md5)
        mutation.put(cf='file|project', cq=uuid)
        mutation.put(cf='file|path', cq=path)

        first_seen = md5 not in self.hashes
        if first_seen:
            # Content-level columns are written once per distinct md5.
            mutation.put(cf='file|size', cq='size', val=str(size))
            mutation.put(cf='file|content', cq='content', val=content)
            mutation.put(cf='twosix', cq='tags')  # false
            self.hashes.add(md5)
        # Columns left disabled by the original author:
        #mutation.put(cf='file|count', cq='count', value=TODO)
        #mutation.put(cf='twosix|tag', cq='recursion', val='0.443')
        self.wr.add_mutation(mutation)
Beispiel #2
0
def parse(filename, uuid):
    """Parse a ``size,md5,filename`` listing and return its columns.

    The file must begin with the header line ``%%%% size,md5,filename``
    followed by one ``size,md5,path`` record per line.  Blank lines are
    ignored and trailing whitespace is stripped from every record.

    All paths must share one prefix, which is derived from the last record:
    everything up to and including the first occurrence of ``uuid``, plus
    ``/tmp/`` if that follows directly, otherwise ``/``.  The prefix is
    removed from the returned paths.

    Line numbers in error messages are 1-based and count only the non-blank
    records after the header.

    Args:
        filename: path of the listing file to read.
        uuid: project uuid; validated with ``util.check_uuid``.

    Returns:
        A ``(sizes, md5s, paths)`` triple: a tuple of ints, a tuple of md5
        strings and a list of paths relative to the common prefix.

    Raises:
        ValueError: if the header is missing or not recognized, the file has
            no records, a record has fewer than three fields, a size is not
            an integer, ``uuid`` does not occur in the last path, the uuid is
            not followed by ``/``, a path lacks the common prefix, or a path
            is empty once the prefix is removed.  The ``util`` checks may
            raise as well.
    """
    util.check_uuid(uuid)
    with open(filename) as f:
        # next() with a default works on Python 2 and 3, and turns an empty
        # file into the header error below instead of a bare StopIteration.
        header = next(f, '')
        if header != '%%%% size,md5,filename\n':
            raise ValueError('header {} is not recognized'.format(header))
        stripped = (line.rstrip() for line in f)
        lines = [line for line in stripped if line]  # ignore empty lines
    if not lines:
        raise ValueError('no lines in file {}'.format(filename))

    tokens = []
    for number, line in enumerate(lines, 1):
        token = line.split(',', 2)  # any commas after the 2nd are in the path
        try:
            if len(token) < 3:
                raise ValueError('{} < 3 tokens in line'.format(len(token)))
            size, md5, path = token
            size = int(size)
            util.check_md5(md5)
        except ValueError as e:
            raise ValueError('line {}, value {}: {}'.format(number, line, e))
        tokens.append((size, md5, path))
    sizes, md5s, paths = zip(*tokens)

    # The common prefix is taken from the last record, stated explicitly
    # here rather than through a leaked loop variable.
    reference = paths[-1]
    ind = reference.find(uuid)
    if ind == -1:
        raise ValueError('cannot find uuid {}'.format(uuid))
    start = ind + len(uuid)
    remainder = reference[start:]
    if remainder.startswith('/tmp/'):
        suffix = '/tmp/'
    elif remainder.startswith('/'):
        suffix = '/'
    else:
        raise ValueError('path does not have "/" after uuid')
    prefix = reference[:start] + suffix

    length = len(prefix)
    for number, p in enumerate(paths, 1):
        if not p.startswith(prefix):
            raise ValueError('line {} does not start with {}'.format(
                number, prefix))

    paths = [p[length:] for p in paths]
    for number, p in enumerate(paths, 1):
        if not p:
            raise ValueError('line {} has an empty path'.format(number))

    return sizes, md5s, paths
Beispiel #3
0
    def __init__(self, uuid, max_size=MAX_SIZE, target_dir=TARGET_DIR,
                 empty_size=4096):
        """Validate the arguments and locate the project's code archive.

        Args:
            uuid: project uuid; validated with ``util.check_uuid``.
            max_size: stored as an int in ``self.max_size``.
            target_dir: existing directory; ``self.dir_path`` is
                ``target_dir/uuid``.
            empty_size: non-negative int passed on to ``check_tgz_size``.

        Raises:
            ValueError: if ``target_dir`` is not a directory, ``empty_size``
                is negative, or the ``<uuid>_code.tgz`` archive is not a
                file.  ``util.check_uuid`` and ``check_tgz_size`` may raise
                as well.
        """
        util.check_uuid(uuid)
        self.uuid = uuid
        self.max_size = int(max_size)

        if not os.path.isdir(target_dir):
            raise ValueError('{} is not a directory'.format(target_dir))
        self.target_dir = target_dir

        self.empty_size = int(empty_size)
        if self.empty_size < 0:
            raise ValueError('empty_size cannot be < 0')

        # The archive lives under the directory filemap reports for the uuid.
        source_dir = filemap.build_path(uuid)
        self.source_path = os.path.join(source_dir,
                                        '{}_code.tgz'.format(uuid))
        if not os.path.isfile(self.source_path):
            raise ValueError('{} is not a file'.format(self.source_path))

        check_tgz_size(self.source_path, empty_size=self.empty_size)
        self.dir_path = os.path.join(self.target_dir, uuid)
Beispiel #4
0
 def add_project(self, uuid):
     """Write an empty-column mutation for ``uuid`` and remember the uuid.

     The uuid is validated with ``util.check_uuid``, written to
     ``self.project_table`` through ``self.conn`` and added to
     ``self.projects``.
     """
     util.check_uuid(uuid)
     mutation = pyaccumulo.Mutation(uuid)
     # Empty family and qualifier, exactly as the original call.
     mutation.put(cf="", cq="")
     self.conn.write(self.project_table, mutation)
     self.projects.add(uuid)
Beispiel #5
0
# get all uuids in SAN
with open('data/uuids.json') as f:
    uuids = json.load(f)

# get all failed uuids in log
# Start from an empty set so `failed` is bound even when no log file exists.
failed = set()
if os.path.isfile(log_name):
    start = 'ERROR:__main__:uuid '
    index = len(start)
    with open(log_name) as f:
        for line in f:
            if not line.startswith(start):
                continue
            # The 36 characters after the marker are the candidate uuid.
            uuid = line[index:index + 36]
            try:
                util.check_uuid(uuid)
            except ValueError:
                # Not a valid uuid: ignore this log line.
                continue
            failed.add(uuid)

import log_errors

gatherer = accumulo.DataGatherer(**config)
start = time.time()
counter = 0
for i, uuid in enumerate(uuids):
    mid = time.time()
    if (np.log2(i) % 1) == 0.0:
        print '\n**** {} uuids in {} seconds ****\n'.format(i, mid - start)