Example 1
def build_master_index(cache=False):
    master_index = dict()
    size = IndexEntry.size()
    for node in localnode.nodes():
        # print("Processing %s_index" % node)
        node_index_file = os.path.join(GPFS_STORAGE, "%s_index" % node)
        # The index files hold fixed-size packed records, so read in binary mode.
        with open(node_index_file, 'rb') as f:
            while True:
                chunk = f.read(size)
                if not chunk:
                    break
                index_entry = IndexEntry.unpack(chunk)
                strid = str(index_entry.id)
                index_content = (index_entry.index,
                                 index_entry.offset,
                                 index_entry.chunk_size)
                # Group every (file index, offset, chunk size) triple under its id.
                master_index.setdefault(strid, []).append(index_content)

    if cache:
        print("putting it to memcache")
        for key, value in master_index.items():
            memcached.set(key, value)
        print("done")
    return master_index
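
Both examples depend on an IndexEntry record type that is not shown. Below is a minimal sketch of what it might look like, assuming a fixed-width struct layout of a 32-character hex digest plus three unsigned 64-bit integers; the field widths and byte order are guesses for illustration, not the original definition.

import struct

class IndexEntry:
    # Assumed layout: 32-byte ASCII hex id, then index, offset, chunk_size
    # as little-endian unsigned 64-bit integers (56 bytes total).
    _struct = struct.Struct('<32s3Q')

    def __init__(self, id, index, offset, chunk_size):
        self.id = id
        self.index = index
        self.offset = offset
        self.chunk_size = chunk_size

    @classmethod
    def size(cls):
        return cls._struct.size

    def pack(self):
        return self._struct.pack(self.id.encode('ascii'),
                                 self.index, self.offset, self.chunk_size)

    @classmethod
    def unpack(cls, data):
        raw_id, index, offset, chunk_size = cls._struct.unpack(data)
        return cls(raw_id.decode('ascii'), index, offset, chunk_size)

Because every record has the same packed size, build_master_index can walk each node's index file with fixed-size reads and no delimiters.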
Example 2
def index_processed_file(index, writer):
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    # Open in binary mode so f.tell() yields real byte offsets; the offsets
    # and chunk sizes recorded in the index are byte counts.
    with open(datafile, 'rb') as f:
        pos = 0
        line = f.readline().decode('utf-8')
        while True:
            if is_parent_line(line):
                word, skip_lines, _ = parse_parent_line(line)
                starting_pos = pos
                md5 = hashlib.md5()
                md5.update(word.encode('utf-8'))  # hashlib requires bytes
                word_hash = md5.hexdigest()
                # Consume the child lines that belong to this parent word.
                for _ in range(skip_lines):
                    f.readline()

                chunk_size = f.tell() - starting_pos
                index_entry = IndexEntry(word_hash, index, starting_pos, chunk_size)
                writer.write(index_entry.pack())
                pos = f.tell()
            elif line == '':
                break  # the data file ends with an empty line, we are done here
            else:
                raise ValueError('Improper data file %s' % datafile)

            line = f.readline().decode('utf-8')
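
The helpers is_parent_line and parse_parent_line are also external to this snippet. Here is a minimal sketch under the assumption that parent lines are unindented and tab-separated as word<TAB>child-line-count<TAB>frequency; the actual file format is not shown in the source.

def is_parent_line(line):
    # Assumption: child lines are indented, parent lines are not, and the
    # trailing empty line must not count as a parent.
    return bool(line.strip()) and not line[0].isspace()

def parse_parent_line(line):
    # Assumption: "word<TAB>child_count<TAB>frequency"; only the first two
    # fields are used by index_processed_file above.
    word, child_count, frequency = line.rstrip('\n').split('\t')
    return word, int(child_count), int(frequency)

With helpers like these, each IndexEntry written by index_processed_file covers one parent line plus all of its child lines, which is exactly the byte span that build_master_index later exposes through (index, offset, chunk_size).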