import hashlib
import os

# GPFS_STORAGE, localnode, memcached, and IndexEntry are assumed to be
# defined elsewhere in this module.


def build_master_index(cache=False):
    master_index = dict()
    size = IndexEntry.size()
    for node in localnode.nodes():
        # print("Processing %s_index" % node)
        node_index_file = os.path.join(GPFS_STORAGE, "%s_index" % node)
        # The index file holds fixed-size packed IndexEntry records, so it
        # has to be read in binary mode.
        with open(node_index_file, 'rb') as f:
            while True:
                chunk = f.read(size)
                if chunk == b'':
                    break  # end of this node's index file
                index_entry = IndexEntry.unpack(chunk)
                strid = str(index_entry.id)
                if strid not in master_index:
                    master_index[strid] = []
                index_content = (index_entry.index, index_entry.offset,
                                 index_entry.chunk_size)
                master_index[strid].append(index_content)
    if cache:
        print("putting it to memcache")
        for key in master_index:
            memcached.set(key, master_index[key])
        print("done")
    return master_index
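# IndexEntry itself is not shown here. The sketch below is a minimal,
# assumed implementation: a fixed-size record holding a 32-character md5
# hexdigest plus the file index, byte offset, and chunk size. The struct
# format string ('=32sIQQ') is an assumption about the layout, not the
# original definition.
import struct


class IndexEntry:
    # 32-byte ascii hexdigest, 32-bit file index, 64-bit offset and size
    FORMAT = '=32sIQQ'

    def __init__(self, id, index, offset, chunk_size):
        self.id = id
        self.index = index
        self.offset = offset
        self.chunk_size = chunk_size

    @classmethod
    def size(cls):
        # Fixed record size in bytes; build_master_index() reads the index
        # file in chunks of exactly this size.
        return struct.calcsize(cls.FORMAT)

    def pack(self):
        return struct.pack(self.FORMAT, self.id.encode('ascii'),
                           self.index, self.offset, self.chunk_size)

    @classmethod
    def unpack(cls, chunk):
        raw_id, index, offset, chunk_size = struct.unpack(cls.FORMAT, chunk)
        return cls(raw_id.decode('ascii'), index, offset, chunk_size)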
def index_processed_file(index, writer):
    datafile = os.path.join(GPFS_STORAGE, "gram2_%s.processed" % str(index))
    # Binary mode so that tell() returns real byte offsets: the offsets and
    # chunk sizes recorded here are reused later for byte-range reads.
    # is_parent_line() and parse_parent_line() are assumed to take str,
    # hence the decode.
    with open(datafile, 'rb') as f:
        pos = 0
        line = f.readline()
        while True:
            if is_parent_line(line.decode('utf-8')):
                word, skip_lines, _ = parse_parent_line(line.decode('utf-8'))
                starting_pos = pos
                md5 = hashlib.md5()
                md5.update(word.encode('utf-8'))  # hashlib needs bytes in Python 3
                word_hash = md5.hexdigest()
                # skip the child lines that belong to this parent word
                for i in range(0, skip_lines):
                    f.readline()
                chunk_size = f.tell() - starting_pos
                index_entry = IndexEntry(word_hash, index, starting_pos, chunk_size)
                writer.write(index_entry.pack())
                pos = f.tell()
            elif line == b'':
                break  # last line is empty in the data file, we are done here
            else:
                raise ValueError('Improper data file %s' % datafile)
            line = f.readline()
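# For context, a hypothetical query-time counterpart (lookup_ngram is not
# part of the original code): hash the word the same way
# index_processed_file() does, pull the (index, offset, chunk_size) tuples
# from the master index, and read each referenced chunk straight out of its
# processed data file.
def lookup_ngram(word, master_index):
    word_hash = hashlib.md5(word.encode('utf-8')).hexdigest()
    chunks = []
    for file_index, offset, chunk_size in master_index.get(word_hash, []):
        datafile = os.path.join(GPFS_STORAGE,
                                "gram2_%s.processed" % str(file_index))
        with open(datafile, 'rb') as f:
            f.seek(offset)  # jump straight to the block written for this word
            chunks.append(f.read(chunk_size))
    return chunks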