def compile(self):
    """Write the collected articles out as one or more .aar volume files.

    Prints the final progress line, closes the failed/empty/skipped
    article logs, records the article count in the metadata, and obtains
    the articles from the article store ordered by collation key. Every
    volume yielded by ``self.make_volumes`` is written with
    ``self.make_aar`` and its file name is appended to
    ``self.file_names``. Afterwards the article store is closed, the
    volume count and sha1 sum are written, and the files are renamed.
    """
    print_progress(self.stats)
    writeln()

    # Closed in the same order as before: failed, empty, skipped.
    for article_log in (self.failed_articles,
                        self.empty_articles,
                        self.skipped_articles):
        article_log.close()

    writeln('Compiling .aar files')
    self.add_metadata("article_count", self.stats.articles)

    def sort_key(title):
        # Order by the byte form of the collation key.
        return collation_key(title).getByteArray()

    sorted_articles = self.article_store.sorted(key=sort_key)
    log.info('Compiling %s', self.output_file_name)

    # The compressed metadata follows the header, so its length is part of
    # the size handed to create_volume for every new volume.
    packed_metadata = compress(tojson(self.metadata).encode('utf8'))
    header_meta_len = spec_len(HEADER_SPEC) + len(packed_metadata)
    new_volume = functools.partial(self.create_volume, header_meta_len)

    def announce(template, number):
        # Send the same message to the log and to the console writer.
        message = template % number
        log.info(message)
        writeln(message).flush()

    for volume in self.make_volumes(new_volume, sorted_articles):
        announce("Creating volume %d", volume.number)
        self.file_names.append(self.make_aar(volume))
        announce("Wrote volume %d", volume.number)

    self.article_store.close()
    self.write_volume_count()
    self.write_sha1sum()
    rename_files(self.file_names)
def compile(self):
    """Write the collected articles out as one or more .aar volume files.

    Prints the final progress line, closes the failed/empty/skipped
    article logs, records the article count in the metadata, and obtains
    the articles from the article store ordered by collation key. Every
    volume yielded by ``self.make_volumes`` is written with
    ``self.make_aar`` and its file name is appended to
    ``self.file_names``. Afterwards the article store is closed, the
    volume count and sha1 sum are written, and the files are renamed.
    """
    print_progress(self.stats)
    writeln()

    # Closed in the same order as before: failed, empty, skipped.
    for article_log in (self.failed_articles,
                        self.empty_articles,
                        self.skipped_articles):
        article_log.close()

    writeln('Compiling .aar files')
    self.add_metadata("article_count", self.stats.articles)

    def sort_key(title):
        # Order by the byte form of the collation key.
        return collation_key(title).getByteArray()

    sorted_articles = self.article_store.sorted(key=sort_key)
    log.info('Compiling %s', self.output_file_name)

    # The compressed metadata follows the header, so its length is part of
    # the size handed to create_volume for every new volume.
    packed_metadata = compress(tojson(self.metadata).encode('utf8'))
    header_meta_len = spec_len(HEADER_SPEC) + len(packed_metadata)
    new_volume = functools.partial(self.create_volume, header_meta_len)

    def announce(template, number):
        # Send the same message to the log and to the console writer.
        message = template % number
        log.info(message)
        writeln(message).flush()

    for volume in self.make_volumes(new_volume, sorted_articles):
        announce("Creating volume %d", volume.number)
        self.file_names.append(self.make_aar(volume))
        announce("Wrote volume %d", volume.number)

    self.article_store.close()
    self.write_volume_count()
    self.write_sha1sum()
    rename_files(self.file_names)
def _sort(self):
    """Write a copy of index1 with its items ordered by collation key.

    Each fixed-size index1 item (``INDEX1_ITEM_FORMAT``) starts with an
    offset into index2, where the key is stored as a length prefix
    (``KEY_LENGTH_FORMAT``) followed by that many bytes. Items are copied,
    still packed, into a new temporary file in ``self.work_dir``, ordered
    by ``collation_key(key).getByteArray()``. The new file is kept on disk
    (``delete=False``) and exposed as ``self.index1_sorted``; the unsorted
    index1 file is removed once sorting has succeeded.

    Both memory maps and the output file are closed even if sorting
    fails; in that case the unsorted index1 file is left in place.
    """
    index1_sorted = tempfile.NamedTemporaryFile(prefix='index1_sorted',
                                                dir=self.work_dir,
                                                delete=False)
    self.index1_sorted = index1_sorted
    index1_unit_len = struct.calcsize(INDEX1_ITEM_FORMAT)
    klen_structsize = struct.calcsize(KEY_LENGTH_FORMAT)

    mappings = []
    try:
        # Binary mode: the index files hold packed structs, and only the
        # file descriptors are used (for mmap).
        with open(self.index1.name, 'rb') as fi1, \
                open(self.index2.name, 'rb') as fi2:
            index1 = mmap.mmap(fi1.fileno(), 0, prot=mmap.PROT_READ)
            mappings.append(index1)
            index2 = mmap.mmap(fi2.fileno(), 0, prot=mmap.PROT_READ)
            mappings.append(index2)

            # Floor division keeps the count an integer regardless of the
            # interpreter's division semantics.
            index_item_count = len(index1) // index1_unit_len

            def read_packed_index1_item(i):
                # Raw bytes of the i-th fixed-size index1 item.
                pos_start = i * index1_unit_len
                return index1[pos_start:pos_start + index1_unit_len]

            def read_key(pos):
                # Key at ``pos`` in index2: length prefix, then the bytes.
                start = pos + klen_structsize
                strlen = struct.unpack(KEY_LENGTH_FORMAT,
                                       index2[pos:start])[0]
                return index2[start:start + strlen]

            def realkey(i):
                # Sort key for item i: collation bytes of the key that its
                # first field points to in index2.
                index_item = struct.unpack(INDEX1_ITEM_FORMAT,
                                           read_packed_index1_item(i))
                title = read_key(index_item[0])
                return collation_key(title).getByteArray()

            # Sort item positions rather than the items themselves, then
            # copy the packed items across in sorted order.
            for i in sorted(xrange(index_item_count), key=realkey):
                index1_sorted.write(read_packed_index1_item(i))
    finally:
        for mapping in mappings:
            mapping.close()
        index1_sorted.close()

    log.info("Index sorted, removing temp file %s", self.index1.name)
    os.remove(self.index1.name)
def _sort(self):
    """Write a copy of index1 with its items ordered by collation key.

    Each fixed-size index1 item (``INDEX1_ITEM_FORMAT``) starts with an
    offset into index2, where the key is stored as a length prefix
    (``KEY_LENGTH_FORMAT``) followed by that many bytes. Items are copied,
    still packed, into a new temporary file in ``self.work_dir``, ordered
    by ``collation_key(key).getByteArray()``. The new file is kept on disk
    (``delete=False``) and exposed as ``self.index1_sorted``; the unsorted
    index1 file is removed once sorting has succeeded.

    Both memory maps and the output file are closed even if sorting
    fails; in that case the unsorted index1 file is left in place.
    """
    index1_sorted = tempfile.NamedTemporaryFile(prefix='index1_sorted',
                                                dir=self.work_dir,
                                                delete=False)
    self.index1_sorted = index1_sorted
    index1_unit_len = struct.calcsize(INDEX1_ITEM_FORMAT)
    klen_structsize = struct.calcsize(KEY_LENGTH_FORMAT)

    mappings = []
    try:
        # Binary mode: the index files hold packed structs, and only the
        # file descriptors are used (for mmap).
        with open(self.index1.name, 'rb') as fi1, \
                open(self.index2.name, 'rb') as fi2:
            index1 = mmap.mmap(fi1.fileno(), 0, prot=mmap.PROT_READ)
            mappings.append(index1)
            index2 = mmap.mmap(fi2.fileno(), 0, prot=mmap.PROT_READ)
            mappings.append(index2)

            # Floor division keeps the count an integer regardless of the
            # interpreter's division semantics.
            index_item_count = len(index1) // index1_unit_len

            def read_packed_index1_item(i):
                # Raw bytes of the i-th fixed-size index1 item.
                pos_start = i * index1_unit_len
                return index1[pos_start:pos_start + index1_unit_len]

            def read_key(pos):
                # Key at ``pos`` in index2: length prefix, then the bytes.
                start = pos + klen_structsize
                strlen = struct.unpack(KEY_LENGTH_FORMAT,
                                       index2[pos:start])[0]
                return index2[start:start + strlen]

            def realkey(i):
                # Sort key for item i: collation bytes of the key that its
                # first field points to in index2.
                index_item = struct.unpack(INDEX1_ITEM_FORMAT,
                                           read_packed_index1_item(i))
                title = read_key(index_item[0])
                return collation_key(title).getByteArray()

            # Sort item positions rather than the items themselves, then
            # copy the packed items across in sorted order.
            for i in sorted(xrange(index_item_count), key=realkey):
                index1_sorted.write(read_packed_index1_item(i))
    finally:
        for mapping in mappings:
            mapping.close()
        index1_sorted.close()

    log.info("Index sorted, removing temp file %s", self.index1.name)
    os.remove(self.index1.name)