def main():
    """Concatenate a number of vector files into a single output file.

    Command-line entry point: parses arguments, optionally concatenates the
    input vector files into ``outpath``, and optionally builds a prefix
    lookup cache on the result.

    Raises:
        ValueError: if there are no input filepaths while concatenation is
            requested, or if both concatenation and cache building are
            disabled (nothing to do).
    """
    parser = argparse.ArgumentParser(
        description='Concatenate a number of vector files into single file')
    parser.add_argument('outpath', type=str,
                        help='Place to save the new Vector file.')
    parser.add_argument(
        '--mode', type=str, default='a',
        help='Write mode for output. By default, appends if the file exists, '
             "can be switched to 'w' to overwrite.")
    parser.add_argument('--build-cache', action='store_true',
                        help='Build a prefix cache after concatenation.')
    parser.add_argument(
        '--no-concat', action='store_true',
        help="Skip the concatenation, if you're hoping to *just* build cache.")
    parser.add_argument('filepaths', type=str, nargs='*',
                        help='List of vector files being combined.')
    args = parser.parse_args()

    # Guard against invocations that would silently do nothing.
    if not args.filepaths and not args.no_concat:
        raise ValueError("Nothing to do without input filepaths")
    if args.no_concat and not args.build_cache:
        raise ValueError(
            "If you're not concatenating and not building a cache, you're not doing anything."
        )

    if not args.no_concat:
        # Read the dimensionality from the first input so the output file is
        # created with a matching vector width.
        with SRP.Vector_file(args.filepaths[0], mode="r") as vecf:
            dims = vecf.dims
        with SRP.Vector_file(args.outpath, mode=args.mode, dims=dims) as outf:
            for efpath in args.filepaths:
                print("Concatenating:", efpath)
                outf.concatenate_file(efpath)

    if args.build_cache:
        # NOTE(review): relies on the private _build_prefix_lookup API of
        # Vector_file — confirm it is intended for external callers.
        with SRP.Vector_file(args.outpath, offset_cache=True) as outf:
            print("Building prefix lookup cache")
            outf._build_prefix_lookup(sep='-', dump_every=2000000)
def _initialize_embeddings(self, chunk_file):
    """
    Read in an embedding file at unit length, and adjust the metadata to match.

    Populates, from the vectors in ``chunk_file``:
      - ``self.matrix``: unit-length embedding matrix.
      - ``self.ids``: the vector names ("mtids"), assumed formatted either as
        "htid-section-start-end" or (deprecated) "htid-section".
      - ``self.chunk_metadata``: per-chunk frame joined to ``self.metadata``.
      - ``self.mtid_lookup``: mtid -> row index into the matrix.
      - ``self.htid_lookup``: htid -> list of its mtids.
    """
    # Use the context manager so the vector file handle is closed promptly
    # (the original left it open); also avoids shadowing the builtin `input`.
    with SRP.Vector_file(chunk_file) as vec_file:
        dataset = vec_file.to_matrix(unit_length=True)
    self.matrix = dataset['matrix']
    ids = dataset['names']
    self.ids = ids

    name_parts = [name.split("-") for name in ids]
    try:
        htids, sections, starts, ends = zip(*name_parts)
    except ValueError:
        # The old format: deprecated. Names carry only "htid-section", so the
        # four-way unpack above fails with ValueError.
        htids, sections = zip(*name_parts)

    chunk_frame = pd.DataFrame({
        'mtid': ids,
        'htid': htids,
        'section': list(map(int, sections))
    }).set_index('htid')
    self.chunk_metadata = chunk_frame.join(self.metadata,
                                           how='left').reset_index()

    self.mtid_lookup = dict(zip(ids, range(len(ids))))
    self.htid_lookup = defaultdict(list)
    for i, htid in enumerate(htids):
        self.htid_lookup[htid].append(ids[i])
def test_error_on_load(self):
    """Adding a row with an invalid id (contains a space) raises TypeError.

    Uses the Vector_file context manager so the file is closed even when an
    unexpected exception escapes (the original explicit close() could leak),
    and avoids shadowing the builtin `dir`.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        with SRP.Vector_file(Path(tmpdir, "test.bin"), dims=3,
                             mode="w") as testfile:
            with self.assertRaises(TypeError):
                testfile.add_row("this is a space", self.array1)