def pipeline(factories, membership, split=None, sample=None, num_parts=None,
             dest_type="indirect", dest_dir=".", verbosity=1, seed=1):
    """Optionally partition, split and/or sample the given datasets.

    Parameters:
        factories: list of dataset factories to process.
        membership: mapping used by the partition/split/sample helpers
            (presumably source name -> group label; confirm against
            partition module).
        split: fraction used to split a single dataset into a reference
            and a test dataset (mutually exclusive with num_parts).
        sample: sampling parameter forwarded to sample_dataset; applied
            to every factory when given.
        num_parts: number of partitions to derive from a single dataset
            (mutually exclusive with split).
        dest_type, dest_dir: how/where derived datasets are written.
        verbosity: 0 = silent, >=1 = progress messages on stderr,
            >=2 = also per-partition element counts.
        seed: seed for the private random generator (reproducible runs).

    Returns:
        List of factories for the derived datasets, or the input
        factories unchanged when no operation was requested.

    Raises:
        ValueError: if both num_parts and split are given, or if more
            than one dataset is supplied for partitioning or splitting.
    """
    prng = Random(seed)
    if num_parts and split:
        # Py2-only "raise Exc, msg" replaced with the call form, which is
        # valid in both Python 2 and 3.
        raise ValueError("Can't partition and split the dataset at the same time.")
    if num_parts:
        if len(factories) != 1:
            raise ValueError("Won't partition more than one dataset")
        dest_names = partition_dataset(
            factories[0], membership, num_parts,
            dest_type=dest_type, dest_dir=dest_dir, prng=prng)
        if verbosity >= 1:
            sys.stderr.write('Partitioned %s into %d datasets\n' % (
                factories[0].name, num_parts))
        factories = [create_factory(dest_name) for dest_name in dest_names]
        if verbosity >= 2:
            for factory in factories:
                num_elements = len(factory.get_sources())
                sys.stderr.write('%s: %d elements\n' % (factory.name, num_elements))
    if split:
        if len(factories) != 1:
            raise ValueError("Won't split more than one dataset")
        ref_name, test_name = split_dataset(
            factories[0], membership, fraction=split,
            dest_type=dest_type, dest_dir=dest_dir, prng=prng)
        if verbosity >= 1:
            sys.stderr.write('Split %s into %s (reference) and %s (test)\n' % (
                factories[0].name, ref_name, test_name))
        factories = [create_factory(ref_name), create_factory(test_name)]
    if sample:
        for i, factory in enumerate(factories):
            dest_name = sample_dataset(factory, membership, sample,
                                       dest_type=dest_type, dest_dir=dest_dir,
                                       prng=prng)
            factories[i] = create_factory(dest_name)
            if verbosity >= 1:
                # `factory` still names the pre-sampling dataset here, so the
                # message reads "old -> new".
                sys.stderr.write('Sampled %s into %s\n' % (factory.name, dest_name))
    return factories
def parse_args(args):
    """Translate parsed command-line options into pipeline keyword args.

    One input dataset yields a single factory (duplicated when
    args.complete is set); two inputs yield one factory each.

    Returns a dict with 'factories', 'compressor' and 'matrix_format'
    keys, suitable for **kwargs expansion.

    Raises ValueError when the number of inputs is not 1 or 2.
    """
    n_inputs = len(args.input)
    if n_inputs not in (1, 2):
        raise ValueError('More than two sources provided')

    if n_inputs == 2:
        factories = [create_factory(name) for name in args.input]
    else:
        only = create_factory(args.input[0])
        # --complete compares the dataset against itself.
        factories = [only, only] if args.complete else [only]

    compressor = c.get_compressor(args.compressor,
                                  level=args.level,
                                  model_order=args.model_order,
                                  memory=args.memory,
                                  restoration_method=args.restoration)
    return {
        'factories': factories,
        'compressor': compressor,
        'matrix_format': args.matrix_format,
    }
import os

import ncd2 as ncd
import partition as p
from pprint import pformat
from clustering import pipeline
from compressor import get_compressor

if __name__ == '__main__':
    dataset = '../../dataset/binaries'
    dataset_ref = '../../dataset/binaries-membership.csv'
    # Block sizes: fine steps near zero, coarser steps up to 32 KiB, plus a
    # dense band straddling the 32 KiB boundary.  Python 2 `range` returns
    # lists, so `+` concatenates.
    block_sizes = (
        range(0, 128, 16) +
        range(128, 1024, 128) +
        range(1024, 32768, 1024)) + range(32768 - 128, 32768 + 7 * 128, 128)
    # NOTE(review): `ds` is not imported in this chunk -- presumably the
    # dataset module; confirm which import provides it.
    factory = ds.create_factory(dataset)

    # Calculate the distance matrix for each block size.
    matrix_dir = 'interleave/matrix'
    if not os.path.isdir(matrix_dir):
        os.makedirs(matrix_dir)
    for block_size in block_sizes:
        fname = os.path.join(matrix_dir, 'block_%d.csv' % block_size)
        if os.path.isfile(fname):
            # Resume support: skip block sizes already computed.
            continue
        # Parenthesized single-argument form works in both Python 2 and 3
        # (was a py2-only print statement).
        print('Calculating distance matrix for block_size = %d' % block_size)
        ncd_results = ncd.distance_matrix([factory, factory],
                                          get_compressor('zlib'),
                                          interleave_block_size=block_size)
def full_distance_matrix(dataset, compressor):
    """Compute the self-vs-self NCD distance matrix for *dataset*.

    Returns a (results, elapsed) pair, where elapsed is the wall-clock
    time in seconds as reported by the fifth field of os.times().
    """
    _, _, _, _, t_start = os.times()
    source_factory = ds.create_factory(dataset)
    results = ncd.distance_matrix([source_factory, source_factory], compressor)
    _, _, _, _, t_end = os.times()
    return results, t_end - t_start
'dest_type': args.output_mode, 'dest_dir': args.dest_dir or "", } if __name__ == '__main__': parser = argparse.ArgumentParser(parents=[cli_parser()]) parser.add_argument('input', nargs='+', help='Datasets') parser.add_argument('--membership', help='Membership file') parser.add_argument('-v', '--verbosity', action='count') a = parser.parse_args() if len(a.input) > 2: raise ValueError, 'More than two datasets provided' if not (a.split or a.sample or a.num_parts): warnings.warn("Nothing to do...") exit() factories = [create_factory(fname) for fname in a.input] if a.membership: membership = p.membership_parse(a.membership) elif a.no_membership: membership = {} for factory in factories: for source in factory.get_sources(): membership[source.name] = 'all' else: raise ValueError, 'Membership was not provided (you may want to pass --no-membership to make it explicit)' pipeline(factories, membership, verbosity=a.verbosity, **parse_args(a))
range(20), is_upper=True, start=73, stop=156), ncd.pairs(range(20), range(20), is_upper=True, start=156)))) compressor = c.get_compressor('zlib') class Dialect(csv.excel): delimiter = ':' factory = ds.create_factory('testdata/planets_with_name.txt', csv_dialect=Dialect) sources = factory.get_sources() zs = { 'MERCURY': 72, 'VENUS': 76, 'EARTH': 69, 'MARS': 75, } zxys = { ('MERCURY', 'MERCURY'): 76, ('MERCURY', 'VENUS'): 121, ('MERCURY', 'EARTH'): 114, ('MERCURY', 'MARS'): 117, ('VENUS', 'MERCURY'): 121,