Beispiel #1
0
def pipeline(factories, membership,
    split=None, sample=None, num_parts=None,
    dest_type="indirect", dest_dir=".", verbosity=1, seed=1):
  """Partition, split and/or sample the given datasets.

  Args:
    factories: list of dataset factories to operate on.
    membership: mapping of source name -> group label, forwarded to the
        partition/split/sample helpers.
    split: if set, fraction used to split one dataset into reference/test.
    sample: if set, sampling parameter forwarded to sample_dataset().
    num_parts: if set, number of partitions to derive from one dataset.
    dest_type: type of the derived datasets (default "indirect").
    dest_dir: directory where derived datasets are created.
    verbosity: 0 = silent, >=1 progress messages, >=2 per-dataset details.
    seed: seed for the PRNG so runs are reproducible by default.

  Returns:
    The list of factories after applying the requested operations; if no
    operation was requested, the input list is returned unchanged.

  Raises:
    ValueError: when both num_parts and split are given, or when more than
        one dataset is supplied for partitioning or splitting.
  """
  prng = Random(seed)

  if num_parts and split:
    # Both operations replace `factories`, so they cannot be combined.
    # NOTE: parenthesized raise form works on both Python 2 and 3 (the
    # original used the Python-2-only comma syntax).
    raise ValueError("Can't partition and split the dataset at the same time.")

  if num_parts:
    if len(factories) != 1:
      raise ValueError("Won't partition more than one dataset")

    dest_names = partition_dataset(
        factories[0], membership, num_parts,
        dest_type=dest_type, dest_dir=dest_dir, prng=prng)
    if verbosity >= 1:
      sys.stderr.write('Partitioned %s into %d datasets\n' % (
        factories[0].name, num_parts))
    factories = [create_factory(dest_name) for dest_name in dest_names]
    if verbosity >= 2:
      for factory in factories:
        num_elements = len(factory.get_sources())
        sys.stderr.write('%s: %d elements\n' % (factory.name, num_elements))

  if split:
    if len(factories) != 1:
      raise ValueError("Won't split more than one dataset")

    ref_name, test_name = split_dataset(
        factories[0], membership,
        fraction=split,
        dest_type=dest_type, dest_dir=dest_dir, prng=prng)
    if verbosity >= 1:
      sys.stderr.write('Split %s into %s (reference) and %s (test)\n' % (
          factories[0].name, ref_name, test_name))
    factories = [create_factory(ref_name), create_factory(test_name)]

  if sample:
    for i, factory in enumerate(factories):
      dest_name = sample_dataset(factory, membership, sample,
          dest_type=dest_type, dest_dir=dest_dir, prng=prng)
      factories[i] = create_factory(dest_name)
      if verbosity >= 1:
        sys.stderr.write('Sampled %s into %s\n' % (factory.name, dest_name))

  return factories
Beispiel #2
0
def parse_args(args):
    """Build the pipeline keyword arguments from parsed CLI options.

    Args:
      args: argparse-style namespace with `input` (list of one or two
          dataset names), `complete`, and the compressor options
          (`compressor`, `level`, `model_order`, `memory`, `restoration`,
          `matrix_format`).

    Returns:
      dict with 'factories', 'compressor' and 'matrix_format' keys.

    Raises:
      ValueError: unless exactly one or two input sources were given.
    """
    if len(args.input) == 1:
        factory = create_factory(args.input[0])
        # With --complete a single dataset is compared against itself.
        if args.complete:
            factories = [factory, factory]
        else:
            factories = [factory]
    elif len(args.input) == 2:
        factories = [
            create_factory(args.input[0]),
            create_factory(args.input[1])
        ]
    else:
        # Fix: the original message claimed "More than two sources provided"
        # even when zero sources were given; report the actual count.
        raise ValueError(
            'Expected one or two input sources, got %d' % len(args.input))

    compressor = c.get_compressor(args.compressor,
                                  level=args.level,
                                  model_order=args.model_order,
                                  memory=args.memory,
                                  restoration_method=args.restoration)

    return {
        'factories': factories,
        'compressor': compressor,
        'matrix_format': args.matrix_format,
    }
Beispiel #3
0
import ncd2 as ncd
import partition as p

from pprint import pformat
from clustering import pipeline
from compressor import get_compressor

if __name__ == '__main__':
  dataset = '../../dataset/binaries'
  dataset_ref = '../../dataset/binaries-membership.csv'
  block_sizes = (
      range(0, 128, 16) +
      range(128, 1024, 128) +
      range(1024, 32768, 1024)) + 
      range(32768-128, 32768+7*128, 128)
  factory = ds.create_factory(dataset)

  # Calculate the distance matrix for each block size
  matrix_dir = 'interleave/matrix'
  if not os.path.isdir(matrix_dir):
    os.makedirs(matrix_dir)

  for block_size in block_sizes:
    fname = os.path.join(matrix_dir, 'block_%d.csv' % block_size)
    if os.path.isfile(fname):
      continue

    print 'Calculating distance matrix for block_size = %d' % block_size
    ncd_results = ncd.distance_matrix([factory, factory],
        get_compressor('zlib'),
        interleave_block_size=block_size)
Beispiel #4
0
def full_distance_matrix(dataset, compressor):
    """Compute the full NCD distance matrix of *dataset* against itself.

    Returns a (results, elapsed_seconds) tuple; elapsed time is measured
    with os.times() (element 4, the elapsed real time).
    """
    t0 = os.times()[4]
    source_factory = ds.create_factory(dataset)
    matrix = ncd.distance_matrix([source_factory, source_factory], compressor)
    t1 = os.times()[4]
    return matrix, t1 - t0
Beispiel #5
0
      'dest_type': args.output_mode,
      'dest_dir': args.dest_dir or "",
      }

if __name__ == '__main__':
  parser = argparse.ArgumentParser(parents=[cli_parser()])
  parser.add_argument('input', nargs='+', help='Datasets')
  parser.add_argument('--membership', help='Membership file')
  # Fix: action='count' defaults to None; default=0 keeps the
  # `verbosity >= 1` comparisons in pipeline() well-defined when -v is
  # never passed (None >= 1 is a TypeError on Python 3).
  parser.add_argument('-v', '--verbosity', action='count', default=0)

  a = parser.parse_args()
  if len(a.input) > 2:
    # Parenthesized raise form works on both Python 2 and 3 (the original
    # used the Python-2-only comma syntax).
    raise ValueError('More than two datasets provided')

  if not (a.split or a.sample or a.num_parts):
    warnings.warn("Nothing to do...")
    exit()

  factories = [create_factory(fname) for fname in a.input]
  if a.membership:
    membership = p.membership_parse(a.membership)
  elif a.no_membership:  # presumably declared by cli_parser() -- TODO confirm
    # No grouping information: place every source in a single group.
    membership = {}
    for factory in factories:
      for source in factory.get_sources():
        membership[source.name] = 'all'
  else:
    raise ValueError('Membership was not provided (you may want to pass --no-membership to make it explicit)')

  pipeline(factories, membership, verbosity=a.verbosity, **parse_args(a))
Beispiel #6
0
                              range(20),
                              is_upper=True,
                              start=73,
                              stop=156),
                    ncd.pairs(range(20), range(20), is_upper=True,
                              start=156))))


compressor = c.get_compressor('zlib')


class Dialect(csv.excel):
    """CSV dialect identical to Excel's, except fields are colon-separated."""

    delimiter = ':'


# Test fixture: planets file parsed with the ':'-separated Dialect above.
factory = ds.create_factory('testdata/planets_with_name.txt',
                            csv_dialect=Dialect)
sources = factory.get_sources()

# Expected values C(x) per planet name -- presumably the zlib-compressed
# sizes of the individual sources; verify against the test data.
zs = {
    'MERCURY': 72,
    'VENUS': 76,
    'EARTH': 69,
    'MARS': 75,
}

zxys = {
    ('MERCURY', 'MERCURY'): 76,
    ('MERCURY', 'VENUS'): 121,
    ('MERCURY', 'EARTH'): 114,
    ('MERCURY', 'MARS'): 117,
    ('VENUS', 'MERCURY'): 121,