Example #1
import os, sys, traceback
import yaml
import bench_base

def runCachingExperiment(name, data_dir):
  print("Running caching experiment for '" + name + "'.")
  memoryPerWorker = "21g"
  cores = 143
  timesToRun = 4
  tps = 1500000

  outFilePath = data_dir + "/caching/" + name + ".yaml"
  if os.path.isfile(outFilePath):
    with open(outFilePath, "r") as f: data = yaml.safe_load(f)
  else: data = {}

  for cache in [True,False]:
    if cache not in data: data[cache] = []
    if len(data[cache]) >= timesToRun:
      print("  Already profiled for cache = " + str(cache) + ", skipping.")
      continue
    while len(data[cache]) < timesToRun:
      try:
        bench_base.restartServers()
        bench_base.restartSparkContext(memoryPerWorker, cores)
        # Load the data into cache.
        if cache: bench_base.runQuery(name,"2014-01-01","2014-01-07",cache,tps)
        while len(data[cache]) < timesToRun:
          result = bench_base.runQuery(name,"2014-01-01","2014-01-07",cache,tps)
          data[cache].append(result[2]['TimeMillis'] - result[0]['TimeMillis'])
        with open(outFilePath, "w") as f:
          f.write(yaml.dump(data, indent=2, default_flow_style=False))
      except KeyboardInterrupt: sys.exit(-1)
      except Exception:
        print("Exception occurred, retrying.")
        traceback.print_exc()
        # Discard partial uncached timings so the measurement starts over after a failure.
        if not cache: data[cache] = []
  return data
Example #2
import os, sys, traceback
import yaml
import bench_base

def runScalingExperiment(queries, data_dir):
  print("Running scaling experiment.")
  memoryPerWorker = "21g"
  timesToRun = 4
  tps = 1500000
  allNodes = [6,5,4,3]
  cache = True

  outFilePath = data_dir + "/scaling/scaling.yaml"
  if os.path.isfile(outFilePath):
    with open(outFilePath, "r") as f: data = yaml.safe_load(f)
  else: data = {}

  for nodes in allNodes:
    cores = nodes*24-1
    if nodes not in data: data[nodes] = {'dataLoad': [], 'execution': {}}
    if len(data[nodes]['dataLoad']) < timesToRun or \
        len(data[nodes]['execution']) < len(queries):
      input("Please ensure {} Spark and HDFS nodes are running and press any key to continue.".format(nodes))

    while len(data[nodes]['dataLoad']) < timesToRun or \
        len(data[nodes]['execution']) < len(queries):
      try:
        bench_base.restartServers()
        bench_base.restartSparkContext(memoryPerWorker, cores)
        while len(data[nodes]['dataLoad']) < timesToRun:
          result = bench_base.getDataLoadTime()
          data[nodes]['dataLoad'].append(
            result[1]['TimeMillis'] - result[0]['TimeMillis']
          )
        with open(outFilePath, "w") as f:
          f.write(yaml.dump(data, indent=2, default_flow_style=False))
        for query in queries:
          exeData = data[nodes]['execution']
          if query not in exeData: exeData[query] = []
          if len(exeData[query]) >= timesToRun:
            print("  Already profiled for nodes = " + str(nodes) + ", skipping.")
            continue
          while len(exeData[query]) < timesToRun:
            # Load the data into cache.
            if cache:
              bench_base.runQuery(query,"2014-01-01","2014-01-07",cache,tps)
            while len(exeData[query]) < timesToRun:
              result = bench_base.runQuery(
                query,"2014-01-01","2014-01-07",cache,tps
              )
              exeData[query].append(
                result[2]['TimeMillis'] - result[0]['TimeMillis']
              )
            with open(outFilePath, "w") as f:
              f.write(yaml.dump(data, indent=2, default_flow_style=False))
      except KeyboardInterrupt: sys.exit(-1)
      except Exception:
        print("Exception occurred, retrying.")
        traceback.print_exc()
  return data
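All of these examples drive the same bench_base helper module, whose implementation
is not shown. The stub below is a sketch of the interface the experiments assume;
only the call signatures and the 'TimeMillis' event lists are taken from the
examples, while the bodies are hypothetical, fabricated so the loops can be
dry-run without a cluster.

# bench_base_stub.py -- hypothetical stand-in for the real bench_base module.
import random
import time

def restartServers():
  pass  # The real module restarts the benchmark's backing servers.

def restartSparkContext(memoryPerWorker, cores):
  pass  # The real module restarts Spark with the given executor settings.

def _events(n):
  # Fabricate n monotonically increasing {'TimeMillis': ...} events,
  # mimicking the result lists the experiments index into.
  base = int(time.time() * 1000)
  events = []
  for _ in range(n):
    events.append({'TimeMillis': base})
    base += random.randint(50, 500)
  return events

def runQuery(name, startDate, endDate, cache, targetPartitionSize=None):
  # The experiments read result[0] (start) and result[2] (end), and the
  # concurrent experiment omits the partition-size argument, hence the default.
  return _events(3)

def getDataLoadTime():
  # The scaling experiment reads result[0] and result[1].
  return _events(2)

With this stub importable as bench_base, each loop runs end to end and writes its
YAML checkpoint, which makes the skip/resume logic easy to exercise.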
Example #3
import os, sys, traceback
import yaml
import bench_base


def runPartitionExperiment(name, data_dir):
    print("Running partition experiment for '" + name + "'.")
    targetPartitionSizes = [
        10000, 50000, 100000, 200000, 300000, 400000, 500000, 750000, 1000000,
        1500000
    ]
    memoryPerWorker = "21g"
    cores = 143
    timesToRun = 4
    cache = True

    outFilePath = data_dir + "/partitions/" + name + ".yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for tps in targetPartitionSizes:
        if tps not in data: data[tps] = []
        if len(data[tps]) >= timesToRun:
            print("  Already profiled for " + str(tps) +
                  " targetPartitionSizes, skipping.")
            continue
        while len(data[tps]) < timesToRun:
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                # Load the data into cache.
                if cache:
                    bench_base.runQuery(name, "2014-01-01", "2014-01-07",
                                        cache, tps)
                while len(data[tps]) < timesToRun:
                    result = bench_base.runQuery(name, "2014-01-01",
                                                 "2014-01-07", cache, tps)
                    data[tps].append(result[2]['TimeMillis'] -
                                     result[0]['TimeMillis'])
                with open(outFilePath, "w") as f:
                    f.write(yaml.dump(data, indent=2,
                                      default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
    return data
Example #4
import os, sys, traceback
import yaml
import bench_base

def runPartitionExperiment(name, data_dir):
  print("Running partition experiment for '" + name + "'.")
  targetPartitionSizes = [10000, 50000, 100000, 200000,
      300000, 400000, 500000, 750000, 1000000, 1500000]
  memoryPerWorker = "21g"
  cores = 143
  timesToRun = 4
  cache = True

  outFilePath = data_dir + "/partitions/" + name + ".yaml"
  if os.path.isfile(outFilePath):
    with open(outFilePath, "r") as f: data = yaml.safe_load(f)
  else: data = {}

  for tps in targetPartitionSizes:
    if tps not in data: data[tps] = []
    if len(data[tps]) >= timesToRun:
      print("  Already profiled for " + str(tps) +
          " targetPartitionSizes, skipping.")
      continue
    while len(data[tps]) < timesToRun:
      try:
        bench_base.restartServers()
        bench_base.restartSparkContext(memoryPerWorker, cores)
        # Load the data into cache.
        if cache: bench_base.runQuery(name,"2014-01-01","2014-01-07",cache,tps)
        while len(data[tps]) < timesToRun:
          result = bench_base.runQuery(name,"2014-01-01","2014-01-07",cache,tps)
          data[tps].append(result[2]['TimeMillis'] - result[0]['TimeMillis'])
        with open(outFilePath, "w") as f:
          f.write(yaml.dump(data, indent=2, default_flow_style=False))
      except KeyboardInterrupt: sys.exit(-1)
      except Exception:
        print("Exception occurred, retrying.")
        traceback.print_exc()
  return data
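A detail all of these checkpoint files depend on: the data dicts are keyed by
ints (and by bools in the caching experiment), and PyYAML round-trips such keys,
so the resume checks (tps not in data, cache not in data) still work after a
reload. A quick sanity check:

import yaml

data = {1500000: [900, 910], True: [1200]}
dumped = yaml.dump(data, indent=2, default_flow_style=False)
# safe_load restores int and bool keys rather than strings, so membership
# tests keep working across restarts.
assert yaml.safe_load(dumped) == data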
Example #5
import os, sys, traceback
import yaml
import bench_base


def runScalingExperiment(queries, data_dir):
    print("Running scaling experiment.")
    memoryPerWorker = "21g"
    timesToRun = 4
    tps = 1500000
    allNodes = [6, 5, 4, 3]
    cache = True

    outFilePath = data_dir + "/scaling/scaling.yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for nodes in allNodes:
        cores = nodes * 24 - 1
        if nodes not in data: data[nodes] = {'dataLoad': [], 'execution': {}}
        if len(data[nodes]['dataLoad']) < timesToRun or \
            len(data[nodes]['execution']) < len(queries):
            input(
                "Please ensure {} Spark and HDFS nodes are running and press Enter to continue."
                .format(nodes))

        while len(data[nodes]['dataLoad']) < timesToRun or \
            len(data[nodes]['execution']) < len(queries):
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)
                while len(data[nodes]['dataLoad']) < timesToRun:
                    result = bench_base.getDataLoadTime()
                    data[nodes]['dataLoad'].append(result[1]['TimeMillis'] -
                                                   result[0]['TimeMillis'])
                with open(outFilePath, "w") as f:
                    f.write(yaml.dump(data, indent=2,
                                      default_flow_style=False))
                for query in queries:
                    exeData = data[nodes]['execution']
                    if query not in exeData: exeData[query] = []
                    if len(exeData[query]) >= timesToRun:
                        print("  Already profiled for nodes = " + str(nodes) +
                              ", skipping.")
                        continue
                    while len(exeData[query]) < timesToRun:
                        # Load the data into cache.
                        if cache:
                            bench_base.runQuery(query, "2014-01-01",
                                                "2014-01-07", cache, tps)
                        while len(exeData[query]) < timesToRun:
                            result = bench_base.runQuery(
                                query, "2014-01-01", "2014-01-07", cache, tps)
                            exeData[query].append(result[2]['TimeMillis'] -
                                                  result[0]['TimeMillis'])
                        with open(outFilePath, "w") as f:
                            f.write(
                                yaml.dump(data,
                                          indent=2,
                                          default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
    return data
Example #6
import os, sys, traceback
from threading import Lock, Thread
import yaml
import bench_base

timesToRun = 4  # Assumed module-level default, matching the other experiments.

def runConcurrentExperiment(name, data_dir):
  global timesToRun
  print("Running concurrent experiment for '" + name + "'.")
  allConcurrentQueries = list(range(1,9))
  cores = 143
  memoryPerWorker = "20g"
  tps = 1500000
  def isDataFinished(concurrentQueries,d):
    if not d or not isinstance(d,list) or len(d)<concurrentQueries:
      return False
    for thread in d:
      if len(thread) < timesToRun: return False
    return True
  def run(concurrentQueries):
    g_lock = Lock()
    def threadEntry(threadNum):
      def isFinished():
        with g_lock:
          for n in results:
            if len(n) < timesToRun: return False
          return True
      try:
        while not isFinished():
          print(str(threadNum) + ": Calling query.")
          result = bench_base.runQuery(name, "2014-01-01", "2014-01-07", True)
          queryExecutionTime = result[2]['TimeMillis']-result[0]['TimeMillis']
          print(str(threadNum) + ": Query execution time: " +
              str(queryExecutionTime))
          with g_lock:
            results[threadNum].append(queryExecutionTime)
      except Exception:
        print("Error occurred in thread.")
        traceback.print_exc()
    results = [[] for _ in range(concurrentQueries)]
    threads = [
      Thread(target=threadEntry, args=(i,)) for i in range(concurrentQueries)
    ]
    for t in threads: t.start()
    for t in threads: t.join()
    return results

  outFilePath = data_dir + "/concurrent/" + name + ".yaml"
  if os.path.isfile(outFilePath):
    with open(outFilePath, "r") as f: data = yaml.safe_load(f)
  else: data = {}

  for concurrentQueries in allConcurrentQueries:
    if concurrentQueries in data and \
        isDataFinished(concurrentQueries,data[concurrentQueries]):
      print("  Already profiled for " + str(concurrentQueries) +
          " concurrent queries, skipping.")
      continue
    else:
      data[concurrentQueries] = {}
    while not isDataFinished(concurrentQueries,data[concurrentQueries]):
      try:
        bench_base.restartServers()
        bench_base.restartSparkContext(memoryPerWorker, cores)

        # Warm the cache before timing the concurrent runs.
        bench_base.runQuery(name, "2014-01-01", "2014-01-07", True, tps)

        data[concurrentQueries] = run(concurrentQueries)

        with open(outFilePath, "w") as f:
          f.write(yaml.dump(data, indent=2, default_flow_style=False))
      except KeyboardInterrupt: sys.exit(-1)
      except Exception:
        print("Exception occurred, retrying.")
        traceback.print_exc()
        data[concurrentQueries] = {}
  return data
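The hand-rolled Thread/Lock fan-out in run() can also be expressed with
concurrent.futures. A sketch under the same assumed bench_base.runQuery
interface; the semantics differ slightly, since here each worker runs a fixed
count instead of polling a shared completion flag:

from concurrent.futures import ThreadPoolExecutor

def run(concurrentQueries):
  def worker(threadNum):
    # Each worker issues exactly timesToRun queries and returns its timings.
    times = []
    for _ in range(timesToRun):
      result = bench_base.runQuery(name, "2014-01-01", "2014-01-07", True)
      times.append(result[2]['TimeMillis'] - result[0]['TimeMillis'])
    return times
  with ThreadPoolExecutor(max_workers=concurrentQueries) as pool:
    return list(pool.map(worker, range(concurrentQueries)))

The original version keeps every thread querying until all threads have
timesToRun samples, which holds contention constant for the whole measurement;
the fixed-count version lets early finishers drop out, so later samples see
less load.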
Example #7
import os, sys, traceback
from threading import Lock, Thread
import yaml
import bench_base

timesToRun = 4  # Assumed module-level default, matching the other experiments.


def runConcurrentExperiment(name, data_dir):
    global timesToRun
    print("Running concurrent experiment for '" + name + "'.")
    allConcurrentQueries = list(range(1, 9))
    cores = 143
    memoryPerWorker = "20g"
    tps = 1500000

    def isDataFinished(concurrentQueries, d):
        if not d or not isinstance(d, list) or len(d) < concurrentQueries:
            return False
        for thread in d:
            if len(thread) < timesToRun: return False
        return True

    def run(concurrentQueries):
        g_lock = Lock()

        def threadEntry(threadNum):
            def isFinished():
                with g_lock:
                    for n in results:
                        if len(n) < timesToRun: return False
                    return True

            try:
                while not isFinished():
                    print(str(threadNum) + ": Calling query.")
                    result = bench_base.runQuery(name, "2014-01-01",
                                                 "2014-01-07", True)
                    queryExecutionTime = (result[2]['TimeMillis'] -
                                          result[0]['TimeMillis'])
                    print(
                        str(threadNum) + ": Query execution time: " +
                        str(queryExecutionTime))
                    with g_lock:
                        results[threadNum].append(queryExecutionTime)
            except Exception:
                print("Error occurred in thread.")
                traceback.print_exc()

        results = [[] for _ in range(concurrentQueries)]
        threads = [
            Thread(target=threadEntry, args=(i,))
            for i in range(concurrentQueries)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        return results

    outFilePath = data_dir + "/concurrent/" + name + ".yaml"
    if os.path.isfile(outFilePath):
        with open(outFilePath, "r") as f:
            data = yaml.safe_load(f)
    else:
        data = {}

    for concurrentQueries in allConcurrentQueries:
        if concurrentQueries in data and \
            isDataFinished(concurrentQueries,data[concurrentQueries]):
            print("  Already profiled for " + str(concurrentQueries) +
                  " concurrent queries, skipping.")
            continue
        else:
            data[concurrentQueries] = {}
        while not isDataFinished(concurrentQueries, data[concurrentQueries]):
            try:
                bench_base.restartServers()
                bench_base.restartSparkContext(memoryPerWorker, cores)

                # Warm the cache before timing the concurrent runs.
                bench_base.runQuery(name, "2014-01-01", "2014-01-07", True,
                                    tps)

                data[concurrentQueries] = run(concurrentQueries)

                with open(outFilePath, "w") as f:
                    f.write(yaml.dump(data, indent=2,
                                      default_flow_style=False))
            except KeyboardInterrupt:
                sys.exit(-1)
            except Exception:
                print("Exception occurred, retrying.")
                traceback.print_exc()
                data[concurrentQueries] = {}
    return data
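None of the examples show how the experiments are launched. A hypothetical
driver, assuming the (name, data_dir) and (queries, data_dir) signatures above
and made-up query names; note that the experiments open their output files
without creating directories, so the driver makes them first:

if __name__ == "__main__":
    import os
    import sys

    data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
    # The per-experiment subdirectories must exist before the loops write
    # their YAML checkpoints.
    for sub in ["caching", "scaling", "partitions", "concurrent"]:
        os.makedirs(os.path.join(data_dir, sub), exist_ok=True)

    queries = ["q1", "q2"]  # hypothetical query names
    for q in queries:
        runCachingExperiment(q, data_dir)
        runPartitionExperiment(q, data_dir)
        runConcurrentExperiment(q, data_dir)
    runScalingExperiment(queries, data_dir)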