import csv
import random

import matplotlib.pyplot as plt
import numpy as np
from tdigest import TDigest
from tqdm import tqdm

# generate_lognormal_dist, use_case_to_cmd, getQueryWords, generate_ft_search_row
# and the use_ftadd flag are defined elsewhere in this script.


def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname,
                                indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)

    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()

    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        # map the lognormal sample onto the timestamp digest: the sample sets
        # the percentile at which the query window starts, so the shape of the
        # distribution controls how far back in time reads look
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)
        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]
        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"],
                                                      doc["text"], doc["comment"],
                                                      doc["username"], doc["timestamp"],
                                                      generated_commands)
        else:
            ## READ
            words, totalW = getQueryWords(doc, stop_words, 2)
            choice = random.choice(["simple-1word-query", "2word-union-query",
                                    "2word-intersection-query"])
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(
                    indexname, "simple-1word-query",
                    "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(
                    indexname, "2word-union-query",
                    "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(
                    indexname, "2word-intersection-query",
                    "{}{}|{}".format(numeric_range_str, words[0], words[1]))
        if generated_row is not None:
            # count the command only once a row was actually produced, so that
            # reads + writes always sums to total_benchmark_commands
            if p_cmd < p_writes:
                total_benchmark_writes += 1
            else:
                total_benchmark_reads += 1
            all_csv_writer.writerow(generated_row)
            bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands += 1
    progress.close()
    bench_csvfile.close()
    all_csvfile.close()

    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)
    print("90% of the read queries target at most {:.2f}% of the keyspace".format(dataset_percent * 100.0))
    print("100% of the read queries target at most {:.2f}% of the keyspace".format(ts_digest.cdf(max_ts - min_ts) * 100.0))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)
    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()
    return total_benchmark_reads, total_benchmark_writes
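# Illustrative sketch (a hypothetical helper, not called by the benchmark):
# shows the percentile mapping used above in isolation. A lognormal sample in
# [0, 1] is turned into a percentile of the timestamp digest; the resulting
# query_min_ts is the lower bound of the numeric range filter. This assumes,
# as generate_benchmark_commands does, samples normalized into [0, 1].
def _demo_timestamp_skew(num_queries=5):
    demo_digest = TDigest()
    # fake epoch-second timestamps standing in for the real dataset's digest
    demo_digest.batch_update(np.random.uniform(1.0e9, 1.6e9, 10000))
    samples = np.random.lognormal(mean=0.0, sigma=1.0, size=num_queries)
    samples = samples / samples.max()  # normalize into [0, 1]
    for s in samples:
        query_min_ts = demo_digest.percentile((1.0 - s) * 100.0)
        print("sample={:.3f} -> query window starts at {}".format(s, int(query_min_ts)))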
plt.hist(docs_sizes, bins=bins, alpha=0.5)
plt.title('EnWiki pages document size frequency. Avg document size: {} Bytes'.format(int(np.average(docs_sizes))))
plt.xlabel('Document Size in Bytes')
plt.ylabel('count')
plt.xscale('log')
plt.show()

xx = []
yy = []
for centroid in ts_digest.centroids_to_list():
    ts_m = centroid["m"]
    xx.append(ts_m)
    yy.append(ts_digest.cdf(ts_m))
plt.scatter(xx, yy)
plt.title('EnWiki pages timestamp range')
plt.xlabel('timestamp')
plt.ylabel('cdf')
plt.show()

progress.close()
all_csvfile.close()
setup_csvfile.close()

print("-- generating {} full text search commands --".format(total_benchmark_commands))
print("\t saving to {} and {}".format(bench_fname, all_fname))

total_benchmark_reads, total_benchmark_writes = generate_benchmark_commands(
    total_benchmark_commands, bench_fname, all_fname, indexname, docs,
    stop_words, use_numeric_range_searchs, ts_digest, p_writes)
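# Optional sanity check over the generated benchmark file (a sketch, assuming
# the rows emitted by generate_ft_search_row / use_case_to_cmd carry the
# command or query type in their first column):
from collections import Counter

with open(bench_fname, newline='') as check_file:
    type_counts = Counter(row[0] for row in csv.reader(check_file))
print("command mix by type: {}".format(dict(type_counts)))
print("total reads: {} total writes: {}".format(total_benchmark_reads, total_benchmark_writes))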