def set_value(partition_key, sort_key):
    lookup_key = partition_key + ":" + sort_key
    # Route the write to the data node that owns this partition key.
    machine_index = hashes["hashes"].get_machine(partition_key)
    response = requests.post("http://{}/set/{}/{}".format(
        servers[machine_index], partition_key, sort_key),
        data=request.data)
    if lookup_key in indexed:
        return make_response(str(response.status_code), response.status_code)
    indexed[lookup_key] = True
    sort_index[partition_key + ":" + sort_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = partition_key + ":" + sort_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key,
                                                   Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key,
                                        partition_key + ":" + sort_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, partition_key + ":" + sort_key)
    return make_response(str(response.status_code), response.status_code)
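# The router's hashes["hashes"].get_machine(partition_key) maps a partition
# key to a server index. The project's ring implementation isn't shown in
# this excerpt, so the sketch below is an illustrative stand-in (the HashRing
# name and virtual-node count are assumptions): a classic consistent-hash
# ring, where the same key always lands on the same server and adding a
# server remaps only a fraction of the keys.
import bisect
import hashlib


class HashRing:
    def __init__(self, machine_count, points_per_machine=100):
        # Place several virtual points per machine on a hash circle.
        self.ring = []
        for machine_index in range(machine_count):
            for point in range(points_per_machine):
                digest = hashlib.md5(
                    "{}:{}".format(machine_index, point).encode()).hexdigest()
                self.ring.append((int(digest, 16), machine_index))
        self.ring.sort()
        self.positions = [position for position, _ in self.ring]

    def get_machine(self, partition_key):
        # Hash the key onto the circle, then walk clockwise to the next point.
        key_hash = int(hashlib.md5(partition_key.encode()).hexdigest(), 16)
        index = bisect.bisect(self.positions, key_hash) % len(self.ring)
        return self.ring[index][1]


# The same partition key always routes to the same server index.
ring = HashRing(machine_count=3)
assert ring.get_machine("people.1") == ring.get_machine("people.1")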
def set_value(partition_key, sort_key):
    lookup_key = partition_key + ":" + sort_key
    print("{} Saving {} to {}".format(self_server, request.data, lookup_key))
    data[lookup_key] = request.data.decode('utf-8')
    if lookup_key in indexed:
        # Already indexed: the overwrite above is enough, skip re-indexing.
        return make_response('', 202)
    indexed[lookup_key] = True
    sort_index[partition_key + ":" + sort_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = partition_key + ":" + sort_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key,
                                                   Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key,
                                        partition_key + ":" + sort_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, partition_key + ":" + sort_key)
    sql_index[sort_key] = lookup_key
    return make_response('', 202)
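# Why sort_index is a pygtrie.CharTrie rather than a plain dict: a character
# trie makes prefix scans cheap, which is exactly what lookups over
# "partition:sort" keys need. A small illustration (the keys here are
# hypothetical):
import pygtrie

trie = pygtrie.CharTrie()
trie["people.1:R.people.1.id"] = "R.people.1.id"
trie["people.1:R.people.1.name"] = "R.people.1.name"
trie["people.2:R.people.2.name"] = "R.people.2.name"

# All keys for one partition come back in a single prefix scan.
assert sorted(trie.keys(prefix="people.1:")) == [
    "people.1:R.people.1.id",
    "people.1:R.people.1.name",
]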
def test_add(self):
    tree = Tree()
    root = tree.add()
    self.assertEqual(tree.add(), root.left)
    self.assertEqual(tree.size(), 2)
    self.assertEqual(tree.max_depth(), 2)
    self.assertEqual(tree.add(), root.right)
    self.assertEqual(tree.size(), 3)
    self.assertEqual(tree.max_depth(), 2)
def test_root(self):
    tree = Tree()
    root = tree.add()
    self.assertEqual(root, tree.root)
    self.assertEqual(tree.size(), 1)
    self.assertEqual(tree.max_depth(), 1)
    self.assertEqual(list((n.node for n in tree)), [root])
    self.assertEqual(list((n.node for n in tree.in_order())), [root])
    self.assertFalse(root.left)
    self.assertFalse(root.right)
def test_insert(self):
    tree = Tree()
    self.assertTrue(tree.is_bst())
    tree.insert(5)
    self.assertEqual(tree.size(), 1)
    self.assertEqual(self.values(tree), '5')
    self.assertTrue(tree.is_bst())
    tree.insert(2)
    self.assertEqual(tree.size(), 2)
    self.assertEqual(self.values(tree), '2 5')
    self.assertTrue(tree.is_bst())
    tree.insert(3)
    self.assertEqual(tree.size(), 3)
    self.assertEqual(self.values(tree), '2 3 5')
    self.assertTrue(tree.is_bst())
    tree.insert(8)
    self.assertEqual(tree.size(), 4)
    self.assertEqual(self.values(tree), '2 3 5 8')
    self.assertTrue(tree.is_bst())
    tree.insert(-1)
    self.assertEqual(tree.size(), 5)
    self.assertEqual(self.values(tree), '-1 2 3 5 8')
    self.assertEqual(self.reversed_values(tree), '8 5 3 2 -1')
    self.assertTrue(tree.is_bst())
    self.assertTrue(-1 <= tree.balanced_factor() <= 1)
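# The final assertion bounds tree.balanced_factor() to [-1, 1]. The tree's
# own method isn't shown in this excerpt; assuming the usual AVL-style
# definition (left subtree height minus right subtree height at the root),
# a minimal sketch looks like this:
def balanced_factor(node):
    def height(n):
        if n is None:
            return 0
        return 1 + max(height(n.left), height(n.right))

    if node is None:
        return 0
    return height(node.left) - height(node.right)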
def test_empty_tree(self):
    tree = Tree()
    self.assertEqual(tree.size(), 0)
    self.assertEqual(tree.max_depth(), 0)
    self.assertEqual(list(tree), [])
def _minimal_height(sorted_array, btree):
    # Reconstructed helper: only its final recursive call survived in the
    # original text. Insert the midpoint of each slice, then recurse into
    # both halves so the tree stays as shallow as possible (assumes
    # BinaryTree.insert places values in BST order).
    if not sorted_array:
        return
    mid_index = get_mid_index(sorted_array)
    btree.insert(TreeNode(value=sorted_array[mid_index]))
    shift_factor = 1 if len(sorted_array) > 1 else 0
    slice_left = sorted_array[:mid_index]
    _minimal_height(sorted_array[mid_index + shift_factor:], btree)
    _minimal_height(slice_left, btree)


def minimal_height(sorted_array):
    mid_index = get_mid_index(sorted_array)
    root = TreeNode(value=sorted_array[mid_index])
    btree = BinaryTree(root)
    _minimal_height(sorted_array[:mid_index], btree)
    shift_factor = 1 if len(sorted_array) > 1 else 0
    _minimal_height(sorted_array[mid_index + shift_factor:], btree)
    return btree


if __name__ == '__main__':
    # tests
    sorted_array = [5, 10, 15, 20, 27, 30, 45, 90, 100, 110, 115, 120]
    btree = minimal_height(sorted_array)

    def visit(lista):
        def _visit(node):
            lista.append(node)
        return _visit

    result = []
    Tree.visit_in_order(btree.root, visit(result))
    result = list(map(lambda node: node.value, result))
    assert result == sorted_array
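# Why split on the midpoint: every recursive call halves the slice, so n
# nodes yield a tree of depth ceil(log2(n + 1)), the minimum possible.
# For the twelve-element test array above:
import math

n = 12                               # len(sorted_array)
depth = math.ceil(math.log2(n + 1))  # a depth-3 tree holds only 7 nodes
assert depth == 4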
def execute(self):
    if self.parser.create_join_clause:
        print("Creating a join")
        print(self.parser.create_join_clause)
        for clause in self.parser.create_join_clause:
            left_table, left_field = clause[0].split(".")
            right_table, right_field = clause[1].split(".")
            print(left_table)
            print(right_table)
            if left_table in joins:
                joins[left_table].append({"clause": clause})
            else:
                joins[left_table] = [{"clause": clause}]
            if right_table in joins:
                joins[right_table].append({"clause": clause})
            else:
                joins[right_table] = [{"clause": clause}]
        print(joins)
        # If you insert into the left table, you also need to insert join
        # targets into the right table:
        # - select right.id from right_table
        #   where left_table.left_field = right_table.right_field
        # - insert left_field onto all servers that return a right id
    elif self.parser.update_table:
        entries = []
        for server in servers:
            subset = json.loads(
                requests.post("http://{}/sql".format(server),
                              data=json.dumps(
                                  {"parser": self.parser.__dict__})).text)
            if subset:
                entries = entries + subset
        print("From data node")
        print(entries)
    elif self.parser.fts_clause:
        for server in servers:
            subset = json.loads(
                requests.post("http://{}/sql".format(server),
                              data=json.dumps(
                                  {"parser": self.parser.__dict__})).text)
            yield from subset
    elif self.parser.insert_values:
        insert_table = self.parser.insert_table
        print("Insert statement")
        created = False
        new_insert_count = 1
        for field, value in zip(self.parser.insert_fields,
                                self.parser.insert_values):
            all_servers = []
            table_size = self.get_table_size(insert_table)
            if not created:
                new_insert_count = table_size + 1
                table_counts[insert_table] = new_insert_count
            items = []
            # Create the full text search index: one FTS key per token.
            if isinstance(value, str):
                tokens = value.replace(",", "").split(" ")
                for token in tokens:
                    new_key = "FTS.{}.{}.{}.{}".format(
                        insert_table, field, token, new_insert_count)
                    items.append({
                        "key": new_key,
                        "value": new_insert_count
                    })
            new_key = "R.{}.{}.{}".format(insert_table, new_insert_count,
                                          field)
            items.append({"key": new_key, "value": value})
            new_key = "S.{}.{}.{}.{}".format(insert_table, field, value,
                                             new_insert_count)
            items.append({"key": new_key, "value": new_insert_count})
            new_key = "C.{}.{}.{}".format(insert_table, field,
                                          new_insert_count)
            items.append({"key": new_key, "value": value})
            if not created:
                new_key = "R.{}.{}.id".format(insert_table, new_insert_count)
                new_id = {"key": new_key, "value": new_insert_count}
                items.append(new_id)
                created = True
                all_servers.append(new_id)
                new_key = "S.{}.{}.{}.{}".format(insert_table, "id",
                                                 new_insert_count,
                                                 new_insert_count)
                items.append({"key": new_key, "value": new_insert_count})
            items.sort(key=itemgetter('key'))
            # The row id is replicated to every server; the other keys go to
            # the server chosen by consistent hashing.
            for item in all_servers:
                for server in servers:
                    partition_key = "{}.{}".format(insert_table,
                                                   new_insert_count)
                    sort_key = item["key"]
                    lookup_key = partition_key + ":" + sort_key
                    response = requests.post("http://{}/set/{}/{}".format(
                        server, partition_key, sort_key),
                        data=str(item["value"]))
            for item in items:
                partition_key = "{}.{}".format(insert_table,
                                               new_insert_count)
                sort_key = item["key"]
                lookup_key = partition_key + ":" + sort_key
                machine_index = hashes["hashes"].get_machine(partition_key)
                response = requests.post("http://{}/set/{}/{}".format(
                    servers[machine_index], partition_key, sort_key),
                    data=str(item["value"]))
                if lookup_key not in indexed:
                    indexed[lookup_key] = True
                    sort_index[partition_key + ":" + sort_key] = sort_key
                    if sort_key not in sort_index:
                        sort_index[sort_key] = pygtrie.CharTrie()
                    sort_index[sort_key][
                        partition_key] = partition_key + ":" + sort_key
                    if partition_key not in between_index:
                        between_index[partition_key] = Tree("", None, None)
                    if partition_key not in partition_trees:
                        partition_tree = both_between_index.insert(
                            partition_key, Tree("", None, None))
                        partition_trees[partition_key] = partition_tree
                    between_index[partition_key].insert(
                        sort_key, partition_key,
                        partition_key + ":" + sort_key)
                    partition_trees[partition_key].partition_tree.insert(
                        sort_key, partition_key,
                        partition_key + ":" + sort_key)
            # We need to check if there are any materialized joins.
            if insert_table in joins:
                join_clauses = joins[insert_table]
                print(join_clauses)
                for join_clause in join_clauses:
                    clauses = join_clause["clause"]
                    left_components = clauses[0].split(".")
                    left_table = left_components[0]
                    left_field = left_components[1]
                    right_components = clauses[1].split(".")
                    right_table = right_components[0]
                    right_field = right_components[1]
                    if right_table == insert_table:
                        print("We need to swap")
                        temp_table = right_table
                        temp_field = right_field
                        right_table = left_table
                        right_field = left_field
                        left_table = temp_table
                        left_field = temp_field
                    print("Do we need to join this inserted data?")
                    print(field)
                    print(left_field)
                    search_value = value
                    if left_field == "id":
                        search_value = str(new_insert_count)
                    if field == left_field or left_field == "id":
                        # Do the prejoin.
                        parser = Parser()
                        statement = ("select {}.id, {}.{} from {} "
                                     "where {}.{} = {}").format(
                                         right_table, right_table,
                                         right_field, right_table,
                                         right_table, right_field,
                                         search_value)
                        parser.parse(statement)
                        print(statement)
                        data = SQLExecutor(parser).execute()
                        for match in data:
                            for server in servers:
                                server_value = match[1]
                                print("Data from {}, we are inserting {} "
                                      "into server {}".format(
                                          server, server_value,
                                          servers[machine_index]))
                                print("{} {}".format(left_table,
                                                     right_table))
                                response = requests.post(
                                    "http://{}/set/{}.{}/R.{}.{}.{}".format(
                                        server, right_table, server_value,
                                        right_table, server_value,
                                        right_field),
                                    data=str(search_value))
                                response = requests.post(
                                    "http://{}/set/{}.{}/R.{}.{}.{}".format(
                                        servers[machine_index], right_table,
                                        server_value, right_table,
                                        server_value, "id"),
                                    data=server_value)
                                if server != servers[machine_index]:
                                    # new_key = "R.{}.{}.{}".format(insert_table, new_insert_count, field)
                                    response = requests.post(
                                        "http://{}/set/{}.{}/R.{}.{}.{}".format(
                                            server, left_table,
                                            new_insert_count, left_table,
                                            new_insert_count, left_field),
                                        data=str(search_value))
                                    response = requests.post(
                                        "http://{}/set/{}.{}/R.{}.{}.{}".format(
                                            server, left_table,
                                            new_insert_count, left_table,
                                            new_insert_count, "id"),
                                        data=str(new_insert_count))
                                    # have to create a key on
    elif self.parser.group_by:
        print("Group by statement")
        group_by_components = self.parser.group_by.split(".")
        aggregator = defaultdict(list)
        row_specifier = "C.{}.{}".format(group_by_components[0],
                                         group_by_components[1])
        # `items` holds the {"key", "value"} entries written at insert time;
        # only the C.-prefixed column keys match the row specifier.
        for item in filter(lambda x: x["key"].startswith(row_specifier),
                           items):
            k = item["key"]
            v = item["value"]
            key_components = k.split(".")
            print(key_components[2])
            if (key_components[1] == group_by_components[0]) and (
                    key_components[2] == group_by_components[1]):
                aggregator[v].append(v)
        for k, v in aggregator.items():
            output_line = ""
            for item in self.parser.select_clause:
                if "count" in item:
                    output_line += str(len(aggregator[k]))
                else:
                    output_line += str(k) + " "
            print(output_line)
    elif self.parser.join_clause:
        server = random.choice(servers)
        records = json.loads(
            requests.post("http://{}/sql".format(server),
                          data=json.dumps(
                              {"parser": self.parser.__dict__})).text)
        print(records)
        missing_fields = set()
        missing_records = []
        for record in records:
            if record["missing_fields"]:
                missing_fields = missing_fields.union(
                    set(record["missing_fields"]))
            for dataitem in record["outputs"]:
                missing_records.append(dataitem)
        print("Missing fields:")
        print(missing_fields)
        missing_index = {}
        for index, missing_record in enumerate(missing_records):
            missing_index[str(index)] = missing_record
            missing_record["missing_index"] = str(index)

        def trim_record(join_fields, items):
            for item in items:
                data = {
                    "missing_index": item["missing_index"],
                    "id": item["id"]
                }
                for join_field in join_fields:
                    data[join_field] = item[join_field]
                yield data

        join_fields = []
        join_specs = []
        for missing_field in missing_fields:
            for select_clause in self.parser.select_clause:
                select_table, select_field = select_clause.split(".")
                if select_field == missing_field:
                    for join_clause in self.parser.join_clause:
                        left_components = join_clause[0].split(".")
                        left_table = left_components[0]
                        left_field = left_components[1]
                        right_components = join_clause[1].split(".")
                        right_table = right_components[0]
                        right_field = right_components[1]
                        id_field = None
                        if select_table == left_table:
                            id_field = "id"
                            join_field = "{}_{}".format(left_table, "id")
                            print("Join field -> {}".format(join_field))
                        elif select_table == right_table:
                            id_field = "id"
                            join_field = "{}_{}".format(right_table, "id")
                            print("Join field -> {}".format(join_field))
                        if not id_field:
                            print(select_table)
                            print(left_table)
                            print(right_table)
                            continue
                        print("select {} from {} inner join {} on {} = {}".
                              format(missing_field, "network_table",
                                     select_table, id_field, join_field))
                        join_fields.append(join_field)
                        join_specs.append({
                            "id_field": id_field,
                            "join_field": join_field,
                            "missing_field": missing_field,
                            "select_table": select_table
                        })
        valid_matches = list(
            trim_record(join_fields,
                        filter(lambda x: join_field in x, missing_records)))
        print("Valid matches")
        pprint(valid_matches)

        def getresults(server):
            response = json.loads(
                requests.post("http://{}/networkjoin".format(server),
                              data=json.dumps({
                                  "parser": self.parser.__dict__,
                                  "join_specs": join_specs,
                                  "records": valid_matches
                              })).text)
            yield response

        with ThreadPoolExecutor(max_workers=len(servers)) as executor:
            future = executor.map(getresults, servers)
            print("Doing network join...")
            for server in future:
                for rowset in server:
                    print("Missing data records")
                    print(len(missing_records))
                    pprint(missing_records)
                    for missing_data in rowset:
                        print("Missing data")
                        print(missing_data)
                        missing_field, missing_index_key, found_data = \
                            missing_data
                        if missing_data:
                            missing_index[missing_index_key][
                                missing_field] = found_data
        outputs = []
        for item in records:
            for obj in item["outputs"]:
                outputs.append(obj)
        header = []
        output_lines = []
        have_printed_header = False
        for result in outputs:
            skip = False
            output_line = []
            for field in self.parser.select_clause:
                if field == "*":
                    for key, value in result.items():
                        if not have_printed_header:
                            header.append(key)
                        output_line.append(value)
                else:
                    table, field_name = field.split(".")
                    if field_name not in result:
                        skip = True
                    else:
                        output_line.append(result[field_name])
            if skip:
                continue
            output_lines.append(output_line)
            have_printed_header = True
        print(header)
        yield from output_lines
    elif self.parser.select_clause:
        for server in servers:
            subset = json.loads(
                requests.post("http://{}/sql".format(server),
                              data=json.dumps(
                                  {"parser": self.parser.__dict__})).text)
            for result in subset:
                item = [server] + result
                yield item
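# Each inserted field fans out into one key per access path: "R." row store,
# "C." column store, "S." secondary index, and "FTS." full-text tokens. For
# illustration, inserting name = 'John Smith' as row 1 of a hypothetical
# `people` table produces (key -> stored value):
#
#   R.people.1.name             -> John Smith   (fetch a row by id)
#   C.people.name.1             -> John Smith   (scan a single column)
#   S.people.name.John Smith.1  -> 1            (look up row ids by value)
#   FTS.people.name.John.1      -> 1            (one key per token)
#   FTS.people.name.Smith.1     -> 1
#   R.people.1.id               -> 1            (row id, written to every server)
#   S.people.id.1.1             -> 1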
def execute(self):
    if self.parser["updates"]:
        table_datas, field_reductions = self.get_tables(
            [["{}.".format(self.parser["update_table"])]])
        for result in self.process_wheres(field_reductions[0][0]):
            for update in self.parser["updates"]:
                updated_to, new_value = update
                updated_field = updated_to.split(".")[1]
                partition_key = "{}.{}".format(self.parser["update_table"],
                                               result["id"])
                # Remove the old index entries before writing the new value.
                deindex(partition_key, self.parser["update_table"],
                        result["id"], updated_field, result[updated_field])
                # Now update the data.
                items = []
                insert_table = self.parser["update_table"]
                field = updated_field
                if isinstance(new_value, str):
                    tokens = new_value.replace(",", "").split(" ")
                    for token in tokens:
                        new_key = "FTS.{}.{}.{}.{}".format(
                            insert_table, field, token, result["id"])
                        items.append({
                            "key": new_key,
                            "value": result["id"]
                        })
                new_key = "R.{}.{}.{}".format(insert_table, result["id"],
                                              field)
                items.append({"key": new_key, "value": new_value})
                new_key = "S.{}.{}.{}.{}".format(insert_table, field,
                                                 new_value, result["id"])
                items.append({"key": new_key, "value": result["id"]})
                new_key = "C.{}.{}.{}".format(insert_table, field,
                                              result["id"])
                items.append({"key": new_key, "value": new_value})
                items.sort(key=itemgetter('key'))
                for item in items:
                    sort_key = item["key"]
                    lookup_key = partition_key + ":" + sort_key
                    data[lookup_key] = item["value"]
                    indexed[lookup_key] = True
                    sort_index[partition_key + ":" + sort_key] = sort_key
                    if sort_key not in sort_index:
                        sort_index[sort_key] = pygtrie.CharTrie()
                    sort_index[sort_key][
                        partition_key] = partition_key + ":" + sort_key
                    if partition_key not in between_index:
                        between_index[partition_key] = Tree("", None, None)
                    if partition_key not in partition_trees:
                        partition_tree = both_between_index.insert(
                            partition_key, Tree("", None, None))
                        partition_trees[partition_key] = partition_tree
                    between_index[partition_key].insert(
                        sort_key, partition_key,
                        partition_key + ":" + sort_key)
                    partition_trees[partition_key].partition_tree.insert(
                        sort_key, partition_key,
                        partition_key + ":" + sort_key)
                    sql_index[sort_key] = lookup_key
    elif self.parser["fts_clause"]:
        # Full text search.
        table_datas, field_reductions = self.get_tables(
            [["{}.".format(self.parser["table_name"])]])
        table_datas, field_reductions = self.mark_join_table(
            table_datas, field_reductions, self.parser["table_name"])
        table_datas = self.rewrite_joins(table_datas)
        have_printed_header = False
        header = []
        output_lines = []
        for result in self.process_wheres(field_reductions[0][0]):
            output_line = []
            for field in self.parser["select_clause"]:
                if field == "*":
                    for key, value in result.items():
                        if not have_printed_header:
                            header.append(key)
                        output_line.append(value)
                else:
                    output_line.append(result[field])
            have_printed_header = True
            output_lines.append(output_line)
        print(header)
        print(output_lines)
        yield from output_lines
    elif self.parser["group_by"]:
        print("Group by statement")
        group_by_components = self.parser["group_by"].split(".")
        aggregator = defaultdict(list)
        row_specifier = "C.{}.{}".format(group_by_components[0],
                                         group_by_components[1])
        # `items` holds the {"key", "value"} entries written at insert time;
        # only the C.-prefixed column keys match the row specifier.
        for item in filter(lambda x: x["key"].startswith(row_specifier),
                           items):
            k = item["key"]
            v = item["value"]
            key_components = k.split(".")
            print(key_components[2])
            if (key_components[1] == group_by_components[0]) and (
                    key_components[2] == group_by_components[1]):
                aggregator[v].append(v)
        for k, v in aggregator.items():
            output_line = ""
            for item in self.parser["select_clause"]:
                if "count" in item:
                    output_line += str(len(aggregator[k]))
                else:
                    output_line += str(k) + " "
            print(output_line)
    elif self.parser["join_clause"]:
        table_datas, field_reductions = self.get_tables(
            self.parser["join_clause"])
        table_datas, field_reductions = self.mark_join_table(
            table_datas, field_reductions, self.parser["table_name"])
        table_datas = self.rewrite_joins(table_datas)
        previous = list(self.hash_join(0, table_datas))
        print("First join")
        for index, pair in enumerate(table_datas[1:]):
            entries = table_datas[index + 1]
            table_name, collection, field, size = entries[0]
            if collection == "previous":
                table_datas[index + 1][0] = (table_name, previous, field,
                                             size)
            previous = list(self.hash_join(index + 1, table_datas))
            print("Second join")
        records = list(self.process_wheres(previous))
        print("records from join" + str(records))
        print(len(records))
        missing_fields = set()
        output_lines = []
        for record in records:
            output_lines.append(record)
            for clause in self.parser["select_clause"]:
                table, field = clause.split(".")
                if field not in record:
                    missing_fields.add(field)
        print(len(output_lines))
        print(len(records))
        yield {
            "outputs": output_lines,
            "missing_fields": list(missing_fields)
        }
    elif self.parser["select_clause"]:
        table_datas, field_reductions = self.get_tables(
            [["{}.".format(self.parser["table_name"])]])
        have_printed_header = False
        header = []
        output_lines = []
        for result in self.process_wheres(field_reductions[0][0]):
            skip = False
            output_line = []
            for field in self.parser["select_clause"]:
                if field == "*":
                    for key, value in result.items():
                        if not have_printed_header:
                            header.append(key)
                        output_line.append(value)
                else:
                    table, field_name = field.split(".")
                    if field_name not in result:
                        skip = True
                    else:
                        output_line.append(result[field_name])
            if skip:
                continue
            output_lines.append(output_line)
            have_printed_header = True
        print(header)
        print(output_lines)
        yield from output_lines
sql_index = pygtrie.CharTrie()
response = requests.post("http://{}/bootstrap/{}".format(
    args.server, args.port))
print(response.text)
bootstrapped_keys = json.loads(response.text)
for lookup_key, value in bootstrapped_keys.items():
    data[lookup_key] = value
    indexed[lookup_key] = True
    partition_key, sort_key = lookup_key.split(":")
    sort_index[partition_key + ":" + sort_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = partition_key + ":" + sort_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key,
                                                   Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key,
                                        partition_key + ":" + sort_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, partition_key + ":" + sort_key)
    sql_index[sort_key] = lookup_key

app = Flask(__name__)


@app.route("/get/<lookup_key>", methods=["POST"])