def MedCalculator(fileNum, text):
    """
    thread worker function
    Calculates the running median for the lines present in the text list supplied.
    Currently the sequential implementation is identical to the parallel implementation.
    :rtype : null
    :param fileNum: an index pointing to the file to be processed in the input files
    :param text: a text buffer to be loaded with the input text
    """
    # Start Profiling
    # basic profiling for the speed of the algorithm
    # start = time.clock()

    # the list that is going to hold the running medians
    medianNumbers = []
    # a sorted list to hold the word counts for input lines
    # the sorted list boosts performance substantially when computing the running median because this
    # does not require resorting the wordcount list every time we add an entry to it.
    linesWordCount = SortedList()
    lineNO = 0
    for line in text:
        # fast conversion of uppercase to lowercase and removal of the following `'-_
        cleaned_line = line.translate(table)
        # matching words/tokens
        words = re.findall(tokenPattern, cleaned_line)
        # counting the wordcount of a line and adding it to the respective list
        lineWordCount = len(words)
        linesWordCount.add(lineWordCount)
        # running median calculations
        # because a sorted list holds the wordcounts of lines, it is straightforward to compute
        # the running median
        index = int(lineNO / 2)
        if lineNO % 2 == 0:
            medianNumbers.append(float(linesWordCount[index]))
        else:
            medianNumbers.append(
                float((linesWordCount[index] + linesWordCount[index + 1]) / 2))
        lineNO += 1
        # optional profiling
        # print("Line NO: " + str(lineNO) + " wordcount: " + str(lineWordCount))
        # print("Size: " + str(len(linesWordCount)) + " linesWordCount elm: " + str(linesWordCount))
    # print("Median NOs: " + str(medianNumbers))
    # end = time.clock()
    # optional profiling
    # print("(Calculator)Time elapsed: ", (end-start), "Using Multiprocessing, Generated ",
    #       len(medianNumbers), " medians from ", lineNO, " Lines")  # , len(text), " files")
    return medianNumbers
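# A minimal, self-contained sketch of the running-median idea used above, assuming only
# sortedcontainers is available. The function name and the toy data are illustrative and
# not part of the original module (which also relies on `table` and `tokenPattern`).
from sortedcontainers import SortedList


def running_medians(counts):
    """Yield the median of counts[:i+1] after each new value is inserted."""
    seen = SortedList()
    for i, value in enumerate(counts):
        seen.add(value)
        mid = i // 2
        if i % 2 == 0:          # odd number of values so far: take the middle entry
            yield float(seen[mid])
        else:                   # even number of values: average the two middle entries
            yield (seen[mid] + seen[mid + 1]) / 2.0


# Example: list(running_medians([5, 1, 3])) -> [5.0, 3.0, 3.0]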
def sort_file_lists(input_file_list, file_name_generator):
    command_list = SortedList()
    for input_file in input_file_list:
        reader = _get_reader(input_file)
        for command in reader:
            command_list.add(command)
            if len(command_list) > number_of_allowed_command:
                dump_commands_to_file(command_list, file_name_generator)
                command_list = SortedList()
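# The function above follows the classic external-sort pattern: accumulate records in a
# SortedList and flush it to disk whenever it grows past a threshold. A hedged sketch of
# that pattern; write_chunk and chunk_names are hypothetical stand-ins for the original
# dump_commands_to_file / file_name_generator machinery.
from sortedcontainers import SortedList


def sort_into_chunks(records, chunk_size, write_chunk, chunk_names):
    """Flush sorted chunks of at most chunk_size records via write_chunk."""
    buffer = SortedList()
    for record in records:
        buffer.add(record)
        if len(buffer) >= chunk_size:
            write_chunk(buffer, next(chunk_names))
            buffer = SortedList()
    if buffer:  # flush the trailing partial chunk
        write_chunk(buffer, next(chunk_names))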
def _match_agg_by_val(self):
    '''
    Matches input/output aggregates by values and returns a bunch of data structs
    '''
    self._all_match_in_agg = SortedList()
    self._match_in_agg_to_val = defaultdict(int)
    self._val_to_match_out_agg = defaultdict(set)

    # Gets unique values of input / output aggregates
    all_unique_in_agg_val, _ = np.unique(self._all_in_agg_val, return_inverse=True)
    all_unique_out_agg_val, _ = np.unique(self._all_out_agg_val, return_inverse=True)

    # Computes total fees paid/received by taker/maker
    if self._has_intrafees:
        fees_taker = self._fees + self._fees_taker
        fees_maker = -self._fees_maker  # doesn't take into account tx fees paid by makers

    # Finds input and output aggregates with matching values
    for in_agg_val in np.nditer(all_unique_in_agg_val):
        val = int(in_agg_val)

        for out_agg_val in np.nditer(all_unique_out_agg_val):
            diff = in_agg_val - out_agg_val

            if (not self._has_intrafees) and (diff < 0):
                break
            else:
                # Computes conditions required for a matching
                cond_no_intrafees = (not self._has_intrafees) and diff <= self._fees
                cond_intrafees = self._has_intrafees and \
                    ((diff <= 0 and diff >= fees_maker) or (diff >= 0 and diff <= fees_taker))

                if cond_no_intrafees or cond_intrafees:
                    # Registers the matching input aggregate
                    match_in_agg = np.where(self._all_in_agg_val == in_agg_val)[0]

                    for in_idx in match_in_agg:
                        if in_idx not in self._all_match_in_agg:
                            self._all_match_in_agg.add(in_idx)
                            self._match_in_agg_to_val[in_idx] = val

                    # Registers the matching output aggregate
                    match_out_agg = np.where(self._all_out_agg_val == out_agg_val)[0]
                    self._val_to_match_out_agg[val].update(match_out_agg.tolist())
def _merge_when_available(self, error_queue, kill_queue, in_queue, result_send, y, q):
    try:
        result = SortedList()
        _in_get = in_queue.get
        _kill_get = kill_queue.get
        _get_ids = y.get_ids
        _add = result.add
        while True:
            try:
                _kill_get(False)
                result_send.send(None)
                return
            except Empty:
                pass
            item = _in_get()
            if item == "kill":
                break
            slice_obj, nn_ids = item
            q_ids = _get_ids(slice_obj)
            _n = q_ids.shape[0]
            j = 0
            for i in range(_n):
                _add((q_ids[i], tuple(nn_ids[j:j + q])))
                j += q
        result_send.send(result)
    except Exception as e:
        result_send.send(None)
        error_queue.put(ExceptionWrapper(os.getpid(), e))
def get_split_point_suggestions(self):
    suggested_split_values = SortedList()
    min_value = np.inf
    max_value = -np.inf
    for k, estimator in self._att_val_dist_per_class.items():
        if self._min_value_observed_per_class[k] < min_value:
            min_value = self._min_value_observed_per_class[k]
        if self._max_value_observed_per_class[k] > max_value:
            max_value = self._max_value_observed_per_class[k]
    if min_value < np.inf:
        bin_size = max_value - min_value
        bin_size /= (float(self.num_bin_options) + 1.0)
        for i in range(self.num_bin_options):
            split_value = min_value + (bin_size * (i + 1))
            if split_value > min_value and split_value < max_value:
                suggested_split_values.add(split_value)
    return suggested_split_values
def _get_top_n_samples(model_predictions: List[ModelPrediction], n: int, best: bool):
    top_n_samples = SortedList(key=lambda sample: -sample.true_label_probability) if best else SortedList()
    for model_prediction in model_predictions:
        if best == model_prediction.is_correct():
            if len(top_n_samples) < n:
                top_n_samples.add(model_prediction)
            else:
                if best != (model_prediction < top_n_samples[-1]):
                    top_n_samples.pop()
                    top_n_samples.add(model_prediction)
    return [sample for sample in top_n_samples]  # so that it returns a normal list instead of SortedList
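# A minimal illustration of the bounded top-N idea above, assuming nothing beyond
# sortedcontainers: keep at most n items in a SortedList and, once full, replace the
# current worst entry only when a new value beats it. The helper name and the toy data
# are illustrative, not part of the original code.
from sortedcontainers import SortedList


def top_n(values, n):
    """Return the n largest values, smallest first."""
    best = SortedList()
    for v in values:
        if len(best) < n:
            best.add(v)
        elif v > best[0]:       # best[0] is the worst of the currently kept items
            best.pop(0)
            best.add(v)
    return list(best)


# Example: top_n([4, 1, 9, 7, 3], 3) -> [4, 7, 9]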
def reformat(data_dir='../../../data', cores=1):
    print('Reformats data from:', data_dir)
    files = os.listdir(data_dir + '/json/')  # noqa
    snap_files = SortedList(
        [filename for filename in files if 'snaps' in filename],
        key=lambda fn: pd.to_datetime(fn[:-11], format='%d_%m_%Y_%H_%M_%S'))
    try:
        os.makedirs(data_dir + '/snap_json/')
    except FileExistsError:
        pass
    Parallel(n_jobs=cores)(
        delayed(save_snaps_from_file_path)(data_dir, snapfile)
        for snapfile in tqdm(snap_files))
    files = os.listdir(data_dir + '/json/')  # noqa
    mess_files = SortedList(
        [filename for filename in files if 'mess' in filename],
        key=lambda fn: pd.to_datetime(fn[:-10], format='%d_%m_%Y_%H_%M_%S'))
    keys = {
        'order_type', 'reason', 'sequence', 'side', 'size', 'type', 'price',
        'funds', 'order_id', 'time'
    }
    price_tick = 0.01
    price_dec = int(np.log10(1 / price_tick))
    try:
        os.makedirs(data_dir + '/feather/')
    except FileExistsError:
        pass
    Parallel(n_jobs=cores)(
        delayed(reformat_messages)(data_dir, k, keys, price_dec, messfile)
        for k, messfile in tqdm(enumerate(list(mess_files))))
def test_nlines(self):
    reader, start_time = self.init_reader()
    tpoints = SortedList()
    time_points, jobs = reader.next(start_time)
    tpoints.update(time_points)
    total_jobs = len(jobs)
    while True:
        if not tpoints:
            break
        _time = tpoints.pop(0)
        time_points, jobs = reader.next(_time)
        total_jobs += sum([len(js) for js in jobs.values()])
        tpoints.update(time_points)
    self.assertEqual(total_jobs, 25)
def _query_in_serial(self, y, q):
    _L = self.L
    _b = self.b
    _metric = self.metric
    _margs = self.margs
    _where = np.where
    _empty = np.empty
    _unique = np.unique
    _get_data = self.data.get_data
    _get_data_by_id = self.data.get_data_by_id
    _get_lists = storage.get_lists
    _hash = self._hash
    _encode = self._encode
    result = SortedList()
    _add = result.add
    q_data = y.get_data()
    num_points = q_data.shape[0]
    tmp = _hash(q_data)
    hashlist = _encode(tmp, _b).reshape(-1, _L)
    indptr, cand = _get_lists(hashlist)
    c_ids, c_indices = _unique(cand, return_inverse=True)
    c_data = _get_data_by_id(c_ids)
    dist = ssdist_wrapped(q_data, c_data, None, c_indices, indptr, _metric, _margs)
    nn_indices = rank(q, num_points, dist.data, c_indices, indptr)
    q_ids = y.get_ids()
    nn_ids = _where(nn_indices > 0, c_ids[nn_indices], -1)
    _m = q_ids.shape[0]
    j = 0
    for i in range(_m):
        _add((q_ids[i], tuple(nn_ids[j:j + q])))
        j += q
    return result
def _end_running_jobs(self, running_jobs, allocator, requests):
    if not running_jobs:
        return running_jobs
    if random() >= 0.5:
        return running_jobs
    sample_size = randint(1, len(running_jobs))
    idxs = SortedList(sample(range(len(running_jobs)), sample_size))
    cur_resources = allocator.get_resources()
    for idx in reversed(idxs):
        job = running_jobs.pop(idx)
        job_request = requests.pop(job[1])
        for node in job[2]:
            for k, v in job_request.requested_resources.items():
                cur_resources[node][k] += v
    return running_jobs
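# Why the loop above walks the sampled indices in reverse: popping from the largest index
# toward the smallest keeps the remaining (smaller) indices valid. A self-contained
# illustration with toy data; the names are not from the original code.
from random import sample

from sortedcontainers import SortedList


def remove_random_items(items, how_many):
    """Remove how_many randomly chosen items in place and return the removed ones."""
    idxs = SortedList(sample(range(len(items)), how_many))
    removed = []
    for idx in reversed(idxs):      # largest index first, so earlier indices stay valid
        removed.append(items.pop(idx))
    return removed


# Example: remove_random_items(list('abcdef'), 3) removes three random letters in place.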
def get_arrdata_from_dataset_values(arr_distinct_values, attributes, votes_attributes,
                                    users_attributes, position_attribute):
    arr_data = []
    arr_types = []
    arr_depthmax = []
    arr_refinement_indexes = []
    arr_labels = []
    subgroup_pipeline = []
    filter_operations = []
    num = r'([0-9]|\.)*'
    reg = re.compile(num)
    for i, attr in enumerate(attributes):
        arr_types.append(attr['type'])
        arr_depthmax.append(attr['bound_width'])
        subgroup_pipeline.append({'dimensionName': attr['name']})
        if attr['type'] == 'numeric':
            arr_data.append(SortedList(arr_distinct_values[i]))
            arr_refinement_indexes.append(0)
            arr_labels.append({})
            subgroup_pipeline[-1]['inInterval'] = []
            filter_operations.append('inInterval')
        elif attr['type'] == 'nominal':
            arr_data.append(SortedList(arr_distinct_values[i]))
            arr_refinement_indexes.append(len(arr_data[i]))
            arr_labels.append({})
            subgroup_pipeline[-1]['inSet'] = []
            filter_operations.append('inSet')
        elif attr['type'] == 'simple':
            arr_data.append(arr_distinct_values[i])
            arr_refinement_indexes.append(len(arr_data[i]))
            arr_labels.append({})
            subgroup_pipeline[-1]['inSet'] = []
            filter_operations.append('inSet')
        elif attr['type'] == 'themes':
            data_to_tree = []
            for val in arr_distinct_values[i]:
                data_to_tree.append({'ID': reg.search(val).group(),
                                     'LABEL': val[reg.search(val).end() + 1:]})
            tree, themesMAP = createTreeOutOfThemes(data_to_tree)
            arr_data.append([tree])
            arr_refinement_indexes.append(0)
            arr_labels.append(themesMAP)
            subgroup_pipeline[-1]['contain_themes'] = []
            filter_operations.append('contain_themes')
        elif attr['type'] == 'themes2':
            data_to_tree = []
            for val in arr_distinct_values[i]:
                data_to_tree.append({'ID': reg.search(val).group(),
                                     'LABEL': val[reg.search(val).end() + 1:]})
            tree, themesMAP = createTreeOutOfThemes(data_to_tree)
            tree_themes = tree_theme2(sorted([x['ID'] for x in data_to_tree]))
            # tree_themes['pattern'] = ['']
            arr_data.append(tree_themes)
            # arr_refinement_indexes.append([''])
            arr_refinement_indexes.append(([''], None))
            arr_labels.append(themesMAP)
            subgroup_pipeline[-1]['contain_themes'] = []
            filter_operations.append('contain_themes')
    subgroup_pipeline_for_votes = [stage for stage in subgroup_pipeline
                                   if stage['dimensionName'] in votes_attributes]
    subgroup_pipeline_for_meps = [stage for stage in subgroup_pipeline
                                  if stage['dimensionName'] in users_attributes]
    return (arr_data, arr_types, arr_depthmax, arr_refinement_indexes, arr_labels,
            subgroup_pipeline, filter_operations, subgroup_pipeline_for_votes,
            subgroup_pipeline_for_meps)
def orderstream(order_paths='../../../data/feather/',
                snapshot_paths='../../../data/snap_json/',
                max_sequence_skip=1, random_start=False, **kwargs):
    """
    Generates a stream of orders: either a snapshot of the order book is returned when a
    disruption in the order stream happens, or the next order is yielded.

    Parameters
    ----------
    order_paths: str
        Path to the orders
    snapshot_paths: str
        Path to the snapshots

    Yields
    -------
    order: list, snapshot: dict
        The first yield will have a snapshot. Then orders will be yielded with the
        snapshot as None.
    """
    order_paths = order_paths
    snapshot_paths = snapshot_paths
    order_files = os.listdir(order_paths)
    snap_files = os.listdir(snapshot_paths)
    order_files = SortedList(order_files, key=lambda x: int(x.split('_')[0]))
    snap_files = sorted(snap_files)
    snap_files_ = []
    min_order_files_seq = int(order_files[0].split('_')[1])
    for snap_file in snap_files:
        snap_seq_ = int(''.join(filter(str.isdigit, snap_file)))
        if snap_seq_ > min_order_files_seq:
            snap_files_.append(snap_file)
    snap_files = snap_files_
    snap_sequences = np.array([int(re.search(r'\d+', snap_sequence).group())
                               for snap_sequence in snap_files])
    random_start = random_start
    max_seq_skip = max_sequence_skip
    while True:
        if random_start:
            snap_file = random.choice(snap_files)
            snap_seq = ''.join(filter(str.isdigit, snap_file))
            order_files_ = []
            for order_file in order_files:
                max_order_file_seq = int(order_file.split('_')[2].split('.')[0])
                if max_order_file_seq >= int(snap_seq):
                    order_files_.append(order_file)
        else:
            snap_file = snap_files[0]
            order_files_ = deepcopy(order_files)
        with open(snapshot_paths + snap_file) as f:
            snap = ujson.load(f)
        snap_sequence = snap['sequence']
        prev_order_seq = snap_sequence
        yield None, snap
        break_ = False
        for order_file in order_files_:
            orders = load_orders(order_paths + order_file)
            for order in orders:
                if order.sequence < snap_sequence:
                    pass
                else:
                    if order.sequence - prev_order_seq > max_seq_skip:
                        print('Too large gap', order.sequence - prev_order_seq)
                        if random_start:
                            break_ = True
                            break
                        else:
                            snap_seq_k = (snap_sequences >= order.sequence).argmax()
                            snap_file = snap_files[snap_seq_k]
                            with open(snapshot_paths + snap_file) as f:
                                snap = ujson.load(f)
                            snap_sequence = snap['sequence']
                            yield None, snap
                    else:
                        if order.type in MESSAGE_TYPES:
                            yield order, None
                    prev_order_seq = order.sequence
            gc.collect()
            if break_:
                break
class TxosLinker(object):
    '''
    A class allowing to compute the entropy of Bitcoin transactions
    and the linkability of inputs/outputs of a transaction
    '''

    '''
    CONSTANTS
    '''
    # Default maximum duration in seconds
    MAX_DURATION = 180

    # Processing options
    LINKABILITY = 'LINKABILITY'
    PRECHECK = 'PRECHECK'
    MERGE_FEES = 'MERGE_FEES'

    # Markers
    FEES = 'FEES'
    PACK = 'PACK'

    # Max number of inputs (or outputs) which can be processed by this algorithm
    MAX_NB_TXOS = 12

    '''
    ATTRIBUTES

    # List of input txos expressed as tuples (id, amount)
    inputs = []

    # List of output txos expressed as tuples (id, amount)
    outputs = []

    # Fees associated to the transaction
    fees = 0

    # Matrix of txos linkability
    #   Columns = input txos
    #   Rows    = output txos
    #   Cells   = number of combinations for which an input and an output are linked
    links = np.array()

    # Number of valid transactions combinations
    nb_tx_cmbn = 0

    # Maximum duration of the script (in seconds)
    _max_duration = MAX_DURATION
    '''

    '''
    INITIALIZATION
    '''
    def __init__(self, inputs=[], outputs=[], fees=0, max_duration=MAX_DURATION, max_txos=MAX_NB_TXOS):
        '''
        Constructor
        Parameters:
            inputs       = list of inputs txos [(v1_id, v1_amount), ...]
            outputs      = list of outputs txos [(v1_id, v1_amount), ...]
            fees         = amount of fees associated to the transaction
            max_duration = max duration allocated to processing of a single tx (in seconds)
            max_txos     = max number of txos. Txs with more than max_txos inputs or outputs are not processed.
        '''
        self._orig_ins = inputs
        self._orig_outs = outputs
        self._orig_fees = fees
        self._max_duration = max_duration
        self.max_txos = max_txos
        self._packs = []

    '''
    PUBLIC METHODS
    '''
    def process(self, linked_txos=[], options=[LINKABILITY, PRECHECK], intrafees=(0, 0)):
        '''
        Computes the linkability between a set of input txos and a set of output txos
        Returns:
            linkability matrix
            number of possible combinations for the transaction
            list of inputs (sorted by decreasing value)
            list of outputs (sorted by decreasing value)
        Parameters:
            linked_txos = list of sets storing linked input txos. Each txo is identified by its id
            options     = list of actions to be applied
                LINKABILITY : computes the linkability matrix
                PRECHECK    : prechecks existence of deterministic links between inputs and outputs
                MERGE_FEES  : consider that all fees have been paid by a unique sender and manage fees as an additional output
            intrafees   = tuple (fees_maker, fees_taker) of max "fees" paid among participants
                          used for joinmarket transactions
                          fees_maker are potential max "fees" received by a participant from another participant
                          fees_taker are potential max "fees" paid by a participant to all other participants
        '''
        self._options = options
        self.inputs = self._orig_ins.copy()
        self.outputs = self._orig_outs.copy()
        self._fees_maker = intrafees[0]
        self._fees_taker = intrafees[1]
        self._has_intrafees = True if (self._fees_maker or self._fees_taker) else False

        # Packs txos known as being controlled by a same entity
        # It decreases the entropy and speeds up computations
        if linked_txos:
            self._pack_linked_txos(linked_txos)

        # Manages fees
        if (self.MERGE_FEES in options) and (self._orig_fees > 0):
            # Manages fees as an additional output (case of sharedsend by blockchain.info).
            # Allows to reduce the volume of computations to be done.
            self._fees = 0
            txo_fees = (self.FEES, self._orig_fees)
            self.outputs.append(txo_fees)
        else:
            self._fees = self._orig_fees

        # Checks deterministic links
        nb_cmbn = 0
        if self.PRECHECK in options and self._check_limit_ok(self.PRECHECK) and (not self._has_intrafees):
            # Prepares the data
            self._prepare_data()
            self._match_agg_by_val()
            # Checks deterministic links
            dtrm_lnks, dtrm_lnks_id = self._check_dtrm_links()
            # If deterministic links have been found, fills the linkability matrix
            # (returned as result if linkability is not processed)
            if dtrm_lnks is not None:
                shape = (len(self.outputs), len(self.inputs))
                mat_lnk = np.zeros(shape, dtype=np.int64)
                for (r, c) in dtrm_lnks:
                    mat_lnk[r, c] = 1
        else:
            mat_lnk = None
            dtrm_lnks_id = None

        # Checks if all inputs and outputs have already been merged
        nb_ins = len(self.inputs)
        nb_outs = len(self.outputs)

        if (nb_ins == 0) or (nb_outs == 0):
            nb_cmbn = 1
            shape = (nb_outs, nb_ins)
            mat_lnk = np.ones(shape, dtype=np.int64)
        elif self.LINKABILITY in options and self._check_limit_ok(self.LINKABILITY):
            # Packs deterministic links if needed
            if dtrm_lnks_id is not None:
                dtrm_lnks_id = [set(lnk) for lnk in dtrm_lnks_id]
                self._pack_linked_txos(dtrm_lnks_id)
            # Prepares data
            self._prepare_data()
            self._match_agg_by_val()
            # Computes a matrix storing a tree composed of valid pairs of input aggregates
            self._compute_in_agg_cmbn()
            # Builds the linkability matrix
            nb_cmbn, mat_lnk = self._compute_link_matrix()

        # Unpacks the matrix
        mat_lnk = self._unpack_link_matrix(mat_lnk, nb_cmbn)

        # Returns results
        return mat_lnk, nb_cmbn, self.inputs, self.outputs

    '''
    PREPARATION
    '''
    def _prepare_data(self):
        '''
        Computes several data structures which will be used later
        Parameters:
            inputs  = list of input txos
            outputs = list of output txos
        '''
        # Prepares data related to the input txos
        self.inputs,\
        self._all_in_agg,\
        self._all_in_agg_val = self._prepare_txos(self.inputs)

        # Prepares data related to the output txos
        self.outputs,\
        self._all_out_agg,\
        self._all_out_agg_val = self._prepare_txos(self.outputs)

    def _prepare_txos(self, txos):
        '''
        Computes several data structures related to a list of txos
        Returns:
            list of txos sorted by decreasing values
            array of aggregates (combinations of txos) in binary format
            array of values associated to the aggregates
        Parameters:
            txos = list of txos (list of tuples (id, value))
        '''
        # Removes txos with null value
        txos = filter(lambda x: x[1] > 0, txos)

        # Orders txos by value
        txos = sorted(txos, key=lambda tup: tup[1], reverse=True)

        # Creates a 1D array of values
        vals = [e[1] for _, e in enumerate(txos)]
        all_val = np.array(vals, dtype='int64')

        # Computes all possible combinations of txos encoded in binary format
        expnt = len(txos)
        shape = (expnt, 2**expnt)
        all_agg = np.zeros(shape, dtype=bool)
        base = np.array([0, 1], dtype=bool)
        for j in range(0, expnt):
            two_exp_j = 2**j
            tmp = np.repeat(base, two_exp_j)
            all_agg[j, :] = np.tile(tmp, 2**(expnt - 1) // two_exp_j)
        # all_agg = np.arange(2**expnt) >> np.arange(expnt)[::, np.newaxis] & 1

        # Computes values of aggregates
        all_agg_val = np.dot(all_val, all_agg)

        # Returns computed data structures
        return txos, all_agg, all_agg_val

    '''
    PROCESSING OF AGGREGATES
    '''
    def _match_agg_by_val(self):
        '''
        Matches input/output aggregates by values and returns a bunch of data structs
        '''
        self._all_match_in_agg = SortedList()
        self._match_in_agg_to_val = defaultdict(int)
        self._val_to_match_out_agg = defaultdict(set)

        # Gets unique values of input / output aggregates
        all_unique_in_agg_val, _ = np.unique(self._all_in_agg_val, return_inverse=True)
        all_unique_out_agg_val, _ = np.unique(self._all_out_agg_val, return_inverse=True)

        # Computes total fees paid/received by taker/maker
        if self._has_intrafees:
            fees_taker = self._fees + self._fees_taker
            fees_maker = -self._fees_maker  # doesn't take into account tx fees paid by makers

        # Finds input and output aggregates with matching values
        for in_agg_val in np.nditer(all_unique_in_agg_val):
            val = int(in_agg_val)

            for out_agg_val in np.nditer(all_unique_out_agg_val):
                diff = in_agg_val - out_agg_val

                if (not self._has_intrafees) and (diff < 0):
                    break
                else:
                    # Computes conditions required for a matching
                    cond_no_intrafees = (not self._has_intrafees) and diff <= self._fees
                    cond_intrafees = self._has_intrafees and \
                        ((diff <= 0 and diff >= fees_maker) or (diff >= 0 and diff <= fees_taker))

                    if cond_no_intrafees or cond_intrafees:
                        # Registers the matching input aggregate
                        match_in_agg = np.where(self._all_in_agg_val == in_agg_val)[0]

                        for in_idx in match_in_agg:
                            if in_idx not in self._all_match_in_agg:
                                self._all_match_in_agg.add(in_idx)
                                self._match_in_agg_to_val[in_idx] = val

                        # Registers the matching output aggregate
                        match_out_agg = np.where(self._all_out_agg_val == out_agg_val)[0]
                        self._val_to_match_out_agg[val].update(match_out_agg.tolist())

    def _compute_in_agg_cmbn(self):
        '''
        Computes a matrix of valid combinations (pairs) of input aggregates
        Returns a dictionary (parent_agg => (child_agg1, child_agg2))
        We have a valid combination (agg1, agg2) if:
            R1/ child_agg1 & child_agg2 = 0 (no bitwise overlap)
            R2/ child_agg1 > child_agg2 (matrix is symmetric)
        '''
        aggs = self._all_match_in_agg[1:-1]
        tgt = self._all_match_in_agg[-1]
        mat = defaultdict(list)
        saggs = set(aggs)
        for i in range(0, tgt + 1):
            if i in saggs:
                j_max = min(i, tgt - i + 1)
                for j in range(0, j_max):
                    if (i & j == 0) and (j in saggs):
                        mat[i + j].append((i, j))
        self._mat_in_agg_cmbn = mat

    '''
    COMPUTATION OF LINKS BETWEEN TXOS
    '''
    def _check_dtrm_links(self):
        '''
        Checks the existence of deterministic links between inputs and outputs
        Returns a list of tuples (idx_output, idx_input) and a list of tuples (id_output, id_input)
        '''
        nb_ins = len(self.inputs)
        nb_outs = len(self.outputs)

        shape = (nb_outs, nb_ins)
        mat_cmbn = np.zeros(shape, dtype=np.int64)

        shape = (1, nb_ins)
        in_cmbn = np.zeros(shape, dtype=np.int64)

        # Computes a matrix storing numbers of raw combinations matching input/output pairs
        # Also computes sum of combinations along inputs axis to get the number of combinations
        for (in_idx, val) in self._match_in_agg_to_val.items():
            for out_idx in self._val_to_match_out_agg[val]:
                mat_cmbn += self._get_link_cmbn(in_idx, out_idx)
                in_cmbn += self._all_in_agg[:, in_idx][np.newaxis, :]

        # Builds a list of sets storing inputs having a deterministic link with an output
        nb_cmbn = in_cmbn[0, 0]
        dtrm_rows, dtrm_cols = np.where(mat_cmbn == nb_cmbn)
        dtrm_coords = list(zip(dtrm_rows, dtrm_cols))
        dtrm_aggs = [(self.outputs[o][0], self.inputs[i][0]) for (o, i) in dtrm_coords]
        return dtrm_coords, dtrm_aggs

    def _compute_link_matrix(self):
        '''
        Computes the linkability matrix
        Returns the number of possible combinations and the links matrix
        Implements a depth-first traversal of the inputs combinations tree (right to left)
        For each input combination we compute the matching output combinations.
        This is a basic brute-force solution. Will have to find a better method later.
        '''
        nb_tx_cmbn = 0
        itgt = 2 ** len(self.inputs) - 1
        otgt = 2 ** len(self.outputs) - 1
        d_links = defaultdict(int)

        # Initializes a stack of tasks & sets the initial task
        #   0: index used to resume the processing of the task (required for depth-first algorithm)
        #   1: il = left input aggregate
        #   2: ir = right input aggregate
        #   3: d_out = outputs combination matching with current input combination
        #              dictionary of dictionary : { or => { ol => (nb_parents_cmbn, nb_children_cmbn) } }
        stack = deque()
        ini_d_out = defaultdict(dict)
        ini_d_out[otgt] = {0: (1, 0)}
        stack.append((0, 0, itgt, ini_d_out))

        # Sets start date/hour
        start_time = datetime.now()

        # Iterates over all valid inputs combinations (top->down)
        while len(stack) > 0:
            # Checks duration
            curr_time = datetime.now()
            delta_time = curr_time - start_time
            if delta_time.total_seconds() >= self._max_duration:
                return 0, None

            # Gets data from task
            t = stack[-1]
            idx_il = t[0]
            il = t[1]
            ir = t[2]
            d_out = t[3]
            n_idx_il = idx_il

            # Gets all valid decompositions of right input aggregate
            ircs = self._mat_in_agg_cmbn[ir]
            len_ircs = len(ircs)

            for i in range(idx_il, len_ircs):
                n_idx_il = i
                n_d_out = defaultdict(dict)

                # Gets left input sub-aggregate (column from ircs)
                n_il = ircs[i][1]

                # Checks if we must process this pair (columns from ircs are sorted in decreasing order)
                if n_il > il:
                    # Gets the right input sub-aggregate (row from ircs)
                    n_ir = ircs[i][0]

                    # Iterates over outputs combinations previously found
                    for o_r in d_out:
                        sol = otgt - o_r

                        # Computes the number of parent combinations
                        nb_prt = sum([s[0] for s in d_out[o_r].values()])

                        # Iterates over output sub-aggregates matching with left input sub-aggregate
                        val_il = self._match_in_agg_to_val[n_il]
                        for n_ol in self._val_to_match_out_agg[val_il]:
                            # Checks compatibility of output sub-aggregate with left part of output combination
                            if (sol & n_ol == 0):
                                # Computes:
                                #   the sum corresponding to the left part of the output combination
                                #   the complementary right output sub-aggregate
                                n_sol = sol + n_ol
                                n_or = otgt - n_sol

                                # Checks if the right output sub-aggregate is valid
                                val_ir = self._match_in_agg_to_val[n_ir]
                                match_out_agg = self._val_to_match_out_agg[val_ir]

                                # Adds this output combination into n_d_out if all conditions met
                                if (n_sol & n_or == 0) and (n_or in match_out_agg):
                                    n_d_out[n_or][n_ol] = (nb_prt, 0)

                    # Updates idx_il for the current task
                    stack[-1] = (i + 1, il, ir, d_out)

                    # Pushes a new task which will decompose the right input aggregate
                    stack.append((0, n_il, n_ir, n_d_out))

                    # Executes the new task (depth-first)
                    break
                else:
                    # No more results for il, triggers a break and a pop
                    n_idx_il = len_ircs
                    break

            # Checks if task has completed
            if n_idx_il > len_ircs - 1:
                # Pops the current task
                t = stack.pop()
                il = t[1]
                ir = t[2]
                d_out = t[3]

                # Checks if it's the root task
                if len(stack) == 0:
                    # Retrieves the number of combinations from root task
                    nb_tx_cmbn = d_out[otgt][0][1]
                else:
                    # Gets parent task
                    p_t = stack[-1]
                    p_d_out = p_t[3]

                    # Iterates over all entries from d_out
                    for (o_r, l_ol) in d_out.items():
                        r_key = (ir, o_r)

                        # Iterates over all left aggregates
                        for (ol, (nb_prnt, nb_chld)) in l_ol.items():
                            l_key = (il, ol)

                            # Updates the dictionary of links for the pair of aggregates
                            nb_occur = nb_chld + 1
                            d_links[r_key] += nb_prnt
                            d_links[l_key] += nb_prnt * nb_occur

                            # Updates parent d_out by back-propagating number of child combinations
                            p_or = ol + o_r
                            p_l_ol = p_d_out[p_or]
                            for (p_ol, (p_nb_prt, p_nb_chld)) in p_l_ol.items():
                                p_d_out[p_or][p_ol] = (p_nb_prt, p_nb_chld + nb_occur)

        # Fills the matrix
        links = self._get_link_cmbn(itgt, otgt)
        nb_tx_cmbn += 1

        for (lnk, mult) in d_links.items():
            links = links + self._get_link_cmbn(lnk[0], lnk[1]) * mult

        return nb_tx_cmbn, links

    def _get_link_cmbn(self, in_agg, out_agg):
        '''
        Computes a linkability matrix encoding the matching of given input/output aggregates
        Returns a numpy array
        Parameters:
            in_agg  = input aggregate
            out_agg = output aggregate
        '''
        vouts = self._all_out_agg[:, out_agg][:, np.newaxis]
        vins = self._all_in_agg[:, in_agg][np.newaxis, :]
        return np.dot(vouts, vins)

    '''
    PACKING/UNPACKING OF LINKED TXOS
    '''
    def _pack_linked_txos(self, linked_txos):
        '''
        Packs input txos which are known as being controlled by a same entity
        Parameters:
            linked_txos = list of sets storing linked input txos. Each txo is identified by its "id"
        '''
        idx = len(self._packs)

        # Merges packs sharing common elements
        packs = merge_sets(linked_txos)

        for pack in packs:
            ins = []
            val_ins = 0

            for i in self.inputs:
                if i[0] in pack:
                    ins.append(i)
                    val_ins += i[1]

            idx += 1
            if len(ins) > 0:
                lbl = '%s_I%i' % (self.PACK, idx)
                inp = (lbl, val_ins)
                self.inputs.append(inp)
                in_pack = (lbl, val_ins, 'INPUTS', ins, [])
                self._packs.append(in_pack)
                [self.inputs.remove(v) for v in ins]

    def _unpack_link_matrix(self, mat_lnk, nb_cmbn):
        '''
        Unpacks linked txos in the linkability matrix
        Returns the unpacked matrix
        Parameters:
            mat_lnk = linkability matrix to be unpacked
            nb_cmbn = number of combinations associated to the linkability matrix
        '''
        mat_res = mat_lnk
        nb_cmbn = max(1, nb_cmbn)

        for (pack, val, lctn, ins, outs) in reversed(self._packs):
            if lctn == 'INPUTS':
                key = (pack, val)
                idx = self.inputs.index(key)
                if mat_lnk is not None:
                    nb_ins = len(ins)
                    nb_outs = len(self.outputs)
                    # Inserts columns into the matrix for packed inputs
                    shape = (nb_outs, nb_ins)
                    vals = np.zeros(shape, dtype=np.int64)
                    vals += mat_res[:, idx][:, np.newaxis]
                    mat_res = np.hstack((mat_res[:, 0:idx], vals, mat_res[:, idx + 1:]))
                # Inserts unpacked inputs into the list of inputs
                self.inputs[idx:idx + 1] = ins

            elif lctn == 'OUTPUTS':
                key = (pack, val)
                idx = self.outputs.index(key)
                if mat_lnk is not None:
                    nb_ins = len(self.inputs)
                    nb_outs = len(outs)
                    # Inserts rows into the matrix for packed outputs
                    shape = (nb_outs, nb_ins)
                    vals = np.zeros(shape, dtype=np.int64)
                    vals += mat_res[idx, :][np.newaxis, :]
                    mat_res = np.vstack((mat_res[0:idx, :], vals, mat_res[idx + 1:, :]))
                # Inserts unpacked outputs into the list of outputs
                self.outputs[idx:idx + 1] = outs

        return mat_res

    '''
    LIMITS
    '''
    def _check_limit_ok(self, mode):
        len_in = len(self.inputs)
        len_out = len(self.outputs)
        max_card = max(len_in, len_out)
        return True if (max_card <= self.max_txos) else False
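# A hedged usage sketch for the class above, assuming the surrounding module provides the
# imports the class relies on (numpy, collections, sortedcontainers, merge_sets). The txo
# ids and amounts are made up; the unpacking mirrors what process() documents
# (linkability matrix, combination count, sorted inputs, sorted outputs).
inputs = [('I0', 10_000_000), ('I1', 9_000_000)]
outputs = [('O0', 10_000_000), ('O1', 8_990_000)]

linker = TxosLinker(inputs=inputs, outputs=outputs, fees=10_000)
mat_lnk, nb_cmbn, sorted_ins, sorted_outs = linker.process(
    options=[TxosLinker.LINKABILITY, TxosLinker.PRECHECK])
print(nb_cmbn)   # number of valid combinations found for this transaction
print(mat_lnk)   # rows = outputs, columns = inputs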
def scheduling_method(self, cur_time, es, es_dict):
    """
    This function must map the queued events to available nodes at the current time.

    :param cur_time: current time
    :param es_dict: dictionary with full data of the events
    :param es: events to be scheduled
    :param debug: Flag to debug

    :return a tuple of (time to schedule, event id, list of assigned nodes)
    """
    resource_types = self.resource_manager.resource_types
    avl_resources = self.resource_manager.current_availability

    #=======================================================================
    # Considered queued jobs: jobs that can be fitted in the current system state and
    # whose number does not exceed q_length.
    # If a job_obj cannot be fitted or exceeds the q_length, it is directly loaded into
    # the dispatching decision using the no-solution dispatching tuple.
    #=======================================================================
    priorized_jobs = SortedListWithKey(key=lambda job_tuple: job_tuple[1])
    current_qjobs = SortedList()

    #===================================================================
    # Here, if a previous non-dispatching state is set, the current system capacity
    # is verified; if it differs (more resources available than before) the dispatcher is called.
    # Otherwise, a non-dispatching decision is returned.
    #===================================================================
    # Dispatching skip
    dispatch = True
    prev_qjobs = None

    # Dispatching skip
    if self.non_dispatched_state:
        dispatch = False
        (prev_qjobs, prev_total_resource_usage,) = self.non_dispatched_state
        new_jobs = False
        for e in es:
            if not (e.id in prev_qjobs):
                new_jobs = True
                self.non_dispatched_state = None
                break
        if not new_jobs:
            cur_total_resource_usage = self.resource_manager._resources.usage('dict')
            zero_usage = []
            same_usage = []
            for res in resource_types:
                zero_usage.append(cur_total_resource_usage[res] == 0)
                same_usage.append(cur_total_resource_usage[res] >= prev_total_resource_usage[res])
            if all(zero_usage):
                # The system is empty
                self.non_dispatched_state = None
            elif all(same_usage):
                # The system has the same or less capacity wrt the stuck state
                return [self.dispatching_tuple(e.id) for e in es], []
            else:
                # The system is not empty but has more capacity wrt the stuck state
                self.non_dispatched_state = None

    cons_qjobs = {}
    max_ewt = max([self.get_ewt(job.queue) for job in es]
                  + [self.get_ewt(es_dict[job_id]) for job_id in self.resource_manager.current_allocations])
    for node in self.resource_manager.node_names:
        avl_res = avl_resources[node]
        for idx, job_obj in enumerate(es):
            job_id = job_obj.id
            if not (job_id in cons_qjobs):
                current_qjobs.add(job_id)
                cons_qjobs[job_id] = [False, 0, {}, None]
                priorized_jobs.add((job_id, self._job_priority_ewt(job_obj, cur_time, max_ewt)))
            possibilities = self._joint_nodes(job_obj, avl_res)
            if possibilities > 0:
                cons_qjobs[job_id][2][node] = min(possibilities, job_obj.requested_nodes)
                cons_qjobs[job_id][1] += possibilities
                if cons_qjobs[job_id][1] >= job_obj.requested_nodes:
                    cons_qjobs[job_id][0] = True
                    if not cons_qjobs[job_id][3]:
                        cons_qjobs[job_id][3] = job_obj

    qjobs = 0
    wc_makespan = 0
    makespans = []
    remaining_priorized_jobs = []

    # Jobs of the dispatching decision
    decision_jobs = {}

    for job_id, _ in priorized_jobs:
        t = cons_qjobs[job_id]
        if not t[0] or qjobs > self.cur_q_length - 1:
            decision_jobs[job_id] = self.dispatching_tuple(job_id)
            cons_qjobs.pop(job_id)
        else:
            exp_duration = max(1, t[-1].expected_duration)
            wc_makespan += exp_duration
            makespans.append(exp_duration)
            qjobs += 1
            remaining_priorized_jobs.append(job_id)

    #=======================================================================
    # There are no jobs to dispatch at the current system state.
    # Then a no-solution list is returned.
    #=======================================================================
    if not cons_qjobs:
        # Job dispatching skip
        cur_total_resource_usage = self.resource_manager._resources.usage('dict')
        self.non_dispatched_state = (current_qjobs, cur_total_resource_usage,)
        return decision_jobs.values(), []

    #=======================================================================
    # After an unsuccessful dispatching
    #=======================================================================
    if self.use_max_timelimit:
        timelimit = self.timelimit
    else:
        timelimit = self.initial_timelimit

    a_jobs_list = []
    best_z_list = []
    solved = False
    self.priorized_jobs = None
    prev_sched = []

    while timelimit <= self.timelimit:
        schedalloc_plan = {}
        args = (schedalloc_plan, cur_time, cons_qjobs, remaining_priorized_jobs, es_dict,
                resource_types, avl_resources)
        kwargs = {'timelimit': timelimit, 'prev_sched': prev_sched}
        function = getattr(self, 'cp_model')
        function(*args, **kwargs)

        solver_state = schedalloc_plan.pop('solver_state')
        best_z = schedalloc_plan.pop('best_z')
        best_z_list.append(best_z)
        if solver_state == self.SolverState.PROBLEM_INFEASIBLE:
            break
        limit_reached = schedalloc_plan.pop('limit_reached')

        disp_jobs = 0
        prev_sched = []
        for stime, job_id, _ in schedalloc_plan.values():
            if stime == cur_time:
                prev_sched.append(job_id)
                disp_jobs += 1

        if disp_jobs == len(cons_qjobs) and solver_state == self.SolverState.NO_MORE_SOLUTIONS.value and not limit_reached:
            solved = True
            break
        elif disp_jobs < len(cons_qjobs) and solver_state == self.SolverState.NO_MORE_SOLUTIONS.value and not limit_reached:
            solved = True
            break
        elif len(best_z_list) >= self.max_k and all([best_z_list[-1] == b for b in best_z_list[-self.max_k:]]):
            solved = True
            break
        else:
            a_jobs_list.append(disp_jobs)
            timelimit *= 2

    self.priorized_jobs = None

    # This is useful for print and also to create the unsuccessful data
    dispatched_jobs = 0
    queued_job_ids = []
    for a in schedalloc_plan:
        if a[2]:
            dispatched_jobs += 1
        if dispatched_jobs == 0:
            queued_job_ids.append(a[1])

    if self.reduce_job_length:
        #===================================================================
        # The number of jobs considered in the next scheduling decision is halved
        # if the current problem instance was not solved or if the current usage is
        # leq that of the previous time point. After a successful dispatching this value is reset.
        # The minimum is 1, otherwise there would be nothing to dispatch.
        #===================================================================
        if not solved:
            self.cur_q_length = max(1, self.cur_q_length // 2)
        else:
            self.cur_q_length = self.q_length

    if dispatched_jobs == 0:
        self.non_dispatched_state = (current_qjobs, self.resource_manager._resources.usage('dict'),)
    else:
        self.non_dispatched_state = None

    return list(schedalloc_plan.values()) + list(decision_jobs.values()), []
def scheduling_method(self, cur_time, es, es_dict):
    """
    This function must map the queued events to available nodes at the current time.

    :param cur_time: current time
    :param es_dict: dictionary with full data of the events
    :param es: events to be scheduled
    :param debug: Flag to debug

    :return a tuple of (time to schedule, event id, list of assigned nodes)
    """
    dispatching_plan = []

    resource_types = self.resource_manager.resource_types
    avl_resources = self.resource_manager.current_availability
    system_capacity = self.resource_manager.system_capacity('nodes')

    # =======================================================================
    # Considered queued jobs: jobs that can be fitted in the current system state and
    # whose number does not exceed q_length.
    # If a job_obj cannot be fitted or exceeds the q_length, it is directly loaded into
    # the dispatching decision using the no-solution dispatching tuple.
    # =======================================================================
    priorized_jobs = SortedListWithKey(key=lambda job_tuple: job_tuple[1])

    current_qjobs = SortedList()

    cons_qjobs = {}
    for node in self.resource_manager.node_names:
        avl_res = avl_resources[node]
        # avl_res = system_capacity[node]

        for idx, job_obj in enumerate(es):
            job_id = job_obj.id

            if not (job_id in cons_qjobs):
                current_qjobs.add(job_id)
                cons_qjobs[job_id] = [False, 0, {}, None]
                priorized_jobs.add((job_id, self._job_priority_slowdown(job_obj, cur_time)))

            if self._reduced_model:
                possibilities = self._joint_nodes(job_obj, avl_res)
                if possibilities > 0:
                    cons_qjobs[job_id][2][node] = min(possibilities, job_obj.requested_nodes)
                    cons_qjobs[job_id][1] += possibilities
                    if cons_qjobs[job_id][1] >= job_obj.requested_nodes:
                        cons_qjobs[job_id][0] = True
                        if not cons_qjobs[job_id][3]:
                            cons_qjobs[job_id][3] = job_obj
            else:
                cons_qjobs[job_id][0] = True
                cons_qjobs[job_id][1] = None
                cons_qjobs[job_id][2] = None
                cons_qjobs[job_id][3] = job_obj

    qjobs = 0
    wc_makespan = 0
    makespans = []
    selected_priorized_jobs = []

    # Jobs of the dispatching decision
    decision_jobs = {}

    if self._reduced_model:
        for job_id, _ in priorized_jobs:
            t = cons_qjobs[job_id]
            if not t[0] or qjobs > self._cur_q_length - 1:
                decision_jobs[job_id] = self.dispatching_tuple(job_id)
                cons_qjobs.pop(job_id)
            else:
                exp_duration = max(1, t[-1].expected_duration)
                wc_makespan += exp_duration
                makespans.append(exp_duration)
                qjobs += 1
                selected_priorized_jobs.append(job_id)
    else:
        cannot_start_selected = 0
        for job_id, _ in priorized_jobs:
            t = cons_qjobs[job_id]
            if (not t[0] and cannot_start_selected >= self._considered_cannot_start) or (
                    qjobs > self._cur_q_length - 1):
                decision_jobs[job_id] = self.dispatching_tuple(job_id)
                cons_qjobs.pop(job_id)
            else:
                if not t[0]:
                    cons_qjobs[job_id][3] = es_dict[job_id]
                    cannot_start_selected += 1
                exp_duration = max(1, t[-1].expected_duration)
                wc_makespan += exp_duration  # , self.get_queue(t[-1].queue))  # exp_duration
                makespans.append(exp_duration)
                qjobs += 1
                selected_priorized_jobs.append(job_id)

    # =======================================================================
    # There are no jobs to dispatch at the current system state.
    # Then a no-solution list is returned.
    # =======================================================================
    if not cons_qjobs:
        # Job dispatching skip
        return decision_jobs.values(), []

    solved = False
    self.priorized_jobs = None

    if self._safe:
        manager = mp_dill.Manager()
        schedule_plan = manager.dict()
        process_class = mp_dill.Process

        p = process_class(target=getattr(self, 'cp_model'),
                          args=(schedule_plan, cur_time, cons_qjobs, selected_priorized_jobs,
                                es_dict, resource_types, avl_resources),
                          kwargs={'timelimit': timelimit})
        p.start()
        p.join()

        if p.exitcode != 0:
            schedule_plan.pop('solver_state', None)
            schedule_plan.pop('limit_reached', None)
            return list(decision_jobs.values()) \
                + [self.dispatching_tuple(job_id, start_time, nodes)
                   for (start_time, job_id, nodes) in schedule_plan.values()] \
                + [self.dispatching_tuple(job_id, None, [])
                   for job_id in cons_qjobs if not (job_id in schedule_plan)], []
    else:
        schedule_plan = {}
        args = (schedule_plan, cur_time, cons_qjobs, selected_priorized_jobs, es_dict,
                resource_types, avl_resources)
        kwargs = {'max_timelimit': self._max_timelimit}
        function = getattr(self, 'cp_model')
        function(*args, **kwargs)

    solved = schedule_plan.pop('solved')
    of_value = schedule_plan.pop('of_value')
    walltime = schedule_plan.pop('walltime')
    proc_time = schedule_plan.pop('proc_time')
    incurred_time = walltime + proc_time
    failures = schedule_plan.pop('failures')
    branches = schedule_plan.pop('branches')
    p = None
    self.priorized_jobs = None
    dispatching_plan = list(schedule_plan.values())
    self.__instance_data = (solved, of_value, walltime, incurred_time, failures, branches,
                            dispatching_plan + list(decision_jobs.values()),)

    # This is useful for print and also to create the unsuccessful data
    dispatched_jobs = 0
    queued_job_ids = []
    for a in dispatching_plan:
        if a[2]:
            dispatched_jobs += 1
        if dispatched_jobs == 0:
            queued_job_ids.append(a[1])

    if self._reduce_job_length:
        # ===================================================================
        # The number of jobs considered in the next scheduling decision is halved
        # if the current problem instance was not solved or if the current usage is
        # leq that of the previous time point. After a successful dispatching this value is reset.
        # The minimum is 1, otherwise there would be nothing to dispatch.
        # ===================================================================
        if not solved:
            self._cur_q_length = max(1, min(self._cur_q_length, len(schedule_plan)) // 2)
            # max(1, self._cur_q_length // 2)
        else:
            self._cur_q_length = self._q_length

    print('{} - {}: Queued {}, Dispatched {}, Running {}. {}'.format(
        self._counter, cur_time, len(es) - dispatched_jobs, dispatched_jobs,
        len(self.resource_manager.current_allocations), self.resource_manager.current_usage))

    return dispatching_plan + list(decision_jobs.values()), []
import math

from sortedcontainers.sortedlist import SortedList

import automata as atma
from automata.AnmalZoo.anml_zoo import anml_path, AnmalZoo

fcb_size = 256  # size of the local switches. we assume GS has also the same size
fcb_to_gs = 16  # number of wires from local switches to GS
bigest_component_size = fcb_size / fcb_to_gs * fcb_size

ds = [a for a in AnmalZoo]

for uat in ds:
    r = SortedList()
    big128, big256 = 0, 0

    automatas = atma.parse_anml_file(anml_path[uat])
    automatas.remove_ors()
    automatas = automatas.get_connected_components_as_automatas()

    for atm in automatas:
        nc = atm.nodes_count
        if nc >= 128:
            big128 += 1
        if nc >= 256:
            big256 += 1
        if nc > bigest_component_size:
            print("this NFA can not be fit:", uat)
            break
def MedCombiner2(intermediates):
    """
    The Running Medians Reducer merges the intermediate lists that are packed inside the
    outer list, intermediates, into one master flat list.
    :rtype : object  master list of the final results
    :param intermediates: list of lists of the running medians of each input text file
    :return: the final results
    """
    # master list of the final results
    linesWordCount = []

    # iterating over the sub lists for each input file to concatenate them into a master list
    # for v in intermediates:
    #     linesWordCount += v
    # print(intermediates)

    resultDict = defaultdict(list)

    # the following loop iterates over the key/value pairs of the first dictionary, then over the
    # pairs of the next dictionary, and continues until it has visited all dictionaries in
    # intermediates. While iterating, a new dictionary, resultDict, collects all the pairs of the
    # intermediate dictionaries, thus effectively merging them.
    # i = 0
    for d in intermediates:
        # print(d)
        for k, v in dict(d).items():
            resultDict[k] = v
    # for k, l in chain(*intermediates):
    #     resultDict[k] = l
    # print("resultedDict ", resultDict)

    # the intermediate lists are concatenated in the order of their keys to rebuild the
    # master list of word counts
    sortedKeys = sorted(resultDict, key=lambda k: k, reverse=False)
    for k in sortedKeys:
        linesWordCount.extend(resultDict[k])
    # print("linesWordCount ", linesWordCount)

    medianNumbers = []
    # a sorted list to hold the word counts for input lines
    # the sorted list boosts performance substantially when computing the running median because this
    # does not require resorting the wordcount list every time we add an entry to it.
    sortedLinesWordCount = SortedList()
    lineNO = 0

    # running median calculations
    # because a sorted list holds the wordcounts of lines, it is straightforward to compute
    # the running median
    for wordcount in linesWordCount:
        sortedLinesWordCount.add(wordcount)
        # print(sortedLinesWordCount)
        index = int(lineNO / 2)
        if lineNO % 2 == 0:
            medianNumbers.append(float(sortedLinesWordCount[index]))
        else:
            medianNumbers.append(float((sortedLinesWordCount[index] + sortedLinesWordCount[index + 1]) / 2))
        lineNO += 1
        # print(medianNumbers)
    # print(medianNumbers)
    return medianNumbers
import gc
import os

import feather
import numpy as np
import pandas as pd
import ujson
from sortedcontainers.sortedlist import SortedList
from tqdm import tqdm

data_dir = '../data'
print('Reformats data from:', data_dir)
files = os.listdir(data_dir + '/json/')  # noqa
snap_files = SortedList([filename for filename in files if 'snaps' in filename],
                        key=lambda fn: pd.to_datetime(fn[:-11], format='%d_%m_%Y_%H_%M_%S'))
try:
    os.makedirs(data_dir + '/snap_json/')
except FileExistsError:
    pass

for snapfile in tqdm(snap_files):
    with open(data_dir + '/json/' + snapfile, 'r') as f:
        snaps = f.readlines()
    for snap in snaps:
        try:
            snap = ujson.loads(snap)
            try:
                seq = snap['sequence']
                with open(data_dir + '/snap_json/snap_' + str(seq) + '.json', 'w') as snapf:
                    ujson.dump(snap, snapf)
            except KeyError:
                # assumed handler: the original snippet is truncated here; snapshots
                # without a sequence number are skipped
                continue
        except ValueError:
            # assumed handler: the original snippet is truncated here; malformed JSON
            # lines are skipped
            continue