def eval_recall5(imgs_enc, caps_enc):
    imgs_enc = np.vstack(flatten(imgs_enc))
    caps_enc = np.vstack(flatten(caps_enc))
    res = avg_recall5(imgs_enc, caps_enc)
    return res
def test_flatten(self):
    ref = ['foo', 'bar', '123']
    result = utils.flatten(ref)
    self.assertEqual(ref, result)
    result = utils.flatten(['foo', ['bar', '123']])
    self.assertEqual(ref, result)
    result = utils.flatten([['foo'], ['bar', '123']])
    self.assertEqual(ref, result)
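# A minimal sketch of the flatten helper that the test above pins down (not
# necessarily the project's own implementation): it recursively flattens
# arbitrarily nested lists into a single flat list.
def flatten(items):
    flat = []
    for item in items:
        if isinstance(item, list):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat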
def computeEvolutionRoc(temporalListLabels, predictions, classes=None, percentage=0.001):
    """
        Computes the evolution of the AUC (with Wilson confidence bounds) over time

        Arguments:
            temporalListLabels {List of (time, labels)*} -- Ground truth labels
            predictions {Dict / List of labels} -- Predictions (same format as labels in temporalListLabels)
            classes {Dict} -- Classes to consider to plot (key: Name to display, Value: label)
            percentage {float} -- Evaluate the TPR and TNR at this given value of FNR and FPR
    """
    aucs = {}
    for time, labels in temporalListLabels:
        pred_time, labels_time = selection(predictions, labels, classes)
        pred_time, labels_time = flatten(pred_time, labels_time)
        fpr, tpr, _ = roc_curve(labels_time, pred_time)
        fnr, tnr = (1 - tpr)[::-1], (1 - fpr)[::-1]
        auc_time = auc(fpr, tpr)

        wilson_tpr = 1.96 * np.sqrt(tpr * (1 - tpr) / len(predictions))
        wilson_tnr = 1.96 * np.sqrt(tnr * (1 - tnr) / len(predictions))

        aucs[time] = {
            "auc": auc_time,
            "lower": auc(fpr, tpr - wilson_tpr),
            "upper": auc(fpr, tpr + wilson_tpr),
            "tpr": np.interp(percentage, fpr, tpr),
            "tpr_wilson": np.interp(percentage, fpr, wilson_tpr),
            "tnr": np.interp(percentage, fnr, tnr),
            "tnr_wilson": np.interp(percentage, fnr, wilson_tnr),
        }

    return pd.DataFrame.from_dict(aucs, orient="index")
def fcn_G(input_dim, nn, imgsz, channels, requires_grad, depth=2):
    def gen_block_params(ni, no):
        return {'fc': utils.linear_params(ni, no)}

    def gen_group_params(ni, no, count):
        return {'block%d' % i: gen_block_params(ni if i == 0 else no, no)
                for i in range(count)}

    flat_params = utils.cast(utils.flatten({
        'group0': gen_group_params(input_dim, nn, depth),
        'last_proj': utils.linear_params(nn, imgsz*imgsz*channels),
    }))

    if requires_grad:
        utils.set_requires_grad_except_bn_(flat_params)

    def block(x, params, base, mode):
        return F.relu(F.linear(x, params[base+'.fc.weight'], params[base+'.fc.bias']), inplace=True)

    def group(o, params, base, mode):
        for i in range(depth):
            o = block(o, params, '%s.block%d' % (base, i), mode)
        return o

    def f(input, params, mode):
        o = group(input, params, 'group0', mode)
        o = F.linear(o, params['last_proj.weight'], params['last_proj.bias'])
        o = torch.tanh(o)
        # o = o.view(o.size(0), channels, imgsz, imgsz)
        o = o.reshape(o.size(0), channels, imgsz, imgsz)
        return o

    return f, flat_params
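# Hypothetical usage of fcn_G above (names and sizes chosen for illustration):
# build the functional generator, then map a batch of latent vectors to images.
# The mode flag is the train/eval switch threaded through to batch norm in the
# companion models; this fully-connected generator ignores it. The input batch
# must live on the same device/dtype that utils.cast gives the parameters.
f, params = fcn_G(input_dim=100, nn=256, imgsz=32, channels=3, requires_grad=True)
z = torch.randn(8, 100)
fake_images = f(z, params, mode=True)   # expected shape: (8, 3, 32, 32)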
def graph_loss_tols(graph, i, o, filename=None):
    """
        Gets all configurations graph pathfinding is loss tolerant to.
        Exports output to file, if provided.
    """
    print('\n graph_loss_tols \n')
    nodes = set(graph.nodes()) - set([i, o])
    all_tols = [[]]
    loss_tol_nodes = set(nodes)
    for r in range(1, len(nodes) + 1):
        loss_configs = list(it.combinations(loss_tol_nodes, r))
        loss_tol_nodes = set()
        while loss_configs:
            loss_config = loss_configs.pop()
            lost_nodes = \
                set(loss_config) | \
                set(flatten(map(graph.neighbors, loss_config)))
            loss_graph = deepcopy(graph)
            loss_graph.remove_nodes_from(lost_nodes)
            if i in loss_graph.nodes() and o in loss_graph.nodes() and \
                    nx.has_path(loss_graph, i, o):
                loss_tol_nodes |= set(loss_config)
                all_tols.append(list(loss_config))
    if filename:
        with open(filename, 'w') as fp:
            json.dump(all_tols, fp)
    return all_tols
def histPlot(predictions, truth, classes=None, label="Model", newFigure=None, splitPosNeg=False, kde=False):
    """
        Computes the histogram of binary predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot (default: {None} ie {"+":1, "-":0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            splitPosNeg {bool} -- Split between positive and negative (default: {False})
            kde {bool} -- Computes the kde of the histogram (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    bins = np.linspace(0, 1, 20)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Predicted Probability')
        plt.ylabel('Frequency')
        plt.title('Histogram Probabilities')

    if splitPosNeg:
        sns.distplot(predictions[truth == 1], label=label + " Positive", kde=kde, bins=bins)
        sns.distplot(predictions[truth == 0], label=label + " Negative", kde=kde, bins=bins)
    else:
        sns.distplot(predictions, label=label, kde=kde, bins=bins)
def dcganx_D(nn0, imgsz,
             channels,   # 1: gray-scale, 3: color
             norm_type,  # 'bn', 'none'
             requires_grad,
             depth=3, leaky_slope=0.2, nodemul=2, do_bias=True):

    ker = 5; padding = 2

    def gen_block_params(ni, no, k):
        return {
            'conv0': conv2d_params(ni, no, k, do_bias),
            'conv1': conv2d_params(no, no, 1, do_bias),
            'bn0': utils.bnparams(no) if norm_type == 'bn' else None,
            'bn1': utils.bnparams(no) if norm_type == 'bn' else None
        }

    def gen_group_params(ni, no, count):
        return {'block%d' % i: gen_block_params(ni if i == 0 else no, no, ker)
                for i in range(count)}

    count = 1
    sz = imgsz // (2**depth)
    nn = nn0
    p = {'conv0': conv2d_params(channels, nn0, ker, do_bias)}
    for d in range(depth-1):
        p['group%d' % d] = gen_group_params(nn, nn*nodemul, count)
        nn = nn*nodemul
    p['fc'] = utils.linear_params(sz*sz*nn, 1)
    flat_params = utils.cast(utils.flatten(p))

    if requires_grad:
        utils.set_requires_grad_except_bn_(flat_params)

    def block(x, params, base, mode, stride):
        # The bias keys need the '.' separator ('.conv0.b'), matching how
        # utils.flatten names them; without it params.get() silently returns None.
        o = F.conv2d(x, params[base+'.conv0.w'], params.get(base+'.conv0.b'),
                     stride=stride, padding=padding)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn0', mode)
        o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
        o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'),
                     stride=1, padding=0)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn1', mode)
        o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
        return o

    def group(o, params, base, mode, stride=2):
        n = 1
        for i in range(n):
            o = block(o, params, '%s.block%d' % (base, i), mode, stride if i == 0 else 1)
        return o

    def f(input, params, mode):
        o = F.conv2d(input, params['conv0.w'], params.get('conv0.b'),
                     stride=2, padding=padding)
        o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
        for d in range(depth-1):
            o = group(o, params, 'group%d' % d, mode)
        o = o.view(o.size(0), -1)
        o = F.linear(o, params['fc.weight'], params['fc.bias'])
        return o

    return f, flat_params
def import_loss_tols(in_file, filename=None):
    """
        Imports and formats loss tols for use.
        Exports to file, if provided.
    """
    with open(in_file, 'r') as fp:
        data = json.load(fp)
    max_tols = flatten(value for value in data.values())
    all_tols = get_all_loss_tols(max_tols)
    if filename:
        with open(filename, 'w') as fp:
            json.dump(all_tols, fp)
    return all_tols
def most_common_mnt(avail_pats, qubit_key, measured):
    """ Picks measurement that occurs most in the available patterns """
    all_mnts = dict(Counter((q, mnt)
                            for mnt_pat in flatten(avail_pats.values())
                            for q, mnt in zip(qubit_key, mnt_pat)
                            if mnt and q not in measured))
    max_c = max(c for c in all_mnts.values())
    best_mnts = [mnt for mnt, c in all_mnts.items() if c == max_c]
    shuffle(best_mnts)
    return best_mnts.pop()
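# Toy illustration of the inputs most_common_mnt expects (values are made up):
# avail_pats maps a pattern weight to a list of measurement patterns, each
# aligned with qubit_key; None marks qubits that are not measured.
avail_pats = {2: [('X', 'Z', None)], 3: [('X', None, 'Y')]}
qubit_key = [0, 1, 2]
measured = {1}
# 'X' on qubit 0 appears in both patterns, so (0, 'X') would be returned.
print(most_common_mnt(avail_pats, qubit_key, measured))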
def rocPlot(predictions, truth, classes=None, label="Model", newFigure=None, reverse=False, percentage=None):
    """
        Computes the ROC curve with confidence bounds for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot (default: {None} ie {"+":1, "-":0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Plot the reverse ROC, useful for analyzing TNR (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    global_fpr, global_tpr, _ = roc_curve(truth, predictions)

    if reverse:
        x, y = 1 - global_tpr, 1 - global_fpr  # FNR, TNR
        x, y = x[::-1], y[::-1]
        minx = 1. / np.sum(truth == 1)
        if percentage is None:
            percentage = minx
        str_print = "TNR @{:.2f}% FNR : {:.2f}".format(percentage*100, np.interp(percentage, x, y))
    else:
        x, y = global_fpr, global_tpr
        minx = 1. / np.sum(truth == 0)
        if percentage is None:
            percentage = minx
        str_print = "TPR @{:.2f}% FPR : {:.2f}".format(percentage*100, np.interp(percentage, x, y))

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), 'k--', label="Random")
        if reverse:
            plt.xlabel('False negative rate')
            plt.ylabel('True negative rate')
            plt.title('Reverse ROC curve')
        else:
            plt.xlabel('False positive rate')
            plt.ylabel('True positive rate')
            plt.title('ROC curve')

    newx = np.linspace(minx, 1, 1000)
    y = np.interp(newx, x, y)
    wilson = 1.96 * np.sqrt(y * (1 - y) / len(predictions))
    print(str_print + " +/- {:.2f}".format(np.interp(0.01, newx, wilson)))
    upper = np.minimum(y + wilson, 1)
    lower = np.maximum(y - wilson, 0)
    plRoc = plt.plot(newx, y,
                     label=label + " ({:.2f} +/- {:.2f})".format(
                         aucCompute(predictions, truth, classes),
                         (auc(newx, upper) - auc(newx, lower)) / 2.),
                     ls='--' if "train" in label.lower() else '-')
    plt.fill_between(newx, lower, upper, color=plRoc[0].get_color(), alpha=.2)
def describe_instances(self, parameters):
    """
    Execute the ec2-describe-instances command and return a summary of the
    already running EC2 instances. (Also see documentation for the BaseAgent class)

    Args:
      parameters  A dictionary containing the 'keyname' parameter

    Returns:
      A tuple of the form (public_ips, private_ips, instances) where each
      member is a list.
    """
    keyname = parameters[self.PARAM_KEYNAME]
    describe_instances = utils.shell(self.prefix + '-describe-instances 2>&1')
    utils.log('describe-instances says {0}'.format(describe_instances))
    fqdn_regex = re.compile(r'\s+({0})\s+({0})\s+running\s+{1}\s'.format(self.FQDN_REGEX, keyname))
    instance_regex = re.compile(r'INSTANCE\s+(i-\w+)')
    all_ip_addresses = utils.flatten(fqdn_regex.findall(describe_instances))
    instances = utils.flatten(instance_regex.findall(describe_instances))
    public_ips, private_ips = self.get_ip_addresses(all_ip_addresses)
    return public_ips, private_ips, instances
def get_per_node_loss_tol(all_tols, filename=None):
    """
        For each qubit in the state, calculates the number of measurement
        patterns that can tolerate its loss.
    """
    tol_counts = Counter(flatten(all_tols))
    tol_counts = [[n, count] for n, count in tol_counts.items()]
    if filename:
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['node', 'tol_count'])
            writer.writerows(tol_counts)
    return tol_counts
def format_dataframe(self, data_dictionary):
    """
    Method to format the response dictionary from a requested module.

    :param data_dictionary: The dictionary data from the requested module.
    :type data_dictionary: dict
    :return: A formatted dataframe containing the data.
    :rtype: pd.DataFrame
    """
    # Check whether there are any dictionaries or lists within the data that need to be flattened.
    if isinstance(data_dictionary, list):
        module = [flatten(data) for data in data_dictionary]
        module = pd.DataFrame(module)
    else:
        module = flatten(data_dictionary)
        module = pd.DataFrame([module])

    # Due to the way the Yahoo Finance API returns numeric types, the raw integer value is preferred.
    # Therefore, any values with the suffix longFmt (long format) or fmt (format) are removed.
    module_columns = [
        column for column in module.columns
        if not ('.fmt' in column or '.longFmt' in column)
    ]

    # Get a new dataframe.
    module = module[module_columns]

    # Format the headers of the columns to match PEP8 standards.
    new_columns_dict = {
        col: self.reader.pep_pattern.sub('_', col.split('.')[0]).lower()
        for col in module.columns
    }
    module.rename(columns=new_columns_dict, inplace=True)

    return module
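# Hedged illustration of the key convention the '.fmt' / '.longFmt' filtering
# above relies on: a nested-dict flatten that joins keys with '.', so a field
# such as {'marketCap': {'raw': 1250000, 'fmt': '1.25M'}} becomes the columns
# 'marketCap.raw' and 'marketCap.fmt' (the formatted variants are then dropped).
# This is an assumption about the flatten helper, not its actual source.
def flatten_dict(nested, parent_key='', sep='.'):
    flat = {}
    for key, value in nested.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, new_key, sep=sep))
        else:
            flat[new_key] = value
    return flat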
def calibrationPlot(predictions, truth, classes=None, label="Model", newFigure=None, n_bins=5):
    """
        Computes the calibration curve of the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot (default: {None} ie {"+":1, "-":0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            n_bins {int} -- Number of bins for the calibration (default: {5})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    predictions = ((predictions - predictions.min()) / (predictions.max() - predictions.min())).flatten()
    fraction_of_positives, mean_predicted_value = calibration_curve(truth, predictions, n_bins=n_bins)

    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    binids = np.digitize(predictions, bins) - 1
    bin_sums = np.bincount(binids, minlength=len(bins))
    bin_sums = bin_sums[bin_sums != 0] * 500 / np.sum(bin_sums)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Mean Predicted Value')
        plt.ylabel('Fraction Positive')
        plt.title('Calibration')

    p = plt.plot(mean_predicted_value, fraction_of_positives, alpha=0.5, ls=':')
    plt.scatter(mean_predicted_value, fraction_of_positives, s=bin_sums,
                label=label + " ({:.2f})".format(brier_score_loss(truth, predictions)),
                color=p[0].get_color(), alpha=0.5)
def cutnpaste(oracle):
    # Find how many filler characters align the attacker-controlled input to a
    # block boundary: once two consecutive ciphertext blocks repeat, the
    # current pad length modulo 16 is the required filler.
    pad_len = 0
    for pad in range(32, 100):
        oracle_chunks = list(chunks(oracle("A" * pad), 16))
        if oracle_chunks[1] == oracle_chunks[2]:
            pad_len = pad % 16
            break

    # Craft a block containing "admin" followed by PKCS#7 padding (11 bytes of
    # 0x0b) and grab its ciphertext block.
    payload = "A" * (pad_len) + "admin" + "\v" * 11
    payload_chunk = list(chunks(oracle(payload)))[1]

    # Choose an input length so the final block holds only the role value,
    # drop that block, and splice in the forged "admin" block.
    cut_payload = "A" * (3 + pad_len)
    cut_chunks = list(chunks(oracle(cut_payload)))[:-1]
    cut_chunks.append(payload_chunk)
    print(decrypt(flatten(cut_chunks)))
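# A minimal sketch of the chunks helper the attack above assumes: it splits a
# string/bytes object into fixed-size blocks, defaulting to the 16-byte AES
# block size used in the calls above.
def chunks(data, size=16):
    return [data[i:i + size] for i in range(0, len(data), size)]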
def averagePrecisionRecallCompute(predictions, truth, classes=None):
    """
        Computes the average precision of the given predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth

        Keyword Arguments:
            classes {Dict "+":int, "-":int} -- Classes to consider to plot (default: {None} ie {"+":1, "-":0})

        Returns:
            float -- Estimation by pooling of the average precision
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    return average_precision_score(truth, predictions)
def test_measurement_patterns_teleport(self):
    """ Finds measurement patterns and tests they teleport the state """
    nodes = 8
    output = nodes - 1
    for i in tqdm(range(100)):
        prob_edge = random.uniform(0.2, 0.5) / 2
        disablePrint()
        psi = RandomGNPGraphChannel(nodes, prob_edge, output=output, use_gpu=True)
        psi.update_inputs_and_outputs()
        enablePrint()
        mnt_patterns, qubit_key = psi.get_mnt_patterns()
        mnt_pattern = random.choice(flatten(mnt_patterns.values()))
        mnt_pattern = [(qubit, basis)
                       for qubit, basis in zip(qubit_key, mnt_pattern) if basis]
        random.shuffle(mnt_pattern)
        for qubit, basis in mnt_pattern:
            try:
                psi.pauli_measurement(qubit, basis, forget=False)
            except Exception:
                print("Measurement failed")
                print(psi.edges())
                print(mnt_pattern)
                sys.exit()
        try:
            all_nt = psi._test_all_non_trivial_combos_found(
                print_stabs=True, join=False, verbose=False)
        except Exception:
            print("Measurement failed")
            print(psi.edges())
            print(mnt_pattern)
            pprint(psi.gen_combos)
            psi._print_stabs()
            sys.exit()
        stab_combos_correct = psi._test_combo_stabs_correct()
        self.assertTrue(all_nt)
        self.assertTrue(stab_combos_correct)
        self.assertEqual(psi._support(psi.X_op), [output])
        self.assertEqual(psi._support(psi.Z_op), [output])
        self.assertTrue(anticommute(psi.X_op, psi.Z_op))
def handle_underscores(suffix, text_encoder, prefix=False):
    encoder = text_encoder.encoder
    if prefix:
        tok = "___"
    else:
        tok = find_underscore_length(suffix)

    suffix_parts = [i.strip() for i in suffix.split("{}".format(tok))]
    to_flatten = []
    for i, part in enumerate(suffix_parts):
        if part:
            to_flatten.append(text_encoder.encode([part], verbose=False)[0])

            if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
                to_flatten.append([encoder["<blank>"]])
        else:
            to_flatten.append([encoder["<blank>"]])

    final_suffix = utils.flatten(to_flatten)
    return final_suffix
def get_max_weight_efficiencies(psi, max_weight, filename=None, verbose=False):
    """
        Gets loss tolerance of measurement patterns produced with different
        absolute maximum weights.
    """
    data = []
    for w in tqdm(range(1, max_weight + 1)):
        mnt_pats, qubit_key = psi.get_mnt_patterns(max_weight=w, rel_weight=True)
        loss_tols = get_loss_tolerance(mnt_pats, qubit_key)
        max_tols = flatten(value for value in loss_tols.values())
        all_tols = get_all_loss_tols(max_tols)
        datum = [w, len(all_tols)]
        if verbose:
            tqdm.write(str(datum))
        data.append(datum)
    if filename:
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            header = ['max_weight', 'loss_tol_configs']
            writer.writerow(header)
            writer.writerows(data)
def confusionPlot(predictions, truth, classes, percentage=True):
    """
        Computes the confusion matrix of the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)

    classes_list = np.array(list(classes.keys()))
    confusion = confusion_matrix(truth, predictions, labels=[classes[c] for c in classes_list])
    notNull = confusion.sum(axis=0) != 0

    if percentage:
        confusion = confusion / confusion.sum(axis=1, keepdims=True)

    sns.heatmap(confusion[:, notNull],
                xticklabels=classes_list[notNull],
                yticklabels=classes_list,
                annot=True, vmin=0, vmax=1 if percentage else None)
    plt.xlabel("Predicted")
    plt.ylabel("Ground truth")
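# Hypothetical call with toy labels (assuming selection/flatten accept plain
# lists): plots a 2x2 confusion matrix normalised per ground-truth row.
predictions = [0, 1, 1, 0, 1]
truth = [0, 1, 0, 0, 1]
confusionPlot(predictions, truth, classes={"+": 1, "-": 0}, percentage=True)
plt.show()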
def precisionRecallPlot(predictions, truth, classes=None, label="Model", newFigure=None, reverse=False, percentage=None):
    """
        Computes the precision-recall curve for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot (default: {None} ie {"+":1, "-":0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Plot the reverse ROC, useful for analyzing TNR (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    precision, recall, _ = precision_recall_curve(truth, predictions)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Precision')
        plt.ylabel('Recall')
        plt.title('Precision Recall curve')

    plt.plot(precision, recall,
             label=label + " ({:.2f})".format(averagePrecisionRecallCompute(predictions, truth, classes)),
             ls='--' if "train" in label.lower() else '-')
def run(self):
    if self.params.rng == -1:
        seed = random.randrange(2**32 - 1)
    else:
        seed = int(self.params.rng)
    rng = np.random.RandomState(seed)
    np.random.seed(seed)

    conf_env_dir = "cfgs/env/" + self.params.env_module + "/" + self.params.env_conf_file
    env_params = parse_conf(conf_env_dir)
    env_params["rng"] = rng
    env = get_mod_object("envs", self.params.env_module, "env", (rng,), env_params, mode=1)

    pol_train = get_mod_class("pols", self.params.pol_train_module, "pol")
    self.params.pol_train_args = flatten(self.params.pol_train_args) if self.params.pol_train_args is not None else []
    pol_train_args = parse_conf("cfgs/pol/" + self.params.pol_train_module + "/" + self.params.pol_train_args[0]) \
        if len(self.params.pol_train_args) > 0 and isfile("cfgs/pol/" + self.params.pol_train_module + "/" + self.params.pol_train_args[0]) \
        else parse_conf("cfgs/pol/" + self.params.pol_train_module + "/default")
    pol_train_args_2 = erase_dict_from_keyword_list(pol_train_args, self.params.pol_train_args)
    pol_train_args = revalidate_dict_from_conf_module(pol_train_args_2, "pol", self.params.pol_train_module)

    pol_test = get_mod_class("pols", self.params.pol_test_module, "pol")
    self.params.pol_test_args = flatten(self.params.pol_test_args) if self.params.pol_test_args is not None else []
    pol_test_args = parse_conf("cfgs/pol/" + self.params.pol_test_module + "/" + self.params.pol_test_args[0]) \
        if len(self.params.pol_test_args) > 0 and isfile("cfgs/pol/" + self.params.pol_test_module + "/" + self.params.pol_test_args[0]) \
        else parse_conf("cfgs/pol/" + self.params.pol_test_module + "/default")
    pol_test_args_2 = erase_dict_from_keyword_list(pol_test_args, self.params.pol_test_args)
    pol_test_args = revalidate_dict_from_conf_module(pol_test_args_2, "pol", self.params.pol_test_module)

    self.params.backend_nnet_conf_file = flatten(self.params.backend_nnet_conf_file) if self.params.backend_nnet_conf_file is not None else []
    backend_nnet_params = parse_conf("cfgs/backend_nnet/" + self.params.backend_nnet + "/" + self.params.backend_nnet_conf_file[0]) \
        if len(self.params.backend_nnet_conf_file) > 0 and isfile("cfgs/backend_nnet/" + self.params.backend_nnet + "/" + self.params.backend_nnet_conf_file[0]) \
        else parse_conf("cfgs/backend_nnet/" + self.params.backend_nnet + "/default")
    backend_nnet_params_2 = erase_dict_from_keyword_list(backend_nnet_params, self.params.backend_nnet_conf_file)
    backend_nnet_params = revalidate_dict_from_conf_module(backend_nnet_params_2, "backend_nnet", self.params.backend_nnet)
    neural_net = get_mod_class("neural_nets", self.params.backend_nnet, "neural_net")

    self.params.ctrl_neural_nets_conf_file = flatten(self.params.ctrl_neural_nets_conf_file) if self.params.ctrl_neural_nets_conf_file is not None else []
    ctrl_neural_nets_params = parse_conf("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/" + self.params.ctrl_neural_nets_conf_file[0]) \
        if len(self.params.ctrl_neural_nets_conf_file) > 0 and isfile("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/" + self.params.ctrl_neural_nets_conf_file[0]) \
        else parse_conf("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/DEFAULT")
    ctrl_neural_nets_params_2 = erase_dict_from_keyword_list(ctrl_neural_nets_params, self.params.ctrl_neural_nets_conf_file)
    ctrl_neural_nets_params = revalidate_dict_from_conf_module(ctrl_neural_nets_params_2, "ctrl_neural_net", self.params.qnetw_module)
    ctrl_neural_nets_params["neural_network"] = neural_net
    ctrl_neural_nets_params["neural_network_kwargs"] = backend_nnet_params
    ctrl_neural_nets_params["batch_size"] = self.params.batch_size
    ctrl_neural_net = get_mod_object("ctrl_neural_nets", self.params.qnetw_module, "ctrl_neural_net",
                                     (env,), ctrl_neural_nets_params, mode=0)

    agent = NeuralAgent([env], [ctrl_neural_net],
                        replay_memory_size=self.params.replay_memory_size,
                        replay_start_size=None,
                        batch_size=self.params.batch_size,
                        random_state=rng,
                        exp_priority=self.params.exp_priority,
                        train_policy=pol_train, train_policy_kwargs=pol_train_args,
                        test_policy=pol_test, test_policy_kwargs=pol_test_args,
                        only_full_history=self.params.only_full_history)

    for tc in self.params.controllers:
        len_tc = len(tc)
        s = tc[0]
        redo_conf = False
        if len_tc >= 2:
            # Test whether tc[1] is a config file or an argument to override
            if '=' not in tc[1]:
                # This is a config file
                conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/" + tc[1])
            else:
                conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/default")
                sc = tc[1].split("=")
                if sc[0] in conf_ctrl.keys():
                    conf_ctrl[sc[0]] = sc[1]
                    redo_conf = True
                else:
                    print("Warning: parameter " + str(sc[0]) + " is not included in config specs for the controller " + s)
            if len_tc > 2:
                remainder = tc[2:]
                for a in remainder:
                    sc = a.split("=")
                    if len(sc) != 2:
                        print("Warning: arg " + a + " for controller parametrization is ill formed. It needs to be in the form key=value.")
                    else:
                        redo_conf = True
                        if sc[0] in conf_ctrl.keys():
                            conf_ctrl[sc[0]] = sc[1]
                        else:
                            print("Warning: parameter " + str(sc[0]) + " is not included in config specs for the controller " + s)
            # Create a temporary config file with the overridden parameters and go through parse_conf again
            if redo_conf:
                write_conf(conf_ctrl, "cfgs/ctrl/" + s + "/temp")
                conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/temp")
                os.remove("cfgs/ctrl/" + s + "/temp")
        else:
            conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/default")
        controller = get_mod_object("ctrls", s, "ctrl", tuple(), conf_ctrl, mode=0)
        agent.attach(controller)

    agent.run(self.params.epochs, self.params.max_size_episode)
def run_instances(self, count, parameters, security_configured):
    """
    Spawn the specified number of EC2 instances using the parameters provided.
    This method relies on the ec2-run-instances command to spawn the actual VMs
    in the cloud. This method is blocking in that it waits until the requested
    VMs are properly booted up. However, if the requested VMs cannot be procured
    within 1800 seconds, this method will treat it as an error and return.
    (Also see documentation for the BaseAgent class)

    Args:
      count                 No. of VMs to be spawned
      parameters            A dictionary of parameters. This must contain 'keyname',
                            'group', 'image_id' and 'instance_type' parameters.
      security_configured   Uses this boolean value as a heuristic to detect
                            brand new AppScale deployments.

    Returns:
      A tuple of the form (instances, public_ips, private_ips)
    """
    image_id = parameters[self.PARAM_IMAGE_ID]
    instance_type = parameters[self.PARAM_INSTANCE_TYPE]
    keyname = parameters[self.PARAM_KEYNAME]
    group = parameters[self.PARAM_GROUP]
    spot = False

    utils.log('[{0}] [{1}] [{2}] [{3}] [ec2] [{4}] [{5}]'.format(
        count, image_id, instance_type, keyname, group, spot))

    start_time = datetime.datetime.now()
    active_public_ips = []
    active_private_ips = []
    active_instances = []
    if 'EC2_URL' in os.environ:
        utils.log('EC2_URL = [{0}]'.format(os.environ['EC2_URL']))
    else:
        utils.log('Warning: EC2_URL environment not found in the process runtime!')
    while True:
        active_public_ips, active_private_ips, active_instances = \
            self.describe_instances(parameters)
        # If security has been configured on this agent just now,
        # that's an indication that this is a fresh cloud deployment.
        # As such it's not expected to have any running VMs.
        if len(active_instances) > 0 or security_configured:
            break

    args = '-k {0} -n {1} --instance-type {2} --group {3} {4}'.format(
        keyname, count, instance_type, group, image_id)
    if spot:
        price = self.get_optimal_spot_price(instance_type)
        command_to_run = '{0}-request-spot-instances -p {1} {2}'.format(self.prefix, price, args)
    else:
        command_to_run = '{0}-run-instances {1}'.format(self.prefix, args)

    while True:
        run_instances = utils.shell(command_to_run)
        utils.log('Run instances says {0}'.format(run_instances))
        status, command_to_run = self.run_instances_response(command_to_run, run_instances)
        if status:
            break
        utils.log('sleepy time')
        utils.sleep(5)

    instances = []
    public_ips = []
    private_ips = []
    utils.sleep(10)

    end_time = datetime.datetime.now() + datetime.timedelta(0, self.MAX_VM_CREATION_TIME)
    now = datetime.datetime.now()
    while now < end_time:
        describe_instances = utils.shell(self.prefix + '-describe-instances 2>&1')
        utils.log('[{0}] {1} seconds left...'.format(now, (end_time - now).seconds))
        utils.log(describe_instances)
        fqdn_regex = re.compile(r'\s+({0})\s+({0})\s+running\s+{1}\s'.format(self.FQDN_REGEX, keyname))
        instance_regex = re.compile(r'INSTANCE\s+(i-\w+)')
        all_ip_addresses = utils.flatten(fqdn_regex.findall(describe_instances))
        instances = utils.flatten(instance_regex.findall(describe_instances))
        public_ips, private_ips = self.get_ip_addresses(all_ip_addresses)
        public_ips = utils.diff(public_ips, active_public_ips)
        private_ips = utils.diff(private_ips, active_private_ips)
        instances = utils.diff(instances, active_instances)
        if count == len(public_ips):
            break
        time.sleep(self.SLEEP_TIME)
        now = datetime.datetime.now()

    if not public_ips:
        sys.exit('No public IPs were able to be procured within the time limit')

    if len(public_ips) != count:
        for index in range(0, len(public_ips)):
            if public_ips[index] == '0.0.0.0':
                instance_to_term = instances[index]
                utils.log('Instance {0} failed to get a public IP address and is being terminated'.
                          format(instance_to_term))
                utils.shell(self.prefix + '-terminate-instances ' + instance_to_term)

    end_time = datetime.datetime.now()
    total_time = end_time - start_time
    if spot:
        utils.log('TIMING: It took {0} seconds to spawn {1} spot instances'.format(
            total_time.seconds, count))
    else:
        utils.log('TIMING: It took {0} seconds to spawn {1} regular instances'.format(
            total_time.seconds, count))
    return instances, public_ips, private_ips
def main(): rates_for_algo = {} index_comparison = pd.DataFrame(index=config_manager.INDEX_TO_COMPARE) # Used only for initialization for func in config_manager.FUNCTION_NAMES: rates_for_algo[func] = {} # For each strategy type, for each minute and for each function read data exported # by the simulation and use them to calculate rates and indexes for comparison for algo in config_manager.STRATEGIES: x_func_success_rate = {} x_func_reject_rate = {} x_func_reject_num = {} # Initialize dictionary of rates for all functions for func in config_manager.FUNCTION_NAMES: x_func_success_rate[func] = [] x_func_reject_rate[func] = [] x_func_reject_num[func] = [] print("-------------------------- ALGO {} --------------------------". format(algo)) # Create path for recover tables base_path = config_manager.SIMULATION_TABLES_OUTPUT_PATH.joinpath(algo) for minute in range(0, config_manager.SIMULATION_MINUTES): print("MINUTE {}".format(minute)) print( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" ) # Complete path for load tables path = base_path.joinpath("minute_" + str(minute)) # For each minute load invocaion_rate and max_rate table df_invoc_rate = pd.read_csv(path.joinpath("invoc_rates.csv"), delimiter='\t', header=0, index_col=0) print("================ INVOCATION RATES ==================") print(df_invoc_rate) print("====================================================") df_max_rate = pd.read_csv(path.joinpath("max_rates.csv"), delimiter='\t', header=0, index_col=0) print("================ MAX RATES =========================") print(df_max_rate) print("====================================================") # For each minute and foreach function load dataframe for func in config_manager.FUNCTION_NAMES: df = pd.read_csv(path.joinpath(func + ".csv"), delimiter='\t', header=0, index_col=0) print( "================ FORWARDED REQUESTS for {} ================" .format(func)) print(df) print( "==========================================================" ) sr, rr, rn = calculate_rates(df, func, df_max_rate[func], df_invoc_rate[func]) x_func_success_rate[func].append(sr) x_func_reject_rate[func].append(rr) x_func_reject_num[func].append(rn) rates_for_algo[func][algo] = x_func_success_rate[func] print( "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" ) print("STATS FOR ALGO {}".format(algo)) # Utility print for success/reject rate and reject nume for func # TODO: fix it to work with new dictionaties # # print(" > Mean success rate for funca: {}".format(np.mean(funca_sr))) # print(" > Mean reject rate for funca: {}".format(np.mean(funca_rr))) # print(" > Rejected requests for funca: {}".format(np.sum(funca_reject_num))) # print(" > Mean success rate for qrcode: {}".format(np.mean(qrcode_sr))) # print(" > Mean reject rate for qrcode: {}".format(np.mean(qrcode_rr))) # print(" > Rejected requests for qrcode: {}".format(np.sum(qrcode_reject_num))) # print(" > Mean success rate for ocr: {}".format(np.mean(ocr_sr))) # print(" > Mean reject rate for ocr: {}".format(np.mean(ocr_rr))) # print(" > Rejected requests for ocr: {}".format(np.sum(ocr_reject_num))) # TEST #print(x_func_success_rate) #print(x_func_reject_rate) #print(x_func_reject_num) # Metrics prints ##### SUCCESS RATES METRICS ##### # Mean success rate mean_success_rate = np.mean( [np.mean(srates) for k, srates in x_func_success_rate.items()]) * 100 print(" > Mean success rate: {:0.2f}%".format(mean_success_rate)) # Success rate variance flat_list = [ i * 100 for i in 
flatten(list(x_func_success_rate.values())) ] success_rate_variance = np.var(flat_list) print(" > Success rate variance: {:0.2f}".format( success_rate_variance)) # Success rate median flat_list = flatten(list(x_func_success_rate.values())) success_rate_median = np.median(flat_list) * 100 print( " > Success rate median: {:0.2f}%".format(success_rate_median)) # Success rate percentile flat_list = flatten(list(x_func_success_rate.values())) success_rate_percentile = np.percentile( flat_list, config_manager.ANALYSIS_PERCENTILE) * 100 print(" > Success rate {}% percentile: {:0.2f}%".format( config_manager.ANALYSIS_PERCENTILE, success_rate_percentile)) ##### SUCCESS RATES (STRESS PERIOD) METRICS ##### # Mean success rate calculated during high traffic period (minutes from 1 to 5) mean_success_rate_stress_period = np.mean([ np.mean(srates[1:6]) for k, srates in x_func_success_rate.items() ]) * 100 print( " > Mean success rate during stress period (from minute 1 to 5): {:0.2f}%" .format(mean_success_rate_stress_period)) # Success rate variance (stress period) flat_list = [ i * 100 for i in flatten( [item[1:6] for item in list(x_func_success_rate.values())]) ] success_rate_stress_period_variance = np.var(flat_list) print( " > Success rate variance during stress period (from minute 1 to 5): {:0.2f}" .format(success_rate_stress_period_variance)) # Success rate median (stress period) flat_list = flatten( [item[1:6] for item in list(x_func_success_rate.values())]) success_rate_stress_period_median = np.median(flat_list) * 100 print( " > Success rate median during stress period (from minute 1 to 5): {:0.2f}%" .format(success_rate_stress_period_median)) # Success rate percentile (stress period) flat_list = flatten( [item[1:6] for item in list(x_func_success_rate.values())]) success_rate_stress_period_percentile = np.percentile( flat_list, config_manager.ANALYSIS_PERCENTILE) * 100 print( " > Success rate {}% percentile during stress period (from minute 1 to 5): {:0.2f}%" .format(config_manager.ANALYSIS_PERCENTILE, success_rate_stress_period_percentile)) ##### REJECT RATES METRICS ##### # Total rejected requests num calculated for each algorithm across minutes total_reject_requests = np.sum( [np.sum(rejnums) for k, rejnums in x_func_reject_num.items()]) print(" > Total rejected requests: {} req".format( total_reject_requests)) # Reject number variance flat_list = flatten(list(x_func_reject_num.values())) reject_number_variance = np.var(flat_list) print(" > Reject num variance: {:0.2f}".format( reject_number_variance)) # Reject number median flat_list = flatten(list(x_func_reject_num.values())) reject_number_median = np.median(flat_list) print(" > Reject num median: {:0.2f}".format(reject_number_median)) # Reject number percentile flat_list = flatten(list(x_func_reject_num.values())) reject_number_percentile = np.percentile( flat_list, config_manager.ANALYSIS_PERCENTILE) print(" > Reject num {}% percentile: {:0.2f}".format( config_manager.ANALYSIS_PERCENTILE, reject_number_percentile)) print( "----------------------------------------------------------------------------" ) index_comparison[algo] = [ mean_success_rate, success_rate_variance, success_rate_median, success_rate_percentile, mean_success_rate_stress_period, success_rate_stress_period_variance, success_rate_stress_period_median, success_rate_stress_period_percentile, total_reject_requests, reject_number_variance, reject_number_median, reject_number_percentile, ] # Export print for comparison for func in config_manager.FUNCTION_NAMES: 
export_for_minute_rates(func, rates_for_algo[func]) # Export index comparison table print("> INDEX COMPARISON TABLE") print(index_comparison.T) export_index_comparison_table(index_comparison.T)
def dcganx_G(input_dim, n0g, imgsz, channels,
             norm_type,  # 'bn', 'none'
             requires_grad,
             depth=3, nodemul=2, do_bias=True):

    ker = 5; padding = 2; output_padding = 1

    def gen_block_T_params(ni, no, k):
        return {
            'convT0': conv2dT_params(ni, no, k, do_bias),
            'conv1': conv2d_params(no, no, 1, do_bias),
            'bn0': utils.bnparams(no) if norm_type == 'bn' else None,
            'bn1': utils.bnparams(no) if norm_type == 'bn' else None
        }

    def gen_group_T_params(ni, no, count):
        return {'block%d' % i: gen_block_T_params(ni if i == 0 else no, no, ker)
                for i in range(count)}

    count = 1
    nn0 = n0g * (nodemul**(depth-1))
    sz = imgsz // (2**depth)
    p = {'proj': utils.linear_params(input_dim, nn0*sz*sz)}
    nn = nn0
    for d in range(depth-1):
        p['group%d' % d] = gen_group_T_params(nn, nn//nodemul, count)
        nn = nn//nodemul
    p['last_convT'] = conv2dT_params(nn, channels, ker, do_bias)
    flat_params = utils.cast(utils.flatten(p))

    if requires_grad:
        utils.set_requires_grad_except_bn_(flat_params)

    def block(x, params, base, mode, stride):
        o = F.relu(x, inplace=True)
        o = F.conv_transpose2d(o, params[base+'.convT0.w'], params.get(base+'.convT0.b'),
                               stride=stride, padding=padding, output_padding=output_padding)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn0', mode)
        o = F.relu(o, inplace=True)
        o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'),
                     stride=1, padding=0)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn1', mode)
        return o

    def group(o, params, base, mode, stride=2):
        for i in range(count):
            o = block(o, params, '%s.block%d' % (base, i), mode, stride if i == 0 else 1)
        return o

    def f(input, params, mode):
        o = F.linear(input, params['proj.weight'], params['proj.bias'])
        o = o.view(input.size(0), nn0, sz, sz)
        for d in range(depth-1):
            o = group(o, params, 'group%d' % d, mode)
        o = F.relu(o, inplace=True)
        o = F.conv_transpose2d(o, params['last_convT.w'], params.get('last_convT.b'),
                               stride=2, padding=padding, output_padding=output_padding)
        o = torch.tanh(o)
        return o

    return f, flat_params
def handle(self, *args, **options): t00 = time() qid = options['qid'] K = options['K'] alpha = options['alpha'] n_features = options['n_features'] limit = options['limit'] ng = options['ng'] n_samples = options['n_samples'] # Get the docs from the query docs = Doc.objects.filter(query=qid,content__iregex='\w') # if we are limiting, probably for testing, then do that if limit > 0: docs = docs[:limit] print('\n###############################\ \n## Doing NMF on query {} with {} documents \ and {} topics\n'.format(qid, docs.count(),K)) # Get the docs into lists abstracts, docsizes, ids = proc_docs(docs, stoplist) ############################################# # Use tf-idf features for NMF. print("Extracting tf-idf features for NMF...") tfidf_vectorizer = TfidfVectorizer(max_df=0.97, min_df=2, max_features=n_features, ngram_range=(ng,ng), tokenizer=snowball_stemmer(), stop_words=stoplist) t0 = time() tfidf = tfidf_vectorizer.fit_transform(abstracts) print("done in %0.3fs." % (time() - t0)) del abstracts gc.collect() run_id = db.init(n_features) stat = RunStats.objects.get(run_id=run_id) stat.query = Query.objects.get(pk=qid) stat.method = "NM" stat.alpha = alpha stat.process_id = os.getpid() stat.save() # Get the vocab, add it to db vocab = tfidf_vectorizer.get_feature_names() vocab_ids = [] pool = Pool(processes=8) vocab_ids.append(pool.map(partial(add_features,run_id=run_id),vocab)) pool.terminate() del vocab vocab_ids = vocab_ids[0] ## Make some topics django.db.connections.close_all() topic_ids = db.add_topics(K, run_id) gc.collect() # Fit the NMF model print("Fitting the NMF model with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_samples, n_features)) t0 = time() nmf = NMF(n_components=K, random_state=1, alpha=alpha, l1_ratio=.5, verbose=True, init='nndsvd', max_iter=500).fit(tfidf) print("done in %0.3fs." % (time() - t0)) ## Add topics terms print("Adding topicterms to db") t0 = time() ldalambda = find(csr_matrix(nmf.components_)) topics = range(len(ldalambda[0])) tts = [] pool = Pool(processes=8) tts.append(pool.map(partial(db.f_lambda, m=ldalambda, v_ids=vocab_ids,t_ids=topic_ids,run_id=run_id),topics)) pool.terminate() tts = flatten(tts) gc.collect() sys.stdout.flush() django.db.connections.close_all() TopicTerm.objects.bulk_create(tts) print("done in %0.3fs." 
% (time() - t0)) ## Add topic-docs gamma = find(csr_matrix(nmf.transform(tfidf))) glength = len(gamma[0]) chunk_size = 100000 ps = 16 parallel_add = True all_dts = [] make_t = 0 add_t = 0 ### Go through in chunks for i in range(glength//chunk_size+1): dts = [] values_list = [] f = i*chunk_size l = (i+1)*chunk_size if l > glength: l = glength docs = range(f,l) doc_batches = [] for p in range(ps): doc_batches.append([x for x in docs if x % ps == p]) pool = Pool(processes=ps) make_t0 = time() values_list.append(pool.map(partial( db.f_gamma_batch, gamma=gamma, docsizes=docsizes,docUTset=ids,topic_ids=topic_ids, run_id=run_id ),doc_batches)) #dts.append(pool.map(partial(f_gamma, gamma=gamma, # docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches)) pool.terminate() make_t += time() - make_t0 django.db.connections.close_all() add_t0 = time() values_list = [item for sublist in values_list for item in sublist] pool = Pool(processes=ps) pool.map(insert_many,values_list) pool.terminate() add_t += time() - add_t0 gc.collect() sys.stdout.flush() stat.error = nmf.reconstruction_err_ stat.errortype = "Frobenius" stat.iterations = nmf.n_iter_ stat.last_update=timezone.now() stat.save() management.call_command('update_run',run_id) totalTime = time() - t00 tm = int(totalTime//60) ts = int(totalTime-(tm*60)) print("done! total time: " + str(tm) + " minutes and " + str(ts) + " seconds") print("a maximum of " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) + " MB was used")
def run_tm(s_id, K, language="german", verbosity=1, method='NM', max_features=0, max_df=0.95, min_df=5, alpha=0.01, extra_stopwords=set(), top_chain_var=None, rng_seed=None, max_iter=200, **kwargs): if method in ['BD', 'BleiDTM'] and top_chain_var is None: top_chain_var = 0.005 s = Search.objects.get(pk=s_id) stat = RunStats(psearch=s, K=K, min_freq=min_df, max_df=max_df, method=method.upper()[0:2], max_features=max_features, max_iter=max_iter, alpha=alpha, extra_stopwords=list(extra_stopwords), top_chain_var=top_chain_var, status=1, language=language) stat.save() django.db.connections.close_all() if method in ['DT', 'dnmf']: print("Running dynamic NMF algorithm") run_dynamic_nmf(stat, **kwargs) return 0 elif method in ['BD', 'BleiDTM']: print("Running Blei DTM algorithm") if rng_seed: stat.rng_seed = rng_seed else: stat.rng_seed = 1 stat.save() run_blei_dtm(stat, **kwargs) return 0 print("starting topic model for runstat with settings:") for field in stat._meta.fields: field_value = getattr(stat, field.name) if field_value: print("{}: {}".format(field.name, field_value)) start_time = time.time() start_datetime = timezone.now() stat.status = 1 # 3 = finished stat.save() run_id = stat.run_id if s.search_object_type == 1: ps = Paragraph.objects.filter(search_matches=s) docs = ps.filter(text__iregex='\w') texts, docsizes, ids = process_texts(docs) elif s.search_object_type == 2: uts = Utterance.objects.filter(search_matches=s) texts, docsizes, ids = merge_utterance_paragraphs(uts) else: print("search object type invalid") return 1 if stat.max_features == 0: n_features = 10000000 else: n_features = stat.max_features if stat.language is "german": stemmer = SnowballStemmer("german") tokenizer = german_stemmer() stopword_list = [stemmer.stem(t) for t in stopwords.words("german")] elif stat.language is "english": stemmer = SnowballStemmer("english") stopword_list = [stemmer.stem(t) for t in stopwords.words("english")] tokenizer = snowball_stemmer() else: print("Language not recognized.") return 1 if stat.extra_stopwords: stopword_list = list(set(stopword_list) | set(stat.extra_stopwords)) if method in ["NM", "nmf"]: if verbosity > 0: print( "creating term frequency-inverse document frequency matrix ({})" .format(time.time() - start_time)) # get term frequency-inverse document frequency matrix (using log weighting) # and min/max document frequency (min_df, max_df) tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(1, stat.ngram), tokenizer=tokenizer, stop_words=stopword_list) tfidf = tfidf_vectorizer.fit_transform(texts) vectorizer = tfidf_vectorizer vocab = vectorizer.get_feature_names() elif method in ["LD", "lda"]: if verbosity > 0: print("creating term frequency matrix ({})".format(time.time() - start_time)) # Use tf (raw term count) features for LDA. 
tf_vectorizer = CountVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(1, stat.ngram), tokenizer=tokenizer, stop_words=stopword_list) tf = tf_vectorizer.fit_transform(texts) vectorizer = tf_vectorizer vocab = vectorizer.get_feature_names() else: print("method not implemented") return 1 if verbosity > 0: print("save terms to db ({})".format(time.time() - start_time)) paralellized = True if paralellized: vocab_ids = [] # multiprocessing: add vocabulary as Term pool = Pool(processes=8) vocab_ids.append( pool.map(partial(db.add_features, run_id=run_id), vocab)) pool.terminate() del vocab vocab_ids = vocab_ids[0] else: print("without multiprocessing for storing terms") # without multiprocessing objects = [Term(title=term_title) for term_title in vocab] # TODO: if some of the objects already exist, duplicates are created: use uniqueness of field 'title' Term.objects.bulk_create(objects) runstats = RunStats.objects.get(run_id=run_id) runstats.term_set.add(*objects) runstats.save() ## Make some topics django.db.connections.close_all() topic_ids = db.add_topics(K, run_id) gc.collect() if verbosity > 1: v = True else: v = False if method in ["NM", "nmf"]: if verbosity > 0: print("running matrix factorization with NMF ({})".format( time.time() - start_time)) # NMF = non-negative matrix factorization model = NMF(n_components=K, random_state=1, alpha=stat.alpha, l1_ratio=.1, verbose=v, init='nndsvd', max_iter=stat.max_iter).fit(tfidf) # initialization with Nonnegative Double Singular Value Decomposition (nndsvd) print("Reconstruction error of nmf: {}".format( model.reconstruction_err_)) stat.error = model.reconstruction_err_ stat.errortype = "Frobenius" # document topic matrix dtm = csr_matrix(model.transform(tfidf)) elif method in ["LD", "lda"]: if verbosity > 0: print( "running Latent Dirichlet Allocation ({})".format(time.time() - start_time)) model = LDA( n_components=K, doc_topic_prior=stat. alpha, # this is the concentration parameter of the Dirichlet distribution of topics in documents topic_word_prior=stat. beta, # this is the concentration parameter of the Dirichlet distribution of words in topics # if None, this defaults to 1/n max_iter=stat.max_iter, learning_method= 'online', # using 'batch' instead could lead to memory problems learning_offset=50. 
#n_jobs=6 ).partial_fit(tf) stat.error = model.perplexity(tf) stat.errortype = "Perplexity" dtm = csr_matrix(model.transform(tf)) else: print("Method {} not available.".format(method)) return 1 # term topic matrix ldalambda = find(csr_matrix(model.components_)) # find returns the indices and values of the nonzero elements of a matrix topics = range(len(ldalambda[0])) tts = [] # multiprocessing: add TopicTerms and scores pool = Pool(processes=8) tts.append( pool.map( partial(db.f_lambda, m=ldalambda, v_ids=vocab_ids, t_ids=topic_ids, run_id=run_id), topics)) pool.terminate() tts = flatten(tts) gc.collect() sys.stdout.flush() django.db.connections.close_all() TopicTerm.objects.bulk_create(tts) if verbosity > 0: print("saving document topic matrix to db ({})".format(time.time() - start_time)) #document topic matrix gamma = find(dtm) glength = len(gamma[0]) chunk_size = 100000 no_cores = 16 parallel_add = True all_dts = [] make_t = 0 add_t = 0 ### Go through in chunks for i in range(glength // chunk_size + 1): values_list = [] f = i * chunk_size l = (i + 1) * chunk_size if l > glength: l = glength docs = range(f, l) doc_batches = [] for p in range(no_cores): doc_batches.append([x for x in docs if x % no_cores == p]) pool = Pool(processes=no_cores) values_list.append( pool.map( partial(db.f_gamma_batch, gamma=gamma, docsizes=docsizes, docUTset=ids, topic_ids=topic_ids, run_id=run_id), doc_batches)) pool.terminate() django.db.connections.close_all() print( "... created document topic matrix for saving iteration {}".format( i)) values_list = [item for sublist in values_list for item in sublist] pool = Pool(processes=no_cores) if s.search_object_type == 1: pool.map(db.insert_many_pars, values_list) elif s.search_object_type == 2: pool.map(db.insert_many_utterances, values_list) pool.terminate() gc.collect() sys.stdout.flush() print("... saved document topic matrix iteration {}".format(i)) stat.iterations = model.n_iter_ stat.status = 3 # 3 = finished stat.last_update = timezone.now() stat.runtime = timezone.now() - start_datetime stat.save() update_topic_titles(run_id) update_topic_scores(run_id) if verbosity > 0: print("topic model run done ({})".format(time.time() - start_time)) return 0
def run_dynamic_nmf(stat): """ Run dynamic NMF model on utterances (speeches) or paragraphs from the parliament data :param stat: RunStats object with the parameters to run the model with :return: 0 if successful, 1 otherwise """ print("starting topic model for runstat with settings:") for field in stat._meta.fields: field_value = getattr(stat, field.name) if field_value: print("{}: {}".format(field.name, field_value)) t0 = time() start_datetime = timezone.now() s = Search.objects.get(pk=stat.psearch.id) n_samples = 1000 run_id = stat.run_id # load time range if s.search_object_type == 1: ps = Paragraph.objects.filter(search_matches=s) wps = ParlPeriod.objects.filter( document__utterance__paragraph__in=ps).distinct().values('n') elif s.search_object_type == 2: uts = Utterance.objects.filter( search_matches=s).order_by('document__parlperiod__n') wps = ParlPeriod.objects.filter( document__utterance__in=uts).distinct().values('n') else: print("search object type invalid") return 1 # language specific settings if stat.language is "german": stemmer = SnowballStemmer("german") tokenizer = german_stemmer() stopword_list = [stemmer.stem(t) for t in stopwords.words("german")] elif stat.language is "english": stemmer = SnowballStemmer("english") stopword_list = [stemmer.stem(t) for t in stopwords.words("english")] tokenizer = snowball_stemmer() else: print("Language not recognized.") return 1 if stat.extra_stopwords: stopword_list = list(set(stopword_list) | set(stat.extra_stopwords)) time_range = sorted([wp['n'] for wp in wps]) for timestep in time_range: # load text from database if s.search_object_type == 1: ps = Paragraph.objects.filter( search_matches=s, utterance__document__parlperiod__n=timestep) docs = ps.filter(text__iregex='\w') texts, docsizes, ids = process_texts(docs) elif s.search_object_type == 2: uts = Utterance.objects.filter(search_matches=s, document__parlperiod__n=timestep) texts, docsizes, ids = merge_utterance_paragraphs(uts) else: print("search object type not known") return 1 print("\n#######################") print("in period {}: {} docs".format(timestep, len(texts))) k = stat.K # k = predict(text_count) # print("esimating {} topics...".format(k)) print("Extracting tf-idf features for NMF...") if stat.max_features == 0: n_features = 100000000 else: n_features = stat.max_features tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df, min_df=stat.min_freq, max_features=n_features, ngram_range=(1, stat.ngram), tokenizer=tokenizer, stop_words=stopword_list) t1 = time() tfidf = tfidf_vectorizer.fit_transform(texts) del texts gc.collect() print("done in %0.3fs." % (time() - t1)) print("Save terms to DB") # Get the vocab, add it to db vocab = tfidf_vectorizer.get_feature_names() vocab_ids = [] pool = Pool(processes=8) vocab_ids.append( pool.map(partial(db.add_features, run_id=run_id), vocab)) pool.terminate() del vocab vocab_ids = vocab_ids[0] django.db.connections.close_all() topic_ids = db.add_topics(k, run_id) for t in topic_ids: top = Topic.objects.get(pk=t) top.year = timestep top.save() gc.collect() # Fit the NMF model print("Fitting the NMF model with tf-idf features, " "n_samples=%d and max_features=%d..." % (n_samples, stat.max_features)) t1 = time() nmf = NMF(n_components=k, random_state=1, alpha=.0001, l1_ratio=.5).fit(tfidf) print("done in %0.3fs." 
% (time() - t1)) print("Adding topicterms to db") ldalambda = find(csr_matrix(nmf.components_)) topics = range(len(ldalambda[0])) tts = [] pool = Pool(processes=8) tts.append( pool.map( partial(db.f_lambda, m=ldalambda, v_ids=vocab_ids, t_ids=topic_ids, run_id=run_id), topics)) pool.terminate() tts = flatten(tts) gc.collect() sys.stdout.flush() django.db.connections.close_all() TopicTerm.objects.bulk_create(tts) print("done in %0.3fs." % (time() - t1)) gamma = find(csr_matrix(nmf.transform(tfidf))) glength = len(gamma[0]) chunk_size = 100000 no_cores = 16 make_t = 0 add_t = 0 ### Go through in chunks for i in range(glength // chunk_size + 1): values_list = [] f = i * chunk_size l = (i + 1) * chunk_size if l > glength: l = glength docs = range(f, l) doc_batches = [] for p in range(no_cores): doc_batches.append([x for x in docs if x % no_cores == p]) pool = Pool(processes=no_cores) make_t0 = time() values_list.append( pool.map( partial(db.f_gamma_batch, gamma=gamma, docsizes=docsizes, docUTset=ids, topic_ids=topic_ids, run_id=run_id), doc_batches)) pool.terminate() make_t += time() - make_t0 django.db.connections.close_all() add_t0 = time() values_list = [item for sublist in values_list for item in sublist] pool = Pool(processes=no_cores) if s.search_object_type == 1: pool.map(db.insert_many_pars, values_list) elif s.search_object_type == 2: pool.map(db.insert_many_utterances, values_list) pool.terminate() add_t += time() - add_t0 gc.collect() sys.stdout.flush() stat.error = stat.error + nmf.reconstruction_err_ stat.errortype = "Frobenius" ## After all the years have been run, update the dtops tops = Topic.objects.filter(run_id=run_id) highest_id = Term.objects.all().order_by('-id').first().id B = np.zeros((tops.count(), highest_id)) #print(tops) wt = 0 for topic in tops: tts = TopicTerm.objects.filter(topic=topic).order_by('-score')[:50] for tt in tts: B[wt, tt.term.id] = tt.score wt += 1 col_sum = np.sum(B, axis=0) vocab_ids = np.flatnonzero(col_sum) # we only want the columns where there are at least some # topic-term values B = B[:, vocab_ids] nmf = NMF(n_components=stat.K, random_state=1, alpha=.1, l1_ratio=.5).fit(B) ## Add dynamic topics dtopics = [] for k in range(stat.K): dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id)) dtopic.save() dtopics.append(dtopic) dtopic_ids = list( DynamicTopic.objects.filter(run_id=run_id).values_list('id', flat=True)) print(dtopic_ids) ################## ## Add the dtopic*term matrix to the db print("Adding topicterms to db") t1 = time() ldalambda = find(csr_matrix(nmf.components_)) topics = range(len(ldalambda[0])) tts = [] pool = Pool(processes=8) tts.append( pool.map( partial(db.f_dlambda, m=ldalambda, v_ids=vocab_ids, t_ids=dtopic_ids, run_id=run_id), topics)) pool.terminate() tts = flatten(tts) gc.collect() sys.stdout.flush() django.db.connections.close_all() DynamicTopicTerm.objects.bulk_create(tts) print("done in %0.3fs." 
% (time() - t1)) ## Add the wtopic*dtopic matrix to the database gamma = nmf.transform(B) for topic in range(len(gamma)): for dtopic in range(len(gamma[topic])): if gamma[topic][dtopic] > 0: tdt = TopicDTopic(topic=tops[topic], dynamictopic_id=dtopic_ids[dtopic], score=gamma[topic][dtopic]) tdt.save() ## Calculate the primary dtopic for each topic for t in tops: try: t.primary_dtopic.add( TopicDTopic.objects.filter( topic=t).order_by('-score').first().dynamictopic) t.save() except: print("saving primary topic not working") pass management.call_command('update_run', run_id) stat.error = stat.error + nmf.reconstruction_err_ stat.errortype = "Frobenius" stat.last_update = timezone.now() stat.runtime = timezone.now() - start_datetime stat.status = 3 # 3 = finished stat.save() totalTime = time() - t0 tm = int(totalTime // 60) ts = int(totalTime - (tm * 60)) print("done! total time: " + str(tm) + " minutes and " + str(ts) + " seconds") print("a maximum of " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) + " MB was used") return 0
def plot_tsne(r_ind,
              tsne_results,
              cats,
              nocatids,
              ax=None,
              verbose=False,
              hdoc=False,
              legend=True,
              sc=None,
              heat_var=None,
              cmapname=None,
              topics=None,
              min_cluster=100,
              psize=1,
              t_thresh=0.8,
              eps=1,
              n_clusters=1,
              doc_sets=None,
              clabel_size=8,
              words_only=False,
              fsize=5,
              adjust=False,
              draw_highlight_points=False,
              dot_legend=True,
              nocat_colour='#F0F0F026',
              nocat_alpha=0.4,
              raster=False,
              extension="png",
              slinewidth=0.1):
    """
    Scatter-plot t-SNE document embeddings coloured by category, with optional
    highlighted documents, heat-map rectangles, and topic / document-set
    cluster labels.
    """
    cs = []
    sizes = []
    xs = []
    ys = []
    if ax is None:
        fig, ax = plt.subplots(dpi=188)
    t0 = time()

    nocatids = np.argwhere(np.isin(r_ind, nocatids))
    if hdoc is not False:
        hdocs = nocatids[np.isin(nocatids, hdoc)]
        ids = nocatids[np.isin(nocatids, hdoc, invert=True)]
    ax.scatter(tsne_results[nocatids, 0],
               tsne_results[nocatids, 1],
               c=nocat_colour,
               s=psize,
               alpha=nocat_alpha,
               linewidth=slinewidth,
               edgecolor='#a39c9c66',
               rasterized=raster)
    # Draw docs to be highlighted separately
    if hdoc is not False:
        ax.scatter(tsne_results[hdocs, 0],
                   tsne_results[hdocs, 1],
                   c='#F0F0F026',
                   s=psize,
                   alpha=1,
                   linewidth=0.5,
                   edgecolor='black',
                   rasterized=raster)

    # split the data and add layer by layer to prevent top layer overwriting all
    splits = 10
    for i in range(splits):
        for c in cats:
            ids = np.array_split(c["dis"], splits)[i]
            if hdoc is not False:
                hdocs = ids[np.isin(ids, hdoc)]
                ids = ids[np.isin(ids, hdoc, invert=True)]
            if len(nocatids) > len(r_ind) / 2:
                a = 1
            else:
                a = 0.7
            ax.scatter(
                tsne_results[ids, 0],
                tsne_results[ids, 1],
                #zorder = [math.ceil(random.random()*1) for i in range(len(ids))],
                c=c['color'],
                s=psize,
                alpha=a,
                linewidth=slinewidth,
                edgecolor='#a39c9c66',
                rasterized=raster)
            if hdoc is not False:
                ax.scatter(tsne_results[hdocs, 0],
                           tsne_results[hdocs, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)

    ax.grid(linestyle='-')
    if verbose:
        print("calculating points took %0.3fs." % (time() - t0))

    l = ax.get_xlim()[0]
    t = ax.get_ylim()[1]
    yextent = ax.get_ylim()[1] - ax.get_ylim()[0]
    ysp = yextent * 0.04

    draw_leg = False
    if legend:
        for i, c in enumerate(cats):
            prop = len(c['docs']) / len(r_ind)
            label = "{} {:.1%}".format(c['name'], prop)
            if extension == "pdf":
                label = label.replace("%", r"\%")
            if dot_legend:
                if prop > 0.001:
                    draw_leg = True
                    ax.scatter(
                        [], [],
                        c=c['color'],
                        label=label,
                        linewidth=slinewidth,
                        edgecolor='#a39c9c66',
                    )
            else:
                if c['color'] == "#000000":
                    tcolor = "white"
                else:
                    tcolor = "black"
                ax.text(l * 0.95,
                        t - ysp - i * ysp,
                        label,
                        fontsize=fsize,
                        color=tcolor,
                        bbox={
                            'facecolor': c['color'],
                            'pad': 3
                        })
        if dot_legend and draw_leg:
            ax.legend()

    if heat_var:
        cmap = cm.get_cmap(cmapname)
        ys = [
            getattr(cs, heat_var) for cs in sc.objects
            if getattr(cs, heat_var) is not None
        ]
        X = np.interp(ys, (np.min(ys), np.max(ys)), (0, +1))
        f = interpolate.interp1d(ys, X)
        for cs in sc.objects:
            if getattr(cs, heat_var):
                col = cmap(f(getattr(cs, heat_var)).max())
                rect = patches.Rectangle((cs.x1, cs.y1),
                                         cs.x2 - cs.x1,
                                         cs.y2 - cs.y1,
                                         linewidth=1,
                                         edgecolor='r',
                                         facecolor=col,
                                         alpha=0.3)
                ax.add_patch(rect)

    if topics:
        texts = []
        for t in topics:
            if t.run_id.method == "DT":
                atdocscores = Doc.objects.filter(
                    docdynamictopic__topic=t, ).values_list(
                        'docdynamictopic__score', flat=True)
                thresh = np.quantile(atdocscores, t_thresh)
                tdocs = Doc.objects.filter(
                    docdynamictopic__topic=t,
                    docdynamictopic__score__gt=thresh).order_by(
                        '-docdynamictopic__score').values_list('id', flat=True)
            else:
                atdocscores = Doc.objects.filter(
                    doctopic__topic=t, ).values_list('doctopic__score',
                                                     flat=True)
                thresh = np.quantile(atdocscores, t_thresh)
                tdocs = Doc.objects.filter(
                    doctopic__topic=t,
                    doctopic__score__gt=thresh).order_by(
                        '-doctopic__score').values_list('id', flat=True)

            highlight_docs = np.argwhere(np.isin(r_ind, tdocs))[:, 0]
            if len(highlight_docs) == 0:
                continue
            points = tsne_results[highlight_docs]
            texts.append(
                cluster_label_points(t.title, points, ax, eps, min_cluster,
                                     n_clusters, clabel_size, words_only))
            if draw_highlight_points:
                # note: reuses the colour of the last category iterated above
                ax.scatter(points[:, 0],
                           points[:, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)
        if adjust:
            texts = list(flatten(texts))
            adjust_text(texts,
                        ax=ax,
                        arrowprops=dict(arrowstyle="->", color='None', lw=0.5))

    if doc_sets:
        texts = []
        for d in doc_sets:
            highlight_docs = np.argwhere(np.isin(r_ind, d['docs']))[:, 0]
            points = tsne_results[highlight_docs]
            texts.append(
                cluster_label_points(d['title'], points, ax, eps, min_cluster,
                                     n_clusters, clabel_size, words_only))
            if draw_highlight_points:
                # note: reuses the colour of the last category iterated above
                ax.scatter(points[:, 0],
                           points[:, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)
        if adjust:
            texts = list(flatten(texts))
            adjust_text(texts,
                        ax=ax,
                        arrowprops=dict(arrowstyle="->", color='None', lw=0.5))

    if topics:
        return texts
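# ---------------------------------------------------------------------------
# Hypothetical minimal call of plot_tsne (assumed to be in scope from above)
# with synthetic data: no Django objects, topics, heat map or highlighted
# docs. 'dis' holds row indices into tsne_results; 'docs' is only used for the
# legend proportions here. Colours and category names are made up.
import numpy as np
import matplotlib.pyplot as plt

n = 600
tsne_results = np.random.RandomState(0).randn(n, 2)
r_ind = np.arange(n)                      # document ids in t-SNE row order

cats = [
    {"name": "Category A", "color": "#1f77b4",
     "docs": r_ind[:250], "dis": np.arange(250)},
    {"name": "Category B", "color": "#ff7f0e",
     "docs": r_ind[250:450], "dis": np.arange(250, 450)},
]
nocatids = r_ind[450:]                    # documents belonging to no category

fig, ax = plt.subplots(dpi=150)
plot_tsne(r_ind, tsne_results, cats, nocatids, ax=ax)
plt.show()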
def resnet4_D(nn, imgsz,
              channels,    # 1: gray-scale, 3: color
              norm_type,   # 'bn', 'none'
              requires_grad,
              do_bias=True):
    depth = 4
    ker = 3
    padding = (ker-1)//2
    count = 1

    def gen_group0_params(no):
        ni = channels
        return {'block0': {
            'conv0': conv2d_params(ni, no, ker, do_bias),
            'conv1': conv2d_params(no, no, ker, do_bias),
            'convdim': utils.conv_params(ni, no, 1),
            'bn': utils.bnparams(no) if norm_type == 'bn' else None
        }}

    def gen_resnet_D_block_params(ni, no, k, norm_type, do_bias):
        return {
            'conv0': conv2d_params(ni, ni, k, do_bias),
            'conv1': conv2d_params(ni, no, k, do_bias),
            'convdim': utils.conv_params(ni, no, 1),
            'bn': utils.bnparams(no) if norm_type == 'bn' else None
        }

    def gen_group_params(ni, no):
        return {'block%d' % i: gen_resnet_D_block_params(ni if i == 0 else no, no, ker, norm_type, do_bias)
                for i in range(count)}

    sz = imgsz // (2**depth)
    flat_params = utils.cast(utils.flatten({
        'group0': gen_group0_params(nn),
        'group1': gen_group_params(nn, nn*2),
        'group2': gen_group_params(nn*2, nn*4),
        'group3': gen_group_params(nn*4, nn*8),
        'fc': utils.linear_params(sz*sz*nn*8, 1),
    }))

    if requires_grad:
        utils.set_requires_grad_except_bn_(flat_params)

    def block(x, params, base, mode, do_downsample, is_first):
        o = x
        if not is_first:
            o = F.relu(o, inplace=True)
        o = F.conv2d(o, params[base+'.conv0.w'], params.get(base+'.conv0.b'), padding=padding)
        o = F.relu(o, inplace=True)
        o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'), padding=padding)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn', mode)
        if do_downsample:
            o = F.avg_pool2d(o, 2)
            x = F.avg_pool2d(x, 2)
        if base + '.convdim' in params:
            return o + F.conv2d(x, params[base + '.convdim'])
        else:
            return o + x

    def group(o, params, base, mode, do_downsample, is_first=False):
        for i in range(count):
            o = block(o, params, '%s.block%d' % (base, i), mode,
                      do_downsample=(do_downsample and i == count-1),
                      is_first=(is_first and i == 0))
        return o

    def f(input, params, mode):
        o = group(input, params, 'group0', mode, do_downsample=True, is_first=True)
        o = group(o, params, 'group1', mode, do_downsample=True)
        o = group(o, params, 'group2', mode, do_downsample=True)
        o = group(o, params, 'group3', mode, do_downsample=True)
        o = F.relu(o, inplace=True)
        o = o.view(o.size(0), -1)
        o = F.linear(o, params['fc.weight'], params['fc.bias'])
        return o

    return f, flat_params
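# ---------------------------------------------------------------------------
# Rough nn.Module re-expression of the residual block that resnet4_D evaluates
# functionally (pre-activation ReLU -> conv -> ReLU -> conv, optional BN,
# average-pool downsampling on both branches, 1x1 shortcut projection). This
# is an illustrative sketch for readers used to module-based PyTorch, not the
# repo's API; channel sizes in the demo call are made up.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResDBlock(nn.Module):
    def __init__(self, ni, no, use_bn=False, is_first=False, do_downsample=True):
        super().__init__()
        self.is_first = is_first
        self.do_downsample = do_downsample
        self.conv0 = nn.Conv2d(ni, ni, 3, padding=1)
        self.conv1 = nn.Conv2d(ni, no, 3, padding=1)
        self.convdim = nn.Conv2d(ni, no, 1)        # 1x1 shortcut projection
        self.bn = nn.BatchNorm2d(no) if use_bn else None

    def forward(self, x):
        o = x if self.is_first else F.relu(x)
        o = self.conv0(o)
        o = F.relu(o)
        o = self.conv1(o)
        if self.bn is not None:
            o = self.bn(o)
        if self.do_downsample:
            o = F.avg_pool2d(o, 2)
            x = F.avg_pool2d(x, 2)
        return o + self.convdim(x)

# quick shape check: 64x64 feature maps, 32 -> 64 channels, halved spatially
blk = ResDBlock(32, 64)
print(blk(torch.randn(2, 32, 64, 64)).shape)   # torch.Size([2, 64, 32, 32])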
def resnet4_G(input_dim, n0g, imgsz, channels,
              norm_type,   # 'bn', 'none'
              requires_grad,
              do_bias=True):
    depth = 4
    ker = 3
    padding = (ker-1)//2
    count = 1

    def gen_resnet_G_block_params(ni, no, k, norm_type, do_bias):
        return {
            'conv0': conv2d_params(ni, no, k, do_bias),
            'conv1': conv2d_params(no, no, k, do_bias),
            'convdim': utils.conv_params(ni, no, 1),
            'bn': utils.bnparams(no) if norm_type == 'bn' else None
        }

    def gen_group_params(ni, no):
        return {'block%d' % i: gen_resnet_G_block_params(ni if i == 0 else no, no, ker, norm_type, do_bias)
                for i in range(count)}

    nn = n0g * (2**(depth-1))
    sz = imgsz // (2**depth)

    flat_params = utils.cast(utils.flatten({
        'proj': utils.linear_params(input_dim, nn*sz*sz),
        'group0': gen_group_params(nn, nn//2),
        'group1': gen_group_params(nn//2, nn//4),
        'group2': gen_group_params(nn//4, nn//8),
        'group3': gen_group_params(nn//8, nn//8),
        'last_conv': conv2d_params(nn//8, channels, ker, do_bias),
    }))

    if requires_grad:
        utils.set_requires_grad_except_bn_(flat_params)

    def block(x, params, base, mode, do_upsample):
        o = F.relu(x, inplace=True)
        if do_upsample:
            o = F.interpolate(o, scale_factor=2, mode='nearest')
        o = F.conv2d(o, params[base+'.conv0.w'], params.get(base+'.conv0.b'), padding=padding)
        o = F.relu(o, inplace=True)
        o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'), padding=padding)
        if norm_type == 'bn':
            o = utils.batch_norm(o, params, base + '.bn', mode)
        xo = F.conv2d(x, params[base + '.convdim'])
        if do_upsample:
            return o + F.interpolate(xo, scale_factor=2, mode='nearest')
        else:
            return o + xo

    def group(o, params, base, mode, do_upsample):
        for i in range(count):
            o = block(o, params, '%s.block%d' % (base, i), mode,
                      do_upsample if i == 0 else False)
        return o

    def show_shape(o, msg=''):
        print(o.size(), msg)

    def f(input, params, mode):
        o = F.linear(input, params['proj.weight'], params['proj.bias'])
        o = o.view(input.size(0), nn, sz, sz)
        o = group(o, params, 'group0', mode, do_upsample=True)
        o = group(o, params, 'group1', mode, do_upsample=True)
        o = group(o, params, 'group2', mode, do_upsample=True)
        o = group(o, params, 'group3', mode, do_upsample=True)
        o = F.relu(o, inplace=True)
        o = F.conv2d(o, params['last_conv.w'], params.get('last_conv.b'), padding=padding)
        o = torch.tanh(o)
        return o

    return f, flat_params
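# ---------------------------------------------------------------------------
# Quick worked example (illustrative values, not from the original code) of
# resnet4_G's size bookkeeping: with imgsz=64, n0g=64 and depth=4 the latent
# vector is projected to nn = 64 * 2**3 = 512 channels at a 4x4 base
# (sz = 64 // 16), four x2 nearest-neighbour upsamples take 4 -> 8 -> 16 ->
# 32 -> 64, and channels halve 512 -> 256 -> 128 -> 64 -> 64 before last_conv.
imgsz, n0g, depth = 64, 64, 4
sz = imgsz // (2 ** depth)
nn_ch = n0g * (2 ** (depth - 1))
spatial = [sz * 2 ** i for i in range(depth + 1)]
print(nn_ch, spatial)   # 512 [4, 8, 16, 32, 64]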
def do_nmf(run_id, no_processes=16):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()

    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

    n_samples = stat.max_iter

    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################'
          '\n## Topic modeling (method: {}, library: {}) on query {} with {} '
          'documents and {} topics (run_id: {})\n'.format(
              stat.method, stat.lda_library, qid, docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(docs, stoplist,
                                                    stat.fulltext,
                                                    stat.citations)

    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer
        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        punct = set(string.punctuation)
        from nltk.corpus import wordnet as wn
        stopwords = set(sw.words('english'))

        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)

        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex=r'\w+[\-\ ]').values('kw__text').annotate(
                n=Count('pk')).filter(n__gt=len(abstracts) //
                                      200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set([x['kw__text'].replace('-', ' ').split()[0]
                     for x in kws]) - stopwords

        def fancy_tokenize(X):
            common_words = set([x.lower() for x in X.split()]) & kw_ws
            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = r"({}\W*\w*)".format(w)
                wlist = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(wlist) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()
    #######################################

    #############################################
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features ...")
    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    t0 = time()
    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer
    print("done in %0.3fs." % (time() - t0))
    stat.tfidf_time = time() - t0
    stat.save()

    if citations is not False:
        tfidf = tfidf.multiply(scaled_citations)

    del abstracts
    gc.collect()

    if stat.db:
        vocab = vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=no_processes)
        vocab_ids.append(pool.map(partial(add_features, run_id=run_id),
                                  vocab))
        pool.terminate()
        #del vocab
        vocab_ids = vocab_ids[0]

        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)
        gc.collect()

    # Fit the NMF model
    print("Fitting the model with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()

    if stat.method == "NM":
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=alpha,
                    l1_ratio=.1,
                    verbose=True,
                    init='nndsvd',
                    max_iter=n_samples).fit(tfidf)
        dtm = csr_matrix(model.transform(tfidf))
        components = csr_matrix(model.components_)
    else:
        if stat.lda_library == RunStats.LDA_LIB:
            model = lda.LDA(
                n_topics=K,
                alpha=stat.alpha,
                eta=stat.alpha,
                n_iter=stat.max_iter * 10,
            ).fit(tfidf)
            dtm = model.doc_topic_
            components = csr_matrix(model.components_)
        elif stat.lda_library == RunStats.WARP:
            # Export warp lda
            try:
                warp_path = settings.WARP_LDA_PATH
                os.chdir(warp_path)
            except:
                print(
                    "warplda is not installed, or its path is not defined in settings, exiting...."
                )
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()

            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)

            components = components.todense()
            for k in range(components.shape[0]):
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + stat.K * stat.beta)
            components = csr_matrix(components)

            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)
            dtm = csr_matrix(theta)
        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2).fit(tfidf)
            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topics terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)
        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()
        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                            docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1

    stat.empty_topics = em
    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_

    stat.last_update = timezone.now()
    stat.status = 3
    stat.save()

    if stat.db:
        term_rankings = []
        topics = Topic.objects.filter(run_id=run_id)
        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)
        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()
    if stat.db:
        management.call_command('update_run', run_id)
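# ---------------------------------------------------------------------------
# Stripped of the Django bookkeeping, multiprocessing and warplda branches,
# the modelling core of do_nmf's NMF path is tf-idf plus scikit-learn NMF.
# A minimal, self-contained sketch with a toy corpus and illustrative
# parameters; note that newer scikit-learn renames get_feature_names (used
# above) to get_feature_names_out, which this sketch uses.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = [
    "solar power and wind power for electricity generation",
    "wind turbines and offshore wind farms",
    "coal plants and carbon capture for electricity",
    "battery storage for solar and wind electricity",
]
K = 2

tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf = tfidf_vectorizer.fit_transform(docs)

model = NMF(n_components=K, random_state=1, init="nndsvd").fit(tfidf)
dtm = model.transform(tfidf)          # doc-topic matrix (gamma above)
components = model.components_        # topic-term matrix (lambda above)

vocab = tfidf_vectorizer.get_feature_names_out()
for k, comp in enumerate(components):
    top = comp.argsort()[::-1][:5]
    print("topic", k, [vocab[i] for i in top])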