Example no. 1
def eval_recall5(imgs_enc, caps_enc):
    imgs_enc = np.vstack(flatten(imgs_enc))
    caps_enc = np.vstack(flatten(caps_enc))

    res = avg_recall5(imgs_enc, caps_enc)

    return res
Example no. 2
    def test_flatten(self):
        ref = ['foo', 'bar', '123']
        result = utils.flatten(ref)
        self.assertEqual(ref, result)

        result = utils.flatten(['foo', ['bar', '123']])
        self.assertEqual(ref, result)

        result = utils.flatten([['foo'], ['bar', '123']])
        self.assertEqual(ref, result)
Example no. 3
  def test_flatten(self):
    ref = ['foo', 'bar', '123']
    result = utils.flatten(ref)
    self.assertEqual(ref, result)

    result = utils.flatten(['foo', ['bar', '123']])
    self.assertEqual(ref, result)

    result = utils.flatten([['foo'], ['bar', '123']])
    self.assertEqual(ref, result)
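The tests in the two examples above pin down the behaviour expected of utils.flatten: an already-flat list comes back unchanged, and one level of list nesting is removed. A minimal sketch consistent with those assertions (hypothetical; the actual utils.flatten in each of these projects may differ, e.g. by flattening recursively or accepting arbitrary iterables):

def flatten(items):
    # Flatten one level of nesting; non-list items (e.g. strings) are kept as-is.
    result = []
    for item in items:
        if isinstance(item, (list, tuple)):
            result.extend(item)
        else:
            result.append(item)
    return result

assert flatten([['foo'], ['bar', '123']]) == ['foo', 'bar', '123']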
Example no. 4
def computeEvolutionRoc(temporalListLabels, predictions, classes = None, percentage = 0.001):
    """
        Computes the evolution of the AUC over time

        Arguments:
            temporalListLabels {List of (time, labels)*} -- Ground truth labels
            predictions {Dict / List of labels} -- Predictions (same format as the labels in temporalListLabels)
            classes {Dict} -- Classes to consider to plot (key: Name to display, Value: label)
            percentage {float} -- Evaluate the TPR at this given FPR and the TNR at this given FNR
    """
    aucs = {}
    for time, labels in temporalListLabels:
        pred_time, labels_time = selection(predictions, labels, classes)
        pred_time, labels_time = flatten(pred_time, labels_time)
        fpr, tpr, _ = roc_curve(labels_time, pred_time)
        fnr, tnr = (1 - tpr)[::-1], (1 - fpr)[::-1]
        auc_time = auc(fpr, tpr)
        wilson_tpr = 1.96 * np.sqrt(tpr * (1 - tpr)/len(predictions))
        wilson_tnr = 1.96 * np.sqrt(tnr * (1 - tnr)/len(predictions))

        aucs[time] = {
                        "auc": auc_time, 
                        "lower": auc(fpr, tpr - wilson_tpr), 
                        "upper": auc(fpr, tpr + wilson_tpr), 

                        "tpr": np.interp(percentage, fpr, tpr),
                        "tpr_wilson" : np.interp(percentage, fpr, wilson_tpr),

                        "tnr": np.interp(percentage, fnr, tnr),
                        "tnr_wilson" : np.interp(percentage, fnr, wilson_tnr),
                     }
                     
    return pd.DataFrame.from_dict(aucs, orient = "index")
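A note on the confidence bands above: the quantity stored as wilson_tpr is the 95% normal-approximation (Wald) half-width for a proportion, z * sqrt(p * (1 - p) / n) with z = 1.96. A minimal standalone check of that formula (numpy assumed; the helper name is ours, not from the source):

import numpy as np

def wald_halfwidth(p, n, z=1.96):
    # 95% normal-approximation half-width for a proportion p estimated from n samples.
    return z * np.sqrt(p * (1 - p) / n)

print(wald_halfwidth(0.8, 1000))  # ~0.0248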
Example no. 5
def fcn_G(input_dim, nn, imgsz, channels, requires_grad, depth=2):
   def gen_block_params(ni, no):
      return {'fc': utils.linear_params(ni, no),}

   def gen_group_params(ni, no, count):
      return {'block%d' % i: gen_block_params(ni if i == 0 else no, no) for i in range(count)}

   flat_params = utils.cast(utils.flatten({
        'group0': gen_group_params(input_dim, nn, depth),
        'last_proj': utils.linear_params(nn, imgsz*imgsz*channels),
   }))

   if requires_grad:
      utils.set_requires_grad_except_bn_(flat_params)

   def block(x, params, base, mode):
      return F.relu(F.linear(x, params[base+'.fc.weight'], params[base+'.fc.bias']), inplace=True)

   def group(o, params, base, mode):
      for i in range(depth):
         o = block(o, params, '%s.block%d' % (base,i), mode)
      return o

   def f(input, params, mode):
      o = group(input, params, 'group0', mode)
      o = F.linear(o, params['last_proj.weight'], params['last_proj.bias'])
      o = torch.tanh(o)
#      o = o.view(o.size(0), channels, imgsz, imgsz)
      o = o.reshape(o.size(0), channels, imgsz, imgsz)      
      return o

   return f, flat_params
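fcn_G returns a functional generator f together with its flattened parameter dict, so the caller applies the parameters explicitly on every forward pass. A hypothetical usage sketch (batch size, latent dimension, and the torch import are assumptions, not taken from the source):

import torch

# Build a fully-connected generator for 32x32 RGB images from a 100-d latent code.
f_G, params_G = fcn_G(input_dim=100, nn=256, imgsz=32, channels=3, requires_grad=True)
z = torch.randn(8, 100)
fake = f_G(z, params_G, mode=True)   # mode is accepted but unused by the ReLU blocks
print(fake.shape)                    # expected: torch.Size([8, 3, 32, 32]), values in [-1, 1]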
Example no. 6
def graph_loss_tols(graph, i, o, filename=None):
    """
        Gets all node-loss configurations that pathfinding on the graph can tolerate.
        Exports the output to file, if provided.
    """
    print('\n           graph_loss_tols         \n')
    nodes = set(graph.nodes()) - set([i, o])
    all_tols = [[]]
    loss_tol_nodes = set(nodes)
    for r in range(1, len(nodes) + 1):
        loss_configs = list(it.combinations(loss_tol_nodes, r))
        loss_tol_nodes = set()
        while loss_configs:
            loss_config = loss_configs.pop()
            lost_nodes = \
                set(loss_config) | \
                set(flatten(map(graph.neighbors, loss_config)))
            loss_graph = deepcopy(graph)
            loss_graph.remove_nodes_from(lost_nodes)
            if i in loss_graph.nodes() and o in loss_graph.nodes() and \
                    nx.has_path(loss_graph, i, o):
                loss_tol_nodes |= set(loss_config)
                all_tols.append(list(loss_config))
    if filename:
        with open(filename, 'w') as fp:
            json.dump(all_tols, fp)
    return all_tols
Example no. 7
def histPlot(predictions, truth, classes = None, label = "Model", newFigure = None, splitPosNeg = False, kde = False):
    """
        Computes the histogram of binary predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot {Default None ie {"+": 1, "-": 0}}

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            splitPosNeg {bool} -- Split between positive and negative (default: {False})
            kde {bool} -- Computes the kde of the histogram (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    bins = np.linspace(0, 1, 20)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Predicted Probability')
        plt.ylabel('Frequency')
        plt.title('Histogram Probabilities')

    if splitPosNeg:
        sns.distplot(predictions[truth == 1], label=label + " Positive", kde = kde, bins = bins)
        sns.distplot(predictions[truth == 0], label=label + " Negative", kde = kde, bins = bins)
    else:
        sns.distplot(predictions, label=label, kde = kde, bins = bins)
Example no. 8
def dcganx_D(nn0, imgsz,
             channels,    # 1: gray-scale, 3: color
             norm_type,   # 'bn', 'none'
             requires_grad, 
             depth=3, leaky_slope=0.2, nodemul=2, do_bias=True):
              
   ker=5; padding=2
   
   def gen_block_params(ni, no, k):
      return {
         'conv0': conv2d_params(ni, no, k, do_bias), 
         'conv1': conv2d_params(no, no, 1, do_bias), 
         'bn0': utils.bnparams(no) if norm_type == 'bn' else None, 
         'bn1': utils.bnparams(no) if norm_type == 'bn' else None
      }

   def gen_group_params(ni, no, count):
       return {'block%d' % i: gen_block_params(ni if i == 0 else no, no, ker) for i in range(count)}

   count = 1
   sz = imgsz // (2**depth)
   nn = nn0
   p = { 'conv0': conv2d_params(channels, nn0, ker, do_bias) }
   for d in range(depth-1):
      p['group%d'%d] = gen_group_params(nn, nn*nodemul, count)
      nn = nn*nodemul
   p['fc'] = utils.linear_params(sz*sz*nn, 1)
   flat_params = utils.cast(utils.flatten(p))

   if requires_grad:
      utils.set_requires_grad_except_bn_(flat_params)

   def block(x, params, base, mode, stride):
      o = F.conv2d(x, params[base+'.conv0.w'], params.get(base+'.conv0.b'), stride=stride, padding=padding)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn0', mode)
      o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
      o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'), stride=1, padding=0)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn1', mode)
      o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
      return o

   def group(o, params, base, mode, stride=2):
      n = 1
      for i in range(n):
         o = block(o, params, '%s.block%d' % (base,i), mode, stride if i == 0 else 1)
      return o

   def f(input, params, mode):
      o = F.conv2d(input, params['conv0.w'], params.get('conv0.b'), stride=2, padding=padding)
      o = F.leaky_relu(o, negative_slope=leaky_slope, inplace=True)
      for d in range(depth-1):
         o = group(o, params, 'group%d'%d, mode)
      o = o.view(o.size(0), -1)
      o = F.linear(o, params['fc.weight'], params['fc.bias'])
      return o

   return f, flat_params
Example no. 9
def import_loss_tols(in_file, filename=None):
    """ Imports and formats loss tols for use. Exports to file, if provided """
    with open(in_file, 'r') as fp:
        data = json.load(fp)
    max_tols = flatten(value for value in data.values())
    all_tols = get_all_loss_tols(max_tols)
    if filename:
        with open(filename, 'w') as fp:
            json.dump(all_tols, fp)
    return all_tols
Example no. 10
def most_common_mnt(avail_pats, qubit_key, measured):
    """ Picks measurement that occurs most in the available patterns """
    all_mnts = dict(Counter((q, mnt)
                            for mnt_pat in flatten(avail_pats.values())
                            for q, mnt in zip(qubit_key, mnt_pat)
                            if mnt and q not in measured))
    max_c = max(c for c in all_mnts.values())
    best_mnts = [mnt for mnt, c in all_mnts.items() if c == max_c]
    shuffle(best_mnts)
    return best_mnts.pop()
Example no. 11
def rocPlot(predictions, truth, classes = None, label = "Model", newFigure = None, reverse = False, percentage = None):
    """
        Computes the ROC with confidence bounds for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot {Default None ie {"+": 1, "-": 0}}

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Plot the reverse ROC, useful for analyzing the TNR (default: {False})
            percentage {float} -- FPR (or FNR if reverse) at which to report the rate (default: {None} - smallest achievable value)
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    global_fpr, global_tpr, _ = roc_curve(truth, predictions)
    if reverse:
        x, y = 1 - global_tpr, 1 - global_fpr # FNR, TNR
        x, y = x[::-1], y[::-1]
        minx = 1. / np.sum(truth == 1)
        if percentage is None:
            percentage = minx
        str_print = "TNR @{:.2f}% FNR : {:.2f}".format(percentage*100, np.interp(percentage, x, y))
    else:
        x, y = global_fpr, global_tpr
        minx = 1. / np.sum(truth == 0)
        if percentage is None:
            percentage = minx
        str_print = "TPR @{:.2f}% FPR : {:.2f}".format(percentage*100, np.interp(percentage, x, y))

            
    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), 'k--', label="Random")
        if reverse:
            plt.xlabel('False negative rate')
            plt.ylabel('True negative rate')
            plt.title('Reverse ROC curve')
        else:
            plt.xlabel('False positive rate')
            plt.ylabel('True positive rate')
            plt.title('ROC curve')
            
    newx = np.linspace(minx, 1, 1000)
    y = np.interp(newx, x, y)
    wilson = 1.96 * np.sqrt(y * (1 - y)/len(predictions))
    print(str_print + " +/- {:.2f}".format(np.interp(0.01, newx, wilson)))
    upper = np.minimum(y + wilson, 1)
    lower = np.maximum(y - wilson, 0)
    plRoc = plt.plot(newx, y, label=label + " ({:.2f} +/- {:.2f})".format(aucCompute(predictions, truth, classes), (auc(newx, upper) - auc(newx, lower))/2.), ls = '--' if "train" in label.lower() else '-')
    plt.fill_between(newx, lower, upper, color=plRoc[0].get_color(), alpha=.2)
Example no. 12
  def describe_instances(self, parameters):
    """
    Execute the ec2-describe-instances command and return a summary of the
    already running EC2 instances. (Also see documentation for the BaseAgent
    class)

    Args:
      parameters  A dictionary containing the 'keyname' parameter

    Returns:
      A tuple of the form (public_ips, private_ips, instances) where each
      member is a list.
    """
    keyname = parameters[self.PARAM_KEYNAME]
    describe_instances = utils.shell(self.prefix + '-describe-instances 2>&1')
    utils.log('describe-instances says {0}'.format(describe_instances))
    fqdn_regex = re.compile('\s+({0})\s+({0})\s+running\s+{1}\s'.format(self.FQDN_REGEX, keyname))
    instance_regex = re.compile('INSTANCE\s+(i-\w+)')
    all_ip_addresses = utils.flatten(fqdn_regex.findall(describe_instances))
    instances = utils.flatten(instance_regex.findall(describe_instances))
    public_ips, private_ips = self.get_ip_addresses(all_ip_addresses)
    return public_ips, private_ips, instances
Example no. 13
def get_per_node_loss_tol(all_tols, filename=None):
    """
        For each qubit in the state, calculates the number of measurement
        patterns that can tolerate its loss.
    """
    tol_counts = Counter(flatten(all_tols))
    tol_counts = [[n, count] for n, count in tol_counts.items()]
    if filename:
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['node', 'tol_count'])
            writer.writerows(tol_counts)
    return tol_counts
Example no. 14
    def format_dataframe(self, data_dictionary):
        """
        Method to format the response dictionary from a requested module.
        :param data_dictionary: The dictionary data from the requested module.
        :type data_dictionary: dict
        :return: A formatted dataframe containing the data.
        :rtype: pd.DataFrame
        """

        # Checks to see if there are any dictionaries or lists within the data that need to be flattened.
        if isinstance(data_dictionary, list):
            module = [flatten(data) for data in data_dictionary]
            module = pd.DataFrame(module)
        else:
            module = flatten(data_dictionary)
            module = pd.DataFrame([module])

        # Due to the way Yahoo Finance API returns numeric types, the raw integer value is preferred. Therefore, any
        # values with the suffix longfmt (long format) or fmt (format) are removed.
        module_columns = [
            column for column in module.columns
            if not ('.fmt' in column or '.longFmt' in column)
        ]

        # Get a new dataframe.
        module = module[module_columns]

        # Format the headers of the column to match PEP8 standards.
        new_columns_dict = {
            col: self.reader.pep_pattern.sub('_',
                                             col.split('.')[0]).lower()
            for col in module.columns
        }
        module.rename(columns=new_columns_dict, inplace=True)

        return module
Example no. 15
def calibrationPlot(predictions,
                    truth,
                    classes=None,
                    label="Model",
                    newFigure=None,
                    n_bins=5):
    """
        Computes the calibration curve for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot {Default None ie {"+": 1, "-": 0}}

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            n_bins {int} -- Number of bins for the calibration (default: {5})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    predictions = ((predictions - predictions.min()) /
                   (predictions.max() - predictions.min())).flatten()
    fraction_of_positives, mean_predicted_value = calibration_curve(
        truth, predictions, n_bins=n_bins)
    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    binids = np.digitize(predictions, bins) - 1
    bin_sums = np.bincount(binids, minlength=len(bins))
    bin_sums = bin_sums[bin_sums != 0] * 500 / np.sum(bin_sums)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Mean Predicted Value')
        plt.ylabel('Fraction Positive')
        plt.title('Calibration')

    p = plt.plot(mean_predicted_value,
                 fraction_of_positives,
                 alpha=0.5,
                 ls=':')
    plt.scatter(mean_predicted_value,
                fraction_of_positives,
                s=bin_sums,
                label=label +
                " ({:.2f})".format(brier_score_loss(truth, predictions)),
                color=p[0].get_color(),
                alpha=0.5)
Example no. 16
def cutnpaste(oracle):
    # find how many filler bytes are needed to align the controlled input to a block boundary
    pad_len = 0
    for pad in range(32, 100):
        oracle_chunks = list(chunks(oracle("A" * pad), 16))
        if oracle_chunks[1] == oracle_chunks[2]:
            pad_len = pad % 16
            break
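    # "admin" followed by 11 bytes of 0x0b ("\v") fills a 16-byte block with valid PKCS#7 padding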
    payload = "A" * (pad_len) + "admin" + "\v" * 11
    payload_chunk = list(chunks(oracle(payload)))[1]

    cut_payload = "A" * (3 + pad_len)
    cut_chunks = list(chunks(oracle(cut_payload)))[:-1]
    cut_chunks.append(payload_chunk)

    print(decrypt(flatten(cut_chunks)))
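The snippet relies on a chunks helper that is not shown; given the calls both with and without an explicit size, a plausible sketch (hypothetical, block size assumed to default to 16):

def chunks(data, size=16):
    # Yield consecutive fixed-size blocks of data (the last block may be shorter).
    for i in range(0, len(data), size):
        yield data[i:i + size]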
Example no. 17
def averagePrecisionRecallCompute(predictions, truth, classes=None):
    """
        Computes the average precision of the given predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth

        Keyword Arguments:
            classes {Dict "+":int, "-":int} -- Classes to consider to plot {Default None ie {"+": 1, "-": 0}}

        Returns:
            float -- Estimation by pooling of the average precision
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    return average_precision_score(truth, predictions)
Example no. 18
 def test_measurement_patterns_teleport(self):
     """ Finds measurement patterns and tests they teleport the state """
     nodes = 8
     output = nodes - 1
     for i in tqdm(range(100)):
         prob_edge = random.uniform(0.2, 0.5) / 2
         disablePrint()
         psi = RandomGNPGraphChannel(nodes,
                                     prob_edge,
                                     output=output,
                                     use_gpu=True)
         psi.update_inputs_and_outputs()
         enablePrint()
         mnt_patterns, qubit_key = psi.get_mnt_patterns()
         mnt_pattern = random.choice(flatten(mnt_patterns.values()))
         mnt_pattern = [(qubit, basis)
                        for qubit, basis in zip(qubit_key, mnt_pattern)
                        if basis]
         random.shuffle(mnt_pattern)
         for qubit, basis in mnt_pattern:
             try:
                 psi.pauli_measurement(qubit, basis, forget=False)
             except Exception:
                 print "Measurement failed"
                 print psi.edges()
                 print mnt_pattern
                 sys.exit()
             try:
                 all_nt = psi._test_all_non_trivial_combos_found(
                     print_stabs=True, join=False, verbose=False)
             except Exception:
                 print "Measurement failed"
                 print psi.edges()
                 print mnt_pattern
                 pprint(psi.gen_combos)
                 psi._print_stabs()
                 sys.exit()
             stab_combos_correct = psi._test_combo_stabs_correct()
             self.assertTrue(all_nt)
             self.assertTrue(stab_combos_correct)
         self.assertEqual(psi._support(psi.X_op), [output])
         self.assertEqual(psi._support(psi.Z_op), [output])
         self.assertTrue(anticommute(psi.X_op, psi.Z_op))
Example no. 19
def handle_underscores(suffix, text_encoder, prefix=False):
    encoder = text_encoder.encoder
    if prefix:
        tok = "___"
    else:
        tok = find_underscore_length(suffix)

    suffix_parts = [i.strip() for i in suffix.split("{}".format(tok))]
    to_flatten = []
    for i, part in enumerate(suffix_parts):
        if part:
            to_flatten.append(text_encoder.encode([part], verbose=False)[0])

            if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
                to_flatten.append([encoder["<blank>"]])
        else:
            to_flatten.append([encoder["<blank>"]])

    final_suffix = utils.flatten(to_flatten)

    return final_suffix
Example no. 20
def get_max_weight_efficiencies(psi, max_weight, filename=None, verbose=False):
    """
        Gets loss tolerance of measurement patterns produced with different
        absolute maximum weights.
    """
    data = []
    for w in tqdm(range(1, max_weight + 1)):
        mnt_pats, qubit_key = psi.get_mnt_patterns(
            max_weight=w, rel_weight=True)
        loss_tols = get_loss_tolerance(mnt_pats, qubit_key)
        max_tols = flatten(value for value in loss_tols.values())
        all_tols = get_all_loss_tols(max_tols)
        datum = [w, len(all_tols)]
        if verbose:
            tqdm.write(str(datum))
        data.append(datum)
    if filename:
        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile)
            header = ['max_weight', 'loss_tol_configs']
            writer.writerow(header)
            writer.writerows(data)
Example no. 21
def confusionPlot(predictions, truth, classes, percentage = True):
    """
        Computes the confusion matrix of the given model
        
        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)

    classes_list = np.array(list(classes.keys()))
    confusion = confusion_matrix(truth, predictions, labels=[classes[c] for c in classes_list])
    notNull = confusion.sum(axis = 0) != 0

    if percentage:
        confusion = confusion / confusion.sum(axis = 1, keepdims = True)

    sns.heatmap(confusion[:, notNull], xticklabels = classes_list[notNull], yticklabels = classes_list, annot = True, vmin = 0, vmax = 1 if percentage else None)
    plt.xlabel("Predicted")
    plt.ylabel("Ground truth")
Example no. 22
def precisionRecallPlot(predictions,
                        truth,
                        classes=None,
                        label="Model",
                        newFigure=None,
                        reverse=False,
                        percentage=None):
    """
        Computes the precision-recall curve for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot {Default None ie {"+": 1, "-": 0}}

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Unused in this plot (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    precision, recall, _ = precision_recall_curve(truth, predictions)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Precision')
        plt.ylabel('Recall')
        plt.title('Precision Recall curve')

    plt.plot(precision,
             recall,
             label=label + " ({:.2f})".format(
                 averagePrecisionRecallCompute(predictions, truth, classes)),
             ls='--' if "train" in label.lower() else '-')
Example no. 23
    def run(self):
        if self.params.rng == -1:
                seed = random.randrange(2**32 - 1)
        else:
                seed = int(self.params.rng)
        rng = np.random.RandomState(seed)
        np.random.seed(seed)

        conf_env_dir = "cfgs/env/" + self.params.env_module + "/" + self.params.env_conf_file
        env_params = parse_conf(conf_env_dir)
        env_params["rng"] = rng
        env = get_mod_object("envs",self.params.env_module,"env",(rng,), env_params,mode=1)

        pol_train = get_mod_class("pols",self.params.pol_train_module, "pol")
        self.params.pol_train_args = flatten(self.params.pol_train_args) if self.params.pol_train_args is not None else [] 
        pol_train_args = parse_conf("cfgs/pol/" + self.params.pol_train_module + "/" + self.params.pol_train_args[0]) if len(self.params.pol_train_args) > 0 and isfile("cfgs/pol/" + self.params.pol_train_module + "/" + self.params.pol_train_args[0]) else parse_conf("cfgs/pol/" + self.params.pol_train_module + "/default")
        pol_train_args_2 = erase_dict_from_keyword_list(pol_train_args, self.params.pol_train_args)
        pol_train_args = revalidate_dict_from_conf_module(pol_train_args_2, "pol", self.params.pol_train_module)

        pol_test = get_mod_class("pols",self.params.pol_test_module, "pol")
        self.params.pol_test_args = flatten(self.params.pol_test_args) if self.params.pol_test_args is not None else [] 
        pol_test_args = parse_conf("cfgs/pol/" + self.params.pol_test_module + "/" + self.params.pol_test_args[0]) if len(self.params.pol_test_args) > 0 and isfile("cfgs/pol/" + self.params.pol_test_module + "/" + self.params.pol_test_args[0]) else parse_conf("cfgs/pol/" + self.params.pol_test_module + "/default")
        pol_test_args_2 = erase_dict_from_keyword_list(pol_test_args, self.params.pol_test_args)
        pol_test_args = revalidate_dict_from_conf_module(pol_test_args_2, "pol", self.params.pol_test_module)

        self.params.backend_nnet_conf_file= flatten(self.params.backend_nnet_conf_file) if self.params.backend_nnet_conf_file is not None else [] 
        backend_nnet_params = parse_conf("cfgs/backend_nnet/" + self.params.backend_nnet + "/" + self.params.backend_nnet_conf_file[0]) if len(self.params.backend_nnet_conf_file) > 0 and isfile("cfgs/backend_nnet/" + self.params.backend_nnet + "/" + self.params.backend_nnet_conf_file[0]) else parse_conf("cfgs/backend_nnet/" + self.params.backend_nnet + "/default")
        backend_nnet_params_2 = erase_dict_from_keyword_list(backend_nnet_params,self.params.backend_nnet_conf_file)
        backend_nnet_params = revalidate_dict_from_conf_module(backend_nnet_params_2, "backend_nnet", self.params.backend_nnet)
        
        neural_net = get_mod_class("neural_nets", self.params.backend_nnet,"neural_net")
        
        self.params.ctrl_neural_nets_conf_file = flatten(self.params.ctrl_neural_nets_conf_file) if self.params.ctrl_neural_nets_conf_file is not None else [] 
        ctrl_neural_nets_params = parse_conf("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/" + self.params.ctrl_neural_nets_conf_file[0]) if len(self.params.ctrl_neural_nets_conf_file) > 0 and isfile("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/" + self.params.ctrl_neural_nets_conf_file[0]) else parse_conf("cfgs/ctrl_nnet/" + self.params.qnetw_module + "/DEFAULT")
        ctrl_neural_nets_params_2 = erase_dict_from_keyword_list(ctrl_neural_nets_params,self.params.ctrl_neural_nets_conf_file)
        ctrl_neural_nets_params = revalidate_dict_from_conf_module(ctrl_neural_nets_params_2, "ctrl_neural_net", self.params.qnetw_module)

        ctrl_neural_nets_params["neural_network"] = neural_net
        ctrl_neural_nets_params["neural_network_kwargs"] = backend_nnet_params
        ctrl_neural_nets_params["batch_size"] = self.params.batch_size
        ctrl_neural_net = get_mod_object("ctrl_neural_nets", self.params.qnetw_module, "ctrl_neural_net", (env,),ctrl_neural_nets_params, mode=0)
        
        agent = NeuralAgent([env], [ctrl_neural_net], replay_memory_size=self.params.replay_memory_size, replay_start_size=None, batch_size=self.params.batch_size, random_state=rng, exp_priority=self.params.exp_priority, train_policy=pol_train,train_policy_kwargs=pol_train_args, test_policy=pol_test, test_policy_kwargs=pol_test_args, only_full_history=self.params.only_full_history)
       

        for tc in self.params.controllers:
                len_tc = len(tc)                
                s = tc[0]
                redo_conf = False
                if len_tc >= 2:
                    
                    #Test if sc is a config file or an argument to override
                    if '=' not in tc[1]:
                        #This is a config file
                        conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/" + tc[1])
                    else:
                        conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/default")
                        sc = tc[1].split("=")
                        if sc[0] in conf_ctrl.keys():
                            conf_ctrl[sc[0]] = sc[1]
                            redo_conf = True
                        else:
                            print ("Warning : parameter " + str(sc[0]) + " is not included in config specs for the controller " + s)

                    if len_tc > 2:
                        remainder = tc[2:]
                        for a in remainder:
                             sc = a.split("=")
                             if len(sc) != 2:
                                 print ("Warning : arg " + a + " for controller parametrization is ill formed. It needs to be in the form key=value.") 
                             else:
                                 redo_conf = True
                                 if sc[0] in conf_ctrl.keys():
                                    conf_ctrl[sc[0]] = sc[1]
                                 else:
                                    print ("Warning : parameter " + str(sc[0]) + " is not included in config specs for the controller " + s)
                    #Create a temporary config file with the erased parameter and go through parse_conf again
                    if redo_conf:
                        write_conf(conf_ctrl, "cfgs/ctrl/" + s + "/temp")
                        conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/temp")
                        os.remove("cfgs/ctrl/" + s + "/temp") 
                    
                else:
                    conf_ctrl = parse_conf("cfgs/ctrl/" + s + "/default")
                controller = get_mod_object("ctrls",s,"ctrl",tuple(),conf_ctrl,mode=0)
                agent.attach(controller)
        agent.run(self.params.epochs, self.params.max_size_episode)
Example no. 24
  def run_instances(self, count, parameters, security_configured):
    """
    Spawn the specified number of EC2 instances using the parameters
    provided. This method relies on the ec2-run-instances command to
    spawn the actual VMs in the cloud. This method is blocking in that
    it waits until the requested VMs are properly booted up. However
    if the requested VMs cannot be procured within 1800 seconds, this
    method will treat it as an error and return. (Also see documentation
    for the BaseAgent class)

    Args:
      count               No. of VMs to be spawned
      parameters          A dictionary of parameters. This must contain 'keyname',
                          'group', 'image_id' and 'instance_type' parameters.
      security_configured Uses this boolean value as a heuristic to
                          detect brand new AppScale deployments.

    Returns:
      A tuple of the form (instances, public_ips, private_ips)
    """
    image_id = parameters[self.PARAM_IMAGE_ID]
    instance_type = parameters[self.PARAM_INSTANCE_TYPE]
    keyname = parameters[self.PARAM_KEYNAME]
    group = parameters[self.PARAM_GROUP]
    spot = False

    utils.log('[{0}] [{1}] [{2}] [{3}] [ec2] [{4}] [{5}]'.format(count,
      image_id, instance_type, keyname, group, spot))

    start_time = datetime.datetime.now()
    active_public_ips = []
    active_private_ips = []
    active_instances = []
    if os.environ.has_key('EC2_URL'):
      utils.log('EC2_URL = [{0}]'.format(os.environ['EC2_URL']))
    else:
      utils.log('Warning: EC2_URL environment not found in the process runtime!')
    while True:
      active_public_ips, active_private_ips, active_instances =\
      self.describe_instances(parameters)
      # If security has been configured on this agent just now,
      # that's an indication that this is a fresh cloud deployment.
      # As such it's not expected to have any running VMs.
      if len(active_instances) > 0 or security_configured:
        break

    args = '-k {0} -n {1} --instance-type {2} --group {3} {4}'.format(keyname,
      count, instance_type, group, image_id)
    if spot:
      price = self.get_optimal_spot_price(instance_type)
      command_to_run = '{0}-request-spot-instances -p {1} {2}'.format(self.prefix, price, args)
    else:
      command_to_run = '{0}-run-instances {1}'.format(self.prefix, args)

    while True:
      run_instances = utils.shell(command_to_run)
      utils.log('Run instances says {0}'.format(run_instances))
      status, command_to_run = self.run_instances_response(command_to_run, run_instances)
      if status:
        break
      utils.log('sleepy time')
      utils.sleep(5)

    instances = []
    public_ips = []
    private_ips = []
    utils.sleep(10)

    end_time = datetime.datetime.now() + datetime.timedelta(0, self.MAX_VM_CREATION_TIME)
    now = datetime.datetime.now()
    while now < end_time:
      describe_instances = utils.shell(self.prefix + '-describe-instances 2>&1')
      utils.log('[{0}] {1} seconds left...'.format(now, (end_time - now).seconds))
      utils.log(describe_instances)
      fqdn_regex = re.compile('\s+({0})\s+({0})\s+running\s+{1}\s'.format(self.FQDN_REGEX, keyname))
      instance_regex = re.compile('INSTANCE\s+(i-\w+)')
      all_ip_addresses = utils.flatten(fqdn_regex.findall(describe_instances))
      instances = utils.flatten(instance_regex.findall(describe_instances))
      public_ips, private_ips = self.get_ip_addresses(all_ip_addresses)
      public_ips = utils.diff(public_ips, active_public_ips)
      private_ips = utils.diff(private_ips, active_private_ips)
      instances = utils.diff(instances, active_instances)
      if count == len(public_ips):
        break
      time.sleep(self.SLEEP_TIME)
      now = datetime.datetime.now()

    if not public_ips:
      sys.exit('No public IPs were able to be procured within the time limit')

    if len(public_ips) != count:
      for index in range(0, len(public_ips)):
        if public_ips[index] == '0.0.0.0':
          instance_to_term = instances[index]
          utils.log('Instance {0} failed to get a public IP address and is being terminated'.\
          format(instance_to_term))
          utils.shell(self.prefix + '-terminate-instances ' + instance_to_term)
      pass

    end_time = datetime.datetime.now()
    total_time = end_time - start_time
    if spot:
      utils.log('TIMING: It took {0} seconds to spawn {1} spot instances'.format(
        total_time.seconds, count))
    else:
      utils.log('TIMING: It took {0} seconds to spawn {1} regular instances'.format(
        total_time.seconds, count))
    return instances, public_ips, private_ips
Example no. 25
def main():
    rates_for_algo = {}
    index_comparison = pd.DataFrame(index=config_manager.INDEX_TO_COMPARE)

    # Used only for initialization
    for func in config_manager.FUNCTION_NAMES:
        rates_for_algo[func] = {}

    # For each strategy type, for each minute and for each function read data exported
    # by the simulation and use them to calculate rates and indexes for comparison
    for algo in config_manager.STRATEGIES:
        x_func_success_rate = {}
        x_func_reject_rate = {}
        x_func_reject_num = {}

        # Initialize dictionary of rates for all functions
        for func in config_manager.FUNCTION_NAMES:
            x_func_success_rate[func] = []
            x_func_reject_rate[func] = []
            x_func_reject_num[func] = []

        print("-------------------------- ALGO {} --------------------------".
              format(algo))

        # Create path for recover tables
        base_path = config_manager.SIMULATION_TABLES_OUTPUT_PATH.joinpath(algo)

        for minute in range(0, config_manager.SIMULATION_MINUTES):
            print("MINUTE {}".format(minute))
            print(
                ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            )

            # Complete path for load tables
            path = base_path.joinpath("minute_" + str(minute))

            # For each minute load the invocation_rate and max_rate tables
            df_invoc_rate = pd.read_csv(path.joinpath("invoc_rates.csv"),
                                        delimiter='\t',
                                        header=0,
                                        index_col=0)
            print("================ INVOCATION RATES ==================")
            print(df_invoc_rate)
            print("====================================================")

            df_max_rate = pd.read_csv(path.joinpath("max_rates.csv"),
                                      delimiter='\t',
                                      header=0,
                                      index_col=0)
            print("================ MAX RATES =========================")
            print(df_max_rate)
            print("====================================================")

            # For each minute and for each function load the dataframe
            for func in config_manager.FUNCTION_NAMES:
                df = pd.read_csv(path.joinpath(func + ".csv"),
                                 delimiter='\t',
                                 header=0,
                                 index_col=0)

                print(
                    "================ FORWARDED REQUESTS for {} ================"
                    .format(func))
                print(df)
                print(
                    "=========================================================="
                )

                sr, rr, rn = calculate_rates(df, func, df_max_rate[func],
                                             df_invoc_rate[func])
                x_func_success_rate[func].append(sr)
                x_func_reject_rate[func].append(rr)
                x_func_reject_num[func].append(rn)
                rates_for_algo[func][algo] = x_func_success_rate[func]

            print(
                "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
            )

        print("STATS FOR ALGO {}".format(algo))

        # Utility print for success/reject rate and reject num for each function
        # TODO: fix it to work with the new dictionaries
        #
        # print(" > Mean success rate for funca: {}".format(np.mean(funca_sr)))
        # print(" > Mean reject rate for funca: {}".format(np.mean(funca_rr)))
        # print(" > Rejected requests for funca: {}".format(np.sum(funca_reject_num)))

        # print(" > Mean success rate for qrcode: {}".format(np.mean(qrcode_sr)))
        # print(" > Mean reject rate for qrcode: {}".format(np.mean(qrcode_rr)))
        # print(" > Rejected requests for qrcode: {}".format(np.sum(qrcode_reject_num)))

        # print(" > Mean success rate for ocr: {}".format(np.mean(ocr_sr)))
        # print(" > Mean reject rate for ocr: {}".format(np.mean(ocr_rr)))
        # print(" > Rejected requests for ocr: {}".format(np.sum(ocr_reject_num)))

        # TEST
        #print(x_func_success_rate)
        #print(x_func_reject_rate)
        #print(x_func_reject_num)

        # Metrics prints

        ##### SUCCESS RATES METRICS #####
        # Mean success rate
        mean_success_rate = np.mean(
            [np.mean(srates)
             for k, srates in x_func_success_rate.items()]) * 100
        print("     > Mean success rate: {:0.2f}%".format(mean_success_rate))

        # Success rate variance
        flat_list = [
            i * 100 for i in flatten(list(x_func_success_rate.values()))
        ]
        success_rate_variance = np.var(flat_list)
        print("     > Success rate variance: {:0.2f}".format(
            success_rate_variance))

        # Success rate median
        flat_list = flatten(list(x_func_success_rate.values()))
        success_rate_median = np.median(flat_list) * 100
        print(
            "     > Success rate median: {:0.2f}%".format(success_rate_median))

        # Success rate percentile
        flat_list = flatten(list(x_func_success_rate.values()))
        success_rate_percentile = np.percentile(
            flat_list, config_manager.ANALYSIS_PERCENTILE) * 100
        print("     > Success rate {}% percentile: {:0.2f}%".format(
            config_manager.ANALYSIS_PERCENTILE, success_rate_percentile))

        ##### SUCCESS RATES (STRESS PERIOD) METRICS #####
        # Mean success rate calculated during high traffic period (minutes from 1 to 5)
        mean_success_rate_stress_period = np.mean([
            np.mean(srates[1:6]) for k, srates in x_func_success_rate.items()
        ]) * 100
        print(
            "         > Mean success rate during stress period (from minute 1 to 5): {:0.2f}%"
            .format(mean_success_rate_stress_period))

        # Success rate variance (stress period)
        flat_list = [
            i * 100 for i in flatten(
                [item[1:6] for item in list(x_func_success_rate.values())])
        ]
        success_rate_stress_period_variance = np.var(flat_list)
        print(
            "         > Success rate variance during stress period (from minute 1 to 5): {:0.2f}"
            .format(success_rate_stress_period_variance))

        # Success rate median (stress period)
        flat_list = flatten(
            [item[1:6] for item in list(x_func_success_rate.values())])
        success_rate_stress_period_median = np.median(flat_list) * 100
        print(
            "         > Success rate median during stress period (from minute 1 to 5): {:0.2f}%"
            .format(success_rate_stress_period_median))

        # Success rate percentile (stress period)
        flat_list = flatten(
            [item[1:6] for item in list(x_func_success_rate.values())])
        success_rate_stress_period_percentile = np.percentile(
            flat_list, config_manager.ANALYSIS_PERCENTILE) * 100
        print(
            "         > Success rate {}% percentile during stress period (from minute 1 to 5): {:0.2f}%"
            .format(config_manager.ANALYSIS_PERCENTILE,
                    success_rate_stress_period_percentile))

        ##### REJECT RATES METRICS #####
        # Total rejected requests num calculated for each algorithm across minutes
        total_reject_requests = np.sum(
            [np.sum(rejnums) for k, rejnums in x_func_reject_num.items()])
        print("     > Total rejected requests: {} req".format(
            total_reject_requests))

        # Reject number variance
        flat_list = flatten(list(x_func_reject_num.values()))
        reject_number_variance = np.var(flat_list)
        print("     > Reject num variance: {:0.2f}".format(
            reject_number_variance))

        # Reject number median
        flat_list = flatten(list(x_func_reject_num.values()))
        reject_number_median = np.median(flat_list)
        print("     > Reject num median: {:0.2f}".format(reject_number_median))

        # Reject number percentile
        flat_list = flatten(list(x_func_reject_num.values()))
        reject_number_percentile = np.percentile(
            flat_list, config_manager.ANALYSIS_PERCENTILE)
        print("     > Reject num {}% percentile: {:0.2f}".format(
            config_manager.ANALYSIS_PERCENTILE, reject_number_percentile))

        print(
            "----------------------------------------------------------------------------"
        )

        index_comparison[algo] = [
            mean_success_rate,
            success_rate_variance,
            success_rate_median,
            success_rate_percentile,
            mean_success_rate_stress_period,
            success_rate_stress_period_variance,
            success_rate_stress_period_median,
            success_rate_stress_period_percentile,
            total_reject_requests,
            reject_number_variance,
            reject_number_median,
            reject_number_percentile,
        ]

    # Export print for comparison
    for func in config_manager.FUNCTION_NAMES:
        export_for_minute_rates(func, rates_for_algo[func])

    # Export index comparison table
    print("> INDEX COMPARISON TABLE")
    print(index_comparison.T)
    export_index_comparison_table(index_comparison.T)
Example no. 26
def dcganx_G(input_dim, n0g, imgsz, channels,
             norm_type,  # 'bn', 'none'
             requires_grad, depth=3, 
             nodemul=2, do_bias=True):
              
   ker=5; padding=2; output_padding=1

   def gen_block_T_params(ni, no, k):
      return {
         'convT0': conv2dT_params(ni, no, k, do_bias), 
         'conv1': conv2d_params(no, no, 1, do_bias), 
         'bn0': utils.bnparams(no) if norm_type == 'bn' else None, 
         'bn1': utils.bnparams(no) if norm_type == 'bn' else None
      }

   def gen_group_T_params(ni, no, count):
       return {'block%d' % i: gen_block_T_params(ni if i == 0 else no, no, ker) for i in range(count)}

   count = 1
   nn0 = n0g * (nodemul**(depth-1))
   sz = imgsz // (2**depth)  
   p = { 'proj': utils.linear_params(input_dim, nn0*sz*sz) }
   nn = nn0
   for d in range(depth-1):
      p['group%d'%d] = gen_group_T_params(nn, nn//nodemul, count)
      nn = nn//nodemul
   p['last_convT'] = conv2dT_params(nn, channels, ker, do_bias)
   flat_params = utils.cast(utils.flatten(p))

   if requires_grad:
      utils.set_requires_grad_except_bn_(flat_params)

   def block(x, params, base, mode, stride):
      o = F.relu(x, inplace=True)
      o = F.conv_transpose2d(o, params[base+'.convT0.w'], params.get(base+'.convT0.b'),
                             stride=stride, padding=padding, output_padding=output_padding)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn0', mode)

      o = F.relu(o, inplace=True)
      o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'),
                   stride=1, padding=0)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn1', mode)
      return o

   def group(o, params, base, mode, stride=2):
      for i in range(count):
         o = block(o, params, '%s.block%d' % (base,i), mode, stride if i == 0 else 1)
      return o

   def f(input, params, mode):
      o = F.linear(input, params['proj.weight'], params['proj.bias'])
      o = o.view(input.size(0), nn0, sz, sz)
      for d in range(depth-1):
        o = group(o, params, 'group%d'%d, mode)
      o = F.relu(o, inplace=True)
      o = F.conv_transpose2d(o, params['last_convT.w'], params.get('last_convT.b'), stride=2,
                             padding=padding, output_padding=output_padding)
      o = torch.tanh(o)
      return o

   return f, flat_params
Example no. 27
    def handle(self, *args, **options):

        t00 = time()
        qid = options['qid']
        K = options['K']

        alpha = options['alpha']
        n_features = options['n_features']
        limit = options['limit']
        ng = options['ng']
        n_samples = options['n_samples']

        # Get the docs from the query
        docs = Doc.objects.filter(query=qid,content__iregex='\w')

        # if we are limiting, probably for testing, then do that
        if limit > 0:
            docs = docs[:limit]

        print('\n###############################\
        \n## Doing NMF on query {} with {} documents \
and {} topics\n'.format(qid, docs.count(),K))

        # Get the docs into lists
        abstracts, docsizes, ids = proc_docs(docs, stoplist)

        #############################################
        # Use tf-idf features for NMF.
        print("Extracting tf-idf features for NMF...")
        tfidf_vectorizer = TfidfVectorizer(max_df=0.97, min_df=2,
                                           max_features=n_features,
                                           ngram_range=(ng,ng),
                                           tokenizer=snowball_stemmer(),
                                           stop_words=stoplist)
        t0 = time()
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        print("done in %0.3fs." % (time() - t0))

        del abstracts
        gc.collect()

        run_id = db.init(n_features)
        stat = RunStats.objects.get(run_id=run_id)
        stat.query = Query.objects.get(pk=qid)
        stat.method = "NM"
        stat.alpha = alpha
        stat.process_id = os.getpid()
        stat.save()

        # Get the vocab, add it to db
        vocab = tfidf_vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=8)
        vocab_ids.append(pool.map(partial(add_features,run_id=run_id),vocab))
        pool.terminate()
        del vocab
        vocab_ids = vocab_ids[0]


        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)


        gc.collect()

        # Fit the NMF model
        print("Fitting the NMF model with tf-idf features, "
              "n_samples=%d and n_features=%d..."
              % (n_samples, n_features))
        t0 = time()
        nmf = NMF(n_components=K, random_state=1,
                  alpha=alpha, l1_ratio=.5, verbose=True,
                  init='nndsvd', max_iter=500).fit(tfidf)

        print("done in %0.3fs." % (time() - t0))


        ## Add topics terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(csr_matrix(nmf.components_))
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=8)

        tts.append(pool.map(partial(db.f_lambda, m=ldalambda,
                        v_ids=vocab_ids,t_ids=topic_ids,run_id=run_id),topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))


        ## Add topic-docs
        gamma =  find(csr_matrix(nmf.transform(tfidf)))
        glength = len(gamma[0])

        chunk_size = 100000

        ps = 16
        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        ### Go through in chunks
        for i in range(glength//chunk_size+1):
            dts = []
            values_list = []
            f = i*chunk_size
            l = (i+1)*chunk_size
            if l > glength:
                l = glength
            docs = range(f,l)
            doc_batches = []
            for p in range(ps):
                doc_batches.append([x for x in docs if x % ps == p])
            pool = Pool(processes=ps)
            make_t0 = time()
            values_list.append(pool.map(partial(
                db.f_gamma_batch, gamma=gamma,
                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids,
                run_id=run_id
            ),doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=ps)
            pool.map(insert_many,values_list)
            pool.terminate()
            add_t += time() - add_t0
            gc.collect()
            sys.stdout.flush()

        stat.error = nmf.reconstruction_err_
        stat.errortype = "Frobenius"
        stat.iterations = nmf.n_iter_
        stat.last_update=timezone.now()
        stat.save()
        management.call_command('update_run',run_id)



        totalTime = time() - t00

        tm = int(totalTime//60)
        ts = int(totalTime-(tm*60))

        print("done! total time: " + str(tm) + " minutes and " + str(ts) + " seconds")
        print("a maximum of " + str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) + " MB was used")
Example no. 28
def run_tm(s_id,
           K,
           language="german",
           verbosity=1,
           method='NM',
           max_features=0,
           max_df=0.95,
           min_df=5,
           alpha=0.01,
           extra_stopwords=set(),
           top_chain_var=None,
           rng_seed=None,
           max_iter=200,
           **kwargs):

    if method in ['BD', 'BleiDTM'] and top_chain_var is None:
        top_chain_var = 0.005

    s = Search.objects.get(pk=s_id)
    stat = RunStats(psearch=s,
                    K=K,
                    min_freq=min_df,
                    max_df=max_df,
                    method=method.upper()[0:2],
                    max_features=max_features,
                    max_iter=max_iter,
                    alpha=alpha,
                    extra_stopwords=list(extra_stopwords),
                    top_chain_var=top_chain_var,
                    status=1,
                    language=language)
    stat.save()
    django.db.connections.close_all()

    if method in ['DT', 'dnmf']:
        print("Running dynamic NMF algorithm")
        run_dynamic_nmf(stat, **kwargs)
        return 0
    elif method in ['BD', 'BleiDTM']:
        print("Running Blei DTM algorithm")
        if rng_seed:
            stat.rng_seed = rng_seed
        else:
            stat.rng_seed = 1
        stat.save()
        run_blei_dtm(stat, **kwargs)
        return 0

    print("starting topic model for runstat with settings:")
    for field in stat._meta.fields:
        field_value = getattr(stat, field.name)
        if field_value:
            print("{}: {}".format(field.name, field_value))

    start_time = time.time()
    start_datetime = timezone.now()

    stat.status = 1  # 3 = finished

    stat.save()
    run_id = stat.run_id

    if s.search_object_type == 1:
        ps = Paragraph.objects.filter(search_matches=s)
        docs = ps.filter(text__iregex='\w')
        texts, docsizes, ids = process_texts(docs)

    elif s.search_object_type == 2:
        uts = Utterance.objects.filter(search_matches=s)
        texts, docsizes, ids = merge_utterance_paragraphs(uts)
    else:
        print("search object type invalid")
        return 1

    if stat.max_features == 0:
        n_features = 10000000
    else:
        n_features = stat.max_features

    if stat.language == "german":
        stemmer = SnowballStemmer("german")
        tokenizer = german_stemmer()
        stopword_list = [stemmer.stem(t) for t in stopwords.words("german")]

    elif stat.language == "english":
        stemmer = SnowballStemmer("english")
        stopword_list = [stemmer.stem(t) for t in stopwords.words("english")]
        tokenizer = snowball_stemmer()
    else:
        print("Language not recognized.")
        return 1

    if stat.extra_stopwords:
        stopword_list = list(set(stopword_list) | set(stat.extra_stopwords))

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print(
                "creating term frequency-inverse document frequency matrix ({})"
                .format(time.time() - start_time))
        # get term frequency-inverse document frequency matrix (using log weighting)
        # and min/max document frequency (min_df, max_df)
        tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                           min_df=stat.min_freq,
                                           max_features=n_features,
                                           ngram_range=(1, stat.ngram),
                                           tokenizer=tokenizer,
                                           stop_words=stopword_list)

        tfidf = tfidf_vectorizer.fit_transform(texts)
        vectorizer = tfidf_vectorizer
        vocab = vectorizer.get_feature_names()

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print("creating term frequency matrix ({})".format(time.time() -
                                                               start_time))
        #  Use tf (raw term count) features for LDA.
        tf_vectorizer = CountVectorizer(max_df=stat.max_df,
                                        min_df=stat.min_freq,
                                        max_features=n_features,
                                        ngram_range=(1, stat.ngram),
                                        tokenizer=tokenizer,
                                        stop_words=stopword_list)
        tf = tf_vectorizer.fit_transform(texts)
        vectorizer = tf_vectorizer
        vocab = vectorizer.get_feature_names()
    else:
        print("method not implemented")
        return 1

    if verbosity > 0:
        print("save terms to db ({})".format(time.time() - start_time))

    parallelized = True
    if parallelized:
        vocab_ids = []
        # multiprocessing: add vocabulary as Term
        pool = Pool(processes=8)
        vocab_ids.append(
            pool.map(partial(db.add_features, run_id=run_id), vocab))
        pool.terminate()
        del vocab
        vocab_ids = vocab_ids[0]

    else:
        print("without multiprocessing for storing terms")
        # without multiprocessing
        objects = [Term(title=term_title) for term_title in vocab]

        # TODO: if some of the objects already exist, duplicates are created: use uniqueness of field 'title'
        Term.objects.bulk_create(objects)
        runstats = RunStats.objects.get(run_id=run_id)
        runstats.term_set.add(*objects)
        runstats.save()
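        # One possible way to address the TODO above (a sketch, assuming
        # Term.title is unique and Django >= 2.2):
        #   Term.objects.bulk_create(objects, ignore_conflicts=True)
        #   objects = Term.objects.filter(title__in=vocab)
        # Note that this branch also leaves vocab_ids undefined, which the
        # TopicTerm step below relies on, so the parallel path is the one
        # actually exercised.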

    ## Make some topics
    django.db.connections.close_all()
    topic_ids = db.add_topics(K, run_id)
    gc.collect()

    v = verbosity > 1

    if method in ["NM", "nmf"]:
        if verbosity > 0:
            print("running matrix factorization with NMF ({})".format(
                time.time() - start_time))
        # NMF = non-negative matrix factorization
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=stat.alpha,
                    l1_ratio=.1,
                    verbose=v,
                    init='nndsvd',
                    max_iter=stat.max_iter).fit(tfidf)
        # initialization with Nonnegative Double Singular Value Decomposition (nndsvd)
        print("Reconstruction error of nmf: {}".format(
            model.reconstruction_err_))

        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"

        # document topic matrix
        dtm = csr_matrix(model.transform(tfidf))
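        # In sklearn's NMF the tfidf matrix (docs x terms) is approximated by
        # W @ H: model.transform(tfidf) returns W (docs x topics, kept here as
        # dtm) and model.components_ is H (topics x terms), saved to the db
        # further below.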

    elif method in ["LD", "lda"]:
        if verbosity > 0:
            print(
                "running Latent Dirichlet Allocation ({})".format(time.time() -
                                                                  start_time))
        model = LDA(
            n_components=K,
            # concentration parameter of the Dirichlet prior over topics in documents
            doc_topic_prior=stat.alpha,
            # concentration parameter of the Dirichlet prior over words in topics;
            # if None, this defaults to 1/n_components
            topic_word_prior=stat.beta,
            max_iter=stat.max_iter,
            learning_method='online',  # 'batch' could lead to memory problems
            learning_offset=50.,
            # n_jobs=6
        ).partial_fit(tf)
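        # Note: partial_fit performs a single online update over `tf`; max_iter
        # only bounds iterations inside fit(). If a full fit is intended here,
        # .fit(tf) would honour stat.max_iter and set n_iter_, which is read
        # further below.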

        stat.error = model.perplexity(tf)
        stat.errortype = "Perplexity"

        dtm = csr_matrix(model.transform(tf))

    else:
        print("Method {} not available.".format(method))
        return 1

    # term topic matrix
    ldalambda = find(csr_matrix(model.components_))
    # find returns the indices and values of the nonzero elements of a matrix
    topics = range(len(ldalambda[0]))
    tts = []
    # multiprocessing: add TopicTerms and scores
    pool = Pool(processes=8)
    tts.append(
        pool.map(
            partial(db.f_lambda,
                    m=ldalambda,
                    v_ids=vocab_ids,
                    t_ids=topic_ids,
                    run_id=run_id), topics))
    pool.terminate()

    tts = flatten(tts)
    gc.collect()
    sys.stdout.flush()
    django.db.connections.close_all()
    TopicTerm.objects.bulk_create(tts)

    if verbosity > 0:
        print("saving document topic matrix to db ({})".format(time.time() -
                                                               start_time))

    #document topic matrix
    gamma = find(dtm)
    glength = len(gamma[0])

    chunk_size = 100000

    no_cores = 16
    parallel_add = True

    all_dts = []

    make_t = 0
    add_t = 0

    ### Go through in chunks
    for i in range(glength // chunk_size + 1):
        values_list = []
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > glength:
            l = glength
        docs = range(f, l)
        doc_batches = []
        for p in range(no_cores):
            doc_batches.append([x for x in docs if x % no_cores == p])
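        # Round-robin split over workers; e.g. with no_cores=4 and
        # docs=range(0, 10) this yields [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]].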
        pool = Pool(processes=no_cores)
        values_list.append(
            pool.map(
                partial(db.f_gamma_batch,
                        gamma=gamma,
                        docsizes=docsizes,
                        docUTset=ids,
                        topic_ids=topic_ids,
                        run_id=run_id), doc_batches))
        pool.terminate()
        django.db.connections.close_all()
        print(
            "... created document topic matrix for saving iteration {}".format(
                i))

        values_list = [item for sublist in values_list for item in sublist]
        pool = Pool(processes=no_cores)
        if s.search_object_type == 1:
            pool.map(db.insert_many_pars, values_list)
        elif s.search_object_type == 2:
            pool.map(db.insert_many_utterances, values_list)
        pool.terminate()
        gc.collect()
        sys.stdout.flush()
        print("... saved document topic matrix iteration {}".format(i))

    stat.iterations = model.n_iter_
    stat.status = 3  # 3 = finished
    stat.last_update = timezone.now()
    stat.runtime = timezone.now() - start_datetime
    stat.save()
    update_topic_titles(run_id)
    update_topic_scores(run_id)

    if verbosity > 0:
        print("topic model run done ({})".format(time.time() - start_time))

    return 0
Example n. 29
0
def run_dynamic_nmf(stat):
    """
    Run dynamic NMF model on utterances (speeches) or paragraphs from the parliament data

    :param stat: RunStats object with the parameters to run the model with
    :return: 0 if successful, 1 otherwise
    """

    print("starting topic model for runstat with settings:")
    for field in stat._meta.fields:
        field_value = getattr(stat, field.name)
        if field_value:
            print("{}: {}".format(field.name, field_value))

    t0 = time()
    start_datetime = timezone.now()

    s = Search.objects.get(pk=stat.psearch.id)

    n_samples = 1000

    run_id = stat.run_id

    # load time range
    if s.search_object_type == 1:
        ps = Paragraph.objects.filter(search_matches=s)
        wps = ParlPeriod.objects.filter(
            document__utterance__paragraph__in=ps).distinct().values('n')

    elif s.search_object_type == 2:
        uts = Utterance.objects.filter(
            search_matches=s).order_by('document__parlperiod__n')
        wps = ParlPeriod.objects.filter(
            document__utterance__in=uts).distinct().values('n')
    else:
        print("search object type invalid")
        return 1

    # language specific settings
    if stat.language == "german":
        stemmer = SnowballStemmer("german")
        tokenizer = german_stemmer()
        stopword_list = [stemmer.stem(t) for t in stopwords.words("german")]

    elif stat.language == "english":
        stemmer = SnowballStemmer("english")
        stopword_list = [stemmer.stem(t) for t in stopwords.words("english")]
        tokenizer = snowball_stemmer()
    else:
        print("Language not recognized.")
        return 1

    if stat.extra_stopwords:
        stopword_list = list(set(stopword_list) | set(stat.extra_stopwords))

    time_range = sorted([wp['n'] for wp in wps])

    for timestep in time_range:

        # load text from database
        if s.search_object_type == 1:
            ps = Paragraph.objects.filter(
                search_matches=s, utterance__document__parlperiod__n=timestep)
            docs = ps.filter(text__iregex=r'\w')
            texts, docsizes, ids = process_texts(docs)

        elif s.search_object_type == 2:
            uts = Utterance.objects.filter(search_matches=s,
                                           document__parlperiod__n=timestep)
            texts, docsizes, ids = merge_utterance_paragraphs(uts)
        else:
            print("search object type not known")
            return 1

        print("\n#######################")
        print("in period {}: {} docs".format(timestep, len(texts)))
        k = stat.K
        # k = predict(text_count)
        # print("esimating {} topics...".format(k))

        print("Extracting tf-idf features for NMF...")

        if stat.max_features == 0:
            n_features = 100000000
        else:
            n_features = stat.max_features

        tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                           min_df=stat.min_freq,
                                           max_features=n_features,
                                           ngram_range=(1, stat.ngram),
                                           tokenizer=tokenizer,
                                           stop_words=stopword_list)

        t1 = time()
        tfidf = tfidf_vectorizer.fit_transform(texts)
        del texts
        gc.collect()

        print("done in %0.3fs." % (time() - t1))

        print("Save terms to DB")

        # Get the vocab, add it to db
        vocab = tfidf_vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=8)
        vocab_ids.append(
            pool.map(partial(db.add_features, run_id=run_id), vocab))
        pool.terminate()
        del vocab
        vocab_ids = vocab_ids[0]

        django.db.connections.close_all()
        topic_ids = db.add_topics(k, run_id)
        for t in topic_ids:
            top = Topic.objects.get(pk=t)
            top.year = timestep
            top.save()

        gc.collect()

        # Fit the NMF model
        print("Fitting the NMF model with tf-idf features, "
              "n_samples=%d and max_features=%d..." %
              (n_samples, stat.max_features))
        t1 = time()
        nmf = NMF(n_components=k, random_state=1, alpha=.0001,
                  l1_ratio=.5).fit(tfidf)
        print("done in %0.3fs." % (time() - t1))

        print("Adding topicterms to db")
        ldalambda = find(csr_matrix(nmf.components_))
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=8)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t1))

        gamma = find(csr_matrix(nmf.transform(tfidf)))
        glength = len(gamma[0])

        chunk_size = 100000

        no_cores = 16

        make_t = 0
        add_t = 0

        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_cores):
                doc_batches.append([x for x in docs if x % no_cores == p])
            pool = Pool(processes=no_cores)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_cores)

            if s.search_object_type == 1:
                pool.map(db.insert_many_pars, values_list)
            elif s.search_object_type == 2:
                pool.map(db.insert_many_utterances, values_list)

            pool.terminate()
            add_t += time() - add_t0
            gc.collect()
            sys.stdout.flush()

        stat.error = stat.error + nmf.reconstruction_err_
        stat.errortype = "Frobenius"

    ## After all the years have been run, update the dtops

    tops = Topic.objects.filter(run_id=run_id)

    highest_id = Term.objects.all().order_by('-id').first().id
    B = np.zeros((tops.count(), highest_id + 1))  # +1 so term ids can index columns directly

    #print(tops)

    wt = 0
    for topic in tops:
        tts = TopicTerm.objects.filter(topic=topic).order_by('-score')[:50]
        for tt in tts:
            B[wt, tt.term.id] = tt.score
        wt += 1

    col_sum = np.sum(B, axis=0)
    vocab_ids = np.flatnonzero(col_sum)

    # we only want the columns where there are at least some
    # topic-term values
    B = B[:, vocab_ids]

    nmf = NMF(n_components=stat.K, random_state=1, alpha=.1,
              l1_ratio=.5).fit(B)
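    # Second-level NMF: B is (window topics x retained terms), so
    # nmf.components_ is (dynamic topics x terms) and nmf.transform(B) is
    # (window topics x dynamic topics); both factors are stored below.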

    ## Add dynamic topics
    dtopics = []
    for k in range(stat.K):
        dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id))
        dtopic.save()
        dtopics.append(dtopic)

    dtopic_ids = list(
        DynamicTopic.objects.filter(run_id=run_id).values_list('id',
                                                               flat=True))

    print(dtopic_ids)

    ##################
    ## Add the dtopic*term matrix to the db
    print("Adding topicterms to db")
    t1 = time()
    ldalambda = find(csr_matrix(nmf.components_))
    topics = range(len(ldalambda[0]))
    tts = []
    pool = Pool(processes=8)
    tts.append(
        pool.map(
            partial(db.f_dlambda,
                    m=ldalambda,
                    v_ids=vocab_ids,
                    t_ids=dtopic_ids,
                    run_id=run_id), topics))
    pool.terminate()
    tts = flatten(tts)
    gc.collect()
    sys.stdout.flush()
    django.db.connections.close_all()
    DynamicTopicTerm.objects.bulk_create(tts)
    print("done in %0.3fs." % (time() - t1))

    ## Add the wtopic*dtopic matrix to the database
    gamma = nmf.transform(B)

    for topic in range(len(gamma)):
        for dtopic in range(len(gamma[topic])):
            if gamma[topic][dtopic] > 0:
                tdt = TopicDTopic(topic=tops[topic],
                                  dynamictopic_id=dtopic_ids[dtopic],
                                  score=gamma[topic][dtopic])
                tdt.save()

    ## Calculate the primary dtopic for each topic
    for t in tops:
        try:
            t.primary_dtopic.add(
                TopicDTopic.objects.filter(
                    topic=t).order_by('-score').first().dynamictopic)
            t.save()
        except Exception:
            print("could not assign a primary dynamic topic for topic {}".format(t.pk))

    management.call_command('update_run', run_id)

    stat.error = stat.error + nmf.reconstruction_err_
    stat.errortype = "Frobenius"
    stat.last_update = timezone.now()
    stat.runtime = timezone.now() - start_datetime
    stat.status = 3  # 3 = finished
    stat.save()

    totalTime = time() - t0

    tm = int(totalTime // 60)
    ts = int(totalTime - (tm * 60))

    print("done! total time: " + str(tm) + " minutes and " + str(ts) +
          " seconds")
    print("a maximum of " +
          str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) +
          " MB was used")

    return 0
Example n. 30
0
def plot_tsne(r_ind,
              tsne_results,
              cats,
              nocatids,
              ax=None,
              verbose=False,
              hdoc=False,
              legend=True,
              sc=None,
              heat_var=None,
              cmapname=None,
              topics=None,
              min_cluster=100,
              psize=1,
              t_thresh=0.8,
              eps=1,
              n_clusters=1,
              doc_sets=None,
              clabel_size=8,
              words_only=False,
              fsize=5,
              adjust=False,
              draw_highlight_points=False,
              dot_legend=True,
              nocat_colour='#F0F0F026',
              nocat_alpha=0.4,
              raster=False,
              extension="png",
              slinewidth=0.1):
    cs = []
    sizes = []
    xs = []
    ys = []

    if ax is None:
        fig, ax = plt.subplots(dpi=188)
    t0 = time()

    nocatids = np.argwhere(np.isin(r_ind, nocatids))

    if hdoc is not False:
        hdocs = nocatids[np.isin(nocatids, hdoc)]
        ids = nocatids[np.isin(nocatids, hdoc, invert=True)]
    ax.scatter(tsne_results[nocatids, 0],
               tsne_results[nocatids, 1],
               c=nocat_colour,
               s=psize,
               alpha=nocat_alpha,
               linewidth=slinewidth,
               edgecolor='#a39c9c66',
               rasterized=raster)

    # Draw docs to be highlighted separately
    if hdoc is not False:
        ax.scatter(tsne_results[hdocs, 0],
                   tsne_results[hdocs, 1],
                   c='#F0F0F026',
                   s=psize,
                   alpha=1,
                   linewidth=0.5,
                   edgecolor='black',
                   rasterized=raster)

    # split the data and add layer by layer to prevent top layer overwriting all
    splits = 10
    for i in range(splits):
        for c in cats:
            ids = np.array_split(c["dis"], splits)[i]
            if hdoc is not False:
                hdocs = ids[np.isin(ids, hdoc)]
                ids = ids[np.isin(ids, hdoc, invert=True)]

            if len(nocatids) > len(r_ind) / 2:
                a = 1
            else:
                a = 0.7
            ax.scatter(
                tsne_results[ids, 0],
                tsne_results[ids, 1],
                #zorder = [math.ceil(random.random()*1) for i in range(len(ids))],
                c=c['color'],
                s=psize,
                alpha=a,
                linewidth=slinewidth,
                edgecolor='#a39c9c66',
                rasterized=raster)
            if hdoc is not False:
                ax.scatter(tsne_results[hdocs, 0],
                           tsne_results[hdocs, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)

    ax.grid(linestyle='-')

    if verbose:
        print("calculating points took %0.3fs." % (time() - t0))

    l = ax.get_xlim()[0]
    t = ax.get_ylim()[1]

    yextent = ax.get_ylim()[1] - ax.get_ylim()[0]
    ysp = yextent * 0.04

    draw_leg = False
    if legend:
        for i, c in enumerate(cats):
            prop = len(c['docs']) / len(r_ind)
            label = "{} {:.1%}".format(c['name'], prop)
            if extension == "pdf":
                label = label.replace("%", r"\%")
            if dot_legend:
                if prop > 0.001:
                    draw_leg = True
                    ax.scatter(
                        [],
                        [],
                        c=c['color'],
                        label=label,
                        linewidth=slinewidth,
                        edgecolor='#a39c9c66',
                    )
            else:
                if c['color'] == "#000000":
                    tcolor = "white"
                else:
                    tcolor = "black"
                ax.text(l * 0.95,
                        t - ysp - i * ysp,
                        label,
                        fontsize=fsize,
                        color=tcolor,
                        bbox={
                            'facecolor': c['color'],
                            'pad': 3
                        })

    if dot_legend and draw_leg:
        ax.legend()

    if heat_var:
        cmap = cm.get_cmap(cmapname)
        ys = [
            getattr(cs, heat_var) for cs in sc.objects
            if getattr(cs, heat_var) is not None
        ]
        X = np.interp(ys, (np.min(ys), np.max(ys)), (0, +1))
        f = interpolate.interp1d(ys, X)
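        # Normalise the heat variable to [0, 1] for the colormap: np.interp
        # rescales the observed range linearly, and interp1d(ys, X) lets us
        # look up that rescaled value for each object.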
        for cs in sc.objects:
            if getattr(cs, heat_var):
                col = cmap(f(getattr(cs, heat_var)).max())
                rect = patches.Rectangle((cs.x1, cs.y1),
                                         cs.x2 - cs.x1,
                                         cs.y2 - cs.y1,
                                         linewidth=1,
                                         edgecolor='r',
                                         facecolor=col,
                                         alpha=0.3)

                ax.add_patch(rect)

    if topics:
        texts = []
        for t in topics:
            if t.run_id.method == "DT":
                atdocscores = Doc.objects.filter(
                    docdynamictopic__topic=t, ).values_list(
                        'docdynamictopic__score', flat=True)

                thresh = np.quantile(atdocscores, t_thresh)

                tdocs = Doc.objects.filter(
                    docdynamictopic__topic=t,
                    docdynamictopic__score__gt=thresh).order_by(
                        '-docdynamictopic__score').values_list('id', flat=True)
            else:
                atdocscores = Doc.objects.filter(
                    doctopic__topic=t, ).values_list('doctopic__score',
                                                     flat=True)

                thresh = np.quantile(atdocscores, t_thresh)

                tdocs = Doc.objects.filter(
                    doctopic__topic=t, doctopic__score__gt=thresh).order_by(
                        '-doctopic__score').values_list('id', flat=True)
            highlight_docs = np.argwhere(np.isin(r_ind, tdocs))[:, 0]

            if len(highlight_docs) == 0:
                continue

            points = tsne_results[highlight_docs]

            texts.append(
                cluster_label_points(t.title, points, ax, eps, min_cluster,
                                     n_clusters, clabel_size, words_only))

            if draw_highlight_points:
                ax.scatter(points[:, 0],
                           points[:, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)

        if adjust:
            texts = list(flatten(texts))
            adjust_text(texts,
                        ax=ax,
                        arrowprops=dict(arrowstyle="->", color='None', lw=0.5))

    if doc_sets:
        texts = []
        for d in doc_sets:
            highlight_docs = np.argwhere(np.isin(r_ind, d['docs']))[:, 0]
            points = tsne_results[highlight_docs]

            texts.append(
                cluster_label_points(d['title'], points, ax, eps, min_cluster,
                                     n_clusters, clabel_size, words_only))
            if draw_highlight_points:
                ax.scatter(points[:, 0],
                           points[:, 1],
                           c=c["color"],
                           s=psize,
                           alpha=1,
                           linewidth=0.5,
                           edgecolor='black',
                           rasterized=raster)

        if adjust:
            texts = list(flatten(texts))
            adjust_text(texts,
                        ax=ax,
                        arrowprops=dict(arrowstyle="->", color='None', lw=0.5))

    if topics:
        return texts
Example n. 31
0
def resnet4_D(nn, imgsz,
              channels,    # 1: gray-scale, 3: color
              norm_type,  # 'bn', 'none'
              requires_grad,
              do_bias=True):             
   depth = 4
   ker = 3
   padding = (ker-1)//2
   count = 1

   def gen_group0_params(no):
      ni = channels
      return { 'block0' : {
         'conv0': conv2d_params(ni, no, ker, do_bias), 
         'conv1': conv2d_params(no, no, ker, do_bias), 
         'convdim': utils.conv_params(ni, no, 1), 
         'bn': utils.bnparams(no) if norm_type == 'bn' else None
      }}

   def gen_resnet_D_block_params(ni, no, k, norm_type, do_bias):
      return {
         'conv0': conv2d_params(ni, ni, k, do_bias), 
         'conv1': conv2d_params(ni, no, k, do_bias), 
         'convdim': utils.conv_params(ni, no, 1), 
         'bn': utils.bnparams(no) if norm_type == 'bn' else None
      }

   def gen_group_params(ni, no):
       return {'block%d' % i: gen_resnet_D_block_params(ni if i == 0 else no, no, ker, norm_type, do_bias) for i in range(count)}

   sz = imgsz // (2**depth)
   flat_params = utils.cast(utils.flatten({
        'group0': gen_group0_params(nn),
        'group1': gen_group_params(nn,   nn*2),
        'group2': gen_group_params(nn*2, nn*4),
        'group3': gen_group_params(nn*4, nn*8),        
        'fc': utils.linear_params(sz*sz*nn*8, 1),
   }))

   if requires_grad:
      utils.set_requires_grad_except_bn_(flat_params)

   def block(x, params, base, mode, do_downsample, is_first):
      o = x
      if not is_first:
         o = F.relu(o, inplace=True)   
      # convolve the activated tensor o (not the raw input x) and fetch the
      # biases under the '.convN.b' keys, matching resnet4_G below
      o = F.conv2d(o, params[base+'.conv0.w'], params.get(base+'.conv0.b'), padding=padding)
      o = F.relu(o, inplace=True)
      o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'), padding=padding)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn', mode)
 
      if do_downsample:
         o = F.avg_pool2d(o,2)
         x = F.avg_pool2d(x,2)
      
      if base + '.convdim' in params:
         return o + F.conv2d(x, params[base + '.convdim'])
      else:
         return o + x
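   # Each block is a pre-activation residual unit: (optional ReLU) -> conv ->
   # ReLU -> conv (-> optional BN). When downsampling, both the residual
   # branch and the shortcut are average-pooled, and the 1x1 'convdim' conv
   # matches the shortcut's channel count before the addition.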


   def group(o, params, base, mode, do_downsample, is_first=False):
      for i in range(count):
         o = block(o, params, '%s.block%d' % (base,i), mode, 
                   do_downsample=(do_downsample and i == count-1), 
                   is_first=(is_first and i == 0))                   
      return o

   def f(input, params, mode):
      o = group(input, params, 'group0', mode, do_downsample=True, is_first=True)
      o = group(o, params, 'group1', mode, do_downsample=True)
      o = group(o, params, 'group2', mode, do_downsample=True)
      o = group(o, params, 'group3', mode, do_downsample=True)      
      o = F.relu(o, inplace=True)
      o = o.view(o.size(0), -1)
      o = F.linear(o, params['fc.weight'], params['fc.bias'])
      return o

   return f, flat_params   
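# Minimal usage sketch (hypothetical sizes; assumes utils.cast places the
# parameters on the desired device/dtype):
#   f_D, params_D = resnet4_D(nn=64, imgsz=32, channels=3, norm_type='none',
#                             requires_grad=True)
#   scores = f_D(images, params_D, mode=True)  # (batch, 3, 32, 32) -> (batch, 1)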
Example n. 32
0
def resnet4_G(input_dim, n0g, imgsz, channels,
             norm_type,  # 'bn', 'none'
             requires_grad,
             do_bias=True):         
   depth = 4
   ker = 3
   padding = (ker-1)//2
   count = 1

   def gen_resnet_G_block_params(ni, no, k, norm_type, do_bias):
      return {
         'conv0': conv2d_params(ni, no, k, do_bias), 
         'conv1': conv2d_params(no, no, k, do_bias), 
         'convdim': utils.conv_params(ni, no, 1), 
         'bn': utils.bnparams(no) if norm_type == 'bn' else None
      }

   def gen_group_params(ni, no):
       return {'block%d' % i: gen_resnet_G_block_params(ni if i == 0 else no, no, ker, norm_type, do_bias) for i in range(count)}

   nn = n0g * (2**(depth-1)); sz = imgsz // (2**depth)
   flat_params = utils.cast(utils.flatten({
        'proj': utils.linear_params(input_dim, nn*sz*sz),
        'group0': gen_group_params(nn,    nn//2),
        'group1': gen_group_params(nn//2, nn//4),
        'group2': gen_group_params(nn//4, nn//8),
        'group3': gen_group_params(nn//8, nn//8),        
        'last_conv': conv2d_params(nn//8, channels, ker, do_bias),
   }))
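   # Shape bookkeeping: the latent vector is projected to an (nn, sz, sz)
   # feature map with nn = n0g * 2**(depth-1) and sz = imgsz // 2**depth;
   # each group upsamples by 2 and (except the last) halves the channels, so
   # 'last_conv' emits a (channels, imgsz, imgsz) image squashed by tanh.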

   if requires_grad:
      utils.set_requires_grad_except_bn_(flat_params)

   def block(x, params, base, mode, do_upsample):
      o = F.relu(x, inplace=True)
      if do_upsample:
        o = F.interpolate(o, scale_factor=2, mode='nearest')
            
      o = F.conv2d(o, params[base+'.conv0.w'], params.get(base+'.conv0.b'), padding=padding)
      o = F.relu(o, inplace=True)
      o = F.conv2d(o, params[base+'.conv1.w'], params.get(base+'.conv1.b'), padding=padding)
      if norm_type == 'bn':
         o = utils.batch_norm(o, params, base + '.bn', mode)
         
      xo = F.conv2d(x, params[base + '.convdim']) 
      if do_upsample:
         return o + F.interpolate(xo, scale_factor=2, mode='nearest')
      else:
         return o + xo
 
   def group(o, params, base, mode, do_upsample):
      for i in range(count):
         o = block(o, params, '%s.block%d' % (base,i), mode, do_upsample if i == 0 else False)
      return o

   def show_shape(o, msg=''):
      print(o.size(), msg)

   def f(input, params, mode):
      o = F.linear(input, params['proj.weight'], params['proj.bias'])
      o = o.view(input.size(0), nn, sz, sz)
      o = group(o, params, 'group0', mode, do_upsample=True)
      o = group(o, params, 'group1', mode, do_upsample=True)
      o = group(o, params, 'group2', mode, do_upsample=True)
      o = group(o, params, 'group3', mode, do_upsample=True)
      o = F.relu(o, inplace=True)
      o = F.conv2d(o, params['last_conv.w'], params.get('last_conv.b'), padding=padding)
      o = torch.tanh(o)
      return o

   return f, flat_params   
   
Example n. 33
0
def do_nmf(run_id, no_processes=16):
    stat = RunStats.objects.get(run_id=run_id)
    qid = stat.query.id
    K = stat.K

    TopicTerm.objects.filter(run_id=run_id).delete()
    DocTopic.objects.filter(run_id=run_id).delete()
    Topic.objects.filter(run_id=run_id).delete()

    stat.term_set.clear()

    alpha = stat.alpha
    n_features = stat.max_features
    if n_features == 0:
        n_features = 100000000000
    limit = stat.limit
    ng = stat.ngram

    # if stat.method=="LD" and stat.lda_library!=RunStats.WARP:
    #     if stat.max_iter == 200:
    #         stat.max_iter = 10
    #     if stat.max_iter > 100:
    #         stat.max_iter = 90

    n_samples = stat.max_iter

    stat.process_id = os.getpid()
    stat.status = 1
    stat.save()

    if stat.fulltext:
        docs = Doc.objects.filter(query=qid, fulltext__iregex=r'\w')
    else:
        docs = Doc.objects.filter(query=qid, content__iregex=r'\w')

    # if we are limiting, probably for testing, then do that
    if limit > 0:
        docs = docs[:limit]

    print('\n###############################\n'
          '## Topic modeling (method: {}, library: {}) on query {} with {} '
          'documents and {} topics (run_id: {})\n'.format(
              stat.method, stat.lda_library, qid, docs.count(), K, run_id))

    # Get the docs into lists
    abstracts, docsizes, ids, citations = proc_docs(docs, stoplist,
                                                    stat.fulltext,
                                                    stat.citations)

    scaled_citations = 1 + RobustScaler(with_centering=False).fit_transform(
        np.array(citations).reshape(-1, 1))

    sentences = [get_sentence(x) for x in abstracts]
    w2v = gensim.models.Word2Vec(sentences)
    validation_measure = WithinTopicMeasure(ModelSimilarity(w2v))
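    # A Word2Vec model trained on the corpus feeds the coherence measure:
    # presumably the mean pairwise embedding similarity of each topic's
    # top-ranked terms (TC-W2V), used for stat.coherence at the end.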

    if stat.fancy_tokenization:
        ######################################
        ## A fancy tokenizer

        from nltk import wordpunct_tokenize
        from nltk import WordNetLemmatizer
        from nltk import sent_tokenize
        from nltk import pos_tag
        from nltk.corpus import stopwords as sw
        punct = set(string.punctuation)
        from nltk.corpus import wordnet as wn
        stopwords = set(sw.words('english'))

        if stat.extra_stopwords:
            stopwords = stopwords | set(stat.extra_stopwords)

        def lemmatize(token, tag):
            tag = {
                'N': wn.NOUN,
                'V': wn.VERB,
                'R': wn.ADV,
                'J': wn.ADJ
            }.get(tag[0], wn.NOUN)
            return WordNetLemmatizer().lemmatize(token, tag)
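        # pos_tag returns Penn Treebank tags; only the first letter selects a
        # WordNet POS (defaulting to noun), e.g. lemmatize('studies', 'NNS')
        # -> 'study' and lemmatize('running', 'VBG') -> 'run'.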

        kws = Doc.objects.filter(
            query=stat.query,
            kw__text__iregex=r'\w+[\-\ ]').values('kw__text').annotate(
                n=Count('pk')).filter(n__gt=len(abstracts) //
                                      200).order_by('-n')

        kw_text = set([x['kw__text'].replace('-', ' ') for x in kws])
        kw_ws = set([x['kw__text'].replace('-', ' ').split()[0]
                     for x in kws]) - stopwords

        def fancy_tokenize(X):

            common_words = set([x.lower() for x in X.split()]) & kw_ws
            for w in list(common_words):
                w = w.replace('(', '').replace(')', '')
                wpat = r"({}\W*\w*)".format(w)
                # avoid reusing the name 'wn', which aliases nltk's wordnet above
                w_variants = [
                    x.lower().replace('-', ' ')
                    for x in re.findall(wpat, X, re.IGNORECASE)
                ]
                kw_matches = set(w_variants) & kw_text
                if len(kw_matches) > 0:
                    for m in kw_matches:
                        insensitive_m = re.compile(m, re.IGNORECASE)
                        X = insensitive_m.sub(' ', X)
                        yield m.replace(" ", "-")

            for sent in sent_tokenize(X):
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    token = token.lower().strip()
                    if token in stopwords:
                        continue
                    if all(char in punct for char in token):
                        continue
                    if len(token) < 3:
                        continue
                    if all(char in string.digits for char in token):
                        continue
                    lemma = lemmatize(token, tag)
                    yield lemma

        tokenizer = fancy_tokenize
    else:
        tokenizer = snowball_stemmer()

    #######################################

    #############################################
    # Use tf-idf features for NMF.
    print("Extracting tf-idf features ...")
    tfidf_vectorizer = TfidfVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    count_vectorizer = CountVectorizer(max_df=stat.max_df,
                                       min_df=stat.min_freq,
                                       max_features=n_features,
                                       ngram_range=(ng, ng),
                                       tokenizer=tokenizer,
                                       stop_words=stoplist)

    t0 = time()
    if stat.method == "NM":
        tfidf = tfidf_vectorizer.fit_transform(abstracts)
        vectorizer = tfidf_vectorizer
    else:
        tfidf = count_vectorizer.fit_transform(abstracts)
        vectorizer = count_vectorizer
    print("done in %0.3fs." % (time() - t0))
    stat.tfidf_time = time() - t0
    stat.save()

    if citations is not False:
        tfidf = tfidf.multiply(scaled_citations)

    del abstracts
    gc.collect()

    if stat.db:
        vocab = vectorizer.get_feature_names()
        vocab_ids = []
        pool = Pool(processes=no_processes)
        vocab_ids.append(pool.map(partial(add_features, run_id=run_id), vocab))
        pool.terminate()
        #del vocab
        vocab_ids = vocab_ids[0]

        ## Make some topics
        django.db.connections.close_all()
        topic_ids = db.add_topics(K, run_id)
        gc.collect()

    # Fit the NMF model
    print("Fitting the model with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    if stat.method == "NM":
        model = NMF(n_components=K,
                    random_state=1,
                    alpha=alpha,
                    l1_ratio=.1,
                    verbose=True,
                    init='nndsvd',
                    max_iter=n_samples).fit(tfidf)
        dtm = csr_matrix(model.transform(tfidf))
        components = csr_matrix(model.components_)

    else:
        if stat.lda_library == RunStats.LDA_LIB:
            model = lda.LDA(
                n_topics=K,
                alpha=stat.alpha,
                eta=stat.alpha,
                n_iter=stat.max_iter * 10,
            ).fit(tfidf)
            dtm = model.doc_topic_
            components = csr_matrix(model.components_)
        elif stat.lda_library == RunStats.WARP:
            # Export warp lda
            try:
                warp_path = settings.WARP_LDA_PATH
                os.chdir(warp_path)
            except Exception:
                print(
                    "warplda is not installed, or its path is not defined in settings, exiting...."
                )
                return
            fname = wpu.export_warp_lda(ids, tfidf, vocab, run_id)
            # preformat
            os.system(f'./format -input {fname} -prefix {run_id} train')
            # Run warp lda
            runcmd = f'./warplda --prefix {run_id} --k {stat.K}'
            if stat.alpha:
                runcmd += f' -alpha {stat.alpha}'
            if stat.beta:
                runcmd += f' -beta {stat.beta}'
            else:
                stat.beta = 0.01  # default beta value
                stat.save()
            if stat.max_iter:
                runcmd += f' --niter {stat.max_iter}'
            runcmd += ' train.model'
            print("Running warplda.")
            os.system(runcmd)
            print("Finished running warplda, importing results.")

            warp_vocab = np.loadtxt(f'{run_id}.vocab', dtype=str)
            warp_translate = np.argsort(warp_vocab).argsort()
            # Import warp lda as matrices
            with open(f'{run_id}.model', 'r') as f:
                for i, l in enumerate(f):
                    if i == 0:
                        M = int(l.split()[0])
                        N = int(l.split()[1])
                        components = lil_matrix((N, M))
                    else:
                        largs = l.split('\t')[1].strip().split()
                        for la in largs:
                            wid = warp_translate[i - 1]
                            t, n = la.split(':')
                            components[int(t), wid] = int(n)

            components = components.todense()
            for k in range(components.shape[0]):
                # Dirichlet posterior mean over words: smooth with beta and
                # normalise by the vocabulary size (not K, the number of topics)
                components[k, :] = (components[k, :] + stat.beta) / (
                    components[k, :].sum() + components.shape[1] * stat.beta)
            components = csr_matrix(components)

            dtm = lil_matrix((len(ids), N))
            with open(f'{run_id}.z.estimate', 'r') as f:
                for i, l in enumerate(f):
                    largs = l.split(' ', maxsplit=1)[1].strip().split()
                    for la in largs:
                        w, t = la.split(':')
                        dtm[i, int(t)] += 1

            theta = dtm.todense()
            for i in range(dtm.shape[0]):
                theta[i, :] = (theta[i, :] + stat.alpha) / (
                    theta[i, :].sum() + stat.K * stat.alpha)

            dtm = csr_matrix(theta)
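            # theta[i, k] is now the Dirichlet posterior mean
            # (n_{i,k} + alpha) / (n_i + K * alpha), i.e. the estimated
            # document-topic distribution recovered from the warplda samples.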

        else:
            model = LDA(
                n_components=K,
                doc_topic_prior=stat.alpha,
                topic_word_prior=stat.beta,
                learning_method=stat.get_lda_learning_method_display().lower(),
                max_iter=stat.max_iter,
                n_jobs=2).fit(tfidf)

            dtm = csr_matrix(model.transform(tfidf))
            components = csr_matrix(model.components_)

    print("done in %0.3fs." % (time() - t0))
    stat.nmf_time = time() - t0

    if stat.db:
        ## Add topics terms
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(components)
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=no_processes)

        tts.append(
            pool.map(
                partial(db.f_lambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=topic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        TopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))
        stat.db_time = stat.db_time + time() - t0

        ## Add topic-docs
        print("Adding DocTopics")
        gamma = find(dtm)
        glength = len(gamma[0])

        chunk_size = 100000

        parallel_add = True

        all_dts = []

        make_t = 0
        add_t = 0

        t0 = time()
        ### Go through in chunks
        for i in range(glength // chunk_size + 1):
            dts = []
            values_list = []
            f = i * chunk_size
            l = (i + 1) * chunk_size
            if l > glength:
                l = glength
            docs = range(f, l)
            doc_batches = []
            for p in range(no_processes):
                doc_batches.append([x for x in docs if x % no_processes == p])
            pool = Pool(processes=no_processes)
            make_t0 = time()
            values_list.append(
                pool.map(
                    partial(db.f_gamma_batch,
                            gamma=gamma,
                            docsizes=docsizes,
                            docUTset=ids,
                            topic_ids=topic_ids,
                            run_id=run_id), doc_batches))
            #dts.append(pool.map(partial(f_gamma, gamma=gamma,
            #                docsizes=docsizes,docUTset=ids,topic_ids=topic_ids),doc_batches))
            pool.terminate()
            make_t += time() - make_t0
            print(make_t)
            django.db.connections.close_all()

            add_t0 = time()
            values_list = [item for sublist in values_list for item in sublist]
            pool = Pool(processes=no_processes)
            pool.map(insert_many, values_list)
            pool.terminate()
            add_t += time() - add_t0
            print(add_t)
            gc.collect()
            sys.stdout.flush()

        stat.db_time = stat.db_time + time() - t0
        print("done in %0.3fs." % (time() - t0))

    em = 0
    for i in range(K):
        if dtm[:, i].nnz == 0:
            em += 1

    stat.empty_topics = em
    if stat.method == "NM":
        stat.error = model.reconstruction_err_
        stat.errortype = "Frobenius"
    elif stat.method == "LD":
        if stat.lda_library == RunStats.LDA_LIB:
            stat.error = model.loglikelihood()
            stat.errortype = "Log likelihood"
            stat.iterations = model.n_iter
        elif stat.lda_library == RunStats.WARP:
            pass
        else:
            stat.error = model.perplexity(tfidf)
            stat.errortype = "Perplexity"
            stat.iterations = model.n_iter_
    stat.last_update = timezone.now()
    stat.status = 3

    stat.save()

    if stat.db:
        term_rankings = []

        topics = Topic.objects.filter(run_id=run_id)

        for topic in topics:
            term_ranking = list(
                Term.objects.filter(topicterm__topic=topic).order_by(
                    '-topicterm__score').values_list('title', flat=True)[:50])
            term_rankings.append(term_ranking)

        stat.coherence = validation_measure.evaluate_rankings(term_rankings)
        stat.save()
        management.call_command('update_run', run_id)