def read_data(self, query, index, max_size, query_type):
    """Run one search against ``index`` and return the raw hit list.

    Parameters
    ----------
    query : dict
        Request body handed to ``self.es.search``.
    index : str
        Name of the index to search.
    max_size : int
        Value passed as the ``size`` argument of the search.
    query_type : str
        Value passed as the ``doc_type`` argument of the search.

    Returns
    -------
    list
        ``response['hits']['hits']``, or an empty list when the index
        does not exist.
    """
    # Guard clause: there is nothing to search when the index is missing.
    if not self.es.indices.exists(index=index):
        return []

    search_kwargs = {
        "index": index,
        "size": max_size,
        "scroll": '5m',  # Keep the scroll window open for 5 minutes
        "body": query,
        "doc_type": query_type,
    }
    response = self.es.search(**search_kwargs)
    found = response['hits']
    logger.info({"hits": found['total']})
    return found['hits']
def read_data(q):
    """Fetch every document matching ``q["query"]`` from the configured index.

    The query is re-wrapped so that only its ``"query"`` key is sent, then
    handed to ``esd.read_all_data`` together with the index and type taken
    from ``cfg['elk']`` and a page size of 5000.

    Parameters
    ----------
    q : dict
        Mapping that must contain a ``"query"`` key.

    Returns
    -------
    list
        Whatever ``esd.read_all_data`` returns; may be empty.
    """
    logger.debug(q)
    es_query = {"query": q["query"]}
    logger.info({"es_query": es_query})
    hits = esd.read_all_data(es_query, cfg['elk']['index'],
                             cfg['elk']['type'], 5000)
    # An empty result is legitimate. Indexing hits[0] unconditionally raised
    # IndexError here before the (empty) result could be returned.
    logger.info({"first hit": hits[0] if hits else None})
    return hits
def read_all_data(self, query, index, query_type, scroll_size=500, reduce=None, to=30000):
    """Page through every hit of ``query`` using the scroll API.

    Parameters
    ----------
    query : dict
        Request body handed to ``self.es.search``.
    index : str
        Index to search.
    query_type : str
        Value passed as ``doc_type``.
    scroll_size : int
        Page size (``size`` argument of the initial search).
    reduce : list, optional
        Forwarded unchanged to ``self.reduce_data`` for every page. ``None``
        (the default) is replaced by a fresh empty list on each call.
    to : int
        Forwarded as ``request_timeout`` of the initial search.

    Returns
    -------
    list
        Concatenation of ``self.reduce_data(page_hits, reduce)`` over all pages.
    """
    # A literal [] default would be one list shared by every call.
    if reduce is None:
        reduce = []

    page = self.es.search(index=index,
                          size=scroll_size,
                          scroll='5m',  # Keep the scroll window open for 5 minutes
                          body=query,
                          doc_type=query_type,
                          request_timeout=to)
    sid = page['_scroll_id']
    hits = page['hits']['hits']
    data = self.reduce_data(hits, reduce)

    # Drive the loop from the number of hits actually returned rather than
    # from page['hits']['total']: the total is a plain int on older servers
    # but an object on newer ones, and comparing that object with 0 raises.
    returned = len(hits)
    while returned > 0:
        logger.info("Scrolling...")
        page = self.es.scroll(scroll_id=sid, scroll='2m')
        # The server may hand back a new scroll id on every page.
        sid = page['_scroll_id']
        hits = page['hits']['hits']
        data += self.reduce_data(hits, reduce)
        returned = len(hits)
        logger.info({"scroll size:": str(returned)})
    return data
def evaluate(args, test_dataset, model, tokenizer, mode='test'):
    """Score ``model`` on ``test_dataset`` and return the binary F1.

    Parameters
    ----------
    args : argparse.Namespace
        Reads ``output_dir``, ``eval_batch_size`` and ``device``.
    test_dataset : Dataset
        Each item is indexed as (input_ids, attention_mask, segment_ids, label).
    model : torch.nn.Module
        Called with ``input_ids``, ``attention_mask`` and ``segment_ids``.
    tokenizer, mode
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    float
        ``metrics.f1_score(y_true, y_pred, average='binary')``.
    """
    os.makedirs(args.output_dir, exist_ok=True)

    eval_sampler = SequentialSampler(test_dataset)
    eval_dataloader = DataLoader(test_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    logger.info('******* Running Evaluation ********')
    logger.info(' Num examples = %d', len(test_dataset))
    logger.info(' Batch size = %d', args.eval_batch_size)

    y_true = []
    y_pred = []
    # Switching to eval mode once is enough; nothing in the loop undoes it.
    model.eval()
    for batch in tqdm(eval_dataloader, desc='Iter'):
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'segment_ids': batch[2]
        }
        yt = batch[3].view(-1, 1).to('cpu').numpy()
        with torch.no_grad():
            logits = model(**inputs)
        # The conversion result used to be thrown away, so every comparison
        # below ran on the device tensor; keep the host copy instead.
        scores = logits.detach().cpu().numpy()
        # NOTE(review): train() optimises BCEWithLogitsLoss, which suggests the
        # model emits raw logits; a 0.5 cut-off on raw logits is not a 0.5
        # probability cut-off. Threshold left unchanged -- confirm intent.
        for label, score in zip(yt, scores):
            y_true.append(label[0])
            y_pred.append(1 if score >= 0.5 else 0)

    logger.info(' y_true[:15]: %s', y_true[:15])
    logger.info(' y_pred[:15]: %s', y_pred[:15])
    logger.info(' Accuracy score: %f', metrics.accuracy_score(y_true, y_pred))
    f1 = metrics.f1_score(y_true, y_pred, average='binary')
    return f1
def load_and_cache_examples(args, processor, tokenizer, evaluate=False, dev=False, output_examples=False):
    """Build (or load from cache) the tensors for one data split.

    The split is ``eval`` when ``evaluate`` is set, else ``dev`` when ``dev``
    is set, else ``train``. Features are read from a cache file under
    ``args.data_dir`` when it exists, unless ``args.overwrite_cache`` or
    ``output_examples`` is set; otherwise they are rebuilt with
    ``convert_examples_to_features`` and written back to the cache file.

    Returns
    -------
    tuple
        ``(input_ids, attention_masks, segment_ids, labels)`` as long tensors.
    """
    if evaluate:
        split = 'eval'
    elif dev:
        split = 'dev'
    else:
        split = 'train'

    # Last non-empty path component of the model name/path.
    model_tag = [part for part in args.bert_model.split('/') if part][-1]
    cache_name = 'cached_{}_{}_{}'.format(split, model_tag,
                                          str(args.max_seq_length))
    cached_features_file = os.path.join(args.data_dir, cache_name)

    # TODO: refactor the Example structure, save both Example and labels to cache
    if (not os.path.exists(cached_features_file) or args.overwrite_cache
            or output_examples):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if evaluate:
            loaded = processor.get_test_examples(args.data_dir)
        elif dev:
            loaded = processor.get_dev_examples(args.data_dir)
        else:
            loaded = processor.get_train_examples(args.data_dir)
        # The processor returns a pair; only the examples are used here.
        examples, _labels = loaded
        features = convert_examples_to_features(args, examples, tokenizer)
        logger.info('saving features into cache file %s', cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    def _column(values):
        # Every column is stored as a long tensor.
        return torch.tensor(values, dtype=torch.long)

    all_input_ids = _column([feat.input_ids for feat in features])
    all_attention_masks = _column([feat.attention_masks for feat in features])
    all_segment_ids = _column([feat.segment_ids for feat in features])
    all_labels = _column([feat.label for feat in features])
    # TODO: check shape
    return (all_input_ids, all_attention_masks, all_segment_ids, all_labels)
def information_leakage(self, clusters, sample_size=5000, joint_leakage=True):
    """
    Evaluate the information leakage for feature(s).

    Per-site probability predictions for random samples are obtained from
    ``self._do_predictions`` for every cluster. The conditional entropy
    H(C|f) is then estimated as the mean per-sample entropy of the
    prior-weighted, normalised predictions, and the leakage is reported as
    H(C) - H(C|f).

    Parameters
    ----------
    clusters : list
        A list of lists; each inner list holds the features of one cluster.
        A single feature or a single cluster may be given instead and is
        wrapped in additional lists to match the expected form.
    sample_size : int
        Count of random feature samples to use for monte-carlo estimation.
        Stored on ``self.sample_size``.
    joint_leakage : bool
        If True, the per-cluster sample probabilities are multiplied together
        (clusters are assumed independent) and one joint leakage is measured.
        Otherwise the leakage of each cluster is measured separately.

    Returns
    -------
    list
        Estimated leakage(s). One element when ``joint_leakage`` is True,
        otherwise one element per cluster in the order of ``clusters``.
    """
    # convert one feature to singular list for comparability
    if not isinstance(clusters, Iterable):
        clusters = [clusters]
    if not isinstance(clusters[0], Iterable):
        clusters = [clusters]
    self.sample_size = sample_size
    logger.debug("Measuring leakage for {}".format(clusters))

    def h(p):
        # Shannon entropy term: -p(x) * log2(p(x))
        return -p * math.log(p, 2)

    # H(C) -- website entropy, the maximum number of bits which can be leaked
    H_C = sum([h(prior) for prior in self.website_priors if prior > 0])

    # map clusters to probability predictions for random samples
    # (done in parallel when a pool is configured)
    if self._pool is None:
        results = map(self._do_predictions, clusters)
    else:
        results = self._pool.imap(self._do_predictions, clusters)
        self._pool.close()

    # Log roughly every 5% of clusters. The step is clamped to at least 1,
    # and the subtraction is parenthesised: `count-1 % step` parsed as
    # `count - (1 % step)`, which made the progress line almost never fire.
    step = max(1, int(len(clusters) * 0.05))
    cluster_probs = []
    for probs in results:
        cluster_probs.append(probs)
        count = len(cluster_probs)
        if (count - 1) % step == 0:
            logger.info("Progress: {}/{}".format(count, len(clusters)))

    # restart pool if multiprocessing
    if self._pool is not None:
        self._pool.join()
        self._pool.restart()

    if joint_leakage:
        # clusters are assumed independent, so the joint probability of each
        # sample is the product of its per-cluster probabilities
        cluster_probs = np.array(cluster_probs)
        prob_sets = [np.prod(cluster_probs, axis=0)]
    else:
        # measure leakages for each cluster independently
        prob_sets = cluster_probs

    leakages = []
    for i, prob_set in enumerate(prob_sets):
        # weight the per-site predictions by the website priors
        probs_weighted = np.array([
            probs * self.website_priors[site]
            for site, probs in enumerate(prob_set)
        ])
        # transpose so the first index is the sample, the second the site
        probs_weighted = np.transpose(probs_weighted)

        # normalise so the per-site probabilities of each sample sum to one;
        # all-zero samples are left untouched to avoid dividing by zero
        entropies = []
        for probs in probs_weighted:
            total = sum(probs)
            norm = probs / total if total > 0 else probs
            entropies.append(sum([h(prob) for prob in norm if prob > 0]))

        # H(C|f) -- estimate real entropy as average of all samples
        H_CF = sum(entropies) / len(entropies)

        # I(C;f) = H(C) - H(C|f)
        leakage = H_C - H_CF
        leakages.append(leakage)

        # In joint mode the single result covers every cluster, not just
        # the first one.
        measured = clusters if joint_leakage else clusters[i]
        logger.debug("{cluster} {l} = {c} - {cf}"
                     .format(cluster=measured, l=leakage, c=H_C, cf=H_CF))
    return leakages
def update(self, doc, index, doc_id, doc_type='list'):
    """Send ``doc`` as the body of an update request for ``doc_id``.

    Parameters
    ----------
    doc : dict
        Request body passed to ``self.es.update``.
    index : str
        Target index.
    doc_id : str
        Identifier of the document to update.
    doc_type : str
        Value passed as ``doc_type`` (defaults to ``'list'``).

    Returns
    -------
    The response returned by ``self.es.update``.
    """
    logger.debug({"about to update": str(doc)})
    request = {"id": doc_id, "doc_type": doc_type, "body": doc}
    response = self.es.update(index, **request)
    logger.info({"updated. results": str(response)})
    return response
def write(self, doc, index, doc_id, doc_type='list'):
    """Index ``doc`` under ``doc_id``.

    Parameters
    ----------
    doc : dict
        Document body passed to ``self.es.index``.
    index : str
        Target index.
    doc_id : str
        Identifier to store the document under.
    doc_type : str
        Value passed as ``doc_type`` (defaults to ``'list'``).

    Returns
    -------
    The response returned by ``self.es.index``.
    """
    logger.debug({"About to write": str(doc)})
    request = {"id": doc_id, "doc_type": doc_type, "body": doc}
    response = self.es.index(index, **request)
    logger.info({"wrote, results": str(response)})
    return response
def train(args, train_dataset, dev_dataset, model, tokenizer):
    """Fine-tune ``model`` and keep the checkpoint with the best dev F1.

    Runs ``int(args.num_train_epochs)`` epochs over ``train_dataset`` with
    AdamW, a linear warmup schedule and gradient accumulation. Every
    ``args.dev_step`` optimizer steps the model is scored with ``evaluate``
    on ``dev_dataset``; whenever the F1 improves, the whole model object is
    saved under ``args.output_dir``.

    Parameters
    ----------
    args : argparse.Namespace
        Training configuration (batch size, learning rate, warmup, ...).
    train_dataset, dev_dataset : Dataset
        Each item is indexed as (input_ids, attention_mask, segment_ids, label).
    model : torch.nn.Module
        Called with ``input_ids``, ``attention_mask`` and ``segment_ids``.
    tokenizer
        Forwarded to ``evaluate``.

    Returns
    -------
    tuple
        ``(global_step, average_loss)``. The average is 0.0 when no optimizer
        step was taken.
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    t_total = (len(train_dataloader) // args.gradient_accumulation_steps
               * args.num_train_epochs)
    # A non-zero warmup ratio takes precedence over an absolute step count.
    if args.warmup_ratio:
        ws = args.warmup_ratio * t_total
    else:
        ws = args.warmup_steps

    # prepare the optimizer and scheduler (linear warmup and decay);
    # biases and LayerNorm weights are excluded from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=ws,
                                     t_total=t_total)

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    # Loop invariants: the loss module is stateless and the checkpoint path
    # depends only on args, so build both once.
    loss_func = BCEWithLogitsLoss()
    output_file = os.path.join(
        args.output_dir,
        str(args.max_seq_length) + '_' + str(args.train_batch_size) + '_' +
        args.task + 'best_model.bin')

    global_step = 0
    tr_loss = 0.0
    best_f1 = 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc='Epoch',
                            disable=args.local_rank not in [-1, 0])
    for _ in train_iterator:
        epoch_loss = 0.0
        epoch_iteration = tqdm(train_dataloader, desc='Iter')
        for step, batch in enumerate(epoch_iteration):
            # evaluate() leaves the model in eval mode, so re-enter train
            # mode on every step.
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'segment_ids': batch[2]
            }
            y_true = batch[3].view(-1, 1).float()
            logits = model(**inputs)
            loss = loss_func(logits, y_true)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            tr_loss += loss.item()
            epoch_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps != 0:
                continue

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if global_step % args.dev_step == 0:
                curr_f1 = evaluate(args, dev_dataset, model, tokenizer,
                                   mode='dev')
                logger.info(' current f1: %f', curr_f1)
                if curr_f1 > best_f1:
                    best_f1 = curr_f1
                    logger.info(' best f1: %f', best_f1)
                    # Unwrap DataParallel-style containers before saving.
                    model_to_save = (model.module
                                     if hasattr(model, 'module') else model)
                    torch.save(model_to_save, output_file)
        logger.info('Training loss current epoch: %f', epoch_loss)

    # No optimizer step happens when there are zero epochs or fewer batches
    # than accumulation steps; dividing by global_step then raised
    # ZeroDivisionError.
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss
def _build_arg_parser():
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser()

    # required parameters
    parser.add_argument(
        '--data_dir', type=str, default=None, required=True,
        help="Dir of input data. DON'T include exact file name")
    parser.add_argument(
        '--bert_model', type=str, default=None, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        '--task', type=str, default=None, required=True,
        help='Will use as name of saved models and result file')
    parser.add_argument('--output_dir', type=str, default=None, required=True)

    # Other optional parameters
    parser.add_argument("--model_type", default='bert')
    parser.add_argument("--split_ratio", type=float, default=0.25)
    parser.add_argument(
        "--cache_dir", type=str, default="",
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", type=int, default=128,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", type=int, default=8,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", type=int, default=8,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", type=float, default=5e-5,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", type=float, default=3.0,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", type=float, default=0.1,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache', action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--warmup_steps", type=int, default=0,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_ratio", type=float, default=0.0,
                        help="Linear warmup over warmup_ratio.")
    parser.add_argument("--weight_decay", type=float, default=0.0,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-8,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=float, default=1.0,
                        help="Max gradient norm.")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--dev_step', type=float, default=500)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    return parser


def main():
    """Command-line entry point: parse options, then train and/or evaluate.

    With ``--do_train`` the train and dev splits are loaded and ``train`` is
    run. With ``--do_eval`` the saved best model is loaded from
    ``--output_dir`` and scored on the split produced by
    ``load_and_cache_examples(..., evaluate=True)``.

    Raises
    ------
    ValueError
        If training would write into a non-empty output directory without
        ``--overwrite_output_dir``, or if ``--bert_model`` is not one of the
        two supported names.
    """
    args = _build_arg_parser().parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        message = "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        raise ValueError(message.format(args.output_dir))

    # Setup CUDA
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    args.device = torch.device("cuda" if use_cuda else "cpu")

    # Set seed
    set_seed(args)

    model_class, tokenizer_class = MODEL_CLASSES['bert']
    processor = ClfProcessor()

    # Pick the vocabulary / config / weight paths for the requested model.
    # Attributes are looked up lazily so only the selected set is touched.
    prefixes = {'bert-base-cased': 'BASE', 'wwm': 'WWM'}
    if args.bert_model not in prefixes:
        raise ValueError(
            'Currently only support Bert Base Cased(bert-base-cased) and Whole Word Masking Cased(wwm)'
        )
    prefix = prefixes[args.bert_model]
    vocab_file = getattr(weightpath, prefix + '_VOCAB_FILE')
    config_file = getattr(weightpath, prefix + '_CONFIG_FILE')
    weight_file = getattr(weightpath, prefix + '_WEIGHTS')

    # prepare the pretrained model and tokenizer
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)
    config = BertConfig.from_pretrained(config_file)
    # NOTE(review): neither `config` nor `weight_file` is passed on, so the
    # model is loaded as 'bert-base-cased' even when 'wwm' was requested --
    # confirm whether that is intended.
    model = model_class.from_pretrained('bert-base-cased')
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_tensors = load_and_cache_examples(args, processor, tokenizer)
        train_dataset = TensorDataset(*train_tensors)
        dev_tensors = load_and_cache_examples(args, processor, tokenizer,
                                              dev=True)
        dev_dataset = TensorDataset(*dev_tensors)
        global_step, tr_loss = train(args, train_dataset, dev_dataset, model,
                                     tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    if args.do_eval:
        best_model_name = (str(args.max_seq_length) + '_' +
                           str(args.train_batch_size) + '_' + args.task +
                           'best_model.bin')
        model = torch.load(os.path.join(args.output_dir, best_model_name))
        model.to(args.device)
        test_tensors = load_and_cache_examples(args, processor, tokenizer,
                                               evaluate=True)
        test_dataset = TensorDataset(*test_tensors)
        f1 = evaluate(args, test_dataset, model, tokenizer)
        logger.info(" f1: %f", f1)
def cluster(self, features, checkpoint=None, min_samples=1, min_cluster_size=3):
    """
    Find clusters in provided features.

    Fills an NxN distance matrix with ``1 - NMI`` for every feature pair,
    taking values from the analyzer's NMI cache (and the checkpoint file,
    if any) where available and computing the rest with
    ``self._estimate_nmi``. HDBSCAN is then fit on that precomputed
    distance matrix.

    Parameters
    ----------
    features : list
        A list of features to cluster.
    checkpoint : str
        Path to plaintext file to store feature redundancy checkpoint
        information. Do not perform checkpointing if None is used.
    min_samples : int
        The ``min_samples`` parameter passed to HDBSCAN.
    min_cluster_size : int
        The ``min_cluster_size`` parameter passed to HDBSCAN.

    Returns
    -------
    tuple
        ``(clusters, X)``. ``clusters`` is a nested list where each inner list
        holds one cluster's features; features labelled as noise each get
        their own singleton list. ``X`` is the distance matrix.
    """
    # Start from NaN rather than uninitialised memory so that any entry
    # which is never written can be detected below.
    X = np.full((len(features), len(features)), np.nan, dtype=float)

    # first position of every feature (same result as list.index, without
    # the linear scan)
    position = {}
    for idx, feature in enumerate(features):
        position.setdefault(feature, idx)

    # all possible combinations, including each feature with itself
    pairs = list(combinations_with_replacement(features, 2))

    chk_fi = None
    if checkpoint is not None:
        # read previously checkpointed NMI calculations into the cache
        if os.path.exists(checkpoint):
            with open(checkpoint, 'r') as previous:
                for line in previous:
                    if not line.startswith('='):
                        continue
                    try:
                        a, b, c = line[1:].split(',')
                        entry = ((int(a), int(b)), float(c))
                    except ValueError:
                        # malformed or partially written line
                        continue
                    self._nmi_cache.append(entry)
        # re-open checkpoint for appending
        chk_fi = open(checkpoint, 'a+')

    try:
        if self._nmi_cache:
            # Copy cached values for selected features into the matrix and
            # remember which unordered pairs are thereby done. Only exactly
            # those pairs are dropped from the work list: the previous filter
            # discarded every pair that merely shared an element position
            # with a cached pair (including the diagonal), leaving those
            # matrix entries unset.
            done = set()
            for cached_pair, nmi in self._nmi_cache:
                first, second = cached_pair[0], cached_pair[1]
                if first not in position or second not in position:
                    continue  # ignore unselected features in cache
                i, j = position[first], position[second]
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi
                done.add(frozenset((first, second)))
            pairs = [pair for pair in pairs if frozenset(pair) not in done]

        if pairs:
            # map pairs to nmi
            if self._pool is None:
                results = map(self._estimate_nmi, pairs)
            else:
                results = self._pool.imap(self._estimate_nmi, pairs)
                self._pool.close()

            # Log roughly every 5% of pairs. `count - 1 % step` parsed as
            # `count - (1 % step)`, so the subtraction is parenthesised and
            # the step clamped to a positive integer.
            step = max(1, int(len(pairs) * 0.05))
            for count, (pair, nmi) in enumerate(zip(pairs, results), start=1):
                if (count - 1) % step == 0:
                    logger.info("Progress: {}/{}".format(count, len(pairs)))
                fidx1, fidx2 = pair
                i, j = position[fidx1], position[fidx2]
                X[i][j] = 1 - nmi
                X[j][i] = 1 - nmi
                if chk_fi is not None:
                    chk_fi.write('={},{},{}\n'.format(fidx1, fidx2, nmi))
                    chk_fi.flush()

            # restart pool if multiprocessing
            if self._pool is not None:
                self._pool.join()
                self._pool.restart()
    finally:
        if chk_fi is not None:
            chk_fi.close()

    # verify that all values are filled (NaN never compares equal to
    # anything, so this has to go through isnan)
    assert not np.isnan(X).any(), "distance matrix has unfilled entries"

    # cluster on the precomputed distance matrix
    labels = HDBSCAN(metric='precomputed',
                     min_samples=min_samples,
                     min_cluster_size=min_cluster_size).fit_predict(X)
    logger.debug("Found {} clusters.".format(set(labels)))

    # organize the features into sub-lists, one per cluster label
    clusters = []
    for label in range(min(labels), max(labels) + 1):
        members = [features[i] for i, la in enumerate(labels) if la == label]
        if label >= 0:
            clusters.append(members)
        else:
            # negative labels mark features that did not cluster (noise);
            # each becomes its own independent cluster
            clusters.extend([member] for member in members)
    logger.debug("Clusters: {}".format(labels))
    return clusters, X
def prune(self, features, checkpoint=None, nmi_threshold=0.9, topn=100):
    """
    Reduce the feature-set to a list of top features which are non-redundant.

    Features are visited in the given order. Each one is checked against
    every feature accepted so far with ``self._check_redundancy``; it is
    accepted if none of those checks reports redundancy and pruned
    otherwise. Processing stops once ``topn`` features have been accepted or
    the input is exhausted. If the analyzer has a pool, the checks for one
    feature run in parallel.

    Parameters
    ----------
    features : list
        Features to prune, pre-sorted by importance with the most important
        feature at index 0. The list is not modified.
    checkpoint : str
        Path to plaintext file to store feature redundancy checkpoint
        information. Do not perform checkpointing if None is used.
    nmi_threshold : float
        Threshold value used to identify redundant features. Stored on
        ``self.nmi_threshold``.
    topn : int
        Maximum number of non-redundant features to collect.

    Returns
    -------
    tuple
        ``(cleaned, pruned)``: the accepted features (at most ``topn``) and
        the features rejected as redundant, both as lists.
    """
    # results of NMI calculations are saved in a list internal to the
    # analyzer, reducing computation in any subsequent cluster calls
    self._nmi_cache, self._mi_cache = [], dict()
    self.nmi_threshold = nmi_threshold

    cleaned_features = set()  # non-redundant
    pruned_features = set()  # redundant

    # if checkpointing, read any previously processed features
    checkpoint_fi = None
    if checkpoint is not None:
        if os.path.exists(checkpoint):
            with open(checkpoint, 'r') as previous:
                for line in previous:
                    tag, payload = line[:1], line[1:]
                    try:
                        if tag == '+':
                            cleaned_features.add(int(payload.strip()))
                        elif tag == '-':
                            pruned_features.add(int(payload.strip()))
                        elif tag == '=':
                            a, b, c = payload.split(',')
                            self._nmi_cache.append(
                                ((int(a), int(b)), float(c)))
                    except ValueError:
                        # malformed or partially written line
                        continue
        # re-open checkpoint for appending
        checkpoint_fi = open(checkpoint, 'a+')

    # Work on a private list: popping from `features` itself emptied the
    # caller's list whenever no checkpoint file had been loaded.
    pending = [
        f for f in features
        if f not in cleaned_features and f not in pruned_features
    ]
    # fixed denominator for the progress line
    target = min(topn, len(cleaned_features) + len(pending))

    try:
        for current_feature in pending:
            # stop as soon as the topN features have been selected
            if len(cleaned_features) >= topn:
                break
            logger.debug("MI analysis on feature #{}".format(current_feature))

            # pair the candidate with every accepted feature (snapshot, so
            # the set can be updated safely afterwards)
            feature_pairs = [(current_feature, other)
                             for other in cleaned_features]
            if self._pool is None or len(cleaned_features) < 2:
                results = map(self._check_redundancy, feature_pairs)
            else:
                # parallel, unordered
                results = self._pool.uimap(self._check_redundancy,
                                           feature_pairs)

            is_redundant = False
            for is_redundant, feature_pair, nmi in results:
                # save feature pair with nmi in cache
                self._nmi_cache.append((feature_pair, nmi))
                if checkpoint_fi is not None:
                    checkpoint_fi.write('={},{},{}\n'.format(
                        feature_pair[0], feature_pair[1], nmi))
                    checkpoint_fi.flush()
                # break upon first occurrence of redundancy
                if is_redundant:
                    # outstanding parallel checks are no longer needed:
                    # terminate the workers and restart the pool
                    if self._pool is not None:
                        self._pool.terminate()
                        self._pool.join()
                        self._pool.restart()
                    break

            if is_redundant:
                pruned_features.add(current_feature)
                marker = '-'
            else:
                cleaned_features.add(current_feature)
                marker = '+'
                logger.info("Progress: {}/{}".format(len(cleaned_features),
                                                     target))
            if checkpoint_fi is not None:
                checkpoint_fi.write('{}{}\n'.format(marker, current_feature))
                checkpoint_fi.flush()
    finally:
        if checkpoint_fi is not None:
            checkpoint_fi.close()

    # return both non-redundant and redundant features;
    # which feature was redundant with which is not saved
    return list(cleaned_features), list(pruned_features)
def _individual_measure(modeler, pool, checkpoint): """ Perform information leakage analysis for each feature one-by-one. The resulting leakages are saved in a plain-text ascii checkpoint file, which can be loaded in subsequent runs to avoid re-processing features. Parameters ---------- modeler : WebsiteFingerprintModeler initialized fingerprinting engine pool : ProcessPool Pool to use for multiprocessing. checkpoint : str Path to ascii file to save individual leakage checkpoint information. Returns ------- list list of leakages where the index of each leakage maps to the feature number """ leakage_indiv = [] # open a checkpoint file if checkpoint: lines = None if os.path.exists(checkpoint): with open(checkpoint, 'r') as tmp_file: past_leaks = [float(line) for line in tmp_file] lines = len(past_leaks) leakage_indiv = past_leaks tmp_file = open(checkpoint, 'a+') # if a pool has been provided, perform computation in parallel # otherwise do serial computation if checkpoint and lines: features = modeler.data.features[lines:] else: features = modeler.data.features if pool is None: proc_results = map(modeler, features) else: proc_results = pool.imap(modeler, features) pool.close() size = len(modeler.data.features) # number of features logger.info("Begin individual leakage measurements.") # measure information leakage # log current progress at twenty intervals for leakage in proc_results: leakage_indiv.append(leakage[0]) if len(leakage_indiv)-1 % int(size*0.05) == 0: logger.info("Progress: {}/{}".format(len(leakage_indiv), size)) if checkpoint: tmp_file.write('{}\n'.format(str(leakage[0]))) tmp_file.flush() logger.info("Progress: Done.") if pool is not None: pool.join() pool.restart() if checkpoint: tmp_file.close() return leakage_indiv
def main(features_path, output_path, n_procs=0, n_samples=5000, topn=100, nmi_threshold=0.9, discrete_threshold=100000):
    """
    Run the full information leakage analysis on a processed dataset.

    The pipeline has four stages: individual leakage per feature, pruning of
    redundant features, clustering of the remaining ones, and the joint
    leakage measurement over the clusters. The result of each of the first
    three stages is stored as a file under ``output_path`` and re-used on
    later runs when the file already exists.

    Parameters
    ----------
    features_path : str
        Path to the directory containing processed feature files.
    output_path : str
        Path to the directory where analysis results are saved; created if
        it does not exist.
    n_procs : int
        Number of processes to use. 0 means one per system CPU; values
        greater than 1 create a pool of that size; anything else runs
        without a pool.
    n_samples : int
        Number of samples for the monte-carlo estimation of the joint leakage.
    topn : int
        Top number of features to analyze during joint analysis.
    nmi_threshold : float
        Cut-off value for determining redundant features.
    discrete_threshold : int
        Forwarded to ``WebsiteFingerprintModeler``.

    Returns
    -------
    float
        Combined feature leakage (in bits)
    """
    # prepare feature dataset
    logger.info("Loading dataset.")
    feature_data = WebsiteData(features_path)
    logger.info("Loaded {} sites.".format(len(feature_data.sites)))
    logger.info("Loaded {} instances.".format(len(feature_data)))

    # create process pool
    pool = None
    if n_procs > 1:
        pool = Pool(n_procs)
    elif n_procs == 0:
        pool = Pool(cpu_count())

    # directory to save results
    outdir = output_path
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    def in_outdir(name):
        # every artefact lives directly under the output directory
        return os.path.join(outdir, name)

    # initialize fingerprint modeler
    modeler = WebsiteFingerprintModeler(feature_data,
                                        discrete_threshold=discrete_threshold)

    # stage 1: individual leakage (re-used from file when present)
    indiv_path = in_outdir('indiv.pkl')
    if not os.path.exists(indiv_path):
        logger.info("Begin individual feature analysis.")
        # perform individual measure with checkpointing
        chk_path = in_outdir('indiv_checkpoint.txt')
        leakage_indiv = _individual_measure(modeler, pool, chk_path)
        logger.info("Saving individual leakage to {}.".format(indiv_path))
        with open(indiv_path, "wb") as sink:
            dill.dump(leakage_indiv, sink)
    else:
        with open(indiv_path, "rb") as source:
            logger.info("Loading individual leakage measures from file.")
            leakage_indiv = dill.load(source)

    # initialize MI analyzer for the combined measurements
    analyzer = MutualInformationAnalyzer(feature_data, pool=pool)

    # rank features by descending individual leakage, ties by feature value;
    # MI analysis processes them in this order of importance
    logger.info("Sorting features by individual leakage.")
    ranked = sorted(zip(feature_data.features, leakage_indiv),
                    key=lambda item: (-item[1], item[0]))
    logger.debug("Top 20:\t {}".format(ranked[:20]))
    sorted_features = list(list(zip(*ranked))[0])

    # stage 2: prune to a list of non-redundant features
    cln_path = in_outdir('cleaned.pkl')
    rdn_path = in_outdir('redundant.pkl')
    chk_path = in_outdir('prune_checkpoint.txt')
    if not os.path.exists(cln_path):
        logger.info("Begin feature pruning.")
        cleaned, pruned = analyzer.prune(features=sorted_features,
                                         nmi_threshold=nmi_threshold,
                                         topn=topn,
                                         checkpoint=chk_path)
        with open(cln_path, 'wb') as sink:
            dill.dump(cleaned, sink)
        with open(rdn_path, 'wb') as sink:
            dill.dump(pruned, sink)
    else:
        logger.info("Loading top non-redundant features from file.")
        with open(cln_path, 'rb') as source:
            cleaned = dill.load(source)

    # stage 3: cluster non-redundant features
    dst_path = in_outdir('distance_matrix.pkl')
    cst_path = in_outdir('clusters.pkl')
    if not os.path.exists(cst_path):
        logger.info("Begin feature clustering.")
        clusters, distance_matrix = analyzer.cluster(cleaned,
                                                     checkpoint=chk_path)
        with open(dst_path, 'wb') as sink:
            dill.dump(distance_matrix, sink)
        with open(cst_path, 'wb') as sink:
            dill.dump(clusters, sink)
    else:
        logger.info("Loading clusters from file.")
        with open(cst_path, 'rb') as source:
            clusters = dill.load(source)

    # stage 4: joint information leakage measurement
    logger.info('Identified {} clusters.'.format(len(clusters)))
    logger.info("Begin cluster leakage measurements.")
    modeler._pool = pool  # configure modeler to use the proc pool
    joint_results = modeler.information_leakage(clusters=clusters,
                                                sample_size=n_samples,
                                                joint_leakage=True)
    leakage_joint = joint_results[0]
    logger.info("Final leakage results: {} bits".format(leakage_joint))
    logger.info("Finished execution.")
    return leakage_joint
from ElasticSearchDrive import ElasticSearchDriver from data_utils import cfg, logger, email_notify import time, boto, sys, json, logging, csv, io, subprocess, glob, re, os from logging.handlers import RotatingFileHandler handler = RotatingFileHandler(filename=str(cfg['log']['get_csv_fn']), maxBytes=int(cfg['log']['maxBytes']), backupCount=int(cfg['log']['backupCount'])) logger.addHandler(handler) formatter = logging.Formatter( '{"location":"%(module)s:%(lineno)d:%(funcName)s","server_time":"%(asctime)s","level":"%(levelname)s","msg":%(message)s}' ) handler.setFormatter(formatter) # TODO: add process number logger.info("CSV Starting...") logger.info(cfg) #http://docs.ceph.com/docs/master/radosgw/s3/python/ def upload(bucket_name, key_name, dreams_str): conn = boto.connect_s3() bucket = conn.get_bucket(bucket_name) key = bucket.new_key(key_name) key.set_contents_from_string(dreams_str) def get_names(hits): names = [] for h in hits: names.extend(list(h["_source"].keys()))