def micro_average_scores(counters: collections.defaultdict) -> collections.Counter:
    results: collections.Counter = collections.Counter()
    for counter in counters.values():  # each value is a collections.Counter
        results.update(counter)
    return results
def count_color(grid: defaultdict, color: str) -> int:
    c = 0
    for tile in grid.values():
        if tile == color:  # compare against the requested color rather than a hard-coded 'black'
            c += 1
    return c
def get_embeddings_per_log(data: defaultdict, model: fasttext.FastText) -> np.ndarray:
    # create one embedding per log line, stripping the trailing newline first
    embeddings = [
        model.get_sentence_vector(log.rstrip())
        for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
def scanning_error_rate(ticket_rules: defaultdict, tickets: List[List[int]]) -> int:
    """Calculates the scanning error rate: the sum of all values that are valid for no field."""
    error_rate = sum(
        number
        for ticket in tickets
        for number in ticket
        if all(number not in field_range for field_range in ticket_rules.values())
    )
    return error_rate
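# Hedged usage sketch (assumed rule format, not from the original source): each value in
# ticket_rules is taken to be a set of all numbers valid for that field, so the membership
# test `number not in field_range` works directly. Field names and ranges follow the
# well-known Advent of Code 2020 day 16 example.
example_rules: defaultdict = defaultdict(set)
example_rules['class'].update(range(1, 4), range(5, 8))
example_rules['row'].update(range(6, 12), range(33, 45))
example_rules['seat'].update(range(13, 41), range(45, 51))
nearby_tickets = [[7, 3, 47], [40, 4, 50], [55, 2, 20], [38, 6, 12]]
# scanning_error_rate(example_rules, nearby_tickets) == 71 (i.e. 4 + 55 + 12)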
def create_csv(data: defaultdict, filename: str):
    """Dump the station information to a CSV file."""
    header = data[next(iter(data.keys()))].keys()
    with open(filename, 'wt', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header, lineterminator='\n')
        writer.writeheader()
        for v in sorted(data.values(), key=itemgetter('id')):
            writer.writerow(v)
def save_measurement_results(m_results: collections.defaultdict, db_sess):
    measurement_results = []
    for probe_measurement_dct in m_results.values():
        measurement_results.extend(probe_measurement_dct.values())
    db_sess.bulk_save_objects(measurement_results)
    logger.info('parsed and saved %s measurements', len(measurement_results))
    db_sess.commit()
    m_results.clear()
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText, with_timedelta: bool) -> List:
    # create embeddings per block, stripping the trailing newline from each log first
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray([model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
def get_embeddings_with_timedeltas_per_block(data: defaultdict, model: fasttext.FastText) -> List:
    embeddings = []
    for logs in data.values():
        # column 0 holds the timedelta, the remaining columns hold the sentence embedding
        numpy_block = np.zeros(shape=(len(logs), model.get_dimension() + 1), dtype=np.float32)
        for i, log in enumerate(logs):
            numpy_block[i, 1:] = model.get_sentence_vector(log.rstrip())
        numpy_block[:, 0] = get_timedeltas(logs)
        embeddings.append(numpy_block)
    return embeddings
def find_most_have(nums: defaultdict):
    max_num = 0
    most_have_list = []
    # find the highest count among the integers
    for v in nums.values():
        if v > max_num:
            max_num = v
    # collect the integers that occur max_num times
    for k, v in nums.items():
        if v == max_num:
            most_have_list.append(k)
    return min(most_have_list)
def get_contextual_embeddings_per_block(data: defaultdict, embedding_mapping: Dict) -> List:
    # create embeddings per block, stripping the trailing newline from the end and the
    # timestamp and PID from the beginning of each log line
    log_without_timestamp_and_pid = re.compile(r'^\d{6} \d{6} \d+ (.*)')
    embeddings = [
        np.asarray(
            [
                embedding_mapping[search(log_without_timestamp_and_pid, log.rstrip())]
                for log in logs
            ],
            dtype=np.float32,
        )
        for logs in data.values()
    ]
    return embeddings
def get_labels_from_keys_per_log(data: defaultdict, labels: pd.DataFrame) -> np.ndarray:
    size = sum(len(logs) for logs in data.values())
    ground_truth = np.zeros(shape=size, dtype=np.int8)
    idx = 0
    for row in labels.itertuples(index=False):
        block_id, is_anomalous = row
        block_len = len(data[block_id])
        if is_anomalous:
            # mark all logs belonging to the anomalous trace
            ground_truth[idx:idx + block_len] = 1
        idx += block_len
    return ground_truth
def permute_rec(freqDict: defaultdict, permutation: List[int]):
    # when no counts remain, the permutation is complete
    remSum = sum(freqDict.values())
    if remSum <= 0:
        result.append(list(permutation))  # `result` is a module-level list collecting permutations
        return
    for num, count in freqDict.items():
        if count > 0:
            freqDict[num] -= 1
            permutation.append(num)
            permute_rec(freqDict, permutation)
            permutation.pop()
            freqDict[num] += 1
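# Hedged usage sketch (not part of the original source): permute_rec relies on a
# module-level `result` list; the hypothetical driver below shows how the frequency
# dict and that list might be set up.
result: List[List[int]] = []

def permute_unique(nums: List[int]) -> List[List[int]]:
    freq: defaultdict = defaultdict(int)
    for n in nums:
        freq[n] += 1  # count occurrences so duplicate values are not permuted twice
    result.clear()
    permute_rec(freq, [])
    return [p[:] for p in result]

# permute_unique([1, 1, 2]) would yield [[1, 1, 2], [1, 2, 1], [2, 1, 1]].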
def calculate(fish: defaultdict, days: int) -> int:
    for _ in range(days):
        new_fish = defaultdict(int)
        for timer, num in fish.items():
            timer -= 1
            if timer < 0:
                new_fish[6] += num  # the parents reset to 6
                new_fish[8] += num  # and spawn offspring with timer 8
            else:
                new_fish[timer] += num
        fish = new_fish
    return sum(fish.values())
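# Hedged usage sketch (assumed input handling, not from the original source): the fish
# state can be built from a comma-separated list of timers such as "3,4,3,1,2".
def count_fish_after(raw_input: str, days: int) -> int:
    fish: defaultdict = defaultdict(int)
    for timer in raw_input.strip().split(','):
        fish[int(timer)] += 1  # bucket fish by their spawn timer
    return calculate(fish, days)

# For the well-known example input, count_fish_after("3,4,3,1,2", 80) returns 5934.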
def ToCsv(
    path: pathlib.Path,
    vocab_counts: defaultdict,
    node_count: int,
):
    vocab_entries = sorted(vocab_counts.items(), key=lambda x: -x[1])
    total_count = sum(vocab_counts.values())
    cumfreq = 0
    node_cumfreq = 0
    with open(str(path), "w") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(
            ("cumulative_frequency", "cumulative_node_frequency", "count", "text")
        )
        for text, count in vocab_entries:
            cumfreq += count / total_count
            node_cumfreq += count / node_count
            writer.writerow((cumfreq, node_cumfreq, count, text))
def flatten_datasets(datasets: defaultdict) -> list:
    """Concatenates 'sub datasets' together into one dataset.

    A 'sub dataset' could be one 'square' of data.

    Args:
        datasets: A default dictionary of datasets

    Returns:
        A list of datasets
    """
    flattened_datasets = []
    for dataset in datasets.values():
        flattened_dataset = list(chain.from_iterable(dataset))
        flattened_datasets.append(flattened_dataset)
    return flattened_datasets
def generate_table_rows(request_count: Counter, request_times: defaultdict) -> List[Dict]:
    table_rows = []
    time_total = sum(chain(*request_times.values()))
    count_total = sum(request_count.values())
    for url, times in request_times.items():
        count = request_count[url]
        time_sum = sum(times)
        table_rows.append({
            'count': count,
            'url': url,
            'count_perc': round(100 * count / count_total, 2),
            'time_perc': round(100 * time_sum / time_total, 2),
            'time_sum': round(time_sum, 2),
            'time_avg': round(time_sum / len(times), 2),
            'time_max': round(max(times), 2),
            'time_med': round(median(times), 2),
        })
    table_rows = sorted(table_rows, key=lambda x: x['time_sum'], reverse=True)
    return table_rows
def total_event_num(corpus: defaultdict) -> int:
    return sum(corpus.values())
def histogram(word_dict: defaultdict, interactive=False):
    fig = plt.figure()
    ax = plt.subplot()
    words = list(word_dict.keys())
    counts = list(word_dict.values())
    wc = sorted(zip(words, counts), key=lambda elem: elem[1], reverse=True)
    words, counts = zip(*wc)
    bars = plt.bar(words, counts, color='g', tick_label=None)
    curr_word = ax.annotate("", xy=(0, 0), xytext=(40, 40),
                            textcoords="offset points",
                            arrowprops=dict(arrowstyle="->"))
    curr_word.set_visible(False)

    def update_label(label, bar):
        x = bar.get_x() + bar.get_width() / 2.
        y = bar.get_y() + bar.get_height()
        curr_word.xy = (x, y)
        curr_word.set_text(label)

    def draw_labels():
        for i, bar in enumerate(bars):
            offset_x = 10 * (i % 20)
            offset_y = 15 * (i % 20)
            x = bar.get_x() + bar.get_width() / 2.
            y = bar.get_y() + bar.get_height()
            ax.annotate(words[i], xy=(x, y),
                        xytext=(40 + offset_x, 40 + offset_y),
                        textcoords="offset points",
                        bbox=dict(boxstyle="round", fc="0.8"),
                        arrowprops=dict(arrowstyle="-", alpha=0.2))
        fig.canvas.draw_idle()

    def show_label_on_plot_hover(event):
        vis = curr_word.get_visible()
        hover_on_bar = False
        for i, bar in enumerate(bars):
            if bar.contains(event)[0]:
                hover_on_bar = True
                update_label(words[i], bar)
                curr_word.set_visible(True)
                break
        if vis and not hover_on_bar:
            curr_word.set_visible(False)
        fig.canvas.draw_idle()

    plt.xlabel('Words')
    plt.ylabel('Occurrences')
    plt.title('Histogram of words in Ed Stafford: First Man Out')
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    if interactive:
        fig.canvas.mpl_connect('motion_notify_event', show_label_on_plot_hover)
    else:
        draw_labels()
    plt.show()
n_class = 32 // 2
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(test_fn)
dist = np.sort(c.transform(test_fn))
ex = DD(list)     # example id and distance to centroid
ex_id = DD(list)  # example ids for each cluster
ex_N = []         # number of examples for each cluster
for i, j, k in zip(c.labels_, range(len(test_fn)), dist):
    ex[i].append([j, k[0]])
    ex_id[i].append(int(j))
for i, j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
    ex_N.append([i, len(ex[i])])
ex_N = sorted(ex_N, key=lambda x: x[-1], reverse=True)  # sort clusters by density

nb_c = DD()  # neighbours from clustering results
for exx in ex_id.values():
    exx = np.asarray(exx)
    for e in exx:
        nb_c[e] = exx[exx != e]

nb_f = [DD(), DD(), DD()]  # neighbours from each base classifier's predictions
for b, n in zip(bl, nb_f):
    preds = b.predict(test_fd)
    ex_ = DD(list)
    for i, j in zip(preds, range(len(test_fd))):
        ex_[i].append(int(j))
    for exx in ex_.values():
        exx = np.asarray(exx)
        for e in exx:
            n[e] = exx[exx != e]
# find k nearest neighbours for each example without considering clustering
def normalise_dict(num_value_dict: defaultdict) -> defaultdict:
    norm = sum(num_value_dict.values())
    normed_res = num_value_dict.copy()
    for k, v in normed_res.items():
        normed_res[k] = v / norm
    return normed_res
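# Short usage sketch (hypothetical data): normalise_dict rescales the values so they
# sum to 1, turning raw counts into a probability distribution.
counts = defaultdict(int, {'heads': 3, 'tails': 1})
probabilities = normalise_dict(counts)
# probabilities == {'heads': 0.75, 'tails': 0.25}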
for itr in range(iteration):
    lr_.fit(test_fn[fd_], label_)
    label_pr = np.sort(lr_.predict_proba(test_fn[train]))  # sort in ascending order
    rank = []
    for i, pr in zip(train, label_pr):
        rank.append([i, pr[-1]])
    rank = sorted(rank, key=lambda x: x[-1])
    idx = rank[0][0]

    # compute a confidence interval for each oracle
    for b in bl:
        r = R[b]
        n = len(r)
        cv = t.ppf(0.975, n - 1)
        CI[b] = np.mean(r) + cv * np.std(r) / np.sqrt(n)
    epsilon = 0.7 * max(CI.values())

    preds = []
    for b in bl:
        if CI[b] >= epsilon:
            preds.append(b.predict(test_fd[idx]))
    # print('predicted label from NO', preds)
    y_ = mode(preds, axis=None)[0][0]  # majority vote among the confident oracles
    # print('major', y_)
    for b in bl:
        if CI[b] >= epsilon:
            if b.predict(test_fd[idx]) == y_:
                R[b].append(1)
            else:
                R[b].append(0)
    fd_.append(idx)
    label_.append(y_)
def average_frequency(hist: defaultdict):
    total = 0  # avoid shadowing the built-in sum()
    for freq in hist.values():
        total += int(freq)
    return total / len(hist)
def get_intersection_points(diagram: defaultdict) -> int:
    return (np.array(list(diagram.values())) > 1).sum()
def get_possible_options(steps_dict: collections.defaultdict, path: str):
    return sorted(
        set(steps_dict.keys())
        .union(set(ft.reduce(lambda a, b: a + b, steps_dict.values())))
        .difference(set(path))
    )
def get_first_step(steps_dict: collections.defaultdict) -> str:
    return sorted(
        set(steps_dict.keys()).difference(
            set(ft.reduce(lambda a, b: a + b, steps_dict.values()))
        )
    )[0]
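# Hedged sketch (assumed input format, not from the original source): the two helpers
# above expect steps_dict to map a prerequisite step to the steps it unlocks; with
# puzzle-style lines such as "Step C must be finished before step A can begin.",
# it could be built like this.
def parse_steps(lines: List[str]) -> collections.defaultdict:
    steps_dict: collections.defaultdict = collections.defaultdict(str)
    for line in lines:
        parts = line.split()
        steps_dict[parts[1]] += parts[7]  # value stays a string of single-letter steps
    return steps_dict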
if len(distList) > 1:
    stdVal = np.std(distList, ddof=1)
if ancestorList:
    listAnc[currType.depth()] = UtilObject(mean=meanVal, std=stdVal, distList=distList)
else:
    d[currType] = UtilObject(mean=meanVal, std=stdVal, distList=distList)
print()

# Build lists of distances for each level
print("Build globDistList...")
globDistList = []
for i in range(TaxaType.hierarchySize() + 1):
    globDistList.append([])
for listAnc in taxaTypeAncDistDict.values():
    for ind, obj in enumerate(listAnc):
        if obj is None:
            continue
        globDistList[ind].extend(obj.distList)

# Build list of UtilObject(mean, std, count)
globStdList = []
for l in globDistList:
    UtilDrawHistogram(l, show=False)
    if len(l) >= 2:
        std = np.std(l, ddof=1)
    else:
        std = None
    globStdList.append(std)
UtilDrawHistogram(show=True)
def count_lanternfish(lanternfish: defaultdict) -> int:
    return np.array(list(lanternfish.values())).sum()
def voteRefine(sequences, motifs):
    # get background letter probabilities
    lets = "ACGT"
    probability = DD(int)
    for seq in sequences:
        for let in sequences[seq]:
            probability[let] += 1
    s = sum(probability.values())
    for let in lets:
        probability[let] = probability[let] / float(s)

    # conduct poll
    poll = {}
    maxV = 0
    maxL = 0
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
        if len(sequences[seq]) > maxL:
            maxL = len(sequences[seq])
    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in range(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we could
                            # bias based on tool or number of results or something like that
                            # poll[sequence][i - 1] += 1
                            if tool == "CMF":
                                poll[sequence][i - 1] += 1
                            if tool == "Weeder":
                                poll[sequence][i - 1] += 1
                            if tool == "MEME":
                                poll[sequence][i - 1] += 1
                            if tool == "DECOD":
                                poll[sequence][i - 1] += 1
                            if tool == "BioProspector":
                                poll[sequence][i - 1] += 1
                            if tool == "XXmotif":
                                poll[sequence][i - 1] += 1
                        except Exception as e:
                            print(e)
                            print('It appears a tool has reported finding a motif',
                                  'outside the bounds of a sequence')
                            print('such as finding a motif of length 10 at position',
                                  '195 in a sequence with length 200')
                            pdb.set_trace()
                        if poll[sequence][i - 1] > maxV:
                            maxV = poll[sequence][i - 1]

    # inspect poll
    ress = []
    THRESH = 3.7
    maxInsts = 0
    MLEN = MOTIF_LEN
    for seq in poll:
        for i in range(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) >= MLEN * THRESH:
                curr = sequences[seq][i:i + MLEN]
                bestPWM = None
                bestMatching = 0
                for PWM in ress:
                    matching = compMotifPWM(curr, PWM)
                    if matching > bestMatching and matching > MLEN / 2:
                        bestMatching = matching
                        bestPWM = PWM
                if bestPWM is None:
                    bestPWM = [[0, 0, 0, 0] for x in range(MLEN)]
                    ress.append(bestPWM)
                for c, col in zip(curr, bestPWM):
                    col[ALPH[c]] += 1
                insts = sum(bestPWM[0])
                if insts > maxInsts:
                    maxInsts = insts

    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            for spos in range(0, len(sequences[seq]) - l):
                # 75% threshold
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    for pos in range(spos, spos + l):
                        votedRess[cons] += poll[seq][pos]
    return sorted(votedRess.items(), key=lambda a: a[::-1])
def run_auto(self):
    '''
    test direct data-feature-based transfer accuracy on the new building
    '''
    rf = RFC(n_estimators=100, criterion='entropy')
    rf.fit(self.train_fd, self.train_label)
    pred = rf.predict(self.test_fd)
    print('direct data feature-based transfer acc on tgt_bldg:', ACC(pred, self.test_label))
    # plot_confusion_matrix(self.test_label, pred)

    '''
    step 1: train base models from bldg1
    '''
    self.get_base_learners()

    '''
    step 2: TL with name feature on bldg2
    '''
    label = self.test_label
    class_ = np.unique(self.train_label)
    for b in self.bl:
        print(b.score(self.test_fd, label))

    n_class = 32
    c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
    c.fit(self.test_fn)
    dist = np.sort(c.transform(self.test_fn))
    ex_id = DD(list)  # example ids for each cluster
    for i, j, k in zip(c.labels_, range(len(self.test_fn)), dist):
        ex_id[i].append(int(j))

    # get neighbours for each example
    nb_c = DD()  # neighbours from clustering results
    for exx in ex_id.values():
        exx = np.asarray(exx)
        for e in exx:
            nb_c[e] = exx[exx != e]

    nb_f = [DD(), DD(), DD()]  # neighbours from classification results
    for b, n in zip(self.bl, nb_f):
        preds = b.predict(self.test_fd)
        ex_ = DD(list)
        for i, j in zip(preds, range(len(self.test_fd))):
            ex_[i].append(int(j))
        for exx in ex_.values():
            exx = np.asarray(exx)
            for e in exx:
                n[e] = exx[exx != e]

    # use the base learners' predictions
    acc_ = []
    cov_ = []
    # for delta in np.linspace(0.1, 0.5, 5):
    for delta in np.linspace(self.agreement_threshold, self.agreement_threshold, 1):
        print('running TL with agreement threshold =', delta)
        labeled_id = []
        confidence = []
        output = DD()
        preds = np.array([999 for i in range(len(self.test_fd))])
        for i in range(len(self.test_fn)):
            # weight each base learner by the similarity between its
            # classification neighbourhood and the clustering neighbourhood
            w = []
            v_c = set(nb_c[i])
            for n in nb_f:
                v_f = set(n[i])
                cns = len(v_c & v_f) / float(len(v_c | v_f))  # original count-based weight
                inter = v_c & v_f
                union = v_c | v_f
                d_i = 0
                d_u = 0
                for it in inter:
                    d_i += np.linalg.norm(self.test_fn[i] - self.test_fn[it])
                for u in union:
                    d_u += np.linalg.norm(self.test_fn[i] - self.test_fn[u])
                if len(inter) != 0:
                    sim = 1 - (d_i / d_u) / cns
                    # sim = (d_i/d_u)/cns
                    if i in output:
                        output[i].extend(['%s/%s' % (len(inter), len(union)), 1 - sim])
                    else:
                        output[i] = ['%s/%s' % (len(inter), len(union)), 1 - sim]
                w.append(sim)
            output[i].append(np.mean(w))
            if np.mean(w) >= delta:
                confidence.append(np.mean(w))
                w[:] = [float(j) / sum(w) for j in w]
                pred_pr = np.zeros(len(class_))
                for wi, b in zip(w, self.bl):
                    pr = b.predict_proba(self.test_fd[i].reshape(1, -1))
                    pred_pr = pred_pr + wi * pr
                preds[i] = class_[np.argmax(pred_pr)]
                labeled_id.append(i)
        acc_.append(ACC(preds[preds != 999], label[preds != 999]))
        cov_.append(1.0 * len(preds[preds != 999]) / len(label))
    print('acc =', acc_, ';')
    print('cov =', cov_, ';')
    return preds[preds != 999], labeled_id, confidence
def buildCogTaxaDict(noWeights=False, showCogFreqHist=False, interpolationRange=None):
    print("reading taxa dictionary...")
    taxaDict = UtilLoad(PROK_TAXA_DICT())
    print("Read %d organisms" % len(taxaDict))

    print("Reading cogDict...")
    cogDict = UtilLoad(COG_DICT())

    print("Building COG frequencies...")
    cogFreq = DefDict(int)
    for dir, cogs in cogDict.items():
        for cname in cogs:
            cogFreq[cname] += 1
    if showCogFreqHist:
        print("Showing cogFreq histogram...")
        UtilDrawHistogram(cogFreq.values(), show=True)

    # keep only organisms present in both dictionaries
    temp = list(taxaDict.keys())
    for dir in temp:
        if dir not in cogDict:
            del taxaDict[dir]
    temp = list(cogDict.keys())
    for dir in temp:
        if dir not in taxaDict:
            del cogDict[dir]
    print("Valid set contains %d organisms" % len(cogDict))

    print("\nBuilding Taxonomy distances...")
    taxDist = DefDict(dict)
    for dir1, taxa1 in taxaDict.items():
        for dir2, taxa2 in taxaDict.items():
            d = taxa1.distance(taxa2)
            taxDist[dir1][dir2] = d

    # Optimization
    if noWeights:
        return (cogDict, None, taxaDict, taxDist)

    fname = COG_WEIGHTS_DICT_LIST()
    if os.path.isfile(fname):
        print("Loading cogWeightDictList...")
        cogWeightDictList = UtilLoad(fname, progrIndPeriod=100)
    else:
        print("Building cogWeightsDict...")
        cogWeightDictList = [DefDict(dict) for i in range(0, COG_REG_STEP_COUNT + 1)]
        if not interpolationRange:
            interpolationRange = range(0, COG_REG_STEP_COUNT + 1)
        for i in interpolationRange:
            expCogReg = math.exp(COG_REG_LOWER + float(i) * COG_REG_STEP)
            print("\nexpCogReg %f" % expCogReg)
            cogWeightDict = cogWeightDictList[i]
            for ind, (dir1, cogs1) in enumerate(cogDict.items(), start=1):
                print("\r%d.%d. %s" % (i, ind, dir1), end='')
                for dir2, cogs2 in cogDict.items():
                    cogWeightDict[dir1][dir2] = cogSetWeight(cogs1 & cogs2, cogFreq, expCogReg)
            print()
        UtilStore(cogWeightDictList, fname)

    return (cogDict, cogWeightDictList, taxaDict, taxDist)