Example #1
def micro_average_scores(
        counters: collections.defaultdict) -> collections.Counter:
    results: collections.Counter = collections.Counter()
    for counter in counters.values():  # collections.Counter
        results.update(counter)

    return results
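A minimal usage sketch (the per-class labels and counts below are invented for illustration): given a defaultdict of per-class Counters, the function merges them into a single micro-averaged Counter.

import collections

per_class = collections.defaultdict(collections.Counter)
per_class['spam'].update({'tp': 3, 'fp': 1})
per_class['ham'].update({'tp': 5, 'fn': 2})
print(micro_average_scores(per_class))  # Counter({'tp': 8, 'fn': 2, 'fp': 1})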
Example #2
def count_color(grid: defaultdict, color: str) -> int:
    c = 0
    for tile in grid.values():
        if tile == color:
            c += 1

    return c
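A quick sketch of a call, assuming grid maps coordinates to tile colors (the coordinates below are invented):

from collections import defaultdict

grid = defaultdict(lambda: 'white')
grid[(0, 0)] = 'black'
grid[(1, 2)] = 'black'
grid[(2, 1)] = 'white'
print(count_color(grid, 'black'))  # 2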
Example #3
def get_embeddings_per_log(data: defaultdict,
                           model: fasttext.FastText) -> np.ndarray:
    # create embeddings per log but at first remove '\n' (newline character) from the end
    embeddings = [
        model.get_sentence_vector(log.rstrip()) for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
Example #4
def scanning_error_rate(ticket_rules: defaultdict, tickets: List[List[int]]) -> int:
    """Calculates the scanning error rate = the sum of all values invalid for any field"""
    error_rate = sum(
        number
        for ticket in tickets
        for number in ticket
        if all(number not in field_range for field_range in ticket_rules.values())
    )
    return error_rate
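A usage sketch assuming ticket_rules maps field names to a set of valid numbers; the rules and tickets below follow the Advent of Code 2020 day 16 sample, for which the error rate is 4 + 55 + 12 = 71.

from collections import defaultdict

rules = defaultdict(set)
rules['class'] = set(range(1, 4)) | set(range(5, 8))
rules['row'] = set(range(6, 12)) | set(range(33, 45))
rules['seat'] = set(range(13, 41)) | set(range(45, 51))
nearby = [[7, 3, 47], [40, 4, 50], [55, 2, 20], [38, 6, 12]]
print(scanning_error_rate(rules, nearby))  # 71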
Example #5
def create_csv(data: defaultdict, filename: str):
    """dump the station information to a CSV file"""
    header = data[next(iter(data.keys()))].keys()

    with open(filename, 'wt', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header, lineterminator='\n')
        writer.writeheader()
        for v in sorted(data.values(), key=itemgetter('id')):
            writer.writerow(v)
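A minimal sketch of a call, assuming the defining module imports csv and operator.itemgetter and that every station dict carries the same keys, including 'id' (the station data below is invented):

from collections import defaultdict

stations = defaultdict(dict)
stations['b'] = {'id': 2, 'name': 'Beta', 'lat': 48.1}
stations['a'] = {'id': 1, 'name': 'Alpha', 'lat': 52.5}
create_csv(stations, 'stations.csv')  # rows are written sorted by 'id'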
Example #6
    def save_measurement_results(m_results: collections.defaultdict, db_sess):
        measurement_results = []
        for probe_measurement_dct in m_results.values():
            measurement_results.extend(probe_measurement_dct.values())

        db_sess.bulk_save_objects(measurement_results)
        logger.info('parsed and saved %s measurements',
                    len(measurement_results))
        db_sess.commit()

        m_results.clear()
Example #7
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText,
                             with_timedelta: bool) -> List:
    # create embeddings per block but at first remove '\n' (newline character) from the end
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray(
                [model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
Example #8
def get_embeddings_with_timedeltas_per_block(data: defaultdict,
                                             model: fasttext.FastText) -> List:
    embeddings = []
    for logs in data.values():
        numpy_block = np.zeros(shape=(len(logs), model.get_dimension() + 1),
                               dtype=np.float32)

        for i, log in enumerate(logs):
            numpy_block[i, 1:] = model.get_sentence_vector(log.rstrip())
        numpy_block[:, 0] = get_timedeltas(logs)

        embeddings.append(numpy_block)
    return embeddings
Example #9
def find_most_have(nums: defaultdict):
    max_num = 0
    most_have_list = []
    # find the count of the most frequently held integer
    for v in nums.values():
        if v > max_num:
            max_num = v

    # build the list of integers held max_num times
    for k, v in nums.items():
        if v == max_num:
            most_have_list.append(k)

    return min(most_have_list)
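A usage sketch assuming nums maps each integer to how many copies are held (sample values invented): 1 and 3 are both held three times, and the smaller one is returned.

from collections import defaultdict

counts = defaultdict(int)
for n in [3, 1, 3, 2, 1, 3, 1]:
    counts[n] += 1
print(find_most_have(counts))  # 1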
Example #10
def get_contextual_embeddings_per_block(data: defaultdict,
                                        embedding_mapping: Dict) -> List:
    # create embeddings per block but at first remove '\n' (newline character) from the end, a timestamp and a PID from
    # the beginning
    log_without_timestamp_and_pid = re.compile(r'^\d{6} \d{6} \d+ (.*)')

    embeddings = [
        np.asarray([
            embedding_mapping[search(log_without_timestamp_and_pid,
                                     log.rstrip())] for log in logs
        ],
                   dtype=np.float32) for logs in data.values()
    ]
    return embeddings
Example #11
def get_labels_from_keys_per_log(data: defaultdict,
                                 labels: pd.DataFrame) -> np.ndarray:
    size = sum(len(logs) for logs in data.values())
    ground_truth = np.zeros(shape=size, dtype=np.int8)
    idx = 0
    for row in labels.itertuples(index=False):
        block_id, is_anomalous = row
        block_len = len(data[block_id])

        if is_anomalous:
            ground_truth[
                idx:idx +
                block_len] = 1  # mark all affected logs belonging to the trace as anomaly
        idx += block_len
    return ground_truth
Example #12
        def permute_rec(freqDict: defaultdict, permutation: List[int]):
            remSum = 0
            for count in freqDict.values():
                remSum += count
            if remSum <= 0:
                result.append(list(permutation))
                return

            for num, count in freqDict.items():
                if count > 0:
                    freqDict[num] -= 1
                    permutation.append(num)
                    permute_rec(freqDict, permutation)
                    permutation.pop()
                    freqDict[num] += 1
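The recursive helper above closes over a result list from an enclosing scope. A minimal sketch of a wrapper providing that scope (the name permute_unique is an assumption, not from the original):

from collections import defaultdict
from typing import List

def permute_unique(nums: List[int]) -> List[List[int]]:
    result: List[List[int]] = []

    def permute_rec(freqDict: defaultdict, permutation: List[int]):
        # same helper as above, nested so it can append to `result`
        if sum(freqDict.values()) <= 0:
            result.append(list(permutation))
            return
        for num, count in freqDict.items():
            if count > 0:
                freqDict[num] -= 1
                permutation.append(num)
                permute_rec(freqDict, permutation)
                permutation.pop()
                freqDict[num] += 1

    freq: defaultdict = defaultdict(int)
    for n in nums:
        freq[n] += 1
    permute_rec(freq, [])
    return result

print(permute_unique([1, 1, 2]))  # [[1, 1, 2], [1, 2, 1], [2, 1, 1]]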
Example #13
def calculate(fish: defaultdict, days: int) -> int:
    for _ in range(days):
        new_fish = defaultdict(int)

        for timer, num in fish.items():
            timer -= 1
            if timer < 0:
                new_fish[6] += num
                new_fish[8] += num
            else:
                new_fish[timer] += num

        fish = new_fish

    return sum(fish.values())
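A usage sketch with the Advent of Code 2021 day 6 sample timers (3, 4, 3, 1, 2); the sample's expected totals are 26 fish after 18 days and 5934 after 80 days.

from collections import defaultdict

fish = defaultdict(int)
for timer in [3, 4, 3, 1, 2]:
    fish[timer] += 1
print(calculate(fish, 18))  # 26
print(calculate(fish, 80))  # 5934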
Example #14
def ToCsv(
  path: pathlib.Path, vocab_counts: defaultdict, node_count: int,
):
  vocab_entries = sorted(vocab_counts.items(), key=lambda x: -x[1])
  total_count = sum(vocab_counts.values())

  cumfreq = 0
  node_cumfreq = 0
  with open(str(path), "w") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(
      ("cumulative_frequency", "cumulative_node_frequency", "count", "text",)
    )
    for text, count in vocab_entries:
      cumfreq += count / total_count
      node_cumfreq += count / node_count
      writer.writerow((cumfreq, node_cumfreq, count, text))
Example #15
def flatten_datasets(datasets: defaultdict) -> list:
    """Concatenates together 'sub datasets' into one dataset

    A 'sub dataset' could be one 'square' of data

    Args:
        datasets: A default dictionary of datasets

    Returns:
        A list of datasets

    """
    flattened_datasets = []
    for dataset in datasets.values():
        flattened_dataset = list(chain.from_iterable(dataset))
        flattened_datasets.append(flattened_dataset)
    return flattened_datasets
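A small sketch, assuming the defining module imports itertools.chain; each value is a 'sub dataset' made of square-like chunks (the numbers are invented):

from collections import defaultdict

datasets = defaultdict(list)
datasets['a'] = [[1, 2], [3]]
datasets['b'] = [[4], [5, 6]]
print(flatten_datasets(datasets))  # [[1, 2, 3], [4, 5, 6]]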
Example #16
def generate_table_rows(request_count: Counter, request_times: defaultdict) -> List[Dict]:
    table_rows = []
    time_total = sum(chain(*request_times.values()))
    count_total = sum(request_count.values())

    for url, times in request_times.items():
        count = request_count[url]
        times = request_times[url]
        time_sum = sum(times)

        table_rows.append({'count': count,
                           'url': url,
                           'count_perc': round(100 * count / count_total, 2),
                           'time_perc': round(100 * time_sum / time_total, 2),
                           'time_sum': round(time_sum, 2),
                           'time_avg': round(sum(times) / len(times), 2),
                           'time_max': round(max(times), 2),
                           'time_med': round(median(times), 2)})

    table_rows = sorted(table_rows, key=lambda x: x['time_sum'], reverse=True)
    return table_rows
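A usage sketch assuming the defining module imports itertools.chain and statistics.median; the URLs and timings below are invented.

from collections import Counter, defaultdict

request_count = Counter({'/api': 3, '/home': 1})
request_times = defaultdict(list, {'/api': [0.2, 0.4, 0.6], '/home': [1.0]})
for row in generate_table_rows(request_count, request_times):
    print(row['url'], row['time_sum'], row['count_perc'])
# /api 1.2 75.0
# /home 1.0 25.0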
Example #17
def total_event_num(corpus: defaultdict) -> int:
    return sum(corpus.values())
Example #18
def histogram(word_dict: defaultdict, interactive=False):
    fig = plt.figure()
    ax = plt.subplot()
    words = list(word_dict.keys())
    counts = list(word_dict.values())
    wc = zip(words, counts)
    wc = sorted(wc, key=lambda elem: elem[1], reverse=True)
    words, counts = zip(*wc)
    bars = plt.bar(words, counts, color='g', tick_label=None)

    curr_word = ax.annotate("",
                            xy=(0, 0),
                            xytext=(40, 40),
                            textcoords="offset points",
                            arrowprops=dict(arrowstyle="->"))
    curr_word.set_visible(False)

    def update_label(label, bar):
        x = bar.get_x() + bar.get_width() / 2.
        y = bar.get_y() + bar.get_height()
        curr_word.xy = (x, y)
        curr_word.set_text(label)

    def draw_labels():
        for i, bar in enumerate(bars):
            offset_x = 10 * (i % 20)
            offset_y = 15 * (i % 20)
            x = bar.get_x() + bar.get_width() / 2.
            y = bar.get_y() + bar.get_height()
            word = ax.annotate(words[i],
                               xy=(x, y),
                               xytext=(40 + offset_x, 40 + offset_y),
                               textcoords="offset points",
                               bbox=dict(boxstyle="round", fc="0.8"),
                               arrowprops=dict(arrowstyle="-", alpha=0.2))
        fig.canvas.draw_idle()

    def show_label_on_plot_hover(event):
        vis = curr_word.get_visible()
        hover_on_bar = False
        for i, bar in enumerate(bars):
            if bar.contains(event)[0]:
                hover_on_bar = True
                update_label(words[i], bar)
                curr_word.set_visible(True)
                break
        if vis and not hover_on_bar:
            curr_word.set_visible(False)
        fig.canvas.draw_idle()

    plt.xlabel('Words')
    plt.ylabel('Occurrences')
    plt.title('Histogram of words in Ed Stafford: First Man Out')
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    if interactive:
        fig.canvas.mpl_connect('motion_notify_event', show_label_on_plot_hover)
    else:
        draw_labels()
    plt.show()
Example #19
n_class = 32/2
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(test_fn)
dist = np.sort(c.transform(test_fn))
ex = DD(list) #example id, distance to centroid
ex_id = DD(list) #example id for each C
ex_N = [] #number of examples for each C
for i,j,k in zip(c.labels_, xrange(len(test_fn)), dist):
    ex[i].append([j,k[0]])
    ex_id[i].append(int(j))
for i,j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
    ex_N.append([i,len(ex[i])])
ex_N = sorted(ex_N, key=lambda x: x[-1],reverse=True) #sort cluster by density
nb_c = DD()
for exx in ex_id.values():
    exx = np.asarray(exx)
    for e in exx:
        nb_c[e] = exx[exx!=e]
nb_f = [DD(), DD(), DD()]
for b,n in zip(bl, nb_f):
    preds = b.predict(test_fd)
    ex_ = DD(list)
    for i,j in zip(preds, xrange(len(test_fd))):
        ex_[i].append(int(j))
    for exx in ex_.values():
        exx = np.asarray(exx)
        for e in exx:
            n[e] = exx[exx!=e]

#find k NN for each ex without considering clustering
Example #20
def normalise_dict(num_value_dict: defaultdict) -> defaultdict:
    norm = sum(num_value_dict.values())
    normed_res = num_value_dict.copy()
    for k, v in normed_res.items():
        normed_res[k] = v / norm
    return normed_res
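A quick sketch (values invented): each count is divided by the total so the results sum to 1.

from collections import defaultdict

counts = defaultdict(int, {'a': 2, 'b': 6, 'c': 2})
print(normalise_dict(counts))  # defaultdict(<class 'int'>, {'a': 0.2, 'b': 0.6, 'c': 0.2})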
Example #21
    for itr in range(iteration):
        lr_.fit(test_fn[fd_], label_)
        label_pr = np.sort(lr_.predict_proba(test_fn[train])) #sort in ascending order
        rank = []
        for i,pr in zip(train, label_pr):
            rank.append([i,pr[-1]])
        rank = sorted(rank, key=lambda x: x[-1])
        idx = rank[0][0]
        #compute CI for each oracle
        for b in bl:
            r = R[b]
            n = len(r)
            cv = t.ppf(0.975, n-1)
            CI[b] = np.mean(r) + cv*np.std(r)/np.sqrt(n)

        epsilon = 0.7*max(CI.values())
        preds = []
        for b in bl:
            if CI[b] >= epsilon:
                preds.append(b.predict(test_fd[idx]))
        #print 'predicted label from NO', preds
        y_ = mode(preds, axis=None)[0][0]
        #print 'major', y_
        for b in bl:
            if CI[b] >= epsilon:
                if b.predict(test_fd[idx]) == y_:
                    R[b].append(1)
                else:
                    R[b].append(0)
        fd_.append(idx)
        label_.append(y_)
Example #22
def average_frequency(hist: defaultdict):
    total = 0
    for freq in hist.values():
        total += int(freq)
    return total / len(hist)
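A minimal sketch (word counts invented): three words with a total frequency of 6 give an average of 2.0.

from collections import defaultdict

hist = defaultdict(int, {'the': 4, 'cat': 1, 'sat': 1})
print(average_frequency(hist))  # 2.0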
Example #23
def get_intersection_points(diagram: defaultdict) -> int:
    return (np.array(list(diagram.values())) > 1).sum()
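A usage sketch assuming diagram maps (x, y) points to how many vent lines cover them, as in Advent of Code 2021 day 5 (the points below are invented): two points are covered more than once.

from collections import defaultdict
import numpy as np

diagram = defaultdict(int)
for point in [(0, 9), (1, 9), (2, 9), (1, 9), (2, 9)]:
    diagram[point] += 1
print(get_intersection_points(diagram))  # 2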
Example #24
def get_possible_options(steps_dict: collections.defaultdict, path: str):
    return (sorted(
        set(steps_dict.keys()).union(
            set(ft.reduce(lambda a, b: a + b,
                          steps_dict.values()))).difference(set(path))))
Example #25
def get_first_step(steps_dict: collections.defaultdict) -> str:
    return sorted(
        set(steps_dict.keys()).difference(
            set(ft.reduce(lambda a, b: a + b, steps_dict.values()))))[0]
Example #26
        if len(distList) > 1:
            stdVal = np.std(distList, ddof = 1.0)
        if ancestorList:
            listAnc[currType.depth()] = UtilObject(mean=meanVal, std=stdVal,
                distList=distList)
        else:
            d[currType] = UtilObject(mean=meanVal, std=stdVal,
                distList=distList)
print

# Build lists of distances for each level
print("Build globDistList...")
globDistList = []
for i in range(TaxaType.hierarchySize() + 1):
    globDistList.append([])
for listAnc in taxaTypeAncDistDict.values():
    for ind, obj in enumerate(listAnc):
        if obj is None:
            continue
        globDistList[ind].extend(obj.distList)

# Build list of UtilObject(mean, std, count)
globStdList = []
for l in globDistList:
    UtilDrawHistogram(l, show = False)
    if len(l) >= 2:
        std = np.std(l, ddof=1.0)
    else:
        std = None
    globStdList.append(std)
UtilDrawHistogram(show = True)
Example #27
def count_lanternfish(lanternfish: defaultdict) -> int:
    return np.array(list(lanternfish.values())).sum()
Example #28
def voteRefine(sequences, motifs):
    #get probabilities
    lets = "ACGT"
    probability = DD(int)
    for seq in sequences:
        for let in sequences[seq]:
            probability[let] += 1
    s = sum(probability.values())
    for let in lets:
        probability[let] = probability[let] / float(s)

    #conductPoll
    poll = {}
    maxV = 0
    maxL = 0
    for seq in sequences:
        poll[seq] = [0.0] * len(sequences[seq])
        if len(sequences[seq]) > maxL:
            maxL = len(sequences[seq])

    for tool in motifs:
        for motif in motifs[tool]:
            for seq in motifs[tool][motif]:
                sequence = best(sequences, seq)
                for pos in motifs[tool][motif][seq]:
                    for i in xrange(pos, pos + len(motif)):
                        try:
                            # instead of weighting all results the same (1), we
                            # could bias based on tool or number of results or something like that
                            #poll[sequence][i - 1] += 1
                            if tool == "CMF":
                                poll[sequence][i - 1] += 1
                            if tool == "Weeder":
                                poll[sequence][i - 1] += 1
                            if tool == "MEME":
                                poll[sequence][i - 1] += 1
                            if tool == "DECOD":
                                poll[sequence][i - 1] += 1
                            if tool == "BioProspector":
                                poll[sequence][i - 1] += 1
                            if tool == "XXmotif":
                                poll[sequence][i - 1] += 1

                        except Exception as e:
                            print e
                            print 'It appears a tool has reported finding a motif',\
                                'outside the bounds of a sequence'
                            print 'such as finding a motif of length 10 at position',\
                                '195 in a sequence with length 200'
                            pdb.set_trace()

                        if poll[sequence][i - 1] > maxV:
                            maxV = poll[sequence][i - 1]
    #inspectPoll

    ress = []
    THRESH = 3.7
    maxInsts = 0

    MLEN = MOTIF_LEN
    for seq in poll:
        for i in xrange(len(poll[seq]) - MLEN):
            if sum(poll[seq][i:i + MLEN]) >= MLEN * THRESH:
                curr = sequences[seq][i:i + MLEN]
                bestPWM = None
                bestMatching = 0
                for PWM in ress:
                    matching = compMotifPWM(curr, PWM)
                    if matching > bestMatching and matching > MLEN / 2:
                        bestMatching = matching
                        bestPWM = PWM
                if bestPWM is None:
                    bestPWM = [[0, 0, 0, 0] for x in xrange(MLEN)]
                    ress.append(bestPWM)
                for c, col in zip(curr, bestPWM):
                    col[ALPH[c]] += 1
                insts = sum(bestPWM[0])
                if insts > maxInsts:
                    maxInsts = insts

    votedRess = DD(int)
    for PWM in ress:
        l = len(PWM)
        cons = PWMconsensus(PWM)
        for seq in sequences:
            for spos in xrange(0, len(sequences[seq]) - l):
                # .75% thresh
                if compMotifPWM(sequences[seq][spos:spos + l], PWM) >= .75 * l:
                    for pos in xrange(spos, spos + l):
                        votedRess[cons] += poll[seq][pos]

    return sorted(votedRess.iteritems(), key=lambda a: a[::-1])
Example #29
    def run_auto(self):
        '''
        test direct data feature based transfer accuracy on the new building
        '''
        rf = RFC(n_estimators=100, criterion='entropy')
        rf.fit(self.train_fd, self.train_label)
        pred = rf.predict(self.test_fd)
        print('direct data feature-based transfer acc on tgt_bldg:',
              ACC(pred, self.test_label))
        #plot_confusion_matrix(self.test_label, pred)
        '''
        step1: train base models from bldg1
        '''
        self.get_base_learners()
        '''
        step2: TL with name feature on bldg2
        '''
        label = self.test_label
        class_ = np.unique(self.train_label)

        for b in self.bl:
            print(b.score(self.test_fd, label))

        n_class = 32
        c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
        c.fit(self.test_fn)
        dist = np.sort(c.transform(self.test_fn))
        ex_id = DD(list)  #example id for each C
        for i, j, k in zip(c.labels_, range(len(self.test_fn)), dist):
            ex_id[i].append(int(j))

        #getting neighbors for each ex
        nb_c = DD()  #nb from clustering results
        for exx in ex_id.values():
            exx = np.asarray(exx)
            for e in exx:
                nb_c[e] = exx[exx != e]

        nb_f = [DD(), DD(), DD()]  #nb from classification results
        for b, n in zip(self.bl, nb_f):
            preds = b.predict(self.test_fd)
            ex_ = DD(list)
            for i, j in zip(preds, range(len(self.test_fd))):
                ex_[i].append(int(j))
            for exx in ex_.values():
                exx = np.asarray(exx)
                for e in exx:
                    n[e] = exx[exx != e]

        #use base learners' predictions
        acc_ = []
        cov_ = []
        #for delta in np.linspace(0.1, 0.5, 5):
        for delta in np.linspace(self.agreement_threshold,
                                 self.agreement_threshold, 1):
            print('running TL with agreement threshold =', delta)

            labeled_id = []
            confidence = []
            output = DD()
            preds = np.array([999 for i in range(len(self.test_fd))])
            for i in range(len(self.test_fn)):
                #get the weight for each bl: by computing sim btw cluster and clf
                w = []
                v_c = set(nb_c[i])
                for n in nb_f:
                    v_f = set(n[i])
                    cns = len(v_c & v_f) / float(
                        len(v_c | v_f))  #original count based weight
                    #print (len(v_c & v_f) , len(v_c | v_f))
                    inter = v_c & v_f
                    union = v_c | v_f
                    d_i = 0
                    d_u = 0
                    for it in inter:
                        d_i += np.linalg.norm(self.test_fn[i] -
                                              self.test_fn[it])
                        #print (np.linalg.norm(self.test_fn[i]-self.test_fn[it]))
                    #input('...')
                    for u in union:
                        d_u += np.linalg.norm(self.test_fn[i] -
                                              self.test_fn[u])
                    if len(inter) != 0:
                        sim = 1 - (d_i / d_u) / cns
                        #sim = (d_i/d_u)/cns

                    if i in output:
                        output[i].extend(
                            ['%s/%s' % (len(inter), len(union)), 1 - sim])
                    else:
                        output[i] = [
                            '%s/%s' % (len(inter), len(union)), 1 - sim
                        ]
                    w.append(sim)
                output[i].append(np.mean(w))

                if np.mean(w) >= delta:
                    confidence.append(np.mean(w))
                    w[:] = [float(j) / sum(w) for j in w]
                    pred_pr = np.zeros(len(class_))
                    for wi, b in zip(w, self.bl):
                        pr = b.predict_proba(self.test_fd[i].reshape(1, -1))
                        pred_pr = pred_pr + wi * pr
                    preds[i] = class_[np.argmax(pred_pr)]
                    labeled_id.append(i)

            acc_.append(ACC(preds[preds != 999], label[preds != 999]))
            cov_.append(1.0 * len(preds[preds != 999]) / len(label))

        print('acc =', acc_, ';')
        print('cov =', cov_, ';')

        return preds[preds != 999], labeled_id, confidence
Example #30
def buildCogTaxaDict(noWeights = False, showCogFreqHist = False,
    interpolationRange = None):

    print("reading taxa dictionary...")
    taxaDict = UtilLoad(PROK_TAXA_DICT())
    print("Read %d organisms" % len(taxaDict))

    print("Reading cogDict...")
    cogDict = UtilLoad(COG_DICT())

    print("Building COG frequncies...")
    cogFreq = DefDict(int)
    for dir, cogs in cogDict.iteritems():
        for cname in cogs:
            cogFreq[cname] += 1

    if showCogFreqHist:
        print("Sowing cogFreq histogram...")
        UtilDrawHistogram(cogFreq.values(), show = True)

    temp = taxaDict.keys()
    for dir in temp:
        if dir not in cogDict:
            del taxaDict[dir]
    temp = cogDict.keys()
    for dir in temp:
        if dir not in taxaDict:
            del cogDict[dir]
    print("Valid set contains %d organisms" % len(cogDict))

    print("\nBuilding Taxonomy distances...")
    taxDist = DefDict(dict)
    for dir1, taxa1 in taxaDict.items():
        for dir2, taxa2 in taxaDict.items():
            d = taxa1.distance(taxa2)
            taxDist[dir1][dir2] = d

    # Optimization
    if noWeights:
        return (cogDict, None, taxaDict, taxDist)

    fname = COG_WEIGHTS_DICT_LIST()
    if os.path.isfile(fname):
        print("Loading cogWeightDictList...")
        cogWeightDictList = UtilLoad(fname, progrIndPeriod=100)
    else:
        print("Building cogWeightsDict...")
        cogWeightDictList = [DefDict(dict) for i \
            in range(0, COG_REG_STEP_COUNT+1)]
        if not interpolationRange:
            interpolationRange = range(0, COG_REG_STEP_COUNT+1)
        for i in interpolationRange:
            expCogReg = math.exp(COG_REG_LOWER + float(i) * COG_REG_STEP)
            print("\nexpCogReg %f" % expCogReg)
            cogWeightDict = cogWeightDictList[i]
            for ind, (dir1, cogs1) in enumerate(cogDict.iteritems(), start=1):
                print("\r%d.%d. %s" % (i, ind, dir1)),
                for dir2, cogs2 in cogDict.iteritems():
                    cogWeightDict[dir1][dir2] = \
                        cogSetWeight(cogs1 & cogs2, cogFreq, expCogReg)
            print
        UtilStore(cogWeightDictList, fname)

    return (cogDict, cogWeightDictList, taxaDict, taxDist)