Example #1
def mutual_information(x, y, nbins=20):
    # Mutual information MI = H(x) + H(y) - H(x, y), plus the normalized
    # variant NMI = 2*MI / (H(x) + H(y)).
    Hx = entropy(x, nbins)
    Hy = entropy(y, nbins)
    Hxy = joint_entropy(x, y, nbins)
    MI = Hx + Hy - Hxy
    NMI = 2 * MI / (Hx + Hy)
    return MI, NMI
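The two helpers above are external to the snippet. A minimal sketch of histogram-based versions, assuming 1-D continuous samples binned into nbins equal-width bins (assumed implementations, not part of the original source):

import numpy as np

def entropy(x, nbins=20):
    # Shannon entropy (bits) of a 1-D sample, estimated from a histogram.
    counts, _ = np.histogram(x, bins=nbins)
    p = counts / len(x)
    return -np.sum(np.where(p > 0, p * np.log2(p), 0))

def joint_entropy(x, y, nbins=20):
    # Joint Shannon entropy (bits) of two samples, via a 2-D histogram.
    counts, _, _ = np.histogram2d(x, y, bins=nbins)
    p = counts / len(x)
    return -np.sum(np.where(p > 0, p * np.log2(p), 0))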
Example #2
def information_gain(array_source, array_children_list, criterion='gini'):
    """Computes the information gain between the source array and its list of
    child arrays, using the criterion 'gini' or 'entropy'."""
    if isinstance(array_source, np.ndarray) and isinstance(
            array_children_list, np.ndarray):
        if criterion == "gini" or criterion == "entropy":
            impurity = gini if criterion == "gini" else entropy
            So = impurity(array_source)
            q = len(array_children_list)
            N = len(array_source)
            somme = 0.0
            for i in range(q):
                # weight each child's impurity by its share of the samples
                somme += (len(array_children_list[i]) / N *
                          impurity(array_children_list[i]))
            IG = So - somme
            return IG
        else:
            print("info_gain: error in children list or criterion type")
    else:
        print("info_gain: error in type of array")
Example #3
def information_gain(array_source, array_children_list, criterion='gini'):
    # `criterion` is kept for API compatibility; this variant always uses entropy.
    G0 = entropy(array_source)
    N = array_source.size

    # Weighted mean entropy of the children (the original hardcoded exactly two
    # children; this restores the general loop the author left in comments).
    S = 0.0
    for child in array_children_list:
        S += (child.size / N) * entropy(child)

    return float(G0 - S)
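A quick usage sketch for this variant (it accepts a list of NumPy arrays as children), using the label-entropy helper sketched above: a perfect split removes all of the parent's entropy.

import numpy as np

parent = np.array([0, 0, 0, 1, 1, 1])
children = [np.array([0, 0, 0]), np.array([1, 1, 1])]
print(information_gain(parent, children))  # 1.0 bit for this perfect split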
Example #4
def plot_waverec(curve, entonema, wav, wavelet, directory):
    levels = 4
    coeff = get_cD(curve, wavelet, level=levels)

    fig = plt.figure()
    ax0 = fig.add_subplot(levels + 2, 1, 1)
    ax0.plot(curve)
    plt.yticks([min(curve), (min(curve) + max(curve)) / 2, max(curve)], size=6)
    plt.xticks(np.arange(0, len(curve), step=len(curve) / 10), size=6)
    cv = convolve(curve, curve, mode='valid')[0]
    ax0.set_xlabel('length = {}  entropy = {}  convolution = 10**{}'.format(
        len(curve), entropy(curve), round(np.log10(cv), 2)),
                   size=8)
    ax0.set_ylabel("Curva original", size=8)
    ax0.grid()

    for i, (cA, cD) in enumerate(coeff):
        rec = pywt.waverec((cA, cD), 'db5')
        ax0 = fig.add_subplot(levels + 2, 1, i + 2)
        ax0.plot(rec, alpha=0.6)
        ax0.grid()

        plt.yticks([min(rec), (min(rec) + max(rec)) / 2, max(rec)], size=6)
        plt.xticks(np.arange(0, len(rec), step=len(rec) / 10), size=6)
        cv = convolve(curve, rec, mode='valid')[0]
        ax0.set_xlabel(
            'length = {}  entropy = {}  convolution = 10**{}'.format(
                len(rec), entropy(rec), round(np.log10(cv), 2)),
            size=8)
        ax0.set_ylabel("N{}".format(i), size=8)

    fig.subplots_adjust(hspace=1.2, wspace=0.2)
    plt.savefig('{}/{}_{}_{}.png'.format(directory, entonema,
                                         wav.replace('.wav', ''), wavelet))
Example #5
def plot_spline_interpolation(dataset, entonema, wav):
    if not wav.endswith('.wav'):
        return

    curve_path = '{}/{}/{}'.format(dataset, entonema, wav)
    curve = get_pitch_decompy_values(curve_path,
                                     remove_silencess=True,
                                     interpolate=False)

    ynew = spline_interpolation(curve)
    x = np.arange(0, len(curve))

    plt.figure()
    plt.scatter(x,
                curve,
                s=np.pi,
                label='Original_pitch  entropy = {}'.format(entropy(curve)))
    plt.plot(x,
             ynew,
             label='Spline_interpolation   entropy = {}'.format(entropy(ynew)),
             alpha=0.4)
    plt.legend(loc='upper center',
               fancybox=True,
               shadow=True,
               bbox_to_anchor=(0.5, 1.13))
    plt.grid()

    #plt.title('Cubic-spline interpolation\n\n    entonema = {}    wav = {}'.format(entonema, wav))
    #plt.show()
    out_dir = './temp/{}'.format(entonema)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    plt.savefig('{}/{}_{}.png'.format(out_dir, entonema, wav.replace('.wav', '')))
Example #6
def plot_before_after_transformation(dataset, entonema, wav):
    if not wav.endswith('.wav'):
        return

    path = '{}/{}/{}'.format(dataset, entonema, wav)

    ybefore = get_pitch_decompy_values(path,
                                       remove_silencess=False,
                                       interpolate=False)
    ynew = get_pitch_decompy_values(path,
                                    remove_silencess=True,
                                    interpolate=True)

    fig, axarr = plt.subplots(2)
    axarr[0].grid()
    axarr[0].plot(ybefore)
    axarr[0].set_ylabel('No trans')
    axarr[0].set_xlabel('len = {}       entropy = {}'.format(
        len(ybefore), entropy(ybefore)))

    axarr[1].grid()
    axarr[1].plot(ynew)
    axarr[1].set_ylabel('With trans')
    axarr[1].set_xlabel('len = {}       entropy = {}'.format(
        len(ynew), entropy(ynew)))
    fig.subplots_adjust(hspace=0.5)

    out_dir = './temp/{}'.format(entonema)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    plt.savefig('{}/{}_{}.png'.format(out_dir, entonema, wav.replace('.wav', '')))
Example #7
def main():

    infilename = 'test.txt'
    outfilename = 'decoded_text.txt'

    encode(infilename)
    decode('encoded_text.txt', outfilename)

    if filecmp.cmp(infilename, outfilename):
        print("Good job! Go have a rest.")
    else:
        print("Go work!")
        return

    with open('encoded_text.txt', 'rb') as f:
        encoded_file_length = len(f.read())

    with open(infilename, 'rb') as f:
        file_length = len(f.read())

    print("Dlugosc kodowanego pliku: ", file_length)
    print("Dlugosc uzyskanego kodu: ", encoded_file_length)
    compression_deg = os.stat(infilename).st_size / os.stat(
        'encoded_text.txt').st_size
    print("Stopien kompresji: ", compression_deg)
    print("\nEntropia pliku kodowanego:")
    entropy.entropy(infilename)
    print("\nEntropia uzyskanego kodu:")
    entropy.entropy('encoded_text.txt')
Example #8
    def MDLPC_criterion(self, data, feature, cut_point):
        '''
        Determines whether a partition is accepted according to the MDLPC criterion.
        :param data: dataframe partition of interest
        :param feature: feature of interest
        :param cut_point: proposed cut point
        :return: True/False, whether to accept the partition
        '''
        #get dataframe only with desired attribute and class columns, and split by cut_point
        data_partition = data.copy(deep=True)
        data_left = data_partition[data_partition[feature] <= cut_point]
        data_right = data_partition[data_partition[feature] > cut_point]

        #compute information gain obtained when splitting data at cut_point
        cut_point_gain = cut_point_information_gain(dataset=data_partition, cut_point=cut_point,
                                                    feature_label=feature, class_label=self._class_name)
        #compute delta term in MDLPC criterion
        N = len(data_partition) # number of examples in current partition
        partition_entropy = entropy(data_partition[self._class_name])
        k = len(data_partition[self._class_name].unique())
        k_left = len(data_left[self._class_name].unique())
        k_right = len(data_right[self._class_name].unique())
        entropy_left = entropy(data_left[self._class_name])  # entropy of partition
        entropy_right = entropy(data_right[self._class_name])
        # Fayyad-Irani MDLP delta term: log2(3^k - 2) - [k*H(S) - k1*H(S1) - k2*H(S2)]
        delta = (log(3 ** k - 2, 2) - (k * partition_entropy) +
                 (k_left * entropy_left) + (k_right * entropy_right))

        #to split or not to split
        gain_threshold = (log(N - 1, 2) + delta) / N

        return cut_point_gain > gain_threshold
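cut_point_information_gain is not shown here. Under the usual Fayyad-Irani formulation it is the class-entropy reduction produced by the split; a sketch consistent with the pandas usage above (an assumption, not the original helper):

def cut_point_information_gain(dataset, cut_point, feature_label, class_label):
    # H(S) - |S1|/|S| * H(S1) - |S2|/|S| * H(S2), over the class column.
    left = dataset[dataset[feature_label] <= cut_point]
    right = dataset[dataset[feature_label] > cut_point]
    n = len(dataset)
    return (entropy(dataset[class_label])
            - (len(left) / n) * entropy(left[class_label])
            - (len(right) / n) * entropy(right[class_label]))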
Example #9
    def test_args_out_of_range(self):
        """
        Edge test to make sure the function throws a ValueError
        when the input probabilities are < 0 or > 1.
        """
        with self.assertRaises(ValueError):
            entropy([-1, 2])
Example #10
    def test_args_dont_sum_to_1(self):
        """
        Edge test to make sure the function throws a ValueError
        when the input probabilities do not sum to one.
        """
        with self.assertRaises(ValueError):
            entropy([.9, .9])
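Together with the four-state test in Example #17 below, these tests pin down the contract of the entropy function under test: probabilities in, bits out, with input validation. A sketch that satisfies all three tests (assumed; the tested module is not shown):

import numpy as np

def entropy(probs):
    # Shannon entropy in bits of a discrete probability distribution.
    p = np.asarray(probs, dtype=float)
    if np.any(p < 0) or np.any(p > 1):
        raise ValueError("probabilities must lie in [0, 1]")
    if not np.isclose(p.sum(), 1.0):
        raise ValueError("probabilities must sum to 1")
    nz = p[p > 0]
    return float(-np.sum(nz * np.log2(nz)))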
Example #11
def main():
    args = list(sys.argv)
    if len(args) == 1:
        args.append('omega')

    infilename = 'test.txt'
    outfilename = 'decoded_text.txt'
    code = encode(infilename)
    encoding_error = choose_encoding(args[1], code)
    if encoding_error:
        return

    # create dictionary for decode function
    with open(infilename, 'rb') as message_file:
        message = list(message_file.read())
    file_length = len(message)

    dictio = {}
    next_index = 1
    for c in message:
        if [str(c)] not in dictio.values():
            dictio[next_index] = [str(c)]
            next_index += 1

    # read encoded text and parse it to 0-1 values
    with open('encoded_text.txt', 'rb') as encoding_file:
        content = list(encoding_file.read())
    encoded_file_length = len(content)
    bits = ''

    for byte in content[:-1]:
        bits += format(byte, '08b')

    # delete redundant zeros from the end of encoded 0-1 string
    redundant_zeros_num = content[-1]
    if redundant_zeros_num > 0:
        bits = bits[:-redundant_zeros_num]

    # decode message
    choose_decoding(args[1], bits, dictio, outfilename)

    if filecmp.cmp(infilename, outfilename):
        print("Good job! Go have a rest.")
    else:
        print("Go work!")
        return

    print("Dlugosc kodowanego pliku: ", file_length)
    print("Dlugosc uzyskanego kodu: ", encoded_file_length)
    compression_deg = os.stat(infilename).st_size / os.stat(
        'encoded_text.txt').st_size
    print("Stopien kompresji: ", compression_deg)
    print("\nEntropia pliku kodowanego:")
    entropy.entropy(infilename)
    print("\nEntropia uzyskanego kodu:")
    entropy.entropy('encoded_text.txt')
Example #12
def mi(lines, vocab):
    # MI = H(W1) + H(W2) - H(W1, W2) over the accumulated word-pair counts.
    d1 = Counter()
    d2 = Counter()
    djoint = Counter()
    for w1, w2, c in mi_contributions(lines):
        if (not vocab) or (w1 in vocab and w2 in vocab):
            d1[w1] += c
            d2[w2] += c
            djoint[w1, w2] += c
    return (entropy.entropy(d1.values()) + entropy.entropy(d2.values()) -
            entropy.entropy(djoint.values()))
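entropy.entropy is applied here to raw counts, so it presumably normalizes them into a distribution first. A count-based sketch consistent with that call pattern (an assumption about the module, not its actual code):

import numpy as np

def entropy(counts):
    # Shannon entropy (bits) from raw counts; normalizes before summing.
    c = np.asarray(list(counts), dtype=float)
    p = c / c.sum()
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))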
Example #13
def test():
    """ brute force an answer
    """
    from random import randrange
    for a in sorted(range(123), key=lambda u: random()):
        for b in sorted(range(123), key=lambda u: random()):
            for c in sorted(range(1, 123), key=lambda u: random()):
                if entropy(mapper(a, b, c)) > DIFFICULTY:
                    print(a, b, c)
                    print(mapper(a, b, c))
                    print(entropy(mapper(a, b, c)))
                    break
        print(a)
Example #14
def main():

    args = process_args()

    data = args.file.read()
    args.file.close()


    results = list()
    if args.md5 or args.all:
        results.append(('md5', hashes.hash('md5', data)))
    if args.sha1 or args.all:
        results.append(('sha1', hashes.hash('sha1', data)))
    if args.sha256 or args.all:
        results.append(('sha256', hashes.hash('sha256', data)))
    if args.sha512 or args.all:
        results.append(('sha512', hashes.hash('sha512', data)))

    if args.entropy or args.all:
        results.append(('entropy', entropy.entropy(data)))

    if args.magic or args.all:
        results.append(('magic', filemagic.filemagic(data)))

    output(args.output, results)

    return 0
Example #15
def runcontingent(path):
    from entropy import entropy
    import toolshed as ts
    it = ts.reader(path)
    iterable = (Interval(**iv) for iv in it)
    values = defaultdict(list)
    genes = set()
    by_transcript = defaultdict(list)
    by_domain = defaultdict(list)
    for iv in iterable:
        by_domain[iv.domain].append(iv)
        by_transcript[iv.transcript].append(iv)

    for domain, ivs in by_domain.items():
        if len(ivs) < 2: continue
        if sum(iv.mafs.count(',') for iv in ivs) < 3: continue
        if domain == ".": continue
        intervals = ivs[:]
        for iv in ivs:
            intervals.extend(by_transcript[iv.transcript])
        intervals = set(intervals)
        if len(intervals) < 3: continue
        pval, ratio, tbl, gene = contingent(intervals, domain, nodoms_only=False)
        ent = entropy(intervals)
        values['domain'].append(domain)
        values['pval'].append(pval)
        values['ent'].append(ent)
        values['tbl'].append(tbl)
        values['ratio'].append(ratio)
        values['num_intervals'].append(len(intervals))
        values['num_domains'].append(len(ivs))
        genes.update(gene)
        values['genes'].append(",".join(genes))
        genes = set()
    return (values['domain'], values['pval'], values['ent'], values['tbl'],
            values['ratio'], values['num_intervals'], values['num_domains'],
            values['genes'])
Example #17
    def test_four_equal_likelihood_states(self):
        """
        One-shot test using the known case of four states with
        equal likelihood of occurrence. Should return 2 bits.
        """
        assert np.isclose(entropy([0.25, 0.25, 0.25, 0.25]), 2.)
Example #18
    def sess_001(self):
        cvalues = defaultdict(list)
        reports = []
        cookies = self.__get_cookies(30)
        for c in cookies:
            for cname, cval in c.items():
                assert (cname == cval['name'])
                cvalues[cname].append(cval['value'])
        for cname, cvals in cvalues.items():
            ent = entropy(cvals)
            self.printer.aprint(str(ent))
            if ent < 20:
                reports.append(
                    create_report(
                        "sess_001",
                        basic_description="Low entropy session cookie: {}".
                        format(cname),
                        confidence=1.0,
                        severity="medium",
                        owasp_association="2",
                        cwe=565,
                        misc=[
                            "Entropy heuristic only detects {} bits of entropy"
                            .format(ent)
                        ]))

        return reports
Example #19
def plot_descomposition(curve, entonema, wav, wavelet, directory):
    wavelet_name = wavelet if isinstance(wavelet, str) else wavelet.name
    max_level = 4  # assumed depth; the original never defines max_level
    cA, cD = pywt.dwt(curve, wavelet, mode='symmetric', axis=-1)

    fig = plt.figure()

    for i in np.arange(0, max_level):
        ax0 = fig.add_subplot(max_level, 2, 2 * i + 1)
        ax0.grid()
        #ax0.plot(xa, cA, '-o')
        ax0.plot(cA, alpha=0.5, label='cA')
        plt.yticks([min(cA), (min(cA) + max(cA)) / 2,
                    max(cA)],
                   size=6,
                   rotation=20)
        plt.xticks(np.arange(0, len(cA), step=len(cA) / 4), size=6)
        ax0.set_xlabel('len = {}  entropy = {}'.format(len(cA), entropy(cA)),
                       size=8)
        ax0.set_ylabel("L{}".format(i), size=8)
        #plt.setp(ax0.get_xticklabels(), fontsize=6)

        ax1 = fig.add_subplot(max_level, 2, 2 * i + 2)
        ax1.grid()
        #ax1.plot(xd, cD, '-o')
        ax1.plot(cD, label='cD')
        #plt.yticks(np.arange(min(cD),max(cD), step=(max(cD) - min(cD))/2), size = 6)
        plt.yticks([min(cD), (min(cD) + max(cD)) / 2,
                    max(cD)],
                   size=6,
                   rotation=20)
        plt.xticks(np.arange(0, len(cD), step=len(cD) / 4), size=6)
        ax1.set_xlabel('len = {}  entropy = {}'.format(len(cD), entropy(cD)),
                       size=8)

        if i == 0:
            ax0.set_title('cA', size=12)
            ax1.set_title('cD', size=12)

        cA, cD = pywt.dwt(cA, wavelet, mode='symmetric', axis=-1)
        #xa = [t for t in np.arange(0, len(cA)*np.pi/50, step=np.pi/50)]
        #xd = [t for t in np.arange(0, len(cD)*np.pi/50, step=np.pi/50)]

    fig.subplots_adjust(hspace=1.2, wspace=0.2)
    plt.savefig('{}/{}_{}_{}.png'.format(directory, entonema,
                                         wav.replace('.wav', ''),
                                         wavelet_name))
Example #20
def getmalwaresignature(input_malware):
    if os.path.isdir(input_malware):
        malwares_files = os.listdir(input_malware)
        for malware in malwares_files:
            malware_file = os.path.join(input_malware, malware)
            _malwaresignature(malware_file, malware)
            entropy(malware_file)

        return malware_file  # last file processed
    else:
        malware_file = input_malware
        malware = os.path.basename(malware_file)
        ISO8601, hashmethod, arch, importeddlls, imphash, fuzzyhash = \
            _malwaresignature(malware_file, malware)
        #formattedpdf(input_malware, malware_file, ISO8601, hashmethod, arch, importeddlls, imphash, fuzzyhash)
        #formattedtext(input_malware, malware_file, ISO8601, hashmethod, arch, importeddlls, imphash, fuzzyhash)
        entropy(input_malware)
Example #22
def entropy2(p):
    # Binary entropy of each probability: h[i] = entropy([p[i], 1 - p[i]]).
    n = len(p)
    h = np.zeros(n)

    for i in range(n):
        p1 = np.hstack((p[i], 1 - p[i]))
        h[i] = entropy(p1)
    return h
Example #23
    def info(self, dataset, entonema, wav, curve=None):
        wav_path = '{}/{}/{}'.format(dataset, entonema, wav)
        if curve is None or len(curve) == 0:
            curve = get_pitch_decompy_values(wav_path,
                                             remove_silencess=self.silences,
                                             interpolate=self.interpolate)
        entr_pitch = [round(entropy(curve), ndigits=2)]
        cA_entr = []
        cD_entr = []
        conv = []

        descp = get_cD(curve, 'db5', level=4)
        for (cA, cD) in descp:
            cA_entr.append(round(entropy(cA), ndigits=2))
            cD_entr.append(round(entropy(cD), ndigits=2))
            rec = pywt.waverec((cA, cD), 'db5')
            cv = convolve(curve, rec, mode='valid')[0]
            conv.append(round(cv, ndigits=2))

        return entr_pitch + cA_entr + cD_entr + conv
Example #24
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = entropy(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        InfoGain = baseEntropy
        x = [sample[i] for sample in dataSet]
        for k in set(x):
            subDataSet = splitDataSet(dataSet, i, k)
            xProb = len(subDataSet) / len(dataSet)
            xEntropy = entropy(subDataSet)
            InfoGain -= xProb * xEntropy
            print(i, k, xProb, xEntropy)
        if InfoGain > bestInfoGain:
            bestInfoGain = InfoGain
            bestFeature = i
    return bestInfoGain, bestFeature
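splitDataSet is the usual companion helper in this style of ID3 code: it keeps the rows whose feature at the given index equals the given value and drops that feature. A sketch, assuming dataSet is a list of lists with the class label last:

def splitDataSet(dataSet, axis, value):
    # Rows where sample[axis] == value, with that feature column removed.
    retDataSet = []
    for sample in dataSet:
        if sample[axis] == value:
            retDataSet.append(sample[:axis] + sample[axis + 1:])
    return retDataSet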
Example #26
def get_results(Path):
    print("calculating entropy of samples...")
    # Calculate entropy:
    list_of_entropies = entropy(Path)
    print("calculating sizes of samples...")
    # Calculate file sizes (call file_size once and reuse the result):
    sizes_and_files = file_size(Path)
    list_of_sizes = sizes_and_files[0]
    list_of_files = sizes_and_files[1]

    return list_of_sizes, list_of_entropies, list_of_files
Example #27
def getent(K, rho):
    rundata = setrun.setrun()
    rundata.probdata.K_B = K
    rundata.probdata.rho_B = rho
    print(K, rho)
    rundata.write()
    runclaw(outdir='./_output')

    ent = entropy(rundata.clawdata.nout)
    return ent[-1]
Example #28
def reporter(image_file=IMAGE_FILE,
             neighborhood=NEIGHBORHOOD,
             scale=SCALE,
             dpi=DPI,
             channel=CHANNEL,
             plane=PLANE,
             scope=SCOPE):
    """
    Test usage:
    reporter('image.tiff')
    reporter('image.tiff', 8, 5, 1000)
    reporter('image.tiff', 8, 5, 500, 'R', 0, 5)
    """

    bp(image_file, dpi, channel, plane)
    hist(image_file, scale, dpi)
    atc(image_file, dpi)
    ngbd(image_file, neighborhood, dpi)
    entropy(image_file)
Example #29
def main():
    parser = get_argparser()
    args = parser.parse_args()

    if not args.usetarget:
        trainingdata.STOPWORDS = trainingdata.load_stopwords(args.bitextfn)

    triple_sentences = trainingdata.load_bitext(args.bitextfn, args.alignfn)

    if args.usetarget:
        ## Flip directionality -- we want the top words out of the target text.
        new_triple_sentences = [(t, s, a) for (s, t, a) in triple_sentences]
        triple_sentences = new_triple_sentences

    sl_sentences = [s for (s,t,a) in triple_sentences]
    top_words = trainingdata.get_top_words(sl_sentences)

    with open("topwords.txt", "w") as topwordsout:
        for (i, (word, count)) in enumerate(top_words):
            print("{0} & {1} & {2} \\\\".format(1+i, word, count),
                  file=topwordsout)

    if args.usetarget:
        ## Bail out -- just getting target text top words.
        return

    tl_sentences = trainingdata.get_target_language_sentences(triple_sentences)
    tagged_sentences = [list(zip(ss, ts))
                        for ss,ts in zip(sl_sentences, tl_sentences)]
    trainingdata.set_examples(sl_sentences, tagged_sentences)
    source_annotated = annotated_corpus.load_corpus(args.annotatedfn)
    trainingdata.set_sl_annotated(source_annotated)

    stamp = util.timestamp()
    langs = args.bitextfn.split(".")[1]
    translations_fn = "results/{0}-{1}-translations".format(stamp, langs)
    entropy_fn = "results/{0}-{1}-entropy".format(stamp, langs)

    with open(translations_fn, "w") as topwordsout, \
         open(entropy_fn, "w") as entropyout:
        for (i, (word, count)) in enumerate(top_words):
            training = trainingdata.trainingdata_for(word, nonnull=False)
            labels = [label for (feat, label) in training]
            counts = Counter(labels)
            translations_l = []
            for label, count in counts.most_common(5):
                if label == UNTRANSLATED:
                    label = "NULL"
                translations_l.append("{0}".format(label))
            translations = ", ".join(translations_l)
            print("{0} & {1}".format(word, translations), file=topwordsout)

            bits = entropy(labels)
            print("%30s%30.2f" % (word, bits), file=entropyout)
Example #30
def test_02():

    x = np.array([0.5, 0.5])

    norm = np.sum(x)

    assert np.isclose(norm, 1.0)

    eVal = entropy(x)

    # A uniform distribution over len(x) outcomes has entropy log(n), in nats here.
    assert np.isclose(eVal, np.log(len(x)))

    return None
Example #31
    def dataReceived(self, data):
        try:
            a, b, c = data.decode('utf-8').split(",")
            a = int(a)
            b = int(b)
            c = int(c)
        except (UnicodeDecodeError, ValueError):
            self.transport.write(b"Bad format, try again.\n")
            return
        if entropy(mapper(a, b, c)) > DIFFICULTY:
            self.transport.write(b"Congrats! " + FLAG + b"\n")
        else:
            self.transport.write(b"Nope.\n")
Example #32
def cond_mutual_information(x, y, z, nbins=20):
    Hxz = joint_entropy(x, z, nbins)
    Hyz = joint_entropy(y, z, nbins)
    Hz = entropy(z, nbins)

    # Joint entropy of the three variables via a 3-D histogram
    # (the original hardcoded bins=20 instead of using nbins).
    count_xyz, edges = np.histogramdd(np.array([x, y, z]).T, bins=nbins)
    p_xyz = count_xyz / len(x)
    Hxyz = -np.sum(np.where(p_xyz > 0, np.log2(p_xyz) * p_xyz, 0))

    # Conditional mutual information I(x;y|z) = H(x,z) + H(y,z) - H(x,y,z) - H(z),
    # plus a normalized variant.
    MIxy_z = Hxz + Hyz - Hxyz - Hz
    NMIxy_z = 2 * MIxy_z / (Hxz + Hyz)
    return MIxy_z, NMIxy_z
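A sanity-check sketch, assuming the histogram-based entropy and joint_entropy helpers sketched under Example #1: when y is independent of both x and z, I(x;y|z) should be near zero (up to histogram-estimation bias, which can be sizeable for 3-D binning).

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=10000)
z = x + rng.normal(size=10000)   # z depends on x
y = rng.normal(size=10000)       # independent of x and z

mi, nmi = cond_mutual_information(x, y, z)
print(mi, nmi)  # mi close to 0, modulo estimator bias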
Example #33
def set_Y_entropy(data):
    for r in ['trn','val','tst']:
        Y = data[r + '_Y']
        if data['err'] == 'ce':
            v = entropy.entropy(Y.mean(axis=0))
        elif data['err'] == 'mse':
            #hist = np.histogram(Y, bins=100)
            #v = scipy.stats.rv_histogram(hist).entropy()
            if Y.shape[1] != 1:  # only 1-d continuous output supported right now
                raise Exception('only 1-d continuous output is supported')
            v = entropy.gaussian_entropy_np(1, np.var(Y))
        else:
            raise Exception('Unknown error func')
        data[r+'_entropyY'] = v
    return data
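entropy.gaussian_entropy_np presumably returns the differential entropy of a d-dimensional Gaussian with per-dimension variance var, i.e. (d/2) * log(2*pi*e*var) in nats. A sketch under that assumption (not the original module):

import numpy as np

def gaussian_entropy_np(d, var):
    # Differential entropy (nats) of an isotropic d-dimensional Gaussian.
    return 0.5 * d * np.log(2.0 * np.pi * np.e * var)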
Example #34
def run(filename):
    data = get_data(filename).lower()
    data = re.sub(r'[!?.:;,-]', '', data)

    lines = data.split("\n")
    string = " ".join(lines)
    words_ = string.split(" ")
    words = list(filter(lambda x: x != '', words_))

    frequencies = FT.frequency_table(words)
    frequency_items = frequencies.items()

    entropy_of_text = entropy(frequencies)

    print('lines:   ' + str(len(lines)))
    print('words:   ' + str(len(words)))
    print('chars:   ' + str(len(data)))
    print("entropy: " + str(round_off(entropy_of_text, 2)))
Example #35
def computeIGCI(F, debug):

    # Discretize the fluorescence signal.
    D = discretizeFluorescenceSignal(F)

    # Compute the entropy.
    H = entropy(D)

    # Compute the scores as entropy differences.
    n = len(H)

    scores = numpy.zeros(shape=(n, n))

    for i in range(n):
        scores[i] = H.T[0]
    scores = scores - scores.T

    return scores
Example #37
import entropy
import time
import numpy as np


method_1 = []
stat = entropy.EntropyStringMatching()
for i in range(0, 10000):
    start = time.time()
    stat.run("Data Analyst", 'Fancy Data Analyst')
    end = time.time()

    diff = (end - start) * 1000
    method_1.append(diff)

print("Method 1 returns an average of %s " % np.average(method_1))

method_2 = []

for i in range(0, 10000):
    start = time.time()
    calc = entropy.joint_entropy("Data Analyst", 'fancy data analyst') / entropy.entropy("Data Analyst")
    end = time.time()

    diff = (end - start) * 1000
    method_2.append(diff)

print("Method 2 returns an average of %s " % np.average(method_2))
Example #38
    def test_change_entropy(self):
        F = numpy.array([[1, 2], [4, 8]])
        R = numpy.array([[1], [1]])
        assert (entropy.entropy(F) == R).all()
Example #39
    def test_negative_entropy(self):
        F = numpy.array([[-10, -20], [-20, -40]])
        R = numpy.array([[1], [1]])
        assert (entropy.entropy(F) == R).all()
Example #40
    def test_no_entropy(self):
        F = numpy.array([[2, 2], [2, 2]])
        R = numpy.array([[0], [0]])
        assert (entropy.entropy(F) == R).all()