def tf_idf_normalized(words, use_pos_tagging=True):
    """Build TF-IDF records for ``words`` and flag the ones to highlight.

    Every word is reduced to the ``normal_form`` of its first
    ``morph.parse`` result.  ``create_tf_idf_info`` then receives two empty
    lists (records and scores) that are read back afterwards -- presumably
    it fills them in place; confirm against that helper.

    Args:
        words: iterable of word strings.
        use_pos_tagging: when true, ``pos_tagging`` is called with the
            records whose ``"normal_form"`` entry is not ``None``.

    Returns:
        The list of record dicts, each carrying a boolean ``"highlight"``
        key set by comparing its ``"tf_idf"`` value with the threshold.
    """
    lemmas = [morph.parse(token)[0].normal_form for token in words]

    entries = []
    scores = []
    # Compute the TF-IDF values.
    create_tf_idf_info(entries, words, lemmas, scores)

    # The threshold is taken from the scores sorted in descending order.
    # NOTE(review): the original comment says this selects the first 30% of
    # the list; that depends on how ``percentile`` interprets ``percent``.
    threshold = percentile(sorted(scores, reverse=True), percent=0.7)

    for entry in entries:
        entry["highlight"] = bool(entry["tf_idf"] >= threshold)

    if use_pos_tagging:
        taggable = [
            entry for entry in entries
            if entry.get("normal_form") is not None
        ]
        pos_tagging(taggable)

    return entries
Ejemplo n.º 2
0
	def Prob(self, numList, num):
		"""Return ``percentile.percentile(numList, num)`` divided by 100.0."""
		rank = percentile.percentile(numList, num)
		return rank / 100.0
def _birth_weights_in_pounds(records):
    """Return ``birthwgt_lb + birthwgt_oz / 16.0`` for each usable record.

    A record is used only when BOTH weight fields are integers; records
    with any non-integer field are skipped instead of failing on the
    addition.
    """
    return [
        record.birthwgt_lb + record.birthwgt_oz / 16.0
        for record in records
        if isinstance(record.birthwgt_lb, int)
        and isinstance(record.birthwgt_oz, int)
    ]


myBirthWeight = 8.5

# Keep only records with outcome == 1 and split them on birth order:
# birthord == 1 goes to ``firsties``, everything else to ``others``.
firsties = []
others = []

for record in table.records:
    if record.outcome == 1:
        if record.birthord == 1:
            firsties.append(record)
        else:
            others.append(record)

birthWeightFirsties = _birth_weights_in_pounds(firsties)
weightFirstiesPercentile = percentile.percentile(birthWeightFirsties, myBirthWeight)

birthWeightOthers = _birth_weights_in_pounds(others)
weightOthersPercentile = percentile.percentile(birthWeightOthers, myBirthWeight)

# Single pre-formatted argument: same output as the Python 2 print
# statement, and also valid under Python 3.
print("percentile birthweight among firsties: %s" % (weightFirstiesPercentile,))
print("percentile birthweight among others: %s" % (weightOthersPercentile,))

Ejemplo n.º 4
0
def calculate(cycles, attacks, at, pow, defense, arm, dice_hit, dice_dmg, foc, boostflag):
    """Simulate ``cycles`` rounds of attacks and print damage statistics.

    Each round sums the values returned by ``rolls.roll_full``; a return
    value of -1 counts as 0 damage.  How the round's attack budget is spent
    depends on ``foc`` and ``boostflag``:

    * ``foc == 0``: ``attacks`` plain rolls.
    * ``boostflag == 0``: ``attacks + foc`` plain rolls.
    * ``boostflag == 1``: while at least 2 remain, spend 2 on a roll with
      ``dice_hit + 1``; a final single point buys a plain roll.
    * ``boostflag == 2``: while at least 2 remain, roll with
      ``dice_dmg + 1``; this costs 2, or only 1 when the roll returned -1.
    * ``boostflag == 3``: with 3 or more remaining, roll with both dice
      counts raised by one (costs 3, or 2 when the roll returned -1); with
      exactly 2 remaining, roll with ``dice_dmg + 1`` (costs 2, or 1 when
      the roll returned -1); otherwise a plain roll.
    * any other ``boostflag`` with non-zero ``foc``: the round scores 0.

    Prints the average damage and the 25th/50th/75th/90th percentiles of
    the sorted round totals.  When the module-level ``full`` equals 1 it
    also prints a per-damage table.  Relies on the module-level names
    ``max_result``, ``full``, ``rolls`` and ``percentile``.  Returns None.
    """
    def roll(hit_dice, dmg_dice):
        # One attack: returns (damage, missed), with the -1 sentinel
        # reported as 0 damage and missed=True.
        rolled = rolls.roll_full(at, pow, defense, arm, hit_dice, dmg_dice)
        if rolled == -1:
            return 0, True
        return rolled, False

    histogram = [0] * max_result  # occurrences of each round total
    ordered_results = []  # one total per round, sorted after the simulation

    completed = 0
    while completed < cycles:
        completed += 1
        total = 0

        if foc == 0 or boostflag == 0:
            # No boosting: every point of budget buys one plain attack.
            budget = attacks if foc == 0 else attacks + foc
            while budget > 0:
                damage, _ = roll(dice_hit, dice_dmg)
                total = total + damage
                budget -= 1
        elif boostflag == 1:
            budget = attacks + foc
            while budget > 0:
                if budget >= 2:
                    damage, _ = roll(dice_hit + 1, dice_dmg)
                    budget -= 2
                else:
                    damage, _ = roll(dice_hit, dice_dmg)
                    budget -= 1
                total = total + damage
        elif boostflag == 2:
            budget = attacks + foc
            while budget > 0:
                if budget >= 2:
                    damage, missed = roll(dice_hit, dice_dmg + 1)
                    # A roll that returned -1 gives one point back.
                    budget -= 1 if missed else 2
                else:
                    damage, _ = roll(dice_hit, dice_dmg)
                    budget -= 1
                total = total + damage
        elif boostflag == 3:
            budget = attacks + foc
            while budget > 0:
                if budget >= 3:
                    damage, missed = roll(dice_hit + 1, dice_dmg + 1)
                    budget -= 2 if missed else 3
                elif budget == 2:
                    damage, missed = roll(dice_hit, dice_dmg + 1)
                    budget -= 1 if missed else 2
                else:
                    damage, _ = roll(dice_hit, dice_dmg)
                    budget -= 1
                total = total + damage

        ordered_results.append(total)

    ordered_results.sort()  # percentile and index lookups below need sorted totals

    for total in ordered_results:
        histogram[total] += 1

    damage_total = sum(ordered_results)
    if full == 1:
        print("*** Full results for", attacks, "attacks ***")

    print("Average Damage:", "\t", round(damage_total / cycles, 2))

    percentile_rows = (
        ("25th Percentile:", 0.25),
        ("50th Percentile:", 0.50),
        ("75th Percentile:", 0.75),
        ("90th Percentile:", 0.90),
    )
    for label, fraction in percentile_rows:
        print(label, "\t", percentile.percentile(ordered_results, fraction))

    if full == 1:
        print("DAMAGE\t", "CONFIDENCE\t", "SPECIFIC")
        for damage in range(max_result):
            count = histogram[damage]
            if count != 0:
                # Rounds at or after the first occurrence scored at least
                # this much damage.
                first_position = ordered_results.index(damage)
                confidence = cycles - first_position
                print(damage, "\t", round(confidence / cycles * 100, 2), "%", "\t", round(count / cycles * 100, 2), "%")
Ejemplo n.º 5
0
    def predict_test(self, save_csv=False):
        ''' Predict the test data from the MCMC traces.

        Combines the fixed effects (beta trace times the ``x<i>`` columns),
        the log of the envelope, and the super-region / region / country
        effects read from the traces, then exponentiates.  Stores the mean
        and the 2.5 / 97.5 percentiles across draws in ``self.predictions``
        and records the per-level row indices in ``self.test_s_index``,
        ``self.test_r_index`` and ``self.test_c_index``.

        save_csv : when True, also writes ``self.predictions`` to a CSV file.
        '''
        data = self.test_data

        # setup constants
        num_test_rows = data.shape[0]
        num_iters = self.mod_mc.beta.trace().shape[0]

        # positions of each year / age within the model's lists
        year_pos = dict((year, pos) for pos, year in enumerate(self.year_list))
        age_pos = dict((age, pos) for pos, age in enumerate(self.age_list))

        def effect_from_trace(level_rows, stochastic):
            # For level k, copy the traced value at each test row's
            # (age, year) position into that row's column.
            effect = np.zeros((num_iters, num_test_rows))
            for k, where in enumerate(level_rows):
                rows = where[0]
                ages = [age_pos[data.age[j]] for j in rows]
                years = [year_pos[data.year[j]] for j in rows]
                effect[:, rows] = stochastic.trace()[:, k][:, ages, years]
            return effect

        # fixed effects
        num_covariates = self.mod_mc.beta.value.shape[0]
        X = np.array([data['x%d' % i] for i in range(num_covariates)])
        BX = np.dot(self.mod_mc.beta.trace(), X)

        # exposure: the envelope repeated for every draw.  (A disabled
        # earlier variant drew it from a binomial when training_type was
        # not 'make predictions'.)
        E = np.ones((num_iters, num_test_rows)) * data.envelope

        # pi_s
        s_index = [np.where(data.super_region == s) for s in self.super_region_list]
        pi_s = effect_from_trace(s_index, self.mod_mc.pi_s_list)
        self.test_s_index = s_index

        # pi_r
        r_index = [np.where(data.region == r) for r in self.region_list]
        pi_r = effect_from_trace(r_index, self.mod_mc.pi_r_list)
        self.test_r_index = r_index

        # pi_c
        c_index = [np.where(data.country == c) for c in self.country_list]
        pi_c = effect_from_trace(c_index, self.mod_mc.pi_c_list)
        self.test_c_index = c_index

        # make predictions
        # NOTE(review): the working directory is changed before importing
        # ``percentile``, presumably so the module is found there -- confirm.
        import os
        os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
        import percentile
        predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
        mean = predictions.mean(axis=0)
        lower = percentile.percentile(predictions, 2.5, axis=0)
        upper = percentile.percentile(predictions, 97.5, axis=0)

        self.predictions = data[['country','region','super_region','year','age','pop']]
        summaries = (('mean_deaths', mean), ('lower_deaths', lower), ('upper_deaths', upper))
        for field, values in summaries:
            self.predictions = recfunctions.append_fields(self.predictions, field, values)
        if self.training_type != 'make predictions':
            actual = data.cf * data.envelope
            self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths', actual)
        self.predictions = self.predictions.view(np.recarray)

        # save the predictions
        if save_csv == True:
            csv_path = '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv'
            pl.rec2csv(self.predictions, csv_path)
Ejemplo n.º 6
0
    def predict_test(self, save_csv=False):
        ''' Predict the test data from the approximated MCMC draws.

        Combines the fixed effects (``self.approxs['beta']`` times the
        ``x<i>`` columns), the log of the envelope, and the super-region /
        region / country effects, then exponentiates.  Each effect is
        obtained by fitting a bivariate spline to the sampled points of
        every draw and evaluating it on the full age/year grid.  Stores the
        mean and the 2.5 / 97.5 percentiles across draws in
        ``self.predictions``.

        save_csv : when true, also writes ``self.predictions`` to a CSV file.
        '''
        data = self.test_data

        # setup constants
        num_test_rows = data.shape[0]
        num_iters = self.approxs['beta'].shape[0]

        # indices
        t_index = dict((t, i) for i, t in enumerate(self.year_list))
        a_index = dict((a, i) for i, a in enumerate(self.age_list))

        # fixed effects
        X = np.array([data['x%d'%i] for i in range(self.mod_mc.beta.value.shape[0])])
        BX = np.dot(self.approxs['beta'], X)

        # exposure: the envelope repeated for every draw.  (A disabled
        # earlier variant drew it from a binomial when training_type was
        # not 'make predictions'.)
        E = np.ones((num_iters, num_test_rows))*data.envelope

        # interpolation parameters
        x_samples = self.sample_points[:,0]
        y_samples = self.sample_points[:,1]
        xb = self.age_list[0]
        xe = self.age_list[-1]
        yb = self.year_list[0]
        ye = self.year_list[-1]
        # spline degree: 3, lowered when there are too few sample points
        kx = min(3, len(self.age_samples)-1)
        ky = min(3, len(self.year_samples)-1)

        def interpolated_effect(labels, levels, prefix):
            # Build the (num_iters, num_test_rows) effect for one hierarchy
            # level.  All three levels go through this single code path so
            # each one reads and writes only its own array.
            effect = np.zeros((num_iters, num_test_rows))
            for k, level in enumerate(levels):
                rows = np.where(labels == level)[0]
                ages = [a_index[data.age[j]] for j in rows]
                years = [t_index[data.year[j]] for j in rows]
                draws = self.approxs[prefix + str(k)]
                for i in range(num_iters):
                    tck = interpolate.bisplrep(x=x_samples, y=y_samples, z=draws[i], xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
                    surface = interpolate.bisplev(x=self.age_list, y=self.year_list, tck=tck)
                    effect[i, rows] = surface[ages, years]
                # Reorder this level's draws by their mean across its rows.
                level_draws = effect[:, rows]
                effect[:, rows] = level_draws[np.argsort(level_draws.mean(axis=1))]
            return effect

        # pi_s, pi_r, pi_c
        pi_s = interpolated_effect(data.super_region, self.super_region_list, 'pi_s_')
        pi_r = interpolated_effect(data.region, self.region_list, 'pi_r_')
        pi_c = interpolated_effect(data.country, self.country_list, 'pi_c_')

        # make predictions
        # NOTE(review): the working directory is changed before importing
        # ``percentile``, presumably so the module is found there -- confirm.
        import os
        os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
        import percentile
        predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
        mean = predictions.mean(axis=0)
        lower = percentile.percentile(predictions, 2.5, axis=0)
        upper = percentile.percentile(predictions, 97.5, axis=0)
        self.predictions = data[['country','region','super_region','year','age','pop']]
        self.predictions = recfunctions.append_fields(self.predictions, 'mean_deaths', mean)
        self.predictions = recfunctions.append_fields(self.predictions, 'lower_deaths', lower)
        self.predictions = recfunctions.append_fields(self.predictions, 'upper_deaths', upper)
        if self.training_type != 'make predictions':
            self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths', data.cf*data.envelope)
        self.predictions = self.predictions.view(np.recarray)

        # save the predictions
        if save_csv:
            pl.rec2csv(self.predictions, '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv')