コード例 #1
0
def information_gain(array_source, array_children_list, criterion='gini'):
    """Compute the information gain of splitting a parent array into children.

    The gain is ``impurity(parent) - sum(len(child) / N * impurity(child))``
    with ``N = len(array_source)``, where ``impurity`` is the module-level
    ``gini`` or ``entropy`` function selected by ``criterion``.

    Args:
        array_source: numpy array holding the parent node's values.
        array_children_list: the children produced by the split, given as a
            numpy array of arrays or as a list/tuple of arrays.
        criterion: either ``'gini'`` or ``'entropy'``.

    Returns:
        The information gain, or ``None`` (after printing a message) when the
        argument types or the criterion are not supported.
    """
    if not (isinstance(array_source, np.ndarray)
            and isinstance(array_children_list, (np.ndarray, list, tuple))):
        print("info_gain: error in type of array")
        return None

    if criterion == "gini":
        impurity = gini
    elif criterion == "entropy":
        impurity = entropy
    else:
        print("info_gain: error in children list or criterion type")
        return None

    parent_impurity = impurity(array_source)
    # float() keeps the weights fractional under Python 2 integer division.
    total = float(len(array_source))
    weighted_children = 0.0
    for child in array_children_list:
        # Each child contributes its own impurity weighted by its own size.
        weighted_children += (len(child) / total) * impurity(child)
    return parent_impurity - weighted_children
コード例 #2
0
def cal_diversity():
    """Compute per-paper reference diversity values and write them to disk.

    Reads ``data/subj_refnum.json`` and ``data/paper_year.json``, then streams
    ``data/paper_ref_attrs.json`` line by line (one JSON object per line,
    mapping a paper id to a list of ``[year, c5, c10, subjs]`` reference
    attributes).  For every paper with at least five references it stores
    ``[year_div, c5_div, c10_div, subj_div]`` and writes one JSON object per
    input line to ``data/pid_divs.txt``.

    All files are opened with ``with`` so the output is flushed and closed
    even if processing fails part-way through.
    """
    with open('data/subj_refnum.json') as f:
        subj_refnum = json.loads(f.read())
    subj_totalnum = len(subj_refnum)

    with open('data/paper_year.json') as f:
        paper_year = json.loads(f.read())
    logging.info('year data loaded ....')

    # Total count accumulated for every inner subject key across all outer keys.
    citnum_total = defaultdict(int)
    for inner_counts in subj_refnum.values():
        for subj2, count in inner_counts.items():
            citnum_total[subj2] += count

    with open('data/pid_divs.txt', 'w') as of, \
            open('data/paper_ref_attrs.json') as attrs_file:
        for progress, line in enumerate(attrs_file, 1):
            logging.info('progress {} ...'.format(progress))

            paper_ref_attrs = json.loads(line.strip())
            pid_div_vs = {}

            for pid, ref_attrs in paper_ref_attrs.items():
                years = []
                c5s = []
                c10s = []
                all_subjs = []

                for year, c5, c10, subjs in ref_attrs:
                    # Year difference between the reference and the citing paper.
                    years.append(int(year) - int(paper_year[pid]))
                    c5s.append(c5)
                    c10s.append(c10)
                    all_subjs.append(subjs)

                # Papers with fewer than five references are skipped.
                if len(years) < 5:
                    continue

                year_div = gini(np.array(years))
                c5_div = gini(c5s)
                c10_div = gini(c10s)
                subj_div = cal_subj_div(all_subjs, subj_refnum, subj_totalnum,
                                        citnum_total)

                pid_div_vs[pid] = [year_div, c5_div, c10_div, subj_div]

            of.write(json.dumps(pid_div_vs) + "\n")

    logging.info('paper attr done.')
    def fit(self, X, y):
        """Build the decision tree from the training set (X, y).

        The training set has m data points (examples), each with n features.

        Args:
            X: a pandas.DataFrame holding the training input (m x n).
            y: a pandas.DataFrame holding the labels (m x 1).

        Returns:
            ``self.root``.

        Note:
            This is still exploratory: for every feature it splits the data
            at the feature's mean and prints the information gain of that
            split; nothing is stored on the tree yet.
        """
        # Iterate over every column of X instead of assuming exactly four.
        for feat in range(X.shape[1]):
            column = X.iloc[:, feat]
            threshold = np.mean(column)
            right, left = self.split_(X, feat, threshold)
            gain = information_gain(X, [right, left])
            print(gain)
        return self.root
コード例 #4
0
def diversity_of_equal_percentile(pid_citnum, N):
    """Split papers into N equal-citation-share groups and measure their spread.

    Papers are visited from most to least cited.  Each time the accumulated
    share of all citations grows by at least ``1 / N`` since the last cut, a
    group is closed and the fraction of papers it contains is recorded.

    Args:
        pid_citnum: mapping from paper id to its numeric citation count.
        N: number of equal citation-share groups to aim for.

    Returns:
        ``(percents, diversity)`` where ``percents`` lists the fraction of
        papers in each closed group and ``diversity`` is ``gini(percents)``.
    """
    # Materialise the values: numpy cannot sum a Python 3 dict view, and the
    # values are needed more than once anyway.
    cits = list(pid_citnum.values())
    total = float(np.sum(cits))
    num = float(len(cits))
    step = 1 / float(N)

    acc_total = 0
    c_p = 0  # citation share already assigned to closed groups
    num_of_p = 0  # papers in the group currently being filled

    percents = []
    for v in sorted(cits, key=int, reverse=True):
        acc_total += v
        num_of_p += 1
        if acc_total / total - c_p >= step:
            c_p += step
            percents.append(num_of_p / num)
            num_of_p = 0

    # With the share of papers per group known, compute their diversity.
    diversity = gini(percents)

    return percents, diversity
コード例 #5
0
ファイル: gini_wrapper.py プロジェクト: luckchem/GiniQC
def main():
	"""Compute GiniQC metrics for every cell listed in an input file.

	Command-line arguments (``sys.argv``):
		1: path of a text file listing one .cool file per line
		2: path of the output report
		3: minimum number of reads (int)
		4: minimum percentage of cis reads (float)
		5: maximum chromosomal aberration (float)
		6: minimum adjusted Gini value (float)

	Entries without a "/" are resolved relative to the directory of the
	listing file.  Entries that cannot be opened by cooler are reported and
	skipped instead of being processed with a missing or stale matrix.
	"""
	# parse arguments
	list_path = sys.argv[1]
	reads_threshold = int(sys.argv[3])
	cis_threshold = float(sys.argv[4])
	max_aberration = float(sys.argv[5])
	gini_threshold = float(sys.argv[6].strip())

	with open(list_path, 'r') as infile, open(sys.argv[2], 'w') as outfile:
		# print header lines
		outfile.write(OUTFILE_HEADER)
		outfile.write(LINE_STRUCTURE % ("THRESHOLD", reads_threshold, cis_threshold, 0.0, gini_threshold, max_aberration, "N/A"))

		# calculate GiniQC and other metrics for each cell. Then write each to outfile
		lines = infile.readlines()
		for line in tqdm(lines):
			if "/" in line:
				cool_path = line.strip()
			else:
				cool_path = "/".join(list_path.split("/")[:-1]) + "/" + line.strip()
			cell_name = line.split(".cool")[0]
			try:
				matrix = cooler.Cooler(cool_path)
			except Exception:
				print("Files must be in cool format")
				# Without a matrix there is nothing to measure for this entry.
				continue
			normalized, reads, cis_reads, trans_reads = normalize_matrix(matrix)
			percent_cis = 100.0 * cis_reads / reads
			raw_gini = gini(normalized)
			adj_gini = adjust(raw_gini, reads)
			chrom_aberration = get_max_aberration(matrix)

			passed = (reads > reads_threshold) and (percent_cis > cis_threshold) and (adj_gini > gini_threshold) and (chrom_aberration < max_aberration)
			outfile.write(LINE_STRUCTURE % (cell_name, reads, percent_cis, raw_gini, adj_gini, chrom_aberration, passed))
コード例 #6
0
 def test_gini_val(self):
     """The third element returned by gini.gini on the class data must be 0.0."""
     features, labels = read_data_class()
     result = gini.gini(features, labels, 3)
     self.assertEqual(0.0, result[2])
コード例 #7
0
def cal_subj_div(subj_totalnum, subj_nums, subjs, subj_subj_sim):
    """Return the subject diversity as variety * balance * disparity.

    * variety: number of entries in ``subjs`` divided by ``subj_totalnum``
    * balance: ``gini(subj_nums)``
    * disparity: ``cal_disparsity(subjs, subj_subj_sim)``
    """
    subject_count = len(subjs)
    variety = subject_count / float(subj_totalnum)
    return variety * gini(subj_nums) * cal_disparsity(subjs, subj_subj_sim)
コード例 #8
0
ファイル: threshold.py プロジェクト: luckchem/GiniQC
def get_threshold(combos, cull_by_cis, bedfile):
    """Build randomly merged cell pairs and compute their Gini statistics.

    For every pair in ``combos`` whose two members differ, the raw pixel
    matrices of both cells are loaded, a random subset of their combined
    reads is drawn, and the result is written to a temporary cooler file
    (``temp.cool``) from which the normalized matrix and Gini values are
    computed.

    Args:
        combos: iterable of two-element pairs identifying the cells to merge.
        cull_by_cis: when true, skip pairs where either cell's
            ``calculate_cistrans`` value is below ``int(sys.argv[3])``.
        bedfile: passed to ``make_df`` to build the bins table.

    Returns:
        ``(reads, rawgini, adjustedgini)``: three dicts keyed by the pair
        (as a tuple).
    """
    reads = {}
    rawgini = {}
    adjustedgini = {}
    bins_df = make_df(bedfile)

    if cull_by_cis:
        cis_threshold = int(sys.argv[3])

    for pair in tqdm(combos):
        if pair[0] == pair[1]:
            continue
        pair = tuple(pair)
        cool1, cool2 = get_cools(pair)
        try:
            matrix1 = np.array(cool1.matrix(as_pixels=True, balance=False)[:])
            matrix2 = np.array(cool2.matrix(as_pixels=True, balance=False)[:])
        except Exception:
            # Unreadable matrices are skipped; Ctrl-C and SystemExit still
            # propagate, unlike with a bare ``except``.
            continue
        numreads1 = sum(matrix1[:, -1])
        numreads2 = sum(matrix2[:, -1])
        totalreads = numreads1 + numreads2

        if cull_by_cis and (calculate_cistrans(matrix1) < cis_threshold
                            or calculate_cistrans(matrix2) < cis_threshold):
            continue

        if numreads1 == 0 or numreads2 == 0 or totalreads < 50000:
            continue

        # Number of reads to keep: drawn around half of the combined total.
        numtoselect = int(
            abs(np.random.normal(totalreads / 2, totalreads / 20)))

        rands = np.random.choice(np.arange(1, totalreads),
                                 numtoselect,
                                 replace=False)
        rands.sort()

        pixel_df = fill_pixel_df(rands, matrix1, matrix2, numreads1, numreads2)
        try:
            cooler.create_cooler("temp.cool",
                                 bins=bins_df,
                                 pixels=pixel_df,
                                 dtypes={
                                     'bin1_id': int,
                                     'bin2_id': int,
                                     'count': int
                                 },
                                 ordered=True)
            newcool = cooler.Cooler("temp.cool")

            normalized, reads[pair], cis, trans = normalize_matrix(newcool)
            rawgini[pair] = gini(normalized)
            adjustedgini[pair] = adjust(rawgini[pair], reads[pair])
        finally:
            # Never leave the scratch file behind, even when a step fails.
            if os.path.exists("temp.cool"):
                os.unlink("temp.cool")

    return reads, rawgini, adjustedgini
コード例 #9
0
ファイル: tree.py プロジェクト: wildberry93/RandomTrees
    def plantclass(X, y, num):
        """
        Recursively build a classification tree.

        Input:
        X - matrix with the examples used to build the tree
        y - vector with the decisions
        num - number of features among which gini picks the split value

        The split value is chosen from the result of gini.gini. Building stops
        when the Gini value of a node equals 0.0 - the node then receives
        leaves holding the decisions.
        """
        best = gini.gini(X, y, num)
        split_index, split_value, impurity = best[0], best[1], best[2]
        set1, set2, y1, y2 = Tree.divideset(X, split_index, split_value, y)

        if impurity != 0:
            # Impure node: recurse into each non-empty side; an empty side
            # becomes a leaf with a random 0/1 decision.
            if len(set1) != 0:
                trueBranch = Tree.plantclass(set1, y1, num)
            else:
                trueBranch = leaf.Leaf(random.randint(0, 1))
            if len(set2) != 0:
                falseBranch = Tree.plantclass(set2, y2, num)
            else:
                falseBranch = leaf.Leaf(random.randint(0, 1))
            return node.Node(tb=trueBranch,
                             fb=falseBranch,
                             value=split_value,
                             index=split_index,
                             gn=impurity)

        # Pure node: derive both leaf decisions from the labels on each side.
        has_true = len(y1) > 0
        has_false = len(y2) > 0
        if has_true and has_false:
            tbval = y1[0]
            fbval = y2[0]
        elif has_false:
            # Only the false side has labels; the true side gets |label - 1|.
            fbval = float(y2[0])
            tbval = abs(fbval - 1)
        elif has_true:
            tbval = float(y1[0])
            fbval = abs(tbval - 1)

        return node.Node(tb=leaf.Leaf(tbval),
                         fb=leaf.Leaf(fbval),
                         value=split_value,
                         index=split_index,
                         gn=impurity)
コード例 #10
0
def cal_subj_div(all_subjs,subj_refnum,subj_totalnum,citnum_total):
    """Compute subject diversity and its three components.

    ``all_subjs`` holds one collection of subjects per reference.  Returns the
    tuple ``(variety * balance * disparity, variety, balance, disparity)``.
    """
    groups = list(all_subjs)

    # Number of subjects attached to each reference.
    subj_num = [len(group) for group in groups]

    # Distinct subjects over all references.
    flattened = [subj for group in groups for subj in group]
    subj_set = list(set(flattened))

    # nc / N: share of all known subjects that appear here.
    variety = len(subj_set) / float(subj_totalnum)
    balance = gini(subj_num)
    disparsity = cal_disparsity(subj_set, subj_refnum, citnum_total)

    score = variety * balance * disparsity
    return score, variety, balance, disparsity
コード例 #11
0
ファイル: tree.py プロジェクト: wildberry93/RandomTrees
    def plantclass(X, y, num):
        """
        Grow a classification tree by recursive splitting.

        Input:
        X - matrix of the examples the tree is built from
        y - vector of decisions
        num - number of features from which gini chooses the split value

        A node whose Gini value is 0.0 ends the recursion and gets two leaves
        carrying the decisions; any other node is split further.
        """
        gini_tup = gini.gini(X, y, num)
        feature, threshold, score = gini_tup[0], gini_tup[1], gini_tup[2]
        set1, set2, y1, y2 = Tree.divideset(X, feature, threshold, y)

        if score != 0:
            # Keep splitting; a side without examples becomes a random leaf.
            if len(set1) != 0:
                trueBranch = Tree.plantclass(set1, y1, num)
            else:
                trueBranch = leaf.Leaf(random.randint(0, 1))
            if len(set2) != 0:
                falseBranch = Tree.plantclass(set2, y2, num)
            else:
                falseBranch = leaf.Leaf(random.randint(0, 1))
            return node.Node(tb=trueBranch, fb=falseBranch, value=threshold,
                             index=feature, gn=score)

        # Gini of 0: build the two decision leaves.
        true_labelled = len(y1) > 0
        false_labelled = len(y2) > 0
        if true_labelled and false_labelled:
            tbval = y1[0]
            fbval = y2[0]
        elif false_labelled:
            # No labels on the true side: it receives |false label - 1|.
            fbval = float(y2[0])
            tbval = abs(fbval - 1)
        elif true_labelled:
            tbval = float(y1[0])
            fbval = abs(tbval - 1)

        return node.Node(tb=leaf.Leaf(tbval), fb=leaf.Leaf(fbval),
                         value=threshold, index=feature, gn=score)
コード例 #12
0
def best_value(rows, col):
    """
    Find the threshold on column ``col`` that yields the largest gain.

    Every integer value from 2 to 10 is tried as a ``Question(col, value)``;
    ``rows`` is partitioned with it and candidates that leave either side
    empty are ignored.  The gain of each remaining split is computed with
    ``info_gain`` against ``gini(rows)``.

    Returns ``(best_gain, best_question)``; ``(0, None)`` when no candidate
    produced a gain above zero.
    """
    baseline = gini(rows)
    top_gain = 0  # best information gain seen so far
    top_question = None  # question that produced it

    for threshold in range(2, 11):
        candidate = Question(col, threshold)
        matched, unmatched = partition(rows, candidate)
        if len(matched) != 0 and len(unmatched) != 0:
            gain = info_gain(matched, unmatched, baseline)
            if gain > top_gain:
                top_gain = gain
                top_question = candidate

    return top_gain, top_question
コード例 #13
0
ファイル: gini_test.py プロジェクト: mrckostecki/RandomTrees
# -*- coding: UTF-8 -*-
import gini

# Feature table: one tab-separated row per line of the data file.
with open("gini_dane.txt", "r") as f:
    tabela = [row.strip().split("\t") for row in f]

# Class label of every row, one per line.
with open("gini_klasyfikacje.txt", "r") as g:
    y = [label.strip() for label in g]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Tabela X z wartościami y po prawej stronie:")
for row, label in zip(tabela, y):
    print(row + [label])


print("Wynik działania funkcji na zbiorze danych\n(indeks cechy / wartość cechy / wartość Gini impurity):")
a = gini.gini(tabela, y, 3)
print(a)
コード例 #14
0
    def run_sim(self):
        """Roll out the current policy in the environment and record the run.

        Puts ``self.policy`` in eval mode and, with gradients disabled, steps
        ``self.env`` for ``self.trajectory_len`` steps over
        ``self.n_trajectories`` parallel trajectories.  Appends one value each
        to ``self.pop_rate``, ``self.hit_rate`` and ``self.gini_coefficient``.

        Returns:
            A ``Memory`` built from the collected trajectories.
        """
        self.policy.eval()
        with torch.no_grad():
            trajectories = np.asarray(
                [Trajectory() for i in range(self.n_trajectories)])

            # One item is recommended per step and trajectory.
            ra_length = 1
            #             epsilon = 0.9
            item_embeds = torch.from_numpy(self.item_embeddings).to(
                self.device).float()

            ave_score = 0
            ave_cost = 0
            states = self.env.reset()
            #             print(states.shape)
            # Marks, per trajectory, which items have already been recommended.
            recommended_item_onehot = torch.FloatTensor(
                self.n_trajectories, self.nb_item).zero_().to(self.device)
            recommendations = []
            for t in range(self.trajectory_len):
                policy_input = torch.FloatTensor(states).to(self.device).view(
                    self.n_trajectories, -1)
                weight_dists = self.policy(policy_input)
                w = weight_dists.sample()
                #                 print(w.shape)
                # Score every item as the product of the sampled weights and
                # the item embeddings.
                item_weights = torch.mm(w.view(-1, item_embeds.shape[1]),
                                        item_embeds.transpose(0, 1)).view(
                                            self.n_trajectories, ra_length, -1)
                # Multiply the scores of already-recommended items by zero.
                # NOTE(review): this only keeps them out of the argmax if the
                # remaining scores are positive - confirm.
                item_weights = torch.mul(item_weights.transpose(0, 1),
                                         1 - recommended_item_onehot).reshape(
                                             states.shape[0], ra_length, -1)
                item_idxes = torch.argmax(item_weights, dim=2)

                recommendations.append(item_idxes)
                recommended_item_onehot = recommended_item_onehot.scatter_(
                    1, item_idxes, 1)

                # The action passed to the environment is the embedding of the
                # chosen item.
                actions = item_embeds[item_idxes.cpu().detach()]
                states_prime, rewards, costs, info = self.env.step(
                    actions, item_idxes)

                # Record this step in every trajectory.
                for i in range(len(trajectories)):
                    trajectory = trajectories[i]
                    trajectory.observations.append(policy_input[i].to(
                        self.device).squeeze())
                    trajectory.actions.append(actions[i].to(
                        self.device).squeeze())
                    trajectory.rewards.append(rewards[i].to(
                        self.device).squeeze())
                    trajectory.costs.append(costs[i].to(self.device).squeeze())

                states = states_prime
                ave_score += torch.sum(info).detach().cpu()
                ave_cost += torch.sum(costs).detach().cpu()

            memory = Memory(trajectories)

            #             print(ave_score.float()/(self.trajectory_len*self.n_trajectories), ave_cost/(self.trajectory_len*self.n_trajectories))
            # Mean cost per step and trajectory.
            self.pop_rate.append(ave_cost /
                                 (self.trajectory_len * self.n_trajectories))

            # NOTE(review): recommendation_tensor and idx are not used below.
            recommendation_tensor = torch.cat(recommendations, 1)
            idx, val = torch.unique(torch.cat(recommendations),
                                    return_counts=True)
            # Mean of the summed ``info`` values per step and trajectory.
            hr = (ave_score.float() /
                  (self.trajectory_len * self.n_trajectories)).cpu().numpy()
            self.hit_rate.append(hr)

            # Pad the per-item recommendation counts with zeros for the items
            # that were never recommended, then take the Gini of the counts.
            val_ = torch.cat(
                (val.float(),
                 torch.zeros(self.nb_item - len(val)).to(self.device)))
            g = gini(val_.cpu().numpy())
            self.gini_coefficient.append(g)

            return memory
    print('First best output {}, capital {}, interest rate {}, hours {} and labour supply {}'.format(fb_Y, fb_K, fb_r, fb_H, fb_L))

    results_FB = dict( (name, eval(name)) for name in ['fb_Y', 'fb_K', 'fb_r', 'fb_w', 'fb_H', 'fb_L'])


    #====Calculate Incomplete Market Results ===#

    eqm_r_IM = brentq(Gamma_IM, -fp.delta*.95, (1-cp.beta)/cp.beta, xtol = tol_brent)


    im_r, im_w, im_Lambda, im_K, im_L, im_H, im_coefvar, im_a, im_z_rlz, im_h_val, im_l_val, im_policy  = compute_agg_prices(cp,z_seq, social =1)
    
    im_Y = KL_to_Y(im_K, im_L, fp)

    im_gini_a = gini(im_a)

    im_gini_i = gini(im_z_rlz)

    results_IM = dict( (name, eval(name)) for name in ['im_r', 'im_w', 'im_Lambda', 'im_K', 'im_L',\
                                                        'im_H', 'im_coefvar', 'im_a', 'im_z_rlz', 'im_h_val',\
                                                        'im_l_val', 'im_policy', 'im_Y', 'im_gini_a',  'im_gini_i'])
    

    print('Incomplete market capital {}, interest rate {}, hours {} and labour supply {} and Lambda {}'.format(im_K, im_r, im_H, im_L, im_Lambda))
  
    cp.r = eqm_r_IM
    cp.R = 1+cp.r
    cp.w = r_to_w(cp.r,fp, cp)
    cap_lab_ratio = ((cp.r+fp.delta)/fp.alpha)**(1/(fp.alpha-1))
    
コード例 #16
0
ファイル: deptReport.py プロジェクト: jtleider/uisalaries
def selection():
	"""Build the data for the currently selected department.

	Reads the current widget values (variable, campus, college, department,
	number of rows to exclude) and returns a 3-tuple:
	the department rows sorted by value in ascending order with the last
	``excludeSlider.value`` rows cut off, the median of the value in
	thousands, and ``gini`` of the value in thousands.
	"""
	if selectVariable.value == 'Previous Salary (AY 2016-2017)':
		var = 'cursalaryperfte'
	else:
		var = 'newsalaryperfte'
	campus = selectCampus.value
	college = selectCollege.value
	dept = selectDept.value
	n_excluded = excludeSlider.value
	exclude = slice(None, -n_excluded) if n_excluded > 0 else slice(None)

	in_dept = (salaries.campus == campus) & (salaries.college == college) & (salaries.dept == dept)
	columns = ['empname', 'empdepttitle', var]
	df = salaries.loc[in_dept, columns].rename(columns={var: 'value'})

	# Rank 1 is the highest value.
	df['Rank'] = df['value'].rank(ascending=False)
	df['ylabel'] = df.apply(lambda row: '{:g} {}'.format(row['Rank'], row['empname']), axis=1)
	df['value_scaled'] = df['value']/1000

	ordered = df.sort_values('value', na_position='first', ascending=True).iloc[exclude]
	median = df['value_scaled'].quantile(0.5)
	return ordered, median, gini(df['value_scaled'])
コード例 #17
0
def _load_json(path):
    """Parse the JSON document stored at ``path`` and close the file."""
    with open(path) as handle:
        return json.loads(handle.read())


def cal_wos_paper_divs():
    """Compute reference-diversity indicators for WOS papers and plot summaries.

    For every paper published between 1980 and 2004 that has between 4 and
    100 references, the year differences, c2/c5/c10 values and their
    percentiles are collected over its references.  For each of these the
    ``gini`` value, mean and standard deviation are stored, together with the
    subject diversity, in ``data/pid_divs.json``.

    Two figures are also written: the distribution of how often the retained
    references are cited (``fig/citation_distritbuion.png``) and the
    distribution of reference counts (``fig/refnum_distribution.png``).
    """
    pid_pubyear, pid_subjects, pid_topsubjs, pid_teamsize = load_basic_data()
    # pid -> c2 / c5 / c10
    pid_c2 = _load_json('../WOS_data_processing/data/pid_c2.json')
    pid_c5 = _load_json('../WOS_data_processing/data/pid_c5.json')
    pid_c10 = _load_json('../WOS_data_processing/data/pid_c10.json')
    # subject-subject similarity
    subj_subj_sim = _load_json(
        '../WOS_data_processing/data/subj_subj_sim.json')

    # Percentile lookup for the c2, c5 and c10 values.
    c2_percentile = nums_to_percentile_dict(pid_c2.values())
    c5_percentile = nums_to_percentile_dict(pid_c5.values())
    c10_percentile = nums_to_percentile_dict(pid_c10.values())

    subj_totalnum = float(len(subj_subj_sim.keys()))

    pid_divs = {}

    progress = 0
    sub_progress = 0

    total_paper_num = 0
    total_cit_num = 0
    selected_paper_num = 0
    selected_cit_num = 0

    cn_dis = defaultdict(int)  # how often each reference is cited
    ref_nums = defaultdict(int)  # number of papers per reference count

    with open('../WOS_data_processing/data/pid_refs.txt') as refs_file:
        for line in refs_file:
            line = line.strip()
            progress += 1

            pid_refs = json.loads(line)

            for pid, refs in pid_refs.items():
                sub_progress += 1

                if sub_progress % 1000000 == 0:
                    logging.info('progress:{},sub progress {} ...'.format(
                        progress, sub_progress))

                pubyear = int(pid_pubyear.get(pid, 9999))

                # Only papers published from 1980 through 2004 are kept.
                if pubyear > 2004 or pubyear < 1980:
                    continue

                ref_num = len(refs)

                total_paper_num += 1
                total_cit_num += ref_num
                ref_nums[ref_num] += 1

                if ref_num < 4 or ref_num > 100:
                    continue

                selected_cit_num += ref_num
                selected_paper_num += 1

                # Per paper: year differences, subject diversity and the
                # c2 / c5 / c10 diversity of its references.
                yds = []
                subjs = []
                subj_nums = []
                c2s = []
                c5s = []
                c10s = []

                c2ps = []
                c5ps = []
                c10ps = []

                for ref_id in refs:
                    # NOTE(review): raises KeyError when a reference has no
                    # publication year - confirm the input guarantees one.
                    yds.append(abs(int(pid_pubyear[ref_id]) - pubyear))

                    c2 = pid_c2.get(ref_id, 0)
                    c5 = pid_c5.get(ref_id, 0)
                    c10 = pid_c10.get(ref_id, 0)

                    c2s.append(c2)
                    c5s.append(c5)
                    c10s.append(c10)

                    c2ps.append(c2_percentile[c2])
                    c5ps.append(c5_percentile[c5])
                    c10ps.append(c10_percentile[c10])

                    # A reference without subjects counts as zero subjects in
                    # both places, instead of raising KeyError in the second.
                    ref_subjects = pid_subjects.get(ref_id, [])
                    subj_nums.append(len(ref_subjects))
                    subjs.extend(ref_subjects)

                    cn_dis[ref_id] += 1

                # Diversity of the paper's references from the values above.
                yd_div = gini(yds)
                c2_div = gini(c2s)
                c5_div = gini(c5s)
                c10_div = gini(c10s)

                # Means and standard deviations.
                yd_mean = np.mean(yds)
                yd_std = np.std(yds)

                c2_mean = np.mean(c2s)
                c2_std = np.std(c2s)

                c5_mean = np.mean(c5s)
                c5_std = np.std(c5s)

                c10_mean = np.mean(c10s)
                c10_std = np.std(c10s)

                c2p_div = gini(c2ps)
                c5p_div = gini(c5ps)
                c10p_div = gini(c10ps)

                c2p_mean = np.mean(c2ps)
                c2p_std = np.std(c2ps)

                c5p_mean = np.mean(c5ps)
                c5p_std = np.std(c5ps)

                c10p_mean = np.mean(c10ps)
                c10p_std = np.std(c10ps)

                subjs = list(set(subjs))

                if len(subjs) <= 1:
                    subj_div = 0
                else:
                    subj_div = cal_subj_div(subj_totalnum, subj_nums, subjs,
                                            subj_subj_sim)

                pid_divs[pid] = [
                    yd_div, subj_div, c2_div, c5_div, c10_div, yd_mean,
                    yd_std, c2_mean, c2_std, c5_mean, c5_std, c10_mean,
                    c10_std, c2p_div, c5p_div, c10p_div, c2p_mean, c2p_std,
                    c5p_mean, c5p_std, c10p_mean, c10p_std
                ]

    with open('data/pid_divs.json', 'w') as out_file:
        out_file.write(json.dumps(pid_divs))
    logging.info('{} papers div data saved to data/pid_divs.json'.format(
        len(pid_divs.keys())))

    # Report the collected totals.
    print('===============================')
    print('Total paper num:', total_paper_num, ',total num of citation links:',
          total_cit_num)
    print('reserved paper num:', selected_paper_num,
          ',reserved num of citation links:', selected_cit_num)

    # Plot the distribution of citation counts of the retained references.
    cc_counter = Counter(cn_dis.values())

    xs = []
    ys = []
    for cc in sorted(cc_counter.keys()):

        if cc == 100:
            print('Number of papers cited 100 times:', cc_counter[cc])

        xs.append(cc)
        ys.append(cc_counter[cc])

    plt.figure(figsize=(7, 5))
    plt.plot(xs, ys, 'o', fillstyle='none')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('number of citations')
    plt.ylabel('number of publications')
    plt.tight_layout()
    plt.savefig('fig/citation_distritbuion.png', dpi=400)

    # Plot the distribution of the number of references (up to 100).
    xs = []
    ys = []
    for rn in sorted(ref_nums.keys()):

        if rn > 100:
            continue

        xs.append(rn)
        ys.append(ref_nums[rn])

    plt.figure(figsize=(7, 5))
    plt.plot(xs, ys)
    plt.xlabel('number of references')
    plt.ylabel('number of publications')
    plt.xscale('log')
    plt.yscale('log')
    plt.tight_layout()
    plt.savefig('fig/refnum_distribution.png', dpi=400)

    print('DONE')
コード例 #18
0
ファイル: test.py プロジェクト: Artygo8/python_ml_bootcamp
import numpy as np
from gini import gini

# Disabled cases; the original script listed None as the output of each:
# print(gini(np.array([])))
# print(gini(np.array({1, 2})))
# print(gini(np.array('bob')))

# Each sample is followed by the output listed for it in the original script.
samples = [
    np.array([0, 0, 0, 0, 0, 0]),                            # 0.0
    np.array([6]),                                           # 0.0
    np.array(['a', 'a', 'b', 'b']),                          # 1.0
    np.array(['0', '0', '1', '0', 'bob', '1']),              # 1.4591479170272448
    np.array([0, 0, 1, 0, 2, 1]),                            # 1.4591479170272448
    np.array(['0', 'bob', '1']),                             # 1.584962500721156
    np.array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),      # 0.0
    np.array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),      # 0.4689955935892812
    np.array([0, 0, 1]),                                     # 0.9182958340544896
]
for sample in samples:
    print(gini(sample))
コード例 #19
0
def find_measures(year, month=0, pnt=False):
    """Compute ownership-concentration measures per firm and save them as CSV.

    Reads the shareholder file of the given year (or year and month), groups
    it by ``Symbol`` and derives, per firm: the largest owner, the ratio of
    the first to the second owner, first to the sum of the second through
    fourth, the sums of the top two to five owners, the Gini value and the
    Herfindahl index of the ``percent`` column.  The Shapley-Shubik and
    Banzhaf columns are taken from a previously saved measures file when one
    can be read, otherwise initialised to NaN, and then filled by
    ``fill_shapley_banzhaf``.

    Args:
        year: year of the shareholder data.
        month: month of the data; ``0`` selects the yearly files.
        pnt: forwarded to ``fill_shapley_banzhaf`` as ``time_pnt``.

    Returns:
        A message string naming the written measures file.

    Note:
        The function changes the working directory to a hard-coded data path.
    """
    # ``==`` instead of ``is``: identity comparison with a literal is
    # implementation-dependent and raises a SyntaxWarning on Python 3.8+.
    if month == 0:
        # Yearly
        file_name_holderdata = "Shareholder" + str(year) + ".csv"
        file_name_measures = "Measures" + str(year) + ".csv"
        path = r"C:\Users\Mahdi\OneDrive\Master Thesis\Data"
    else:
        file_name_holderdata = "Shareholder" + str(year) + '_' + str(
            month) + ".csv"
        file_name_measures = "Measures" + str(year) + '_' + str(month) + ".csv"
        path = r"C:\Users\Mahdi\OneDrive\Master Thesis\Data\MonthlyShareholder"

    os.chdir(path)
    SDATA = pd.read_csv(file_name_holderdata, index_col=0).drop_duplicates()

    # Create the dataframe that collects the concentration measures
    CMdf = SDATA.groupby('Symbol', as_index=False).agg({
        'Id_tse': 'first',
        'Industry': 'first',
        'percent': 'sum',
        'ShareHolder': 'count',
        'MarketCap': 'first'
    }).rename(columns={
        'ShareHolder': 'Num_holders',
        'percent': 'sum_over1'
    })
    CMdf.reset_index(drop=True, inplace=True)
    Orginal_Size = len(CMdf)
    print('Number of observed firms in year ', str(year), ' is : ',
          Orginal_Size)

    # Largest Owner
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': 'max'
    }).rename(columns={'percent': 'Largest_Owner'})
    CMdf = pd.merge(CMdf,
                    temp,
                    left_on='Symbol',
                    right_on='Symbol',
                    how='left')

    # First/Second
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: max(x) / nth_max(x, nth=2, interval=False)}
    }).rename(columns={'percent': 'First_Second'})
    CMdf = pd.merge(CMdf,
                    temp,
                    left_on='Symbol',
                    right_on='Symbol',
                    how='left').rename(
                        columns={('First_Second', '<lambda>'): 'First_Second'})

    # First/SumToFour
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent':
        {lambda x: max(x) / sum(nth_max(x, nth=[2, 4], interval=True))}
    }).rename(columns={'percent': 'First_Sumtwofour'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol', how='left').rename(
            columns={('First_Sumtwofour', '<lambda>'): 'First_Sumtwofour'})

    # Sumfive
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: sum(nth_max(x, nth=[1, 5], interval=True))}
    }).rename(columns={'percent': 'Sumfive'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol',
        how='left').rename(columns={('Sumfive', '<lambda>'): 'Sumfive'})

    # Sumfour
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: sum(nth_max(x, nth=[1, 4], interval=True))}
    }).rename(columns={'percent': 'Sumfour'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol',
        how='left').rename(columns={('Sumfour', '<lambda>'): 'Sumfour'})

    # Sumthree
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: sum(nth_max(x, nth=[1, 3], interval=True))}
    }).rename(columns={'percent': 'Sumthree'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol',
        how='left').rename(columns={('Sumthree', '<lambda>'): 'Sumthree'})

    # Sumtwo
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: sum(nth_max(x, nth=[1, 2], interval=True))}
    }).rename(columns={'percent': 'Sumtwo'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol',
        how='left').rename(columns={('Sumtwo', '<lambda>'): 'Sumtwo'})

    # Gini
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: gini(list(x))}
    }).rename(columns={'percent': 'Gini'})
    CMdf = pd.merge(CMdf,
                    temp,
                    left_on='Symbol',
                    right_on='Symbol',
                    how='left').rename(columns={('Gini', '<lambda>'): 'Gini'})

    # Herfindahl
    temp = SDATA.groupby('Symbol', as_index=False).agg({
        'percent': {lambda x: sum([(t / 100)**2 for t in list(x)])}
    }).rename(columns={'percent': 'Herfindhal'})
    CMdf = pd.merge(
        CMdf, temp, left_on='Symbol', right_on='Symbol',
        how='left').rename(columns={('Herfindhal', '<lambda>'): 'Herfindhal'})

    # Shapley-Shubik / Banzhaf
    # Reuse the values of an earlier run when its measures file is readable.
    try:
        os.chdir(path)
        CMdf_load = pd.read_csv(file_name_measures)
        CMdf = pd.merge(CMdf,
                        CMdf_load[[
                            'Symbol', 'SSCL', 'SSCO', 'SSDL', 'SSDO', 'BZCL',
                            'BZCO', 'BZDL'
                        ]],
                        left_on='Symbol',
                        right_on='Symbol',
                        how='left')
        print('RE-FILL!')
    except Exception:
        # Missing or unusable measures file: start with empty columns.
        # ``Exception`` rather than a bare except so that Ctrl-C still works.
        print('NEW!')
        CMdf['SSCL'] = np.nan
        CMdf['SSCO'] = np.nan
        CMdf['SSDL'] = np.nan
        CMdf['SSDO'] = np.nan
        CMdf['BZCL'] = np.nan
        CMdf['BZCO'] = np.nan
        CMdf['BZDL'] = np.nan

    data = fill_shapley_banzhaf(data=CMdf,
                                SDATA=SDATA,
                                fast_mode=True,
                                time_pnt=pnt,
                                major_thr=10)
    CMdf = data['CMdf']

    print('len(Errors): ', len(data['Errors']))

    Output_Size = len(CMdf)
    print('Orginal Size is ', Orginal_Size, ' and output size is: ',
          Output_Size)
    os.chdir(path)
    CMdf.to_csv(file_name_measures)
    return (file_name_measures + ' is done!\n')