def create_tree(self, dataset, attributes):
    """Create decision tree using the training set.

    Arguments:
        dataset {list} -- Training dataset.
        attributes {list} -- Attributes.

    Returns:
        dictionary -- Trained decision tree.
    """
    class_list = [sample[-1] for sample in dataset]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    if len(dataset[0]) == 1:
        return majority_count(class_list)
    selected_attribute = self._select_attribute(dataset)
    selected_attribute_label = attributes[selected_attribute]
    print('Current selected attribute:', selected_attribute_label)
    tree = {selected_attribute_label: {}}
    del attributes[selected_attribute]
    attribute_list = [sample[selected_attribute] for sample in dataset]
    unique_vals = set(attribute_list)
    for val in unique_vals:
        sub_attributes = attributes[:]
        tree[selected_attribute_label][val] = self.create_tree(
            get_subset(dataset, selected_attribute, val), sub_attributes)
    return tree
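# The tree builder above assumes two helpers that are not shown in this
# snippet. A minimal sketch of both (an assumption following the classic
# "split by attribute value" pattern, not necessarily the author's code):
from collections import Counter


def get_subset(dataset, axis, value):
    """Return the samples whose attribute at `axis` equals `value`,
    with that attribute column removed."""
    subset = []
    for sample in dataset:
        if sample[axis] == value:
            subset.append(sample[:axis] + sample[axis + 1:])
    return subset


def majority_count(class_list):
    """Return the most frequent class label."""
    return Counter(class_list).most_common(1)[0][0]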
def fpGrowth(tree, alpha):
    if tree.has_unique_branch():
        # Single-branch case: every combination of the branch's nodes is a
        # candidate frequent pattern.
        nodes = tree.get_all_node_exclude_root()
        for node_list in get_subset(nodes):
            beta = FrequentPattern(node_list)
            if beta.support_count >= tree.min_sup:
                beta = beta | alpha
                print(beta)
    else:
        print(tree.get_head_table())
        for item in tree.get_head_table():
            # Generate pattern beta from the current header-table item.
            beta = FrequentPattern([item])
            beta = beta | alpha
            print(beta)

            conditional_pattern_base = []

            def get_prefix_path(node):
                path = []
                cur_node = node.get_parent()
                while cur_node and not cur_node.is_root():
                    path.insert(0, cur_node)
                    cur_node = cur_node.get_parent()
                return path

            # Collect beta's conditional pattern base from every prefix path.
            for node in item.nodes:
                prefix_path = get_prefix_path(node)
                if prefix_path:
                    conditional_pattern_base.append(
                        FrequentPattern(prefix_path, support_count=node.cnt))

            if conditional_pattern_base:
                # Build beta's conditional FP-tree and mine it recursively.
                tree_beta = ConditionalFPTree(min_sup=tree.min_sup)
                tree_beta.set_conditional_pattern_base(conditional_pattern_base)
                tree_beta.build()
                fpGrowth(tree_beta, beta)
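# The single-branch case of fpGrowth above iterates over get_subset(nodes),
# which is not shown here. A minimal sketch, assuming it is a plain generator
# of all non-empty node combinations (not necessarily the author's code):
from itertools import combinations


def get_subset(nodes):
    """Yield every non-empty combination of `nodes`, smallest first."""
    for size in range(1, len(nodes) + 1):
        for combo in combinations(nodes, size):
            yield list(combo)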
def load_dataset_svrt(set_num, batch_size, split, trainimages=0,
                      dat_augment=False, return_dataset=False,
                      prep_method='imagenet'):
    dat_dir = '../stimuli/problem_' + str(set_num) + '/'

    if dat_augment:
        print('- do data augmentation')
        prepfun = utils.prep_imagenet_augment
    else:
        prepfun = utils.prep_imagenet

    # Needed for adversarial examples: skip preprocessing entirely.
    if prep_method == 'orig':
        prepfun = None

    # Load dataset.
    dataset = MixedImageFolder(root=dat_dir + split + '/', transform=prepfun)
    if split == 'train':
        shuffle = True
        # Change the size of the training set.
        dataset = utils.get_subset(dataset, trainimages)
    else:
        shuffle = False

    # Define dataloader.
    dataloader = torch.utils.data.DataLoader(dataset, shuffle=shuffle,
                                             batch_size=batch_size)
    if return_dataset:
        return dataloader, dataset
    return dataloader
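# Hypothetical usage sketch: the problem number, batch size and training-set
# size below are made up for illustration only.
train_loader = load_dataset_svrt(set_num=1, batch_size=64, split='train',
                                 trainimages=1000, dat_augment=True)
for images, labels in train_loader:
    pass  # training step would go here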
import numpy as np
from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering
import statsmodels.api as sm
import matplotlib.pyplot as plt

import utils

## Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())
utils.set_mpl_params()

# Load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')

## Select a subset of columns of 'y' to use as a test matrix:
# This is the same each time thanks to setting the random seed.
col_indices = np.random.choice(243, 10, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)

# This will be used for clustering:
distance_matrix = utils.fast_dice(x_)

# Choose a random target:
idx = np.random.choice(y_.shape[1])
all_positive_indices = (y_[:, idx] == 1).nonzero()[0]
pos_test_counts = {index: 0 for index in all_positive_indices}
all_negative_indices = (y_[:, idx] == 0).nonzero()[0]
neg_test_counts = {index: 0 for index in all_negative_indices}

positive_fractions = []
import numpy as np
from scipy import stats, sparse
from scipy.spatial.distance import pdist, squareform
from paris_cluster import ParisClusterer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

import utils

## Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())

# Load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')

## Select a subset of columns of 'y' to use as a test matrix:
# This is the same each time thanks to setting the random seed.
col_indices = np.random.choice(243, 100, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)

# Open a memory-mapped distance matrix.
# We do this because the pairwise distance matrix for 100 targets does not fit in memory:
# it is nearly 100% dense and has 117747 * 117747 = 13864356009 elements. This is also
# why it uses float16 (reducing the required storage space to ~26GB, c.f. 52GB for float32).
distance_matrix = np.memmap('./processed_data/graph_fp_comparison/distMat.dat',
                            dtype=np.float16,
                            shape=(x_.shape[0], x_.shape[0]))

# Build the hierarchical clustering tree:
clusterer = ParisClusterer(x_.toarray())
clusterer.buildAdjacency()
clusterer.fit()
def run():
    functions, phenotypes = load_data()
    terms = list()
    n = len(functions)
    global counter
    counter = Counter()
    global tree
    tree = dict()
    e = 100
    term_index = dict()
    term_list = list()
    # Index all GO and HP terms.
    for go_id in go:
        term_index[go_id] = len(term_index)
        term_list.append(go_id)
    for hp_id in hp:
        term_index[hp_id] = len(term_index)
        term_list.append(hp_id)
    # Count co-occurrences of each (function, phenotype) pair.
    for i in range(n):
        funcs = set(map(lambda x: term_index[x], functions[i]))
        phenos = set(map(lambda x: term_index[x], phenotypes[i]))
        terms.append(funcs | phenos)
        for func in funcs:
            for pheno in phenos:
                counter[frozenset([func, pheno])] += 1
    # Prune pairs below the support threshold e; iterate over a copy so that
    # deleting keys during the loop is safe.
    for s, c in list(counter.items()):
        if c < e:
            del counter[s]
    # For every surviving term, cache its ancestor and sub-ontology terms.
    for s, c in counter.items():
        for term in s:
            if term_list[term] in go:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(go, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(go, term_list[term])))
            else:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(hp, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(hp, term_list[term])))
    print(len(counter))
    pool = Pool(48)
    gf = gzip.open('data/results.gz', 'wt')
    while len(counter) > 0:
        cnts = pool.map(next_level, terms)
        cnt = sum(cnts, Counter())
        print(counter.most_common(10))
        print(cnt.most_common(10))
        for s, c in list(cnt.items()):
            if c < e:
                del cnt[s]
            else:
                gf.write(str(c))
                for term in s:
                    gf.write('\t' + term_list[term])
                gf.write('\n')
        counter = cnt
def test_get_subset():
    foo = {'a': 1, 'b': 2, 'c': 3}

    subset = get_subset(foo, ['a'])
    assert subset == {'a': 1}

    subset = get_subset(foo, ['a', 'b'])
    assert subset == {'a': 1, 'b': 2}
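# A minimal sketch (hypothetical, not the tested project's implementation) of
# a get_subset that satisfies the assertions above: keep only the requested keys.
def get_subset(mapping, keys):
    return {key: mapping[key] for key in keys if key in mapping}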
def test_1(self):
    res = get_subset([1, 2, 3])
    print(res)
    self.assertEqual(
        res,
        [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]],
        msg='get_subset([1, 2, 3]) did not return all non-empty subsets')
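# A minimal sketch (hypothetical, not the tested implementation) of a
# get_subset that returns all non-empty subsets in exactly the order the
# assertion above expects: subsets are grown incrementally, element by element.
def get_subset(items):
    subsets = []
    for item in items:
        # Extend the empty set and every subset found so far with the new item.
        subsets += [existing + [item] for existing in [[]] + subsets]
    return subsets


print(get_subset([1, 2, 3]))
# -> [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]]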
def get_connecting_nodes(diff_start_end_elr_dat, route_name=None, update=False, verbose=False):
    """
    Get data of connecting points for different ELRs.

    :param diff_start_end_elr_dat: data frame where StartELR != EndELR
    :type diff_start_end_elr_dat: pandas.DataFrame
    :param route_name: name of a Route; if ``None`` (default), all Routes
    :type route_name: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``False``
    :type verbose: bool or int
    :return: data of connecting points for different ELRs
    :rtype: pandas.DataFrame

    **Test**::

        from mssqlserver.metex import view_metex_schedule8_incident_locations
        from models.prototype.furlong import get_connecting_nodes

        update = False
        verbose = True

        route_name = None
        diff_start_end_elr_dat = view_metex_schedule8_incident_locations(
            route_name=route_name, start_and_end_elr='diff', verbose=verbose)
        connecting_nodes = get_connecting_nodes(diff_start_end_elr_dat, route_name, update, verbose)
        print(connecting_nodes)

        route_name = 'Anglia'
        diff_start_end_elr_dat = view_metex_schedule8_incident_locations(
            route_name=route_name, start_and_end_elr='diff', verbose=verbose)
        connecting_nodes = get_connecting_nodes(diff_start_end_elr_dat, route_name, update, verbose)
        print(connecting_nodes)
    """
    filename = "connections-between-different-ELRs"
    pickle_filename = make_filename(filename, route_name)
    path_to_pickle = cdd_geodata(pickle_filename)

    if os.path.isfile(path_to_pickle) and not update:
        return load_pickle(path_to_pickle, verbose=verbose)

    try:
        pickle_filename_temp = make_filename(filename)
        path_to_pickle_temp = cdd_geodata(pickle_filename_temp)

        if os.path.isfile(path_to_pickle_temp) and not update:
            connecting_nodes_all = load_pickle(path_to_pickle_temp)
            connecting_nodes = get_subset(connecting_nodes_all, route_name)

        else:
            diff_elr_mileages = diff_start_end_elr_dat.drop_duplicates()

            em = ELRMileages()
            if verbose:
                print("Searching for connecting ELRs ... ", end="")
            mileage_file_dir = cdd_railway_codes("line data\\elrs-and-mileages\\mileages")

            # noinspection PyTypeChecker
            conn_mileages = diff_elr_mileages.apply(
                lambda x: em.get_conn_mileages(x.StartELR, x.EndELR, update,
                                               pickle_mileage_file=True,
                                               data_dir=mileage_file_dir),
                axis=1)
            if verbose:
                print("\nFinished.")

            conn_mileages_data = pd.DataFrame(
                conn_mileages.to_list(), index=diff_elr_mileages.index,
                columns=['StartELR_EndMileage', 'ConnELR', 'ConnELR_StartMileage',
                         'ConnELR_EndMileage', 'EndELR_StartMileage'])

            connecting_nodes = diff_elr_mileages.join(conn_mileages_data)
            connecting_nodes.set_index(['StartELR', 'StartMileage', 'EndELR', 'EndMileage'],
                                       inplace=True)

        save_pickle(connecting_nodes, path_to_pickle, verbose=verbose)

        return connecting_nodes

    except Exception as e:
        print("Failed to get \"{}\". {}.".format(os.path.splitext(pickle_filename)[0], e))
def _select_attribute(self, dataset):
    """Select the attribute to split the subset on.

    Arguments:
        dataset {list} -- Training dataset.

    Returns:
        integer -- Selected attribute index.
    """
    n_features = len(dataset[0]) - 1
    base_entropy = self._compute_entropy(dataset)
    if self.name == 'ID3':
        max_info_gain = 0.0
    elif self.name == 'C45':
        max_info_gain_ratio = 0.0
    elif self.name == 'CART':
        min_gini = 99999.0
    selected_attribute = -1
    for i in range(n_features):
        attribute_list = [sample[i] for sample in dataset]
        unique_vals = set(attribute_list)
        sub_entropy = 0.0
        if self.name == 'C45':
            iv = 0.0
        elif self.name == 'CART':
            gini = 0.0
        for val in unique_vals:
            sub_dataset = get_subset(dataset, i, val)
            p = len(sub_dataset) / float(len(dataset))
            if self.name == 'CART':
                sub_p = len(get_subset(sub_dataset, -1, '0')) / float(
                    len(sub_dataset))
            else:
                sub_entropy += p * self._compute_entropy(sub_dataset)
            if self.name == 'C45':
                iv = iv - p * log(p, 2)
            elif self.name == 'CART':
                gini += p * (1.0 - pow(sub_p, 2) - pow(1 - sub_p, 2))
                print('{0:d}th information gini in CART is: {1:.3f}'.format(
                    i, gini))
        info_gain = base_entropy - sub_entropy
        if self.name == 'ID3':
            print('{0:d}th information gain in ID3 is: {1:.3f}'.format(
                i, info_gain))
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                selected_attribute = i
        elif self.name == 'C45':
            if iv == 0:
                continue
            info_gain_ratio = info_gain / iv
            print('{0:d}th information gain ratio in C4.5 is: {1:.3f}'.format(
                i, info_gain_ratio))
            if info_gain_ratio > max_info_gain_ratio:
                max_info_gain_ratio = info_gain_ratio
                selected_attribute = i
        elif self.name == 'CART':
            if gini < min_gini:
                min_gini = gini
                selected_attribute = i
    return selected_attribute