Example #1
    def create_tree(self, dataset, attributes):
        """Create decision tree using trainset.
        
        Arguments:
            dataset {list} -- Training dataset.
            attributes {list} -- Attributes.
        
        Returns:
            dictionary -- Trained decision tree.
        """

        class_list = [sample[-1] for sample in dataset]
        if class_list.count(class_list[0]) == len(class_list):
            return class_list[0]
        if len(dataset[0]) == 1:
            return majority_count(class_list)

        selected_attribute = self._select_attribute(dataset)
        selected_attribute_label = attributes[selected_attribute]
        print('Current selected attribute: ', selected_attribute_label)
        tree = {selected_attribute_label: {}}
        del attributes[selected_attribute]

        attribute_list = [sample[selected_attribute] for sample in dataset]
        unique_vals = set(attribute_list)
        for val in unique_vals:
            sub_attributes = attributes[:]
            tree[selected_attribute_label][val] = self.create_tree(
                get_subset(dataset, selected_attribute, val), sub_attributes)

        return tree
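
create_tree relies on two helpers that are not shown on this page, get_subset and majority_count. A minimal sketch of what they typically look like in this style of ID3 implementation (an assumption, not the original project's code):

from collections import Counter


def get_subset(dataset, axis, value):
    # Keep the samples whose attribute `axis` equals `value`, dropping that column.
    return [sample[:axis] + sample[axis + 1:]
            for sample in dataset if sample[axis] == value]


def majority_count(class_list):
    # Return the most frequent class label.
    return Counter(class_list).most_common(1)[0][0]
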
Example #2
def fpGrowth(tree, alpha):
    if tree.has_unique_branch():
        nodes = tree.get_all_node_exclude_root()
        # print([str(r) for r in nodes])
        for node_list in get_subset(nodes):
            beta = FrequentPattern(node_list)
            if beta.support_count >= tree.min_sup:
                beta = beta | alpha
                # print([str(r) for r in node_list])
                print(beta)
    else:
        print(tree.get_head_table())
        for item in tree.get_head_table():
            #print("item %s " % (item))
            #print(alpha)

            # generate pattern beta
            beta = FrequentPattern([item])
            beta = beta | alpha
            print(beta)
            conditional_pattern_base = []

            def get_prefix_path(node):
                path = []
                cur_node = node.get_parent()
                while cur_node and not cur_node.is_root():
                    path.insert(0, cur_node)
                    cur_node = cur_node.get_parent()
                return path

            for node in item.nodes:
                prefix_path = get_prefix_path(node)
                if prefix_path:
                    conditional_pattern_base.append(
                        FrequentPattern(prefix_path, support_count=node.cnt))
            #conditional_pattern_base()
            # print([str(item) for item in conditional_pattern_base])
            # print("############")
            if conditional_pattern_base:
                tree_beta = ConditionalFPTree(min_sup=tree.min_sup)
                tree_beta.set_conditional_pattern_base(
                    conditional_pattern_base)
                tree_beta.build()
                fpGrowth(tree_beta, beta)
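
In Example #2, get_subset(nodes) is used as a power-set generator over the nodes of a single-branch tree. A minimal sketch of such a helper, assuming it should yield every non-empty combination of the input list:

from itertools import combinations


def get_subset(items):
    # Yield every non-empty combination of `items`, shortest first.
    for r in range(1, len(items) + 1):
        for combo in combinations(items, r):
            yield list(combo)
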
Example #3
def load_dataset_svrt(set_num,
                      batch_size,
                      split,
                      trainimages=0,
                      dat_augment=False,
                      return_dataset=False,
                      prep_method='imagenet'):
    dat_dir = '../stimuli/problem_' + str(set_num) + '/'

    if dat_augment:
        print('- do data augmentation')
        prepfun = utils.prep_imagenet_augment
    else:
        prepfun = utils.prep_imagenet

    # need this for adversarial examples
    if prep_method == 'orig':
        prepfun = None

    #load dataset
    dataset = MixedImageFolder(root=dat_dir + split + '/', transform=prepfun)

    if split == 'train':
        shuffle = True
        dataset = utils.get_subset(
            dataset, trainimages)  # change the size of the trainingset
    else:
        shuffle = False

    # define dataloader
    dataloader = torch.utils.data.DataLoader(dataset,
                                             shuffle=shuffle,
                                             batch_size=batch_size)
    if return_dataset:
        return dataloader, dataset
    else:
        return dataloader
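
Here utils.get_subset shrinks the training split to trainimages samples (MixedImageFolder and the utils module are project-local and not shown). A rough sketch of such a helper built on torch.utils.data.Subset; the function body and the seed argument are assumptions, not the project's actual code:

import torch
from torch.utils.data import Subset


def get_subset(dataset, n_images, seed=0):
    # Return a random subset of `n_images` samples; 0 (the default above) keeps the full dataset.
    if n_images <= 0 or n_images >= len(dataset):
        return dataset
    generator = torch.Generator().manual_seed(seed)
    indices = torch.randperm(len(dataset), generator=generator)[:n_images].tolist()
    return Subset(dataset, indices)
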
Example #4
import numpy as np

import utils  # project-local helper module
from tqdm import tqdm
from sklearn.cluster import AgglomerativeClustering
import statsmodels.api as sm
import matplotlib.pyplot as plt

##Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())
utils.set_mpl_params()

#load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')
##select a subset of columns of 'y' to use as a test matrix:
#this is the same each time thanks to setting the random.seed.
col_indices = np.random.choice(243, 10, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)


#This will be used for clustering:
distance_matrix = utils.fast_dice(x_)


#choose a random target:
idx = np.random.choice(y_.shape[1])
all_positive_indices = (y_[:,idx]==1).nonzero()[0]
pos_test_counts = {index: 0 for index in all_positive_indices}

all_negative_indices = (y_[:,idx]==0).nonzero()[0]
neg_test_counts = {index: 0 for index in all_negative_indices}

positive_fractions = []
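
utils.get_subset(x, y, indices=col_indices) is again project-local. A plausible minimal sketch, assuming it restricts the label matrix to the chosen columns and keeps only the rows with at least one positive among them (this reading is an assumption):

import numpy as np


def get_subset(x, y, indices):
    # Keep the selected label columns and the rows with at least one positive among them.
    y_sub = y[:, indices]
    row_mask = np.asarray(y_sub.sum(axis=1)).ravel() > 0
    return x[row_mask], y_sub[row_mask]
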
Example #5
import numpy as np

import utils  # project-local helper module
from scipy import stats, sparse
from scipy.spatial.distance import pdist, squareform

from paris_cluster import ParisClusterer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

##Set a random seed to make it reproducible!
np.random.seed(utils.getSeed())

#load up data:
x, y = utils.load_feature_and_label_matrices(type='morgan')
##select a subset of columns of 'y' to use as a test matrix:
#this is the same each time thanks to setting the random.seed.
col_indices = np.random.choice(243, 100, replace=False)
x_, y_ = utils.get_subset(x, y, indices=col_indices)

#Open a memory mapped distance matrix.
#We do this because the pairwise distance matrix for 100 targets does not fit in memory.
#It is nearly 100% dense and has 117747*117747 = 13864356009 elements, which is also
#why it uses float16 (reducing the required storage space to ~26GB, cf. 52GB for float32).
distance_matrix = np.memmap('./processed_data/graph_fp_comparison/distMat.dat', dtype=np.float16,
              shape=(x_.shape[0], x_.shape[0]))



#build the hierarchical clustering tree:
clusterer = ParisClusterer(x_.toarray())
clusterer.buildAdjacency()
clusterer.fit()
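
The memmap above is only opened here; a sketch of how such a file could be produced block by block (the chunked cdist approach and the function name write_dice_memmap are assumptions, the project's actual preprocessing step is not shown):

import numpy as np
from scipy.spatial.distance import cdist


def write_dice_memmap(x_dense, path, chunk=1000):
    # Fill a float16 on-disk matrix with pairwise Dice distances, one block of rows at a time.
    n = x_dense.shape[0]
    out = np.memmap(path, dtype=np.float16, mode='w+', shape=(n, n))
    for start in range(0, n, chunk):
        stop = min(start + chunk, n)
        out[start:stop] = cdist(x_dense[start:stop], x_dense, metric='dice').astype(np.float16)
    out.flush()
    return out
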
Example #6
import gzip
from collections import Counter
from multiprocessing import Pool


def run():
    functions, phenotypes = load_data()
    terms = list()
    n = len(functions)
    global counter
    counter = Counter()
    global tree
    tree = dict()
    e = 100
    term_index = dict()
    term_list = list()
    for go_id in go:
        term_index[go_id] = len(term_index)
        term_list.append(go_id)
    for hp_id in hp:
        term_index[hp_id] = len(term_index)
        term_list.append(hp_id)
    for i in xrange(n):
        funcs = set(map(lambda x: term_index[x], functions[i]))
        phenos = set(map(lambda x: term_index[x], phenotypes[i]))
        terms.append(funcs | phenos)
        for func in funcs:
            for pheno in phenos:
                counter[frozenset([func, pheno])] += 1
    for s, c in counter.items():
        if c < e:
            del counter[s]
    for s, c in counter.items():
        for term in s:
            if term_list[term] in go:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(go, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(go, term_list[term])))
            else:
                tree[term] = set(
                    map(lambda x: term_index[x],
                        get_anchestors(hp, term_list[term])))
                tree[term] |= set(
                    map(lambda x: term_index[x],
                        get_subset(hp, term_list[term])))

    print(len(counter))
    pool = Pool(48)
    gf = gzip.open('data/results.gz', 'w')
    while len(counter) > 0:
        cnts = pool.map(next_level, terms)
        cnt = sum(cnts, Counter())  # plain sum(cnts) would attempt 0 + Counter and raise TypeError
        print(counter.most_common(10))
        print(cnt.most_common(10))
        for s, c in cnt.items():
            if c < e:
                del cnt[s]
            else:
                gf.write(str(c))
                for term in s:
                    gf.write('\t' + term_list[term])
                gf.write('\n')
        counter = cnt
Example #7
def test_get_subset():
    foo = {'a': 1, 'b': 2, 'c': 3}
    subset = get_subset(foo, ['a'])
    assert subset == {'a': 1}
    subset = get_subset(foo, ['a', 'b'])
    assert subset == {'a': 1, 'b': 2}
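
An implementation satisfying this test can be a single dict comprehension; a minimal sketch (the module under test is not shown on this page):

def get_subset(mapping, keys):
    # Return a new dict containing only the requested keys.
    return {key: mapping[key] for key in keys}
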
Example #8
    def test_1(self):
        res = get_subset([1, 2, 3])
        print(res)
        self.assertEqual(res,
                         [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]],
                         msg='Your input is not 10')
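
The expected ordering [[1], [2], [1, 2], [3], ...] is what the classic iterative power-set construction produces once the empty set is dropped; a minimal sketch that reproduces it (an assumption about the module under test):

def get_subset(nums):
    # Grow the power set one element at a time, then drop the leading empty subset.
    subsets = [[]]
    for num in nums:
        subsets += [existing + [num] for existing in subsets]
    return subsets[1:]
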
Example #9
def get_connecting_nodes(diff_start_end_elr_dat, route_name=None, update=False, verbose=False):
    """
    Get data of connecting points for different ELRs.

    :param diff_start_end_elr_dat: data frame where StartELR != EndELR
    :type diff_start_end_elr_dat: pandas.DataFrame
    :param route_name: name of a Route; if ``None`` (default), all Routes
    :type route_name: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``False``
    :type verbose: bool or int
    :return: data of connecting points for different ELRs
    :rtype: pandas.DataFrame

    **Test**::

        from mssqlserver.metex import view_metex_schedule8_incident_locations
        from models.prototype.furlong import get_connecting_nodes

        update = False
        verbose = True

        route_name = None
        diff_start_end_elr_dat = view_metex_schedule8_incident_locations(
            route_name=route_name, start_and_end_elr='diff', verbose=verbose)
        connecting_nodes = get_connecting_nodes(diff_start_end_elr_dat, route_name, update, verbose)
        print(connecting_nodes)

        route_name = 'Anglia'
        diff_start_end_elr_dat = view_metex_schedule8_incident_locations(
            route_name=route_name, start_and_end_elr='diff', verbose=verbose)
        connecting_nodes = get_connecting_nodes(diff_start_end_elr_dat, route_name, update, verbose)
        print(connecting_nodes)
    """

    filename = "connections-between-different-ELRs"
    pickle_filename = make_filename(filename, route_name)
    path_to_pickle = cdd_geodata(pickle_filename)

    if os.path.isfile(path_to_pickle) and not update:
        return load_pickle(path_to_pickle, verbose=verbose)

    else:
        try:
            pickle_filename_temp = make_filename(filename)
            path_to_pickle_temp = cdd_geodata(pickle_filename_temp)

            if os.path.isfile(path_to_pickle_temp) and not update:
                connecting_nodes_all = load_pickle(path_to_pickle_temp)
                connecting_nodes = get_subset(connecting_nodes_all, route_name)

            else:
                diff_elr_mileages = diff_start_end_elr_dat.drop_duplicates()

                em = ELRMileages()
                print("Searching for connecting ELRs ... ", end="") if verbose else ""
                mileage_file_dir = cdd_railway_codes("line data\\elrs-and-mileages\\mileages")

                # noinspection PyTypeChecker
                conn_mileages = diff_elr_mileages.apply(
                    lambda x: em.get_conn_mileages(x.StartELR, x.EndELR, update,
                                                   pickle_mileage_file=True,
                                                   data_dir=mileage_file_dir), axis=1)

                print("\nFinished.") if verbose else ""

                conn_mileages_data = pd.DataFrame(conn_mileages.to_list(), index=diff_elr_mileages.index,
                                                  columns=['StartELR_EndMileage', 'ConnELR',
                                                           'ConnELR_StartMileage',
                                                           'ConnELR_EndMileage', 'EndELR_StartMileage'])

                connecting_nodes = diff_elr_mileages.join(conn_mileages_data)
                connecting_nodes.set_index(['StartELR', 'StartMileage', 'EndELR', 'EndMileage'],
                                           inplace=True)

            save_pickle(connecting_nodes, path_to_pickle, verbose=verbose)

            return connecting_nodes

        except Exception as e:
            print("Failed to get \"{}\". {}.".format(os.path.splitext(pickle_filename)[0], e))
Example #10
    def _select_attribute(self, dataset):
        """Select attribute to split subset.
        
        Arguments:
            dataset {list} -- Training dataset.
        
        Returns:
            integer -- Selected attribute index.
        """

        n_features = len(dataset[0]) - 1
        base_entropy = self._compute_entropy(dataset)
        if self.name == 'ID3':
            max_info_gain = 0.0
        elif self.name == 'C45':
            max_info_gain_ratio = 0.0
        elif self.name == 'CART':
            min_gini = 99999.0

        selected_attribute = -1
        for i in range(n_features):
            attribute_list = [sample[i] for sample in dataset]
            unique_vals = set(attribute_list)
            sub_entropy = 0.0
            if self.name == 'C45':
                iv = 0.0
            elif self.name == 'CART':
                gini = 0.0
            for val in unique_vals:
                sub_dataset = get_subset(dataset, i, val)
                p = len(sub_dataset) / float(len(dataset))
                if self.name == 'CART':
                    sub_p = len(get_subset(sub_dataset, -1, '0')) / float(
                        len(sub_dataset))
                else:
                    sub_entropy += p * self._compute_entropy(sub_dataset)
                if self.name == 'C45':
                    iv = iv - p * log(p, 2)
                elif self.name == 'CART':
                    gini += p * (1.0 - pow(sub_p, 2) - pow(1 - sub_p, 2))
                    print('{0:d}th information gini in CART is:{1:.3f}'.format(
                        i, gini))

            info_gain = base_entropy - sub_entropy
            if self.name == 'ID3':
                print('{0:d}th information gain in ID3 is: {1:.3f}'.format(
                    i, info_gain))
                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    selected_attribute = i
            elif self.name == 'C45':
                if iv == 0:
                    continue
                info_gain_ratio = info_gain / iv
                print(
                    '{0:d}th information gain ratio in C4.5 is: {1:.3f}'.format(
                        i, info_gain_ratio))
                if info_gain_ratio > max_info_gain_ratio:
                    max_info_gain_ratio = info_gain_ratio
                    selected_attribute = i
            elif self.name == 'CART':
                if gini < min_gini:
                    min_gini = gini
                    selected_attribute = i

        return selected_attribute
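
_select_attribute depends on self._compute_entropy, which is not part of this snippet, and it also needs `from math import log` in scope. A minimal sketch of such a helper, assuming the usual Shannon entropy over the class labels stored in the last column:

from math import log


def _compute_entropy(self, dataset):
    # Shannon entropy of the class labels (last column of each sample).
    counts = {}
    for sample in dataset:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for count in counts.values():
        p = count / float(len(dataset))
        entropy -= p * log(p, 2)
    return entropy
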