Example no. 1
def calculate_information_gain_star(dataset, classification_file, rule_set, rule_number):
    if 'users' in dataset:
        df_dataset = pandas.read_csv(dataset)
        attribute = rules.attributes(rule_set,rule_number)
        if isinstance(attribute,list):
            attribute_list = []
            attrs = df_dataset[attribute].values
            for attr in attrs:
                rule_output = rules.rules(rule_set, rule_number, attr)
                if rule_output == 1:
                    number_satisfied = 1
                else:
                    number_satisfied = 0
                attribute_list.append(number_satisfied)
            
        else:
            attribute_list = df_dataset[attribute].values
        df_classification = pandas.read_csv(classification_file)
        classification_list = df_classification['class'].values
        print(classification_list)
        print(attribute_list)
        information_gain_star = info_gain.info_gain(classification_list, attribute_list)
    else:
        attribute = rules.attributes(rule_set, rule_number)
        df_dataset = pandas.read_csv(dataset)
        user_id_list = list(set(df_dataset['user_id'].values))
        df_classification = pandas.read_csv(classification_file)
        attr_values = []
        real_classes = []
        for user_id in user_id_list:
            df_user = df_dataset.loc[df_dataset['user_id'] == user_id]
            attribute_list = df_user[attribute].values
            attribute_value = attribute_list[0]
            number_satisfied = 0
            if rule_number == 22:
                number_satisfied = len(df_user[attribute].unique())
            elif rule_number == 3 and rule_set == 'social_bakers':
                number_satisfied = df_user[attribute].value_counts().max()
            else:
                for attr in attribute_list:
                    rule_output = rules.rules(rule_set, rule_number, attr)
                    if rule_output == 1:
                        number_satisfied += 1
            attr_values.append(number_satisfied)
            df_class = df_classification.loc[df_classification['id'] == user_id]
            real_class = df_class['class'].values[0]
            real_classes.append(real_class)
        information_gain_star = info_gain.info_gain(real_classes, attr_values)
        
    return information_gain_star
Example no. 2
def feature2():
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

    users_id = dataset['id'].values
    users_id_tweets = dataset_tweets['user_id'].values
    users_id_tweets_list = users_id_tweets.tolist()

    tmp = []
    tweets_count = []

    # Check whether each ID appears at least 20 times in users_id_tweets
    for id in users_id:
        count = users_id_tweets_list.count(id)
        if count >= 20:
            tmp.append(1)
        else:
            tmp.append(0)
        tweets_count.append(count)

    ig = info_gain.info_gain(tmp, tweets_count)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(tweets_count, class_list)[0][1]))
    return tmp
Example no. 3
def feature3():
    print("Reading datasets...")
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)
    print("Done")

    users_id = dataset['id'].values

    temp = []
    similarities = []

    for i in range(len(users_id)):
        print(i)
        all_user_tweets = dataset_tweets['text'].loc[dataset_tweets['user_id']
                                                     == users_id[i]]
        similarities.append(utils.message_similarity(all_user_tweets))

    for similarity in similarities:
        if similarity > 100:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, similarities)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(similarities, class_list)[0][1]))
    return temp
Example no. 4
def info_gain_calculate(tf_tweets_stems, classes):
    # One information-gain value per stem; the stems are the keys of the first
    # document's term-frequency dict.
    keys = list(tf_tweets_stems[0].keys())
    info_gains = {}
    for key in keys:
        values = [doc_tf[key] for doc_tf in tf_tweets_stems]
        ig = info_gain.info_gain(classes, values)
        info_gains[key] = ig
        # if ig > 0.02:
        #     print(key + ': ' + str(ig))

    return info_gains
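A quick, hypothetical usage sketch for the helper above (toy term-frequency dicts and class labels; the real inputs come from the tweet preprocessing that is not shown here):

tf_tweets_stems = [
    {'free': 2, 'win': 1},   # document 1
    {'free': 0, 'win': 0},   # document 2
    {'free': 1, 'win': 3},   # document 3
]
classes = ['spam', 'ham', 'spam']
print(info_gain_calculate(tf_tweets_stems, classes))
# -> {'free': <info gain>, 'win': <info gain>}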
Example no. 5
def extract_metafeature(a):
    #statistical(3)
    #print(mean(a.kurtosis()))
    #print(mean(a.skew()))
    #print (mean(a.mean()))
    from sklearn.feature_selection import mutual_info_classif
    from info_gain import info_gain
    y=df[2000]
    X = df.drop(2000, axis=1)
    ft2 = pd.DataFrame({
        # simple
        'nr_instances': [len(a)],
        'nr_features': [len(a.columns)],
        'nr_missing_values': [a.isnull().sum().sum()],
        # statistical
        # "max_value": [a.values.max()],
        # "min_value": [a.values.min()],
        'mean_kurtosis': [mean(a.kurtosis())],
        'mean_skewness': [mean(a.skew())],
        'mean': [mean(a.mean())],
        # information_theoretic
        # 'MI': [mean(mutual_info_classif(X, y))],
        # model_based
        'Info_gain': [info_gain.info_gain(X, y)],
        # 'Intistic_value': [info_gain.intrinsic_value(X, y)],
        'Inf_gain_ratio': [info_gain.info_gain_ratio(X, y)]
    })
    
    return(ft2)
Example no. 6
def feature9():
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    current_year = 2015

    ratios = []
    temp = []

    for i in range(len(user_ids)):
        friends = dataset['friends_count'].loc[dataset['id'] ==
                                               user_ids[i]].values[0]
        created = dataset['created_at'].loc[dataset['id'] ==
                                            user_ids[i]].values[0]

        year = created.split()[5]
        difference = current_year - int(year)

        ratios.append(friends / difference)

    for ratio in ratios:
        if ratio > 100:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Example no. 7
def calc_gains(data):
    gains = []
    amount_of_columns = data.shape[1]
    for column_index in range(0, amount_of_columns):
        print(column_index)
        column = np.array(get_column(data, column_index))
        gain = info_gain.info_gain(labels, column)
        gains.append([column_index, gain])
    return gains
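A hypothetical way to exercise calc_gains: the original module defines labels and get_column elsewhere, so they are stubbed here only to show the expected shapes.

import numpy as np

labels = np.array([0, 1, 0, 1])      # class labels, assumed to be a module-level global

def get_column(data, index):         # stand-in for the original helper
    return data[:, index]

data = np.array([[1, 5],
                 [2, 6],
                 [1, 7],
                 [2, 8]])
print(calc_gains(data))              # [[0, <gain>], [1, <gain>]]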
Example no. 8
    def calculate_info_gain(self):
        self.gain = []
        for index, feature in enumerate(self.features):
            self.gain.append(
                [info_gain.info_gain(feature, self.labels), index])
        self.gain.sort(key=self.take_first, reverse=True)
        print("=======================================================")
        print("{first} {second}".format(first=self.gain[0],
                                        second=self.gain[1]))
        print("=======================================================")
Example no. 9
def get_info_gain(train, classes):
    global info_gain_res

    # info_gain_res = dict(zip(col_name,mutual_info_classif(traindf, train_class, discrete_features=True)))
    for i in range(PIXELS):
        if i == 524:
            continue
        # train[str(i)] accesses the dataframe column, e.g. train['0']
        info_gain_res.append(info_gain.info_gain(classes, train[str(i)]))
    # info_gain_res = mutual_info_classif(traindf, train_class, discrete_features=True)
    with open('./info_gain.txt', 'w') as f:
        print(info_gain_res, file=f)
    return info_gain_res
Example no. 10
def extract_metafeature(a):
    y = a[a.columns[-1]]
    X = a[a.columns[:-1]]
    return {
        #simple
        'nr_instances': len(a),
        'nr_features': len(a.columns),
        'nr_missing_values': a.isnull().sum().sum(),
        'mean_kurtosis': mean(a.kurtosis()),
        'mean_skewness': mean(a.skew()),
        'mean': mean(a.mean()),
        'Info_gain': info_gain.info_gain(X, y),
        'Inf_gain_ratio': info_gain.info_gain_ratio(X, y)
    }
Example no. 11
def feature1():
    dataset = pd.read_csv(BAS)
    temp_list = []
    friends_list = dataset['friends_count'].values

    for friends_count in friends_list:
        if friends_count >= 1000:
            temp_list.append(1)
        else:
            temp_list.append(0)

    ig = info_gain.info_gain(temp_list, friends_list)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(friends_list, class_list)[0][1]))
    return temp_list
Example no. 12
def feature4():
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

    users_id = dataset['id'].values

    url_ratios = []
    temp = []

    for id in users_id:
        user_tweets = dataset_tweets['text'].loc[dataset_tweets['user_id'] ==
                                                 id]
        tweet_url_count = 0
        for tweet in user_tweets:
            if re.findall(
                    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                    str(tweet)):
                tweet_url_count += 1
        try:
            ratio = tweet_url_count / len(user_tweets)
        except ZeroDivisionError:
            ratio = 0
        print(ratio)
        url_ratios.append(ratio)

    for ratio in url_ratios:
        if ratio >= 0.6:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, url_ratios)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(url_ratios, class_list)[0][1]))
    return temp
Example no. 13
def feature5():
    dataset = pd.read_csv(BAS)

    friends_list = dataset['friends_count'].values
    followers_list = dataset['followers_count'].values

    ratios = []
    temp = []

    for i in range(0, len(friends_list)):
        try:
            ratio = (friends_list[i] / (followers_list[i]**2))
        except RuntimeWarning:
            ratio = 0
        except ZeroDivisionError:
            ratio = 0
        ratios.append(ratio)

    for i in range(len(ratios)):
        if isnan(ratios[i]):
            ratios[i] = 0

    for i in range(len(ratios)):
        if isinf(ratios[i]):
            ratios[i] = 0

    for ratio in ratios:
        if ratio < 0.1:
            temp.append(1)
        else:
            temp.append(0)

    ig = info_gain.info_gain(temp, ratios)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Example no. 14
def feature1():
    dataset = pd.read_csv(BAS)
    creation_date = dataset['created_at'].values
    current_year = 2020

    temp = []
    age = []

    for date in creation_date:
        year = date.split()[5]
        difference = current_year - int(year)
        if difference < 8:
            temp.append(0)
        else:
            temp.append(1)
        age.append(difference)

    ig = info_gain.info_gain(temp, age)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(age, class_list)[0][1]))
    return temp
def hierarchy_based_filter(df,
                           label_column,
                           G=None,
                           threshold=0.99,
                           metric="info_gain",
                           pruning=True,
                           all_remove=True,
                           progress=True,
                           **kwargs):
    """Feature selection approach, namely, SHSEL including the initial
    selection algorithm and pruning algorithm. Identify and filter out the
    ranges of nodes with similar relevance in each branch of the hierarchy.

        Ristoski, P. and Paulheim, H., 2014, October. Feature selection in 
        hierarchical feature spaces. In International conference on discovery 
        science (pp. 288-300). Springer, Cham.
    
    Args:
        df (pd.DataFrame): Dataframe containing the original features and the
            class column.
        label_column (str): Name of the output/class column.
        G (nx.DirectedGraph, optional): The directed graph of all classes and
            superclasses can be specified here; if None the function looks for 
            the graph in the pd.DataFrame.attrs.hierarchy attribute of the 
            input dataframe. Defaults to None.
        threshold (float, optional): Relevance similarity threshold, set by the
            user; a value of 0.99 is recommended. Defaults to 0.99.
        metric (str/func, optional): The relevance similarity metric, either
            information gain or correlation ("info_gain"/"correlation"). A
            custom metric function can also be passed. Defaults to "info_gain".
        pruning (bool, optional): Whether to apply the pruning algorithm. If
            True, keep only the most valuable features, i.e. those whose
            information gain is greater than the average information gain of
            the previously reduced set. Defaults to True.
        all_remove (bool, optional): Only valid when pruning is True. Whether to
            strictly remove all nodes whose info gain is smaller than the
            average info gain of their paths. Defaults to True.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.

    Returns:
        pd.DataFrame: Filtered Dataframe containing the selected attributes.
    """

    # Take graph attached to df or selected by user.
    if G is None:
        G = df.attrs["hierarchy"].copy()

    elif G:
        G = G.copy()

    else:
        raise RuntimeError("""No hierarchy graph found. It should either be
                              attached to the dataframe in df.attrs['hierarchy']
                              or passed in the G argument.""")

    df = df.copy()
    # delete and save prefix strings, e.g. "uri_bool_" to comply with graph
    prefix_cols = [col for col in df.columns if re.findall("http:", col)]

    prefix_cols_stripped = [
        re.sub(r"^.*?http://", "http://", col) for col in prefix_cols
    ]

    renaming_dict = dict(zip(prefix_cols_stripped, prefix_cols))

    # preparing part
    df.columns = [re.sub(r"^.*?http://", "http://", col) for col in df.columns]

    # save class col and columns without features for later
    label_column = re.sub(r"^.*?http://", "http://", label_column)
    non_class_cols = list(set(df.columns) - set(G.nodes) - set([label_column]))

    class_col = df.loc[:, label_column]
    non_class_df = df.loc[:, non_class_cols]

    #main part
    df_from_hierarchy = add_hierarchy_columns(df, G, keep_prefix=False)

    G = G.reverse()

    if not nx.is_directed_acyclic_graph(G):

        raise TypeError(
            "The Hierarchy Based Filter is designed for directed acyclic graphs (DAGs)."
        )

    node_availability = {}

    ig_values = []

    if progress:
        iterator = tqdm(list(G.nodes()),
                        desc="Hierarchy Based Filter: Initial Selection")
    else:
        iterator = list(G.nodes())

    for node in iterator:
        node_availability[node] = True
        ig = info_gain.info_gain(df_from_hierarchy[label_column],
                                 df_from_hierarchy[node])
        ig_values.append(ig)

    node_values = dict(zip(G.nodes, ig_values))

    # the main structure of the Initial Selection

    L = [x for x in G.nodes() if G.out_degree(x) == 0 and G.in_degree(x) > 0]

    for l in L:

        D = G.predecessors(l)  # direct ancestors of the current leaf l

        D = list(D)  # necessary! transform keydict_iterator type to list

        # selection by similarity
        for d in D:

            if callable(metric):

                similarity = metric(df_from_hierarchy, l, d, **kwargs)

            elif metric == "info_gain":

                similarity = 1 - abs(node_values[d] - node_values[l])

            elif metric == "correlation":

                similarity = np.corrcoef(df_from_hierarchy[l],
                                         df_from_hierarchy[d])[0, 1]

            if similarity >= threshold or np.isnan(similarity) == True:

                node_availability[l] = False

                break

        # extend L by D

        newleaf = [d for d in D if d not in L]

        L.extend(newleaf)

    SF = [node for node in list(G.nodes()) if node_availability[node] == True]

    df_filtered = df_from_hierarchy.copy()

    for col in df_from_hierarchy.columns:

        if col not in SF or col not in df.columns:

            df_filtered.drop(col, axis=1, inplace=True)

    if pruning:

        df_filtered = prune(df_filtered,
                            G,
                            node_values,
                            node_availability,
                            L,
                            remove_flag=all_remove,
                            progress=progress)

    df_filtered = pd.concat([non_class_df, class_col, df_filtered], axis=1)

    df_filtered.rename(columns=renaming_dict, inplace=True)

    return df_filtered
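The selection-by-similarity step above compares the information gain of a leaf node with that of its direct ancestor. A minimal, self-contained sketch of that criterion on toy lists (hypothetical data, independent of the hierarchy dataframe the function actually works on):

from info_gain import info_gain

labels   = [1, 1, 0, 0, 1, 0]   # class column
leaf     = [1, 1, 0, 0, 0, 0]   # hypothetical leaf feature
ancestor = [1, 1, 0, 0, 1, 0]   # hypothetical direct ancestor feature

ig_leaf     = info_gain.info_gain(labels, leaf)
ig_ancestor = info_gain.info_gain(labels, ancestor)

# SHSEL initial selection: if leaf and ancestor are (almost) equally relevant,
# the leaf is marked unavailable and filtered out.
threshold = 0.99
similarity = 1 - abs(ig_ancestor - ig_leaf)
keep_leaf = similarity < threshold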
Example no. 16
def feature8():
    timenow = datetime.datetime.now()

    e13_tweets = pd.read_csv(E13_tweets)
    fsf_tweets = pd.read_csv(FSF_tweets)
    int_tweets = pd.read_csv(INT_tweets)
    tfp_tweets = pd.read_csv(TFP_tweets)
    twt_tweets = pd.read_csv(TWT_tweets)

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    total = []
    temp = []

    for i in range(len(user_ids)):
        api_tweets = []
        if bas_dataset[i] == 'E13':
            tweets = e13_tweets['text'].loc[e13_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            similarity_count = utils.message_similarity(api_tweets)
            total.append(similarity_count)
        elif bas_dataset[i] == 'FSF':
            tweets = fsf_tweets['text'].loc[fsf_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            similarity_count = utils.message_similarity(api_tweets)
            total.append(similarity_count)
        elif bas_dataset[i] == 'INT':
            tweets = int_tweets['text'].loc[int_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            similarity_count = utils.message_similarity(api_tweets)
            total.append(similarity_count)
        elif bas_dataset[i] == 'TFP':
            tweets = tfp_tweets['text'].loc[tfp_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            similarity_count = utils.message_similarity(api_tweets)
            total.append(similarity_count)
        elif bas_dataset[i] == 'TWT':
            tweets = twt_tweets['text'].loc[twt_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            similarity_count = utils.message_similarity(api_tweets)
            total.append(similarity_count)

    for i in range(len(total)):
        if isnan(total[i]):
            total[i] = 0

    for count in total:
        if count > 10:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, total)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(total, class_list)[0][1]))

    timeend = datetime.datetime.now()
    print("TIME TAKEN: " + str(timeend - timenow))
    pass
Example no. 17
def feature7():
    e13_tweets = pd.read_csv(E13_tweets)
    fsf_tweets = pd.read_csv(FSF_tweets)
    int_tweets = pd.read_csv(INT_tweets)
    tfp_tweets = pd.read_csv(TFP_tweets)
    twt_tweets = pd.read_csv(TWT_tweets)

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    ratios = []
    temp = []

    for i in range(len(user_ids)):
        api_tweetsurl_count = 0
        api_tweets = []
        print(i)
        if bas_dataset[i] == 'E13':
            tweets = e13_tweets['text'].loc[e13_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            for api_tweet in api_tweets:
                if re.findall(
                        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        str(api_tweet)):
                    api_tweetsurl_count += 1
            if api_tweetsurl_count == 0:
                ratios.append(0)
            else:
                ratios.append(api_tweetsurl_count / len(api_tweets))
        elif bas_dataset[i] == 'FSF':
            tweets = fsf_tweets['text'].loc[fsf_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            for api_tweet in api_tweets:
                if re.findall(
                        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        str(api_tweet)):
                    api_tweetsurl_count += 1
            if api_tweetsurl_count == 0:
                ratios.append(0)
            else:
                ratios.append(api_tweetsurl_count / len(api_tweets))
        elif bas_dataset[i] == 'INT':
            tweets = int_tweets['text'].loc[int_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            for api_tweet in api_tweets:
                if re.findall(
                        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        str(api_tweet)):
                    api_tweetsurl_count += 1
            if api_tweetsurl_count == 0:
                ratios.append(0)
            else:
                ratios.append(api_tweetsurl_count / len(api_tweets))
        elif bas_dataset[i] == 'TFP':
            tweets = tfp_tweets['text'].loc[tfp_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            for api_tweet in api_tweets:
                if re.findall(
                        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        str(api_tweet)):
                    api_tweetsurl_count += 1
            if api_tweetsurl_count == 0:
                ratios.append(0)
            else:
                ratios.append(api_tweetsurl_count / len(api_tweets))
        elif bas_dataset[i] == 'TWT':
            tweets = twt_tweets['text'].loc[twt_tweets['user_id'] ==
                                            user_ids[i]]
            for tweet in tweets:
                if "API" or "AutoBot" in tweet:
                    api_tweets.append(tweet)
                else:
                    pass
            for api_tweet in api_tweets:
                if re.findall(
                        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        str(api_tweet)):
                    api_tweetsurl_count += 1
            if api_tweetsurl_count == 0:
                ratios.append(0)
            else:
                ratios.append(api_tweetsurl_count / len(api_tweets))

    for ratio in ratios:
        if ratio > 0.8:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    pass
Example no. 18
def feature2():
    dataset = pd.read_csv('../datasets/BAS/bas_users.csv')
    e13_followers = pd.read_csv(E13_followers)
    fsf_followers = pd.read_csv(FSF_followers)
    int_followers = pd.read_csv(INT_followers)
    tfp_followers = pd.read_csv(TFP_followers)
    twt_followers = pd.read_csv(TWT_followers)

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    bas_friends = dataset['friends_count'].values

    ratios = []
    temp = []

    for i in range(0, len(bas_ids)):
        count = 0
        print(i)
        try:
            if bas_dataset[i] == 'E13':
                followers_of_id = e13_followers['source_id'].loc[
                    e13_followers['target_id'] == bas_ids[i]].values
                for id in followers_of_id:
                    try:
                        forward = e13_followers['target_id'].loc[
                            e13_followers['source_id'] == id].values
                        if forward[0] == bas_ids[i]:
                            count += 1
                        else:
                            pass
                    except KeyError:
                        pass

                ratio = count / bas_friends[i]
                ratios.append(ratio)
            elif bas_dataset[i] == 'TFP':
                followers_of_id = tfp_followers['source_id'].loc[
                    tfp_followers['target_id'] == bas_ids[i]].values
                for id in followers_of_id:
                    try:
                        forward = tfp_followers['target_id'].loc[
                            tfp_followers['source_id'] == id].values
                        if forward[0] == bas_ids[i]:
                            count += 1
                        else:
                            pass
                    except KeyError:
                        pass

                ratio = count / bas_friends[i]
                ratios.append(ratio)
            elif bas_dataset[i] == 'FSF':
                followers_of_id = fsf_followers['source_id'].loc[
                    fsf_followers['target_id'] == bas_ids[i]].values
                for id in followers_of_id:
                    try:
                        forward = fsf_followers['target_id'].loc[
                            fsf_followers['source_id'] == id].values
                        if forward[0] == bas_ids[i]:
                            count += 1
                        else:
                            pass
                    except KeyError:
                        pass

                ratio = count / bas_friends[i]
                ratios.append(ratio)
            elif bas_dataset[i] == 'INT':
                followers_of_id = int_followers['source_id'].loc[
                    int_followers['target_id'] == bas_ids[i]].values
                for id in followers_of_id:
                    try:
                        forward = int_followers['target_id'].loc[
                            int_followers['source_id'] == id].values
                        if forward[0] == bas_ids[i]:
                            count += 1
                        else:
                            pass
                    except KeyError:
                        pass

                ratio = count / bas_friends[i]
                ratios.append(ratio)
            elif bas_dataset[i] == 'TWT':
                followers_of_id = twt_followers['source_id'].loc[
                    twt_followers['target_id'] == bas_ids[i]].values
                for id in followers_of_id:
                    try:
                        forward = twt_followers['target_id'].loc[
                            twt_followers['source_id'] == id].values
                        if forward[0] == bas_ids[i]:
                            count += 1
                        else:
                            pass
                    except KeyError:
                        pass

                ratio = count / bas_friends[i]
                ratios.append(ratio)
        except:
            pass

    for i in range(len(ratios)):
        if isnan(ratios[i]):
            ratios[i] = 0

    for ratio in ratios:
        if ratio < 0.5:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Example no. 19
def feature6():
    e13_tweets = pd.read_csv(E13_tweets)
    fsf_tweets = pd.read_csv(FSF_tweets)
    int_tweets = pd.read_csv(INT_tweets)
    tfp_tweets = pd.read_csv(TFP_tweets)
    twt_tweets = pd.read_csv(TWT_tweets)

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    ratios = []
    temp = []

    for i in range(len(user_ids)):
        api_tweets_count = 0
        print(i)
        if bas_dataset[i] == 'E13':
            sources_from_id = e13_tweets['source'].loc[e13_tweets['user_id'] ==
                                                       user_ids[i]]
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_ids[i]].values

            for source_id in sources_from_id:
                if "API" or "AutoTwitter" in source_id:
                    api_tweets_count += 1
                else:
                    pass

            if api_tweets_count == 0:
                ratios.append(0)
            else:
                ratios.append(tweets_count[0] / api_tweets_count)
        elif bas_dataset[i] == 'FSF':
            sources_from_id = fsf_tweets['source'].loc[fsf_tweets['user_id'] ==
                                                       user_ids[i]]
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_ids[i]].values

            for source_id in sources_from_id:
                if "API" or "AutoTwitter" in source_id:
                    api_tweets_count += 1
                else:
                    pass

            if api_tweets_count == 0:
                ratios.append(0)
            else:
                ratios.append(tweets_count[0] / api_tweets_count)
        elif bas_dataset[i] == 'INT':
            sources_from_id = int_tweets['source'].loc[int_tweets['user_id'] ==
                                                       user_ids[i]]
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_ids[i]].values

            for source_id in sources_from_id:
                if "API" or "AutoTwitter" in source_id:
                    api_tweets_count += 1
                else:
                    pass

            if api_tweets_count == 0:
                ratios.append(0)
            else:
                ratios.append(tweets_count[0] / api_tweets_count)
        elif bas_dataset[i] == 'TFP':
            sources_from_id = tfp_tweets['source'].loc[tfp_tweets['user_id'] ==
                                                       user_ids[i]]
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_ids[i]].values

            for source_id in sources_from_id:
                if "API" or "AutoTwitter" in source_id:
                    api_tweets_count += 1
                else:
                    pass

            if api_tweets_count == 0:
                ratios.append(0)
            else:
                ratios.append(tweets_count[0] / api_tweets_count)
        elif bas_dataset[i] == 'TWT':
            sources_from_id = twt_tweets['source'].loc[twt_tweets['user_id'] ==
                                                       user_ids[i]]
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_ids[i]].values

            for source_id in sources_from_id:
                if "API" or "AutoTwitter" in source_id:
                    api_tweets_count += 1
                else:
                    pass

            if api_tweets_count == 0:
                ratios.append(0)
            else:
                ratios.append(tweets_count[0] / api_tweets_count)

    for i in range(len(ratios)):
        if isnan(ratios[i]):
            ratios[i] = 0

    for ratio in ratios:
        print(ratio)
        if ratio > 1.03:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    pass
Example no. 20
def feature5():
    e13_followers = pd.read_csv(E13_followers)
    fsf_followers = pd.read_csv(FSF_followers)
    int_followers = pd.read_csv(INT_followers)
    tfp_followers = pd.read_csv(TFP_followers)
    twt_followers = pd.read_csv(TWT_followers)

    dataset = pd.read_csv(BAS)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    medians = []
    friends_count = []

    for i in range(len(user_ids)):
        print(i)
        id_followers = []
        if bas_dataset[i] == 'E13':
            source_ids = e13_followers['target_id'].loc[
                e13_followers['source_id'] == user_ids[i]]
            friends = dataset['friends_count'].loc[dataset['id'] ==
                                                   user_ids[i]].values
            friends_count.append(friends[0])

            for id in source_ids:
                source_source_ids = e13_followers['target_id'].loc[
                    e13_followers['source_id'] == id].values
                for source_id in source_source_ids:
                    followers_count = dataset['followers_count'].loc[
                        dataset['id'] == source_id].values
                    if followers_count:
                        id_followers.append(followers_count)
                    else:
                        pass
            medians.append(median(id_followers))
        elif bas_dataset[i] == 'FSF':
            source_ids = fsf_followers['target_id'].loc[
                fsf_followers['source_id'] == user_ids[i]]
            friends = dataset['friends_count'].loc[dataset['id'] ==
                                                   user_ids[i]].values
            friends_count.append(friends[0])

            for id in source_ids:
                source_source_ids = fsf_followers['target_id'].loc[
                    fsf_followers['source_id'] == id].values
                for source_id in source_source_ids:
                    followers_count = dataset['followers_count'].loc[
                        dataset['id'] == source_id].values
                    if followers_count:
                        id_followers.append(followers_count)
                    else:
                        pass
            medians.append(median(id_followers))
        elif bas_dataset[i] == 'INT':
            source_ids = int_followers['target_id'].loc[
                int_followers['source_id'] == user_ids[i]]
            friends = dataset['friends_count'].loc[dataset['id'] ==
                                                   user_ids[i]].values
            friends_count.append(friends[0])

            for id in source_ids:
                source_source_ids = int_followers['target_id'].loc[
                    int_followers['source_id'] == id].values
                for source_id in source_source_ids:
                    followers_count = dataset['followers_count'].loc[
                        dataset['id'] == source_id].values
                    if followers_count:
                        id_followers.append(followers_count)
                    else:
                        pass
            medians.append(median(id_followers))
        elif bas_dataset[i] == 'TFP':
            source_ids = tfp_followers['target_id'].loc[
                tfp_followers['source_id'] == user_ids[i]]
            friends = dataset['friends_count'].loc[dataset['id'] ==
                                                   user_ids[i]].values
            friends_count.append(friends[0])

            for id in source_ids:
                source_source_ids = tfp_followers['target_id'].loc[
                    tfp_followers['source_id'] == id].values
                for source_id in source_source_ids:
                    followers_count = dataset['followers_count'].loc[
                        dataset['id'] == source_id].values
                    if followers_count:
                        id_followers.append(followers_count)
                    else:
                        pass
            medians.append(median(id_followers))
        elif bas_dataset[i] == 'TWT':
            source_ids = twt_followers['target_id'].loc[
                twt_followers['source_id'] == user_ids[i]]
            friends = dataset['friends_count'].loc[dataset['id'] ==
                                                   user_ids[i]].values
            friends_count.append(friends[0])

            for id in source_ids:
                source_source_ids = twt_followers['target_id'].loc[
                    twt_followers['source_id'] == id].values
                for source_id in source_source_ids:
                    followers_count = dataset['followers_count'].loc[
                        dataset['id'] == source_id].values
                    if followers_count:
                        id_followers.append(followers_count)
                    else:
                        pass
            medians.append(median(id_followers))

    for i in range(len(medians)):
        if isnan(medians[i]):
            medians[i] = 0

    temp = []
    ratios = []

    for i in range(len(medians)):
        if medians[i] == 0:
            ratio = 0
        else:
            ratio = friends_count[i] / medians[i]
        ratios.append(ratio)

    for ratio in ratios:
        if ratio < 1.5:
            temp.append(1)
        else:
            temp.append(0)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Example no. 21
def feature4():
    dataset = pd.read_csv(BAS)
    e13_followers = pd.read_csv(E13_followers)
    fsf_followers = pd.read_csv(FSF_followers)
    int_followers = pd.read_csv(INT_followers)
    tfp_followers = pd.read_csv(TFP_followers)
    twt_followers = pd.read_csv(TWT_followers)

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    tweets_count = []
    global_tweets_count = []
    temp = []

    for i in range(len(bas_ids)):
        print(i)
        tweets_count = []  # reset per user so the average covers only this user's followers
        if bas_dataset[i] == 'E13':
            id_followers = e13_followers['target_id'].loc[
                e13_followers['source_id'] == bas_ids[i]].values

            for follower in id_followers:
                tweets = dataset['statuses_count'].loc[dataset['id'] ==
                                                       follower].values
                if tweets:
                    tweets_count.append(tweets)
                else:
                    pass
            global_tweets_count.append(average(tweets_count))
        elif bas_dataset[i] == 'FSF':
            id_followers = fsf_followers['target_id'].loc[
                fsf_followers['source_id'] == bas_ids[i]].values

            for follower in id_followers:
                tweets = dataset['statuses_count'].loc[dataset['id'] ==
                                                       follower].values
                if tweets:
                    tweets_count.append(tweets)
                else:
                    pass
            global_tweets_count.append(average(tweets_count))
        elif bas_dataset[i] == 'INT':
            id_followers = int_followers['target_id'].loc[
                int_followers['source_id'] == bas_ids[i]].values

            for follower in id_followers:
                tweets = dataset['statuses_count'].loc[dataset['id'] ==
                                                       follower].values
                if tweets:
                    tweets_count.append(tweets)
                else:
                    pass
            global_tweets_count.append(average(tweets_count))
        elif bas_dataset[i] == 'TFP':
            id_followers = tfp_followers['target_id'].loc[
                tfp_followers['source_id'] == bas_ids[i]].values

            for follower in id_followers:
                tweets = dataset['statuses_count'].loc[dataset['id'] ==
                                                       follower].values
                if tweets:
                    tweets_count.append(tweets)
                else:
                    pass
            global_tweets_count.append(average(tweets_count))
        elif bas_dataset[i] == 'TWT':
            id_followers = twt_followers['target_id'].loc[
                twt_followers['source_id'] == bas_ids[i]].values

            for follower in id_followers:
                tweets = dataset['statuses_count'].loc[dataset['id'] ==
                                                       follower].values
                if tweets:
                    tweets_count.append(tweets)
                else:
                    pass
            global_tweets_count.append(average(tweets_count))
    for mean_value in global_tweets_count:
        if mean_value < 9000:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, global_tweets_count)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(global_tweets_count, class_list)[0][1]))
    return temp
Example no. 22
def feature3():
    dataset = pd.read_csv(BAS)
    e13_friends = pd.read_csv(E13_friends)
    fsf_friends = pd.read_csv(FSF_friends)
    int_friends = pd.read_csv(INT_friends)
    tfp_friends = pd.read_csv(TFP_friends)
    twt_friends = pd.read_csv(TWT_friends)

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    followers_count = []
    averages = []
    temp = []

    for i in range(len(bas_ids)):
        print(i)
        followers_count = []  # reset per user so the average covers only this user's friends
        if bas_dataset[i] == 'E13':
            friends = e13_friends['target_id'].loc[e13_friends['source_id'] ==
                                                   bas_ids[i]].values

            for friend in friends:
                friend_followers = dataset['followers_count'].loc[
                    dataset['id'] == friend].values
                if friend_followers:
                    followers_count.append(friend_followers)
                else:
                    pass
            averages.append(average(followers_count))
        elif bas_dataset[i] == 'FSF':
            friends = fsf_friends['target_id'].loc[fsf_friends['source_id'] ==
                                                   bas_ids[i]].values

            for friend in friends:
                friend_followers = dataset['followers_count'].loc[
                    dataset['id'] == friend].values
                if friend_followers:
                    followers_count.append(friend_followers)
                else:
                    pass
            averages.append(average(followers_count))
        elif bas_dataset[i] == 'INT':
            friends = int_friends['target_id'].loc[int_friends['source_id'] ==
                                                   bas_ids[i]].values

            for friend in friends:
                friend_followers = dataset['followers_count'].loc[
                    dataset['id'] == friend].values
                if friend_followers:
                    followers_count.append(friend_followers)
                else:
                    pass
            averages.append(average(followers_count))
        elif bas_dataset[i] == 'TFP':
            friends = tfp_friends['target_id'].loc[tfp_friends['source_id'] ==
                                                   bas_ids[i]].values

            for friend in friends:
                friend_followers = dataset['followers_count'].loc[
                    dataset['id'] == friend].values
                if friend_followers:
                    followers_count.append(friend_followers)
                else:
                    pass
            averages.append(average(followers_count))
        elif bas_dataset[i] == 'TWT':
            friends = twt_friends['target_id'].loc[twt_friends['source_id'] ==
                                                   bas_ids[i]].values

            for friend in friends:
                friend_followers = dataset['followers_count'].loc[
                    dataset['id'] == friend].values
                if friend_followers:
                    followers_count.append(friend_followers)
                else:
                    pass
            averages.append(average(followers_count))

    for mean_value in averages:
        if mean_value < 25000:
            temp.append(0)
        else:
            temp.append(1)

    ig = info_gain.info_gain(temp, averages)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(averages, class_list)[0][1]))
    return temp
def gain(classifier, attribute):
    return ig.info_gain(classifier, attribute)
Example no. 24
print("entropy Type", entropyType)

probDoors = [
    float(Doors.count(c)) / len(Doors) for c in dict.fromkeys(list(Doors))
]

entropyDoors = -sum([p * math.log(p) / math.log(2.0) for p in probDoors])

print("entropy Doors", entropyDoors)

probTyres = [
    float(Tyres.count(c)) / len(Tyres) for c in dict.fromkeys(list(Tyres))
]

entropyTyres = -sum([p * math.log(p) / math.log(2.0) for p in probTyres])

print("entropy Tyres", entropyTyres)

igcolor = info_gain.info_gain(Color, Class)
print("Color Info Gain", igcolor)

igtype = info_gain.info_gain(Type, Class)
print("Type Info Gain", igtype)

igdoors = info_gain.info_gain(Doors, Class)
print("Doors Info Gain", igdoors)

igtyres = info_gain.info_gain(Tyres, Class)
print("Tyres Info Gain", igtyres)
Example no. 25
def calculate_information_gain(classification_file, rule_set,rule_number):
    df_classification = pandas.read_csv(classification_file)
    output_list = df_classification['output'].values
    classification_list = df_classification['class'].values
    information_gain = info_gain.info_gain(classification_list, output_list)
    return information_gain
Example no. 26
def get_info_gain_ranking(X, y):
    feat_gain = []
    for j in range(X.shape[1]):
        feat_gain.append(info_gain.info_gain(y, X[:, j]))
    return feat_gain
                                rounded=True,
                                filled=True)
# Gini decides which attribute/feature should be placed at the root node,
# which features will act as internal nodes or leaf nodes
# Create Graph from DOT data
graph = pydotplus.graph_from_dot_data(dot_data)

# Create Decision Tree PDF
graph.write_pdf("DT1_Breast_Cancer.pdf")

######################################
# Run an information gain evaluation
######################################
print('\nInformation Gain on Recurrence')

ig = info_gain.info_gain(df['recur_event'], df['Tumor_Size'])
print('\tTumor Size=', ig)

ig = info_gain.info_gain(df['recur_event'], df['Menopause'])
print('\tMenopause=', ig)

ig = info_gain.info_gain(df['recur_event'], df['Age_Range'])
print('\tAge Range=', ig)

ig = info_gain.info_gain(df['recur_event'], df['Degree_Malignant'])
print('\tDegree Malignant=', ig)

ig = info_gain.info_gain(df['recur_event'], df['inv_nodes'])
print('\tNumber Involved Nodes=', ig)

ig = info_gain.info_gain(df['recur_event'], df['breast_quad'])
#GaussianNB/16columns
from sklearn.naive_bayes import GaussianNB
g_nb = GaussianNB(priors = None)
g_nb_fit = g_nb.fit(x_train,y_train)

g_nb_pred = g_nb.predict(x_test)
print(confusion_matrix(y_test,g_nb_pred))
print('\n')
print(classification_report(y_test,g_nb_pred))

#info_gain,Gain_Ratio/14columns
!pip install info_gain
from info_gain import info_gain
noShow_plus = noShow.drop('Status',axis=1)
for item in noShow_plus:
  ig = info_gain.info_gain(noShow[item], noShow['Status'])
  igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])

  print("%s的info_gain:" %(item),ig)
  print("%s的Gain_Ratio:" %(item),igr)

#info_gain,Gain_Ratio/16columns
!pip install info_gain
from info_gain import info_gain
noShow_plus = noShow.drop('Status',axis=1)
for item in noShow_plus:
  ig = info_gain.info_gain(noShow[item], noShow['Status'])
  igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])

  print("%s的info_gain:" %(item),ig)
  print("%s的Gain_Ratio:" %(item),igr)
def tree_based_filter(df, label_column, G=None, metric="Lift", progress=True):
    """Filter attributes with Tree-Based Feature Selection (TSEL). TSEL selects
    the most valuable attributes from each path in the hierarchy, based on lift
    or information gain.

        Jeong, Y. and Myaeng, S.H., 2013, October. Feature selection using a
        semantic hierarchy for event recognition and type classification. In
        Proceedings of the Sixth International Joint Conference on Natural
        Language Processing (pp. 136-144).

    Args:
        df (pd.DataFrame): Dataframe with hierarchy (output of generator)
        label_column (str): Name of the column with the class/label
        G (nx.DirectedGraph, optional): The directed graph of all classes and 
            superclasses can be specified here; if None the function looks for 
            the graph in the pd.DataFrame.attrs.hierarchy attribute of the input
            dataframe. Defaults to None.
        metric (str/func, optional): Metric which is used to determine the 
            representative features (IG/Lift). Defaults to 'Lift'.
        progress (bool, optional): If True, progress updates will be shown to   
            inform the user about the progress made by the process. Defaults to 
            True. 

    Returns:
        pd.DataFrame: Filtered Dataframe containing the selected attributes.
    """
    df = df.copy()

    if G:
        G = G.copy()
    else:
        G = df.attrs["hierarchy"].copy()

    if progress:
        print("Tree Based Filter - (1/4) Initialization.")

    # delete and save prefix strings, e.g. "uri_bool_", to comply with graph
    prefix_cols = [col for col in df.columns if re.findall("http:", col)]
    prefix_cols_stripped = [
        re.sub(r"^.*?http://", "http://", col) for col in prefix_cols
    ]
    renaming_dict = dict(zip(prefix_cols_stripped, prefix_cols))
    df.columns = [re.sub(r"^.*?http://", "http://", col) for col in df.columns]

    # save class col and columns without features for later
    label_column = re.sub(r"^.*?http://", "http://", label_column)
    non_class_cols = list(set(df.columns) - set(G.nodes) - set([label_column]))

    df_from_hierarchy = add_hierarchy_columns(df, G, keep_prefix=False)

    # tsel is a top-down algorithm ==> graph has to be reversed
    G = G.reverse()

    # add virtual root node
    roots_and_isolated_nodes = [
        x for x in G.nodes() if G.out_degree(x) >= 0 and G.in_degree(x) == 0
    ]
    for node in roots_and_isolated_nodes:
        G.add_edge("VRN", node)

    if progress:
        print("Tree Based Filter - (2/4) Calculate Metric Values.")

    if callable(metric):
        node_metrics = metric(df_from_hierarchy, G, label_column)

    elif metric == "IG":
        metrics = []
        for node in G.nodes:
            if node != "VRN":

                ig = info_gain.info_gain(df_from_hierarchy[label_column],
                                         df_from_hierarchy[node])

                metrics.append(ig)
        node_metrics = dict(zip(G.nodes, metrics))

    else:
        node_metrics = calculate_lift(df_from_hierarchy, G, label_column)

    representative_features = []

    # traverse all paths

    if progress:
        print("Tree Based Filter - (3/4) Get initial representative features.")

    for p in get_all_paths(G, "VRN"):
        # select representative feature
        feature = representative_feature(p, node_metrics)

        if feature not in representative_features:
            representative_features.append(feature)

    if progress:
        print("Tree Based Filter - (4/4) Update representative features.")

    # loop over representative features
    checkUpdated = True
    while checkUpdated == True:
        checkUpdated = False

        for feature in representative_features:

            # loop over all descendants
            for desc in nx.descendants(G, feature):

                # check if descendant is representative feature
                if desc in representative_features:
                    representative_features.remove(feature)

                    # loop over all direct child nodes of x
                    for child in nx.neighbors(G, feature):

                        # loop over all paths from child to leaf nodes
                        for p in get_all_paths(G, child):

                            # select representative feature
                            feature = representative_feature(p, node_metrics)
                            if feature not in representative_features:
                                representative_features.append(feature)

                    checkUpdated = True
                    break
            # loop again if representative nodes were updated
            if checkUpdated == True:
                break

    if label_column in representative_features:
        representative_features.remove(label_column)

    df_filtered = df_from_hierarchy.loc[:, non_class_cols + [label_column] +
                                        representative_features]

    df_filtered.columns = (non_class_cols + [label_column] +
                           representative_features)
    df_filtered.rename(columns=renaming_dict, inplace=True)

    return df_filtered
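To make the per-path selection described in the docstring concrete, here is a small, self-contained toy sketch (hypothetical graph and metric values; the real function computes node_metrics from the dataframe via lift or information gain and then refines the result with the update loop above):

import networkx as nx

G_toy = nx.DiGraph()
G_toy.add_edges_from([("VRN", "A"), ("A", "B"), ("A", "C"), ("VRN", "D")])

node_metrics = {"A": 0.10, "B": 0.30, "C": 0.05, "D": 0.20}

representative = set()
leaves = [n for n in G_toy.nodes if G_toy.out_degree(n) == 0]
for leaf in leaves:
    for path in nx.all_simple_paths(G_toy, "VRN", leaf):
        candidates = [n for n in path if n != "VRN"]
        # keep the most valuable node on this root-to-leaf path
        representative.add(max(candidates, key=node_metrics.get))

print(sorted(representative))  # ['A', 'B', 'D'] before the update step prunes ancestors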
Example no. 30
        imputed = imputer.predict(data)
        newdata1.loc[np.where(pd.isna(data[i]) == True)[0],
                     i] = imputed.iloc[np.where(pd.isna(data[i]) == True)[0],
                                       len(imputed.columns) - 2]

orig = []
tech = []
deep = []
missing = []
var = []
for i in data.columns:
    if type_var[i] not in ['int64', 'float64'] and pd.isna(data[i]).sum() > 0:
        var.append(i)
        orig.append(info_gain.info_gain(list(data[target]), list(data[i])))
        tech.append(
            info_gain.info_gain(list(newdata[target]), list(newdata[i])))
        missing.append(sum(pd.isna(data[i]) == True) / len(data))

ixx = np.where(pd.isna(data[i]) == True)[0]
newdata.loc[ixx, i]
newdata1.loc[ixx, i]
result = pd.DataFrame({
    'size_missing': missing,
    'Var': var,
    'Orig': orig,
    'Tech': tech
})

newdata2 = data.copy()