Exemple #1
0
def feature9():
    """Label accounts by friends gained per year of account age.

    Reads the BAS user table and, for every id, divides ``friends_count`` by
    the account age in years (``2015`` minus the year found in
    ``created_at``). A ratio above 100 is labelled 0, anything else 1. The
    information gain of the labels and the Pearson correlation of the ratios
    with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per id in the BAS table.
    """
    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    current_year = 2015

    ratios = []

    for user_id in user_ids:
        # Both values come from the first row carrying this id; build the
        # boolean mask once instead of once per column.
        is_user = dataset['id'] == user_id
        friends = dataset['friends_count'].loc[is_user].values[0]
        created = dataset['created_at'].loc[is_user].values[0]

        # The sixth whitespace-separated token of created_at is read as the
        # creation year.
        year = created.split()[5]
        difference = current_year - int(year)

        if difference == 0:
            # An account created in current_year has no full year of age.
            # Dividing would produce inf/nan (NumPy only warns), which turns
            # the correlation below into nan, so the ratio is recorded as 0.
            ratios.append(0)
        else:
            ratios.append(friends / difference)

    temp = [0 if ratio > 100 else 1 for ratio in ratios]

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Exemple #2
0
def feature3():
    """Label accounts by the similarity score of their own tweets.

    For every id in the BAS user table the matching ``text`` rows of the BAS
    tweet table are passed to ``utils.message_similarity``. A score above 100
    is labelled 0, anything else 1. Progress, the information gain and the
    correlation with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per id in the BAS table.
    """
    print("Reading datasets...")
    users = pd.read_csv(BAS)
    tweets = pd.read_csv(BAS_TWEETS)
    # The tweet CSV stores the user id in its unnamed first column.
    tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)
    print("Done")

    similarities = []
    for position, user_id in enumerate(users['id'].values):
        print(position)
        own_tweets = tweets['text'].loc[tweets['user_id'] == user_id]
        similarities.append(utils.message_similarity(own_tweets))

    labels = [0 if score > 100 else 1 for score in similarities]

    gain = info_gain.info_gain(labels, similarities)
    print("Information Gain: " + str(gain))

    classes = utils.read_dataset()
    pearson = corrcoef(similarities, classes)[0][1]
    print("Correlation coefficient: " + str(pearson))
    return labels
Exemple #3
0
def feature2():
    """Label accounts by how many rows they have in the BAS tweet table.

    Counts, for every id in the BAS user table, the rows of the tweet table
    whose ``user_id`` equals that id. A count of at least 20 is labelled 1,
    anything lower 0. The information gain of the labels and the correlation
    of the counts with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per id in the BAS table.
    """
    from collections import Counter

    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

    users_id = dataset['id'].values

    # Tally every tweet author once; calling list.count() per user would
    # rescan the whole tweet table for each id.
    tweets_per_user = Counter(dataset_tweets['user_id'].values.tolist())

    # Ids without any tweet row get a count of 0.
    tweets_count = [tweets_per_user[user_id] for user_id in users_id]

    # An id is labelled 1 when it appears at least 20 times.
    tmp = [1 if count >= 20 else 0 for count in tweets_count]

    ig = info_gain.info_gain(tmp, tweets_count)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(tweets_count, class_list)[0][1]))
    return tmp
Exemple #4
0
def feature1():
    """Label accounts by whether ``friends_count`` reaches 1000.

    Reads the BAS user table, labels a count of at least 1000 as 1 and
    anything else as 0, then prints the information gain of the labels and
    the correlation of the raw counts with ``utils.read_dataset()``.

    Returns:
        list[int]: one 0/1 label per row of the BAS table.
    """
    users = pd.read_csv(BAS)
    friends = users['friends_count'].values

    labels = [1 if n_friends >= 1000 else 0 for n_friends in friends]

    gain = info_gain.info_gain(labels, friends)
    print("INFORMATION GAIN: " + str(gain))

    classes = utils.read_dataset()
    pearson = corrcoef(friends, classes)[0][1]
    print("Correlation coefficient: " + str(pearson))
    return labels
Exemple #5
0
def feature4():
    """Label accounts by the share of their tweets that contain a URL.

    For every id in the BAS user table, the matching ``text`` rows of the BAS
    tweet table are scanned for an ``http://`` / ``https://`` URL. The ratio
    of matching tweets to all of the user's tweets is printed and collected
    (0 when the user has no tweets). A ratio of at least 0.6 is labelled 0,
    anything else 1. The information gain and the correlation with
    ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per id in the BAS table.
    """
    dataset = pd.read_csv(BAS)
    dataset_tweets = pd.read_csv(BAS_TWEETS)
    dataset_tweets.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

    users_id = dataset['id'].values

    # Raw string: "\(" and "\)" are invalid escapes in a normal literal.
    # Compiled once because the pattern is the same for every tweet.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    url_ratios = []

    for user_id in users_id:
        user_tweets = dataset_tweets['text'].loc[dataset_tweets['user_id'] ==
                                                 user_id]
        # str() keeps non-string cells (e.g. missing text) from raising.
        tweet_url_count = sum(
            1 for tweet in user_tweets if url_pattern.search(str(tweet)))

        if len(user_tweets) == 0:
            ratio = 0
        else:
            ratio = tweet_url_count / len(user_tweets)
        print(ratio)
        url_ratios.append(ratio)

    temp = [0 if ratio >= 0.6 else 1 for ratio in url_ratios]

    ig = info_gain.info_gain(temp, url_ratios)
    print("Information Gain: " + str(ig))

    class_list = utils.read_dataset()
    print("Correlation coefficient: " +
          str(corrcoef(url_ratios, class_list)[0][1]))
    return temp
Exemple #6
0
def feature5():
    """Label accounts by ``friends_count / followers_count ** 2``.

    Reads the BAS user table and computes the ratio row by row. Ratios that
    raise, or that come out as nan or infinite, are replaced by 0. A ratio
    below 0.1 is labelled 1, anything else 0. The information gain and the
    correlation with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per row of the BAS table.
    """
    users = pd.read_csv(BAS)

    friends = users['friends_count'].values
    followers = users['followers_count'].values

    ratios = []
    for n_friends, n_followers in zip(friends, followers):
        try:
            value = (n_friends / (n_followers**2))
        except (RuntimeWarning, ZeroDivisionError):
            value = 0
        ratios.append(value)

    # Undefined results (0/0 -> nan, x/0 -> inf) are scored as 0.
    for position, value in enumerate(ratios):
        if isnan(value) or isinf(value):
            ratios[position] = 0

    labels = [1 if value < 0.1 else 0 for value in ratios]

    gain = info_gain.info_gain(labels, ratios)
    print("Information Gain: " + str(gain))

    classes = utils.read_dataset()
    pearson = corrcoef(ratios, classes)[0][1]
    print("Correlation coefficient: " + str(pearson))
    return labels
Exemple #7
0
def feature1():
    """Label accounts by age in years relative to 2020.

    Reads ``created_at`` from the BAS user table, takes the sixth
    whitespace-separated token as the creation year and subtracts it from
    2020. An age below 8 is labelled 0, anything else 1. The information gain
    and the Pearson correlation of the ages with ``utils.read_dataset()`` are
    printed.

    Returns:
        list[int]: one 0/1 label per row of the BAS table.
    """
    users = pd.read_csv(BAS)
    current_year = 2020

    age = [
        current_year - int(stamp.split()[5])
        for stamp in users['created_at'].values
    ]
    labels = [0 if years < 8 else 1 for years in age]

    gain = info_gain.info_gain(labels, age)
    print("INFORMATION GAIN: " + str(gain))

    classes = utils.read_dataset()
    pearson = corrcoef(age, classes)[0][1]
    print("PEARSON CORRELATION COEFFICIENT: " + str(pearson))
    return labels
Exemple #8
0
def feature8():
    """Score the similarity of each account's "API"/"AutoBot" tweets.

    For every row of the BAS user table, the tweet table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is filtered to the user's
    ``text`` entries. Entries containing "API" or "AutoBot" are passed to
    ``utils.message_similarity``. Rows with any other ``dataset`` value are
    skipped and contribute no score. nan scores become 0; a score above 10 is
    labelled 0, anything else 1. The information gain, the Pearson
    correlation with ``utils.read_dataset()`` and the elapsed time are
    printed.

    Returns:
        None. The labels are only used for the printed statistics.
    """
    timenow = datetime.datetime.now()

    # One tweet table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    total = []

    for user_id, origin in zip(user_ids, bas_dataset):
        source_tweets = tweets_by_dataset.get(origin)
        if source_tweets is None:
            # Unknown dataset label: no score is recorded for this row.
            continue

        tweets = source_tweets['text'].loc[source_tweets['user_id'] ==
                                           user_id]
        api_tweets = []
        for tweet in tweets:
            # str() keeps non-string cells (e.g. missing text) from raising.
            text = str(tweet)
            # Each marker needs its own membership test:
            # `"API" or "AutoBot" in text` is always truthy.
            if "API" in text or "AutoBot" in text:
                api_tweets.append(tweet)

        total.append(utils.message_similarity(api_tweets))

    for position, score in enumerate(total):
        if isnan(score):
            total[position] = 0

    temp = [0 if count > 10 else 1 for count in total]

    ig = info_gain.info_gain(temp, total)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(total, class_list)[0][1]))

    timeend = datetime.datetime.now()
    print("TIME TAKEN: " + str(timeend - timenow))
Exemple #9
0
def feature7():
    """Score the share of each account's "API"/"AutoBot" tweets with a URL.

    For every row of the BAS user table, the tweet table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is filtered to the user's
    ``text`` entries containing "API" or "AutoBot". The ratio of those entries
    that contain an ``http://`` / ``https://`` URL is recorded (0 when none
    do). Rows with any other ``dataset`` value are skipped. A ratio above 0.8
    is labelled 0, anything else 1. Progress, the information gain and the
    Pearson correlation with ``utils.read_dataset()`` are printed.

    Returns:
        None. The labels are only used for the printed statistics.
    """
    # One tweet table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    # Raw string: "\(" and "\)" are invalid escapes in a normal literal.
    # Compiled once because the pattern is the same for every tweet.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    ratios = []

    for i, (user_id, origin) in enumerate(zip(user_ids, bas_dataset)):
        print(i)
        source_tweets = tweets_by_dataset.get(origin)
        if source_tweets is None:
            # Unknown dataset label: no ratio is recorded for this row.
            continue

        tweets = source_tweets['text'].loc[source_tweets['user_id'] ==
                                           user_id]
        api_tweets = []
        for tweet in tweets:
            # str() keeps non-string cells (e.g. missing text) from raising.
            text = str(tweet)
            # Each marker needs its own membership test:
            # `"API" or "AutoBot" in text` is always truthy.
            if "API" in text or "AutoBot" in text:
                api_tweets.append(tweet)

        api_tweetsurl_count = sum(
            1 for api_tweet in api_tweets
            if url_pattern.search(str(api_tweet)))

        # A zero count also covers the "no matching tweets" case, so the
        # division below never sees an empty list.
        if api_tweetsurl_count == 0:
            ratios.append(0)
        else:
            ratios.append(api_tweetsurl_count / len(api_tweets))

    temp = [0 if ratio > 0.8 else 1 for ratio in ratios]

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
Exemple #10
0
def feature2():
    """Label accounts by a link-count to ``friends_count`` ratio.

    For every row of the BAS user table, the edge table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is searched for rows whose
    ``target_id`` is the user. For each ``source_id`` found, the first
    ``target_id`` listed for that source is compared with the user, and
    matches are counted. The count divided by the user's ``friends_count`` is
    the ratio. Rows with another ``dataset`` value, or whose processing
    raises, contribute no ratio. nan ratios become 0; a ratio below 0.5 is
    labelled 0, anything else 1. Progress, the information gain and the
    Pearson correlation with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per recorded ratio.
    """
    dataset = pd.read_csv('../datasets/BAS/bas_users.csv')

    # One edge table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values
    bas_friends = dataset['friends_count'].values

    ratios = []

    for i in range(len(bas_ids)):
        print(i)
        edges = followers_by_dataset.get(bas_dataset[i])
        if edges is None:
            # Unknown dataset label: no ratio is recorded for this row.
            continue

        count = 0
        try:
            followers_of_id = edges['source_id'].loc[
                edges['target_id'] == bas_ids[i]].values
            for follower_id in followers_of_id:
                try:
                    forward = edges['target_id'].loc[
                        edges['source_id'] == follower_id].values
                    if forward[0] == bas_ids[i]:
                        count += 1
                except KeyError:
                    pass

            ratios.append(count / bas_friends[i])
        except Exception:
            # Best effort: a user whose lookup fails is skipped. Only
            # Exception is caught so Ctrl-C / SystemExit can still stop
            # this long-running loop.
            continue

    # Write the replacement back into the list; rebinding the loop variable
    # would leave the nan values in place.
    for position, ratio in enumerate(ratios):
        if isnan(ratio):
            ratios[position] = 0

    temp = [0 if ratio < 0.5 else 1 for ratio in ratios]

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Exemple #11
0
def feature6():
    """Score ``statuses_count`` against the number of API-sourced tweets.

    For every row of the BAS user table, the tweet table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is filtered to the user's
    ``source`` entries, and those containing "API" or "AutoTwitter" are
    counted. The ratio is the user's ``statuses_count`` divided by that count
    (0 when the count is 0). Rows with any other ``dataset`` value are
    skipped. nan ratios become 0; each ratio is printed; a ratio above 1.03 is
    labelled 0, anything else 1. Progress, the information gain and the
    Pearson correlation with ``utils.read_dataset()`` are printed.

    Returns:
        None. The labels are only used for the printed statistics.
    """
    # One tweet table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    tweets_by_dataset = {
        'E13': pd.read_csv(E13_tweets),
        'FSF': pd.read_csv(FSF_tweets),
        'INT': pd.read_csv(INT_tweets),
        'TFP': pd.read_csv(TFP_tweets),
        'TWT': pd.read_csv(TWT_tweets),
    }

    dataset = pd.read_csv(BAS, low_memory=False)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    ratios = []

    for i, (user_id, origin) in enumerate(zip(user_ids, bas_dataset)):
        print(i)
        source_tweets = tweets_by_dataset.get(origin)
        if source_tweets is None:
            # Unknown dataset label: no ratio is recorded for this row.
            continue

        sources_from_id = source_tweets['source'].loc[
            source_tweets['user_id'] == user_id]

        api_tweets_count = 0
        for source_id in sources_from_id:
            # str() keeps non-string cells (e.g. missing source) from raising.
            source_text = str(source_id)
            # Each marker needs its own membership test:
            # `"API" or "AutoTwitter" in source_text` is always truthy.
            if "API" in source_text or "AutoTwitter" in source_text:
                api_tweets_count += 1

        if api_tweets_count == 0:
            ratios.append(0)
        else:
            # statuses_count of the first BAS row carrying this id.
            tweets_count = dataset['statuses_count'].loc[dataset['id'] ==
                                                         user_id].values
            ratios.append(tweets_count[0] / api_tweets_count)

    for position, ratio in enumerate(ratios):
        if isnan(ratio):
            ratios[position] = 0

    temp = []
    for ratio in ratios:
        print(ratio)
        temp.append(0 if ratio > 1.03 else 1)

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
Exemple #12
0
def feature5():
    """Label accounts by ``friends_count`` over a two-hop median.

    For every row of the BAS user table, the edge table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is walked two hops: first
    the ``target_id`` values of rows whose ``source_id`` is the user, then the
    ``target_id`` values of rows whose ``source_id`` is one of those. The
    median ``followers_count`` of the second-hop ids present in the BAS table
    is taken (nan medians become 0). The ratio is the user's ``friends_count``
    divided by that median, or 0 when the median is 0. Rows with any other
    ``dataset`` value are skipped. A ratio below 1.5 is labelled 1, anything
    else 0. Progress, the information gain and the Pearson correlation with
    ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per recorded ratio.
    """
    # One edge table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }

    dataset = pd.read_csv(BAS)
    user_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    # id -> followers_count of the first BAS row with that id. A dict lookup
    # replaces a full-table scan inside the innermost loop.
    known_followers = {}
    for known_id, n_followers in zip(dataset['id'].values,
                                     dataset['followers_count'].values):
        known_followers.setdefault(known_id, n_followers)

    medians = []
    friends_count = []

    for i, (user_id, origin) in enumerate(zip(user_ids, bas_dataset)):
        print(i)
        edges = followers_by_dataset.get(origin)
        if edges is None:
            # Unknown dataset label: nothing is recorded for this row.
            continue

        source_ids = edges['target_id'].loc[edges['source_id'] == user_id]
        friends = dataset['friends_count'].loc[dataset['id'] ==
                                               user_id].values
        friends_count.append(friends[0])

        id_followers = []
        for first_hop_id in source_ids:
            second_hop_ids = edges['target_id'].loc[
                edges['source_id'] == first_hop_id].values
            for second_hop_id in second_hop_ids:
                followers_count = known_followers.get(second_hop_id)
                # Test for "id is known", not for truthiness: a legitimate
                # count of 0 must stay in the sample, and truth-testing a
                # NumPy lookup result is ambiguous for empty/multi matches.
                if followers_count is not None:
                    id_followers.append(followers_count)
        medians.append(median(id_followers))

    for position, value in enumerate(medians):
        if isnan(value):
            medians[position] = 0

    ratios = [
        0 if middle == 0 else n_friends / middle
        for n_friends, middle in zip(friends_count, medians)
    ]
    temp = [1 if ratio < 1.5 else 0 for ratio in ratios]

    ig = info_gain.info_gain(temp, ratios)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(ratios, class_list)[0][1]))
    return temp
Exemple #13
0
def feature4():
    """Label accounts by the mean ``statuses_count`` of their linked ids.

    For every row of the BAS user table, the edge table named by the row's
    ``dataset`` value (E13, FSF, INT, TFP or TWT) is filtered to rows whose
    ``source_id`` is the user. The ``statuses_count`` of every ``target_id``
    present in the BAS table is averaged for that user. Users with no such id
    score 0. Rows with any other ``dataset`` value are skipped. A mean below
    9000 is labelled 0, anything else 1. Progress, the information gain and
    the Pearson correlation with ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per recorded mean.
    """
    dataset = pd.read_csv(BAS)

    # One edge table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    followers_by_dataset = {
        'E13': pd.read_csv(E13_followers),
        'FSF': pd.read_csv(FSF_followers),
        'INT': pd.read_csv(INT_followers),
        'TFP': pd.read_csv(TFP_followers),
        'TWT': pd.read_csv(TWT_followers),
    }

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    # id -> statuses_count of the first BAS row with that id. A dict lookup
    # replaces a full-table scan per linked id.
    known_statuses = {}
    for known_id, n_statuses in zip(dataset['id'].values,
                                    dataset['statuses_count'].values):
        known_statuses.setdefault(known_id, n_statuses)

    global_tweets_count = []

    for i, (user_id, origin) in enumerate(zip(bas_ids, bas_dataset)):
        print(i)
        edges = followers_by_dataset.get(origin)
        if edges is None:
            # Unknown dataset label: no mean is recorded for this row.
            continue

        id_followers = edges['target_id'].loc[edges['source_id'] ==
                                              user_id].values

        # Start a fresh sample per user; a shared list would make every
        # mean a running average over all users processed so far.
        tweets_count = []
        for follower in id_followers:
            tweets = known_statuses.get(follower)
            # Test for "id is known", not for truthiness, so a legitimate
            # count of 0 stays in the sample.
            if tweets is not None:
                tweets_count.append(tweets)

        # An empty sample has no mean; score it 0 rather than nan so the
        # correlation below stays defined.
        if tweets_count:
            global_tweets_count.append(average(tweets_count))
        else:
            global_tweets_count.append(0)

    temp = [0 if mean_value < 9000 else 1 for mean_value in global_tweets_count]

    ig = info_gain.info_gain(temp, global_tweets_count)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(global_tweets_count, class_list)[0][1]))
    return temp
Exemple #14
0
def feature3():
    """Label accounts by the mean ``followers_count`` of their linked ids.

    For every row of the BAS user table, the friends edge table named by the
    row's ``dataset`` value (E13, FSF, INT, TFP or TWT) is filtered to rows
    whose ``source_id`` is the user. The ``followers_count`` of every
    ``target_id`` present in the BAS table is averaged for that user. Users
    with no such id score 0. Rows with any other ``dataset`` value are
    skipped. A mean below 25000 is labelled 0, anything else 1. Progress, the
    information gain and the Pearson correlation with
    ``utils.read_dataset()`` are printed.

    Returns:
        list[int]: one 0/1 label per recorded mean.
    """
    dataset = pd.read_csv(BAS)

    # One edge table per source dataset, keyed by the label used in the
    # BAS 'dataset' column.
    friends_by_dataset = {
        'E13': pd.read_csv(E13_friends),
        'FSF': pd.read_csv(FSF_friends),
        'INT': pd.read_csv(INT_friends),
        'TFP': pd.read_csv(TFP_friends),
        'TWT': pd.read_csv(TWT_friends),
    }

    bas_ids = dataset['id'].values
    bas_dataset = dataset['dataset'].values

    # id -> followers_count of the first BAS row with that id. A dict lookup
    # replaces a full-table scan per linked id.
    known_followers = {}
    for known_id, n_followers in zip(dataset['id'].values,
                                     dataset['followers_count'].values):
        known_followers.setdefault(known_id, n_followers)

    averages = []

    for i, (user_id, origin) in enumerate(zip(bas_ids, bas_dataset)):
        print(i)
        edges = friends_by_dataset.get(origin)
        if edges is None:
            # Unknown dataset label: no mean is recorded for this row.
            continue

        friends = edges['target_id'].loc[edges['source_id'] ==
                                         user_id].values

        # Start a fresh sample per user; a shared list would make every
        # mean a running average over all users processed so far.
        followers_count = []
        for friend in friends:
            friend_followers = known_followers.get(friend)
            # Test for "id is known", not for truthiness, so a legitimate
            # count of 0 stays in the sample.
            if friend_followers is not None:
                followers_count.append(friend_followers)

        # An empty sample has no mean; score it 0 rather than nan so the
        # correlation below stays defined.
        if followers_count:
            averages.append(average(followers_count))
        else:
            averages.append(0)

    temp = [0 if mean_value < 25000 else 1 for mean_value in averages]

    ig = info_gain.info_gain(temp, averages)
    print("INFORMATION GAIN: " + str(ig))

    class_list = utils.read_dataset()
    print("PEARSON CORRELATION COEFFICIENT: " +
          str(corrcoef(averages, class_list)[0][1]))
    return temp