Example #1
def check_debot_api(prefix):
    full_prefix = util.get_full_prefix(prefix)
    spam_group = load_user_all(full_prefix)
    debot_result = {}
    count = 0
    unique_user = set([])
    for name, group in spam_group.iteritems():
        print name
        print
        user = group['screen_name']
        print user
        count += 1
        if user in unique_user:
            continue
        else:
            unique_user.add(user)
        mydata = [('screen_name', '@' + user)]
        mydata = urllib.urlencode(mydata)
        req = urllib2.Request(path, mydata)
        req.add_header("Content-type", "application/x-www-form-urlencoded")
        page = urllib2.urlopen(req).read()
        if 'This account has not been detected by DeBot' in page:
            print 'nobot'
            debot_result[user] = 'nobot'
        else:
            print 'isbot'
            debot_result[user] = 'isbot'

    json.dump(
        debot_result,
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'w'))
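The example above relies on Python 2's urllib/urllib2 and on a module-level path variable pointing at the DeBot endpoint, which is not shown on this page. A minimal Python 3 sketch of the same form-encoded POST check, with a placeholder endpoint URL standing in for that path, might look like this:

from urllib import parse, request

DEBOT_ENDPOINT = 'https://example.org/debot/check'  # placeholder: the real `path` is defined elsewhere

def check_debot(screen_name):
    # POST the screen name as a form-encoded body, mirroring the request built above
    data = parse.urlencode({'screen_name': '@' + screen_name}).encode('ascii')
    req = request.Request(DEBOT_ENDPOINT, data=data,
                          headers={'Content-type': 'application/x-www-form-urlencoded'})
    page = request.urlopen(req).read().decode('utf-8', errors='replace')
    return 'nobot' if 'This account has not been detected by DeBot' in page else 'isbot'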
Example #2
def user_info_nine_urls():
    from collections import Counter
    y = []
    x = []
    for keyword, KEYWORD in [('bit', ['bit ly']), ('tinyurl', ['tinyurl']),
                             ('goo', ['goo gl']), ('dld', ["dld bz"]),
                             ('ift', ["ift tt"]), ('dlvr', ['dlvr it']),
                             ('ow', ['ow ly']), ('lnis', ['ln is']),
                             ('viid', ['viid'])]:
        # keyword = keyword + "_long"
        prefix = util.get_full_prefix(keyword)
        #streamer.collect(keyword=KEYWORD, filename=util.get_full_src_path(prefix), num_tweets=NUM_TWEETS, duration = 43200)
        detector = SpamDetector(prefix=prefix,
                                url_based=True,
                                collect_url_only=False)
        # pprint.pprint(detector.get_user_info())
        group = detector.get_spam_group()
        user_info = detector.get_user_info()

        # pprint.pprint(group)

        # let's use two for now: top language, and tweet variability

        for index, g in enumerate(group.keys()):
            user_infos = [user_info[str(u)] for u in group[g]['spam_user']]
            std = np.std([u["statuses_count"] for u in user_infos])
            mean = np.mean([u["statuses_count"] for u in user_infos])
            top_language = Counter([u["lang"]
                                    for u in user_infos]).most_common(1)[0][0]
            x.append([std / mean, top_language])
            #return group[g]['spam_user'], user_info
            y.append(keyword + str(index + 1))

        print x, y
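The two per-group features built above are the coefficient of variation of statuses_count (std/mean) and the group's most common language. A small self-contained Python 3 illustration with toy user records (the real values come from detector.get_user_info()):

from collections import Counter
import numpy as np

user_infos = [  # toy stand-ins for the user_info entries of one spam group
    {'statuses_count': 1200, 'lang': 'en'},
    {'statuses_count': 900, 'lang': 'en'},
    {'statuses_count': 1500, 'lang': 'es'},
]
counts = [u['statuses_count'] for u in user_infos]
variability = np.std(counts) / np.mean(counts)  # std/mean, the value appended to x above
top_language = Counter(u['lang'] for u in user_infos).most_common(1)[0][0]
print(variability, top_language)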
Example #3
def load_sample_user(group='mybot'):
    import urlparse
    dic = json.load(open(DEBOT_DIR + 'debot_mybot_total_news_url.json', 'r'))
    prefix = 'bit'
    full_prefix = util.get_full_prefix(prefix)
    my_bot = load_user_screenname(full_prefix)
    print '%s news result' % (group)
    # these four domains are generic sites, not news sources, so exclude them from the news count
    exclude = ['twitter.com', 'fb.me', 'www.youtube.com', 'youtu.be']
    count = 0
    total = 0
    percents = []
    for u, v in dic[group].iteritems():
        if u not in my_bot and group == 'mybot':
            continue
        total += 1
        have_news = False
        num_news = 0
        for url in v:
            if urlparse.urlparse(
                    url).netloc in crawler.whitelist and urlparse.urlparse(
                        url).netloc not in exclude:
                num_news += 1
                have_news = True
        if have_news:
            count += 1
            percents.append(num_news * 1.0 / len(v))
    print 'total num accounts %d' % (total)
    print 'num news accounts %d' % (count)
    print 'avg percent of news tweets %f' % (sum(percents) / len(percents))
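The news test above counts a URL only when its netloc is in crawler.whitelist and not one of the generic domains in exclude. A compact Python 3 sketch of that filter, with a made-up whitelist standing in for crawler.whitelist (which is not shown on this page):

from urllib.parse import urlparse

whitelist = {'www.nytimes.com', 'www.bbc.com'}  # assumption: the real list lives in crawler.whitelist
exclude = {'twitter.com', 'fb.me', 'www.youtube.com', 'youtu.be'}

urls = ['https://www.nytimes.com/story', 'https://twitter.com/a/status/1', 'https://bit.ly/xyz']
news = [u for u in urls if urlparse(u).netloc in whitelist and urlparse(u).netloc not in exclude]
print(len(news) * 1.0 / len(urls))  # fraction of news URLs, analogous to the per-user percents above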
Example #4
def parameter_sweeping_plot_miniplot():
    all_range = []
    all_y = []
    all_OX = []
    all_np = []
    titles = []
    for key in sorted(
        ['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']):
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        #result = detector.parameter_sweeping_plot(min_duplicate_factor = 3, return_all = True)
        result = detector.parameter_sweeping_plot(percent_same=0.6,
                                                  return_all=True)
        titles.append(key)
        all_range.append(result[0])
        all_y.append(result[1])
        all_OX.append(result[2])
        all_np.append(result[3])

    timeline_new.plot_xybar_miniplot(
        all_range,
        all_y,
        all_OX,
        all_np,
        titles,
        xlabel='percent same',
        ylabel='number of spam user',
        filename=
        'parameter_sweeping/parameter_sweeping_plot_all_URL_shorteners_min_dup_factor'
    )
    exit()
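timeline_new.plot_xybar_miniplot is a project helper that is not shown on this page. A rough matplotlib sketch of the same idea, one small bar chart per URL shortener on a 3x3 grid, could look like the following; the signature and layout here are assumptions:

import matplotlib.pyplot as plt

def plot_xybar_miniplot_sketch(all_range, all_y, titles, xlabel, ylabel, filename):
    # one small bar chart per keyword, laid out on a 3x3 grid
    fig, axes = plt.subplots(3, 3, figsize=(9, 9))
    for ax, xs, ys, title in zip(axes.flat, all_range, all_y, titles):
        ax.bar(xs, ys, width=0.05)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(filename + '.png')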
Example #5
def helper_print_metadata():
    for key in ['viid', 'goo', 'bit', 'dld', 'ift', 'dlvr']:
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        #detector.print_metadata()
        #pprint.pprint(detector.get_suspicious_user_group(startover = False, filter_function = url_detect))
        detector.save_user_info()
    exit()
Example #6
def load_user(prefix, group_id):
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    user_info = detector.get_user_info()
    #alluser = set([])
    id_count = 1
    for g in group:
        if id_count == group_id:
            return group[g]['spam_user'], user_info
        id_count += 1
Example #7
def store_num_tweet_per_user_json():
    #dic = json.load(open('spam_category.json', 'r'))
    dic = {}
    for key in sorted(
        ['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']):
        PREFIX = util.get_full_prefix(key)
        detector = SpamDetector(prefix=PREFIX, url_based=True)
        dic[key] = detector.get_tweet_per_user()
    json.dump(dic, open('metadata/user_num_tweet_all_URL_shorteners.json',
                        'w'))
    exit()
Example #8
def compare_score_all_user(prefix,
                           return_debot_only=False,
                           return_mybot_only=False):
    debot_result = json.load(
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'r'))
    debot_bot = set([
        user for user, status in debot_result.iteritems() if status == 'isbot'
    ])
    full_prefix = util.get_full_prefix(prefix)
    my_bot = load_user_screenname(full_prefix)
    print 'num bots for debot %d' % (len(debot_bot))
    print 'num bots for our method %d' % (len(my_bot))
    print 'intersection is %d' % (len(debot_bot.intersection(my_bot)))
    print 'percent our bot in the intersection is %f' % (
        1.0 * len(debot_bot.intersection(my_bot)) / len(my_bot))
    print
    print 'num bots in debot,not in ours %d' % (len(
        debot_bot.difference(my_bot)))
    print 'num bots in ours,not in debot %d' % (len(
        my_bot.difference(debot_bot)))

    print 'Closer analysis of bots identified by debot...'
    debot_only = debot_bot.difference(my_bot)
    if return_debot_only:
        return debot_only
    if return_mybot_only:
        return my_bot.difference(debot_bot)

    user_info = json.load(open(full_prefix + 'user_info.json', 'r'))
    user_info_dic = {}
    for u, v in user_info.iteritems():
        user_info_dic[v['screen_name']] = v

    count = 0
    for u in debot_only:
        #pprint.pprint(user_info_dic[u]['screen_name'])
        if user_info_dic[u]['verified']:
            count += 1
            #print user_info_dic[u]['screen_name']
    print 'debot num verified ', count
    #print len(my_bot.intersection(user_info_dic.keys()))
    print

    count = 0
    for u in my_bot:
        if u in user_info_dic:
            if user_info_dic[u]['verified']:
                count += 1
                #print user_info_dic[u]
    print 'mybot num verified ', count
    print
Example #9
def load_user(prefix, group_id):
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    user_info = detector.get_user_info()
    #alluser = set([])
    #print '[IN load_user], group_id is %d' %(group_id)
    #print '[IN load_user], length of group is %d' %(len(group))
    id_count = 1
    for g in group:
        #print 'current group_id is %d' %(id_count)
        if id_count == group_id:
            return group[g]['spam_user'], user_info
        id_count += 1
Example #10
def run_long_experiment():
    for keyword, KEYWORD in [('bitly', ['bit ly']), ('tinyurl', ['tinyurl']),
                             ('goo', ['goo gl']), ('dld', ["dld bz"]),
                             ('ift', ["ift tt"]), ('dlvr', ['dlvr it']),
                             ('ow', ['ow ly']), ('lnis', ['ln is']),
                             ('viid', ['viid'])]:
        keyword = keyword + "_long"
        prefix = util.get_full_prefix(keyword)
        NUM_TWEETS = 500000
        #streamer.collect(keyword=KEYWORD, filename=util.get_full_src_path(prefix), num_tweets=NUM_TWEETS, duration = 43200)
        detector = SpamDetector(prefix=prefix,
                                url_based=True,
                                collect_url_only=False)
        detector.get_percent_of_spam()
Example #11
def get_connectivity(prefix, group_id):
    userlist, user_info = load_user(prefix, group_id)
    filename = util.get_full_prefix(prefix) + "group_" + str(
        group_id) + "_user_followers_dic.json"
    myFollowerFinder = follower.FollowerFinder(prefix=prefix,
                                               userlist=userlist)
    """generate and save json file"""
    TYPE = "undirected"
    SAVE_DIR = "gephi/"
    g = Graph(myFollowerFinder.load_file(filename=filename), TYPE)
    g.build_graph()
    g.add_screenname(user_info)
    print len(g.get_graph().nodes())
    print approx.node_connectivity(g.get_graph())
    dic = degree_alg.degree_centrality(g.get_graph())
    print sum(dic.values()) / len(dic)
    return sum(dic.values()) / len(dic)
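The connectivity metrics above (approx.node_connectivity and degree_alg.degree_centrality) are standard networkx functions. A tiny standalone illustration on a toy follower graph:

import networkx as nx
from networkx.algorithms import approximation as approx

g = nx.Graph([(1, 2), (2, 3), (3, 1), (3, 4)])  # toy undirected follower graph
print(approx.node_connectivity(g))  # approximate node connectivity
centrality = nx.degree_centrality(g)  # same metric as degree_alg.degree_centrality
print(sum(centrality.values()) / len(centrality))  # average degree centrality, as returned above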
Example #12
def load_user(prefix):
    full_prefix = util.get_full_prefix(prefix)
    detector = detect.SpamDetector(prefix=full_prefix)
    group = detector.get_spam_group()
    """
	Run those two lines of code of url info file does not exist
	
	url_info = detector.get_url_per_user()
	json.dump(url_info, open('metadata/'+prefix+'_user_url_dictionary.json','w'))
	"""

    url_info = json.load(
        open('metadata/' + prefix + '_user_url_dictionary.json', 'r'))
    print len(url_info)

    #alluser = set([])
    # id_count = 1

    for index, g in enumerate(group):
        unique_url = set([])
        for user in group[g]['spam_user']:
            for url in url_info[str(user)]:
                if prefix in url:
                    unique_url.add(url)
                else:
                    if 'twitter.com' in url:
                        if url in cache:
                            print 'find url in cache'
                            unique_url.add(cache[url])
                        else:
                            try:
                                print url
                                new_url = extract_url_from_twitter_page(
                                    url, prefix)
                                if new_url:
                                    unique_url.add(new_url)
                            except Exception, e:
                                print e
                                time.sleep(2)

            #unique_url = unique_url.union(set(url_info[str(user)]))
            #print len(unique_url)

        #pprint.pprint(unique_url)
        group[g]['unique_url'] = list(unique_url)
Example #13
def get_and_store_status(filename):
    if os.path.isfile(filename):
        print 'file exists'
        return
    dic = {}
    #['bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow']
    for prefix in [
            'bit', 'tinyurl', 'lnis', 'viid', 'goo', 'dld', 'ift', 'dlvr', 'ow'
    ]:
        print prefix
        full_prefix = util.get_full_prefix(prefix)
        detector = detect.SpamDetector(prefix=full_prefix)
        user = detector.get_spam_user_info(variable='screen_name')
        #user = ['WuerzRodrigo', 'reed_schepens']
        #user = ['InceZehraince3', 'noexistingasdf123', 'zhouhanchen', 'NBA76ersFans']
        #user = list(user)[:2]
        print len(user)
        dic[prefix] = check_status(user)
    json.dump(dic, open(filename, 'w'))
Example #14
def get_social_network(prefix, group_id):
    userlist, user_info = load_user(prefix, group_id)
    filename = util.get_full_prefix(prefix) + "group_" + str(
        group_id) + "_user_followers_dic.json"
    myFollowerFinder = FollowerFinder(prefix=prefix, userlist=userlist)
    # if filename does not exist, call Twitter API to collect data
    if not os.path.isfile(filename):
        myFollowerFinder.getFollowers(userlist, filename=filename)
        #myFollowerFinder.check_common_user(filename = filename)
    """generate and save json file"""
    TYPE = "undirected"
    SAVE_DIR = "gephi/"
    # debugging: print the loaded follower file and stop here; the graph below
    # is only built once these two lines are removed
    print myFollowerFinder.load_file(filename=filename)
    exit()
    g = social_network.Graph(myFollowerFinder.load_file(filename=filename),
                             TYPE)
    g.build_graph()
    g.add_screenname(user_info)
    #g.delete_singleton()
    g.generatejsonfile(SAVE_DIR + prefix + "_bot_group_" + str(group_id))
Example #15
def sample_user():
    # sample of debot only [u'melanieviveros9', u'RatanSharda55', u'KevinMcshea', u'imchrismva', u'Phaedrus08']
    import random
    import urlparse
    #debot_only = compare_score_all_user('bit', return_mybot_only = True)
    #samples = random.sample(list(debot_only), 5)
    #print samples
    prefix = 'bit'
    full_prefix = util.get_full_prefix(prefix)
    myUserCrawler = crawler.UserCrawler(simplecrawl=True)
    debot_result = json.load(
        open(DEBOT_DIR + 'debot_' + prefix + '_all_user_score.json', 'r'))
    debot_bot = set([
        user for user, status in debot_result.iteritems() if status == 'isbot'
    ])
    my_bot = load_user_screenname_custom(full_prefix)

    data = {'debot': list(debot_bot), 'mybot': list(my_bot)}
    final_result = {'debot': {}, 'mybot': {}}
    for name, userlist in data.iteritems():
        print name
        for u in userlist:
            final_result[name][u] = []
            try:
                result = myUserCrawler.get200(u,
                                              use_screen_name=True,
                                              return_error_code=False)
                crawler.tokenindex += 1
                crawler.tokenindex = crawler.tokenindex % crawler.ROUND
                time.sleep(0.2)
                for t in result:
                    final_result[name][u] += streamer.get_embedded_url(t)
            except Exception, e:
                print e
                myUserCrawler = crawler.UserCrawler(simplecrawl=True)
                time.sleep(20)
Example #16
    """
        Be careful when calling crawler on existing duplifliers 
        because of update of filename from actual text to numerical
        number group_1, group_2, ..., ...

    """
    #helper_print_metadata()
    #parameter_sweeping_plot_miniplot()
    #get_spam_group_num_tweet()
    """
        input variables: keyword, DATA_DIR (defined at very top)
    """
    # default keyword, overridden by the first command-line argument if given
    keyword = 'git_test'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    prefix = util.get_full_prefix(keyword)
    #SOURCE_FILE = DATA_DIR + PREFIX[:PREFIX.index('/')] + '.txt'
    #SOURCE_FILE = util.slice_data(2014, 10, 10)
    """
            variables for streaming: (optional)
            KEYWORD: a list of keywords
            num_tweets: number of tweets to collect
    """
    KEYWORD = ['bit']
    NUM_TWEETS = 1000
    """
        start the streamer first (optional if the dataset already exists)
    """