Beispiel #1
0
 def get_tweet_from_user(self, userlist, field='id'):
     """Return a list of tweets made by the users in ``userlist``.

     The tweets are read from ``self.sourcefile`` and the selection itself
     is delegated to ``dupefilter.DupInfo.get_tweet``.

     Args:
         userlist: Users whose tweets should be returned.
         field: Forwarded unchanged to ``get_tweet`` (presumably the user
             attribute that ``userlist`` entries are matched against --
             confirm in ``dupefilter``).
     """
     finder = dupefilter.DupInfo(prefix=self.prefix)
     tweets = util.loadjson(self.sourcefile)
     return finder.get_tweet(collection=tweets,
                             userlist=userlist,
                             field=field)
Beispiel #2
0
def generate_output_file(output_file_name, sourcefile):
    """Write one ``created_at,screen_name,user_id`` line per tweet.

    Args:
        output_file_name: Path of the text file to create (or overwrite).
        sourcefile: Handed to ``util.loadjson``; the result is iterated and
            each item is indexed as a tweet mapping with ``created_at`` and
            ``user`` (``screen_name``, ``id``) entries.

    The fields are joined with bare commas; no CSV quoting or escaping is
    applied.
    """
    # The context manager guarantees the handle is closed even when loading
    # fails or a tweet lacks one of the expected keys part-way through.
    with open(output_file_name, 'w') as output_file:
        for tweet in util.loadjson(sourcefile):
            fields = (tweet['created_at'],
                      tweet['user']['screen_name'],
                      tweet['user']['id'])
            output_file.write(",".join(map(str, fields)) + "\n")
Beispiel #3
0
 def get_suspicious_user_group(self,
                               startover=False,
                               filter_function=all_groups):
     """Find duplicate tweets in the source file and return suspicious groups.

     A ``dupefilter.DupInfo`` is first fed every tweet loaded from
     ``self.sourcefile`` and is then asked for its suspicious user groups.

     Args:
         startover: Forwarded to the ``DupInfo`` constructor.
         filter_function: Forwarded to
             ``DupInfo.get_suspicious_user_group``; defaults to
             ``all_groups``.

     Returns:
         Whatever ``DupInfo.get_suspicious_user_group`` returns.
     """
     finder = dupefilter.DupInfo(prefix=self.prefix, startover=startover)

     # First pass: let the finder index the whole tweet collection.
     tweets = util.loadjson(self.sourcefile)
     finder.find_duplicate_tweet(collection=tweets,
                                 collect_url_only=self.collect_url_only)

     # Second pass: query the groups it considers suspicious.
     return finder.get_suspicious_user_group(filter_function=filter_function,
                                             url_based=self.url_based)
Beispiel #4
0
    def get_percent_of_spam(self):
        spam_user = self.get_spam_user_info(variable='spam_user')
        duplicateFinder = dupefilter.DupInfo(prefix=self.prefix)
        total_user = duplicateFinder.get_metadata(variable='num_user')

        print 'Start time is ', duplicateFinder.get_metadata(
            variable='start_time')
        print 'End time is ', duplicateFinder.get_metadata(variable='end_time')
        print 'Total number of account is %d' % (total_user)
        print 'Total number of spam account is %d' % (len(spam_user))
        print 'Percent of spam account is %f' % (float(len(spam_user)) /
                                                 float(total_user))

        total_tweet = duplicateFinder.get_metadata(variable='num_tweet')
        print 'Total number of tweets is %d' % (total_tweet)
        num_spam_tweet = duplicateFinder.get_tweet(collection=util.loadjson(
            self.sourcefile),
                                                   userlist=spam_user,
                                                   only_number=True)
        print 'Total number of spam tweets is %d' % (num_spam_tweet)
        print 'Percent of spam tweets is %f' % (float(num_spam_tweet) /
                                                float(total_tweet))
#! /bin/python
from util import loadjson
from numpy import corrcoef
from counttag import tag_freq

utag_freq = loadjson("tag-freq")
utag_freq = dict(map(lambda i: (i["name"], i["freq"]), utag_freq))

uqtag_freq = tag_freq(loadjson("hxiao/merged"))
nqtag_freq = tag_freq(loadjson("nobody/merged"))


ushared_tags = set(utag_freq.keys()).intersection(set(uqtag_freq.keys()))
nshared_tags = set(utag_freq.keys()).intersection(set(nqtag_freq.keys()))

upairs = [(utag_freq[tag], uqtag_freq[tag]) for tag in ushared_tags]
npairs = [(utag_freq[tag], nqtag_freq[tag]) for tag in nshared_tags]

x,uy = zip(*upairs)
print corrcoef(x,uy)

x,ny = zip(*npairs)
print corrcoef(x,ny)

#recent tags
rtag_freq = loadjson("recent-tag-freq")
rtag_freq = dict(map(lambda i: (i["name"], i["freq"]), rtag_freq))

shared_tags = set(rtag_freq.keys()).intersection(set(uqtag_freq.keys()))

pairs = [(rtag_freq[tag], uqtag_freq[tag]) for tag in shared_tags]
Beispiel #6
0
 def save_user_info(self):
     """Pass the tweets in ``self.sourcefile`` to ``DupInfo.save_user_info``.

     Nothing is returned; any persistence is done by ``dupefilter``.
     """
     finder = dupefilter.DupInfo(prefix=self.prefix)
     tweets = util.loadjson(self.sourcefile)
     finder.save_user_info(collection=tweets)
Beispiel #7
0
 def get_url_per_user(self):
     """Return ``DupInfo.get_url_per_user`` for the tweets in the source file.

     The tweets are loaded from ``self.sourcefile`` and the computation is
     delegated entirely to ``dupefilter``.
     """
     finder = dupefilter.DupInfo(prefix=self.prefix)
     tweets = util.loadjson(self.sourcefile)
     return finder.get_url_per_user(collection=tweets)
Beispiel #8
0
 def get_top_embedd_url(self):
     # this is used to generate a white list
     url = util.get_top_embedd_url(self.prefix,
                                   util.loadjson(self.sourcefile))
     print url  # unsorted
from util import loadjson
from qtag import tagfreq, tagset

if __name__ == '__main__':
    hx_lst = loadjson("qlist.hxiao")
    nb_lst = loadjson("qlist.nobody")

    hx_freq, nb_freq = tagfreq(hx_lst), tagfreq(nb_lst)
    hx_set, nb_set = tagset(hx_lst), tagset(nb_lst)

    print sorted(list(hx_set))
    print sorted(list(nb_set))
    print hx_set.intersection(nb_set)