Example 1
    def search_wordpress(self, raw_keyword):
        keyword = urllib2.quote(raw_keyword)
        opsecHeader.write_last_checked_time('wordpress')

        ############### WORDPRESS ##################
        #
        # See http://en.search.wordpress.com/?q=obama&s=date&f=json
        #
        # Arguments:
        # q = keyword to search for
        # s = sort by; we want date, not relevance
        # f = format; we want JSON

        wordpress_query_string = 'http://en.search.wordpress.com/?q=' + keyword + '&s=date&f=json'

        opsecHeader.query_website_json("wordpress", wordpress_query_string)

        wordpress_latest_epoch = self.get_latest_wordpress()
        wordpress_results = opsecHeader.read_results_json('wordpress')
        epoch_time = wordpress_results[0]['epoch_time']

        if str(wordpress_latest_epoch) == str(epoch_time):
            print "No new blog posts since last query."
        else:
            for i in wordpress_results:
                epoch_time = i['epoch_time']
                if int(wordpress_latest_epoch) < int(epoch_time):
                    title = (i['title']).encode('utf-8')
                    author = (i['author']).encode('utf-8')
                    content = (i['content']).encode('utf-8')
                    link = (i['link']).encode('utf-8')
                    self.write_latest_wordpress(epoch_time, title, author, content, link, keyword)
                    opsecHeader.send_email(keyword, "Wordpress")
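
The helpers opsecHeader.query_website_json and opsecHeader.read_results_json belong to the project's own opsecHeader module and are not shown in these excerpts. As a rough sketch of what the request boils down to, the same query string can be fetched directly with urllib2 and parsed with json; the field names (epoch_time, title, author, content, link) are the ones the loop above reads, and the keyword is a placeholder.

import json
import urllib2

# Hypothetical keyword; in the class above it arrives as raw_keyword.
keyword = urllib2.quote('obama')
url = 'http://en.search.wordpress.com/?q=' + keyword + '&s=date&f=json'

# The endpoint answers with a JSON array of posts, newest first when s=date.
posts = json.load(urllib2.urlopen(url))
for post in posts:
    print post['epoch_time'], post['title'], post['author'], post['link']
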
Example 2
def search_twitter(raw_keyword):
    keyword = urllib2.quote(raw_keyword)
    opsecHeader.write_last_checked_time('twitter')

    # See https://dev.twitter.com/docs/api/1/get/search
    tweet_since_date = str(get_latest_tweet(None, keyword)[0])
    search_query_string = 'http://search.twitter.com/search.json?q=' + keyword + '&rpp=10&result_type=recent'

    if tweet_since_date != '0':  # Twitter does not play nice with invalid since_id's
        search_query_string += '&since_id=' + tweet_since_date

    opsecHeader.query_website_json("twitter", search_query_string)

    twitter_results = opsecHeader.read_results_json('twitter')
    twitter_all_results = twitter_results['results']

    if not twitter_all_results:
        print "No results."
    else:
        existing_epoch_time = get_latest_tweet(None, keyword)[1]

        for i in twitter_all_results:
            created_at = (i['created_at']).encode('utf-8')
            epoch_time_found = calendar.timegm((time.strptime(created_at, '%a, %d %b %Y %H:%M:%S +0000')))
            if int(epoch_time_found) > int(existing_epoch_time):
                twitter_id = (i['id'])
                from_user = (i['from_user']).encode('utf-8')
                text = (i['text']).encode('utf-8')
                created_at = (i['created_at']).encode('utf-8')
                profile_image_url_https = (i['profile_image_url_https']).encode('utf-8')
                location, lat, lng = gen_geo(from_user)

                write_tweet(twitter_id, from_user, text, created_at, keyword, location, lat, lng, epoch_time_found, profile_image_url_https)
                opsecHeader.send_email(keyword, "Twitter")
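
Both Twitter examples turn created_at into epoch seconds with calendar.timegm(time.strptime(...)), but the two API versions format the timestamp differently: the old v1 search API (this example) uses a comma after the weekday and puts the year before the time, while v1.1 (the next example) moves the year to the end. A small sketch with hypothetical timestamps:

import calendar
import time

# v1 search API style, as parsed in this example (hypothetical timestamp):
v1_created = 'Mon, 01 Apr 2013 12:00:00 +0000'
print calendar.timegm(time.strptime(v1_created, '%a, %d %b %Y %H:%M:%S +0000'))

# v1.1 style, as parsed in the next example (same instant, different layout):
v11_created = 'Mon Apr 01 12:00:00 +0000 2013'
print calendar.timegm(time.strptime(v11_created, '%a %b %d %H:%M:%S +0000 %Y'))
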
Example 3
    def search_twitter(self, raw_keyword):
        keyword = urllib2.quote(raw_keyword)
        opsecHeader.write_last_checked_time('twitter')

        # See https://dev.twitter.com/docs/api/1.1/get/search/tweets
        tweet_since_date = str(self.get_latest_tweet(None, keyword)[0])
        search_query_string = 'https://api.twitter.com/1.1/search/tweets.json?q=' + keyword + '&count=10&result_type=recent'

        if tweet_since_date != '0':  # Twitter does not play nice with invalid since_id's
            search_query_string += '&since_id=' + tweet_since_date

        opsecHeader.query_website_oauth_json("twitter", search_query_string, self.consumer_key, self.consumer_secret, self.access_token, self.access_token_secret)

        twitter_results = opsecHeader.read_results_json('twitter')
        twitter_all_results = twitter_results['statuses']

        if not twitter_all_results:
            print "No results."
        else:
            existing_epoch_time = self.get_latest_tweet(None, keyword)[1]

            for i in twitter_all_results:
                created_at = (i['created_at']).encode('utf-8')
                epoch_time_found = calendar.timegm((time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')))
                if int(epoch_time_found) > int(existing_epoch_time):
                    twitter_id = (i['id'])
                    from_user = (i['user']['screen_name']).encode('utf-8')
                    text = (i['text']).encode('utf-8')
                    created_at = (i['created_at']).encode('utf-8')
                    profile_image_url_https = (i['user']['profile_image_url_https']).encode('utf-8')
                    location, lat, lng = self.gen_geo(from_user)

                    self.write_tweet(twitter_id, from_user, text, created_at, keyword, location, lat, lng, epoch_time_found, profile_image_url_https)
                    opsecHeader.send_email(keyword, "Twitter")
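
opsecHeader.query_website_oauth_json is not shown in these excerpts; it presumably signs the request with the four OAuth credentials and stores the JSON response where read_results_json can find it. One plausible way to perform the signed GET, sketched here with the python-oauth2 package and placeholder credentials (the project may well use a different OAuth library):

import json
import oauth2 as oauth

# Placeholder credentials; the real values are the class attributes used above.
consumer = oauth.Consumer(key='CONSUMER_KEY', secret='CONSUMER_SECRET')
token = oauth.Token(key='ACCESS_TOKEN', secret='ACCESS_TOKEN_SECRET')
client = oauth.Client(consumer, token)

url = 'https://api.twitter.com/1.1/search/tweets.json?q=test&count=10&result_type=recent'
resp, content = client.request(url, method='GET')
statuses = json.loads(content)['statuses']
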
Example 4
def get_pastes():
    global paste_ids_found, paste_max_size

    if (len(paste_ids_found) >= (paste_max_size * 2)):
        print "[-] cleaning list"
        for i in range(0, len(paste_ids_found) - (paste_max_size)):
            paste_ids_found.pop(0)
    print "[-] Pulling archive list..."
    try:
        page = urllib2.urlopen("http://www.pastebin.com/archive.php").read()
        regex = re.compile(
            '<td><img src="/i/t.gif" .*?<a href="/(.*?)">(.*?)</a></td>.*?<td>(.*?)</td>',
            re.S)
        pastes = regex.findall(page)
        for i in pastes:
            paste_id = i[0]
            paste_title = i[1]
            fetch_attempt = 0
            opsecHeader.write_last_checked_time('pastebin')
            if (paste_id not in paste_ids_found):
                print "[-] New paste(", paste_id, ")"
                paste_ids_found.append(paste_id)
                print len(paste_ids_found)
                paste_page = ''
                while (paste_page == ''):
                    print "[+] Pulling Raw paste"
                    sock = urllib2.urlopen("http://pastebin.com/raw.php?i=" +
                                           paste_id)
                    paste_page = sock.read()
                    encoding = sock.headers['Content-type'].split('charset=')[
                        1]  # iso-8859-1
                    try:
                        paste_page = paste_page.decode(encoding).encode(
                            'utf-8')
                        if (paste_page == ''):
                            paste_page = 'empty paste from http://pastebin.com/raw.php?i=' + paste_id
                        if "requesting a little bit too much" in paste_page:
                            paste_page = ''
                            print "[-] hitting pastebin too quickly, sleeping for 2 seconds and trying again.."
                            time.sleep(2)
                    except (UnicodeDecodeError, LookupError):
                        print "[!] couldn't decode page to utf-8"
                    print "[-] Sleeping for 1 second"
                    time.sleep(1)
                    fetch_attempt = fetch_attempt + 1
                    if (fetch_attempt > 1):
                        print "[+] Couldnt fetch " + "http://pastebin.com/raw.php?i=" + paste_id + " after 2 tries"
                        paste_page = '  '
                add_paste(paste_title, paste_id, paste_page)
            else:
                print "[-] Already seen ", paste_id
        sleep_time = random.randint(15, 45)
        print "[-] sleeping for", sleep_time, "seconds.."
        time.sleep(sleep_time)
        return 1
    except IOError:
        print "[!] Error fetching list of pastes, sleeping for 10 seconds and trying again"
        time.sleep(10)
        return 0
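
get_pastes() relies on two module-level globals and returns 1 after a successful archive pass (having already slept between pulls) or 0 after an IOError. A driver as simple as the loop below would keep it polling; the cap of 500 tracked ids is an assumed value, not taken from the project.

# Module-level state consumed by get_pastes(); 500 is an assumed cap.
paste_ids_found = []
paste_max_size = 500

if __name__ == '__main__':
    while True:
        # get_pastes() already sleeps 15-45s between passes (10s after an error).
        get_pastes()
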
Example 5
    def get_post(self, account_id, site, user_id, content_type):
        latest_epoch_time = self.get_latest_post(user_id, site, content_type)
        query_string = 'http://api.stackexchange.com/2.1/users/' + str(user_id) + '/' + str(content_type) + 's?fromdate=' + str(latest_epoch_time) + '&order=desc&sort=creation&site=' + site + '&key=' + self.api_key
        opsecHeader.query_website_json(str(site) + str(user_id) + str(content_type), query_string)
        opsecHeader.write_last_checked_time('stackexchange')

        results = opsecHeader.read_results_json(str(site) + str(user_id) + str(content_type))
        items = results['items']
        for i in items:

            creation_date = i['creation_date']
            if(latest_epoch_time != creation_date):

                if(content_type == 'question'):
                    url = i['link']
                    html = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(html)
                    dirty_content = soup.find('div', {'class': 'post-text', 'itemprop': 'description'})
                    content = ''.join(dirty_content.findAll(text=True))

                elif(content_type == 'answer'):
                    answer_id = i['answer_id']
                    url = "http://" + str(site) + ".com/a/" + str(answer_id)
                    html = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(html)
                    answer_id = 'answer-' + str(answer_id)
                    div_content = soup.find('div', {'id': answer_id})
                    dirty_content = div_content.find('div', {'class': 'post-text'})
                    content = ''.join(dirty_content.findAll(text=True))

                elif(content_type == 'comment'):
                    comment_id = i['comment_id']
                    post_id = i['post_id']
                    short_url = 'http://' + str(site) + '.com/q/' + str(post_id)
                    long_url = str(urllib2.urlopen(short_url).geturl())
                    long_url = long_url.split("#")[0]
                    url = long_url + '#comment' + str(comment_id) + '_' + str(post_id)
                    html = urllib2.urlopen(url).read()
                    soup = BeautifulSoup(html)
                    comment_id_format = 'comment-' + str(comment_id)
                    try:  # Will fail if comments need to be loaded via AJAX
                        comment_tr = soup.find('tr', {'id': comment_id_format})
                        dirty_content = comment_tr.find('span', {'class': 'comment-copy'})
                        content = ''.join(dirty_content.findAll(text=True))
                    except AttributeError:
                        content = 'See website'

                profile_image = i['owner']['profile_image']
                display_name = i['owner']['display_name']

                self.write_display_name(account_id, display_name)
                self.write_latest_post(account_id, user_id, site, content_type, creation_date, profile_image, url, content, display_name)

                keywords = opsecHeader.get_user_keywords(account_id, 'stackexchange')
                for keyword in keywords:
                    if keyword in content:
                        opsecHeader.send_email(keyword, "Stack Exchange", display_name)
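
For reference, the /2.1/users/{id}/{type}s call built above returns a JSON wrapper whose items carry creation_date as epoch seconds plus an owner object with display_name and profile_image; api.stackexchange.com also gzip-compresses its responses, which query_website_json presumably deals with. A standalone sketch with a hypothetical user id and site, and no API key:

import gzip
import json
import StringIO
import urllib2

# Hypothetical user id and site: answers for user 1 on stackoverflow.
url = ('http://api.stackexchange.com/2.1/users/1/answers'
       '?fromdate=0&order=desc&sort=creation&site=stackoverflow')

resp = urllib2.urlopen(url)
body = resp.read()
if resp.headers.get('Content-Encoding') == 'gzip':
    # The API compresses responses; inflate before parsing.
    body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()

for item in json.loads(body)['items']:
    print item['creation_date'], item['owner']['display_name']
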
Example 6
def get_pastes():
    global paste_ids_found, paste_max_size

    if(len(paste_ids_found) >= (paste_max_size * 2)):
        print "[-] cleaning list"
        for i in range(0, len(paste_ids_found) - (paste_max_size)):
            paste_ids_found.pop(0)
    print "[-] Pulling archive list..."
    try:
        page = urllib2.urlopen("http://www.pastebin.com/archive.php").read()
        regex = re.compile('<td><img src="/i/t.gif" .*?<a href="/(.*?)">(.*?)</a></td>.*?<td>(.*?)</td>', re.S)
        pastes = regex.findall(page)
        for i in pastes:
            paste_id = i[0]
            paste_title = i[1]
            fetch_attempt = 0
            opsecHeader.write_last_checked_time('pastebin')
            if(paste_id not in paste_ids_found):
                print "[-] New paste(", paste_id, ")"
                paste_ids_found.append(paste_id)
                print len(paste_ids_found)
                paste_page = ''
                while (paste_page == ''):
                    print "[+] Pulling Raw paste"
                    sock = urllib2.urlopen("http://pastebin.com/raw.php?i=" + paste_id)
                    paste_page = sock.read()
                    encoding = sock.headers['Content-type'].split('charset=')[1]  # iso-8859-1
                    try:
                        paste_page = paste_page.decode(encoding).encode('utf-8')
                        if(paste_page == ''):
                            paste_page = 'empty paste from http://pastebin.com/raw.php?i=' + paste_id
                        if "requesting a little bit too much" in paste_page:
                            paste_page = ''
                            print "[-] hitting pastebin too quickly, sleeping for 2 seconds and trying again.."
                            time.sleep(2)
                    except (UnicodeDecodeError, LookupError):
                        print "[!] couldn't decode page to utf-8"
                    print "[-] Sleeping for 1 second"
                    time.sleep(1)
                    fetch_attempt = fetch_attempt + 1
                    if(fetch_attempt > 1):
                        print "[+] Couldnt fetch " + "http://pastebin.com/raw.php?i=" + paste_id + " after 2 tries"
                        paste_page = '  '
                add_paste(paste_title, paste_id, paste_page)
            else:
                print "[-] Already seen ", paste_id
        sleep_time = random.randint(15, 45)
        print "[-] sleeping for", sleep_time, "seconds.."
        time.sleep(sleep_time)
        return 1
    except IOError:
        print "[!] Error fetching list of pastes, sleeping for 10 seconds and trying again"
        time.sleep(10)
        return 0
Example 7
    def get_user_tweets(self, user):
        screen_name = urllib2.quote(user)
        opsecHeader.write_last_checked_time('twitter')

        # See https://dev.twitter.com/docs/api/1/get/statuses/user_timeline
        tweet_since_date = str(self.get_latest_tweet(screen_name, None)[0])
        epoch_time_existing = self.get_latest_tweet(screen_name, None)[1]

        twitter_query_string = 'https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=' + screen_name + '&count=10'

        if tweet_since_date != '0':  # Twitter does not play nice with invalid since_id's
            twitter_query_string += '&since_id=' + tweet_since_date

        opsecHeader.query_website_oauth_json("twitterUserTweets", twitter_query_string, self.consumer_key, self.consumer_secret, self.access_token, self.access_token_secret)

        twitter_results = opsecHeader.read_results_json('twitterUserTweets')
        # user_timeline returns a bare list of tweets (or None if the helper could not
        # read results), so it can be used directly without unwrapping.
        twitter_all_results = twitter_results

        if not twitter_all_results:
            print "No results."
        else:
            for i in twitter_all_results:
                created_at = (i['created_at']).encode('utf-8')
                epoch_time_found = calendar.timegm((email.utils.parsedate(created_at)))
                if int(epoch_time_found) > int(epoch_time_existing):
                    twitter_id = (i['id'])
                    text = (i['text']).encode('utf-8')
                    from_user = (i['user']['screen_name']).encode('utf-8')
                    created_at = (i['created_at']).encode('utf-8')
                    profile_image_url_https = (i['user']['profile_image_url_https']).encode('utf-8')

                    try:
                        # 'geo' is null unless the tweet is geotagged, so this lookup usually raises.
                        location = '?'
                        lat = i['geo']['coordinates'][0]
                        lng = i['geo']['coordinates'][1]
                        print "Got coordinates!"
                    except (TypeError, KeyError):
                        location, lat, lng = self.gen_geo(from_user)

                    self.write_tweet(twitter_id, from_user, text, created_at, '', location, lat, lng, epoch_time_found, profile_image_url_https)
                    keywords = opsecHeader.get_user_keywords(from_user, 'twitter')
                    for keyword in keywords:
                        if keyword in text:
                            opsecHeader.send_email(keyword, "Twitter", from_user)
Example 8
    def get_user_comments(self, user):
        # See http://www.reddit.com/dev/api

        user = urllib2.quote(user)

        reddit_query_string = 'http://www.reddit.com/user/' + user + '/overview.json'
        opsecHeader.query_website_json("reddit", reddit_query_string, opsecHeader.reddit_api_key)
        opsecHeader.write_last_checked_time('reddit')

        reddit_results = opsecHeader.read_results_json('reddit')
        try:
            reddit_all_results = reddit_results['data']['children']
        except KeyError:
            reddit_all_results = None
        epoch_time_existing = self.get_latest_user_epoch(user)

        if not reddit_all_results:
            print "No results."
        else:
            for i in reddit_all_results:
                epoch_time_found = str((i['data']['created_utc'])).encode('utf-8')[:-2]
                if int(epoch_time_found) > int(epoch_time_existing):
                    try:
                        link_id = (i['data']['link_id']).encode('utf-8')[3:]
                    except KeyError:
                        link_id = ''
                    comment_id = (i['data']['id']).encode('utf-8')
                    author = (i['data']['author']).encode('utf-8')
                    try:
                        body = (i['data']['body']).encode('utf-8')
                    except KeyError:
                        body = ''
                    try:
                        link_title = (i['data']['link_title']).encode('utf-8')
                    except KeyError:
                        link_title = ''
                    subreddit = (i['data']['subreddit']).encode('utf-8')
                    permalink = 'http://www.reddit.com/r/' + subreddit + '/comments/' + link_id + '/' + urllib2.quote(link_title) + '/' + comment_id
                    self.write_latest_post(author, body, link_id, comment_id, link_title, subreddit, epoch_time_found, permalink)

                    keywords = opsecHeader.get_user_keywords(author, 'reddit')
                    for keyword in keywords:
                        if keyword in body:
                            opsecHeader.send_email(keyword, "Reddit", author)
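
The overview.json listing wraps each comment or submission as a child whose useful fields sit under data, which is why the loop above digs into i['data'] and guards the optional keys with KeyError handlers. A standalone sketch of the same call with a hypothetical username; Reddit expects a descriptive User-Agent header, and opsecHeader.reddit_api_key presumably covers whatever authentication the real helper adds.

import json
import urllib2

# Hypothetical username and User-Agent string.
req = urllib2.Request('http://www.reddit.com/user/example_user/overview.json',
                      headers={'User-Agent': 'opsec-monitor-sketch/0.1'})
listing = json.load(urllib2.urlopen(req))

for child in listing['data']['children']:
    data = child['data']
    # Comments carry 'body'; link submissions do not, mirroring the KeyError guards above.
    print data['created_utc'], data['subreddit'], data.get('body', '')[:80]
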
Example 9
    def search_facebook(self, raw_keyword):
        opsecHeader.write_last_checked_time('facebook')
        keyword = urllib2.quote(raw_keyword)
        # See https://developers.facebook.com/docs/reference/api/
        #
        # Arguments:
        # q = keyword we are searching for
        # type = kind of object we are searching for, e.g. post
        #
        # Returns:
        # name; id (facebook.com/id for their profile)

        facebook_latest_epoch = self.get_latest_post_time()
        facebook_query_string = 'https://graph.facebook.com/search?q=' + keyword + '&type=post'
        opsecHeader.query_website_json("facebook", facebook_query_string)

        print "Parsing Facebook data..."

        facebook_results = opsecHeader.read_results_json('facebook')
        facebook_all_results = facebook_results['data']

        if facebook_all_results:
            for i in facebook_all_results:
                if 'message' in i:
                    message = i['message'].encode('utf-8')
                    name = (i['from']['name']).encode('utf-8')
                    user_id = (i['from']['id']).encode('utf-8')
                    updated_time = (i['updated_time']).encode('utf-8')
                    epoch_time = calendar.timegm((time.strptime(updated_time, '%Y-%m-%dT%H:%M:%S+0000')))

                    if int(epoch_time) > int(facebook_latest_epoch):
                        profile_picture = self.get_profile_picture(user_id)
                        self.write_latest_post(name, user_id, message, profile_picture, updated_time, keyword, epoch_time)
                        opsecHeader.send_email(keyword, "Facebook")
                        print "Updated Time: " + updated_time
                    else:
                        print "Post too old."