Ejemplo n.º 1
0
 def on_status(self, status):
     """Tweepy stream callback: queue a followed user's tweet.

     Escapes single quotes in the tweet text, hashtags and urls, formats
     the created_at timestamp, and inserts one row into
     peepbuzz.twitter_queue when the author's id is in the global
     follow_list.  `status` is the parsed tweet dict from the stream.
     """
     global follow_list

     #format time into 2011-02-23T16:42:40+0000 format ala facebook
     #Twitter Format: Wed Mar 23 22:51:50 +0000 2011
     formattedTime = self.formatTime(status['created_at'])
     hashtags = []

     # collect hashtag texts, escaping single quotes for the SQL below
     if len(status['entities']['hashtags']):
         for val in status['entities']['hashtags']:
             hashtags.append(val['text'].replace("'", "\\'"))

     hashtag = ','.join(hashtags)

     urls = []
     if len(status['entities']['urls']):
         for val in status['entities']['urls']:
             urls.append(val['url'].replace("'", "\\'"))

     url = ','.join(urls)
     #print status['text']
     text = status['text'].replace("'", "\\'")
     # a trailing backslash would escape the closing quote in the SQL string
     if text[-1] == '\\':
     	text = text + " "
     if str(status['user']['id']) in follow_list:
         file_put_contents(str(status['user']['screen_name']) + " posted something")
         infoModule.info.site['dblink'] = mysql_tools.db_connect()

         # NOTE(review): SQL is assembled by string concatenation; quotes are
         # hand-escaped above, but a parameterized query would be safer.
         sql = u"INSERT INTO `peepbuzz`.`twitter_queue` SET `status_id` = '" + str(status['id']) + "', `created` = '" + formattedTime + "', `promoter_id` = '" + str(status['user']['id']) + "', `promoter` = '" + status['user']['screen_name'] + "', `thumbnail` = '" + str(status['user']['profile_image_url']) + "', `summary` = '" + text + "', `external_id` = '" + str(status['user']['id']) + "', `hashtags` = '" + hashtag + "', `urls` = '" + url + "'";
         mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
         infoModule.info.site['dblink'].close()
     else:
         pass
Ejemplo n.º 2
0
def cleanup(days):
    """Purge filaments older than `days` days, plus orphaned stories.

    Every expired filament row is deleted; if the filament pointed at a
    story and no other filament still references that story, the story
    row is deleted too.  Exits the process on any query failure.
    Returns True when the sweep completes.
    """
    link = mysql_tools.db_connect()
    query = 'SELECT filament_id, story_id FROM peepbuzz.filaments WHERE created <= DATE_SUB(NOW(), INTERVAL ' + str(days) + ' DAY)'
    result = mysql_tools.mysqlQuery(query, link)
    while 1:
        row = result.fetch_row(1, 1)
        if row == ():
            break
        filament_id = row[0]['filament_id']
        story_id = row[0]['story_id']
        query = 'DELETE from peepbuzz.filaments WHERE filament_id = "' + str(filament_id) + '"'
        try:
            mysql_tools.mysqlQuery(query, link)
        except Exception:
            pprint.pprint(query)
            sys.exit(1)
        if story_id is not None:
            query = 'SELECT count(*) from peepbuzz.filaments WHERE story_id = "' + str(story_id) + '"'
            try:
                count_result = mysql_tools.mysqlQuery(query, link)
            except Exception:
                pprint.pprint(query)
                sys.exit(1)
            # BUGFIX: the original reused `row` here (clobbering the filament
            # row), compared the count *dict* itself to 0 (never true), and
            # then read story_id out of the count row, which has no such key
            # -- so orphaned stories were never actually deleted.
            count_row = count_result.fetch_row(1, 1)
            if not count_row:
                break
            # fetch_row(1, 1) keys the dict by column name, i.e. 'count(*)'
            if int(count_row[0]['count(*)']) == 0:
                query = 'DELETE FROM peepbuzz.stories WHERE story_id = "' + str(story_id) + '"'
                try:
                    mysql_tools.mysqlQuery(query, link)
                except Exception:
                    pprint.pprint(query)
                    sys.exit(1)
    return True
Ejemplo n.º 3
0
 def test_find_account(self):
     """accountFinder should create an account on first sight and resolve
     the same id (no duplicate) on a second identical lookup."""
     infoModule.info.site['dblink'] = mysql_tools.db_connect()
     dblink = infoModule.info.site['dblink']
     # fixture data for a synthetic stream-1 account
     self.stream_id = 1
     self.external_id = 'acctfndrtest'
     self.user_name = 'account finder'
     self.thumbnail = 'http://newdisorder.com/image/trabant_cover.gif'
     res = accountFinder.accountFinder(self.stream_id, self.external_id, self.user_name, self.thumbnail)
     # res[1] names the table the account resolved to; a fresh account
     # is expected to land in 'accounts'
     self.assertEqual(res[1], 'accounts')
     self.new_id = res[0]
     # second call with identical data must return the same account id
     res = accountFinder.accountFinder(self.stream_id, self.external_id, self.user_name, self.thumbnail)
     self.assertEqual(res[0], self.new_id)
Ejemplo n.º 4
0
 def test_find_account(self):
     """Looking up the same external account twice must yield one id.

     The first accountFinder call stores the account (res[1] is expected
     to be 'accounts'); the second call with identical inputs must return
     the id created by the first.
     """
     infoModule.info.site['dblink'] = mysql_tools.db_connect()
     dblink = infoModule.info.site['dblink']
     self.stream_id = 1
     self.external_id = 'acctfndrtest'
     self.user_name = 'account finder'
     self.thumbnail = 'http://newdisorder.com/image/trabant_cover.gif'
     lookup_args = (self.stream_id, self.external_id,
                    self.user_name, self.thumbnail)
     first = accountFinder.accountFinder(*lookup_args)
     self.assertEqual(first[1], 'accounts')
     self.new_id = first[0]
     second = accountFinder.accountFinder(*lookup_args)
     self.assertEqual(second[0], self.new_id)
Ejemplo n.º 5
0
def main():
    """Entry point: open a Twitter stream filtered to the followed users.

    Loads the follow list from the database, then blocks inside
    stream.filter(), which delivers statuses to StreamWatcherListener.
    """
    global follow_list
    
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    # NOTE(review): 'tiwtterPassword' is presumably a module-level constant
    # defined with this typo -- confirm before renaming it.
    auth = BasicAuthHandler(twitterAccount, tiwtterPassword)
    stream = newStream(auth, StreamWatcherListener(), timeout=None)

    follow_list = getUserList()
    # Demo Mode
    #follow_list = getUserList(True)
    infoModule.info.site['dblink'].close()
    if len(follow_list) == 0:
        file_put_contents('Could not get a list of users to follow')
        sys.exit()

    stream.filter(follow=follow_list)
Ejemplo n.º 6
0
def main():
    """Start streaming tweets for every followed user id.

    Builds the follow list from the database, wires up a
    StreamWatcherListener, and hands control to stream.filter(), which
    blocks for the life of the process.
    """
    global follow_list

    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    credentials = BasicAuthHandler(twitterAccount, tiwtterPassword)
    stream = newStream(credentials, StreamWatcherListener(), timeout=None)

    follow_list = getUserList()
    # Demo Mode
    #follow_list = getUserList(True)
    infoModule.info.site['dblink'].close()
    if not follow_list:
        file_put_contents('Could not get a list of users to follow')
        sys.exit()

    stream.filter(follow=follow_list)
    def setUp(self):
        """Create a blocked_accounts fixture row linking an existing user
        and account; remembers its primary key in self.ba_id."""
        infoModule.info.site['dblink'] = mysql_tools.db_connect()
        dblink = infoModule.info.site['dblink']
        #set up accounts for test
        # get valid account
        sql = "SELECT user_id from peepbuzz.users limit 1"
        userQ = mysql_tools.mysqlQuery(sql, dblink)
        user = userQ.fetch_row(1, 1)

        sql = "SELECT account_id from peepbuzz.accounts limit 1"
        accountQ = mysql_tools.mysqlQuery(sql, dblink)
        account = accountQ.fetch_row(1, 1)
        self.account_id = account[0]['account_id']
        self.user_id = user[0]['user_id']
        # NOTE(review): self.unknown_account_id is read here but never
        # assigned in this method -- it must come from a class attribute or
        # earlier fixture; confirm, otherwise this raises AttributeError.
        sql = "insert into peepbuzz.blocked_accounts set user_id=" + self.user_id + ", unknown_account_id=" + self.unknown_account_id + ", account_id=" + self.account_id
        testQ = mysql_tools.mysqlQuery(sql, dblink)
        self.ba_id = dblink.insert_id()
    def setUp(self):
        """Insert a blocked_accounts row for the first user/account pair in
        the database and stash its primary key in self.ba_id."""
        dblink = mysql_tools.db_connect()
        infoModule.info.site['dblink'] = dblink
        # pick any existing user and account to attach the fixture to
        user_row = mysql_tools.mysqlQuery(
            "SELECT user_id from peepbuzz.users limit 1", dblink).fetch_row(1, 1)
        account_row = mysql_tools.mysqlQuery(
            "SELECT account_id from peepbuzz.accounts limit 1", dblink).fetch_row(1, 1)
        self.account_id = account_row[0]['account_id']
        self.user_id = user_row[0]['user_id']
        insert_sql = ("insert into peepbuzz.blocked_accounts set user_id=" +
                      self.user_id + ", unknown_account_id=" +
                      self.unknown_account_id + ", account_id=" +
                      self.account_id)
        insertQ = mysql_tools.mysqlQuery(insert_sql, dblink)
        self.ba_id = dblink.insert_id()
Ejemplo n.º 9
0
    def on_status(self, status):
        """Stream callback: push a followed user's tweet into twitter_queue.

        Escapes single quotes in the tweet text, hashtags and urls, formats
        the created_at timestamp, and inserts one row when the author's id
        is present in the global follow_list.
        """
        global follow_list

        # Facebook-style timestamp (2011-02-23T16:42:40+0000) from
        # Twitter's "Wed Mar 23 22:51:50 +0000 2011" format
        formattedTime = self.formatTime(status['created_at'])

        entities = status['entities']
        hashtag = ','.join(
            tag['text'].replace("'", "\\'") for tag in entities['hashtags'])
        url = ','.join(
            item['url'].replace("'", "\\'") for item in entities['urls'])

        #print status['text']
        text = status['text'].replace("'", "\\'")
        if text[-1] == '\\':
            # a trailing backslash would escape the closing SQL quote
            text = text + " "

        if str(status['user']['id']) not in follow_list:
            return

        file_put_contents(
            str(status['user']['screen_name']) + " posted something")
        infoModule.info.site['dblink'] = mysql_tools.db_connect()

        sql = (u"INSERT INTO `peepbuzz`.`twitter_queue` SET "
               u"`status_id` = '" + str(status['id']) +
               u"', `created` = '" + formattedTime +
               u"', `promoter_id` = '" + str(status['user']['id']) +
               u"', `promoter` = '" + status['user']['screen_name'] +
               u"', `thumbnail` = '" + str(status['user']['profile_image_url']) +
               u"', `summary` = '" + text +
               u"', `external_id` = '" + str(status['user']['id']) +
               u"', `hashtags` = '" + hashtag +
               u"', `urls` = '" + url + u"'")
        mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        infoModule.info.site['dblink'].close()
Ejemplo n.º 10
0
def cleanup(days):
    """Purge filaments older than `days` days, plus orphaned stories.

    Every expired filament row is deleted; if the filament pointed at a
    story and no other filament still references that story, the story
    row is deleted too.  Exits the process on any query failure.
    Returns True when the sweep completes.
    """
    link = mysql_tools.db_connect()
    query = 'SELECT filament_id, story_id FROM peepbuzz.filaments WHERE created <= DATE_SUB(NOW(), INTERVAL ' + str(days) + ' DAY)'
    result = mysql_tools.mysqlQuery(query, link)
    while 1:
        row = result.fetch_row(1, 1)
        if row == ():
            break
        filament_id = row[0]['filament_id']
        story_id = row[0]['story_id']
        query = 'DELETE from peepbuzz.filaments WHERE filament_id = "' + str(filament_id) + '"'
        try:
            mysql_tools.mysqlQuery(query, link)
        except Exception:
            pprint.pprint(query)
            sys.exit(1)
        if story_id is not None:
            query = 'SELECT count(*) from peepbuzz.filaments WHERE story_id = "' + str(story_id) + '"'
            try:
                count_result = mysql_tools.mysqlQuery(query, link)
            except Exception:
                pprint.pprint(query)
                sys.exit(1)
            # BUGFIX: the original reused `row` here (clobbering the filament
            # row), compared the count *dict* itself to 0 (never true), and
            # then read story_id out of the count row, which has no such key
            # -- so orphaned stories were never actually deleted.
            count_row = count_result.fetch_row(1, 1)
            if not count_row:
                break
            # fetch_row(1, 1) keys the dict by column name, i.e. 'count(*)'
            if int(count_row[0]['count(*)']) == 0:
                query = 'DELETE FROM peepbuzz.stories WHERE story_id = "' + str(story_id) + '"'
                try:
                    mysql_tools.mysqlQuery(query, link)
                except Exception:
                    pprint.pprint(query)
                    sys.exit(1)
    return True
Ejemplo n.º 11
0
    # NOTE(review): truncated fragment -- the enclosing def (which provides
    # `be` and `story_id`) begins above this excerpt.  Persists extracted
    # images, then videos (skipping duplicate video URLs), into
    # story_images / story_videos.  Returns False only on an image insert
    # failure; video failures are merely recorded in errorList.
    if(len(be['images'])>0):
        for img in be['images']:
            query = u'insert into peepbuzz.story_images (story_id, url, width, height) values ("'+str(story_id)+'","'+str(img['url'])+'","'+str(img['width'])+'","'+str(img['height'])+'")'
            print query
            try:
                mysql_tools.mysqlQuery(query, infoModule.info.site['dblink'])
            except:
                return False
    # check videos
    if(len(be['videos'])>0):
        for vid in be['videos']:
            #check for dupes
            sql = "SELECT video_id FROM peepbuzz.story_videos WHERE story_id=" + str(story_id) + " and url='" + str(vid['url']) + "'"
            video_dupe_check_q = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
            if video_dupe_check_q.num_rows() == 0:
                embed_code = vid['embed_code'].replace("'", "\\'")
                query = u"insert into peepbuzz.story_videos (story_id, url, embed_code, width, height) values ('"+str(story_id)+"','"+str(vid['url'])+"','"+embed_code+"','"+str(vid['width'])+"','"+str(vid['height'])+"')"
                try:
                    mysql_tools.mysqlQuery(query, infoModule.info.site['dblink'])
                except:
                    infoModule.info.errorList.append("failed to add video")
                    infoModule.info.errorList.append(query)
    return True

if __name__ == "__main__":
    # connect once and publish the link for the functions above to share
    link = mysql_tools.db_connect()
    if link == False :
        print "no connection"
        sys.exit(0)
    infoModule.info.site['dblink'] = link
Ejemplo n.º 12
0
    # NOTE(review): truncated fragment -- the enclosing function (which
    # defines explodedCount, url_list and res, and spawns the worker
    # threads) begins above this excerpt.  Busy-waits until every URL has
    # been exploded or the 2-minute deadline passes.
    disasterTimeout = 120.0
    #better be able to get these all in 2 minutes
    startTime = time.time()
    while True:
        if explodedCount == len(url_list):
            break
        #disaster timeout and crash
        if time.time() - startTime > disasterTimeout:
            print "explodeUrls timeout"
            break
    return res


if __name__ == "__main__":
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    # two body-extractor backends available to the exploder
    infoModule.info.source['body_extractor_servers'] = [
        ('192.168.0.100', '1348'), ('192.168.0.26', '1348')
    ]
    # sample shortened URLs used as test input
    # NOTE(review): this list literal is truncated in this excerpt and
    # continues beyond it.
    URLList = [
        'http://bit.ly/hrRze1', 'http://twitpic.com/43s13z',
        'http://twitpic.com/43ouap', 'http://twitpic.com/43ouak',
        'http://fb.me/ETYTKELP', 'http://fb.me/ESd8X8Bl',
        'http://bit.ly/gZAJzU', 'http://bit.ly/eBIhUP', 'http://bit.ly/eTzXuu',
        'http://bit.ly/glhMcB', 'http://fb.me/QSucj2IG',
        'http://fb.me/VnX2tOZn', 'http://ow.ly/i/8ury', 'http://t.co/OwnQba6',
        'http://bit.ly/ibSNUF', 'http://fb.me/IfoHBSoh', 'http://ow.ly/i/8ssC',
        'http://fb.me/S7Fmh2Ro', 'http://twitpic.com/42o42e',
        'http://bit.ly/e7YkQA', 'http://fb.me/RVtX25f4', 'http://img.ly/34lS',
        'http://img.ly/34lj', 'http://fb.me/IjEMbpZF', 'http://bit.ly/fytGPO',
        'http://tinyurl.com/4bew58q', 'http://ow.ly/i/8qnV',
Ejemplo n.º 13
0
def main():
    sleep = 1 * 60 # Seconds to sleep
    pidPath = "/tmp/twitterUserStream.pid"
    streamPath = "twitterUserStream.py"
    pid = None
    userCount = None
    lastUserCount = None
    noCheck = False
    running = False

    # Create a loop
    while True:
        infoModule.info.site['dblink'] = mysql_tools.db_connect()
        # New Loop so move the user counts
        lastUserCount = userCount
        userCount = None
    
        # Check the file for a PID
        try:
            file = open(pidPath)
            while True:
                line = file.readline()
                if not line:
                    break
                pid = line
                
                if pid:
                    # Get the status of the PID
                    try:
                        os.kill(int(pid), 0)
                    except OSError:
                        running = False
                    else:
                        running = True
                    
                    print str(pid) + " - is running?: " + str(running)
                else:
                    running = False
                
        except IOError:
            # We dont care if the file does not exist, since it will get created the first time around
            # So we can treat it as the deamon is not running
            running = False
            pid = 0
        
        
        # Get the count for how many users we are following
        sql = "SELECT count(*) as `userCount` FROM `peepbuzz`.`curators` LEFT JOIN `peepbuzz`.`accounts` ON `accounts`.`account_id` = `curators`.`account_id` WHERE `accounts`.`external_id` IS NOT NULL AND `accounts`.`stream_id` = 1"
        countQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        
        try:
            userCountV = countQ.fetch_row(1,1)
            userCount = userCountV[0]['userCount']
        except:
            print "Problem fetching the users from the database"
            sys.exit()

        # if the count has changed from last count OR the PID is not running start a new deamon
        if userCount != lastUserCount or not running:
            try:
                os.kill(int(pid), 9)
            except OSError:
                # Proc is aready dead
                pass
            
            print "Starting the Daemon"
            
            os.system("python2.7 " + streamPath + " &")
            
        infoModule.info.site['dblink'].close()            
        time.sleep(sleep)
import infoModule
infoModule.info.site['remoteLogging'] = False

from alogClient import *
import mysql_tools
import _mysql

# One-shot schema migration: drop the unknown_account_id column (and its
# foreign key) from peepbuzz.curators, then copy every unknown_accounts row
# into peepbuzz.accounts.
dblink = mysql_tools.db_connect()
infoModule.info.site['log_priority_threshold'] = 100

sql = "alter table peepbuzz.curators drop foreign key curator_unknown_account_id_constraint"
print sql
mysql_tools.mysqlQuery(sql, dblink)
sql = "alter table peepbuzz.curators drop unknown_account_id"
print sql
mysql_tools.mysqlQuery(sql, dblink)

sql = "select * from peepbuzz.unknown_accounts"
print sql
uaQ = mysql_tools.mysqlQuery(sql, dblink)
while True:
    ua = uaQ.fetch_row(1, 1)
    if ua == ():
        break

    # NOTE(review): values are concatenated unescaped -- a user_name
    # containing a quote will break this insert.
    sql = "insert into peepbuzz.accounts set stream_id=" + str(
        ua[0]['stream_id']
    ) + ", account_user_name='" + ua[0]['user_name'] + "', external_id='" + ua[
        0]['external_id'] + "', thumbnail='" + ua[0]['thumbnail'] + "'"
    print sql
    mysql_tools.mysqlQuery(sql, dblink)
Ejemplo n.º 15
0
def findImages(fullArticle, url):
    """Find all candidate story images in a page.

    Unlike the sourceReader version of findImages (which returns only the
    largest image), this returns a list of {'url', 'width', 'height'} dicts
    for every .jpg/.jpeg that passes the size and blacklist filters, or
    None when no image of at least minWidth x minHeight survives.
    """
    URLParts = urlparse.urlparse(url)
    myHost = URLParts[1]
    log.plog('image search: full article len=' + str(len(fullArticle)), 3)
    imageSearch = fullArticle
    minWidth = 100
    minHeight = 100
    largestWidth = 0
    maxSize = 0
    imageURLList = []
    imgSize = [0, 0]
    # BUGFIX: the 4th positional argument of re.sub is *count*, not flags --
    # the original passed re.I | re.M there (== 10), silently capping the
    # number of substitutions at 10.  Compile with the flags instead.
    anchorRe = re.compile('<a.*?href=[\'"]([^\'"]*?\.jpg)[\'"].*?>', re.I | re.M)
    imageSearch = anchorRe.sub('<img src="\\1">', imageSearch)
    imageBucket = re.findall('(<img.*?src=[\'"][^"].*?\.jpg[\'"].*?>)', imageSearch, re.I)
    images = []
    # get the most recent story for this host so its images can be blacklisted
    link = mysql_tools.db_connect()
    query = 'select story_id from peepbuzz.stories where url like "%'+str(myHost)+'%" order by story_id DESC LIMIT 1'
    result = mysql_tools.mysqlQuery(query, link)
    previous_images = []
    pprint.pprint(previous_images)
    while(1):
        row = result.fetch_row(0,1)
        if row == ():
            break
        story_id = str(row[0]['story_id'])
        query = 'select url from peepbuzz.story_images where story_id="'+str(story_id)+'"'
        # use a separate result handle so the outer loop's cursor is not clobbered
        imgResult = mysql_tools.mysqlQuery(query, link)

        while(1):
            imgRow = imgResult.fetch_row(1,1)
            if imgRow == ():
                break
            previous_images.append(imgRow[0]['url'])

    #get list of blacklisted images for domain
    query = 'select full_path from peepbuzz.blacklisted_images where host like "%'+str(myHost)+'%"'
    result = mysql_tools.mysqlQuery(query, link)
    blacklisted_images = []
    if(result):
        while(1):
            row = result.fetch_row(1,1)
            if row == ():
                break
            blacklisted_images.append(row[0]['full_path'])

    pprint.pprint(previous_images)
    pprint.pprint(blacklisted_images)
    for image in imageBucket:
        log.plog("image: " + image)
        match = re.search('[\s]*src=["\'](.*?)["\']', image)
        if match == None:
            continue
        imgUrl = match.group(1)
        if imgUrl[0:4] != "http":
            # relative URL: anchor it to the article's host
            if imgUrl[0:1] == '/':
                imgUrl = 'http://' +myHost+imgUrl
            else:
                imgUrl = 'http://' +myHost+ '/' + imgUrl
                log.plog("adding site_url to image to get " + imgUrl)
        # an image reused from the previous story is boilerplate; blacklist it
        if(imgUrl in previous_images):
            query = 'insert into peepbuzz.blacklisted_images set host = "'+str(myHost)+'", full_path = "'+str(imgUrl)+'"'
            result = mysql_tools.mysqlQuery(query, link)
            blacklisted_images.append(imgUrl)
        if (imgUrl not in imageURLList) and (imgUrl not in blacklisted_images) and (imgUrl[-3:] == 'jpg' or imgUrl[-4:] == 'jpeg'):
            imageURLList.append(imgUrl)
            image = re.sub('onclick=\".*?\"', "", image)
            decWidth = re.search('width=["\']*(\d+)', image, re.I)
            decHeight = re.search('height=["\']*(\d+)', image, re.I)
            decWidth2 = re.search('width:\s*(\d+)px', image, re.I)
            # BUGFIX: the original searched the *width* pattern again here
            decHeight2 = re.search('height:\s*(\d+)px', image, re.I)
            # BUGFIX: compare the captured digits as ints -- the original
            # compared group(0) (a string) to an int, which in Python 2 is
            # always True, and crashed int() on group(0) in the style branch.
            if decWidth != None and decHeight != None and \
                    int(decWidth.group(1)) > minWidth and int(decHeight.group(1)) > minHeight:
                imgSize[0] = int(decWidth.group(1))
                imgSize[1] = int(decHeight.group(1))
                err = "declared image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
            elif decWidth2 != None and decHeight2 != None and \
                    int(decWidth2.group(1)) > minWidth and int(decHeight2.group(1)) > minHeight:
                imgSize[0] = int(decWidth2.group(1))
                imgSize[1] = int(decHeight2.group(1))
                err = "declared (via style) image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
                imageErrorLog(err + " Image: " + imgUrl)
            else:
                # no usable declared size: fetch the file and measure it
                imgSize = getimagesize(imgUrl)
            # BUGFIX: also skip a zero height, which would divide by zero below
            if imgSize[0] == 0 or imgSize[1] == 0 or (imgSize[0] < minWidth and imgSize[1] < minHeight):
                continue
            err = "fetched image size via getimagesize("+imgUrl+") and got: "+str(imgSize[0])+" x "+str(imgSize[1])
            log.plog(err, 1)
            images.append({"url" : imgUrl, "width" : imgSize[0],  "height":imgSize[1]})
            # track the largest non-banner image (aspect ratio under 3.5)
            # BUGFIX: use float division; Python 2 integer division made the
            # ratio test pass for extreme banners (e.g. 700x200 -> 3)
            if imgSize[0] * imgSize[1] > maxSize and \
                    float(imgSize[0]) / imgSize[1] < 3.5 and float(imgSize[1]) / imgSize[0] < 3.5:
                maxSize = imgSize[0] * imgSize[1]
                largestWidth = imgSize[0]

    if largestWidth == 0:
        return None
    else:
        log.plog("findndImages found " + str(len(images)) + " images", 3)
        return images
Ejemplo n.º 16
0
        # NOTE(review): truncated fragment -- the statement that builds
        # `sql` (and the enclosing def) begins above this excerpt.
        )
        mysql_tools.mysqlQuery(sql, dblink)

def testUTF():
    dblink = infoModule.info.site["dblink"]
    sql = u"insert into peepbuzz.stories set title='foo faa \u2026 fum'"
    # mysql_tools.mysqlQuery(sql, dblink)
    sql = "select title from peepbuzz.stories"
    accountsQ = mysql_tools.mysqlQuery(sql, dblink)
    accounts = accountsQ.fetch_row(0, 1)
    print accounts[0]["title"]


if __name__ == "__main__":
    infoModule.info.site["dblink"] = mysql_tools.db_connect()
    infoModule.info.site["log_priority_threshold"] = 100
    # streamList is the accumulated json from all streams, declared global because of the multi-threaded call
    # to fetch streams
    streamList = []
    ## totalStreamsRead is incremented to decide when to exit from stream threading hold pattern
    totalStreamsRead = 0

    # map stream name to id, a common need
    # use: infoModule.info.site['stream_name_to_id']['twitter'] produces id
    infoModule.info.site["stream_name_to_id"] = loadStreams.loadStreams()

    # infoModule.info.source['body_extractor_host'] = "68.68.109.26"
    # infoModule.info.source['body_extractor_port'] = "1348"

    # determine the body extractor servers
    # NOTE(review): this __main__ block is truncated here and continues
    # beyond this excerpt.
Ejemplo n.º 17
0
def main():
    sleep = 1 * 60  # Seconds to sleep
    pidPath = "/tmp/twitterUserStream.pid"
    streamPath = "twitterUserStream.py"
    pid = None
    userCount = None
    lastUserCount = None
    noCheck = False
    running = False

    # Create a loop
    while True:
        infoModule.info.site['dblink'] = mysql_tools.db_connect()
        # New Loop so move the user counts
        lastUserCount = userCount
        userCount = None

        # Check the file for a PID
        try:
            file = open(pidPath)
            while True:
                line = file.readline()
                if not line:
                    break
                pid = line

                if pid:
                    # Get the status of the PID
                    try:
                        os.kill(int(pid), 0)
                    except OSError:
                        running = False
                    else:
                        running = True

                    print str(pid) + " - is running?: " + str(running)
                else:
                    running = False

        except IOError:
            # We dont care if the file does not exist, since it will get created the first time around
            # So we can treat it as the deamon is not running
            running = False
            pid = 0

        # Get the count for how many users we are following
        sql = "SELECT count(*) as `userCount` FROM `peepbuzz`.`curators` LEFT JOIN `peepbuzz`.`accounts` ON `accounts`.`account_id` = `curators`.`account_id` WHERE `accounts`.`external_id` IS NOT NULL AND `accounts`.`stream_id` = 1"
        countQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])

        try:
            userCountV = countQ.fetch_row(1, 1)
            userCount = userCountV[0]['userCount']
        except:
            print "Problem fetching the users from the database"
            sys.exit()

        # if the count has changed from last count OR the PID is not running start a new deamon
        if userCount != lastUserCount or not running:
            try:
                os.kill(int(pid), 9)
            except OSError:
                # Proc is aready dead
                pass

            print "Starting the Daemon"

            os.system("python2.7 " + streamPath + " &")

        infoModule.info.site['dblink'].close()
        time.sleep(sleep)
Ejemplo n.º 18
0
            # NOTE(review): truncated fragment -- the loop spawning the
            # worker threads (and the enclosing function defining
            # explodedCount, url_list and res) begins above this excerpt.
            MyThread(x).start()

    # busy-wait until all URLs are exploded or the 2-minute deadline passes
    disasterTimeout = 120.0
    #better be able to get these all in 2 minutes
    startTime = time.time()
    while True:
        if explodedCount == len(url_list):
            break
        #disaster timeout and crash
        if time.time() - startTime > disasterTimeout:
            print "explodeUrls timeout"
            break
    return res
 
if __name__ == "__main__":
    infoModule.info.site['dblink'] = mysql_tools.db_connect()
    # two body-extractor backends available to the exploder
    infoModule.info.source['body_extractor_servers'] = [('192.168.0.100', '1348'), ('192.168.0.26', '1348')]
    # sample shortened URLs used as test input
    # NOTE(review): this list literal is truncated in this excerpt and
    # continues beyond it.
    URLList = ['http://bit.ly/hrRze1',
     'http://twitpic.com/43s13z',
     'http://twitpic.com/43ouap',
     'http://twitpic.com/43ouak',
     'http://fb.me/ETYTKELP',
     'http://fb.me/ESd8X8Bl',
     'http://bit.ly/gZAJzU',
     'http://bit.ly/eBIhUP',
     'http://bit.ly/eTzXuu',
     'http://bit.ly/glhMcB',
     'http://fb.me/QSucj2IG',
     'http://fb.me/VnX2tOZn',
     'http://ow.ly/i/8ury',
     'http://t.co/OwnQba6',
Ejemplo n.º 19
0
def findImages(fullArticle, url):
    """Find all candidate story images in a page.

    Unlike the sourceReader version of findImages (which returns only the
    largest image), this returns a list of {'url', 'width', 'height'} dicts
    for every .jpg/.jpeg that passes the size and blacklist filters, or
    None when no image of at least minWidth x minHeight survives.
    """
    URLParts = urlparse.urlparse(url)
    myHost = URLParts[1]
    log.plog("image search: full article len=" + str(len(fullArticle)), 3)
    imageSearch = fullArticle
    minWidth = 100
    minHeight = 100
    largestWidth = 0
    maxSize = 0
    imageURLList = []
    imgSize = [0, 0]
    # BUGFIX: the 4th positional argument of re.sub is *count*, not flags --
    # the original passed re.I | re.M there (== 10), silently capping the
    # number of substitutions at 10.  Compile with the flags instead.
    anchorRe = re.compile("<a.*?href=['\"]([^'\"]*?\.jpg)['\"].*?>", re.I | re.M)
    imageSearch = anchorRe.sub('<img src="\\1">', imageSearch)
    imageBucket = re.findall('(<img.*?src=[\'"][^"].*?\.jpg[\'"].*?>)', imageSearch, re.I)
    images = []
    # get the most recent story for this host so its images can be blacklisted
    link = mysql_tools.db_connect()
    query = 'select story_id from peepbuzz.stories where url like "%' + str(myHost) + '%" order by story_id DESC LIMIT 1'
    result = mysql_tools.mysqlQuery(query, link)
    previous_images = []
    pprint.pprint(previous_images)
    while 1:
        row = result.fetch_row(0, 1)
        if row == ():
            break
        story_id = str(row[0]["story_id"])
        query = 'select url from peepbuzz.story_images where story_id="' + str(story_id) + '"'
        # use a separate result handle so the outer loop's cursor is not clobbered
        imgResult = mysql_tools.mysqlQuery(query, link)

        while 1:
            imgRow = imgResult.fetch_row(1, 1)
            if imgRow == ():
                break
            previous_images.append(imgRow[0]["url"])

    # get list of blacklisted images for domain
    query = 'select full_path from peepbuzz.blacklisted_images where host like "%' + str(myHost) + '%"'
    result = mysql_tools.mysqlQuery(query, link)
    blacklisted_images = []
    if result:
        while 1:
            row = result.fetch_row(1, 1)
            if row == ():
                break
            blacklisted_images.append(row[0]["full_path"])

    pprint.pprint(previous_images)
    pprint.pprint(blacklisted_images)
    for image in imageBucket:
        log.plog("image: " + image)
        match = re.search("[\s]*src=[\"'](.*?)[\"']", image)
        if match == None:
            continue
        imgUrl = match.group(1)
        if imgUrl[0:4] != "http":
            # relative URL: anchor it to the article's host
            if imgUrl[0:1] == "/":
                imgUrl = "http://" + myHost + imgUrl
            else:
                imgUrl = "http://" + myHost + "/" + imgUrl
                log.plog("adding site_url to image to get " + imgUrl)
        # an image reused from the previous story is boilerplate; blacklist it
        if imgUrl in previous_images:
            query = (
                'insert into peepbuzz.blacklisted_images set host = "'
                + str(myHost)
                + '", full_path = "'
                + str(imgUrl)
                + '"'
            )
            result = mysql_tools.mysqlQuery(query, link)
            blacklisted_images.append(imgUrl)
        if (
            (imgUrl not in imageURLList)
            and (imgUrl not in blacklisted_images)
            and (imgUrl[-3:] == "jpg" or imgUrl[-4:] == "jpeg")
        ):
            imageURLList.append(imgUrl)
            image = re.sub('onclick=".*?"', "", image)
            decWidth = re.search("width=[\"']*(\d+)", image, re.I)
            decHeight = re.search("height=[\"']*(\d+)", image, re.I)
            decWidth2 = re.search("width:\s*(\d+)px", image, re.I)
            # BUGFIX: the original searched the *width* pattern again here
            decHeight2 = re.search("height:\s*(\d+)px", image, re.I)
            # BUGFIX: compare the captured digits as ints -- the original
            # compared group(0) (a string) to an int, which in Python 2 is
            # always True, and crashed int() on group(0) in the style branch.
            if decWidth != None and decHeight != None and \
                    int(decWidth.group(1)) > minWidth and int(decHeight.group(1)) > minHeight:
                imgSize[0] = int(decWidth.group(1))
                imgSize[1] = int(decHeight.group(1))
                err = "declared image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
            elif decWidth2 != None and decHeight2 != None and \
                    int(decWidth2.group(1)) > minWidth and int(decHeight2.group(1)) > minHeight:
                imgSize[0] = int(decWidth2.group(1))
                imgSize[1] = int(decHeight2.group(1))
                err = "declared (via style) image size: " + str(imgSize[0]) + " x " + str(imgSize[1])
                log.plog(err)
                imageErrorLog(err + " Image: " + imgUrl)
            else:
                # no usable declared size: fetch the file and measure it
                imgSize = getimagesize(imgUrl)
            # BUGFIX: also skip a zero height, which would divide by zero below
            if imgSize[0] == 0 or imgSize[1] == 0 or (imgSize[0] < minWidth and imgSize[1] < minHeight):
                continue
            err = (
                "fetched image size via getimagesize("
                + imgUrl
                + ") and got: "
                + str(imgSize[0])
                + " x "
                + str(imgSize[1])
            )
            log.plog(err, 1)
            images.append({"url": imgUrl, "width": imgSize[0], "height": imgSize[1]})
            # track the largest non-banner image (aspect ratio under 3.5)
            # BUGFIX: use float division; Python 2 integer division made the
            # ratio test pass for extreme banners (e.g. 700x200 -> 3)
            if imgSize[0] * imgSize[1] > maxSize and \
                    float(imgSize[0]) / imgSize[1] < 3.5 and float(imgSize[1]) / imgSize[0] < 3.5:
                maxSize = imgSize[0] * imgSize[1]
                largestWidth = imgSize[0]

    if largestWidth == 0:
        return None
    else:
        log.plog("findndImages found " + str(len(images)) + " images", 3)
        return images