Code Example #1
File: scanStreams.py  Project: ctwiz/sourcereader
    def run(self):
        print "storing exploded URLs"
        storiesFound = storeUrlContents.storeUrlContents(json.dumps(self.urlList))
        print "stories read: " + str(storiesFound)
Code Example #2
File: scanStreams.py  Project: dpgailey/sourcereader
def scanStreams(user_id):
    ##scanStreams:
    ##params: user_id
    ##returns: bool
    ##base function for reading all the streams from a user's accounts and importing them
    ## into the db
    global totalStreamsRead
    global streamList
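    # both globals are populated by the twitterThread / facebookThread /
    # homepageThread workers started below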

    streamList = []
    totalStreamsRead = 0
    
    dblink = infoModule.info.site['dblink']
    
    #fetch accounts, and last_read id
    sql = "SELECT account_id, account_user_name, last_read_id, streams.name, public_key, account_user_name, private_key, external_id, user_id from peepbuzz.accounts INNER JOIN peepbuzz.streams ON accounts.stream_id=streams.stream_id WHERE user_id=" + str(user_id)
    accountsQ = mysql_tools.mysqlQuery(sql, dblink)
    try:
        accounts = accountsQ.fetch_row(0,1)
    except AttributeError:
        print "accountsQ error: " + sql
    if accounts == ():
        infoModule.info.errorList.append('no accounts found for user_id ' + str(user_id))
        return False
    
#    pprint.pprint(accounts)
#    sys.exit()

    # for each account type, launch a thread to request the stream
    # also use this loop to create a dict for calculating the most recent update to that account
    account_latest_ids = {}
    account_latest_date = {}

    for account in accounts:
        print account['name']
        account_latest_ids[account['account_id']] = ''
        account_latest_date[account['account_id']] = 0
        #update last time read for this account
        sql = "UPDATE peepbuzz.accounts set last_scanned=now() WHERE account_id=" + account['account_id']
        mysql_tools.mysqlQuery(sql, dblink)
        
        authDict = {'site_key' : account['public_key'], 'account_password' : account['private_key'], 'account_id' : account['account_id'], 'last_read_id' : account['last_read_id'], 'external_id' : account['external_id'], 'user_id' : account['user_id']}
        if account['name'] == 'twitter':
            twitterThread(authDict).start()
        elif account['name'] == 'facebook':
            facebookThread(authDict).start()
        elif account['name'] == 'homepage':
            homepageThread(account['user_id']).start()

    #wait for threads to finish
    #for tr in threading.enumerate():
    #    print pprint.pprint(tr);
    # total timeout after 120 secs
    disasterTimeout = 120.0
    startTime = time.time()
    while True:
        if len(accounts) == totalStreamsRead:
            break
        #disaster timeout and crash
        if time.time() - startTime > disasterTimeout:
            print "scanStreams timeout"
            break  
    
    #urlList is list of all urls found in this pass of reading the stream
    urlList = []
    
    print "streamList"
    pprint.pprint(streamList)
#    sys.exit()

    ## now we have all filaments needed to go through the list and process each one
    for filament in streamList:
        #clear filament_id for safety
        print '-------------------'
        print "FILAMENT"
        print filament['created']
        print filament['stream_name']
#        if filament['external_id'] == '142283309153694_175236912525000':
#            print "pig radio"
        
        ## find promoter ID
        currentStreamID = infoModule.info.site['stream_name_to_id'][filament['stream_name']]
        (filament['promoter_account_id'],filament['promoter_account_table']) = accountFinder.accountFinder(currentStreamID, filament['promoter_id'], filament['promoter'], filament['thumbnail'])
        
        ## check that promoter isn't in blacklist.  Skip this record if they are
        blackListed = checkFilamentBlacklist.checkBlacklist(user_id, filament['promoter_account_id'])
        if blackListed:
            filament['status'] = 'blocked'
        else:
            filament['status'] = 'active'
            
#        pprint.pprint(filament)
#        sys.exit
        
        ## insert filament info into the filament table, checking to make sure it's not a dupe
        filamentJSON = json.dumps(filament)
        print "storing filament"
        filament_id = storeFilament.storeFilament(user_id, filamentJSON)
        if filament_id == False:
            continue
        ## add following
        filament['filament_id'] = filament_id

        # keep track of latest item for each account
        # if stream name isn't homepage, timestamp is likely to be in facebook format
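        # [:-5] below strips the 5-character UTC offset suffix (e.g. '+0000') so
        # strptime can parse the remaining '%Y-%m-%dT%H:%M:%S' portion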
        if filament['stream_name'] != 'homepage':
            time_in_struct = time.strptime(filament['created'][:-5], '%Y-%m-%dT%H:%M:%S')
            time_in_secs = int(time.mktime(time_in_struct))
            if time_in_secs > account_latest_date[filament['account_id']]:
                #this is the latest filament from this account
                account_latest_date[filament['account_id']] = time_in_secs
                account_latest_ids[filament['account_id']] = filament['external_id']
        else:
            # created timestamp is now for homepage
            current_time_in_secs = time.mktime(time.localtime())
            account_latest_date[filament['account_id']] = current_time_in_secs
            account_latest_ids[filament['account_id']] = filament['external_id']
            
        addFollowing.addFollowing(user_id, account_id=filament['promoter_account_id'])
        
        ## update hashtags stats if appropriate.
        if 'hashtags' in filament:
            for hashtag in filament['hashtags']:
                hashtagUpdate.hashtagUpdate(hashtag)

        ## insert discussions
        pprint.pprint(filament['discussion'])
        if len(filament['discussion']) > 0: 
            discussion_json = json.dumps(filament['discussion'])
            store_discussion_element.store_discussion(discussion_json, filament_id)
        
        ## is there a url in the filament?  append it to a url list structure
        ## that contains the filament ID and the url
        if filament['urls'] != None:
            urlsInFilament = filament['urls']
            if len(urlsInFilament) > 0:
                print "urls to read:"
                pprint.pprint(urlsInFilament)
                # only add the uniques
                for URL in urlsInFilament:
                    try: 
                        urlList.index(URL)
                    except ValueError:
                        # check against failed_urls
                        sql = "SELECT original_url FROM peepbuzz.failed_urls WHERE original_url='" + URL + "'"
                        check_failed_query = mysql_tools.mysqlQuery(sql, dblink)
                        if check_failed_query.num_rows() == 0:
                            # and test against db
                            story_id = URLInStories(URL)
                            if story_id > 0: 
                                #story already exists, just link story id to filament
                                linkFilamentToStory(filament['filament_id'], story_id)
                            else:
                                ## add url to original_url in stories, then link filament
                                ## after a few transformations
                                URL = re.sub('\?utm_source.*$', '', URL)
                                
                                urlList.append((URL, filament['account_id']))
                                sql = "insert into peepbuzz.stories set original_url='" + str(URL) + "'"
                                mysql_tools.mysqlQuery(sql, dblink)
                                story_id = dblink.insert_id() 
                                linkFilamentToStory(filament['filament_id'], story_id)
            else:
                pass
                
    #now outside of this loop, fetch all urls in parallel
    print "urls to fetch"
    pprint.pprint(urlList)
    
    if urlList == []:
        print "no urls to read"
    else:
        #urlStoringThread(urlList).start()
        print "storing exploded URLs"
        storiesFound = storeUrlContents.storeUrlContents(json.dumps(urlList))
        print "stories read: " + str(storiesFound)

    for key in account_latest_ids:
        sql = "update peepbuzz.accounts set last_read_id='" + str(account_latest_ids[key]) + "' where account_id=" + str(key)
        mysql_tools.mysqlQuery(sql, dblink)
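The function coordinates its workers through the module-level globals totalStreamsRead and streamList, spinning in a busy-wait loop until every account's thread has reported in or the 120-second disaster timeout expires. A sketch of an alternative wait using Thread.join, not the project's code (the helper name and the local threads list are my own; it assumes each started twitterThread / facebookThread / homepageThread object is collected into that list):

import time


def wait_for_workers(threads, timeout=120.0):
    # join every worker, but never wait longer than `timeout` seconds in total;
    # avoids spinning a CPU core the way the while True loop above does
    deadline = time.time() + timeout
    for t in threads:
        remaining = deadline - time.time()
        if remaining <= 0:
            print "scanStreams timeout"
            break
        t.join(remaining)

scanStreams would append each thread right after .start() and call wait_for_workers(threads) in place of the while True loop; the globals would then only be needed for streamList itself.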
Code Example #3
File: scanStreams.py  Project: ctwiz/sourcereader
def scanStreams(user_id):
    ##scanStreams:
    ##params: user_id
    ##returns: bool
    ##base function for reading all the streams from a user's accounts and importing them
    ## into the db
    global totalStreamsRead
    global streamList

    streamList = []
    totalStreamsRead = 0

    dblink = infoModule.info.site["dblink"]

    # fetch accounts, and last_read id
    sql = (
        "SELECT account_id, account_user_name, last_read_id, streams.name, public_key, account_user_name, private_key, external_id, user_id from peepbuzz.accounts INNER JOIN peepbuzz.streams ON accounts.stream_id=streams.stream_id WHERE user_id="
        + str(user_id)
    )
    accountsQ = mysql_tools.mysqlQuery(sql, dblink)
    try:
        accounts = accountsQ.fetch_row(0, 1)
    except AttributeError:
        print "accountsQ error: " + sql
    if accounts == ():
        infoModule.info.errorList.append("no accounts found for user_id " + str(user_id))
        return False

    #    pprint.pprint(accounts)
    #    sys.exit()

    # for each account type, launch a thread to request the stream
    # also use this loop to create a dict for calculating the most recent update to that account
    account_latest_ids = {}
    account_latest_date = {}

    for account in accounts:
        print account["name"]
        account_latest_ids[account["account_id"]] = ""
        account_latest_date[account["account_id"]] = 0
        # update last time read for this account
        sql = "UPDATE peepbuzz.accounts set last_scanned=now() WHERE account_id=" + account["account_id"]
        mysql_tools.mysqlQuery(sql, dblink)

        authDict = {
            "site_key": account["public_key"],
            "account_password": account["private_key"],
            "account_id": account["account_id"],
            "last_read_id": account["last_read_id"],
            "external_id": account["external_id"],
            "user_id": account["user_id"],
        }
        if account["name"] == "twitter":
            twitterThread(authDict).start()
        elif account["name"] == "facebook":
            facebookThread(authDict).start()
        elif account["name"] == "homepage":
            homepageThread(account["user_id"]).start()

    # wait for threads to finish
    # for tr in threading.enumerate():
    #    print pprint.pprint(tr);
    # total timeout after 120 secs
    disasterTimeout = 120.0
    startTime = time.time()
    while True:
        if len(accounts) == totalStreamsRead:
            break
        # disaster timeout and crash
        if time.time() - startTime > disasterTimeout:
            print "scanStreams timeout"
            break

    # urlList is list of all urls found in this pass of reading the stream
    urlList = []

    print "streamList"
    pprint.pprint(streamList)
    #    sys.exit()

    ## now we have all filaments needed to go through the list and process each one
    for filament in streamList:
        # clear filament_id for safety
        print "-------------------"
        print "FILAMENT"
        print filament["created"]
        print filament["stream_name"]
        #        if filament['external_id'] == '142283309153694_175236912525000':
        #            print "pig radio"

        ## find promoter ID
        currentStreamID = infoModule.info.site["stream_name_to_id"][filament["stream_name"]]
        (filament["promoter_account_id"], filament["promoter_account_table"]) = accountFinder.accountFinder(
            currentStreamID, filament["promoter_id"], filament["promoter"], filament["thumbnail"]
        )

        ## check that promoter isn't in blacklist.  Skip this record if they are
        blackListed = checkFilamentBlacklist.checkBlacklist(user_id, filament["promoter_account_id"])
        if blackListed:
            filament["status"] = "blocked"
        else:
            filament["status"] = "active"

        #        pprint.pprint(filament)
        #        sys.exit

        ## insert filament info into the filament table, checking to make sure it's not a dupe
        filamentJSON = json.dumps(filament)
        print "storing filament"
        filament_id = storeFilament.storeFilament(user_id, filamentJSON)
        if filament_id == False:
            continue
        ## add following
        filament["filament_id"] = filament_id

        # keep track of latest item for each account
        # if stream name isn't homepage, timestamp is likely to be in facebook format
        if filament["stream_name"] != "homepage":
            time_in_struct = time.strptime(filament["created"][:-5], "%Y-%m-%dT%H:%M:%S")
            time_in_secs = int(time.mktime(time_in_struct))
            if time_in_secs > account_latest_date[filament["account_id"]]:
                # this is the latest filament from this account
                account_latest_date[filament["account_id"]] = time_in_secs
                account_latest_ids[filament["account_id"]] = filament["external_id"]
        else:
            # created timestamp is now for homepage
            current_time_in_secs = time.mktime(time.localtime())
            account_latest_date[filament["account_id"]] = current_time_in_secs
            account_latest_ids[filament["account_id"]] = filament["external_id"]

        addFollowing.addFollowing(user_id, account_id=filament["promoter_account_id"])

        ## update hashtags stats if appropriate.
        if "hashtags" in filament:
            for hashtag in filament["hashtags"]:
                hashtagUpdate.hashtagUpdate(hashtag)

        ## insert discussions
        pprint.pprint(filament["discussion"])
        if len(filament["discussion"]) > 0:
            discussion_json = json.dumps(filament["discussion"])
            store_discussion_element.store_discussion(discussion_json, filament_id)

        ## is there a url in the filament?  append it to a url list structure
        ## that contains the filament ID and the url
        if filament["urls"] != None:
            urlsInFilament = filament["urls"]
            if len(urlsInFilament) > 0:
                print "urls to read:"
                pprint.pprint(urlsInFilament)
                # only add the uniques
                for URL in urlsInFilament:
                    try:
                        urlList.index(URL)
                    except ValueError:
                        # check against failed_urls
                        sql = "SELECT original_url FROM peepbuzz.failed_urls WHERE original_url='" + URL + "'"
                        check_failed_query = mysql_tools.mysqlQuery(sql, dblink)
                        if check_failed_query.num_rows() == 0:
                            # and test against db
                            story_id = URLInStories(URL)
                            if story_id > 0:
                                # story already exists, just link story id to filament
                                linkFilamentToStory(filament["filament_id"], story_id)
                            else:
                                ## add url to original_url in stories, then link filament
                                ## after a few transformations
                                URL = re.sub("\?utm_source.*$", "", URL)

                                urlList.append((URL, filament["account_id"]))
                                sql = "insert into peepbuzz.stories set original_url='" + str(URL) + "'"
                                mysql_tools.mysqlQuery(sql, dblink)
                                story_id = dblink.insert_id()
                                linkFilamentToStory(filament["filament_id"], story_id)
            else:
                pass

    # now outside of this loop, fetch all urls in parallel
    print "urls to fetch"
    pprint.pprint(urlList)

    if urlList == []:
        print "no urls to read"
    else:
        # urlStoringThread(urlList).start()
        print "storing exploded URLs"
        storiesFound = storeUrlContents.storeUrlContents(json.dumps(urlList))
        print "stories read: " + str(storiesFound)

    for key in account_latest_ids:
        sql = (
            "update peepbuzz.accounts set last_read_id='"
            + str(account_latest_ids[key])
            + "' where account_id="
            + str(key)
        )
        mysql_tools.mysqlQuery(sql, dblink)
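Both versions build every SQL statement by concatenating values, including URLs taken straight from the stream, into the query string; a URL containing a quote breaks the failed_urls lookup and the stories insert, and the pattern is open to SQL injection. The listing goes through the project's mysql_tools.mysqlQuery helper over what looks like the low-level _mysql API (fetch_row, insert_id), whose internals are not shown, so the following is only a sketch of the parameterized alternative, assuming a standard MySQLdb DB-API connection and a helper name of my own:

import MySQLdb


def store_and_link_story(db, url, filament_id):
    # hypothetical helper; `db` is assumed to be a MySQLdb.connect() connection
    cur = db.cursor()
    # skip URLs that already failed, as in the listing, but parameterized
    cur.execute("SELECT original_url FROM peepbuzz.failed_urls WHERE original_url = %s", (url,))
    if cur.fetchone() is not None:
        return None
    # the driver escapes `url`, so an embedded quote cannot break out of the statement
    cur.execute("INSERT INTO peepbuzz.stories (original_url) VALUES (%s)", (url,))
    story_id = cur.lastrowid
    linkFilamentToStory(filament_id, story_id)    # helper used by the listing
    return story_id

The same %s placeholder form would also cover the last_scanned and last_read_id updates at the top and bottom of scanStreams.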