Example #1
 def crawl(self, only_english=False):
     '''
     Performs the actual crawling. 
     '''
     text_analyser = TextAnalyser(ngram=1, only_english=only_english)
     exception_log = []
     kw = otter.loadrc() # load api key
     count = 0
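     # self.maxtime is expected to move toward self.to_date elsewhere in the
     # class (not shown in this snippet); each pass pages through one crawl window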
     while self.maxtime != self.to_date:
         for page in range(PAGE_SIZE):        
             try:
                 search = otter.Resource('search', **kw)
                 #search(q='#jan25 OR #egypt OR #tahrir', mintime = time.mktime(mintime.timetuple()), maxtime = time.mktime(maxtime.timetuple()), type='tweet', offset=page*10)
                 search(q=self.keywords, mintime = time.mktime(self.from_date.timetuple()), maxtime = time.mktime(self.maxtime.timetuple()), type='tweet', perpage=100, page=page+1)
                 for item in search.response.list:
                     print "--------------------------------------------------------------------------"
                     print "Storing tweet #",count, "for the period",self.from_date,"until",self.maxtime 
                     tt = self.type()
                     tt.url = item.url
                     analysed = text_analyser.add_document(item.content)
                     #if this tweet is really small just ignore it. 
                     if len(analysed['tokens']) <= 3: 
                         print "Ignoring this tweet"
                         continue
                     content = Content()
                     content.raw = analysed['raw']
                     content.tokens = analysed['tokens']
                     content.construct_word_freq_list(analysed['word_frequencies'])
                     content.date = self.from_date
                     tt.content = content
                     tt.date = self.from_date
                     tt.retweet_count = item.trackback_total
                     tt.screen_name = item.trackback_author_nick
                     tt.author_screen_name = item.trackback_author_nick
                     tt.author_name = item.trackback_author_name                        
                     tt.save(safe=True)
                     
                     if len(Author.objects(screen_name=item.trackback_author_nick)) == 0:
                         
                         author = Author()
                         author.screen_name = item.trackback_author_nick
                         author.tweets.append(tt)
                     else:
                         author = Author.objects(screen_name=item.trackback_author_nick).get()
                         author.tweets.append(tt)
                     author.save()
                     
                     count += 1                             
             except Exception, e:
                 print e
                 exception_log.append(e)
             finally:
                 pass          
Example #2
import urllib
import sys
sys.path.append("..")
import keys
import otter # easy_install python-otter

print keys.topsy_apikey

# otter library doesn't seem to need my api key
# but if you do use it, you get more calls (I think)
# put apikey=xxxxxx in ~/.otterrc

kw = otter.loadrc()

#query = "Whitney Houston"

query = raw_input("Enter query: ")


# STEP 1, set the resource
# STEP 2, make the call with the right parameters
# STEP 3, read the results.  r.response.o is the full JSON response
# but r.response.list.o is the list of actual results
# r.next_page() takes you to the next page of results


# find expert contributors to a query
r = otter.Resource('experts', **kw)
r(q = query)

for item in r.response.list.o:
    print item   # dump each expert entry; the exact fields depend on the response
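
# Rough sketch, not something this script does as written: the STEP notes
# above say results can be walked page by page, either with r.next_page()
# or by iterating the Resource (Example #3 below does this for 'search'):
#
#     for page in r:
#         for item in page.response.list:
#             print item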
Example #3
#!/usr/bin/python

import otter
import codecs
import sys

sys.stdout = codecs.getwriter('utf-8')(sys.stdout) 

kw = otter.loadrc() # load beta 

# rc = otter.Resource('searchcount', **kw)
# rc(q='gangnam style site:twitter.com')
# print rc.response

for i in range(0,100):
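    # widen the Topsy time window on each pass; 'd'+str(i) appears to mean
    # roughly the last i days (see the Otter 'window' parameter)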
    rs = otter.Resource('search')
    rs(q='gangnam style', window='d'+str(i), type='tweet')

    for page in rs:
        for item in page.response.list:
            print item.title, item.url

# rt = otter.Resource('trackbacks', **kw)
# rt(url='https://www.youtube.com/watch?v=9bZkp7q19f0', sort_method='date')

# for page in rt:
#     print page.response.total
#     for item in page.response.list:
#         print item.author

    
Example #4
'''
This module utilizes the Otter API bindings for Python to retrieve old tweets.
'''
import otter #!@UnresolvedImport
from model.tweets import PsychTweet
from mongoengine import connect

PAGE_SIZE = 100

connect("pythia_db")

count = 0
exception_log = []

users = ["debatespsych", "psychissues", "chatpsych"]

kw = otter.loadrc() # load api key
for user in users:
    for page in range(PAGE_SIZE):        
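        # fetch up to PAGE_SIZE pages of results (10 tweets per page) for this user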
        try:
            search = otter.Resource('linkposts', **kw)
            search(url="http://twitter.com/"+user, type='tweet', perpage=10, page=page+1)
            for item in search.response.list:
                print "Storing tweet #",count, "for the user",user 
                print item
                count += 1     
        except Exception, e:
            print e
            exception_log.append(e)
        finally:
            pass          
    print "Retrieving tweets for next user"         