def crawl(self, only_english=False):
    '''
    Performs the actual crawling.
    '''
    text_analyser = TextAnalyser(ngram=1, only_english=only_english)
    exception_log = []
    kw = otter.loadrc()  # load api key
    count = 0
    while self.maxtime != self.to_date:
        for page in range(PAGE_SIZE):
            try:
                search = otter.Resource('search', **kw)
                # search(q='#jan25 OR #egypt OR #tahrir',
                #        mintime=time.mktime(mintime.timetuple()),
                #        maxtime=time.mktime(maxtime.timetuple()),
                #        type='tweet', offset=page*10)
                search(q=self.keywords,
                       mintime=time.mktime(self.from_date.timetuple()),
                       maxtime=time.mktime(self.maxtime.timetuple()),
                       type='tweet', perpage=100, page=page + 1)
                for item in search.response.list:
                    print "--------------------------------------------------------------------------"
                    print "Storing tweet #", count, "for the period", self.from_date, "until", self.maxtime
                    tt = self.type()
                    tt.url = item.url
                    analysed = text_analyser.add_document(item.content)
                    # If this tweet is really small, just ignore it.
                    if len(analysed['tokens']) <= 3:
                        print "Ignoring this tweet"
                        continue
                    content = Content()
                    content.raw = analysed['raw']
                    content.tokens = analysed['tokens']
                    content.construct_word_freq_list(analysed['word_frequencies'])
                    content.date = self.from_date
                    tt.content = content
                    tt.date = self.from_date
                    tt.retweet_count = item.trackback_total
                    tt.screen_name = item.trackback_author_nick
                    tt.author_screen_name = item.trackback_author_nick
                    tt.author_name = item.trackback_author_name
                    tt.save(safe=True)
                    # Upsert the author: create on first sight, otherwise
                    # fetch the existing document and append this tweet.
                    if len(Author.objects(screen_name=item.trackback_author_nick)) == 0:
                        author = Author()
                        author.screen_name = item.trackback_author_nick
                        author.tweets.append(tt)
                    else:
                        author = Author.objects(screen_name=item.trackback_author_nick).get()
                        author.tweets.append(tt)
                    author.save()
                    count += 1
            except Exception, e:
                print e
                exception_log.append(e)
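# The search call above passes Unix timestamps for mintime/maxtime,
# converting the crawler's datetime bounds via time.mktime(...timetuple()).
# A minimal sketch of that conversion, with hypothetical window bounds
# standing in for self.from_date and self.maxtime:
import time
from datetime import datetime

from_date = datetime(2011, 1, 25)  # hypothetical window start
maxtime = datetime(2011, 2, 1)     # hypothetical window end
mintime_ts = time.mktime(from_date.timetuple())  # seconds since the epoch
maxtime_ts = time.mktime(maxtime.timetuple())
print mintime_ts, maxtime_ts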
import urllib
import sys
sys.path.append("..")
import keys
import otter  # easy_install python-otter

print keys.topsy_apikey
# The otter library doesn't seem to need my api key,
# but if you do use it, you get more calls (I think).
# Put apikey=xxxxxx in ~/.otterrc
kw = otter.loadrc()

#query = "Whitney Houston"
query = raw_input("Enter query: ")

# STEP 1, set the resource
# STEP 2, make the call with the right parameters
# STEP 3, read the results. r.response.o is the full JSON response,
#         but r.response.list.o is the list of actual results.
#         r.next_page() takes you to the next page of results.

# Find expert contributors to a query.
r = otter.Resource('experts', **kw)
r(q=query)
for item in r.response.list.o:
    print item  # minimal loop body (assumed): dump each expert record
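# The STEP comments above describe the general calling pattern for any
# Otter resource. A minimal paging sketch, assuming (as the comments say)
# that r.next_page() advances the resource to the next page of results;
# the three-page cap is an arbitrary choice for this sketch.
r = otter.Resource('experts', **kw)   # STEP 1: set the resource
r(q=query)                            # STEP 2: call with parameters
for _ in range(3):
    for item in r.response.list.o:    # STEP 3: read the results
        print item
    r.next_page()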
#!/usr/bin/python
import otter
import codecs
import sys

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

kw = otter.loadrc()  # load beta

# rc = otter.Resource('searchcount', **kw)
# rc(q='gangnam style site:twitter.com')
# print rc.response

for i in range(0, 100):
    rs = otter.Resource('search')
    rs(q='gangnam style', window='d' + str(i), type='tweet')
    for page in rs:
        for item in page.response.list:
            print item.title, item.url

# rt = otter.Resource('trackbacks', **kw)
# rt(url='https://www.youtube.com/watch?v=9bZkp7q19f0', sort_method='date')
# for page in rt:
#     print page.response.total
#     for item in page.response.list:
#         print item.author
'''
This module utilizes the Otter API bindings for Python to retrieve old tweets.
'''
import otter  #!@UnresolvedImport
from model.tweets import PsychTweet
from mongoengine import connect

PAGE_SIZE = 100

connect("pythia_db")
count = 0
exception_log = []
users = ["debatespsych", "psychissues", "chatpsych"]
kw = otter.loadrc()  # load api key

for user in users:
    for page in range(PAGE_SIZE):
        try:
            # Query the 'linkposts' resource with the user's profile URL
            # to pull back that user's old tweets, ten per page.
            search = otter.Resource('linkposts', **kw)
            search(url="http://twitter.com/" + user, type='tweet',
                   perpage=10, page=page + 1)
            for item in search.response.list:
                print "Storing tweet #", count, "for the user", user
                print item
                count += 1
        except Exception, e:
            print e
            exception_log.append(e)
    print "Retrieving tweets for next user"
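# PsychTweet is defined in model.tweets; below is a hypothetical sketch of
# what such a mongoengine document might look like, with field names
# inferred from the attributes crawl() assigns to its tweet objects. This
# is an assumption for illustration, not the project's actual schema.
from mongoengine import Document, DateTimeField, IntField, StringField

class PsychTweetSketch(Document):
    url = StringField()
    date = DateTimeField()
    retweet_count = IntField()
    screen_name = StringField()
    author_screen_name = StringField()
    author_name = StringField()
    # The real model also holds a Content object (embedded or referenced).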