def getTweets(n):
    for tweet in engine.search("visualization", count=n, start=1, cached=False):
        # Parse the date/time field.
        dateTime = time.strptime(tweet.date, "%a, %d %b %Y %H:%M:%S +0000")
        # The sample output has fields for exactly three hashtags,
        # with missing hashtags populated with the string 'None'.
        tags = hashtags(tweet.text)
        # Trim to three entries and pad with 'None' to match sample.csv.
        tags = (tags + ['None'] * 3)[:3]
        # Set up variables for the fields. Note: don't name locals "date" or
        # "time" here - "time" would shadow the time module used below, which
        # is why assigning the formatted date/time to variables caused errors.
        author = tweet.author.encode('ascii', 'ignore')
        text = re.sub('[\r\n]+', '', tweet.text.encode('ascii', 'ignore'))
        tag1 = tags[0].encode('ascii', 'ignore').replace('#', '')
        tag2 = tags[1].encode('ascii', 'ignore').replace('#', '')
        tag3 = tags[2].encode('ascii', 'ignore').replace('#', '')
        # Create a unique ID based on the tweet author and content.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet to the table if it doesn't already contain this
        # ID, so fewer than n rows may be added.
        if len(table) == 0 or id not in index:
            table.append([author,
                          time.strftime('%m/%d/%y', dateTime).encode('ascii', 'ignore'),
                          time.strftime('%H:%M:%S', dateTime).encode('ascii', 'ignore'),
                          text, tag1, tag2, tag3])
            index[id] = True
    table.save("twitter_output.csv")
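getTweets() relies on module-level engine, table and index objects defined elsewhere. A minimal setup sketch, assuming the Pattern library used throughout these snippets (the call at the end simply exercises the function above):

import re
import time

from pattern.db import Datasheet
from pattern.web import Twitter, hashtags

engine = Twitter(language="en")
table = Datasheet()
index = {}

getTweets(100)  # Fetches up to 100 tweets and writes twitter_output.csv.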
def dumpTweets(stream, nIgnored):
    nTweets = 0
    hashList = []
    keywordsList = []
    curNIgnored = 0
    # The stream is a list of buffered tweets so far,
    # with the latest tweet at the end of the list.
    for tweet in reversed(stream):
        if not isKosher(tweet):
            curNIgnored += 1
            continue
        curKeywords = sorted(getKeywords(tweet))
        keywordsList += curKeywords
        nTweets += 1
        hashes = hashtags(tweet.text.encode("utf-8"))
        hashList += [hashes]
        if filePath:
            tweetToFile(tweet, filePath)
    print("\nPolled {0} tweets, #ignored (current) = {1}".format(
        nTweets, curNIgnored))
    print("Hash list = {0}, keywords (appeared) = {1}".format(
        hashList, sorted(keywordsList)))
    return nTweets
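dumpTweets() assumes helpers such as isKosher() and tweetToFile() plus a module-level filePath, all defined elsewhere. A hypothetical tweetToFile() consistent with how it is called above:

import io

filePath = "tweets_dump.txt"  # Hypothetical module-level output path.

def tweetToFile(tweet, path):
    # Append one tweet per line: author, date, then the text with internal
    # whitespace collapsed so each record stays on a single line.
    with io.open(path, "a", encoding="utf-8") as f:
        f.write(u"{0}\t{1}\t{2}\n".format(
            tweet.author, tweet.date, u" ".join(tweet.text.split())))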
def main(search_query):
    """
    Returns Twitter search results.

    :param search_query: (str)
    """
    final = "Twitter Search Results:"
    # With start=None every request returns the same (most recent) page,
    # so a single pass over the results is enough.
    for tweet in ENGINE.search(search_query, start=None, count=25, cached=False):
        # Keywords in tweets start with a "#".
        final = final + "\n\n" + tweet.text + "\n" + \
                tweet.author + "\n" + tweet.date + "\n" + \
                str(hashtags(tweet.text))
    return final
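main() expects a module-level ENGINE. A minimal sketch, assuming pattern.web and an illustrative query:

from pattern.web import Twitter, hashtags

ENGINE = Twitter(language="en")

print(main("data visualization"))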
def search(self, args):
    """
    Usage:
        search [-fty] <keyword>
        search -h | --help

    Options:
        -h --help      Show this help message.
        -f --facebook  Search for keyword on Facebook.
        -t --twitter   Search for keyword on Twitter.
        -y --youtube   Search for keyword on YouTube.
    """
    # Example args:
    # {'--facebook': False,
    #  '--help': False,
    #  '--twitter': True,
    #  '--youtube': False,
    #  '<keyword>': 'f'}
    engine = Twitter(language='en')
    ret = []
    # Alternative: stream the rows into the database in one call.
    # generator = ({'text': tweet.text,
    #               'author': tweet.author,
    #               'date': tweet.date,
    #               'hashtags': hashtags(tweet.text)}
    #              for tweet in engine.search('is cooler than', count=25, cached=False))
    # self.db.bulk_insert('test', generator)
    for tweet in engine.search('is cooler than', count=25, cached=False):
        ret.append({
            'text': tweet.text,
            'author': tweet.author,
            'date': tweet.date,
            'hashtags': hashtags(tweet.text)
        })
    return str(ret)
def getKeywords(tweet):
    res = []
    if onlyHashTags:
        hashes = hashtags(tweet.text)
        for h in hashes:
            for k in keywords:
                if strEqual(h.strip("#"), k):
                    res += [h]
                    break
    elif onlyWords:
        words = getWordsStripped(tweet)
        for w in words:
            for k in keywords:
                if strEqual(w, k):
                    res += [w]
                    break
    else:
        res = getKeywordsSubstr(tweet.text)
    return list(set(res))  # Remove the duplicates.
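getKeywords() depends on strEqual() and getKeywordsSubstr(), defined elsewhere. Hypothetical implementations consistent with how they are used above (keywords is the module-level list of tracked terms):

def strEqual(a, b):
    # Case-insensitive comparison, ignoring surrounding whitespace.
    return a.strip().lower() == b.strip().lower()

def getKeywordsSubstr(text):
    # Fallback mode: any tracked keyword occurring as a substring counts.
    t = text.lower()
    return [k for k in keywords if k.lower() in t]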
engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing),
# because a query is instant the second time it is executed.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in the next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/.
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")
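This snippet assumes a table and a set-based index (note the index.add() call) plus pattern.db's pd() helper for the save path. A minimal setup sketch (a real script would reload the saved .csv first, as the snippets below do):

from pattern.db import Datasheet, pd
from pattern.web import Twitter, hashtags

table = Datasheet()
index = set()  # Tweet ids seen so far.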
# The index becomes important once more and more rows are added to the table (speed).
try:
    table = Datasheet.load("cool.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file, it can grow comfortably each time the script runs.
def test_twitter_hashtags(self):
    self.assertEqual(web.hashtags("#cat #dog"), ["#cat", "#dog"])
    print "pattern.web.hashtags()"
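The test pins down the behavior of pattern.web.hashtags(). A rough stand-in (not the library's actual implementation) that satisfies the same assertion:

import re

def hashtags_approx(text):
    # Find "#"-prefixed words, keeping the "#" like pattern.web.hashtags() does.
    return re.findall(r"#\w+", text)

hashtags_approx("#cat #dog")  # => ['#cat', '#dog']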
# The index becomes important once more and more rows are added to the table (speed).
try:
    table = Datasheet.load("tweets.csv")
    index = dict.fromkeys(table.columns[0], True)
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}

engine = Twitter(language="en")
comparray = [" "]  # Spam filter: remembers the prefix of each tweet seen so far.

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in engine.search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text)  # Keywords in tweets start with a #.
        print
        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet if the table doesn't already contain this ID,
        # it carries a non-neutral sentiment, and its first 15 characters
        # differ from the previous tweet's (the spam filter).
        if (len(table) == 0 or id not in index) and sentiment(tweet.text)[0] != 0 \
                and comparray[-1] != comparray[-2]:
            table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

table.save("tweets.csv")

print "Total results:", len(table)
print
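The filter above keeps only tweets with a non-zero polarity. For reference, pattern.en's sentiment() returns a (polarity, subjectivity) pair, with polarity in [-1.0, +1.0]:

from pattern.en import sentiment

polarity, subjectivity = sentiment("The eruption was terrifying but beautiful.")
# polarity == 0 means the tweet is scored as neutral and is skipped above.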
# The index becomes important once more and more rows are added to the table (speed).
try:
    table = Table.load("cool.txt")
    index = table.index(table.columns[0])
except:
    table = Table()
    index = {}

engine = Twitter()

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file, it can grow comfortably each time the script runs.
engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing),
# because a query is instant the second time it is executed.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in the next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/.
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()
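The start=prev idiom above pages through progressively older tweets. The same idiom factored into a generator (paged_search is a hypothetical helper, not part of pattern.web):

def paged_search(engine, query, pages=2, count=25):
    # Pass the last seen tweet id as the next start value, so each
    # request continues where the previous page left off.
    prev = None
    for _ in range(pages):
        for tweet in engine.search(query, start=prev, count=count, cached=False):
            yield tweet
            prev = tweet.id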
def execute(self, **commandArgs):
    # from:username queries are also supported, e.g. "from:decebel".
    searchString = commandArgs.get("search", "is cooler than")
    print "searching for {0}:".format(searchString)
    count = commandArgs.get("count", 25)
    cached = commandArgs.get("cached", False)
    # TODO: arg to decide if we want to do blocking or non-blocking.
    # Also, we should allow the user to cancel pending requests,
    # auto-alert if the request is taking a long time,
    # and add test cases for verifying all of this.
    engine = self.args["engine"]
    result = {}
    for tweet in engine.search(searchString, count=count, cached=cached):
        # str() since the hash is sometimes negative. TODO
        tid = str(hash(tweet.author + tweet.date))
        print "# {2} Date={0}. Author={1}".format(tweet.date, tweet.author, tid)
        rec = {"author": tweet.author,
               "text": tweet.text,
               "date": tweet.date,
               "hashtags": hashtags(tweet.text)}
        result[tweet.author] = rec
    pp.pprint(result)
    print "FOR LOOP COMPLETE"
    return result
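execute() pretty-prints through a module-level pp object. A minimal sketch of the assumed setup:

import pprint

pp = pprint.PrettyPrinter(indent=2)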
# The index becomes important once more and more rows are added to the table (speed).
try:
    table = Datasheet.load("cool.csv")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.text
    print tweet.author
    print tweet.date
    print hashtags(tweet.text)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.text))
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.text])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file, it can grow comfortably each time the script runs.
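One caveat with id = str(hash(...)): in Python 3, hash() is randomized per process, so ids saved to cool.csv will not match the ids computed on the next run and deduplication silently breaks. A stable digest avoids this (a sketch, not what the snippet above uses):

import hashlib

def tweet_id(author, text):
    # An md5 digest is stable across processes and Python versions.
    return hashlib.md5((author + text).encode("utf-8")).hexdigest()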
# Cleaning the tweets extracted from a particular user timeline;
# the extraction code is available in another file.
with open('extracted_tweets_translated.txt') as f:
    for tweet in f.readlines():
        # Strip @mentions, URLs and all other non-alphanumeric characters,
        # then collapse the remaining whitespace.
        clean_text = ' '.join(re.sub(
            r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet).split())
        print
        print clean_text
        print author(tweet)
        print hashtags(tweet)
        print
        table.append([clean_text])

# Keyword-based crawling.
twitter = Twitter(language='en')
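For reference, the substitution removes @mentions, URLs and every other non-alphanumeric character (including the "#" of hashtags, which is why hashtags() is called on the raw line above). A quick worked example with an illustrative string:

import re

RE_NOISE = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"

raw = "RT @user: Check http://t.co/abc #cool!"
print(' '.join(re.sub(RE_NOISE, " ", raw).split()))  # -> RT Check cool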