Code example #1
File: API_scraper.py Project: goodspeedj/csci-e64
def getTweets(n):
	for tweet in engine.search("visualization", count=n, start=1, cached=False):

		# Parse the date time field	
		dateTime = time.strptime(tweet.date, "%a, %d %b %Y %H:%M:%S +0000")
	
		#=======================================================================
		# In the sample output it looks like there are fields for three hashtags
		# and that if there are no hashtags the fields are populated with 'None'
		#=======================================================================
		tags = hashtags(tweet.text)
	
		# Trim the list to three entries to match sample.csv
		#if (len(tags) > 3):
		#	del tags[3:]
	
	
		# Pad the hashtag list with 'None' so there are always three entries - match sample.csv
		while len(tags) < 3:
			tags.append('None')
	
		# Setup variables for the fields
		author = tweet.author.encode('ascii', 'ignore')
	
		# Assigning these to a local variable named 'time' would shadow the time module,
		# making time.strptime() above raise UnboundLocalError (see the sketch after this example).
		#date = str(time.strftime('%m/%d/%y', dateTime).encode('ascii', 'ignore'))
		#time = str(time.strftime('%H:%M:%S', dateTime).encode('ascii','ignore'))
		text = re.sub('[\r\n]+','',tweet.text.encode('ascii', 'ignore'))
		tag1 = re.sub('#','',tags[0].encode('ascii', 'ignore'))
		tag2 = re.sub('#','',tags[1].encode('ascii', 'ignore'))
		tag3 = re.sub('#','',tags[2].encode('ascii', 'ignore'))

		"""
		print author + "," + \
		  time.strftime('%m/%d/%y', dateTime).encode('ascii', 'ignore') + "," + \
		  time.strftime('%H:%M:%S', dateTime).encode('ascii','ignore') + "," + \
		  text + "," + tag1 + "," + tag2 + "," + tag3
		"""

		# Create a unique ID based on the tweet content and author.
		id = str(hash(tweet.author + tweet.text))
	
		# Only add the tweet to the table if it doesn't already contain this ID.
		# This can give fewer than 100 results.
		if len(table) == 0 or id not in index:
			table.append([author, time.strftime('%m/%d/%y', dateTime).encode('ascii', 'ignore'), \
					  time.strftime('%H:%M:%S', dateTime).encode('ascii','ignore'), \
					  text, tag1, tag2, tag3])
			index[id] = True

		table.save("twitter_output.csv")
Code example #2
def dumpTweets(stream, nIgnored):
    nTweets = 0

    hashList = []
    keywordsList = []

    curNIgnored = 0

    # The stream is a list of buffered tweets so far,
    # with the latest tweet at the end of the list.
    for tweet in reversed(stream):
        if not isKosher(tweet):
            curNIgnored += 1
            continue

        curKeywords = sorted(getKeywords(tweet))
        keywordsList += curKeywords
        nTweets += 1

        hashes = hashtags(tweet.text.encode("utf-8"))
        hashList += [hashes]

        if filePath:  # filePath is presumably a module-level output path, not shown in this excerpt
            tweetToFile(tweet, filePath)

    print("\nPolled {0} tweets, #ignored (current) = {1}".format(
        nTweets, curNIgnored))
    print("Hash list = {0}, keywords (appeared) = {1}".format(
        hashList, sorted(keywordsList)))
    return nTweets
Code example #3
def main(search_query):
    """ Returns Twitter Search Results
    :param search_query: (str)
    """
    final = "Twitter Search Results:"
    for i in range(2):
        print(i)
        for tweet in ENGINE.search(search_query,
                                   start=None,
                                   count=25,
                                   cached=False):

            final = final + "\n\n" + tweet.text + "\n" + \
                    tweet.author + "\n" + tweet.date + "\n" + \
                    str(hashtags(tweet.text))  # Keywords in tweets start with a "#".
    return final
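Note that the range(2) loop above sends the same query twice with start=None, so both passes return the same 25 results. To page into older tweets, start should advance between requests, as the later examples do with prev = tweet.id. A minimal sketch of that adjustment (an assumption about the intent, reusing the ENGINE and hashtags names from the snippet):

def main(search_query):
    """Return Twitter search results, paging into older tweets on each pass."""
    final = "Twitter Search Results:"
    prev = None
    for _ in range(2):
        for tweet in ENGINE.search(search_query, start=prev, count=25, cached=False):
            final += "\n\n{0}\n{1}\n{2}\n{3}".format(
                tweet.text, tweet.author, tweet.date, hashtags(tweet.text))
            prev = tweet.id  # continue from the oldest tweet seen so far
    return final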
Code example #4
File: kernel.py Project: willyg302/Parrot
	def search(self, args):
		"""
		Usage:
		  search [-fty] <keyword>
		  search -h | --help

		Options:
		  -h --help      Show this help message.
		  -f --facebook  Search for keyword on Facebook.
		  -t --twitter   Search for keyword on Twitter.
		  -y --youtube   Search for keyword on YouTube.
		"""

		# Example args information:
		# {'--facebook': False,
		# '--help': False,
		# '--twitter': True,
		# '--youtube': False,
		# '<keyword>': 'f'}

		engine = Twitter(language='en')
		ret = []

		'''
		generator = ({
			'text': tweet.text,
			'author': tweet.author,
			'date': tweet.date,
			'hashtags': hashtags(tweet.text)
		} for tweet in engine.search('is cooler than', count=25, cached=False))

		self.db.bulk_insert('test', generator)
		'''
		
		for tweet in engine.search('is cooler than', count=25, cached=False):
			ret.append({
				'text': tweet.text,
				'author': tweet.author,
				'date': tweet.date,
				'hashtags': hashtags(tweet.text)
			})
		

		return str(ret)
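The docopt-style usage string documents a <keyword> argument, but the body above hard-codes the query 'is cooler than'. A minimal sketch of wiring the parsed argument into the search (the '<keyword>' key is inferred from the usage string and is an assumption, not part of the original):

# Hypothetical wiring of the parsed <keyword> into the query:
keyword = args.get('<keyword>') or 'is cooler than'
ret = [{'text': tweet.text,
        'author': tweet.author,
        'date': tweet.date,
        'hashtags': hashtags(tweet.text)}
       for tweet in engine.search(keyword, count=25, cached=False)]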
Code example #5
def getKeywords(tweet):
    res = []

    if onlyHashTags:
        hashes = hashtags(tweet.text)

        for h in hashes:
            for k in keywords:
                if strEqual(h.strip("#"), k):
                    res += [h]
                    break
    elif onlyWords:
        words = getWordsStripped(tweet)

        for w in words:
            for k in keywords:
                if strEqual(w, k):
                    res += [w]
                    break
    else:
        res = getKeywordsSubstr(tweet.text)

    return list(set(res))  # We remove the duplicates.
Code example #6
engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
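The snippet uses table and index without showing how they are initialised. A minimal setup sketch in the load-or-create style of the other examples, assuming Datasheet and the pd() path helper come from pattern.db as in pattern's example scripts:

from pattern.db import Datasheet, pd

try:
    # Reload results from a previous run so the CSV can keep growing.
    table = Datasheet.load(pd("eulogy_july_21.csv"))
    index = set(table.columns[0])   # tweet IDs already stored
except IOError:
    table = Datasheet()
    index = set()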
Code example #7
File: 03-twitter.py Project: sp00/pattern
    # The index becomes important once more and more rows are added to the table (speed).
    table = Datasheet.load("cool.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
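The excerpt starts inside a try/except, so its opening lines are missing. A plausible reconstruction of the full load-or-create block, assuming Datasheet is imported from pattern.db:

from pattern.db import Datasheet

try:
    # Re-open the results of a previous run; the column-0 index keeps
    # duplicate checks fast as the table grows.
    table = Datasheet.load("cool.txt")
    index = dict.fromkeys(table.columns[0], True)
except IOError:
    table = Datasheet()
    index = {}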
Code example #8
File: test_web.py Project: daeon/pattern
 def test_twitter_hashtags(self):
     self.assertEqual(web.hashtags("#cat #dog"), ["#cat", "#dog"])
     print "pattern.web.hashtags()"
Code example #9
File: twitter.py Project: gabrielhase/Etna
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}


engine = Twitter(language="en")
comparray = [" "]  # spam filter: remembers the first 15 characters of each tweet to skip back-to-back repeats
# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in Twitter().search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text) # Keywords in tweets start with a #.
        print
        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet to the table if it doesn't already contain this ID.     
        if len(table) == 0 or id not in index and sentiment(tweet.text)[0]!= 0 and comparray[-1]!=comparray[-2]:
            table.append([id,tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

table.save("tweets.csv")

print "Total results:", len(table)
print
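In the duplicate check above, or binds more loosely than and, so the sentiment and spam filters are bypassed whenever the table is empty. If the intent is to apply all three conditions to every tweet, intermediate booleans (or explicit parentheses) make that unambiguous; a sketch of that reading (an assumption about the intent):

is_new      = len(table) == 0 or id not in index
has_opinion = sentiment(tweet.text)[0] != 0     # drop neutral tweets
not_repeat  = comparray[-1] != comparray[-2]    # crude spam filter: skip back-to-back repeats
if is_new and has_opinion and not_repeat:
    table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
    index[id] = True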
Code example #11
    # The index becomes important once more and more rows are added to the table (speed).
    table = Table.load("cool.txt")
    index = table.index(table.columns[0])
except:
    table = Table()
    index = {}

engine = Twitter()

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
Code example #12
File: 04-twitter.py Project: DataBranner/pattern
engine = Twitter(language="en")

# With Twitter.search(cached=False), a "live" request is sent to Twitter:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()

# Print all the rows in the table.
Code example #13
	def execute(self, **commandArgs):
		searchString = commandArgs.get("search", "is cooler than") #from:decebel (from:username is also supported)
		print("searching for {0}: ").format(searchString)
		count = commandArgs.get("count", 25)
		cached = commandArgs.get("cached", False)
		return "skip skip "

		# TODO: arg to decide if we want to do blocking or non-blocking 
		# also, we should allow user to cancel pending requests
		# also, we should auto-alert if the request is taking a long time
		# TODO: we should have test cases for verifying these
		engine = self.args["engine"]
		result = {}
		for tweet in engine.search(searchString, count=count, cached=cached):
			tid = str(hash(tweet.author + tweet.date))  # TODO: hash() can be negative; see the digest sketch after this example
			print "# {2} Date={0}. Author= {1}".format(tweet.date, tweet.author, tid)			
			rec = {"author" : tweet.author, "text" : tweet.text, "date" : tweet.date, "hashtags" : hashtags(tweet.text)}
			#print "record = {0}".format(rec)
			result[tweet.author] = rec
			#print "I AM HERE"
			#print "\nRESULT={0}".format(result)
			#print "RECORD = {0}".format(rec)
			#print "HOW MANY TIMES"
			#pp.pprint(result)

			#print tweet.text
		pp.pprint(result)	
		print "FOR LOOP COMPLETE"

		return result
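The TODO about hash() sometimes being negative (and, on Python 3, varying between runs) can be side-stepped with a digest-based ID; a minimal sketch using hashlib (not part of the original snippet):

import hashlib

def tweet_id(author, date):
    # A hex digest is non-negative, fixed-length and stable across runs,
    # unlike the built-in hash().
    return hashlib.md5((author + date).encode("utf-8")).hexdigest()

Inside the loop, tid = tweet_id(tweet.author, tweet.date) would then replace the hash-based ID.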
Code example #14
    # The index becomes important once more and more rows are added to the table (speed).
    table = Datasheet.load("cool.csv")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.text
    print tweet.author
    print tweet.date
    print hashtags(tweet.text) # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.text))
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.text])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
Code example #15

'''
Cleaning the tweets extracted from a particular user timeline,
code is available in another file
'''


with open('extracted_tweets_translated.txt') as f:
    for tweet in f.readlines():
        clean_text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
        print

        print clean_text
        print author(tweet)
        print hashtags(tweet)

        print

        table.append([clean_text])



'''
Keyword based crawling
'''


twitter = Twitter(language='en')