Example #1
0
def cleanMessage(message):
    """Return *message* with @mentions and embedded links removed.

    Leading/trailing whitespace is stripped from the result.
    """
    # Remove @mentions. Underscores are legal in Twitter handles, so the
    # character class must include '_' (the old pattern "@[a-zA-Z0-9]+"
    # stripped only "@foo" out of "@foo_bar", leaving "_bar" behind).
    message = re.sub(r"@[A-Za-z0-9_]+", "", message)
    # Remove any links detected in the remaining text.
    for link in socialUrlUtils.urlsInText(message):
        message = message.replace(link, "")
    # Return the trimmed result.
    return message.strip()
def cleanMessage(message):
	"""Strip Twitter syntax markers and embedded links from *message*.

	Returns the cleaned text with surrounding whitespace trimmed.
	"""
	# Drop the mention/hashtag marker characters ('@' first, then '#').
	for marker in ('@', '#'):
		message = message.replace(marker, '')
	# Excise every URL found in the remaining text.
	for found in socialUrlUtils.urlsInText(message):
		message = message.replace(found, '')
	# Hand back the trimmed result.
	return message.strip()
Example #3
0
def associatedUrls(network,post):
	"""Return every URL attached to *post* for the given social network.

	Unknown networks yield an empty list.
	"""
	if network == 'twitter':
		# Twitter surfaces its URLs pre-extracted in the entities payload.
		return [entry['expanded_url'] for entry in post['entities']['urls']]
	if network == 'facebook':
		# Facebook: the explicit link field plus anything found in the message body.
		found = []
		if 'link' in post:
			found.append(post['link'])
		if 'message' in post:
			found.extend(socialUrlUtils.urlsInText(post['message']))
		return found
	return []
	# NOTE(review): this entire span is unreachable -- it sits after the
	# unconditional `return []` above and references names this function never
	# defines (`dirPath_consolidatedFb`, `org`, `dp`, `fbStartDate`,
	# `fbEndDate`, `fbStatusTypeToMedium`). It appears to be a fragment of a
	# separate Facebook-consolidation routine pasted here; code left untouched.
	# Get all relevant Facebook information.
	service   = 'facebook'
	# NOTE(review): json.load(open(...)) never closes the file handles; these
	# would be `with open(...)` blocks if this code were revived.
	posts     = json.load(open(dirPath_consolidatedFb + [f for f in os.listdir(dirPath_consolidatedFb) if f.startswith(org)][-1]))
	bitly     = json.load(open((dirPath_bitly % service) + org + '.json'))
	sentiment = json.load(open((dirPath_sentiment % service) + org + '.json'))
	urls      = json.load(open((dirPath_urls % service) + org + '.json'))

	# Traverse posts.
	for post in posts:
		# Get some initial information.
		# Collect the post's explicit link plus any URLs embedded in its text.
		expandedUrls = []
		if 'link' in post:
			expandedUrls.append(post['link'])
		if 'message' in post:
			expandedUrls += socialUrlUtils.urlsInText(post['message'])
		# `dp` is presumably dateutil.parser.parse -- TODO confirm.
		created_utc = dp(post['created_time'])
		# Ensure date in proper range. We want to exclude everything outside of it.
		if created_utc < fbStartDate or created_utc >= fbEndDate:
			continue

		# Container to store stats that will eventually become a data frame.
		thisPost = { 'service': service, 'id': post['id']}

		# Post features.
		thisPost['text'] = post.get('message')
		# Fall back to 'other' when the status type has no mapped medium
		# (covers both a missing status_type and an unmapped one).
		thisPost['medium'] = fbStatusTypeToMedium.get(post.get('status_type')) or 'other'
		# Post datetime features.
		thisPost['date_utc'] = str(created_utc.date())
		thisPost['day_of_week_utc'] = created_utc.weekday()
		# weekday() is 0=Monday, so >= 5 means Saturday or Sunday.
		thisPost['weekend_utc'] = thisPost['day_of_week_utc'] >= 5
# Get posts (for URLs).
# Pick the consolidated dump for this org; [-1] presumably selects the most
# recent filename in listdir order -- TODO confirm ordering is guaranteed.
postFilename = [f for f in os.listdir(dirPath_consolidated % network) if f.startswith(org)][-1]
# Use a context manager so the handle is closed promptly (the original
# json.load(open(...)) leaked the file handle).
with open((dirPath_consolidated % network) + postFilename) as postsFile:
	posts = json.load(postsFile)

# Traverse posts and find URLs (deduplicated via the set).
urls = set()
if network == 'facebook':
	for p in posts:
		# Verify date: only keep posts strictly older than lastDate.
		if dateutil.parser.parse(p['created_time']) < lastDate:
			# Find URLs to add: the explicit link plus any embedded in the message.
			if 'link' in p:
				urls.add(p['link'])
			if 'message' in p:
				urls.update(socialUrlUtils.urlsInText(p['message']))
		else:
			recentPostsSkipped += 1
elif network == 'twitter':
	for p in posts:
		# Verify date (Twitter uses 'created_at' rather than 'created_time').
		if dateutil.parser.parse(p['created_at']) < lastDate:
			# Find URLs to add: Twitter pre-extracts them into entities.
			urls.update(u['expanded_url'] for u in p['entities']['urls'])
		else:
			recentPostsSkipped += 1

# Traverse all URLs.
# enumerate() pattern is for debugging. It allows us to break after a certain number of URLs if necessary.