Esempio n. 1
0
def get_data(topic_tag):
	"""
		Get the training data from MongoDB.

		Builds a binary training set: posts tagged `topic_tag` are
		labelled '1' (positive) and posts tagged 'Irrelevant' are
		labelled '0' (negative). Negatives are capped relative to the
		number of positives.

		Returns (content, labels) as numpy arrays.
	"""
	content, labels = [], []
	# Cleaning steps applied by the post parser.
	params = ['clean', 'links', 'images', 'embed', 'ident', 'punct', 'misc', 'stop', 'encoding']

	mongo_data = training_data.find({'Topic': topic_tag})

	for entry in mongo_data:
		content.append(pc.parse_post(entry['Content'], params))
		labels.append('1')

	num_posts = len(content)
	mongo_data = training_data.find({'Topic': 'Irrelevant'})
	for entry in mongo_data:
		# Stop once the negative cap is reached instead of draining the
		# whole cursor with a no-op body (same posts kept as before).
		# NOTE(review): `<=`/`>` means up to 2*num_posts+1 negatives get
		# added, not an exact 1:2 ratio -- confirm if intentional.
		if len(content) > num_posts * 2:
			break
		content.append(pc.parse_post(entry['Content'], params))
		labels.append('0')

	content = np.asarray(content)
	labels = np.asarray(labels)

	return content, labels
Esempio n. 2
0
def code_posts(es_docs):
	"""
		Score a batch of Elasticsearch posts with every topic model.

		For each hit, attaches an `analysis.relevance` list of
		{'topicid', 'code'} entries (probability of the positive class)
		and bulk-writes the updated documents back to Elasticsearch.
		Bulk failures are logged to 'temp.txt' rather than raised.

		NOTE(review): relies on module-level `params`, `topic_models`,
		`es`, `pc`, `np` and `helpers` -- confirm they are defined at
		import time.
	"""
	es_docs = es_docs['hits']['hits']

	# Parse every post body once; all models score the same parsed text.
	contents = np.asarray([pc.parse_post(doc['_source']['post']['content'], params)
						   for doc in es_docs])

	# Reset the analysis field and drop the query score so the docs can
	# be re-indexed cleanly. Mutates the hit dicts in place; the
	# original also rebuilt the list via temp_data, which was a no-op
	# since the same dict references were re-appended.
	for es_post in es_docs:
		es_post['_source']['analysis'] = {'relevance': []}
		es_post.pop('_score', None)

	#TODO Only update the analysis field
	for topic in topic_models:
		# Column 1 of predict_proba is the positive-class probability.
		predict = topic_models[topic].predict_proba(contents)
		for es_post, code in zip(es_docs, predict):
			es_post['_source']['analysis']['relevance'].append({'topicid': topic, 'code': code[1]})

	try:
		helpers.bulk(es, es_docs)
	except Exception as e:
		# Best-effort logging; a failed bulk index should not kill the
		# scoring run.
		with open('temp.txt', 'a') as outfile:
			print('Exception: logging')
			outfile.write('>Event Start:' + '\n' + str(e) + '\n')
	return
Esempio n. 3
0
def get_data(topic_name):
	"""
		Fetch the labelled test posts for one topic from MongoDB.

		Returns (ids, content, labels): a list of post ids, a numpy
		array of parsed post bodies, and a numpy array holding
		`topic_name` once per post.
	"""
	# Cleaning steps applied by the post parser.
	params = ['clean', 'links', 'images', 'embed', 'ident', 'punct', 'misc', 'stop', 'encoding']

	ids = []
	parsed_posts = []
	for record in test_data.find({'Topic': topic_name}):
		ids.append(record['PostId'])
		parsed_posts.append(pc.parse_post(record['Content'], params))

	content = np.asarray(parsed_posts)
	labels = np.asarray([topic_name] * len(ids))

	return ids, content, labels
Esempio n. 4
0
# Pull every hit for the query via the Elasticsearch scroll API: the
# first call opens the scroll, later calls page through it, and the
# loop ends when a page comes back empty.
while queries_complete is False:
	if first_query is True:
		es_response = es.search(index=index_name, body={'query': es_query}, scroll=ES_SCROLL_TIMEOUT, size=ES_DOC_COUNT)
		es_scroll_id = es_response['_scroll_id']
		total_docs = es_response['hits']['total']
		first_query = False
	else:
		# Continue the existing scroll; ES may rotate the scroll id,
		# so it is refreshed on every page.
		es_response = es.scroll(scroll_id=es_scroll_id, scroll=ES_SCROLL_TIMEOUT)
		es_scroll_id = es_response['_scroll_id']

	if len(es_response['hits']['hits']) == 0:
		queries_complete = True
	else:
		for entry in es_response['hits']['hits']:
			# Parse the tweet text into model-ready form.
			# NOTE(review): assumes data_dict already has an entry for
			# every hit id (KeyError otherwise) -- confirm upstream.
			data_dict[entry['_id']]['content'] = pc.parse_post(entry['_source']['twitter']['text'], params)

# Flatten the per-id dict into parallel lists for the classifier.
test_ids, test_user, test_content = [], [], []
for key in data_dict.keys():
	test_ids.append(key)
	test_user.append(data_dict[key]['user_code'])
	test_content.append(data_dict[key]['content'])


test_content = np.asarray(test_content)
test_ids = np.asarray(test_ids)

# Per-class probability predictions, one row per post.
predicted = model.predict_proba(test_content)

output_content = []
for postid, tweet_text, label, code in zip(test_ids, test_content, predicted, test_user):