Example #1
	def __init__(self, goal_path=None):
		self.plan = RawPlan()
		self.goal_list = []
		self.goal_vector = []
		self.stemmer = nltk.PorterStemmer()
		if goal_path is None:
			# Build the goal -> actions map from the raw plan corpus.
			self.plan.populate_goal_actions_map()
			self.goal_actions_map = self.plan.goal_actions_map
		else:
			# Load a previously pickled goal -> actions map.
			self.goal_actions_map = load_pickle(goal_path)
		# Materialize the keys so they can be indexed during clustering.
		self.goal_list = list(self.goal_actions_map.keys())
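A minimal sketch of producing the pickled goal -> actions map that the goal_path branch expects, assuming load_pickle is a thin wrapper around pickle.load (the file name and map contents here are hypothetical):

import pickle

# Hypothetical goal -> actions map in the shape the constructor expects.
goal_actions_map = {
	"make coffee": [["boil", "water"], ["grind", "beans"]],
	"make tea": [["boil", "water"], ["steep", "leaves"]],
}

with open("goal_actions.pkl", "wb") as f:
	pickle.dump(goal_actions_map, f)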
Example #2
from collections import defaultdict

import nltk
from nltk.corpus import wordnet as wn
from nltk.metrics import edit_distance
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# RawPlan, load_pickle, and flatten are project-local helpers assumed to be
# importable from the surrounding package.


class GoalCluster:
	def __init__(self, goal_path=None):
		self.plan = RawPlan()
		self.goal_list = []
		self.goal_vector = []
		self.stemmer = nltk.PorterStemmer()
		if goal_path is None:
			# Build the goal -> actions map from the raw plan corpus.
			self.plan.populate_goal_actions_map()
			self.goal_actions_map = self.plan.goal_actions_map
		else:
			# Load a previously pickled goal -> actions map.
			self.goal_actions_map = load_pickle(goal_path)
		# Materialize the keys so they can be indexed during clustering.
		self.goal_list = list(self.goal_actions_map.keys())

	def create_tfidf_vector(self):
		# One "document" per goal: the flattened (goal, actions) pair joined
		# into a single string, with the goal text appended once more.
		count_vect = CountVectorizer()
		docs = [" ".join(flatten(item)) + " " + item[0]
				for item in self.goal_actions_map.items()]
		X_train_counts = count_vect.fit_transform(docs)
		tfidf_transformer = TfidfTransformer()
		return tfidf_transformer.fit_transform(X_train_counts)

	def clustering(self):
		# Calculate the pairwise similarity matrix over PCA-reduced TF-IDF
		# vectors (assumes at least 300 goals and vocabulary terms).
		X = self.create_tfidf_vector().toarray()
		pca = PCA(n_components=300, copy=False)
		X = pca.fit_transform(X)
		S = cosine_similarity(X, X)
		# Run affinity propagation on the precomputed similarities
		af = AffinityPropagation(affinity='precomputed')
		af.fit(S)
		# Group every goal under the goal that acts as its cluster center
		tmp_clusters = defaultdict(list)
		goal_clusters = defaultdict(list)
		cluster_centers_indices = af.cluster_centers_indices_
		labels = af.labels_
		for count, label in enumerate(labels):
			center = self.goal_list[cluster_centers_indices[label]]
			tmp_clusters[center].append(self.goal_list[count])
		# Second-layer clustering of each cluster by edit distance
		for goal, item_list in tmp_clusters.items():
			subclusters = self.subcluster_by_editdistance(goal, item_list)
			for subgoal, items in subclusters.items():
				goal_clusters[subgoal] = items
		return goal_clusters

	def subcluster_by_editdistance(self, center, item_list, threshold=2):
		clusters = defaultdict(list)
		clusters[center].append(center)
		for item in item_list:
			if item in clusters:
				continue
			matched = False
			list_item = self.stemmer.stem(item).split()
			for goal in clusters:
				list_goal = self.stemmer.stem(goal).split()
				# Token-level edit distance between the stemmed phrases
				if edit_distance(list_goal, list_item) < threshold:
					clusters[goal].append(item)
					matched = True
					break
			if not matched:
				# No existing subcluster is close enough: start a new one
				clusters[item].append(item)
		return clusters
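	# Illustrative: with the default threshold of 2, a goal whose stemmed
	# token list differs from a subcluster center's by a single insertion,
	# deletion, or substitution (edit distance 1) joins that subcluster.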

	def get_wordnet_lch(self, concept1, concept2, max_depth=5):
		# Identical stems are a trivial match
		if self.stemmer.stem(concept1) == self.stemmer.stem(concept2):
			return concept1
		for concept1_synset in wn.synsets(concept1):
			for concept2_synset in wn.synsets(concept2):
				commons = \
					concept1_synset.lowest_common_hypernyms(concept2_synset)
				for common in commons:
					# Only accept a hypernym deep (specific) enough
					if common.max_depth() > max_depth:
						return common.lemma_names()[0].replace('_', ' ')
		return None
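	# Illustrative (requires the WordNet corpus): for "dog" and "cat" the
	# first lowest common hypernym is carnivore.n.01, which sits deeper than
	# max_depth=5, so get_wordnet_lch("dog", "cat") returns "carnivore".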

	def get_generalized_goal(self, center, goal_list):
		# Align the center goal's tokens against each other goal's tokens,
		# replacing matched pairs with their common WordNet hypernym.
		output = []
		center_tokens = nltk.word_tokenize(center)
		for goal in goal_list:
			if goal == center:
				continue
			goal_tokens = nltk.word_tokenize(goal)
			goal_count = 0
			center_count = 0
			while center_count < len(center_tokens):
				match = False
				token = center_tokens[center_count]
				while not match:
					if center_count >= len(center_tokens) or \
							goal_count >= len(goal_tokens):
						break
					common = self.get_wordnet_lch(token, goal_tokens[goal_count])
					if common is not None:
						# Tokens generalize: emit the hypernym, advance both
						output.append(common)
						match = True
						goal_count += 1
						center_count += 1
					elif center_count < len(center_tokens) - 1:
						if goal_count < len(goal_tokens) - 1:
							# Look one token ahead on both sides
							if self.get_wordnet_lch(
									center_tokens[center_count+1],
									goal_tokens[goal_count+1]) is not None:
								match = True
								goal_count += 1
								center_count += 1
							# Skip one token on the goal side
							elif self.get_wordnet_lch(
									center_tokens[center_count],
									goal_tokens[goal_count+1]) is not None:
								match = True
								goal_count += 1
							# Skip one token on the center side
							elif self.get_wordnet_lch(
									center_tokens[center_count+1],
									goal_tokens[goal_count]) is not None:
								match = True
								center_count += 1
							else:
								break
						else:
							if self.get_wordnet_lch(
									center_tokens[center_count+1],
									goal_tokens[goal_count]) is not None:
								match = True
								center_count += 1
							else:
								break
					elif goal_count < len(goal_tokens) - 1:
						if self.get_wordnet_lch(
								center_tokens[center_count],
								goal_tokens[goal_count+1]) is not None:
							match = True
							goal_count += 1
						else:
							break
					else:
						break
				if not match and (goal_count == len(goal_tokens) - 1 or
						center_count == len(center_tokens) - 1):
					break
				elif not match:
					# Alignment failed mid-sequence: discard the partial result
					output = []
					goal_count = 0
					center_count = 0
					break
			if output:
				# Drop a dangling article or copula
				if output[-1] in ['a', 'an', 'the', 'be']:
					del output[-1]
				return " ".join(output)
		return None
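A minimal usage sketch, assuming a pickled map like the one shown after Example #1, that the NLTK punkt and wordnet data are installed, and that there are enough goals for the 300-component PCA; RawPlan, load_pickle, and flatten are project-local helpers not shown in these examples:

# Hypothetical usage of GoalCluster on a pickled goal -> actions map.
gc = GoalCluster(goal_path="goal_actions.pkl")
goal_clusters = gc.clustering()
for center, goals in goal_clusters.items():
	print(center, "->", goals)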