Example 1
    def similarity(self, term1, term2):
        """ Cosine similarity between two single terms in the LSA space. """
        # stem and remove stop words from both terms first so that they
        # match the normalised keywords stored in the index
        parser = Parser()
        term1 = parser.tokenise(term1)  # tokenisation returns a list
        if term1 == []:
            return 0
        else:
            term1 = parser.tokenise(term1[0])  # re-tokenise the first token to stem it
        term2 = parser.tokenise(term2)
        if term2 == []:
            return 0
        else:
            term2 = parser.tokenise(term2[0])
        try:
            index1 = self.keyword_index_mapping[term1[0]]
        except (KeyError, IndexError):
            print term1, ": indexing error 1"
            return 0
        try:
            index2 = self.keyword_index_mapping[term2[0]]
        except (KeyError, IndexError):
            print term2, ": indexing error 2"
            return 0
        # cosine of the two term rows of the LSA matrix
        return float(
            np.dot(self.lsa_matrix[index1], self.lsa_matrix[index2])
            / (np.linalg.norm(self.lsa_matrix[index1]) * np.linalg.norm(self.lsa_matrix[index2]))
        )
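
For reference, the quantity returned above is the cosine of the two LSA row vectors. A minimal standalone sketch of the same measure, using made-up toy rows rather than the real lsa_matrix:

    import numpy as np

    def cosine(v1, v2):
        # cos(v1, v2) = (v1 . v2) / (||v1|| * ||v2||)
        return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

    row_cat = np.array([0.8, 0.1, 0.3])  # hypothetical LSA row for "cat"
    row_dog = np.array([0.7, 0.2, 0.2])  # hypothetical LSA row for "dog"
    print(cosine(row_cat, row_dog))      # close to 1.0 for related terms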
Example 2
	def __init__(self, documents = [], transforms = []):
		self.parser = Parser()
		#self._doc_preprocess(documents)
		if len(documents) > 0:
			self._build(documents, transforms)
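
One caveat with this constructor: documents = [] and transforms = [] are mutable default arguments, which Python evaluates once and shares across all calls. A defensive variant (a sketch, not the repository's actual code) avoids the shared state:

    def __init__(self, documents=None, transforms=None):
        documents = documents if documents is not None else []
        transforms = transforms if transforms is not None else []
        self.parser = Parser()
        if len(documents) > 0:
            self._build(documents, transforms)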
Example 3
class VectorSpace:
	""" An algebraic model for representing text documents as vectors of identifiers.
	A document is represented as a vector. Each dimension of the vector corresponds to a
	separate term. If a term occurs in the document, then the value in the vector is non-zero.
	"""
	vector_index_to_keyword_mapping = {}  # maps each keyword to the index of its dimension in the vectors


	parser = None

	def __init__(self, documents = [], transforms = []):
		self.parser = Parser()
		#self._doc_preprocess(documents)
		if len(documents) > 0:
			self._build(documents, transforms)

	def _doc_preprocess(self, docs):
		""" clean the documents and drop the empty ones """
		print "previous", len(docs)
		docs = map(self.parser._clean, docs)
		docs = filter(lambda x: x.strip() != '', docs)  # drop empty / whitespace-only documents
		print "now", len(docs)
		return docs  # rebinding the local name alone would not reach the caller


	def _build(self, documents, transforms):
		""" Create the vector space for the passed document strings without duplicate words"""
		self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)

		# comment out for the class splitting, moved downward to vs_tf
		#matrix = [self._make_vector(document) for document in documents]
		#matrix = reduce(lambda matrix,transform: transform(matrix).transform(), transforms, matrix)
		#self.collection_of_document_term_vectors = matrix

		# comment out for the algorithm modification
		#matrix2 = [self._make_dict(document) for document in documents]
		#matrix2 = [self._make_dict(word_list) for word_list in self.word_list_of_docs]
		#self.collection_of_document_term_dicts = matrix2


	def _get_vector_keyword_index(self, document_list):
		""" map each keyword to the position of its dimension within the document vectors """
		print datetime.now(), "  Tokenizing documents..."
		vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
		print datetime.now(), "  Removing duplicated words..."
		unique_vocabulary_list = self._remove_duplicates(vocabulary_list)
		print datetime.now(), "  Building indexing dictionary..."
		# associate each keyword with the dimension of the vector that represents it
		vector_index = {}
		for offset, word in enumerate(unique_vocabulary_list):
			vector_index[word] = offset
		return vector_index  # {keyword: position}


	# comment out for the class splitting
	'''
	def _make_vector(self, word_string):
		""" @pre: unique(vectorIndex) """

		vector = [0] * len(self.vector_index_to_keyword_mapping)

		word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))

		index_list = []
		for word in word_list:
			vector[self.vector_index_to_keyword_mapping[word]] += 1 #Use simple Term Count Model
			#index_list.append(self.vector_index_to_keyword_mapping[word])
		#self.word_index_list_of_docs.append(index_list)
		return vector
	'''
	'''
	def _make_list(self, word_string):
		""" make an array of lists of the index of each term in each document """
		vector = [[]] * len(self.vector_index_to_keyword_mapping)  # NB: [[]] * n aliases one list n times
		print 'vector', vector

		word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))

		counter = 0
		for word in word_list:
			i = self.vector_index_to_keyword_mapping[word]
			if vector[3] == []:
				print word, 'has index', 3
				(vector[3]).append(counter)
				print vector[3]
				print 'vector', vector
				exit(0)
			else:
				print word, 'has index', self.vector_index_to_keyword_mapping[word]
				list = vector[i]
				print list
				list.append(counter)
				print vector[i]
				print 'vector', vector
			counter += 1
		print vector
		return vector
	'''

	# As _make_vector has been modified and the word_list for each doc is stored,
	# there is no need to rebuild these lists.
	'''
	def _make_dict(self, word_string):
		""" map each term to the list of its positions within the document """
		positions = {}

		word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))

		counter = 0
		for word in word_list:
			if word not in positions:
				positions[word] = [counter]
			else:
				positions[word].append(counter)
			counter += 1
		return positions
	'''


	def _remove_duplicates(self, items):
		""" remove duplicates from a list (note: set() does not preserve order) """
		return set(items)


	def _cosine(self, vector1, vector2):
		""" relate documents j and q in the concept space by comparing their vectors:
			cosine = ( V1 . V2 ) / ( ||V1|| * ||V2|| ) """
		return float(numpy.dot(vector1, vector2) / (numpy.linalg.norm(vector1) * numpy.linalg.norm(vector2)))
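
As a quick sanity check of _cosine, a standalone worked example with made-up vectors:

    import numpy

    v1 = numpy.array([1.0, 2.0, 0.0])
    v2 = numpy.array([2.0, 4.0, 0.0])  # parallel to v1
    v3 = numpy.array([0.0, 0.0, 5.0])  # orthogonal to v1

    def cosine(a, b):
        return float(numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

    print(cosine(v1, v2))  # 1.0: same direction
    print(cosine(v1, v3))  # 0.0: no shared dimensions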
Example 4
    def kl_divergence(self, doc1, doc2):
        """ Compute a smoothed two-way KL-style divergence between two documents
        in the LSA space; returns the pair (D(doc1||doc2), D(doc2||doc1)). """
        parser = Parser()
        term_list1 = parser.tokenise_and_remove_stop_words([doc1])  # the argument is defined as a document list
        term_list2 = parser.tokenise_and_remove_stop_words([doc2])

        if not term_list1 or not term_list2:
            return (0, 0)

        index_vector1 = []
        index_vector2 = []
        length = len(self.vs.vector_index_to_keyword_mapping)


        start_time = datetime.now()
        for word1 in term_list1:
            try:
                # collect the vocabulary index of each word; skip unknown words
                index_vector1.append(self.vs.vector_index_to_keyword_mapping[word1])
            except KeyError:
                #print word1, ": indexing error 1"
                pass

        # turn vector1 into a tf-idf vector
        index_count_dict1 = {}
        for index1 in index_vector1:
            index_count_dict1[index1] = index_count_dict1.get(index1, 0) + 1
        if not index_count_dict1:
            return (0, 0)
        max_count = max(index_count_dict1.values())  # count of the most frequent term
        col1 = []
        for key in index_count_dict1:
            col1.append(key)
            # augmented term frequency * smoothed inverse document frequency
            index_count_dict1[key] = (0.5 + 0.5 * index_count_dict1[key] / float(max_count)) \
                                     * np.log1p(abs(self.vs.tfidf.document_total / float(self.vs.tfidf.term_doc_occur_list[key])))

        length1 = len(col1)
        col1.append(length - 1)  # pad the last column so the sparse row has the full vocabulary width
        data1 = index_count_dict1.values()
        data1.append(0)
        row1 = [0] * (length1 + 1)
        term_vector1 = sp.coo_matrix((data1, (row1, col1)))
        term_vector1 = self.vs.model.transform(term_vector1)
        #print datetime.now(), " Vector 1 built, cost ", datetime.now()-start_time
        #print datetime.now(), " Vector 1 built, cost ", datetime.now()-start_time

        start_time = datetime.now()
        for word2 in term_list2:
            try:
                # collect the vocabulary index of each word; skip unknown words
                index_vector2.append(self.vs.vector_index_to_keyword_mapping[word2])
            except KeyError:
                #print word2, ": indexing error 2"
                pass

        # turn vector2 into a tf-idf vector
        index_count_dict2 = {}
        for index2 in index_vector2:
            index_count_dict2[index2] = index_count_dict2.get(index2, 0) + 1
        if not index_count_dict2:
            return (0, 0)
        max_count = max(index_count_dict2.values())  # count of the most frequent term
        col2 = []
        for key in index_count_dict2:
            col2.append(key)
            # augmented term frequency * smoothed inverse document frequency
            index_count_dict2[key] = (0.5 + 0.5 * index_count_dict2[key] / float(max_count)) \
                                     * np.log1p(abs(self.vs.tfidf.document_total / float(self.vs.tfidf.term_doc_occur_list[key])))

        length2 = len(col2)
        col2.append(length - 1)  # pad the last column so the sparse row has the full vocabulary width
        data2 = index_count_dict2.values()
        data2.append(0)
        row2 = [0] * (length2 + 1)
        term_vector2 = sp.coo_matrix((data2, (row2, col2)))
        term_vector2 = self.vs.model.transform(term_vector2)
        #print datetime.now(), " Vector 2 built, cost ", datetime.now()-start_time

        term_vector1 = term_vector1[0]
        term_vector2 = term_vector2[0]
        #vector_m =  [a+b for a,b in zip(map(lambda x: x*0.5, term_vector1),map(lambda x: x*0.5, term_vector2))]
        # two-way divergence; np.log1p (log(1+x)) is used instead of a plain log,
        # and a small epsilon stands in for zero components to avoid division by zero
        result1 = 0
        result2 = 0
        for i in range(len(term_vector1)):
            if term_vector2[i] != 0:
                result1 += term_vector1[i] * np.log1p(term_vector1[i] / term_vector2[i])
            else:
                result1 += term_vector1[i] * np.log1p(term_vector1[i] / 0.000001)
            if term_vector1[i] != 0:
                result2 += term_vector2[i] * np.log1p(term_vector2[i] / term_vector1[i])
            else:
                result2 += term_vector2[i] * np.log1p(term_vector2[i] / 0.000001)
        return (result1, result2)
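
For comparison, the textbook Kullback-Leibler divergence is D(P||Q) = sum_i p_i * log(p_i / q_i); the method above departs from it by using log1p and an epsilon floor as smoothing choices. A minimal reference sketch on toy distributions (standalone, not part of this class):

    import numpy as np

    def kl(p, q, eps=1e-6):
        # D(P || Q) = sum_i p_i * log(p_i / q_i), with q floored at eps
        p = np.asarray(p, dtype=float)
        q = np.maximum(np.asarray(q, dtype=float), eps)
        mask = p > 0  # components with p_i == 0 contribute nothing
        return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))

    p = [0.5, 0.3, 0.2]
    q = [0.4, 0.4, 0.2]
    print(kl(p, q))  # asymmetric measure...
    print(kl(q, p))  # ...hence the (result1, result2) pair above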
Example 5
    def term_similarity(self, term1, term2):
        """
        Take in two terms and calculate their similarity through vector combination
        :param term1:
        :param term2:
        :return:
        """
        parser = Parser()
        # words in terms are connected with underscore after the NLTK MWE tokenization
        # replace it with " " to make it compatible with parser here
        term1 = str(term1).replace("_", " ")
        term2 = str(term2).replace("_", " ")
        term1 = str(term1).replace("-", " ")
        term2 = str(term2).replace("-", " ")

        term_list1 = parser.tokenise_and_remove_stop_words([term1])  # the argument is defined as a document list
        term_list2 = parser.tokenise_and_remove_stop_words([term2])
        term_vector1 = None
        term_vector2 = None

        start_time = datetime.now()
        for word1 in term_list1:
            try:
                index1 = self.keyword_index_mapping[word1]
            except KeyError:
                # print word1, ": indexing error 1"
                continue  # skip words that are not in the vocabulary

            if term_vector1 is None:
                term_vector1 = (
                    self.lsa_matrix.getcol(index1).toarray().flatten()
                )  # to ndarray > shape (X, 1) > flatten
            else:
                term_vector1 = term_vector1 + self.lsa_matrix.getcol(index1).toarray().flatten()
        # print datetime.now(), " Vector 1 built, cost ", datetime.now()-start_time
        # print datetime.now(), " Vector 1 built, cost ", datetime.now()-start_time

        start_time = datetime.now()
        for word2 in term_list2:
            try:
                index2 = self.keyword_index_mapping[word2]
            except KeyError:
                # print word2, ": indexing error 2"
                continue  # skip words that are not in the vocabulary

            if term_vector2 is None:
                term_vector2 = self.lsa_matrix.getcol(index2).toarray().flatten()
            else:
                term_vector2 = term_vector2 + self.lsa_matrix.getcol(index2).toarray().flatten()
        # print datetime.now(), " Vector 2 built, cost ", datetime.now()-start_time

        if term_vector1 is not None and term_vector2 is not None:
            # cosine of the two combined vectors
            return float(
                np.dot(term_vector1, term_vector2) / (np.linalg.norm(term_vector1) * np.linalg.norm(term_vector2))
            )
        else:
            return 0
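
The vector-combination step above simply sums the LSA column vectors of a multiword term before taking the cosine. A toy illustration with made-up column vectors (the values are stand-ins, not the real lsa_matrix):

    import numpy as np

    col_neural = np.array([0.6, 0.1, 0.0])   # hypothetical column for "neural"
    col_network = np.array([0.4, 0.3, 0.1])  # hypothetical column for "network"
    combined = col_neural + col_network      # vector combination, as in the loops above

    col_other = np.array([0.9, 0.5, 0.1])    # a single-word comparison term
    sim = float(np.dot(combined, col_other) /
                (np.linalg.norm(combined) * np.linalg.norm(col_other)))
    print(sim)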