def similarity(self, term1, term2):
    # stem and remove stop words in both terms first to make them compatible
    # with the stored keyword index
    parser = Parser()
    term1 = parser.tokenise(term1)  # after tokenization, it is a list
    if term1 == []:
        return 0
    else:
        term1 = parser.tokenise(term1[0])
    term2 = parser.tokenise(term2)
    if term2 == []:
        return 0
    else:
        term2 = parser.tokenise(term2[0])
    try:
        index1 = self.keyword_index_mapping[term1[0]]
    except KeyError:
        print term1, ": indexing error 1"
        return 0
    try:
        index2 = self.keyword_index_mapping[term2[0]]
    except KeyError:
        print term2, ": indexing error 2"
        return 0
    # cosine similarity between the two term rows of the LSA matrix
    return float(np.dot(self.lsa_matrix[index1], self.lsa_matrix[index2])
                 / (np.linalg.norm(self.lsa_matrix[index1])
                    * np.linalg.norm(self.lsa_matrix[index2])))
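# Hedged usage sketch for similarity(): assumes `lsa` is an instance of the
# LSA-backed class this method belongs to (its constructor is not shown here),
# with keyword_index_mapping and lsa_matrix already built.
#
#     print lsa.similarity("retrieval", "search")    # cosine in [-1, 1]
#     print lsa.similarity("qwertyuiop", "search")   # 0 on an indexing miss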
class VectorSpace:
    """ An algebraic model for representing text documents as vectors of
    identifiers. A document is represented as a vector. Each dimension of
    the vector corresponds to a separate term. If a term occurs in the
    document, then the value in the vector is non-zero. """

    vector_index_to_keyword_mapping = {}
    parser = None

    def __init__(self, documents=[], transforms=[]):
        self.parser = Parser()
        #self._doc_preprocess(documents)
        if len(documents) > 0:
            self._build(documents, transforms)

    def _doc_preprocess(self, docs):
        print "previous", len(docs)
        docs = map(self.parser._clean, docs)
        # keep only non-empty documents (the original lambda `x == '' or ' '`
        # was always truthy, so nothing was ever filtered out)
        docs = filter(lambda x: x != '', docs)
        print "now", len(docs)
        return docs

    def _build(self, documents, transforms):
        """ Create the vector space for the passed document strings,
        without duplicate words. """
        self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)
        # comment out for the class splitting, moved downward to vs_tf
        #matrix = [self._make_vector(document) for document in documents]
        #matrix = reduce(lambda matrix, transform: transform(matrix).transform(), transforms, matrix)
        #self.collection_of_document_term_vectors = matrix
        # comment out for the algorithm modification
        #matrix2 = [self._make_dict(document) for document in documents]
        #matrix2 = [self._make_dict(word_list) for word_list in self.word_list_of_docs]
        #self.collection_of_document_term_dicts = matrix2

    def _get_vector_keyword_index(self, document_list):
        """ Create the keyword index that associates each term with its
        position (dimension) within the document vectors. """
        print datetime.now(), " Tokenizing documents..."
        vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
        print datetime.now(), " Removing duplicated words..."
        unique_vocabulary_list = self._remove_duplicates(vocabulary_list)
        print datetime.now(), " Building indexing dictionary..."
        vector_index = {}
        offset = 0
        # Associate a position with each keyword; the position maps to the
        # dimension of the vector used to represent this word
        for word in unique_vocabulary_list:
            vector_index[word] = offset
            offset += 1
        return vector_index  # (keyword: position)

    # comment out for the class splitting
    '''
    def _make_vector(self, word_string):
        """ @pre: unique(vectorIndex) """
        vector = [0] * len(self.vector_index_to_keyword_mapping)
        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))
        index_list = []
        for word in word_list:
            vector[self.vector_index_to_keyword_mapping[word]] += 1  # Use simple Term Count Model
            #index_list.append(self.vector_index_to_keyword_mapping[word])
        #self.word_index_list_of_docs.append(index_list)
        return vector
    '''

    '''
    def _make_list(self, word_string):
        """ make an array of lists of the index of each term in each document """
        vector = [[]] * len(self.vector_index_to_keyword_mapping)
        print 'vector', vector
        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))
        counter = 0
        for word in word_list:
            i = self.vector_index_to_keyword_mapping[word]
            if vector[3] == []:
                print word, 'has index', 3
                (vector[3]).append(counter)
                print vector[3]
                print 'vector', vector
                exit(0)
            else:
                print word, 'has index', self.vector_index_to_keyword_mapping[word]
                list = vector[i]
                print list
                list.append(counter)
                print vector[i]
                print 'vector', vector
            counter += 1
        print vector
        return vector
    '''
    # As _make_vector has been modified and the word_list for each doc is
    # stored, there is no need to rebuild these lists again.
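    # Hedged usage sketch: building the keyword index for a toy corpus.
    # Assumes this module's usual imports (Parser, datetime) are in place;
    # the exact index positions depend on _remove_duplicates, which returns
    # a set, so they vary between runs.
    #
    #     docs = ["the cat sat on the mat", "dogs and cats living together"]
    #     vs = VectorSpace(docs)
    #     print vs.vector_index_to_keyword_mapping   # e.g. {'cat': 0, 'sat': 1, ...}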
    '''
    def _make_dict(self, word_string):
        """ map each term in a document to the list of positions at which it occurs """
        dict = {}
        #print 'dict', dict
        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))
        counter = 0
        for word in word_list:
            i = self.vector_index_to_keyword_mapping[word]
            if not dict.has_key(word):
                dict[word] = [counter]
            else:
                dict[word].append(counter)
            counter += 1
        return dict
    '''

    def _remove_duplicates(self, items):
        """ remove duplicates from a list (note: set() does not preserve order) """
        return set(items)

    def _cosine(self, vector1, vector2):
        """ compare related documents j and q in the concept space:
        cosine = ( V1 * V2 ) / ( ||V1|| x ||V2|| ) """
        # numpy has no top-level norm(); it lives in numpy.linalg
        return float(np.dot(vector1, vector2)
                     / (np.linalg.norm(vector1) * np.linalg.norm(vector2)))
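    # Hedged standalone check of the _cosine computation on toy dense
    # vectors; `np` is assumed to be numpy, as used elsewhere in this module.
    #
    #     import numpy as np
    #     v1 = np.array([1.0, 2.0, 0.0])
    #     v2 = np.array([2.0, 4.0, 0.0])
    #     print float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
    #     # -> 1.0, since v2 is a scalar multiple of v1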
def kl_divergence(self, doc1, doc2):
    parser = Parser()
    # words in terms are connected with underscores after the NLTK MWE
    # tokenization; they are replaced with " " to make them compatible
    # with the parser here
    term_list1 = parser.tokenise_and_remove_stop_words([doc1])  # the defined argument is document_list
    term_list2 = parser.tokenise_and_remove_stop_words([doc2])  # the defined argument is document_list
    if not term_list1 or not term_list2:
        return (0, 0)

    length = len(self.vs.vector_index_to_keyword_mapping)

    start_time = datetime.now()
    index_vector1 = []
    for word1 in term_list1:
        try:
            index_vector1.append(self.vs.vector_index_to_keyword_mapping[word1])
        except KeyError:
            #print word1, ": indexing error 1"
            continue
    #print datetime.now(), " Indexing word1 completes."

    # turn vector1 into a tf-idf vector
    index_count_dict1 = {}
    for index1 in index_vector1:
        index_count_dict1[index1] = index_count_dict1.get(index1, 0) + 1
    if not index_count_dict1:
        return (0, 0)
    # highest raw term count, for augmented TF (the original took
    # max(dict, key=dict.get), which returns the key, not the count)
    word_max = max(index_count_dict1.values())
    if word_max == 0:
        return (0, 0)
    col1 = []
    data1 = []
    for key in index_count_dict1:
        col1.append(key)
        tf = 0.5 + 0.5 * index_count_dict1[key] / float(word_max)
        idf = np.log1p(abs(self.vs.tfidf.document_total / float(self.vs.tfidf.term_doc_occur_list[key])))
        data1.append(tf * idf)
    length1 = len(col1)
    # pad an explicit zero into the last column so the sparse row has full width
    col1.append(length - 1)
    data1.append(0)
    row1 = [0] * (length1 + 1)
    term_vector1 = sp.coo_matrix((data1, (row1, col1)))
    term_vector1 = self.vs.model.transform(term_vector1)
    #print datetime.now(), " Vector 1 built, cost ", datetime.now() - start_time

    start_time = datetime.now()
    index_vector2 = []
    for word2 in term_list2:
        try:
            index_vector2.append(self.vs.vector_index_to_keyword_mapping[word2])
        except KeyError:
            #print word2, ": indexing error 2"
            continue
    #print datetime.now(), " Indexing word2 completes."

    # turn vector2 into a tf-idf vector
    index_count_dict2 = {}
    for index2 in index_vector2:
        index_count_dict2[index2] = index_count_dict2.get(index2, 0) + 1
    if not index_count_dict2:
        return (0, 0)
    word_max = max(index_count_dict2.values())
    if word_max == 0:
        return (0, 0)
    col2 = []
    data2 = []
    for key in index_count_dict2:
        col2.append(key)
        tf = 0.5 + 0.5 * index_count_dict2[key] / float(word_max)
        idf = np.log1p(abs(self.vs.tfidf.document_total / float(self.vs.tfidf.term_doc_occur_list[key])))
        data2.append(tf * idf)
    length2 = len(col2)
    col2.append(length - 1)
    data2.append(0)
    row2 = [0] * (length2 + 1)
    term_vector2 = sp.coo_matrix((data2, (row2, col2)))
    term_vector2 = self.vs.model.transform(term_vector2)
    #print datetime.now(), " Vector 2 built, cost ", datetime.now() - start_time

    term_vector1 = term_vector1[0]
    term_vector2 = term_vector2[0]
    #vector_m = [a + b for a, b in zip(map(lambda x: x * 0.5, term_vector1), map(lambda x: x * 0.5, term_vector2))]
    # symmetric pair of KL-style divergences; np.log1p (log(1 + x)) acts as
    # a smoothed log, and a small epsilon stands in for zero components
    result1 = 0
    result2 = 0
    for i in range(len(term_vector1)):
        if not term_vector2[i] == 0:
            result1 += term_vector1[i] * np.log1p(term_vector1[i] / term_vector2[i])
        else:
            result1 += term_vector1[i] * np.log1p(term_vector1[i] / 0.000001)
        if not term_vector1[i] == 0:
            result2 += term_vector2[i] * np.log1p(term_vector2[i] / term_vector1[i])
        else:
            result2 += term_vector2[i] * np.log1p(term_vector2[i] / 0.000001)
    return (result1, result2)
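# Hedged standalone sketch of the divergence pair computed above, on toy
# dense vectors (no Parser or LSA model needed). Note that np.log1p gives
# log(1 + p/q), i.e. a smoothed variant, not textbook KL divergence.
#
#     import numpy as np
#     p = [0.5, 0.3, 0.2]
#     q = [0.4, 0.4, 0.2]
#     eps = 0.000001
#     d1 = sum(a * np.log1p(a / (b or eps)) for a, b in zip(p, q))
#     d2 = sum(b * np.log1p(b / (a or eps)) for a, b in zip(p, q))
#     print (d1, d2)   # both near log(2) ~ 0.69 when p and q are similar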
def term_similarity(self, term1, term2):
    """
    Take in two terms and calculate their similarity by combining the LSA
    column vectors of their component words.
    :param term1:
    :param term2:
    :return:
    """
    parser = Parser()
    # words in terms are connected with underscores after the NLTK MWE
    # tokenization; replace them (and hyphens) with " " to make the terms
    # compatible with the parser here
    term1 = str(term1).replace("_", " ").replace("-", " ")
    term2 = str(term2).replace("_", " ").replace("-", " ")
    term_list1 = parser.tokenise_and_remove_stop_words([term1])  # the defined argument is document_list
    term_list2 = parser.tokenise_and_remove_stop_words([term2])  # the defined argument is document_list

    term_vector1 = None
    term_vector2 = None

    start_time = datetime.now()
    for word1 in term_list1:
        try:
            index1 = self.keyword_index_mapping[word1]
        except KeyError:
            #print word1, ": indexing error 1"
            continue
        # getcol() gives an (X, 1) sparse column; flatten it to a 1-D ndarray
        column = self.lsa_matrix.getcol(index1).toarray().flatten()
        if term_vector1 is None:
            term_vector1 = column
        else:
            term_vector1 = term_vector1 + column
    #print datetime.now(), " Vector 1 built, cost ", datetime.now() - start_time

    start_time = datetime.now()
    for word2 in term_list2:
        try:
            index2 = self.keyword_index_mapping[word2]
        except KeyError:
            #print word2, ": indexing error 2"
            continue
        column = self.lsa_matrix.getcol(index2).toarray().flatten()
        if term_vector2 is None:
            term_vector2 = column
        else:
            # the original accumulated into term_vector1 here by mistake
            term_vector2 = term_vector2 + column
    #print datetime.now(), " Vector 2 built, cost ", datetime.now() - start_time

    if term_vector1 is not None and term_vector2 is not None:
        return float(np.dot(term_vector1, term_vector2)
                     / (np.linalg.norm(term_vector1) * np.linalg.norm(term_vector2)))
    else:
        return 0
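# Hedged usage sketch: compares two multi-word terms as they come out of the
# NLTK MWE tokenizer (underscore-joined). Assumes `model` is an instance of
# the LSA-backed class that owns term_similarity (construction not shown in
# this file).
#
#     score = model.term_similarity("information_retrieval", "document_search")
#     print score   # cosine in [-1, 1]; 0 when neither term is in the vocabulary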