Example #1
def __init__(self, documents=[], transforms=[TFIDF, LSA]):
    self.vectors = []
    self.parser = Parser()
    if len(documents) > 0:
        self._build(documents, transforms)
    self.vector_to_doc = {}
    self.jokes = documents  # added instance variable holding the actual jokes; its index matches self.vectors
Example #2
def __init__(self, documents=[], transforms=[TFIDF, LSA]):
    self.collection_of_document_term_vectors = []
    self.transformed_matrix = []
    self.parser = Parser()
    if len(documents) > 0:
        self._build(documents, transforms)
Example #3
def __init__(self, documentsdict={}, transforms=[TFIDF, LSA]):
    self.collection_of_document_term_vectors = []
    self.documents = []
    self.file_path_all = []
    self.document_ID_file_info_mapping = {}
    self.transformed_matrix = []
    self.parser = Parser()

    self._addToList(documentsdict)
    if len(self.documents) > 0:
        self._build(self.documents, transforms)
Example #4
def __init__(self, documents=[], transforms=[TFIDF, LSA]):
    self.collection_of_document_term_vectors = []
    self.parser = Parser()
    if len(documents) > 0:
        self._build(documents, transforms)
Example #5
# Needed imports (reduce is no longer a builtin in Python 3); Parser, TFIDF
# and LSA come from the surrounding project's own modules.
from functools import reduce

from numpy import dot
from numpy.linalg import norm


class VectorSpace:
    """ An algebraic model for representing text documents as vectors of identifiers.
    A document is represented as a vector. Each dimension of the vector corresponds to a
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    collection_of_document_term_vectors = []
    vector_index_to_keyword_mapping = []

    parser = None

    def __init__(self, documents=[], transforms=[TFIDF, LSA]):
        self.collection_of_document_term_vectors = []
        self.parser = Parser()
        if len(documents) > 0:
            self._build(documents, transforms)

    def related(self, document_id):
        """ Find documents related to the document at the passed index within the document vectors. """
        ratings = [self._cosine(self.collection_of_document_term_vectors[document_id], document_vector)
                   for document_vector in self.collection_of_document_term_vectors]
        ratings.sort(reverse=True)
        return ratings

    def search(self, searchList):
        """ Search for documents that match based on a list of terms. """
        queryVector = self._build_query_vector(searchList)

        ratings = [self._cosine(queryVector, documentVector)
                   for documentVector in self.collection_of_document_term_vectors]
        return ratings

    def _build(self, documents, transforms):
        """ Create the vector space for the passed document strings. """
        self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)

        matrix = [self._make_vector(document) for document in documents]
        # Apply each transform (e.g. TFIDF, then LSA) to the term-count matrix in turn
        matrix = reduce(lambda matrix, transform: transform(matrix).transform(), transforms, matrix)
        self.collection_of_document_term_vectors = matrix

    def _get_vector_keyword_index(self, document_list):
        """ Map each keyword to the position of its element within the document vectors. """
        vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
        unique_vocabulary_list = self._remove_duplicates(vocabulary_list)

        vector_index = {}
        offset = 0
        # Associate a position with each keyword; the position is the dimension of the
        # vector used to represent this word
        for word in unique_vocabulary_list:
            vector_index[word] = offset
            offset += 1
        return vector_index  # {keyword: position}

    def _make_vector(self, word_string):
        """ @pre: unique(vectorIndex) """
        vector = [0] * len(self.vector_index_to_keyword_mapping)
        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))
        for word in word_list:
            vector[self.vector_index_to_keyword_mapping[word]] += 1  # simple term-count model
        return vector

    def _build_query_vector(self, term_list):
        """ Convert a query term list into a term vector. """
        query = self._make_vector(" ".join(term_list))
        return query

    def _remove_duplicates(self, items):
        """ Remove duplicates from a list. """
        return set(items)

    def _cosine(self, vector1, vector2):
        """ Related documents j and q are compared in the concept space:
            cosine = (V1 * V2) / (||V1|| * ||V2||) """
        return float(dot(vector1, vector2) / (norm(vector1) * norm(vector2)))
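
For context, a minimal usage sketch of this class, assuming the Parser and the TFIDF/LSA transforms from the surrounding project are importable; the document strings are made up:

documents = ["The cat sat on the mat",
             "A dog barked at the cat",
             "Dogs and cats make good pets"]

vector_space = VectorSpace(documents)

# Cosine similarity of each document vector against the query terms
print(vector_space.search(["cat"]))

# Sorted similarities of every document to document 0
print(vector_space.related(0))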
Example #6
def create_parser(self):
    return Parser(ParserTest.FakeStopWords())
Example #7
def create_parser_with_stopwords(self, words_string):
    return Parser(ParserTest.FakeStopWords(words_string))
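
Examples #6 and #7 are test factory methods that build a Parser around a stub stop-word list. FakeStopWords itself is not shown on this page; the sketch below is purely an assumed shape for such a test double, including a hypothetical is_stop_word interface:

# Hypothetical sketch only; the real FakeStopWords is defined inside ParserTest.
class FakeStopWords:
    def __init__(self, words_string=""):
        # Keep the stop words in a set for O(1) membership tests
        self.words = set(words_string.split())

    def is_stop_word(self, word):  # assumed interface consumed by Parser
        return word in self.words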
Example #8
class VectorSpace:
    """ An algebraic model for representing text documents as vectors of identifiers.
    A document is represented as a vector. Each dimension of the vector corresponds to a
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    vectors = []
    vector_index_to_keyword_mapping = []
    vector_to_doc = {}
    jokes = []
    parser = None

    def __init__(self, documents=[], transforms=[TFIDF, LSA]):
        self.vectors = []
        self.parser = Parser()
        if len(documents) > 0:
            self._build(documents, transforms)
        self.vector_to_doc = {}
        self.jokes = documents  # added instance variable holding the actual jokes;
        # its index corresponds to the index into self.vectors

    def related(self, document_id, k=4):  # added k parameter to find the k most similar
        """ Find the k documents most related to the document at the passed index,
        returned as (score, joke, index) triples. """
        ratings = [(self._cosine(self.vectors[document_id], self.vectors[i]), self.jokes[i], i)
                   for i in range(len(self.vectors))]
        # Changed to return the actual documents as well as the scores;
        # ratings[0] is the document itself, so skip it
        ratings.sort(key=lambda x: x[0], reverse=True)
        return ratings[1:k + 1]

    def search(self, searchList, k=1):  # added k parameter to find the k most similar
        """ Search for the k documents that best match a list of terms,
        returned as (score, joke) pairs. """
        queryVector = self._build_query_vector(searchList)

        ratings = [(self._cosine(queryVector, self.vectors[i]), self.jokes[i])
                   for i in range(len(self.vectors))]
        ratings.sort(key=lambda x: x[0], reverse=True)
        return ratings[:k]

    def _build(self, documents, transforms):
        """ Create the vector space for the passed document strings. """
        self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)

        matrix = [self._make_vector(document) for document in documents]
        matrix = reduce(lambda matrix, transform: transform(matrix).transform(), transforms, matrix)
        self.vectors = matrix

    def _get_vector_keyword_index(self, document_list):
        """ Map each keyword to the position of its element within the document vectors. """
        vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
        unique_vocabulary_list = self._remove_duplicates(vocabulary_list)

        vector_index = {}
        offset = 0
        # Associate a position with each keyword; the position is the dimension of the
        # vector used to represent this word
        for word in unique_vocabulary_list:
            vector_index[word] = offset
            offset += 1
        return vector_index  # {keyword: position}

    def _make_vector(self, word_string):
        """ @pre: unique(vectorIndex) """
        vector = [0] * len(self.vector_index_to_keyword_mapping)
        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))
        for word in word_list:
            vector[self.vector_index_to_keyword_mapping[word]] += 1  # simple term-count model
        return vector

    def _build_query_vector(self, term_list):
        """ Convert a query term list into a term vector. """
        query = self._make_vector(" ".join(term_list))
        return query

    def _remove_duplicates(self, items):
        """ Remove duplicates from a list. """
        return set(items)

    def _cosine(self, vector1, vector2):
        """ Related documents j and q are compared in the concept space:
            cosine = (V1 * V2) / (||V1|| * ||V2||) """
        return float(dot(vector1, vector2) / (norm(vector1) * norm(vector2)))
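
A minimal usage sketch of this variant, assuming the same imports as in Example #5; the joke strings are invented for illustration:

jokes = ["Why did the chicken cross the road? To get to the other side.",
         "I told my computer a joke. It did not get it.",
         "Why do programmers prefer dark mode? Light attracts bugs."]

vector_space = VectorSpace(jokes)

# Best match for a query, as a (score, joke) pair
print(vector_space.search(["computer"], k=1))

# The 2 jokes most similar to joke 0, as (score, joke, index) triples
print(vector_space.related(0, k=2))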
Example #9
class VectorSpace:
    """ An algebraic model for representing text documents as vectors of identifiers.
    A document is represented as a vector. Each dimension of the vector corresponds to a
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """

    collection_of_document_term_vectors = []
    documents = []
    file_path_all = []
    vector_index_to_keyword_mapping = []
    document_ID_file_info_mapping = {}
    transformed_matrix = []
    parser = None

    def __init__(self, documentsdict={}, transforms=[TFIDF, LSA]):
        self.collection_of_document_term_vectors = []
        self.documents = []
        self.file_path_all = []
        self.document_ID_file_info_mapping = {}
        self.transformed_matrix = []
        self.parser = Parser()

        self._addToList(documentsdict)
        if len(self.documents) > 0:
            self._build(self.documents, transforms)

    def _addToList(self, documents_dict):
        # Split the {file path: document text} dict into parallel lists and
        # record each path's document ID
        i = 0
        for key in documents_dict:
            self.documents.append(documents_dict[key])
            self.file_path_all.append(key)
            self.document_ID_file_info_mapping[key] = str(i)
            print(key, i)
            i = i + 1

    def get_file_path_all(self):
        return self.file_path_all

    def get_document_ID_file_info_mapping(self):
        return self.document_ID_file_info_mapping

    def related(self, document_id):
        """ Find documents related to the document at the passed index within the document
        vectors, i.e. how one given document relates to all other documents. """
        ratings = [
            self._cosine(self.collection_of_document_term_vectors[document_id],
                         document_vector)
            for document_vector in self.collection_of_document_term_vectors
        ]
        # ratings are returned unsorted; callers sort as needed
        return ratings

    def relatedBySVDmatrix(self, document_id):
        print(self.transformed_matrix[document_id])
        ratings = [
            self._cosine(self.transformed_matrix[document_id], document_vector)
            for document_vector in self.transformed_matrix
        ]
        return ratings

    def setTransform(self, transforms):
        self.transformed_matrix = transforms
        return self.transformed_matrix

    def searchInSVDmatrix(self, searchList):
        queryVector = self._build_query_vector(searchList)
        print(queryVector)
        ratings = [
            self._cosine(queryVector, documentVector)
            for documentVector in self.transformed_matrix
        ]
        return ratings

    def search(self, searchList):
        """ Search for documents that match based on a list of terms. """
        queryVector = self._build_query_vector(searchList)

        ratings = [
            self._cosine(queryVector, documentVector)
            for documentVector in self.collection_of_document_term_vectors
        ]
        return ratings

    def _build(self, documents, transforms):
        """ Create the vector space for the passed document strings. """
        print('from vector_space class')
        print(documents)
        self.vector_index_to_keyword_mapping = self._get_vector_keyword_index(documents)

        matrix = [self._make_vector(document) for document in documents]
        # Transforms are not applied here; see setTransform for supplying a transformed matrix
        #matrix = reduce(lambda matrix, transform: transform(matrix).transform(), transforms, matrix)
        self.collection_of_document_term_vectors = matrix

    def _get_vector_keyword_index(self, document_list):
        """ Map each keyword to the position of its element within the document vectors. """
        vocabulary_list = self.parser.tokenise_and_remove_stop_words(document_list)
        unique_vocabulary_list = self._remove_duplicates(vocabulary_list)

        vector_index = {}
        offset = 0
        # Associate a position with each keyword; the position is the dimension of the
        # vector used to represent this word
        for word in unique_vocabulary_list:
            vector_index[word] = offset
            offset += 1
        return vector_index  # {keyword: position}

    def _make_vector(self, word_string):
        """ @pre: unique(vectorIndex) """
        vector = [0] * len(self.vector_index_to_keyword_mapping)

        word_list = self.parser.tokenise_and_remove_stop_words(word_string.split(" "))

        for word in word_list:
            if word in self.vector_index_to_keyword_mapping:  # ignore out-of-vocabulary words
                vector[self.vector_index_to_keyword_mapping[word]] += 1  # simple term-count model
        return vector

    def _build_query_vector(self, term_list):
        """ Convert a query term list into a term vector. """
        query = self._make_vector(" ".join(term_list))
        return query

    def _remove_duplicates(self, items):
        """ Remove duplicates from a list. """
        return set(items)

    def _cosine(self, vector1, vector2):
        """ Related documents j and q are compared in the concept space:
            cosine = (V1 * V2) / (||V1|| * ||V2||) """
        return float(dot(vector1, vector2) / (norm(vector1) * norm(vector2)))
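
A minimal usage sketch for this dict-based variant, again assuming the same project imports as in Example #5; the file paths and texts are invented for illustration:

documents_dict = {
    "/data/a.txt": "The cat sat on the mat",
    "/data/b.txt": "A dog barked at the cat",
}

vector_space = VectorSpace(documents_dict)
print(vector_space.get_document_ID_file_info_mapping())  # {file path: document ID}

# Similarities over the raw term-count vectors
print(vector_space.search(["cat"]))

# Install an externally transformed (e.g. SVD-reduced) matrix, then query it
vector_space.setTransform(vector_space.collection_of_document_term_vectors)
print(vector_space.relatedBySVDmatrix(0))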