Example #1
    def run(self, query, page=Page(0, 20)):
        '''
        Procedure:
            1. tokenize the query
            2. calculate query weight
            3. calculate all distances
            4. sort distances
            5. return sorted result

        Args:
            query: query string
            page: page size and offset

        Returns:
            list of identifiers
        '''
        # tokenize query
        tokens = tokenize_text(query)
        if not tokens:
            return []

        # calculate query weight
        bag_of_words = bag(tokens)
        max_freq = bag_of_words[max(bag_of_words, key=bag_of_words.get)]
        query_tf = lil_matrix((1, self.tokens_no))
        for token, freq in bag_of_words.items():
            if token not in self.tokens:
                continue
            index = self.tokens[token]
            query_tf[0, index] = tf(freq, max_freq)
        query_tf = csr_matrix(query_tf)
        query_w = csr_matrix(query_tf.multiply(self.idf))

        # calculate distances between all documents and query
        distances = self.distance(self.tf_idf, query_w)

        # sort results and return specified page
        distances = distances[:, 0]
        sorted_indices = np.argsort(distances)
        top = sorted_indices[page.start_index:page.end_index]
        f = np.vectorize(lambda x: self.iterative_docs[x])
        result = list(f(top))

        return result
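The run method above is shown out of its class, so several names are only implied: numpy (as np) and scipy.sparse (lil_matrix, csr_matrix) at module level, the instance attributes self.tokens, self.tokens_no, self.idf, self.tf_idf, self.iterative_docs and self.distance, the helpers tokenize_text, bag and tf, and a Page value object exposing start_index and end_index. As a rough, non-authoritative sketch, the tf helper and the distance callable could look like the following, assuming augmented term frequency and cosine distance over sparse row vectors (these names and formulas are assumptions, not taken from the original project):

import numpy as np


def tf(freq, max_freq):
    # Assumed: augmented term frequency, normalised by the most frequent
    # term so that longer queries do not dominate the weight.
    return 0.5 + 0.5 * (freq / max_freq)


def cosine_distance(tf_idf, query_w):
    # Assumed shape conventions:
    #   tf_idf  -- (n_docs, n_tokens) sparse document weight matrix
    #   query_w -- (1, n_tokens) sparse query weight vector
    # Returns an (n_docs, 1) array of cosine distances (smaller = closer),
    # matching the distances[:, 0] indexing used in run above.
    dot = tf_idf.dot(query_w.T).toarray()                   # (n_docs, 1)
    doc_norms = np.sqrt(np.asarray(tf_idf.multiply(tf_idf).sum(axis=1)))
    query_norm = np.sqrt(query_w.multiply(query_w).sum())
    denom = doc_norms * query_norm
    denom[denom == 0] = 1.0                                  # guard against empty rows
    return 1.0 - dot / denom

With np.argsort applied to these distances, the closest documents come first, and the Page slice then selects a single page of results.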
Example #2
def preprocess_one(raw_file):
    '''
    Procedure:
        1. create document
        2. fill with the data

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    document = Document()

    document.text = raw_file
    document.identifier = text_hash(raw_file)
    document.content_hash = document.identifier
    document.tokens = tokenize_text(raw_file)
    document.bag = bag(document.tokens)

    return document
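preprocess_one also leans on helpers that are not part of the snippet: Document, text_hash, tokenize_text and bag. A plausible, illustrative set of definitions, assuming a SHA-1 content hash, a simple regex tokenizer and a plain bag-of-words counter (none of these choices are confirmed by the source), could be:

import hashlib
import re
from collections import Counter


class Document:
    # Assumed: a plain container; the real class may carry more fields.
    def __init__(self):
        self.text = None
        self.identifier = None
        self.content_hash = None
        self.tokens = []
        self.bag = {}


def text_hash(text):
    # Assumed: hex digest of the raw text, used as the document identifier.
    return hashlib.sha1(text.encode('utf-8')).hexdigest()


def tokenize_text(text):
    # Assumed: lower-case and split on non-word characters; the original
    # pipeline may add stemming or stop-word removal.
    return [t for t in re.split(r'\W+', text.lower()) if t]


def bag(tokens):
    # Assumed: bag-of-words mapping token -> raw frequency.
    return dict(Counter(tokens))

With these stubs, preprocess_one("the quick brown fox") returns a Document whose identifier equals its content_hash and whose bag maps each token to its count.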