Example #1
0
    def append(self, raw_file):
        '''
        Register a new raw file and pass it to every prepared algorithm.

        The file is stored in ``self.files`` under the hash of its text;
        afterwards ``preprocess_one`` is called with it on each entry of
        ``self.prepared_algorithms``.

        Args:
            raw_file: file represented as string
        '''
        key = text_hash(raw_file)
        self.files[key] = raw_file

        # only the algorithm objects are needed here, not their names
        for prepared in self.prepared_algorithms.values():
            prepared.preprocess_one(raw_file)
Example #2
0
    def insert(self, content):
        '''
        Private method for mongodb insertion operation.

        Documents are identified by the hash of their content. If a document
        with the same identifier is already stored, it is returned as is and
        nothing is written.

        Args:
            content: content to store; it is hashed with text_hash to
                build the document identifier

        Returns:
            The document as read back from self.documents: either the
            already existing one or the one that has just been inserted.
        '''
        content_hash = text_hash(content)
        query = {"identifier": content_hash}

        # find_one replaces find().count() followed by [0]: Cursor.count()
        # was removed in PyMongo 4 and the old form needed two round trips.
        # NOTE(review): assumes self.documents is a pymongo Collection.
        existing = self.documents.find_one(query)
        if existing is not None:
            return existing

        self.documents.insert_one({
            'content': content,
            'identifier': content_hash,
            'date': datetime.datetime.utcnow(),
        })
        print("Inserted: %s" % content_hash)

        # read the document back so the caller gets it as stored
        return self.documents.find_one(query)
Example #3
0
    def insert(self, content):
        '''
        Private method for mongodb insertion operation.

        Documents are identified by the hash of their content. If a document
        with the same identifier is already stored, it is returned as is and
        nothing is written.

        Args:
            content: content to store; it is hashed with text_hash to
                build the document identifier

        Returns:
            The document as read back from self.documents: either the
            already existing one or the one that has just been inserted.
        '''
        content_hash = text_hash(content)
        query = {"identifier": content_hash}

        # find_one replaces find().count() followed by [0]: Cursor.count()
        # was removed in PyMongo 4 and the old form needed two round trips.
        # NOTE(review): assumes self.documents is a pymongo Collection.
        existing = self.documents.find_one(query)
        if existing is not None:
            return existing

        self.documents.insert_one({
            'content': content,
            'identifier': content_hash,
            'date': datetime.datetime.utcnow(),
        })
        print("Inserted: %s" % content_hash)

        # read the document back so the caller gets it as stored
        return self.documents.find_one(query)
Example #4
0
def preprocess_one(raw_file):
    '''
    Build a Document out of a single raw text.

    The document receives the original text, the hash of the text
    (stored both as ``identifier`` and ``content_hash``), the result of
    ``tokenize_text`` and the ``bag`` computed from these tokens.

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    doc = Document()
    doc.text = raw_file

    # the text hash is used as identifier and as content hash
    doc.identifier = text_hash(raw_file)
    doc.content_hash = doc.identifier

    doc.tokens = tokenize_text(raw_file)
    doc.bag = bag(doc.tokens)
    return doc
Example #5
0
def preprocess_one(raw_file):
    '''
    Build a Document out of a single raw text.

    The document receives the original text, the hash of the text
    (stored both as ``identifier`` and ``content_hash``), the result of
    ``tokenize_text`` and the ``bag`` computed from these tokens.

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    doc = Document()
    doc.text = raw_file

    # the text hash is used as identifier and as content hash
    doc.identifier = text_hash(raw_file)
    doc.content_hash = doc.identifier

    doc.tokens = tokenize_text(raw_file)
    doc.bag = bag(doc.tokens)
    return doc
Example #6
0
        command_line = shlex.split(input('Infinity > '))
        if '-e' in command_line:
            print("Bye!")
            break

        # get arguments
        query = arg.get_cl(command_line, 'q', QUERY)
        algorithm_name = arg.get_cl(command_line, 'a', ALGORITHM)
        number = int(arg.get_cl(command_line, 'n', RESULTS))
        document = arg.get_cl(command_line, 'd', DOCUMENT)

        # store new document if user specified -d option
        # and ask user for next command because add document
        # option has a higher priority than other commands
        if document is not '':
            document_hash = text_hash(document)
            files[document_hash] = document
            algorithm_box.append(document)
            logger.info('New document: key = %s, content = %s' %
                        (document_hash, document))
            continue

        # otherwise try to execute the algorithm
        try:
            page = Page(0, number)
            algorithm = algorithm_box.algorithm(algorithm_name)
            rank = algorithm.run(query, page)
            print()
            logger.info("Result: %s" % rank)
            print()
        except Exception as e:
Example #7
0
        command_line = shlex.split(input('Infinity > '))
        if '-e' in command_line:
            print("Bye!")
            break

        # get arguments
        query = arg.get_cl(command_line, 'q', QUERY)
        algorithm_name = arg.get_cl(command_line, 'a', ALGORITHM)
        number = int(arg.get_cl(command_line, 'n', RESULTS))
        document = arg.get_cl(command_line, 'd', DOCUMENT)

        # store new document if user specified -d option
        # and ask user for next command because add document
        # option has a higher priority than other commands
        if document is not '':
            document_hash = text_hash(document)
            files[document_hash] = document
            algorithm_box.append(document)
            logger.info('New document: key = %s, content = %s' %
                        (document_hash, document))
            continue

        # otherwise try to execute the algorithm
        try:
            page = Page(0, number)
            algorithm = algorithm_box.algorithm(algorithm_name)
            rank = algorithm.run(query, page)
            print()
            logger.info("Result: %s" % rank)
            print()
        except Exception as e: