def append(self, raw_file):
    '''
    Add raw_file to the files and update all prepared algorithms.

    The file is stored in ``self.files`` under the key returned by
    ``text_hash(raw_file)``; an entry already stored under that key is
    overwritten. Every algorithm held in ``self.prepared_algorithms`` is
    then handed the new file through its ``preprocess_one`` method.

    Args:
        raw_file: file represented as string
    '''
    self.files[text_hash(raw_file)] = raw_file

    # Only the algorithm objects are needed here, not their names, so
    # iterate over the values of the mapping directly.
    # NOTE(review): the value returned by preprocess_one is discarded;
    # presumably each algorithm updates its own state - confirm.
    for algorithm in self.prepared_algorithms.values():
        algorithm.preprocess_one(raw_file)
def insert(self, content):
    '''
    Private method for mongodb insertion operation.

    Looks the content up by its hash (``text_hash(content)``, stored in
    the ``identifier`` field). When a document with that identifier
    already exists it is returned unchanged and nothing is written.
    Otherwise a new document with the fields ``content``, ``identifier``
    and ``date`` (current UTC time) is inserted.

    Args:
        content: text to store

    Returns:
        The document as read back from ``self.documents``: either the
        already existing one or the freshly inserted one.
    '''
    content_hash = text_hash(content)
    lookup = {"identifier": content_hash}

    # find_one() replaces the former find() + Cursor.count() + [0]
    # sequence: Cursor.count() no longer exists in PyMongo 4, and
    # find_one() answers "is there one, and which" in a single query.
    existing = self.documents.find_one(lookup)
    if existing is not None:
        return existing

    self.documents.insert_one({
        'content': content,
        'identifier': content_hash,
        'date': datetime.datetime.utcnow(),
    })
    print("Inserted: %s" % content_hash)

    # Read the document back so the caller receives it exactly as stored,
    # the same shape as the "already exists" branch above.
    return self.documents.find_one(lookup)
def preprocess_one(raw_file):
    '''
    Build a Document from a raw text.

    The new Document carries the original text, its hash (stored both as
    ``identifier`` and as ``content_hash``), the tokens produced by
    ``tokenize_text`` and the result of ``bag`` applied to those tokens.

    Args:
        raw_file: text (string)

    Returns:
        Document
    '''
    doc = Document()

    # Derive everything from the raw text first ...
    digest = text_hash(raw_file)
    tokens = tokenize_text(raw_file)

    # ... then fill the document in one go.
    doc.text = raw_file
    doc.identifier = digest
    doc.content_hash = digest
    doc.tokens = tokens
    doc.bag = bag(tokens)
    return doc
command_line = shlex.split(input('Infinity > ')) if '-e' in command_line: print("Bye!") break # get arguments query = arg.get_cl(command_line, 'q', QUERY) algorithm_name = arg.get_cl(command_line, 'a', ALGORITHM) number = int(arg.get_cl(command_line, 'n', RESULTS)) document = arg.get_cl(command_line, 'd', DOCUMENT) # store new document is user specified -d option # and ask user for next command because add document # option has a higher priority than other commands if document is not '': document_hash = text_hash(document) files[document_hash] = document algorithm_box.append(document) logger.info('New document: key = %s, content = %s' % (document_hash, document)) continue # otherwise try to execute the algorithm try: page = Page(0, number) algorithm = algorithm_box.algorithm(algorithm_name) rank = algorithm.run(query, page) print() logger.info("Result: %s" % rank) print() except Exception as e: