def parse(name, content, doc_path=""): word_list = [] for line in content.split("\n"): for word in line.split(): word_list.append(word) term_list = tokenize(word_list) doc = Document(name, term_list) if doc_path != "": create_document(doc, doc_path) return doc
def parse(name, content, doc_path = ""): word_list = [] for line in content.split("\n"): for word in line.split(): word_list.append(word) term_list = tokenize(word_list) doc = Document(name, term_list) if doc_path != "": create_document(doc, doc_path) return doc
def query(self, search_text):
    """Score every known document against search_text by Jaccard similarity.

    Args:
        search_text: Whitespace-separated query string; its words are
            normalized through tk.tokenize().

    Returns:
        dict mapping doc_id -> Jaccard index (|A & B| / |A | B|) between
        the query's term set and that document's term set; 0 when both
        sets are empty.
    """
    query_terms = set(tk.tokenize(search_text.split()))
    score_dict = {}
    # self.docSpace.vector_dict maps doc_id -> {term: weight, ...}
    for doc_id, doc_term_dict in self.docSpace.vector_dict.items():
        doc_terms = set(doc_term_dict)  # iterating a dict yields its keys
        union_size = len(query_terms | doc_terms)
        if union_size == 0:
            # Both term sets empty: define similarity as 0 rather than 0/0.
            score_dict[doc_id] = 0
        else:
            # Py3 "/" is true division already; the float() casts were redundant.
            score_dict[doc_id] = len(query_terms & doc_terms) / union_size
    return score_dict
def query(self, search_text):
    """Score every known document against search_text by Jaccard similarity.

    NOTE(review): duplicate of the query() defined just above it; Python
    keeps only the last definition, so one copy should be removed.

    Args:
        search_text: Whitespace-separated query string; its words are
            normalized through tk.tokenize().

    Returns:
        dict mapping doc_id -> Jaccard index between the query's term set
        and that document's term set; 0 when both sets are empty.
    """
    query_terms = set(tk.tokenize(search_text.split()))
    score_dict = {}
    for doc_id, doc_term_dict in self.docSpace.vector_dict.items():
        # set(dict) takes the keys directly; set(list(d.keys())) built a
        # throwaway list for nothing.
        doc_terms = set(doc_term_dict)
        union_size = len(query_terms | doc_terms)
        if union_size == 0:
            score_dict[doc_id] = 0  # avoid 0/0 when both sets are empty
        else:
            # True division in Py3 makes the float() casts redundant.
            score_dict[doc_id] = len(query_terms & doc_terms) / union_size
    return score_dict
def get_term_list(self, phrase):
    """Split *phrase* on whitespace and return its tokenized term list."""
    return tk.tokenize(phrase.split())