Ejemplo n.º 1
0
def parse(name, content, doc_path=""):
    """Parse raw text into a tokenized Document.

    Args:
        name: Identifier for the resulting document.
        content: Raw text whose whitespace-separated words are tokenized.
        doc_path: Optional path; when non-empty the document is also
            persisted via create_document().

    Returns:
        The constructed Document.
    """
    # str.split() with no argument splits on any whitespace, including
    # newlines, so the original nested line/word loops collapse to one call.
    word_list = content.split()
    term_list = tokenize(word_list)
    doc = Document(name, term_list)
    if doc_path:  # empty string means "do not persist"
        create_document(doc, doc_path)
    return doc
Ejemplo n.º 2
0
def parse(name, content, doc_path=""):
    """Parse raw text into a tokenized Document.

    Args:
        name: Identifier for the resulting document.
        content: Raw text whose whitespace-separated words are tokenized.
        doc_path: Optional path; when non-empty the document is also
            persisted via create_document().

    Returns:
        The constructed Document.
    """
    # NOTE: str.split() with no argument already splits on every run of
    # whitespace (newlines included), replacing the nested loops.
    # Also fixed PEP 8: no spaces around '=' in a default argument.
    word_list = content.split()
    term_list = tokenize(word_list)
    doc = Document(name, term_list)
    if doc_path:  # falsy "" skips persistence
        create_document(doc, doc_path)
    return doc
Ejemplo n.º 3
0
 def query(self, search_text):
     """Score every document by Jaccard similarity to the query text.

     Args:
         search_text: Free-text query; split on whitespace then tokenized.

     Returns:
         Dict mapping doc_id -> Jaccard index (|A & B| / |A | B|) between
         the query term set and that document's term set. 0 when both
         sets are empty (avoids ZeroDivisionError).
     """
     word_list = search_text.split()
     term_set = set(tk.tokenize(word_list))
     score_dict = {}
     for doc_id, doc_term_dict in self.docSpace.vector_dict.items():
         # Iterating a dict yields its keys; no need for set(list(d.keys())).
         doc_term_set = set(doc_term_dict)
         intersection = len(term_set & doc_term_set)
         union = len(term_set | doc_term_set)
         if union == 0:
             ji = 0
         else:
             ji = float(intersection) / float(union)
         score_dict[doc_id] = ji
     return score_dict
Ejemplo n.º 4
0
 def query(self, search_text):
     """Rank documents against *search_text* using the Jaccard index.

     Args:
         search_text: Free-text query; split on whitespace then tokenized.

     Returns:
         Dict mapping doc_id -> Jaccard similarity (0.0..1.0) between the
         query term set and that document's term set; 0 when both term
         sets are empty, so no division by zero.
     """
     query_terms = set(tk.tokenize(search_text.split()))
     score_dict = {}
     for doc_id, doc_term_dict in self.docSpace.vector_dict.items():
         # A dict iterates over its keys, so set(doc_term_dict) replaces
         # the redundant set(list(doc_term_dict.keys())).
         doc_terms = set(doc_term_dict)
         union = len(query_terms | doc_terms)
         if union == 0:
             score_dict[doc_id] = 0
         else:
             score_dict[doc_id] = float(len(query_terms & doc_terms)) / float(union)
     return score_dict
Ejemplo n.º 5
0
 def get_term_list(self, phrase):
     """Split *phrase* on whitespace and return its tokenized term list."""
     words = phrase.split()
     return tk.tokenize(words)