Example #1
import math

import tokenizer
import inverted_idx


# Smooth scaling: idf(t) = 1 + ln(N / (1 + df(t))), where N is the number of
# documents and df(t) is the number of documents whose postings contain t.
def calculate(inverted_idx, documents):
    idf = {}
    doc_no = len(documents)
    for term in inverted_idx:
        idf[term] = 1 + math.log(doc_no /
                                 (1 + len(inverted_idx[term].postings)))
    return idf


if __name__ == "__main__":
    docs = {
        "0": "abc alo ola 456 zzz ola",
        "1": "alo ola 321 123",
        "2": "hello 123 456 123",
        "3": "hello alo ola abc 123 456 zzz",
        "4": "123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123"
    }
    inv_idx = inverted_idx.gen_inverted_idx(docs)

    res = calculate(inv_idx, docs)
    print(res)
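
The snippet above assumes an inverted_idx module whose gen_inverted_idx(docs) returns, for each term, an entry exposing a postings collection keyed by document id, so that len(entry.postings) is the term's document frequency. A minimal sketch of such a module, purely to make the example self-contained (the TermEntry name and the whitespace tokenization are assumptions, not the original module's API):

from dataclasses import dataclass, field


@dataclass
class TermEntry:
    # doc id -> term frequency for each document that contains the term
    postings: dict = field(default_factory=dict)


def gen_inverted_idx(docs):
    """Build {term: TermEntry} from {doc_id: text}, splitting on whitespace."""
    index = {}
    for doc_id, text in docs.items():
        for token in text.split():
            entry = index.setdefault(token, TermEntry())
            entry.postings[doc_id] = entry.postings.get(doc_id, 0) + 1
    return index

Under that assumption, a term such as "zzz" occurs in two of the five sample documents, so its smoothed IDF is 1 + ln(5 / (1 + 2)) ≈ 1.51.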
Example #2
def __init__(self, docs, class_map):
    self.inverted_idx, self.terms_in_doc = inverted_idx.gen_inverted_idx(
        docs, class_map)
    self.docs = docs
    self.idx_map = []
    self._convert_to_vectors()
Example #3
def __init__(self, docs):
    self.inverted_idx = inverted_idx.gen_inverted_idx(docs)
    self.docs = docs
    self._convert_to_vectors()
Example #4
def __init__(self, docs):
    self.inverted_idx = inverted_idx.gen_inverted_idx(docs)
Example #5
import math

import inverted_idx


# Smooth scaling: idf(t) = 1 + ln(N / (1 + df(t)))
def calculate(inverted_idx, documents):
    idf = {}
    doc_no = len(documents)
    for term in inverted_idx:
        idf[term] = 1 + math.log(doc_no /
                                 (1 + len(inverted_idx[term].postings)))
    return idf


if __name__ == "__main__":
    docs = {
        "0": "abc alo ola 456 zzz ola",
        "1": "alo ola 321 123",
        "2": "hello 123 456 123",
        "3": "hello alo ola abc 123 456 zzz",
        "4": "123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123 123"
    }
    inv_idx, terms_in_doc = inverted_idx.gen_inverted_idx(docs)

    res = calculate(inv_idx, docs)
    print(res)
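
Unlike Example #1, this variant unpacks two return values, so gen_inverted_idx is assumed here to also return a per-document token count. A hedged sketch of that variant, reusing the TermEntry structure sketched under Example #1 (the terms_in_doc name and meaning are assumptions):

def gen_inverted_idx(docs):
    """Return (index, terms_in_doc): a {term: TermEntry} index plus a map
    from doc id to the number of tokens in that document."""
    index = {}
    terms_in_doc = {}
    for doc_id, text in docs.items():
        tokens = text.split()
        terms_in_doc[doc_id] = len(tokens)
        for token in tokens:
            entry = index.setdefault(token, TermEntry())
            entry.postings[doc_id] = entry.postings.get(doc_id, 0) + 1
    return index, terms_in_doc

The calculate function only uses the index, so terms_in_doc goes unused in this example; it would presumably feed a term-frequency step such as the vector conversion in Example #2.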