Example #1
def naive_bayes(data):
    dictionary_description = dict()
    dictionary_title = dict()
    count_y = 0
    count_n = 0
    total_y_des = 0
    total_n_des = 0
    total_y_title = 0
    total_n_title = 0
    for index, row in data.iterrows():
        view = row['views']
        if view == 1:
            count_y += 1
        else:
            count_n += 1
        # description
        description_vector = DictionaryProcess(
            row['description']).prepare_text()
        for token in description_vector:
            if token not in dictionary_description:
                dictionary_description[token] = [0, 0]
            if view == 1:
                total_y_des += 1
                dictionary_description[token][0] += 1
            else:
                total_n_des += 1
                dictionary_description[token][1] += 1
        # title
        title_vector = DictionaryProcess(row['title']).prepare_text()
        for token in title_vector:
            if token not in dictionary_title:
                dictionary_title[token] = [0, 0]
            if view == 1:
                total_y_title += 1
                dictionary_title[token][0] += 1
            else:
                total_n_title += 1
                dictionary_title[token][1] += 1

    # p(view = 1) & p(view = -1)
    pvy = count_y / (count_n + count_y)
    pvn = count_n / (count_n + count_y)

    return (dictionary_description, total_y_des,
            total_n_des), (dictionary_title, total_y_title,
                           total_n_title), (pvy, pvn)
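
A minimal usage sketch for the trainer above, assuming a pandas DataFrame with 'views', 'description', and 'title' columns and the project's DictionaryProcess helper (whose prepare_text() must return a token list); the toy rows are purely hypothetical:

import pandas as pd

data = pd.DataFrame({
    'views': [1, -1],  # class label: 1 = highly viewed, -1 = not
    'description': ['great talk about science', 'boring talk'],
    'title': ['science wins', 'nothing here'],
})

des_table, title_table, p_view = naive_bayes(data)
# des_table == (token -> [class-1 count, class-(-1) count],
#               total class-1 tokens, total class-(-1) tokens)
# p_view == (P(view = 1), P(view = -1)), the class priors
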
Example #2
 def indexing_single_doc(self, doc, file):
     doc_id = doc['id']
     if file == "ted_talk":
         self.ted_talk_doc_ids.add(doc_id)
     elif file == "persian_wiki":
         self.persian_doc_ids.add(doc_id)
     print('indexing doc:', doc_id, 'in', file)
     tokens_position = {}
     for subSection in doc.keys():
         if subSection == 'id':
             continue
         text = doc[subSection]
         dictionary_process = DictionaryProcess(text).prepare_text()
         for pos, token in enumerate(dictionary_process):
             token_key = IIDictionary.TokenKey(token, subSection)
             tokens_position[token_key.key()] = tokens_position.get(
                 token_key.key(), []) + [pos]
     for token_key_string in tokens_position.keys():
         posting_item = IIDictionary.PostingItem(doc_id)
         posting_item.positions = tokens_position[token_key_string]
         if file == "ted_talk":
             self.ted_talk_ii.merge_token_doc(token_key_string,
                                              posting_item)
             self.ted_talk_kg.merge_token_doc(
                 token_key_string.split("-")[0], doc_id)
         elif file == "persian_wiki":
             self.persian_ii.merge_token_doc(token_key_string, posting_item)
             self.persian_kg.merge_token_doc(
                 token_key_string.split("-")[0], doc_id)
Example #3
 def get_k_gram_dictionary(self, k_gram):
     ch = k_gram[0]
     persian = DictionaryProcess.check_persian(ch)
     if persian:
         return list(self.indexing.persian_kg.dictionary.get(k_gram, {}).keys())
     else:
         return list(self.indexing.ted_talk_kg.dictionary.get(k_gram, {}).keys())
Example #4
 def check_query_is_misspelled(self, word, sub_section):
     ch = word[0]
     persian = DictionaryProcess.check_persian(ch)
     if persian:
         return (word + "-" + sub_section) not in self.indexing.persian_ii.dictionary
     else:
         return (word + "-" + sub_section) not in self.indexing.ted_talk_ii.dictionary
Example #5
 def showing_every_word_that_contains_a_specific_bigram(bigram):
     if DictionaryProcess.check_persian(bigram[0]):
         postings = my_indexing.persian_kg.dictionary.get(bigram, {})
     else:
         postings = my_indexing.ted_talk_kg.dictionary.get(bigram, {})
     print("bigram:", bigram)
     for posting in postings.keys():
         print(posting)
Example #6
 def get_token_raw_tf_and_postings(self, token, sub_section):
     if DictionaryProcess.check_persian(token[0]):
         raw_tf, posting = self.index.persian_ii.dictionary.get(
             token + "-" + sub_section, [0, []])
     else:
         raw_tf, posting = self.index.ted_talk_ii.dictionary.get(
             token + "-" + sub_section, [0, []])
     return raw_tf, posting
Example #7
 def showing_posting_list_of_a_word(word, sub_section):
     if DictionaryProcess.check_persian(word[0]):
         postings = my_indexing.persian_ii.dictionary.get(
             word + "-" + sub_section, [0, []])
     else:
         postings = my_indexing.ted_talk_ii.dictionary.get(
             word + "-" + sub_section, [0, []])
     print("doc freq:", postings[0])
     for posting in postings[1]:
         print(posting)
Example #8
 def get_tokens(self):
     res = dict()
     for doc in self.json_document:
         for subSection in doc.keys() - ["views", "id"]:
             dictionary_process = DictionaryProcess(
                 doc[subSection]).prepare_text()
             dictionary_process = self.delete_stop_words(dictionary_process)
             for token in dictionary_process:
                 main_token: str = token + "-" + subSection
                 res.setdefault(main_token, set()).add(doc['id'])
     return res
Example #9
def classify(description_text, title_text):
    description_vector = DictionaryProcess(description_text).prepare_text()
    title_vector = DictionaryProcess(title_text).prepare_text()
    # if view is 1
    P_is_1 = math.log(p_view[0])
    for token in description_vector:
        Tct = 0
        if token in des_table[0]:
            Tct = des_table[0][token][0]
        p = (Tct + 1) / (des_table[1] + len(des_table[0]))
        P_is_1 += math.log(p)
    for token in title_vector:
        Tct = 0
        if token in title_table[0]:
            Tct = title_table[0][token][0]
        p = (Tct + 1) / (title_table[1] + len(title_table[0]))
        P_is_1 += math.log(p)
    # if view is -1
    P_is_not_1 = math.log(p_view[1])
    for token in description_vector:
        Tct = 0
        if token in des_table[0]:
            Tct = des_table[0][token][1]
        p = (Tct + 1) / (des_table[2] + len(des_table[0]))
        P_is_not_1 += math.log(p)
    for token in title_vector:
        Tct = 0
        if token in title_table[0]:
            Tct = title_table[0][token][1]
        p = (Tct + 1) / (title_table[2] + len(title_table[0]))
        P_is_not_1 += math.log(p)
    # classify
    result = 1
    if P_is_not_1 > P_is_1:
        result = -1
    return result
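
The globals des_table, title_table, and p_view that classify reads (and the math import it relies on) are exactly the three tuples returned by naive_bayes in Example #1, so the assumed wiring is:

import math

des_table, title_table, p_view = naive_bayes(data)
label = classify('a talk about science', 'science wins')
print(label)  # 1 if the view=1 log-posterior wins, else -1

Each per-token probability uses add-one (Laplace) smoothing, p = (Tct + 1) / (class token total + vocabulary size), with the vocabulary size taken as len(des_table[0]).
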
Example #10
 def _get_tokens_ltc(self, tokens, tokens_df):
     import math
     ltc = tokens
     # l part: logarithmic term frequency, 1 + log(tf)
     for key in ltc.keys():
         ltc[key] = 1 + math.log(ltc[key])
     # idf part: log(N / df), with N the size of the matching collection
     try:
         ch = list(tokens.keys())[0][0]
     except IndexError:
         ch = False
     if DictionaryProcess.check_persian(ch):
         N = len(self.index.persian_doc_ids)
     else:
         N = len(self.index.ted_talk_doc_ids)
     for key in ltc.keys():
         ltc[key] = ltc[key] * math.log(
             N / len(tokens_df.get(key, set([1]))))
     # norm part: cosine normalization
     ltc_weight = (self.dot_product(ltc, ltc)) ** 0.5
     for key in ltc.keys():
         ltc[key] = ltc[key] / ltc_weight
     return ltc
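
_get_tokens_ltc applies the "ltc" weighting of SMART notation: logarithmic tf (1 + log tf), idf (log N/df), then cosine normalization. The dot_product helper it calls is not part of this listing; a minimal sketch consistent with how it is used here (two sparse token-to-weight dicts in, a scalar out) could be:

 def dot_product(self, vec_a, vec_b):
     # sum the products of weights for tokens the two vectors share
     return sum(weight * vec_b[token]
                for token, weight in vec_a.items() if token in vec_b)
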
Example #11
    def get_vector_space_documents_and_tokens(self, docs):
        import math
        res = []
        words = list(self.tokens.keys())
        for doc in docs:
            doc_vector = [0] * len(words)
            doc_tokens_count = dict()
            for subSection in doc.keys() - ["views", "id"]:
                dictionary_process = DictionaryProcess(
                    doc[subSection]).prepare_text()
                dictionary_process = self.delete_stop_words(dictionary_process)
                for token in dictionary_process:
                    main_token = token + "-" + subSection
                    try:
                        index = words.index(main_token)
                    except ValueError:
                        index = -1
                    if index != -1:
                        doc_tokens_count[main_token] = doc_tokens_count.get(
                            main_token, 0) + 1
                        doc_vector[index] = math.log(
                            len(self.json_document) /
                            (len(self.tokens[main_token])))
                """ tf part """
                for token in dictionary_process:
                    main_token = token + "-" + subSection
                    try:
                        index = words.index(main_token)
                    except ValueError:
                        index = -1
                    if index != -1:
                        doc_vector[index] *= doc_tokens_count.get(
                            main_token, 0)

            res.append(doc_vector)
        return res
Example #12
 def get_query_data(self, query, sub_section):
     docs_tokens = {}
     query_tokens_norm = {}
     tokens_raw_tf = {}
     tokens_raw_df = {}
     query_tokens = DictionaryProcess(query).prepare_text()
     for token in query_tokens:
         """ query tokens """
         query_tokens_norm[token] = query_tokens_norm.get(token, 0) + 1
         raw_tf, postings = self.get_token_raw_tf_and_postings(
             token, sub_section)
         """ tf """
         tokens_raw_tf[token] = raw_tf
         for posting in postings:
             """ doc tokens """
             doc_id = posting.doc_id
             doc_tokens = docs_tokens.get(doc_id, {})
             doc_tokens[token] = len(posting.positions)
             docs_tokens[doc_id] = doc_tokens
             """ df """
             tokens_docs = tokens_raw_df.get(token, set())
             tokens_docs.add(doc_id)
             tokens_raw_df[token] = tokens_docs
     return docs_tokens, query_tokens_norm, tokens_raw_tf, tokens_raw_df
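
The four structures returned above: docs_tokens maps each candidate doc id to that doc's tf for every query term; query_tokens_norm holds raw query-term counts; tokens_raw_tf the collection-level tf per term; and tokens_raw_df the set of doc ids containing each term (its df). A hedged sketch of the intended follow-up inside the same class, with the ranking wiring inferred rather than taken from the source:

# hypothetical scoring step built on the pieces above
docs_tokens, query_norm, tf, df = self.get_query_data('machine learning', 'description')
query_ltc = self._get_tokens_ltc(query_norm, df)
scores = {doc_id: self.dot_product(query_ltc, self._get_tokens_ltc(doc_tokens, df))
          for doc_id, doc_tokens in docs_tokens.items()}
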
Example #13
def user_interface(my_indexing, search, check_query):
    def showing_posting_list_of_a_word(word, sub_section):
        if DictionaryProcess.check_persian(word[0]):
            postings = my_indexing.persian_ii.dictionary.get(
                word + "-" + sub_section, [0, []])
        else:
            postings = my_indexing.ted_talk_ii.dictionary.get(
                word + "-" + sub_section, [0, []])
        print("doc freq:", postings[0])
        for posting in postings[1]:
            print(posting)

    def showing_every_word_that_contains_a_specific_bigram(bigram):
        if DictionaryProcess.check_persian(bigram[0]):
            postings = my_indexing.persian_kg.dictionary.get(bigram, {})
        else:
            postings = my_indexing.ted_talk_kg.dictionary.get(bigram, {})
        print("bigram:", bigram)
        for posting in postings.keys():
            print(posting)

    print('phase 1 of MIR project')
    while True:
        print("if you wanna evaluate classifiers type v\n" +
              "if you wanna search type 's'\n" +
              "if you wanna test parts of project type 't'\n" +
              "if you wanna exit type e")

        command = input()
        if command == "v":
            evaluate_classifiers()
        if command == 'e':
            break
        elif command == 't':
            print("witch part of project do you wanna test? ", end='')
            command = input()
            if command == '1':
                l = input('1: prepare a text\n2: show most used words\n')
                if l == '1':
                    print('enter your text and on the last line type "exit":')
                    text = ''
                    while True:
                        a = input()
                        if a == 'exit':
                            break
                        text = text + '\n' + a
                    result = DictionaryProcess(text).prepare_text()
                    print(result)
                if l == '2':
                    print("ted_talk:")
                    print(
                        my_indexing.get_stop_words_set(
                            my_indexing.ted_talk_ii.dictionary))
                    print("persian:")
                    print(
                        my_indexing.get_stop_words_set(
                            my_indexing.persian_ii.dictionary))
            if command == '2':
                l = input(
                    '2: show the posting list of a word\n3: show the positions of a word in every doc\n4: show every word that contains a specific bigram\n'
                )
                if l == '2':
                    showing_posting_list_of_a_word(input("which word:"),
                                                   input("which section:"))
                if l == '3':
                    showing_posting_list_of_a_word(input("which word:"),
                                                   input("which section:"))
                if l == '4':
                    showing_every_word_that_contains_a_specific_bigram(
                        input("which bigram:"))
            if command == '3':
                l = input(
                    '1: storage size with variable byte code\n2: storage size with gamma code\n3: store in file\n'
                )
                if l == '1':
                    VB_size, size_without_compressing = CompressUtils.calculate_size_of_VBC(
                        my_indexing)
                    print("size without compressing: " +
                          str(size_without_compressing))
                    print("size after applying variable byye code: " +
                          str(VB_size))
                if l == '2':
                    gamma_size, size_without_compressing = CompressUtils.calculate_size_of_gamma(
                        my_indexing)
                    print("size without compressing: " +
                          str(size_without_compressing))
                    print("size after applying gamma code: " + str(gamma_size))

                if l == '3':
                    # CompressUtils.compress_with_gamma(my_indexing)
                    # token_freq = []
                    # for key in my_indexing.ted_talk_ii.dictionary.keys():
                    # token_freq.append(my_indexing.ted_talk_ii.dictionary.get(key)[0])
                    # CompressUtils.decode_with_gamma(my_indexing.ted_talk_ii.dictionary.keys(),token_freq)
                    pass
            if command == '4':
                l = input(
                    '1: show the corrected query\n2: calculate jaccard similarity of two words\n3: calculate edit distance of two words\n'
                )
                if l == '1':
                    res = check_query.spell_corrector(input('query: '),
                                                      input('subsection: '))
                    print(res)
                if l == '2':
                    print(
                        check_query.jaccard_similarity(input("first word: "),
                                                       input("second word: ")))
                if l == '3':
                    selected_word = input('selected_word: ')
                    word = input('word: ')
                    edit_distance_value = check_query.editDistance(
                        selected_word, word, len(selected_word), len(word))
                    print(edit_distance_value)
            if command == '5':
                pass

        elif command == 's':
            search.run()
Example #14
 def get_doc_class_with_id_and_query(self, query, doc_id):
     if DictionaryProcess.check_persian(query[0]):
         return self.index.persian_doc_class.get(doc_id, None)
     else:
         return self.index.ted_talk_doc_class.get(doc_id, None)