def naive_bayes(data):
    """Train a Naive Bayes classifier on the 'description' and 'title' fields,
    using the binary 'views' label (1 = highly viewed, otherwise -1)."""
    dictionary_description = dict()
    dictionary_title = dict()
    count_y = 0
    count_n = 0
    total_y_des = 0
    total_n_des = 0
    total_y_title = 0
    total_n_title = 0
    for index, row in data.iterrows():
        view = row['views']
        if view == 1:
            count_y += 1
        else:
            count_n += 1

        # description
        description_vector = DictionaryProcess(row['description']).prepare_text()
        for token in description_vector:
            if token not in dictionary_description:
                dictionary_description[token] = [0, 0]
            if view == 1:
                total_y_des += 1
                dictionary_description[token][0] += 1
            else:
                total_n_des += 1
                dictionary_description[token][1] += 1

        # title
        title_vector = DictionaryProcess(row['title']).prepare_text()
        for token in title_vector:
            if token not in dictionary_title:
                dictionary_title[token] = [0, 0]
            if view == 1:
                total_y_title += 1
                dictionary_title[token][0] += 1
            else:
                total_n_title += 1
                dictionary_title[token][1] += 1

    # priors: p(view = 1) and p(view = -1)
    pvy = count_y / (count_n + count_y)
    pvn = count_n / (count_n + count_y)
    return (dictionary_description, total_y_des, total_n_des), \
           (dictionary_title, total_y_title, total_n_title), \
           (pvy, pvn)
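# A minimal usage sketch of the trainer above. It assumes a pandas DataFrame with
# 'title', 'description' and 'views' columns (views is 1 or -1) and that the
# project's DictionaryProcess is importable; the sample rows are hypothetical.
import pandas as pd

train_data = pd.DataFrame([
    {'title': 'a talk about science', 'description': 'a long talk on science', 'views': 1},
    {'title': 'short clip', 'description': 'a short clip nobody watched', 'views': -1},
])
des_table, title_table, p_view = naive_bayes(train_data)
# des_table = ({token: [count in class 1, count in class -1]},
#              total tokens seen in class 1, total tokens seen in class -1)
# p_view = (P(view = 1), P(view = -1))
print(p_view)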
def indexing_single_doc(self, doc, file):
    doc_id = doc['id']
    if file == "ted_talk":
        self.ted_talk_doc_ids.add(doc_id)
    elif file == "persian_wiki":
        self.persian_doc_ids.add(doc_id)
    print('indexing doc:', doc_id, ' in ', file)

    # collect the positions of every (token, sub-section) pair in this document
    tokens_position = {}
    for subSection in doc.keys():
        if subSection == 'id':
            continue
        text = doc[subSection]
        dictionary_process = DictionaryProcess(text).prepare_text()
        for pos, token in enumerate(dictionary_process):
            token_key = IIDictionary.TokenKey(token, subSection)
            tokens_position.setdefault(token_key.key(), []).append(pos)

    # merge each posting into the inverted index and the k-gram index
    for token_key_string in tokens_position.keys():
        posting_item = IIDictionary.PostingItem(doc_id)
        posting_item.positions = tokens_position[token_key_string]
        if file == "ted_talk":
            self.ted_talk_ii.merge_token_doc(token_key_string, posting_item)
            self.ted_talk_kg.merge_token_doc(token_key_string.split("-")[0], doc_id)
        elif file == "persian_wiki":
            self.persian_ii.merge_token_doc(token_key_string, posting_item)
            self.persian_kg.merge_token_doc(token_key_string.split("-")[0], doc_id)
def get_k_gram_dictionary(self, k_gram):
    ch = k_gram[0]
    persian = DictionaryProcess.check_persian(ch)
    if persian:
        return list(self.indexing.persian_kg.dictionary.get(k_gram, {}).keys())
    else:
        return list(self.indexing.ted_talk_kg.dictionary.get(k_gram, {}).keys())
def check_query_is_misspelled(self, word, sub_section):
    ch = word[0]
    persian = DictionaryProcess.check_persian(ch)
    if persian:
        return self.indexing.persian_ii.dictionary.get(word + "-" + sub_section, None) is None
    else:
        return self.indexing.ted_talk_ii.dictionary.get(word + "-" + sub_section, None) is None
def showing_every_word_that_contains_a_specific_bigram(bigram):
    if DictionaryProcess.check_persian(bigram[0]):
        postings = my_indexing.persian_kg.dictionary.get(bigram, {})
    else:
        postings = my_indexing.ted_talk_kg.dictionary.get(bigram, {})
    print("bigram:", bigram)
    for posting in postings.keys():
        print(posting)
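# Self-contained sketch of the bigram (2-gram) index idea used by the k-gram
# dictionaries above: map every bigram to the set of vocabulary words containing
# it, then union the sets of a query's bigrams to get spelling candidates.
# The vocabulary below is hypothetical, not the project's real dictionary.
def build_bigram_index(vocabulary):
    index = {}
    for word in vocabulary:
        for i in range(len(word) - 1):
            index.setdefault(word[i:i + 2], set()).add(word)
    return index

vocab = ["science", "silence", "scene", "sense"]
bigram_index = build_bigram_index(vocab)
print(bigram_index.get("ce", set()))  # words containing the bigram "ce"

# candidate words sharing at least one bigram with a misspelled query
query = "sciance"
query_bigrams = {query[i:i + 2] for i in range(len(query) - 1)}
candidates = set().union(*(bigram_index.get(bg, set()) for bg in query_bigrams))
print(candidates)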
def get_token_raw_tf_and_postings(self, token, sub_section):
    if DictionaryProcess.check_persian(token[0]):
        raw_tf, posting = self.index.persian_ii.dictionary.get(
            token + "-" + sub_section, [0, []])
    else:
        raw_tf, posting = self.index.ted_talk_ii.dictionary.get(
            token + "-" + sub_section, [0, []])
    return raw_tf, posting
def showing_posting_list_of_a_word(word, sub_section):
    if DictionaryProcess.check_persian(word[0]):
        postings = my_indexing.persian_ii.dictionary.get(
            word + "-" + sub_section, [0, []])
    else:
        postings = my_indexing.ted_talk_ii.dictionary.get(
            word + "-" + sub_section, [0, []])
    print("doc freq:", postings[0])
    for posting in postings[1]:
        print(posting)
def get_tokens(self):
    res = dict()
    for doc in self.json_document:
        for subSection in doc.keys() - ["views", "id"]:
            dictionary_process = DictionaryProcess(doc[subSection]).prepare_text()
            dictionary_process = self.delete_stop_words(dictionary_process)
            for token in dictionary_process:
                main_token: str = token + "-" + subSection
                res[main_token] = res.get(main_token, set())
                res[main_token].add(doc['id'])
    return res
def classify(description_text, title_text):
    """Naive Bayes classification with add-one (Laplace) smoothing."""
    description_vector = DictionaryProcess(description_text).prepare_text()
    title_vector = DictionaryProcess(title_text).prepare_text()

    # log-probability of view == 1
    p_is_1 = math.log(p_view[0])
    for token in description_vector:
        tct = des_table[0][token][0] if token in des_table[0] else 0
        p = (tct + 1) / (des_table[1] + len(des_table[0]))
        p_is_1 += math.log(p)
    for token in title_vector:
        tct = title_table[0][token][0] if token in title_table[0] else 0
        p = (tct + 1) / (title_table[1] + len(title_table[0]))
        p_is_1 += math.log(p)

    # log-probability of view == -1
    p_is_not_1 = math.log(p_view[1])
    for token in description_vector:
        tct = des_table[0][token][1] if token in des_table[0] else 0
        p = (tct + 1) / (des_table[2] + len(des_table[0]))
        p_is_not_1 += math.log(p)
    for token in title_vector:
        tct = title_table[0][token][1] if token in title_table[0] else 0
        p = (tct + 1) / (title_table[2] + len(title_table[0]))
        p_is_not_1 += math.log(p)

    # pick the class with the larger log-probability
    return -1 if p_is_not_1 > p_is_1 else 1
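# Worked, self-contained illustration of the add-one (Laplace) smoothing term
# used in classify above: P(token | class) = (Tct + 1) / (total_tokens_in_class + |V|),
# accumulated in log space. The toy counts are made up for the example.
import math

Tct = 3                      # occurrences of the token in the class
total_tokens_in_class = 100  # total token count observed for the class
vocabulary_size = 50         # |V|, number of distinct tokens in the table
p = (Tct + 1) / (total_tokens_in_class + vocabulary_size)
print(p, math.log(p))        # 4 / 150 ~ 0.0267, added as log(p) to the class score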
def _get_tokens_ltc(self, tokens, tokens_df):
    import math
    ltc = tokens

    # l part: logarithmic term frequency, 1 + log(tf)
    for key in ltc.keys():
        ltc[key] = 1 + math.log(ltc.get(key, 0))

    # t part: idf, log(N / df), with N taken from the matching collection
    try:
        ch = list(tokens.keys())[0][0]
    except IndexError:
        ch = False
    if DictionaryProcess.check_persian(ch):
        N = len(self.index.persian_doc_ids)
    else:
        N = len(self.index.ted_talk_doc_ids)
    for key in ltc.keys():
        ltc[key] = ltc[key] * math.log(N / len(tokens_df.get(key, set([1]))))

    # c part: cosine normalization
    ltc_weight = (self.dot_product(ltc, ltc)) ** 0.5
    for key in ltc.keys():
        ltc[key] = ltc[key] / ltc_weight
    return ltc
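# Self-contained sketch of the ltc weighting computed above, on a toy query:
# l = 1 + log(tf), t = log(N / df), c = cosine normalization. The counts below
# (tf per query token, df per token, and N documents) are hypothetical.
import math

tf = {"music": 2, "brain": 1}    # raw term frequencies in the query
df = {"music": 30, "brain": 10}  # document frequencies in the collection
N = 1000                         # number of documents in the collection

ltc = {t: (1 + math.log(tf[t])) * math.log(N / df[t]) for t in tf}
norm = sum(w * w for w in ltc.values()) ** 0.5
ltc = {t: w / norm for t, w in ltc.items()}
print(ltc)                       # unit-length query vector in ltc weights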
def get_vector_space_documents_and_tokens(self, docs):
    import math
    res = []
    words = list(self.tokens.keys())
    for doc in docs:
        doc_vector = [0] * len(words)
        doc_tokens_count = dict()
        for subSection in doc.keys() - ["views", "id"]:
            dictionary_process = DictionaryProcess(doc[subSection]).prepare_text()
            dictionary_process = self.delete_stop_words(dictionary_process)

            # idf part: log(N / df) for every token that exists in the vocabulary
            for token in dictionary_process:
                main_token = token + "-" + subSection
                try:
                    index = words.index(main_token)
                except ValueError:
                    index = -1
                if index != -1:
                    doc_tokens_count[main_token] = doc_tokens_count.get(main_token, 0) + 1
                    doc_vector[index] = math.log(
                        len(self.json_document) / len(self.tokens[main_token]))

            # tf part: multiply the idf weight by the raw term frequency,
            # once per unique token so the dimension ends up as tf * idf
            for token in set(dictionary_process):
                main_token = token + "-" + subSection
                try:
                    index = words.index(main_token)
                except ValueError:
                    index = -1
                if index != -1:
                    doc_vector[index] *= doc_tokens_count.get(main_token, 0)
        res.append(doc_vector)
    return res
def get_query_data(self, query, sub_section):
    docs_tokens = {}
    query_tokens_norm = {}
    tokens_raw_tf = {}
    tokens_raw_df = {}
    query_tokens = DictionaryProcess(query).prepare_text()
    for token in query_tokens:
        # query tokens: raw term frequency of the token inside the query itself
        query_tokens_norm[token] = query_tokens_norm.get(token, 0) + 1
        raw_tf, postings = self.get_token_raw_tf_and_postings(token, sub_section)

        # tf
        tokens_raw_tf[token] = raw_tf
        for posting in postings:
            # doc tokens: term frequency of the token in each posting's document
            doc_id = posting.doc_id
            doc_tokens = docs_tokens.get(doc_id, {})
            doc_tokens[token] = len(posting.positions)
            docs_tokens[doc_id] = doc_tokens

            # df: the set of documents that contain the token
            tokens_docs = tokens_raw_df.get(token, set())
            tokens_docs.add(doc_id)
            tokens_raw_df[token] = tokens_docs
    return docs_tokens, query_tokens_norm, tokens_raw_tf, tokens_raw_df
def user_interface(my_indexing, search, check_query):
    def showing_posting_list_of_a_word(word, sub_section):
        if DictionaryProcess.check_persian(word[0]):
            postings = my_indexing.persian_ii.dictionary.get(
                word + "-" + sub_section, [0, []])
        else:
            postings = my_indexing.ted_talk_ii.dictionary.get(
                word + "-" + sub_section, [0, []])
        print("doc freq:", postings[0])
        for posting in postings[1]:
            print(posting)

    def showing_every_word_that_contains_a_specific_bigram(bigram):
        if DictionaryProcess.check_persian(bigram[0]):
            postings = my_indexing.persian_kg.dictionary.get(bigram, {})
        else:
            postings = my_indexing.ted_talk_kg.dictionary.get(bigram, {})
        print("bigram:", bigram)
        for posting in postings.keys():
            print(posting)

    print('phase 1 of MIR project')
    while True:
        print("if you want to evaluate classifiers, type 'v'\n" +
              "if you want to search, type 's'\n" +
              "if you want to test parts of the project, type 't'\n" +
              "if you want to exit, type 'e'")
        command = input()
        if command == "v":
            evaluate_classifiers()
        if command == 'e':
            break
        elif command == 't':
            print("which part of the project do you want to test? ", end='')
            command = input()
            if command == '1':
                l = input('1: prepare a text\n2: show the most used words\n')
                if l == '1':
                    print('enter your text and type "exit" on the last line:')
                    text = ''
                    while True:
                        a = input()
                        if a == 'exit':
                            break
                        text = text + '\n' + a
                    result = DictionaryProcess(text).prepare_text()
                    print(result)
                if l == '2':
                    print("ted_talk:")
                    print(my_indexing.get_stop_words_set(
                        my_indexing.ted_talk_ii.dictionary))
                    print("persian:")
                    print(my_indexing.get_stop_words_set(
                        my_indexing.persian_ii.dictionary))
            if command == '2':
                l = input('2: show the posting list of a word\n'
                          '3: show the positions of a word in every doc\n'
                          '4: show every word that contains a specific bigram\n')
                if l == '2':
                    showing_posting_list_of_a_word(input("which word: "),
                                                   input("which section: "))
                if l == '3':
                    showing_posting_list_of_a_word(input("which word: "),
                                                   input("which section: "))
                if l == '4':
                    showing_every_word_that_contains_a_specific_bigram(
                        input("which bigram: "))
            if command == '3':
                l = input('1: storage with variable byte code\n'
                          '2: storage with gamma code\n'
                          '3: store in file\n')
                if l == '1':
                    VB_size, size_without_compressing = CompressUtils.calculate_size_of_VBC(my_indexing)
                    print("size without compressing: " + str(size_without_compressing))
                    print("size after applying variable byte code: " + str(VB_size))
                if l == '2':
                    gamma_size, size_without_compressing = CompressUtils.calculate_size_of_gamma(my_indexing)
                    print("size without compressing: " + str(size_without_compressing))
                    print("size after applying gamma code: " + str(gamma_size))
                if l == '3':
                    # CompressUtils.compress_with_gamma(my_indexing)
                    # token_freq = []
                    # for key in my_indexing.ted_talk_ii.dictionary.keys():
                    #     token_freq.append(my_indexing.ted_talk_ii.dictionary.get(key)[0])
                    # CompressUtils.decode_with_gamma(my_indexing.ted_talk_ii.dictionary.keys(), token_freq)
                    pass
            if command == '4':
                l = input('1: show the corrected query\n'
                          '2: calculate the Jaccard similarity of two words\n'
                          '3: calculate the edit distance of two words\n')
                if l == '1':
                    res = check_query.spell_corrector(input('query: '),
                                                      input('subsection: '))
                    print(res)
                if l == '2':
                    print(check_query.jaccard_similarity(input("first word: "),
                                                         input("second word: ")))
                if l == '3':
                    selected_word = input('selected_word: ')
                    word = input('word: ')
                    edit_distance_value = check_query.editDistance(
                        selected_word, word, len(selected_word), len(word))
                    print(edit_distance_value)
            if command == '5':
                pass
        elif command == 's':
            search.run()
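# The compression menu above reports index sizes under variable byte and gamma
# codes via the project's CompressUtils. As a self-contained illustration (not
# the project's implementation), this sketch variable-byte-encodes a posting
# list's doc-id gaps: each number is split into 7-bit chunks and the final byte
# of each number has its high bit set as a terminator.
def vb_encode_number(n):
    bytes_out = []
    while True:
        bytes_out.insert(0, n % 128)
        if n < 128:
            break
        n //= 128
    bytes_out[-1] += 128                                 # mark the last byte of this number
    return bytes(bytes_out)

def vb_encode_postings(doc_ids):
    encoded = b""
    previous = 0
    for doc_id in sorted(doc_ids):
        encoded += vb_encode_number(doc_id - previous)   # store gaps, not raw doc ids
        previous = doc_id
    return encoded

print(list(vb_encode_postings([824, 829, 215406])))      # [6, 184, 133, 13, 12, 177]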
def get_doc_class_with_id_and_query(self, query, doc_id):
    if DictionaryProcess.check_persian(query[0]):
        return self.index.persian_doc_class.get(doc_id, None)
    else:
        return self.index.ted_talk_doc_class.get(doc_id, None)