def score_by_new_doc_(new_doc, doc, DICT):
    """Score ``new_doc`` against the word weights stored for ``doc``.

    ``new_doc`` is tokenized with ``data_processed.split_words``. Every
    resulting word that is a key of ``DICT[doc]`` contributes its stored
    value to the total; a word occurring several times contributes each
    time it occurs.

    Args:
        new_doc: Text to score; passed to ``data_processed.split_words``.
        doc: Key selecting the per-word weight mapping inside ``DICT``.
        DICT: Mapping ``doc -> {word: weight}``.

    Returns:
        The sum of the matching weights, or ``0`` when no word matches.
    """
    total = 0
    for token in data_processed.split_words(new_doc):
        # Looked up inside the loop on purpose: with no tokens, DICT is
        # never indexed, so an unknown ``doc`` does not raise.
        weights = DICT[doc]
        if token not in weights:
            continue
        total += weights[token]
    return total
def dict_for_all_doc(data):
    """Build the per-document word-weight dictionary for ``data``.

    Each document in ``data`` is tokenized with
    ``data_processed.split_words`` and every resulting word is mapped to
    ``tf_idf_modified(word, doc, K)``, where ``K`` is a module-level name.

    Args:
        data: Iterable of documents. Each document is used as a dictionary
            key, so it must be hashable.

    Returns:
        A dict ``{doc: {word: weight}}``. If a document appears more than
        once in ``data``, the last occurrence's mapping is kept.
    """
    return {
        document: {
            token: tf_idf_modified(token, document, K)
            for token in data_processed.split_words(document)
        }
        for document in data
    }
def tf_idf_standard(word, doc):
    """Return a binary-TF, base-10 IDF weight of ``word`` for ``doc``.

    TF is ``1`` when ``word`` is among ``data_processed.split_words(doc)``
    and ``0`` otherwise. DF is the number of entries of the module-level
    ``DATA`` for which ``word in entry`` is true.

    NOTE(review): if the entries of ``DATA`` are strings, the DF test is a
    substring match while the TF test is token membership -- confirm that
    this difference is intended.

    Args:
        word: Term to weigh.
        doc: Document the term frequency is taken from.

    Returns:
        ``tf * log10(len(DATA) / df)``, or ``0`` when no entry of ``DATA``
        contains ``word``.
    """
    tf = int(word in data_processed.split_words(doc))
    df = sum(1 for entry in DATA if word in entry)
    if not df:
        # The term appears nowhere in the corpus: IDF is undefined.
        return 0
    return tf * math.log(len(DATA) / df, 10)
def tf_idf(word, doc):
    """Return a length-normalized TF-IDF weight of ``word`` for ``doc``.

    The term frequency is normalized by document length: when ``word``
    occurs in ``str(doc)`` the TF value is ``sqrt(1 / sqrt(n))``, where
    ``n`` is the number of words returned by
    ``data_processed.split_words(doc)``; otherwise TF is ``0``. DF counts
    the entries of the module-level ``DATA`` whose ``str()`` contains
    ``word``.

    Args:
        word: Term to weigh; tested with ``in`` against ``str(doc)``.
        doc: Document the term frequency is taken from.

    Returns:
        ``tf * log10(len(DATA) / df)``. Returns ``0`` when ``doc`` yields
        no words or when no entry of ``DATA`` contains ``word``.
    """
    token_count = len(data_processed.split_words(doc))
    if token_count == 0:
        # A document without words has no defined normalization
        # (1 / sqrt(0)); report a zero weight instead of raising
        # ZeroDivisionError.
        return 0

    # Equivalent to token_count ** -0.25.
    normalized_weight = math.sqrt(1 / math.sqrt(token_count))
    tf = normalized_weight if word in str(doc) else 0

    df = sum(1 for entry in DATA if word in str(entry))
    if df == 0:
        # The term appears nowhere in the corpus: IDF is undefined.
        return 0
    return tf * math.log(len(DATA) / df, 10)
def dict_doc_score_(doc, KEY_LIST, sujet):
    """Map every word of ``doc`` to its ``tf_idf_`` score.

    The words come from ``data_processed.split_words(doc)``. According to
    the original note these cover single words and pairs of consecutive
    words; that depends on ``split_words``, which is not visible here.

    Args:
        doc: Document to score.
        KEY_LIST: Forwarded unchanged to ``tf_idf_``.
        sujet: Forwarded unchanged to ``tf_idf_``.

    Returns:
        A dict ``{word: tf_idf_(word, doc, KEY_LIST, sujet)}``.
    """
    scores = {}
    for token in data_processed.split_words(doc):
        scores[token] = tf_idf_(token, doc, KEY_LIST, sujet)
    return scores