class DialogDataProcessor:
    r"""Loads and pre-processes original dialog data.

    Attributes:
        dataset_name: A 'str' indicating the dataset name.
        output_data_dir: A 'str' indicating the output data directory's name.
        separator: A 'str' used to separate two utterances.
        min_kw_freq: An 'int' indicating the minimum keyword occurrence frequency.
        context_turns: An 'int' indicating the number of turns of each dialog context.
        set_names: A 'list' of 'str' containing the set names,
            e.g., ['train', 'validation', 'test'].
    """

    def __init__(self, dataset_name, output_data_dir,
                 separator, min_kw_freq,
                 context_turns, set_names):
        self.dataset_name = dataset_name
        self.output_data_dir = output_data_dir
        self.separator = separator
        self.min_kw_freq = min_kw_freq
        self.context_turns = context_turns
        self.set_names = set_names

        self._make_data_dir_if_not_exists()
        self._load_raw_dialog_data()

        # Initializes keyword extractor
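        # ('candi_extract' proposes candidate keywords; 'idf_extract' is used
        # later to pick out an utterance's keywords based on the IDF values.)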
        candi_keywords = self._obtain_candidate_keywords()
        idf_dict = self._calculate_idf()
        self.kw_extractor = KeywordExtractor(candi_keywords, idf_dict)

        self._obtain_and_save_uttr_kw_mapping()  # uttr_kw_mapping: (utterances -> keywords) mapping
        self._obtain_and_save_vocab()

    def process_original_data(self):
        for name in self.set_names:
            self.current_set_name = name
            print('\nStart processing {} set...'.format(name))
            print('-' * 50)
            self._obtain_original_dialogs()
            self._extract_original_dialogs_keywords()
            self._save_processed_original_dialogs()
            print('-' * 50)
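
    # Output layout produced for each set (paths are relative to this module;
    # derived from '_make_data_dir_if_not_exists' and
    # '_save_processed_original_dialogs'):
    #   ../data/<output_data_dir>/<set_name>/pair-1/original_dialog.text
    #   ../data/<output_data_dir>/<set_name>/pair-1/original_dialog.keyword
    #   ../data/<output_data_dir>/<set_name>/pair-1/original_dialog_response.text
    #   ../data/<output_data_dir>/<set_name>/pair-1/original_dialog_response_uni.text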

    def _make_data_dir_if_not_exists(self):
        output_data_path = '../data/{}'.format(self.output_data_dir)
        os.makedirs(output_data_path, exist_ok=True)
        for set_name in self.set_names:
            pair1_path = os.path.join(output_data_path, set_name, 'pair-1')
            os.makedirs(pair1_path, exist_ok=True)

    def _calculate_idf(self, load_file_if_exists=True):
        r"""Calculates and saves the IDF values for extracting keywords.

        Args:
            load_file_if_exists: A 'bool' indicating whether to load the IDF
                file if it exists.

        Returns:
            idf_dict: A 'dict' mapping each token to its IDF value.
        """
        if load_file_if_exists:
            idf_dict_name = '../data/{}/idf.dict'.format(self.output_data_dir)
            if os.path.isfile(idf_dict_name):
                with open(idf_dict_name, 'rb') as f:
                    idf_dict = pickle.load(f)
                print('Loading idf dict from {}'.format(idf_dict_name))
                print('idf dict size: ', len(idf_dict))
                return idf_dict

        print('Calculating idf...')

        # Calculates IDF
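        # idf(token) = log10(N / (df + 1)), where N is the total number of
        # utterances and df is the number of utterances containing the token;
        # the '+1' guards against division by zero.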
        counter = collections.Counter()
        total = 0.
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                total += 1
                counter.update(set(kw_tokenize(utterance)))
        idf_dict = {}
        for token, df in counter.items():
            idf_dict[token] = np.log10(total / (df + 1.))

        # Writes idf dict into file
        with open('../data/{}/idf.dict'.format(self.output_data_dir), 'wb') as f:
            pickle.dump(idf_dict, f)

        return idf_dict

    def _obtain_candidate_keywords(self, load_file_if_exists=True):
        r"""Obtains and saves the candidate keywords used for extracting keywords.

        Args:
            load_file_if_exists: A 'bool' indicating whether to load the
                candi_keywords file if it exists.

        Returns:
            candi_keywords: A 'list' containing all the candidate keywords.
        """
        if load_file_if_exists:
            candi_keywords_name = '../data/{}/candi_keywords.txt'.format(self.output_data_dir)
            if os.path.isfile(candi_keywords_name):
                with open(candi_keywords_name,'r') as f:
                    candi_keywords = [kw.strip() for kw in f.readlines()]
                print('Loading candidate keywords from {}'.format(candi_keywords_name))
                print('Total candidate keywords count: ', len(candi_keywords))
                return candi_keywords

        print('Obtaining candidate keywords...')

        # Initialization
        kw_counter = collections.Counter()
        kw_extractor = KeywordExtractor()

        # Counts the occurrences of each possible keyword.
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                kw_counter.update(kw_extractor.candi_extract(utterance))

        # Deletes keywords occurring fewer than 'min_kw_freq' times
        candi_keywords = [kw for kw, freq in kw_counter.most_common()
            if freq >= self.min_kw_freq]
        # Deletes keywords consisting of only a single letter
        candi_keywords = [kw for kw in candi_keywords if len(kw) >= 2]

        # Writes candidate keywords into file
        candidate_keywords_output_path = '../data/{}/candi_keywords.txt'.format(
            self.output_data_dir)
        with open(candidate_keywords_output_path,'w') as f:
            for keyword in candi_keywords:
                f.write(keyword + '\n')

        return candi_keywords

    def _obtain_and_save_uttr_kw_mapping(self, load_file_if_exists=True):
        r"""Obtains and saves the mapping that maps utterances into keywords they contain.

        Args:
            load_file_if_exists: A 'bool' indicating whether load mapping file if it exists.

        Returns:
            uttr_kw_mapping: A 'dict' containing utterances->keywords mapping.
        """
        if load_file_if_exists:
            uttr_kw_mapping_name = '../data/{}/uttr_kw.dict'.format(
                self.output_data_dir)
            if os.path.isfile(uttr_kw_mapping_name):
                with open(uttr_kw_mapping_name, 'rb') as f:
                    self.uttr_kw_mapping = pickle.load(f)
                print('Loading utterances->keywords mapping from {}'.format(
                    uttr_kw_mapping_name))
                print('(utterances -> keywords) mapping size: ',
                    len(self.uttr_kw_mapping))
                return

        print('Obtaining mapping from utterances to keywords...')

        # Extracts keywords to construct mapping
        self.uttr_kw_mapping = {}
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                cur_keywords = self.kw_extractor.idf_extract(utterance)
                self.uttr_kw_mapping[utterance] = cur_keywords
        print('(utterances -> keywords) mapping size: ', len(self.uttr_kw_mapping))

        # Writes uttr_kw_mapping into file
        with open('../data/{}/uttr_kw.dict'.format(self.output_data_dir), 'wb') as f:
            pickle.dump(self.uttr_kw_mapping, f)

    def _obtain_and_save_vocab(self, load_file_if_exists=True):
        r"""Obtains and saves the vocabulary of data.
        Args:
            load_file_if_exists: A 'bool' indicating whether load vocab file if it exists.

        Returns:
            vocab: A 'list' containing all the words occurring in the data.
        """
        if load_file_if_exists:
            vocab_name = '../data/{}/vocab.txt'.format(self.output_data_dir)
            if os.path.isfile(vocab_name):
                with open(vocab_name,'r') as f:
                    self.vocab = [word.strip() for word in f.readlines()]
                print('Loading vocab from {}'.format(vocab_name))
                print('Total vocab count: ', len(self.vocab))
                return

        print('Obtaining and saving vocab...')

        counter = collections.Counter()
        for dialog in tqdm(self.list_all_dialogs):
            for utterance in dialog:
                counter.update(simp_tokenize(utterance))
        print('Total vocab count: ', len(counter.items()))

        # Vocab sorted by occurrence frequency (descending), ties broken alphabetically
        self.vocab = [token for token, _ in
            sorted(list(counter.items()), key=lambda x: (-x[1], x[0]))]

        # Writes vocab into file
        with open('../data/{}/vocab.txt'.format(self.output_data_dir),'w') as f:
            for word in self.vocab:
                f.write(word + '\n')

    def _load_raw_dialog_data(self):
        r"""Loads raw dialog data from files.

        Returns:
            list_all_dialogs: A 'list' containing all the dialogues, where each
                dialogue is also a 'list' containing all the utterances of this dialogue.
            dict_categorized_dialogs: a 'dict' containing the dialogue list of
                training, validation and testing set.
        """
        print('Loading raw dialog data...')
        self.list_all_dialogs = []
        self.dict_categorized_dialogs = {}
        for set_name in self.set_names:
            current_dialog_path = os.path.join(self.raw_data_dir,
                                               set_name,
                                               'dialogues_{}.txt'.format(set_name))
            with open(current_dialog_path, 'r') as f:
                raw_dialog_data = f.readlines()
            for dialog_str in tqdm(raw_dialog_data):
                dialog = self._process_dialog_str(dialog_str)
                self.list_all_dialogs.append(dialog)
                self.dict_categorized_dialogs.setdefault(set_name, []).append(dialog)

    def _obtain_original_dialogs(self):
        # Augments the dialog data by dividing each dialog into several sub-dialogs.
        print('Obtaining original dialogs...')
        self.original_dialogs = []
        for dialog in tqdm(self.list_current_dialogs):
            self.original_dialogs.extend(
                self.split_dialog(dialog, self.context_turns))

    def _extract_original_dialogs_keywords(self):
        self.original_dialogs_keywords = []
        print('Extracting keywords in original dialogs...')
        for dialog in tqdm(self.original_dialogs):
            current_dialog_keywords = []
            for utterance in dialog:
                keywords_str = ' '.join(self.uttr_kw_mapping[utterance])
                current_dialog_keywords.append(keywords_str)
            self.original_dialogs_keywords.append(current_dialog_keywords)

    def _save_processed_original_dialogs(self):
        # Saves all processed original dialog data into files
        print('Writing original dialog data into files...')
        o_text_path = os.path.join(self.current_set_output_dir,
                                   'pair-1',
                                   'original_dialog.text')
        o_kw_path = os.path.join(self.current_set_output_dir,
                                 'pair-1',
                                 'original_dialog.keyword')
        o_res_text_path = os.path.join(self.current_set_output_dir,
                                       'pair-1',
                                       'original_dialog_response.text')
        o_uni_res_text_path = os.path.join(self.current_set_output_dir,
                                           'pair-1',
                                           'original_dialog_response_uni.text')
        str_original_dialogs = self.element_to_str(self.original_dialogs, '|||')
        str_original_dialogs_keywords = self.element_to_str(
            self.original_dialogs_keywords, '|||')
        str_original_responses = [dialog[-1] for dialog in self.original_dialogs]
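        # Deduplicates the joined dialog strings, then keeps each distinct
        # dialog's final utterance as its response.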
        uni_str_original_responses = [
            dialog.split('|||')[-1]
            for dialog in set(self.element_to_str(self.original_dialogs, '|||'))
        ]
        self.save(str_original_dialogs, o_text_path)
        self.save(str_original_dialogs_keywords, o_kw_path)
        self.save(str_original_responses, o_res_text_path)
        self.save(uni_str_original_responses, o_uni_res_text_path)

    def _process_dialog_str(self, dialog_str):
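        # Assumes each raw line ends with a trailing separator, so the last
        # element produced by split() carries no utterance and is dropped.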
        dialog = dialog_str.split(self.separator)[:-1]
        dialog = self.replace_content_in_dialog(dialog, old_content='.', new_content=' . ')
        dialog = self.replace_content_in_dialog(dialog, old_content='?', new_content=' ? ')
        dialog = self.replace_content_in_dialog(dialog, old_content=',', new_content=' , ')
        dialog = self.replace_content_in_dialog(dialog, old_content=' ’ ', new_content="'")
        dialog = [utterance.strip() for utterance in dialog]
        return dialog
# Private Methods - End
# -----------------------------------------------------------------------------

    @property
    def raw_data_dir(self):
        return './dataset/{}'.format(self.dataset_name)

    @property
    def list_current_dialogs(self):
        return self.dict_categorized_dialogs[self.current_set_name]

    @property
    def current_set_output_dir(self):
        return '../data/{}/{}/'.format(self.output_data_dir, self.current_set_name)

    @staticmethod
    def split_dialog(dialog, context_turns=1):
        r"""Split dialog into several sub-dialogs.
        Inputs: dialog, context_turns
            - **dialog**:        a 'list' containing utterances in the dialog
            - **context_turns**: how many turns of a dialogue containing in a context
        Outputs: sub_dialogs
            - **sub_dialogs**: a 'list' containing sub-dialogs
                            with respect to the current dialog

        Example:
            dialog: ['Hello!', 'Hi!', 'What's your name?', 'James.']

            assume context_turns = 1
        => (split dialog into contexts(previous utterance) and responses)

            contexts: [
                ['Hello!', 'Hi!'],
                ['Hi!', 'What's your name?'],
            ]
            responses: [
                ['What's your name?']
                ['James.']
            ]

        => (merge contexts and responses one by one)

            sub_dialogs: [
                ['Hello!', 'Hi!', 'What's your name?'],
                ['Hi!', 'What's your name?', 'James.']
            ]
        """
        num_uttr_in_context = context_turns * 2
        contexts = [
            dialog[i:i+num_uttr_in_context]
            for i in range(0, len(dialog) - num_uttr_in_context)
        ]
        responses = [[dialog[i]] for i in range(num_uttr_in_context, len(dialog))]
        sub_dialogs = [context + response for context, response in zip(contexts, responses)]
        return sub_dialogs

    @staticmethod
    def save(contents, output_path):
        with open(output_path, 'w') as f:
            for content in tqdm(contents):
                f.write(content + '\n')

    @staticmethod
    def element_to_str(contents, separator):
        # Each element in 'contents' is also a list.
        return [separator.join(element) for element in contents]

    @staticmethod
    def replace_content_in_dialog(dialog, old_content, new_content):
        r"""Replace specified content in the dialog with given new content.

        Inputs: dialog, separator
            - **dialog**:      a 'list' containing utterances in the dialog
            - **old_content**: a 'str' indicating the content needed to be replaced in the dialog
            - **new_content**: a 'str' indicating the content used to replace the old content
        Outputs: replaced_dialog
            - **replaced_dialog**: a 'list' containing all replaced utterances in the dialog

        Example:
            For an utterance ['Hello.My name is James . '],
            We wanna replace the '.' with ' . ', the procedure is as follow:
                1. first replace ' . ' with '.' obtained ['Hello.My name is James.']
                2. then replace '.' with ' . '  obtained ['Hello . My name is James . ']
        Note:
            if we replace 'old_content' with 'new_content' directly, in this example, we would get:
            ['Hello . My name is James  .  ']
        """
        # First replaces 'new_content' with 'old_content' to ensure that no
        # utterance already contains the specified 'new_content'.
        replaced_dialog = [utterance.replace(new_content, old_content) for utterance in dialog]
        replaced_dialog = [utterance.replace(old_content, new_content) for utterance in replaced_dialog]
        return replaced_dialog
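

# A minimal usage sketch. All argument values below are illustrative
# assumptions (e.g., '__eou__' as the utterance separator); adjust them to the
# dataset actually placed under './dataset/<dataset_name>'.
if __name__ == '__main__':
    processor = DialogDataProcessor(
        dataset_name='dailydialog',
        output_data_dir='dailydialog',
        separator='__eou__',
        min_kw_freq=10,
        context_turns=1,
        set_names=['train', 'validation', 'test'])
    processor.process_original_data()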