def save_corpus_one_extracted_feature_by_name(self, feature_name, left=0, right=0):
    """Run ``save_extracted_feature_by_name`` on every file of the corpus.

    File names are obtained from ``tp.get_filenames_from_dir(self._file_dir)``
    and each path is built by plain string concatenation of
    ``self._file_dir`` and the file name (no separator is inserted).

    Args:
        feature_name: Forwarded unchanged to the per-file helper.
        left: Forwarded unchanged to the per-file helper (presumably the
            amount of left context to keep -- confirm against the helper).
        right: Forwarded unchanged to the per-file helper (presumably the
            amount of right context to keep -- confirm against the helper).
    """
    corpus_dir = self._file_dir
    for name in tp.get_filenames_from_dir(corpus_dir):
        target_path = corpus_dir + name
        save_extracted_feature_by_name(target_path, feature_name, left, right)
def save_corpus_one_extracted_feature_by_regex(self, regex, left=0, right=0, feature_name=None):
    """Run ``save_extracted_feature_by_regex`` on every file of the corpus.

    File names are obtained from ``tp.get_filenames_from_dir(self._file_dir)``
    and each path is built by plain string concatenation of
    ``self._file_dir`` and the file name (no separator is inserted).

    Args:
        regex: Forwarded unchanged to the per-file helper.
        left: Forwarded unchanged to the per-file helper (presumably the
            amount of left context to keep -- confirm against the helper).
        right: Forwarded unchanged to the per-file helper (presumably the
            amount of right context to keep -- confirm against the helper).
        feature_name: Forwarded unchanged to the per-file helper; defaults
            to ``None``.
    """
    corpus_dir = self._file_dir
    for name in tp.get_filenames_from_dir(corpus_dir):
        target_path = corpus_dir + name
        save_extracted_feature_by_regex(target_path, regex, left, right, feature_name)
def save_corpus_extracted_features(self, left=0, right=0):
    """Extract and save every feature in ``fs.FEATURE_DICT`` for each corpus file.

    For each file name returned by ``tp.get_filenames_from_dir(self._file_dir)``
    the file is read, tagged with ``tp.get_modified_pos_tagged_text`` and then
    searched once per pattern in ``fs.FEATURE_DICT.values()``. Every result
    set is handed to ``save_extracted_feature_by_res``. A confirmation message
    is printed once the whole corpus has been processed.

    Args:
        left: Forwarded unchanged to ``save_extracted_feature_by_res``
            (presumably the amount of left context -- confirm against it).
        right: Forwarded unchanged to ``save_extracted_feature_by_res``
            (presumably the amount of right context -- confirm against it).
    """
    other_feature_patterns = list(fs.FEATURE_DICT.values())
    filenames = tp.get_filenames_from_dir(self._file_dir)
    for filename in filenames:
        # Paths are built by plain concatenation; computed once per file
        # instead of once per pattern.
        file_p = self._file_dir + filename
        # Use a context manager so the handle is closed deterministically
        # rather than being left to the garbage collector.
        with open(file_p, 'rb') as corpus_file:
            # NOTE(review): on Python 3, str() of a bytes object yields its
            # repr ("b'...'") rather than decoded text. Kept unchanged so the
            # text handed to the tagger stays the same -- confirm whether an
            # explicit decode is intended.
            raw_text = str(corpus_file.read())
        tagged_text = tp.get_modified_pos_tagged_text(raw_text)
        for pattern in other_feature_patterns:
            results = fs.feature_finder(pattern, tagged_text)
            feature_name = get_feature_name_by_regex(pattern)
            save_extracted_feature_by_res(file_p, results, feature_name, tagged_text, left, right)
    print('The extracted features of the corpus saved!')
def corpus_feature_fre_extraction(
    self,
    normalized_rate=100,
    save_tagged_corpus=True,
    save_extracted_features=True,
    left=0,
    right=0,
):
    """Collect per-file feature frequencies and write them to an Excel file.

    The first row of the result is a header: four fixed column names followed
    by the keys of ``fs.FEATURE_DICT``. One further row per corpus file is
    produced by ``get_single_file_feature_fre``. The rows are written via
    pandas to ``Feature_Fre_Extracted.xlsx`` (relative path) and also
    returned.

    Args:
        normalized_rate: Forwarded unchanged to ``get_single_file_feature_fre``.
        save_tagged_corpus: Forwarded unchanged to the per-file helper.
        save_extracted_features: Forwarded unchanged to the per-file helper.
        left: Forwarded unchanged to the per-file helper.
        right: Forwarded unchanged to the per-file helper.

    Returns:
        list: The header row followed by one row per corpus file.
    """
    header_row = ['Filename', 'Words', 'Mean word length', 'Type-token ratio']
    header_row.extend(fs.FEATURE_DICT.keys())
    freq_data = [header_row]

    corpus_dir = self._file_dir
    for name in tp.get_filenames_from_dir(corpus_dir):
        file_row = get_single_file_feature_fre(
            corpus_dir + name,
            normalized_rate,
            save_tagged_corpus,
            save_extracted_features,
            left,
            right,
        )
        freq_data.append(file_row)

    table = pd.DataFrame(freq_data)
    table.to_excel('Feature_Fre_Extracted.xlsx')
    return freq_data
def save_cleaned_corpus(self):
    """Run ``save_single_cleaned_text`` on every file of the corpus.

    File names come from ``tp.get_filenames_from_dir(self._file_dir)``; each
    path is ``self._file_dir`` concatenated with the file name. A confirmation
    message is printed after all files have been handled.
    """
    corpus_dir = self._file_dir
    for name in tp.get_filenames_from_dir(corpus_dir):
        target_path = corpus_dir + name
        save_single_cleaned_text(target_path)
    print('Cleaned corpus saved!')
def get_filepath_list(self):
    """Return the corpus file paths as a single newline-separated string.

    Each path is ``self._file_dir`` concatenated with a file name returned by
    ``tp.get_filenames_from_dir(self._file_dir)``. Despite the method name,
    the result is one ``str`` joined with ``'\\n'``, not a list.

    Returns:
        str: The paths joined by newlines (empty string when there are no
        files).
    """
    corpus_dir = self._file_dir
    names = tp.get_filenames_from_dir(corpus_dir)
    return '\n'.join(corpus_dir + name for name in names)