def analysis(directory: str = './zip', aggregate=False, key_file='keys.csv', res_file='res.xlsx'):
    """Group the entries of ``key_file`` and write the groups to ``res_file``.

    Args:
        directory: Handed to ``aggregate_files`` when ``aggregate`` is true
            (presumably the folder containing the zip archives -- confirm
            against ``aggregate_files``).
        aggregate: When true, ``aggregate_files(directory, key_file)`` is
            called before grouping.
        key_file: Passed to ``aggregate_files`` and to ``group``.
        res_file: Destination passed to ``write_excel``.

    Returns:
        Always ``True``.
    """
    if aggregate:
        aggregate_files(directory, key_file)

    # Grouping uses a fixed similarity threshold of 0.8.
    write_excel(group(key_file, threshold=0.8), res_file)
    return True
def analysis(
    directory: str = "./zip",
    aggregate=False,
    clusters_count=2,
    key_file="keys.csv",
    res_file="res.xlsx",
):
    """Analyze all zip files and write the result to ``res_file``.

    Args:
        directory: Handed to ``aggregate_files`` when ``aggregate`` is true
            (presumably the folder containing the zip archives).
        aggregate: When true, ``aggregate_files(directory, key_file)`` is
            called before grouping.
        clusters_count: Forwarded to ``group`` as ``clusters_count``.
        key_file: Name of the key file; resolved through ``get_path`` before
            it is passed to ``group``.
        res_file: Destination passed to ``write_excel``.

    Returns:
        Always ``True``.
    """
    if aggregate:
        aggregate_files(directory, key_file)

    # Resolve the caller-supplied key file instead of a hard-coded
    # "keys.csv", so the file that was aggregated is the one that is grouped.
    file_path = get_path(key_file)
    groups = group(file_path, clusters_count=clusters_count)
    write_excel(groups, res_file)
    return True
def analysis(
    directory: str = './zip',
    aggregate=False,
    clusters_count=2,
    file_count: int = 10,
    calc_times: bool = False,
    key_file='keys.csv',
    res_file='res.xlsx',
):
    """Analyze all zip files and write the groups to several Excel files.

    The cluster ids ``0 .. clusters_count - 1`` are split into ``file_count``
    contiguous ranges and each range is written to
    ``res/<n>_<res_file>`` (``n`` starting at 1).

    Args:
        directory: Handed to ``aggregate_files`` when ``aggregate`` is true
            (presumably the folder containing the zip archives).
        aggregate: When true, ``aggregate_files(directory, key_file)`` is
            called before grouping.
        clusters_count: Forwarded to ``group``; also the number of cluster
            ids distributed over the output files.
        file_count: Number of output files. Capped at ``clusters_count``.
            Expected to be at least 1.
        calc_times: Forwarded to ``group`` as ``calc_times``.
        key_file: Name of the key file; resolved through ``get_path`` before
            it is passed to ``group``.
        res_file: Base name of each output file.

    Returns:
        Always ``True``.
    """
    if aggregate:
        aggregate_files(directory, key_file)

    # Resolve the caller-supplied key file instead of a hard-coded
    # 'keys.csv', so the file that was aggregated is the one that is grouped.
    file_path = get_path(key_file)
    groups = group(file_path, clusters_count=clusters_count, calc_times=calc_times)

    # Never create more files than there are clusters.
    file_count = min(file_count, clusters_count)

    # Spread the remainder over the first `extra` files so that no cluster is
    # dropped when clusters_count is not a multiple of file_count. When the
    # division is exact this yields the same ranges as plain floor division.
    base, extra = divmod(clusters_count, file_count)
    start = 0
    for i in range(file_count):
        stop = start + base + (1 if i < extra else 0)
        subgroup = {t: groups.get(t) for t in range(start, stop)}
        write_excel(subgroup, f'res/{i + 1}_{res_file}')
        start = stop
    return True
def __init__(
    self,
    directory: str = './zip',
    aggregate: bool = True,
    key_file='keys.csv',
    res_file: str = 'res.xmind',
    depth: int = 4,
    width: int = 5,
):
    """Load the key texts and build the index over them.

    Args:
        directory: Handed to ``aggregate_files`` when ``aggregate`` is true
            (presumably the folder containing the zip archives).
        aggregate: When true, the texts come from ``aggregate_files``;
            otherwise they are read from ``key_file`` with pandas.
        key_file: CSV file name; read directly when ``aggregate`` is false
            and passed to ``aggregate_files`` as ``file_name`` otherwise.
        res_file: Stored on the instance as ``res_file``.
        depth: Stored on the instance as ``depth``.
        width: Stored on the instance as ``width``.
    """
    self.index = {}
    self.invert_index = {}
    self.ignore_topics = {}
    self.key_file = key_file
    self.res_file = res_file
    self.depth = depth
    self.width = width

    if aggregate:
        # Pass the configured key file rather than a hard-coded 'keys.csv' so
        # both branches agree on which file name is in use.
        texts = aggregate_files(directory, file_name=key_file, to_csv=False)
    else:
        texts = pd.read_csv(key_file, encoding='UTF_8_SIG')

    self._build_index(texts, get_stop_words())
def analysis(directory: str = "./zip", aggregate=False, key_file="keys.csv", res_file="res.xlsx"):
    """Group the contents of ``key_file`` and export the groups to ``res_file``.

    Args:
        directory: Only used when ``aggregate`` is true, as the first
            argument to ``aggregate_files``.
        aggregate: Whether to call ``aggregate_files(directory, key_file)``
            before grouping.
        key_file: Passed to ``aggregate_files`` and to ``group``.
        res_file: Destination passed to ``write_excel``.

    Returns:
        Always ``True``.
    """
    if aggregate:
        aggregate_files(directory, key_file)

    # The grouping threshold is fixed at 0.8.
    clusters = group(key_file, threshold=0.8)
    write_excel(clusters, res_file)
    return True