Example #1
0
def run_pca(params):
    """Run principal component analysis on the selected columns.

    Saves two tables — per-component statistics and the component matrix —
    in the requested ``file_format`` ('CSV' or 'XLSX').

    Expects in ``params``: job_id, url, columns, ncomponents, file_format.
    Returns the project-standard result dict (``ready(...)`` / ``error(...)``).
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    column_ids = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, column_ids]
    try:
        pca = PCA(n_components=params['ncomponents'])
        pca.fit(data)
        index = ["Дисперсии осей проекции (выборочная)", "Доля информации (доля от общей дисперсии)", "Сингулярное значение"]
        # One label per fitted component; also used as the row index of the
        # component matrix below (the original shadowed `columns` here).
        component_names = [f'Компонента {i}' for i in range(1, len(pca.components_) + 1)]
        components_stats_df = pd.DataFrame([pca.explained_variance_, pca.explained_variance_ratio_, pca.singular_values_], index=index, columns=component_names)
        components_df = pd.DataFrame(pca.components_, index=component_names, columns=data.columns)

    except Exception as e:
        return error(f'Ошибка при вычислении результата. {e}')

    # save output: the two tables previously went through two copy-pasted
    # blocks; write them through one loop instead.
    for out_df, basename in ((components_stats_df, 'components_stats'),
                             (components_df, 'components')):
        try:
            if params['file_format'] == 'CSV':
                file_path = generate_filename(root, 'pca', f'{basename}.csv')
                out_df.to_csv(file_path)
            elif params['file_format'] == 'XLSX':
                file_path = generate_filename(root, 'pca', f'{basename}.xlsx')
                out_df.to_excel(file_path)
            else:
                raise AttributeError  # unknown file_format -> handled below
            results.append(str(file_path))
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt
            return error('Ошибка при сохранении файла с результатом')

    return ready(results)
Example #2
0
def run_missingvalues(params):
    """Impute missing values in the selected columns with SimpleImputer.

    Expects in ``params``: job_id, url, columns, strategy (a StrategyEnum),
    missing_value, fill_value, file_format.  The imputed values are written
    back into the dataframe, which is then saved in the requested format.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]

    try:
        # Map the request enum onto the sklearn strategy name.
        if params['strategy'] == StrategyEnum.MEAN:
            strategy = 'mean'
        elif params['strategy'] == StrategyEnum.MEDIAN:
            strategy = 'median'
        elif params['strategy'] == StrategyEnum.MOST_FREQUENT:
            strategy = 'most_frequent'
        elif params['strategy'] == StrategyEnum.CONSTANT:
            strategy = 'constant'
        else:
            # Previously an unmatched strategy left `strategy` unbound.
            raise ValueError(f"Unknown strategy: {params['strategy']}")

        # BUG FIX: `strategy` was computed above but the imputer was always
        # constructed with the hard-coded 'mean'.
        imp = SimpleImputer(missing_values=params['missing_value'],
                            strategy=strategy,
                            fill_value=params['fill_value'])
        result = imp.fit_transform(data)

        # Write the imputed values back into the original frame.
        for i, column in enumerate(columns):
            df.iloc[:, column] = result[:, i]
    except Exception as e:
        return error(f'Ошибка при выполнении замены пропущенных значений: {e}')

    # save output
    try:
        if params['file_format'] == 'CSV':
            file_path = generate_filename(root, 'missingvalues',
                                          'input_replaced.csv')
            df.to_csv(file_path, index=False)
            results.append(str(file_path))
        elif params['file_format'] == 'XLSX':
            file_path = generate_filename(root, 'missingvalues',
                                          'input_replaced.xlsx')
            df.to_excel(file_path, index=False)
            results.append(str(file_path))
        else:
            raise AttributeError  # unknown file_format -> handled below
    except Exception as e:
        return error(f'Ошибка при сохранении файла с результатом : {e}')

    return ready(results)
Example #3
0
def run_polynomialregression(params):
    """Fit a polynomial regression and save the term coefficients.

    Expects in ``params``: job_id, url, columns, target_column, degree,
    file_format.  Returns the project-standard result dict.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]
    target = df.iloc[:, params['target_column']]

    try:
        polyreg = make_pipeline(PolynomialFeatures(params['degree']),
                                LinearRegression())
        polyreg.fit(data, target)
        # Build a human-readable label for every polynomial term
        # (e.g. "a^1 * b^2") from PolynomialFeatures' exponent matrix.
        index = []
        for power in polyreg[0].powers_:
            strings = []
            for i, column in enumerate(data.columns):
                strings.append(f'{column}^{power[i]}')
            string = ' * '.join(strings)
            index.append(string)
        poly_df = pd.DataFrame(polyreg[1].coef_,
                               index=index,
                               columns=['Значение коэффициента'])
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt
        return error('Ошибка при вычислении результата')

    # save output
    try:
        if params['file_format'] == 'CSV':
            file_path = generate_filename(root, 'polynomialregression',
                                          'coefficients.csv')
            poly_df.to_csv(file_path, index_label='Слагаемое')
            results.append(str(file_path))
        elif params['file_format'] == 'XLSX':
            file_path = generate_filename(root, 'polynomialregression',
                                          'coefficients.xlsx')
            poly_df.to_excel(file_path, index_label='Слагаемое')
            results.append(str(file_path))
        else:
            raise AttributeError  # unknown file_format -> handled below
    except Exception:  # was a bare except
        return error('Ошибка при сохранении файла с результатом')

    return ready(results)
Example #4
0
def run_linear(params):
    """Fit a simple x -> y linear regression; save coefficients and a plot.

    Expects in ``params``: job_id, url, xcolumn, ycolumn (both 1-based).
    Out-of-range column indices silently fall back to columns 1 and 2.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # specific
    try:
        data = df.iloc[:, :]
        x_column_id = params['xcolumn'] - 1
        y_column_id = params['ycolumn'] - 1
        x_column_id = x_column_id if x_column_id < len(data.columns) else 0
        y_column_id = y_column_id if y_column_id < len(data.columns) else 1

        x = data.iloc[:, x_column_id].values.reshape(-1, 1)
        y = data.iloc[:, y_column_id].values.reshape(-1, 1)

        lr = LinearRegression()
        lr.fit(x, y)
        yhat = lr.predict(x)

        res = {
            'Parameter': ['Coefficient', 'Intercept', 'Mean Squared Error'],
            'Value':
            [lr.coef_[0][0], lr.intercept_[0],
             mean_squared_error(y, yhat)]
        }
        file_path = root / 'result.csv'
        pd.DataFrame(res).to_csv(file_path, index=False)
        results.append(str(file_path))
    except Exception as e:
        return {
            'success': False,
            'error': f'Error while saving result! Exception: {e}'
        }

    try:
        fig, ax = plt.subplots()
        # BUG FIX: regplot was called as regplot(x, y, yhat); the third
        # positional argument is `data`, so yhat was silently ignored.
        # Use keyword arguments and let seaborn draw the fit line itself.
        sns.regplot(x=x, y=y, ax=ax)
        ax.set(title=f'y = {lr.coef_[0][0]:.4f}x + {lr.intercept_[0]:.4f}')
        ax.set(xlabel=data.columns[x_column_id],
               ylabel=data.columns[y_column_id])
        figname = f'Regression_Column{x_column_id + 1}_Column{y_column_id + 1}.png'
        file_path = root / figname
        fig.savefig(file_path, bbox_inches='tight')
        results.append(str(file_path))
        plt.close(fig)  # release the figure; it is no longer needed
    except Exception:  # was a bare except
        return {'success': False, 'error': 'Error while showing graph!'}

    return {'ready': True, 'results': results}
Example #5
0
def run_linearregression(params):
    """Fit a multivariate linear regression and save the coefficient table.

    Expects in ``params``: job_id, url, columns, target_column, file_format.
    The output table contains one row per feature plus the intercept and R².
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]
    target = df.iloc[:, params['target_column']]

    try:
        lr = LinearRegression()
        lr.fit(data, target)
        coef = {}
        for i, name in enumerate(data.columns):
            coef[f'Переменная {name}'] = [lr.coef_[i]]
        # FIX: these two entries are loop-invariant; they were previously
        # (re)assigned on every iteration of the loop above.
        coef["Свободный член"] = [lr.intercept_]
        coef["Коэффициент детерминации"] = [lr.score(data, target)]
        linear_df = pd.DataFrame(coef.values(),
                                 index=coef.keys(),
                                 columns=['Значение'])
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt
        return error('Ошибка при вычислении результата')

    # save output
    try:
        if params['file_format'] == 'CSV':
            file_path = generate_filename(root, 'linearregression',
                                          'coefficients.csv')
            linear_df.to_csv(file_path, index_label='Переменная')
            results.append(str(file_path))
        elif params['file_format'] == 'XLSX':
            file_path = generate_filename(root, 'linearregression',
                                          'coefficients.xlsx')
            linear_df.to_excel(file_path, index_label='Переменная')
            results.append(str(file_path))
        else:
            raise AttributeError  # unknown file_format -> handled below
    except Exception:  # was a bare except
        return error('Ошибка при сохранении файла с результатом')

    return ready(results)
Example #6
0
def run_kmeansscreeplot(params):
    """Draw an elbow ("scree") plot of k-means inertia for k = 1..max_clusters.

    Expects in ``params``: job_id, url, columns, max_clusters, image_format,
    image_dpi.  Returns the project-standard result dict.
    """

    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']

    try:
        data = df.iloc[:, columns]
        krange = list(range(1, params['max_clusters'] + 1))
        # Total within-cluster sum of squares for each candidate k.
        distances = [KMeans(n_clusters=k).fit(data).inertia_ for k in krange]
    except Exception as e:
        return error(f'Ошибка во время работы алгоритма k-средних : {e}')

    try:
        data = df.iloc[:, columns]
        title = 'Каменистая осыпь:\nподбор оптимального k'
        image_format = params['image_format'].lower()
        name = 'Columns ' + ', '.join(str(c) for c in columns)
        file_path = generate_filename(root, 'KmeansScreePlot',
                                      f'{name}.{image_format}')
        sns.set()
        fig, ax = plt.subplots()
        sns.lineplot(x=krange, y=distances, ax=ax, marker='o')
        ax.set_title(title)
        ax.set_xlabel('Количество кластеров')
        ax.set_ylabel('Сумма квадратов расстояний')
        fig.savefig(file_path,
                    dpi=int(params['image_dpi']),
                    bbox_inches="tight")
        results.append(str(file_path))
    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    return ready(results)
Example #7
0
def run_factorscreeplot(params):
    """Plot a scree of covariance-matrix eigenvalues to pick a factor count.

    Expects in ``params``: job_id, url, columns, image_format, image_dpi.
    The horizontal line at 1.0 marks the Kaiser criterion.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]
    try:
        covar_matrix = np.cov(data, rowvar=False)
        # BUG FIX: np.linalg.eig returns eigenvalues in no particular order
        # (and may return them as complex with tiny imaginary parts).  The
        # covariance matrix is symmetric, so use the symmetric solver and
        # sort descending — the order a scree plot requires.
        eigenvalues = np.sort(np.linalg.eigvalsh(covar_matrix))[::-1]
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt
        return error('Ошибка при вычислении результата')


    try:
        data = df.iloc[:, columns]
        image_format = params['image_format'].lower()
        filename = f'plot.{image_format}'
        file_path = generate_filename(root, 'factorscreeplot', filename)

        sns.set()
        fig, ax = plt.subplots()
        x = list(range(1, len(eigenvalues) + 1))
        sns.lineplot(x, eigenvalues, marker='o', ax=ax)
        ax.axhline(y=1.0, color='r', linestyle='--')  # Kaiser criterion
        ax.set_xticks(x[::2])  # set the ticks before labelling
        ax.set_title('Определение количества факторов (каменистая осыпь)')
        ax.set_xlabel('Количество факторов')
        ax.set_ylabel('Собственное значение')

        fig.savefig(file_path, dpi=int(params['image_dpi']), bbox_inches="tight")
        results.append(str(file_path))
    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    return ready(results)
Example #8
0
def run_normalization(params):
    """Min-max scale the selected columns into [lower_bound, upper_bound].

    Expects in ``params``: job_id, url, columns, lower_bound, upper_bound,
    file_format.  Scaled values are written back into the dataframe, which
    is then saved in the requested format.
    """

    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]

    try:
        bounds = (params['lower_bound'], params['upper_bound'])
        scaled = MinMaxScaler(feature_range=bounds).fit_transform(data)
        # Copy the scaled values back into the original frame, column by column.
        for position, column in enumerate(columns):
            df.iloc[:, column] = scaled[:, position]
    except Exception as e:
        return error(f'Ошибка при выполнении нормализации: {e}')

    # save output
    try:
        fmt = params['file_format']
        if fmt == 'CSV':
            file_path = generate_filename(root, 'normalization',
                                          'input_replaced.csv')
            df.to_csv(file_path, index=False)
        elif fmt == 'XLSX':
            file_path = generate_filename(root, 'normalization',
                                          'input_replaced.xlsx')
            df.to_excel(file_path, index=False)
        else:
            raise AttributeError
        results.append(str(file_path))
    except Exception as e:
        return error(f'Ошибка при сохранении файла с результатом : {e}')

    return ready(results)
Example #9
0
def run_stats(params):
    """Compute describe() statistics for one column; optionally plot it.

    Expects in ``params``: job_id, url, column (1-based), transpose,
    showgraph.  Saves a CSV and, when showgraph is set, a PNG with a
    distribution plot and a box plot.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # specific
    try:
        column = params['column'] - 1  # request columns are 1-based
        data = df.iloc[:, column]
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt
        return error('Wrong column parameter!')
    stats = data.describe()
    stats.index.name = "Stats"

    file_path = generate_filename(root, 'stats', 'output.csv')

    try:
        if params['transpose']:
            stats.T.to_csv(file_path)
        else:
            stats.to_csv(file_path)
        results.append(str(file_path))
    except Exception:  # was a bare except
        return error('Error while saving result!')

    if params['showgraph']:
        try:
            sns.set_style("whitegrid")
            sns.set_context("talk")
            fig, ax = plt.subplots(2, 1, figsize=(6, 10))
            sns.distplot(data, ax=ax[0])
            sns.boxplot(data, ax=ax[1])
            figname = f'Column_{column + 1}.png'
            file_path = root / figname
            fig.savefig(file_path, bbox_inches='tight')
            results.append(str(file_path))
            plt.close(fig)
        except Exception:  # was a bare except
            return {'success': False, 'error': 'Error while showing graph!'}
    return {'ready': True, 'results': results}
Example #10
0
def run_summary(params):
    """Save a Russian-labelled describe() table for the selected columns.

    Expects in ``params``: job_id, url, columns, file_format.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]
    try:
        stats = data.describe().T
        stats.index.name = "Параметр"
        # Rename describe()'s eight numeric statistics.  For non-numeric
        # selections describe() yields a different column set and this
        # assignment raises; previously that exception escaped uncaught.
        stats.columns = [
            'Количество', 'Среднее', 'Стандартное отклонение', 'Минимум',
            '25й перцентиль', 'Медиана', '75% перцентиль', 'Максимум'
        ]
    except Exception as e:
        return error(f'Ошибка при вычислении результата. {e}')

    # save output
    try:
        if params['file_format'] == 'CSV':
            file_path = generate_filename(root, 'summary', 'output.csv')
            stats.to_csv(file_path)
            results.append(str(file_path))
        elif params['file_format'] == 'XLSX':
            file_path = generate_filename(root, 'summary', 'output.xlsx')
            stats.to_excel(file_path)
            results.append(str(file_path))
        else:
            raise AttributeError  # unknown file_format -> handled below
    except Exception as e:
        # BUG FIX: the message was an f-string that never interpolated `e`.
        return error(f'Ошибка при сохранении файла с результатом : {e}')

    return ready(results)
Example #11
0
def run_scatterplot(params):
    """Save a regression scatter plot for every pair of selected columns.

    Expects in ``params``: job_id, url, columns1, columns2, image_format,
    image_dpi.  One image per (column1, column2) pair.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns1 = parse_columns(params['columns1'], len(df.columns))['data']
    columns2 = parse_columns(params['columns2'], len(df.columns))['data']

    try:
        for column1 in columns1:
            for column2 in columns2:
                data1 = df.iloc[:, column1]
                data2 = df.iloc[:, column2]
                name1, name2 = data1.name, data2.name

                image_format = params['image_format'].lower()
                filename = f'{name1}_{name2}.{image_format}'
                file_path = generate_filename(root, 'scatterplot', filename)
                sns.set()
                fig, ax = plt.subplots()
                p = sns.regplot(x=data1, y=data2, line_kws={'color': 'red'}, ax=ax)
                # Recover the fitted line's slope/intercept from the plot data
                # so they can be shown in the title.
                slope, intercept, r_value, p_value, std_err = linregress(
                    x=p.get_lines()[0].get_xdata(),
                    y=p.get_lines()[0].get_ydata())
                title = f'Диаграмма рассеяния\nСтолбцы {name1} и {name2} \ny = {slope:.5f} * x + {intercept:.5f}'
                ax.set_title(title)
                fig.savefig(file_path, dpi=int(params['image_dpi']), bbox_inches = "tight")
                results.append(str(file_path))
                # BUG FIX: figures were never closed; with many column pairs
                # matplotlib accumulates open figures and leaks memory.
                plt.close(fig)
    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    return ready(results)
Example #12
0
def run_pcachoose(params):
    """Plot cumulative explained-variance ratio to choose a PCA size.

    Expects in ``params``: job_id, url, columns, image_format, image_dpi.
    Returns the project-standard result dict.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']

    try:
        data = df.iloc[:, columns]
        image_format = params['image_format'].lower()
        file_path = generate_filename(
            root, 'pcachoose',
            f'pca_choose_number_of_components.{image_format}')

        pca = PCA()
        pca.fit_transform(data)
        sns.set()
        fig, ax = plt.subplots()
        component_counts = list(range(1, pca.n_components_ + 1))
        cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
        plot = sns.lineplot(component_counts, cumulative_ratio, marker='o', ax=ax)
        plot.set_xticks(component_counts[::2])  # tick every other component
        ax.set_title('Анализ главных компонент')
        ax.set_xlabel('Количество компонент')
        ax.set_ylabel('Доля информации (доля от общей дисперсии)')
        fig.savefig(file_path, dpi=int(params['image_dpi']), bbox_inches="tight")
        results.append(str(file_path))
    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    return ready(results)
Example #13
0
def run_histogram(params):
    """Save a distribution histogram image for each selected column.

    Expects in ``params``: job_id, url, columns, image_format, image_dpi.
    """

    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']

    try:
        for column in columns:
            data = df.iloc[:, column]
            name = data.name
            title = f'Гистограмма распределения\nСтолбец {name}'
            image_format = params['image_format'].lower()
            filename = f'{name}.{image_format}'
            file_path = generate_filename(root, 'histogram', filename)
            sns.set()
            fig, ax = plt.subplots()
            sns.distplot(data, ax=ax)
            ax.set_title(title)
            fig.savefig(file_path,
                        dpi=int(params['image_dpi']),
                        bbox_inches="tight")
            results.append(str(file_path))
            # BUG FIX: one figure per column was created but never closed,
            # leaking matplotlib figures across the loop.
            plt.close(fig)
    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    return ready(results)
Example #14
0
def run_corrmatrix(params):
    """Save the correlation submatrix between two column selections.

    Expects in ``params``: job_id, url, columns1, columns2, file_format.
    Rows come from columns1, columns from columns2.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns1 = parse_columns(params['columns1'], len(df.columns))['data']
    columns2 = parse_columns(params['columns2'], len(df.columns))['data']
    # Correlate the whole frame, then keep only the requested rows/columns.
    corr = df.corr().iloc[columns1, columns2]

    # save output
    try:
        fmt = params['file_format']
        if fmt == 'CSV':
            file_path = generate_filename(root, 'corrmatrix', 'output.csv')
            corr.to_csv(file_path)
        elif fmt == 'XLSX':
            file_path = generate_filename(root, 'corrmatrix', 'output.xlsx')
            corr.to_excel(file_path)
        else:
            raise AttributeError
        results.append(str(file_path))
    except Exception as e:
        return error(f'Ошибка при сохранении файла с результатом : {e}')

    return ready(results)
Example #15
0
def run_factoranalysis(params):
    """Run factor analysis on the selected columns.

    Saves one loading-bar image per factor, the factor scores table, and
    the loadings table.  Expects in ``params``: job_id, url, columns,
    ncomponents, image_format, image_dpi, file_format.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']
    data = df.iloc[:, columns]

    try:
        fa_model = FactorAnalysis(n_components=params['ncomponents'])
        fa_model.fit(data)
        factor_data = fa_model.transform(data)
        loading = fa_model.components_
        loading_df = pd.DataFrame(loading, index=list(range(1, params['ncomponents'] + 1)), columns=data.columns)
        factor_data_df = pd.DataFrame(factor_data, columns=[f"Фактор {i}" for i in range(1, params['ncomponents'] + 1)])
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt
        return error('Ошибка при вычислении результата')

    try:
        data = df.iloc[:, columns]
        image_format = params['image_format'].lower()
        component_names = data.columns

        for i in range(1, params['ncomponents'] + 1):
            filename = f'Factor_{i}_Loadings.{image_format}'
            file_path = generate_filename(root, 'factoranalysis', filename)
            sns.set()
            fig, ax = plt.subplots()
            sns.barplot(loading[i-1], component_names, orient='h', ax=ax)
            ax.set_title(f'Факторная нагрузка для фактора {i}')
            ax.set_xlabel('Факторная нагрузка')
            ax.set_ylabel('Переменная')
            fig.savefig(file_path, dpi=int(params['image_dpi']), bbox_inches = "tight")
            results.append(str(file_path))
            # BUG FIX: one figure per factor was created but never closed,
            # leaking matplotlib figures across the loop.
            plt.close(fig)

    except Exception as e:
        return error(f'Ошибка при сохранении изображений с результатом : {e}')

    # save output: factor scores first, then loadings (previously two
    # copy-pasted blocks); keep the original order and write options.
    outputs = (
        (factor_data_df, 'factorized', {'index': False}),
        (loading_df, 'loadings', {'index_label': 'Номер фактора'}),
    )
    for out_df, basename, kwargs in outputs:
        try:
            if params['file_format'] == 'CSV':
                file_path = generate_filename(root, 'factoranalysis', f'{basename}.csv')
                out_df.to_csv(file_path, **kwargs)
            elif params['file_format'] == 'XLSX':
                file_path = generate_filename(root, 'factoranalysis', f'{basename}.xlsx')
                out_df.to_excel(file_path, **kwargs)
            else:
                raise AttributeError  # unknown file_format -> handled below
            results.append(str(file_path))
        except Exception:  # was a bare except
            return error('Ошибка при сохранении файла с результатом')

    return ready(results)
Example #16
0
def run_kmeans(params):
    """Cluster the selected columns with k-means and save three tables:
    the cluster labels, the cluster centers, and the input annotated
    with its cluster number.

    Expects in ``params``: job_id, url, columns, nclusters, file_format.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # validate params
    res = validate_columns_params(params, len(df.columns))
    if not res['success']:
        return res

    # specific
    columns = parse_columns(params['columns'], len(df.columns))['data']

    try:
        data = df.iloc[:, columns]
        model = KMeans(n_clusters=params['nclusters'])
        model.fit(data)
        labels = model.labels_ + 1  # report 1-based cluster numbers
        centers = model.cluster_centers_
        clusters_df = pd.DataFrame(data=labels, columns=['Номер кластера'])
        centers_df = pd.DataFrame(data=centers, columns=data.columns)
        output_df = data.copy()
        output_df['Номер кластера'] = labels
    except Exception as e:
        return error(f'Ошибка во время работы алгоритма k-средних : {e}')

    # save output
    # NOTE(review): the subdirectory name is 'summary', apparently copy-pasted
    # from run_summary; 'kmeans' looks intended — confirm before changing,
    # since clients may depend on the current paths.
    try:
        if params['file_format'] == 'CSV':
            file_path = generate_filename(root, 'summary', 'clusters.csv')
            clusters_df.to_csv(file_path, index=False)
            results.append(str(file_path))
            file_path = generate_filename(root, 'summary',
                                          'cluster_centers.csv')
            centers_df.to_csv(file_path, index=False)
            results.append(str(file_path))
            file_path = generate_filename(root, 'summary',
                                          'input_with_clusters.csv')
            output_df.to_csv(file_path, index=False)
            results.append(str(file_path))
        elif params['file_format'] == 'XLSX':
            file_path = generate_filename(root, 'summary', 'clusters.xlsx')
            clusters_df.to_excel(file_path, index=False)
            results.append(str(file_path))
            file_path = generate_filename(root, 'summary',
                                          'cluster_centers.xlsx')
            centers_df.to_excel(file_path, index=False)
            results.append(str(file_path))
            file_path = generate_filename(root, 'summary',
                                          'input_with_clusters.xlsx')
            output_df.to_excel(file_path, index=False)
            results.append(str(file_path))
        else:
            raise AttributeError  # unknown file_format -> handled below
    except Exception as e:
        # BUG FIX: the message was an f-string that never interpolated `e`.
        return error(f'Ошибка при сохранении файла с результатом : {e}')
    return ready(results)
Example #17
0
def run_pca(params):
    """Run PCA on the dataframe (optionally excluding columns) and save
    the component matrix, explained-variance table, and two summary plots.

    Expects in ``params``: job_id, url, exclude (e.g. "1,3-5" with 1-based
    columns, or empty), normalize, showgraph.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # specific
    try:
        if params['exclude']:
            pattern = r'^\d+(-\d+)?(?:,\d+(?:-\d+)?)*$'
            if not re.search(pattern, params['exclude']):
                return {
                    'success': False,
                    'error': 'Wrong exclude columns pattern!'
                }

            res = re.findall(r'\d+(?:-\d+)*', params['exclude'])
            columns = set()
            for r in res:
                if '-' in r:
                    left, right = r.split('-')
                    # BUG FIX: the bounds were compared as strings
                    # ('2' < '10' is False), silently dropping ranges such
                    # as "2-10".  Compare numerically.
                    if int(left) < int(right):
                        columns = columns | set(
                            range(int(left) - 1, int(right)))
                else:
                    columns.add(int(r) - 1)
            # Keep every 0-based index that was not excluded.
            columns = list({i for i in range(len(df.columns))} - columns)
            data = df.iloc[:, columns]
        else:
            data = df.iloc[:, :]
    except Exception as e:
        return {
            'success': False,
            'error': f'Error while excluding columns! Exception: {e}'
        }

    columns = data.columns
    if params['normalize']:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)

    try:
        pca = PCA()
        pca.fit(data)
        components = pd.DataFrame(pca.components_, columns=columns)
        components.index.name = 'Component'
        file_path = root / 'components.csv'  # was assigned twice before
        components.to_csv(file_path)
        results.append(str(file_path))

        file_path = root / 'variance.csv'
        pd.DataFrame({
            'Explained Variance': pca.explained_variance_,
            'Explained Variance Ratio': pca.explained_variance_ratio_
        }).to_csv(file_path, index=False)
        results.append(str(file_path))
    except Exception as e:
        return {
            'success': False,
            'error': f'Error while saving result! Exception: {e}'
        }

    # BUG FIX: 'showgraph' was unconditionally overwritten with True,
    # ignoring the caller's choice; only default it when absent.
    params.setdefault('showgraph', True)
    if params['showgraph']:
        try:
            fig, ax = plt.subplots()
            ax.bar(list(range(1, pca.n_components_ + 1)),
                   pca.explained_variance_ratio_)
            ax.set_title('Principal Component Analysis')
            ax.set_xlabel('Number of components')
            ax.set_ylabel('Explained Variance Ratio')
            file_path = root / 'pca_figure_1.png'
            fig.savefig(file_path, bbox_inches='tight')
            results.append(str(file_path))
            plt.close(fig)  # release before creating the next figure
            fig, ax = plt.subplots()
            ax.plot(list(range(1, pca.n_components_ + 1)),
                    np.cumsum(pca.explained_variance_ratio_))
            ax.set_title('Principal Component Analysis')
            ax.set_xlabel('Number of components')
            ax.set_ylabel('Cumulative Explained Variance Ratio')
            file_path = root / 'pca_figure_2.png'
            fig.savefig(file_path, bbox_inches='tight')
            results.append(str(file_path))
            plt.close(fig)
        except Exception:  # was a bare except
            return {'success': False, 'error': 'Error while showing graph!'}
    return {'ready': True, 'results': results}
Example #18
0
def run_kmeans(params):
    """Run k-means clustering on the dataset referenced by ``params['url']``.

    Expected keys in ``params``: ``job_id``, ``url``, ``exclude`` (1-based
    column spec such as ``"1,3-5"`` or empty), ``normalize``, ``nclusters``,
    ``randomstate``, ``addresultcolumns``, ``showstats``, ``showgraph``.

    Returns ``{'ready': True, 'results': [paths]}`` on success, or a
    ``{'success': False, 'error': msg}`` dict on failure.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # specific: resolve the (1-based) "exclude" spec into the kept columns.
    try:
        if params['exclude']:
            pattern = r'^\d+(-\d+)?(?:,\d+(?:-\d+)?)*$'
            if not re.search(pattern, params['exclude']):
                return {
                    'success': False,
                    'error': 'Wrong exclude columns pattern!'
                }

            res = re.findall(r'\d+(?:-\d+)*', params['exclude'])
            excluded = set()
            for r in res:
                if '-' in r:
                    left, right = r.split('-')
                    # BUG FIX: compare numerically, not lexicographically —
                    # as strings '9' < '10' is False, so valid ranges were
                    # silently skipped.
                    if int(left) < int(right):
                        excluded |= set(range(int(left) - 1, int(right)))
                else:
                    excluded.add(int(r) - 1)
            # sorted() keeps the original column order deterministic
            # (set iteration order is an implementation detail).
            columns = sorted(set(range(len(df.columns))) - excluded)
            data = df.iloc[:, columns]
        else:
            data = df.iloc[:, :]
    except Exception:
        return {'success': False, 'error': 'Error while excluding columns!'}

    if params['normalize']:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)
    kmeans = KMeans(n_clusters=params['nclusters'],
                    random_state=params['randomstate'])
    # +1 so cluster labels are presented 1-based to the user.
    labels = kmeans.fit_predict(data) + 1

    # save output
    try:
        file_path = root / 'kmeans_result.csv'
        if params['addresultcolumns']:
            df['Cluster'] = labels
            df.to_csv(file_path, index=False)
        else:
            new_df = pd.DataFrame(labels, columns=['Cluster'])
            new_df.to_csv(file_path, index=False)
        results.append(str(file_path))
    except Exception:
        return {'success': False, 'error': 'Error while saving result!'}

    # NOTE(review): the original code computed kmeans.cluster_centers_ and
    # kmeans.inertia_ under `showstats` but never saved or returned them,
    # and left a debug `print(params)` behind; both removed. TODO: decide
    # what `showstats` should actually emit.
    if params['showgraph']:
        try:
            # Project onto the first two principal components so the
            # clustering can be visualised in 2-D.
            file_path = root / 'kmeans_figure.png'
            pca = PCA(n_components=2)
            components = pca.fit_transform(data)
            fig, ax = plt.subplots()
            scatter = ax.scatter(components[:, 0], components[:, 1], c=labels)
            ax.set_title('Clustering In Principal Components')
            ax.set_xlabel('Principal Component 0')
            ax.set_ylabel('Principal Component 1')
            ax.legend(*scatter.legend_elements(), title="Clusters")
            fig.savefig(file_path, bbox_inches='tight')
            results.append(str(file_path))
        except Exception:
            return {'success': False, 'error': 'Error while showing graph!'}
    return {'ready': True, 'results': results}
Example #19
0
def run_hca(params):
    """Run hierarchical (agglomerative) clustering on ``params['url']`` data.

    Expected keys in ``params``: ``job_id``, ``url``, ``exclude`` (1-based
    column spec such as ``"1,3-5"`` or empty), ``normalize``, and optionally
    ``levels`` (dendrogram truncation depth, default 4).

    Saves the merge distances as CSV and a truncated dendrogram as PNG.
    Returns ``{'ready': True, 'results': [paths]}`` on success, or a
    ``{'success': False, 'error': msg}`` dict on failure.
    """
    # common
    results = []
    job_id = params['job_id']
    res = validate_input_and_get_dataframe(params['url'], job_id)
    if not res['success']:
        return res
    df = res['dataframe']
    root = get_or_create_dir(config.DOWNLOAD_DIR, job_id)

    # specific: resolve the (1-based) "exclude" spec into the kept columns.
    try:
        if params['exclude']:
            pattern = r'^\d+(-\d+)?(?:,\d+(?:-\d+)?)*$'
            if not re.search(pattern, params['exclude']):
                return {
                    'success': False,
                    'error': 'Wrong exclude columns pattern!'
                }

            res = re.findall(r'\d+(?:-\d+)*', params['exclude'])
            excluded = set()
            for r in res:
                if '-' in r:
                    left, right = r.split('-')
                    # BUG FIX: compare numerically, not lexicographically —
                    # as strings '9' < '10' is False, so valid ranges were
                    # silently skipped.
                    if int(left) < int(right):
                        excluded |= set(range(int(left) - 1, int(right)))
                else:
                    excluded.add(int(r) - 1)
            # sorted() keeps the original column order deterministic
            # (set iteration order is an implementation detail).
            columns = sorted(set(range(len(df.columns))) - excluded)
            data = df.iloc[:, columns]
        else:
            data = df.iloc[:, :]
    except Exception:
        return {'success': False, 'error': 'Error while excluding columns!'}

    if params['normalize']:
        scaler = StandardScaler()
        data = scaler.fit_transform(data)

    # distance_threshold=0 with n_clusters=None makes the model compute the
    # full merge tree, which is what the dendrogram needs.
    model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
    model.fit(data)
    try:
        distances = pd.DataFrame(model.distances_, columns=['distance'])
        distances.index.name = 'Cluster Number'
        file_path = root / 'distances.csv'
        distances.to_csv(file_path)
        results.append(str(file_path))
    except Exception:
        return {'success': False, 'error': 'Error while saving result!'}

    # BUG FIX: the original unconditionally did `params['levels'] = 4`
    # (a debug leftover), ignoring any caller-supplied truncation depth.
    # Honor the parameter and fall back to the old default of 4.
    levels = params.get('levels', 4)
    try:
        fig, ax = plt.subplots()
        plot_dendrogram(model,
                        truncate_mode='level',
                        p=levels,
                        ax=ax)
        ax.set_title('Hierarchical Clustering Dendrogram')
        ax.set_xlabel(
            'Number of points in node (or index of point if no parenthesis).')
        ax.set_ylabel('Distance')
        file_path = root / 'hca_figure_1.png'
        fig.savefig(file_path, bbox_inches='tight')
        results.append(str(file_path))
    except Exception as e:
        return {
            'success': False,
            'error': f'Error while showing graph! Exception: {e}'
        }
    return {'ready': True, 'results': results}