Exemple #1
0
    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        data = pd.DataFrame.from_dict({
            'feature':
            similarity.index.values.tolist(),
            'dim1':
            mapping.iloc[:, 0].tolist(),
            'dim2':
            mapping.iloc[:, 1].tolist(),
            'dim3':
            mapping.iloc[:, 2].tolist()
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        data2 = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        data2 = rotate_2_col_mat(data2, angle)
        data = data >> bind_cols(data2)
        del data2

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        data2 = {}
        data2['X'] = data >> select(X.x, X.y)
        data2['Y'] = data2['X'].drop_duplicates()
        data2['X'] = np.arange(data2['Y'].shape[0])
        data2['Z'] = data
        for i in data2['X']:
            data2['result'] = data2['Z'] >> mask(X.x == data2['Z'].x[i])
            data2['result'] = data2['result'] >> mask(X.y == data2['Z'].y[i])
            data2['result.z'] = np.arange(data2['result'].shape[0]) + 1
            data2['result.z'] = data2['result.z'].tolist()
            data2['result'] = data2['result'] >> mutate(z=data2['result.z'])
            if i == 0:
                data2['results'] = data2['result']
            else:
                data2['results'] = pd.DataFrame.append(data2['results'],
                                                       data2['result'])
        data = data2['results']
        del data2

        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data
Exemple #2
0
def read(path):
    """
  Read a .ts.tar.gz file to a TidySet
  
  This function read multiple files archived by tar with gzip compression
  to a TidySet.
  
  :param path: A character of .ts.tar.gz file path (include file extension).
  :return: output A TidySet, an ExpressionSet with three tables. Function of
  write_ts_tar_gz can write this file from the TidySet.
  """

    filename = path
    path = re.sub('.ts.tar.gz', '', filename)
    os.mkdir(path)
    tar = tarfile.open(filename)
    tar.extractall(path)
    tar.close()

    f = open(path + '/others.txt', 'r')
    others = f.read()
    f.close()

    other = re.split('\n', others)
    elements = []
    for i in np.arange(len(other)):
        if re.search('^>>', other[i]):
            elements.append(i)

    XX = np.arange(len(elements)).tolist()
    Y = elements
    Z = np.arange(len(other)).tolist()
    K = []
    for i in XX:
        if i < (len(Y) - 1):
            L = Z[(Y[i] + 1):Y[i + 1]]
        else:
            L = Z[(Y[i] + 1):]
        K.append(L)

    XX = np.arange(len(K))
    Y = K
    Z = other
    K = []
    for i in elements:
        K.append(re.sub('>>', '', other[i]))
    M = dict()
    for i in XX:
        L = []
        for j in Y[i]:
            L.append(Z[j])
        L = ' '.join(L)
        M[K[i]] = L
    others = M
    del XX, Y, Z, K, L, M, i, j, f, other, elements

    adata = pd.read_csv(path + '/exprs.csv',
                        names=re.split('\\s', others['sampleNames']))
    adata.index = re.split('\\s', others['featureNames'])

    pdata_names = re.split('\\s', others['varLabels'])
    pdata_dtype = re.split('\\s', others['varClass'])
    pdata = dict()
    for i in np.arange(len(pdata_dtype)):
        if pdata_dtype[i] == 'numeric':
            pdata[pdata_names[i]] = 'float64'
        elif pdata_dtype[i] == 'integer':
            pdata[pdata_names[i]] = 'int64'
        elif pdata_dtype[i] == 'factor':
            pdata[pdata_names[i]] = 'category'
        else:
            pdata[pdata_names[i]] = 'object'
    pdata = pd.read_csv(path + '/pData.csv', names=pdata_names, dtype=pdata)
    pdata.index = re.split('\\s', others['sampleNames'])
    string = re.split('\\s', others['varMetadata'])
    i = 0
    for c in string:
        if string[i] == 'NA': string[i] = np.NaN
        i += 1
    pmetadata = pd.DataFrame(string,
                             index=re.split('\\s', others['varLabels']),
                             columns=['labelDescription'])
    pdata = AnnotatedDataFrame(pdata, pmetadata)

    fdata_names = re.split('\\s', others['fvarLabels'])
    fdata_dtype = re.split('\\s', others['fvarClass'])
    fdata = dict()
    for i in np.arange(len(fdata_dtype)):
        if fdata_dtype[i] == 'numeric':
            fdata[fdata_names[i]] = 'float64'
        elif fdata_dtype[i] == 'integer':
            fdata[fdata_names[i]] = 'int64'
        elif fdata_dtype[i] == 'factor':
            fdata[fdata_names[i]] = 'category'
        else:
            fdata[fdata_names[i]] = 'object'
    fdata = pd.read_csv(path + '/fData.csv', names=fdata_names, dtype=fdata)
    fdata.index = re.split('\\s', others['featureNames'])
    fdata.index.name = 'pos_id'

    sim_names = re.split('\\s', others['simNames'])
    sim_dtype = dict()
    for i in np.arange(len(sim_names)):
        sim_dtype[sim_names[i]] = 'float64'
    similarity = pd.read_csv(path + '/similarity.csv',
                             names=sim_names,
                             dtype=sim_dtype)
    similarity.index = sim_names

    ontomap = adata.transpose()
    for i in np.arange(ontomap.columns.values.shape[0]):
        dim = re.split('x|y|z', ontomap.columns.values[i])
        if i > 0:
            dim[1] = np.max([int(dim[1]), int(dim_[1])])
            dim[2] = np.max([int(dim[2]), int(dim_[2])])
            dim[3] = np.max([int(dim[3]), int(dim_[3])])
        dim_ = dim
    del dim_
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    ontotype = {}
    for i in np.arange(fdata.shape[1] - 1):
        data0 = fdata >> select(~X.feature)
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 var='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    string = re.split('\\s', others['fvarMetadata'])
    i = 0
    for c in string:
        if string[i] == 'NA': string[i] = np.NaN
        i += 1
    fmetadata = pd.DataFrame(string,
                             index=re.split('\\s', others['fvarLabels']),
                             columns=['labelDescription'])
    fdata.index.name = None
    fdata = AnnotatedDataFrame(fdata, fmetadata)

    ontology_names = re.split('\\s', others['ontoNames'])
    ontology_dtype = re.split('\\s', others['ontoClass'])
    ontology = dict()
    for i in np.arange(len(ontology_dtype)):
        if ontology_dtype[i] == 'numeric':
            ontology[ontology_names[i]] = 'float64'
        elif ontology_dtype[i] == 'integer':
            ontology[ontology_names[i]] = 'int64'
        elif ontology_dtype[i] == 'factor':
            ontology[ontology_names[i]] = 'category'
        else:
            ontology[ontology_names[i]] = 'object'
    ontology = pd.read_csv(path + '/ontology.csv',
                           names=ontology_names,
                           dtype=ontology)

    xData = MIAME(name=others['name'],
                  lab=others['lab'],
                  contact=others['contact'],
                  title=others['title'],
                  abstract=others['abstract'],
                  url=others['url'],
                  pubMedIds=others['pubMedIds'],
                  other={
                      'similarity': similarity,
                      'ontomap': ontomap,
                      'ontotype': ontotype,
                      'ontology': ontology
                  })

    for i in os.listdir(path):
        os.remove(path + '/' + i)
    os.rmdir(path)

    if re.match('^( +)', others['annotation']):
        annot = ''
    else:
        annot = re.sub(' +', ' ', others['annotation'])

    eset = ExpressionSet(assayData=adata.to_numpy(),
                         phenoData=pdata,
                         featureData=fdata,
                         experimentData=xData,
                         annotation=annot)

    return eset
    def exec(self):

        try:
            log.info('[START] {}'.format("exec"))

            # fileInfoPattrn = '{}/{}/{}'.format(globalVar['inpPath'], serviceName, 'data/csv/inp_01.csv')
            # fileInfo = glob.glob(fileInfoPattrn)
            # if (len(fileInfo) < 1): raise Exception("[ERROR] fileInfo : {} : {}".format("자료를 확인해주세요.", fileInfoPattrn))

            # saveFile = '{}/{}_{}'.format(globalVar['figPath'], serviceName, '2021_nagano_S1_01_raw.png')
            # log.info('[CHECK] saveFile : {}'.format(saveFile))

            # fileInfo = fileList[0]
            # for i, fileInfo in enumerate(fileList):
            #     globalVar['inpData{0:02d}'.format(i + 1)] = pd.read_csv(fileInfo, na_filter=False)

            # breakpoint()

            # 파일 읽기
            fileInfoPattrn = '{}/{}/{}'.format(globalVar['inpPath'],
                                               serviceName,
                                               'data/csv/inp_*.csv')
            fileList = glob.glob(fileInfoPattrn)
            if (len(fileList) < 6):
                raise Exception("[ERROR] fileInfo : {} : {}".format(
                    "자료를 확인해주세요.", fileInfoPattrn))

            inpData01 = pd.read_csv(fileList[0], na_filter=False)
            inpData02 = pd.read_csv(fileList[1], na_filter=False)
            inpData03 = pd.read_csv(fileList[2], na_filter=False)
            inpData04 = pd.read_csv(fileList[3], na_filter=False)
            inpData05 = pd.read_csv(fileList[4], na_filter=False)
            inpData06 = pd.read_csv(fileList[5], na_filter=False)

            # 기간 및 자치구에 따른 데이터 병합
            data = ((
                inpData01 >> dfply.left_join(inpData02, by=('기간', '자치구')) >>
                dfply.left_join(inpData03, by=('기간', '자치구')) >>
                dfply.left_join(inpData04, by=('기간', '자치구')) >>
                dfply.left_join(inpData05, by=('기간', '자치구')) >>
                dfply.left_join(inpData06, by=('기간', '자치구')) >>
                dfply.mask(dfply.X.자치구 != '합계') >> dfply.drop([
                    '합계_검거', '살인_발생', '살인_검거', '강도_발생', '강도_검거', '강간강제추행_발생',
                    '강간강제추행_검거', '절도_발생', '절도_검거', '폭력_발생', '폭력_검거', '합계', '소계'
                ])))

            # 컬럼 개수 : 42개
            len(data.columns.values)

            # 컬럼 형태
            # dataStep1.dtypes

            # ======================================================
            #  범죄횟수를 기준으로 각 상관계수 행렬 시각화
            # ======================================================
            # data = pd.DataFrame(data.dropna(axis=0))

            tmpColY = data.iloc[:, 2]
            tmpColXStep1 = data.iloc[:, 3:21:1]
            tmpColXStep2 = data.iloc[:, 22:41:1]

            dataStep1 = pd.concat([tmpColY, tmpColXStep1], axis=1)
            dataStep1Corr = dataStep1.corr(method='pearson')
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '상관계수 상단 행렬.png')

            makeCorrPlot(dataStep1, saveImg)

            dataStep2 = pd.concat([tmpColY, tmpColXStep2], axis=1)
            dataStep2Corr = dataStep2.corr(method='pearson')
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '상관계수 하단 행렬.png')

            makeCorrPlot(dataStep2, saveImg)

            # ===================================================================
            #  전체 데이터셋 (기간, 자치구)을 이용한 독립변수 및 종속 변수 선정
            # ===================================================================
            dataL1 = ((data >> dfply.drop(dfply.X.기간, dfply.X.자치구)))

            # ===================================================================
            #  [상관분석 > 유의미한 변수] 전체 데이터셋 (기간, 자치구)을 이용한 독립변수 및 종속 변수 선정
            # ===================================================================
            selCol = [
                '범죄횟수', '지구대파출소치안센터', '119안전센터', 'CCTV설치현황', '비거주용건물내주택',
                '계_사업체수', '계_종사자수'
            ]
            dataL1 = data[selCol]

            # 결측값에 대한 행 제거 (그에 따른 index 변화로 인해 pd.DataFrame 재변환)
            dataL2 = pd.DataFrame(dataL1.dropna(axis=0))
            dataL2.rename(columns={'범죄횟수': 'total'}, inplace=True)

            # 요약 통계량
            dataL2.describe()

            # 자치구 데이터셋 (기간 평균)을 이용한 독립변수 및 종속 변수 선정
            # selCol = ['기간', '자치구', '범죄횟수', '지구대파출소치안센터', 'CCTV설치현황', '전체세대', '비거주용건물내주택', '계_사업체수']
            # dataL1 = data[selCol]
            #
            # pd.plotting.scatter_matrix(dataL1)
            # plt.show()
            #
            # dataL2 = ((dataL1 >>
            #      group_by(X.자치구) >>
            #      summarize(
            #          total=X.범죄횟수.mean()
            #          , maenX1=X.지구대파출소치안센터.mean()
            #          , maenX2=X.CCTV설치현황.mean()
            #          , maenX3=X.전체세대.mean()
            #          , maenX4=X.비거주용건물내주택.mean()
            #          , maenX5=X.계_사업체수.mean()
            #          ) >>
            #         # arrange(X.number, ascending=False)
            #         drop(X.자치구)
            #      ))

            # ========================================
            #  회귀모형 수행
            # ========================================
            selVarList = list(
                dataL2.columns[~dataL2.columns.str.contains('total')])

            # 다중선형회귀 모형
            result = train_test_linreg(dataL2, selVarList)

            # 릿지 모형
            # result = train_test_ridge(dataL2, selVarList, 1.0)

            # =======================================
            #  시각화
            # ======================================
            # 트레이닝 데이터
            trainValY = result['Y_train'].values
            trainPredValY = result['Y_pred_train']
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '트레이닝 데이터_상관계수 행렬.png')

            makeScatterPlot(trainValY, trainPredValY, saveImg)

            # 테스트 데이터
            testValY = result['Y_test'].values
            testPredValY = result['Y_pred_test']
            saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                        '테스트 데이터_상관계수 행렬.png')

            makeScatterPlot(testValY, testPredValY, saveImg)

            # =======================================
            #  교차검증 수행
            # ======================================
            X = dataL2[selVarList]
            Y = dataL2.total
            X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                                Y,
                                                                test_size=0.4)

            # Pre-allocate models and corresponding parameter candidates
            models = []
            params = []

            model = ('Linear', LinearRegression())
            param = {}

            models.append(model)
            params.append(param)

            log.info("[CHECK] models : {%s}", models)
            log.info("[CHECK] params : {%s}", params)

            kfold = KFold(n_splits=10, shuffle=True)

            results = []

            # [교차검증] 트레이닝 데이터
            for i in range(1):
                model = models[i]
                param = params[i]
                result = gridsearch_cv_for_regression(model=model,
                                                      param=param,
                                                      kfold=kfold,
                                                      train_input=X_train,
                                                      train_target=Y_train)
                result.best_score_
                results.append(result)

            # [교차검증] 테스트 데이터
            for i in range(len(results)):
                testValY = Y_test.values
                testPredValY = results[i].predict(X_test)

                saveImg = '{}/{}_{}'.format(globalVar['figPath'], serviceName,
                                            '테스트 데이터_산점도.png')
                makeScatterPlot(testValY, testPredValY, saveImg)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
Exemple #4
0
def compile(value,
            outcome,
            similarity,
            mapping,
            ontology,
            ranked=True,
            dims=7,
            decreasing=False,
            seed_num=33):
    """
  Make a TidySet for visible neural network (VNN) modeling
  
  This function create a TidySet, an ExpressionSet class to orchestrate five
  data into a single set of three tables.
  
  :param value: Instance-feature value, a pandas data frame with rows for
  instances and columns for features. All rows in value should have names. All
  values should be floating numbers.
  :param outcome: Outcome, a single-column pandas data frame of binary integers
  with the same rows as the instances. The row numbers and the order of outcome
  should be the same with those of value. Value  of 0 and 1 should refer to
  non-event and event outcome, respectively.
  :param similarity: Feature similarity, a square pandas data frame of floating
  numbers containing feature-feature similarity measures.
  :param mapping: Feature three-dimensional mapping, a pandas data frame of
  floating numbers with rows for features and three columns for three dimensions
  where the features are mapped onto.
  :param ontology: Ontology, a pandas data frame with rows for ontologies and
  four columns for source, target, similarity, and relation. Feature (source)-
  ontology (target) relation should be annotated as 'feature', while ontology-
  ontology relation should be annotated as 'is_a'. To differentiate between
  feature and ontology names, a prefix of 'ONT:' precedes an ontology name. All
  columns except similarity in ontology should be strings. Similarity (a
  floating number) is a minimum threshold by which either features or ontologies
  (source) belong to an ontology (target).
  :return: output TidySet, an ExpressionSet with three tables. Instance-feature
  value and outcome pandas data frame are compiled as a phenotype pandas data
  frame with rows for instances and columns for features and outcome. Instance-
  feature value and feature three-dimensional mapping pandas data frame are
  compiled as an expression two-dimensional array with rows for positions of
  features and columns for instances. The mapping, similarity, and ontology
  pandas data frame are compiled as a feature pandas data frame with rows for
  positions of features and columns for feature names and ontological relations.
  For easier access, the similarity pandas data frame, ontomap four-dimensional
  numpy array, ontotype dictionary of pandas data frame, and ontology pandas
  data frame are included in experiment notes that can be called using function
  of notes.
  """

    pb = ProgressBar(8)
    tick = 0
    pb.start()

    # Leibniz formula for pi
    # https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
    # pi=1
    # for i in range(1,int(10e+6)):
    #   pi+=((-1)**i)*(1/(2*i+1))
    # pi=pi*4

    tick += 1
    pb.update(tick)  #1

    def rotate_2_col_mat(X, angle):
        angle = (math.pi / 180 * angle) * -1
        M = np.array([
            math.cos(angle),
            math.sin(angle), -math.sin(angle),
            math.cos(angle)
        ])
        M = M.reshape(2, 2)
        M = np.dot(X.to_numpy(), M)
        M = pd.DataFrame(M,
                         index=X.index.values.tolist(),
                         columns=X.columns.values.tolist())
        return M

    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        data = pd.DataFrame.from_dict({
            'feature':
            similarity.index.values.tolist(),
            'dim1':
            mapping.iloc[:, 0].tolist(),
            'dim2':
            mapping.iloc[:, 1].tolist(),
            'dim3':
            mapping.iloc[:, 2].tolist()
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        data2 = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        data2 = rotate_2_col_mat(data2, angle)
        data = data >> bind_cols(data2)
        del data2

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        data2 = {}
        data2['X'] = data >> select(X.x, X.y)
        data2['Y'] = data2['X'].drop_duplicates()
        data2['X'] = np.arange(data2['Y'].shape[0])
        data2['Z'] = data
        for i in data2['X']:
            data2['result'] = data2['Z'] >> mask(X.x == data2['Z'].x[i])
            data2['result'] = data2['result'] >> mask(X.y == data2['Z'].y[i])
            data2['result.z'] = np.arange(data2['result'].shape[0]) + 1
            data2['result.z'] = data2['result.z'].tolist()
            data2['result'] = data2['result'] >> mutate(z=data2['result.z'])
            if i == 0:
                data2['results'] = data2['result']
            else:
                data2['results'] = pd.DataFrame.append(data2['results'],
                                                       data2['result'])
        data = data2['results']
        del data2

        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data

    def order_angle_by_channel(mapping,
                               similarity,
                               ranked=ranked,
                               dims=dims,
                               decreasing=False):
        angles = np.arange(360) + 1
        for i in angles:
            if i == 1:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data = [np.max(data_['z'])]
            else:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data.append(np.max(data_['z']))

        data = pd.DataFrame.from_dict({
            'angle': angles,
            'channel': np.array(data)
        })
        data = data >> arrange(X.channel, ascending=decreasing == False)
        return data

    tick += 1
    pb.update(tick)  #2
    np.random.seed(seed_num)
    angle = order_angle_by_channel(mapping, similarity, ranked, dims,
                                   decreasing)
    angle = angle >> mask(X.channel == np.min(angle['channel']))
    angle = angle['angle'].values
    angle = np.random.choice(np.arange(angle.shape[0]).tolist(), 1, False)

    tick += 1
    pb.update(tick)  #3
    fmap = create_fmap(mapping, similarity, angle, ranked, dims)

    fval = value[fmap.index.values].to_numpy()
    fval = pd.DataFrame(fval,
                        index=value.index.values,
                        columns=value.columns.values)

    fboth = fmap >> summarize_each([np.max], X.x, X.y, X.z)
    fboth = fboth.to_numpy()
    data = []
    for i in np.arange(fboth.shape[1]):
        data_ = np.arange(fboth[:, i]) + 1
        data.append(data_.tolist())
        del data_

    fboth = np.meshgrid(data[0], data[1], data[2])
    del data
    fboth = np.array(fboth).T.reshape(-1, 3)
    fboth = pd.DataFrame(fboth, columns=fmap.columns.values)
    fboth = fboth >> arrange(X.z, X.y, X.x)

    fboth = fboth >> left_join(fmap.reset_index(inplace=False),
                               by=['x', 'y', 'z'])

    idx = []
    for i in fboth['feature'].values.tolist():
        idx.append(str(i) != 'nan')

    fval = fval[fboth['feature'][idx]].to_numpy()
    fval = np.matrix.transpose(fval)
    fval = pd.DataFrame(fval,
                        index=fboth['feature'][idx],
                        columns=value.index.values)

    fboth = fboth >> left_join(fval.reset_index(inplace=False), by='feature')

    fboth = fboth >> mutate(x_='x') >> unite(
        'x', ['x_', 'x'], remove=False, sep='')
    fboth = fboth >> select(~X.x_)
    fboth = fboth >> unite('pos_id', ['x', 'y'], remove=True, sep='y')
    fboth = fboth >> unite('pos_id', ['pos_id', 'z'], remove=False, sep='z')
    fboth = fboth >> select(~X.z)

    ori_ontology = ontology

    def str_detect(string, pattern):
        match = []
        for i in string:
            match.append('ONT:' in i)
        return match

    while np.sum(str_detect(ontology['source'], 'ONT:')) > 0:

        data = ontology >> mask(X.relation == 'feature')
        for i in np.arange(ontology.shape[0]):
            if 'ONT:' in ontology['source'][i]:
                data2 = data >> mask(X.target == ontology['source'][i])
                if data2.shape[0] > 0:
                    data_ = pd.DataFrame.from_dict({
                        'source':
                        data2['source'],
                        'target':
                        ontology['target'][i],
                        'similarity':
                        ontology['similarity'][i],
                        'relation':
                        'feature'
                    })
                else:
                    data_ = ontology.iloc[i, :]
            else:
                data_ = ontology.iloc[i, :]

            if i == 0:
                data2 = data_
            else:
                data2 = data.append(data_)
        ontology = data2
    del data_, data, data2

    tick += 1
    pb.update(tick)  #4
    adata = fboth >> select(~X.feature)
    adata = adata.set_index('pos_id')
    adata = adata.fillna(0)

    pdata = value >> mutate(outcome=outcome.astype(int))
    pdata = pdata >> select(X.outcome, fmap.index.values.tolist())

    fdata = fboth >> select(X.pos_id, X.feature)
    fdata2 = ontology >> select(X.source, X.target)
    fdata2 = fdata2.drop_duplicates()
    fdata2 = fdata2 >> separate(X.target, ['t1', 't2'])
    fdata2 = fdata2 >> mutate(t1='ONT') >> unite('target', ['t1', 't2'],
                                                 sep='')
    fdata2 = fdata2 >> mutate(included=1) >> spread(X.target, X.included)
    fdata2 = fdata2 >> rename(feature=X.source)
    fdata = fdata >> left_join(fdata2, by='feature')
    del fdata2
    fdata = fdata.set_index('pos_id')

    tick += 1
    pb.update(tick)  #5
    ontomap = adata.transpose()
    for i in np.arange(ontomap.columns.values.shape[0]):
        dim = re.split('x|y|z', ontomap.columns.values[i])
        if i > 0:
            dim[1] = np.max([int(dim[1]), int(dim_[1])])
            dim[2] = np.max([int(dim[2]), int(dim_[2])])
            dim[3] = np.max([int(dim[3]), int(dim_[3])])
        dim_ = dim
    del dim_
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    tick += 1
    pb.update(tick)  #6
    ontotype = {}
    for i in np.arange(fdata.shape[1] - 1):
        data0 = fdata >> select(~X.feature)
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 var='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    data2a = fmap.reset_index(inplace=False)
    data2b = similarity[data2a['feature'].to_numpy().tolist()]
    data2b = data2b.reset_index(inplace=False)
    data2a = data2a >> rename(index=X.feature)
    similarity = data2a >> left_join(data2b, by='index')
    del data2a, data2b
    similarity = similarity >> select(~X.x, ~X.y, ~X.z)
    similarity = similarity.set_index('index')
    similarity.index.name = None

    tick += 1
    pb.update(tick)  #7
    adata.index.name = None
    fdata.index.name = None
    ori_ontology.index = pd.Index(np.arange(ori_ontology.shape[0]))
    output = ExpressionSet(assayData=adata.to_numpy(),
                           phenoData=AnnotatedDataFrame(pdata),
                           featureData=AnnotatedDataFrame(fdata),
                           experimentData=MIAME(
                               other={
                                   'similarity': similarity,
                                   'ontomap': ontomap,
                                   'ontotype': ontotype,
                                   'ontology': ori_ontology
                               }))

    tick += 1
    pb.update(tick)  #8
    return output
Exemple #5
0
    def ontology_df(hierarchy, value):
        def linkage_matrix(hierarchy):
            counts = np.zeros(hierarchy.children_.shape[0])
            n_samples = len(hierarchy.labels_)
            for i, merge in enumerate(hierarchy.children_):
                current_count = 0
                for child_idx in merge:
                    if child_idx < n_samples:
                        current_count += 1
                    else:
                        current_count += counts[child_idx - n_samples]
                counts[i] = current_count
            l = [hierarchy.children_, hierarchy.distances_, counts]
            return np.column_stack(l).astype(float)

        labels = value.columns.values[hierarchy.labels_]
        linkage = linkage_matrix(hierarchy)
        tree = dendrogram(linkage)

        A = pd.DataFrame(labels, columns=['A'])
        A = A >> bind_cols(pd.DataFrame(tree['leaves'], columns=['i']))

        B = pd.DataFrame(labels, columns=['B'])
        B = B >> bind_cols(pd.DataFrame(tree['leaves'], columns=['i2']))

        linkages = pd.DataFrame(linkage,
                                columns=['i', 'i2', 'similarity', 'count'])

        ontology = linkages >> left_join(A, by='i')
        ontology = ontology >> left_join(B, by='i2')
        ontology = ontology >> mutate(similarity=1 - X.similarity)
        ontology = ontology >> mutate(
            target=['ONT:' + str(i + 1) for i in range(ontology.shape[0])])

        ontology = ontology >> mutate(i=X.i - ontology.shape[0])
        ontology = ontology >> mutate(i2=X.i2 - ontology.shape[0])

        A = ontology['i'].values.astype(int)
        A1 = ontology['A'].values
        A2 = ['ONT:' + str(i) for i in A]
        ontology = ontology >> mutate(A=np.where(A <= 0, A1, A2))

        B = ontology['i2'].values.astype(int)
        B1 = ontology['B'].values
        B2 = ['ONT:' + str(i) for i in B]
        ontology = ontology >> mutate(B=np.where(B <= 0, B1, B2))

        ontology = pd.melt(ontology,
                           id_vars=['similarity', 'target', 'i', 'i2'],
                           value_vars=['A', 'B'],
                           var_name='key',
                           value_name='source')

        C = np.where(ontology['key'] == 'A', ontology['i'], ontology['i2'])
        C = np.where(C <= 0, 'feature', 'is_a')

        ontology = ontology >> mutate(relation=C)
        ontology = ontology >> select(X.source, X.target, X.similarity,
                                      X.relation)

        ontology = ontology >> arrange(1 - X.similarity, X.relation)

        return ontology