def test_select():
    """Every selector form accepted by ``select`` must yield carat, cut, price."""
    expected = diamonds[['carat', 'cut', 'price']]
    selectors = (
        # plain column labels
        select('carat', 'cut', 'price'),
        # integer positions
        select(0, 1, 6),
        # positions mixed with a label
        select(0, 1, 'price'),
        # a list argument next to a symbolic column
        select([0, X.cut], X.price),
        # attribute-style and item-style symbolic columns
        select(X.carat, X['cut'], X.price),
        # symbolic multi-column lookup
        select(X[['carat', 'cut', 'price']]),
        # symbolic multi-column lookup plus a single symbolic column
        select(X[['carat', 'cut']], X.price),
        # symbolic positional indexer
        select(X.iloc[:, [0, 1, 6]]),
        # symbolic label indexer wrapped in a list
        select([X.loc[:, ['carat', 'cut', 'price']]]),
    )
    for selector in selectors:
        assert expected.equals(diamonds >> selector)
def test_cummin():
    """Check ``cummin`` on the first five diamonds, ungrouped and grouped by cut."""
    sample = diamonds.copy() >> head(5) >> select(X.cut, X.x)

    # Ungrouped running minimum of x.
    ungrouped = sample >> mutate(cm=cummin(X.x))
    # ``expected`` is the very same object as ``sample``: the column written
    # below is therefore also present on ``sample`` for the grouped run.
    expected = sample
    expected['cm'] = pd.Series([3.95, 3.89, 3.89, 3.89, 3.89])
    assert ungrouped.equals(expected)

    # Running minimum of x within each cut.
    grouped = sample >> groupby(X.cut) >> mutate(cm=cummin(X.x))
    expected['cm'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
    assert grouped.equals(expected)
def test_cumsum():
    """Check ``cumsum`` on the first five diamonds, ungrouped and grouped by cut."""
    sample = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    # Alias, not a copy: writing the expected column also updates ``sample``.
    expected = sample

    scenarios = (
        # running total of x over the whole frame
        (lambda frame: frame >> mutate(cs=cumsum(X.x)),
         [3.95, 7.84, 11.89, 16.09, 20.43]),
        # running total of x within each cut
        (lambda frame: frame >> groupby(X.cut) >> mutate(cs=cumsum(X.x)),
         [3.95, 3.89, 4.05, 8.09, 8.39]),
    )
    for run, values in scenarios:
        result = run(sample)
        expected['cs'] = pd.Series(values)
        assert result.equals(expected)
def test_cummean():
    """Check ``cummean`` on the first five diamonds, ungrouped and grouped by cut."""
    sample = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    # Alias, not a copy: writing the expected column also updates ``sample``.
    expected = sample

    scenarios = (
        # running mean of x over the whole frame
        (lambda frame: frame >> mutate(cm=cummean(X.x)),
         [3.950000, 3.920000, 3.963333, 4.022500, 4.086000]),
        # running mean of x within each cut
        (lambda frame: frame >> groupby(X.cut) >> mutate(cm=cummean(X.x)),
         [3.950, 3.890, 4.050, 4.045, 4.195]),
    )
    for run, values in scenarios:
        result = run(sample)
        expected['cm'] = pd.Series(values)
        assert result.equals(expected)
def test_cumprod():
    """Check ``cumprod`` on the first five diamonds, ungrouped and grouped by cut.

    The grouped result is compared with a tolerance because the products
    are not exactly representable in floating point.
    """
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_cp = df >> mutate(cp=cumprod(X.x))
    # ``df_truth`` aliases ``df``; the expected column is written onto both.
    df_truth = df
    df_truth['cp'] = pd.Series([3.950000, 15.365500, 62.230275, 261.367155, 1134.333453])
    assert df_cp.equals(df_truth)

    df_cp = df >> groupby(X.cut) >> mutate(cp=cumprod(X.x))
    df_truth['cp'] = pd.Series([3.950, 3.890, 4.050, 16.338, 17.577])
    # some tricky floating point stuff going on here
    diffs = df_cp.cp - df_truth.cp
    # Compare magnitudes: a one-sided ``diffs < tol`` would accept any
    # arbitrarily large negative error.
    assert all(abs(diffs) < .0000001)
def test_min_rank():
    """Check ``min_rank`` for numeric, categorical, grouped and descending input."""
    sample = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    # Alias, not a copy: writing the expected column also updates ``sample``.
    expected = sample

    scenarios = (
        # rank of x over the whole frame
        (lambda frame: frame >> mutate(mr=min_rank(X.x)),
         [2.0, 1.0, 3.0, 4.0, 5.0]),
        # rank of the cut column (ties share the lowest rank)
        (lambda frame: frame >> mutate(mr=min_rank(X.cut)),
         [3.0, 4.0, 1.0, 4.0, 1.0]),
        # rank of x within each cut
        (lambda frame: frame >> groupby(X.cut) >> mutate(mr=min_rank(X.x)),
         [1.0, 1.0, 1.0, 2.0, 2.0]),
        # rank of x in descending order
        (lambda frame: frame >> mutate(mr=min_rank(X.x, ascending=False)),
         [4.0, 5.0, 3.0, 2.0, 1.0]),
    )
    for run, values in scenarios:
        result = run(sample)
        expected['mr'] = pd.Series(values)
        assert result.equals(expected)
def exec(self):
    """Reshape the sensor CSV into a long (DateTime, y, x, S1) table and save it.

    Reads the first file matching ``<inpPath>/<serviceName>/1.csv`` (skipping
    the 16 leading rows), keeps the DateTime column and three groups of
    latitude / longitude / S1 columns, drops every row that holds a zero or a
    missing value in any kept column, stacks the three groups under the
    common names ``DateTime, y, x, S1``, sorts by ``DateTime`` and writes the
    result to ``<outPath>/<serviceName>_2021_nagano_S1_01_raw.csv``.

    Raises:
        Exception: when no input file matches the pattern. Any other error
            is logged and re-raised unchanged.
    """
    try:
        log.info('[START] {}'.format("exec"))

        fileInfoPattrn = '{}/{}/{}'.format(globalVar['inpPath'], serviceName, '1.csv')
        fileInfo = glob.glob(fileInfoPattrn)
        if (len(fileInfo) < 1):
            raise Exception("[ERROR] fileInfo : {} : {}".format("자료를 확인해주세요.", fileInfoPattrn))

        data = pd.read_csv(fileInfo[0], skiprows = 16)

        # Each group is (timestamp, latitude, longitude, S1 reading).
        sensorGroups = [
            ['DateTime', 'Latitude', 'Longitude', 'Cropspec Root S1'],
            ['DateTime', 'L Sensing Latitude', 'L Sensing Longitude', 'Sensor L S1'],
            ['DateTime', 'R Sensing Latitude', 'R Sensing Longitude', 'Sensor R S1'],
        ]
        pointCols = ['DateTime', 'y', 'x', 'S1']

        # DateTime once, followed by the value columns of every group.
        selectCols = ['DateTime'] + [col for group in sensorGroups for col in group[1:]]
        dataL1 = data >> dfply.select(*[dfply.X[col] for col in selectCols])

        # Zeros are treated as missing; a row is dropped if any column is missing.
        dataL2 = dataL1.replace(0, np.nan).dropna(axis = 0)

        # Rename on a copy instead of ``set_axis(..., inplace=False)``: the
        # ``inplace`` keyword no longer exists in pandas 2.x.
        parts = []
        for group in sensorGroups:
            part = dataL2[group].copy()
            part.columns = pointCols
            parts.append(part)
        dataL3 = pd.concat(parts, axis = 0)

        dataL4 = dataL3.sort_values(by=['DateTime'], axis=0)

        saveFile = '{}/{}_{}'.format(globalVar['outPath'], serviceName, '2021_nagano_S1_01_raw.csv')
        log.info('[CHECK] saveFile : {}'.format(saveFile))
        dataL4.to_csv(saveFile, index=False)

    except Exception as e:
        log.error("Exception : {}".format(e))
        raise
    finally:
        log.info('[END] {}'.format("exec"))
def exec(self):
    """Merge article rows with their replies and write one workbook per sheet.

    Loads ``reply.csv`` and, for each sheet named in ``sheetList`` of
    ``CONTENT_RESULT.xlsx``, keeps the rows whose ``flag`` is ``'content'``.
    For every such article the matching article row (with its cafe URL)
    and the matching reply rows are stacked, and the result is written to
    ``<outPath>/<serviceName>_키워드_<sheet>.xlsx``.

    Raises:
        Exception: when either input file is missing. Any other error is
            logged and re-raised unchanged.
    """
    try:
        log.info('[START] {}'.format("exec"))

        fileInfo1 = glob.glob('{}/{}'.format(globalVar['inpPath'], '/LSH0183/result/reply.csv'))
        if (len(fileInfo1) < 1):
            raise Exception("[ERROR] fileInfo1 : {}".format("자료를 확인해주세요."))

        # Give replies the same columns as article rows.
        replyData = (
            pd.read_csv(fileInfo1[0])
            >> dfply.mutate(title='', view='', content=dfply.X.reply, flag='reply')
            >> dfply.select(dfply.X.idx_no, dfply.X.title, dfply.X.content,
                            dfply.X.nick, dfply.X.date, dfply.X.view,
                            dfply.X.flag, dfply.X.thread)
        )

        contentInfo = glob.glob('{}/{}'.format(
            globalVar['inpPath'], '/LSH0183/INPUT/CONTENT_RESULT.xlsx'))
        if (len(contentInfo) < 1):
            raise Exception(
                "[ERROR] contentInfo : {}".format("자료를 확인해주세요."))

        sheetList = ['황반변성', '비오뷰', '루센티스', '아일리아', '아바스틴']
        urlTemplate = "https://cafe.naver.com/maculardegeneration?iframe_url_utf8=%2FArticleRead.nhn%253Fclubid%3D21788988%2526page%3D1%2526boardtype%3DL%2526articleid%3D{}%2526referrerAllArticles%3Dtrue"

        for sheetInfo in sheetList:
            log.info('[CHECK] sheetInfo : {}'.format(sheetInfo))

            keyData = (
                pd.read_excel(contentInfo[0], sheet_name=sheetInfo)
                >> dfply.filter_by(dfply.X.flag == 'content')
                >> dfply.mutate(thread='')
            )

            frames = []
            # Iterate over the values themselves: after filtering, the row
            # labels are not guaranteed to be 0..n-1, so looking rows up by
            # a positional counter could hit the wrong row or raise KeyError.
            for idxNo in keyData['idx_no'].tolist():
                keyDataL1 = (
                    keyData
                    # ``!= None`` is intentional: the comparison is built
                    # lazily on the symbolic column, ``is not`` would not be.
                    >> dfply.filter_by(dfply.X.idx_no == idxNo, dfply.X.view != None)
                    >> dfply.mutate(url=urlTemplate.format(idxNo))
                )
                replyDataL1 = (
                    replyData
                    >> dfply.filter_by(dfply.X.idx_no == idxNo, dfply.X.thread != '')
                    >> dfply.mutate(url='')
                )
                # Collect the pieces; they are stacked row-wise once below.
                frames.extend([keyDataL1, replyDataL1])

            # One concat per sheet instead of re-copying the growing frame
            # on every article; an empty sheet still yields an empty frame.
            data = pd.concat(frames, axis=0) if frames else pd.DataFrame()

            saveFile = '{}/{}_키워드_{}.xlsx'.format(globalVar['outPath'], serviceName, sheetInfo)
            log.info('[CHECK] saveFile : {}'.format(saveFile))
            data.to_excel(saveFile, index=False)

    except Exception as e:
        log.error("Exception : {}".format(e))
        raise
    finally:
        log.info('[END] {}'.format("exec"))
def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
    """Assign every feature an integer (x, y, z) grid position.

    The first two mapping columns are spread evenly over ``1..dims``,
    rotated by ``angle`` (via ``rotate_2_col_mat``) and spread over
    ``1..dims`` again to obtain ``x`` and ``y``; ``z`` numbers the features
    that share one (x, y) cell, ordered by the third mapping column.

    :param mapping: data frame whose first three columns are the feature
        coordinates; rows are taken in the order of ``similarity.index``.
    :param similarity: data frame whose index holds the feature names.
    :param angle: rotation angle handed to ``rotate_2_col_mat``.
    :param ranked: when truthy, the first two coordinates are replaced by
        their 1-based rank before being placed on the grid.
    :param dims: upper bound of the grid coordinates along x and y.
    :return: data frame indexed by feature with columns ``x``, ``y``, ``z``,
        sorted by ``z``, ``y``, ``x``.
    """
    data = pd.DataFrame.from_dict({
        'feature': similarity.index.values.tolist(),
        'dim1': mapping.iloc[:, 0].tolist(),
        'dim2': mapping.iloc[:, 1].tolist(),
        'dim3': mapping.iloc[:, 2].tolist()
    })
    if ranked:
        # Replace raw coordinates by their rank order.
        data = data >> arrange(
            X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
        data = data >> arrange(
            X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

    # First pass: spread dim1 / dim2 evenly over 1..dims.
    # ``np.round`` replaces the alias ``np.round_`` removed in NumPy 2.0.
    data = data >> arrange(X.dim1)
    data = data >> mutate(
        resize_x=np.round(np.linspace(1, dims, data.shape[0])))
    data.resize_x = data.resize_x.astype(int)
    data = data >> arrange(X.dim2)
    data = data >> mutate(
        resize_y=np.round(np.linspace(1, dims, data.shape[0])))
    data.resize_y = data.resize_y.astype(int)

    # Rotate the resized coordinates.
    rotated = pd.DataFrame.from_dict({
        'rot_x': data.resize_x,
        'rot_y': data.resize_y
    })
    rotated = rotate_2_col_mat(rotated, angle)
    data = data >> bind_cols(rotated)
    del rotated

    # Second pass: spread the rotated coordinates over 1..dims again.
    data = data >> arrange(X.rot_x)
    data = data >> mutate(x=np.round(np.linspace(1, dims, data.shape[0])))
    data.x = data.x.astype(int)
    data = data >> arrange(X.rot_y)
    data = data >> mutate(y=np.round(np.linspace(1, dims, data.shape[0])))
    data.y = data.y.astype(int)
    data = data >> arrange(X.dim3)

    # Number the features that fall into the same (x, y) cell.
    cells = (data >> select(X.x, X.y)).drop_duplicates()
    stacked = None
    for i in np.arange(cells.shape[0]):
        # NOTE(review): the cell is looked up in ``data`` by label ``i``,
        # not in the de-duplicated ``cells`` frame; this looks like it may
        # visit some cells twice and skip others. Kept as is; confirm intent.
        cell = data >> mask(X.x == data.x[i])
        cell = cell >> mask(X.y == data.y[i])
        cell = cell >> mutate(z=(np.arange(cell.shape[0]) + 1).tolist())
        if i == 0:
            stacked = cell
        else:
            # ``pd.concat`` replaces ``DataFrame.append`` (removed in pandas 2.0).
            stacked = pd.concat([stacked, cell])
    data = stacked
    del cells, stacked

    # Restore the feature order of ``similarity``.
    names = similarity.index.values
    placed = data >> mask(data.feature.isin(names))
    ordered = pd.DataFrame.from_dict({'feature': names})
    ordered = ordered >> mask(
        ordered.feature.isin(placed['feature'].to_numpy()))
    data = ordered >> left_join(placed, by='feature')
    del names, placed, ordered

    data = data.set_index('feature')
    data = data >> select(X.x, X.y, X.z)
    data = data >> arrange(X.z, X.y, X.x)
    return data
def read(path):
    """
    Read a .ts.tar.gz file to a TidySet

    This function reads multiple files archived by tar with gzip
    compression to a TidySet. The archive is extracted into a directory
    named after the archive (suffix removed); that directory is deleted
    again before returning, also when reading fails.

    :param path: A character of .ts.tar.gz file path (include file extension).
    :return: output A TidySet, an ExpressionSet with three tables. Function
        of write_ts_tar_gz can write this file from the TidySet.
    :raises FileExistsError: if the extraction directory already exists.
    """
    filename = path
    # Escape the dots and anchor at the end so that only a genuine
    # ``.ts.tar.gz`` suffix is removed from the archive name.
    path = re.sub(r'\.ts\.tar\.gz$', '', filename)
    os.mkdir(path)

    def split_field(key):
        # Whitespace-separated values of one section of others.txt.
        return re.split('\\s', others[key])

    def column_dtypes(names, classes):
        # Map the class labels stored in the archive to pandas dtypes.
        known = {'numeric': 'float64', 'integer': 'int64', 'factor': 'category'}
        return {name: known.get(cls, 'object')
                for name, cls in zip(names, classes)}

    def na_to_nan(values):
        # The archive spells missing descriptions as the string 'NA'.
        return [np.nan if value == 'NA' else value for value in values]

    try:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with tarfile.open(filename) as tar:
            tar.extractall(path)

        with open(path + '/others.txt', 'r') as f:
            other = re.split('\n', f.read())

        # others.txt is a list of sections: a header line starting with
        # '>>' followed by the lines of its value, which are joined by a
        # single space.
        headers = [k for k, line in enumerate(other) if re.search('^>>', line)]
        others = dict()
        for n, start in enumerate(headers):
            stop = headers[n + 1] if n < (len(headers) - 1) else len(other)
            others[re.sub('>>', '', other[start])] = ' '.join(other[start + 1:stop])

        # Expression table.
        adata = pd.read_csv(path + '/exprs.csv', names=split_field('sampleNames'))
        adata.index = split_field('featureNames')

        # Phenotype table and its column descriptions.
        pdata_names = split_field('varLabels')
        pdata = pd.read_csv(path + '/pData.csv',
                            names=pdata_names,
                            dtype=column_dtypes(pdata_names, split_field('varClass')))
        pdata.index = split_field('sampleNames')
        pmetadata = pd.DataFrame(na_to_nan(split_field('varMetadata')),
                                 index=split_field('varLabels'),
                                 columns=['labelDescription'])
        pdata = AnnotatedDataFrame(pdata, pmetadata)

        # Feature table.
        fdata_names = split_field('fvarLabels')
        fdata = pd.read_csv(path + '/fData.csv',
                            names=fdata_names,
                            dtype=column_dtypes(fdata_names, split_field('fvarClass')))
        fdata.index = split_field('featureNames')
        fdata.index.name = 'pos_id'

        # Similarity matrix.
        sim_names = split_field('simNames')
        similarity = pd.read_csv(path + '/similarity.csv',
                                 names=sim_names,
                                 dtype={name: 'float64' for name in sim_names})
        similarity.index = sim_names

        # Ontomap: reshape the expression values using the largest x, y and
        # z found in the position identifiers (e.g. 'x1y2z3').
        ontomap = adata.transpose()
        dim = None
        for pos_id in ontomap.columns.values:
            parts = [int(part) for part in re.split('x|y|z', pos_id)[1:4]]
            if dim is None:
                dim = parts
            else:
                dim = [max(seen, new) for seen, new in zip(dim, parts)]
        max_x, max_y, max_z = dim
        ontomap = ontomap.to_numpy()
        ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
        ontomap = np.array(ontomap)
        ontomap = ontomap.reshape(adata.shape[1], max_y, max_x, max_z)

        # Ontotype: per ontology column, the positions of its member features.
        ontotype = {}
        data0 = fdata >> select(~X.feature)
        for i in np.arange(fdata.shape[1] - 1):
            data = data0.iloc[:, i]
            data = data.reset_index(inplace=False)
            data = data.rename(columns={data.columns.values[1]: 'ontotype'})
            data = data >> mask(X.ontotype == 1)
            data = data >> left_join(fdata.reset_index(inplace=False),
                                     var='pos_id')
            data = data >> select(X.pos_id, X.feature)
            data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'],
                                     sep='x|y|z')
            data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')
            ontotype[data0.columns.values[i]] = data2

        # 'root' lists every position that carries a feature.
        ontotype['root'] = fdata.reset_index(inplace=False) >> select(
            X.pos_id, X.feature)
        ontotype['root'] = ontotype['root'] >> mutate(
            f_str=X.feature.astype(str))
        ontotype['root'] = ontotype['root'] >> mask(
            X.f_str != 'nan') >> select(~X.f_str)
        ontotype['root'] = ontotype['root'] >> separate(
            X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                             'z']].set_index('feature')

        fmetadata = pd.DataFrame(na_to_nan(split_field('fvarMetadata')),
                                 index=split_field('fvarLabels'),
                                 columns=['labelDescription'])
        fdata.index.name = None
        fdata = AnnotatedDataFrame(fdata, fmetadata)

        # Ontology table.
        ontology_names = split_field('ontoNames')
        ontology = pd.read_csv(path + '/ontology.csv',
                               names=ontology_names,
                               dtype=column_dtypes(ontology_names,
                                                   split_field('ontoClass')))

        xData = MIAME(name=others['name'],
                      lab=others['lab'],
                      contact=others['contact'],
                      title=others['title'],
                      abstract=others['abstract'],
                      url=others['url'],
                      pubMedIds=others['pubMedIds'],
                      other={
                          'similarity': similarity,
                          'ontomap': ontomap,
                          'ontotype': ontotype,
                          'ontology': ontology
                      })
    finally:
        # Always remove the extraction directory, so that a failed read
        # does not make the next call fail in os.mkdir().
        for i in os.listdir(path):
            os.remove(path + '/' + i)
        os.rmdir(path)

    # An annotation that starts with spaces is treated as empty.
    if re.match('^( +)', others['annotation']):
        annot = ''
    else:
        annot = re.sub(' +', ' ', others['annotation'])

    eset = ExpressionSet(assayData=adata.to_numpy(),
                         phenoData=pdata,
                         featureData=fdata,
                         experimentData=xData,
                         annotation=annot)
    return eset
def compile(value,
            outcome,
            similarity,
            mapping,
            ontology,
            ranked=True,
            dims=7,
            decreasing=False,
            seed_num=33):
    """
    Make a TidySet for visible neural network (VNN) modeling

    This function creates a TidySet, an ExpressionSet class to orchestrate
    five data into a single set of three tables.

    :param value: Instance-feature value, a pandas data frame with rows for
        instances and columns for features. All rows in value should have
        names. All values should be floating numbers.
    :param outcome: Outcome, a single-column pandas data frame of binary
        integers with the same rows as the instances. The row numbers and
        the order of outcome should be the same with those of value. Value
        of 0 and 1 should refer to non-event and event outcome, respectively.
    :param similarity: Feature similarity, a square pandas data frame of
        floating numbers containing feature-feature similarity measures.
    :param mapping: Feature three-dimensional mapping, a pandas data frame
        of floating numbers with rows for features and three columns for
        three dimensions where the features are mapped onto.
    :param ontology: Ontology, a pandas data frame with rows for ontologies
        and four columns for source, target, similarity, and relation.
        Feature (source)-ontology (target) relation should be annotated as
        'feature', while ontology-ontology relation should be annotated as
        'is_a'. To differentiate between feature and ontology names, a
        prefix of 'ONT:' precedes an ontology name. All columns except
        similarity in ontology should be strings. Similarity (a floating
        number) is a minimum threshold by which either features or
        ontologies (source) belong to an ontology (target).
    :param ranked: When truthy, create_fmap replaces the first two mapping
        dimensions by their rank order before placing them on the grid.
    :param dims: Upper bound of the grid coordinates that create_fmap
        spreads features over along x and y.
    :param decreasing: Forwarded to order_angle_by_channel; the candidate
        angles are sorted by channel count ascending when this is False.
    :param seed_num: Seed passed to np.random.seed before the angle is drawn.
    :return: output TidySet, an ExpressionSet with three tables.
        Instance-feature value and outcome pandas data frame are compiled
        as a phenotype pandas data frame with rows for instances and
        columns for features and outcome. Instance-feature value and
        feature three-dimensional mapping pandas data frame are compiled as
        an expression two-dimensional array with rows for positions of
        features and columns for instances. The mapping, similarity, and
        ontology pandas data frame are compiled as a feature pandas data
        frame with rows for positions of features and columns for feature
        names and ontological relations. For easier access, the similarity
        pandas data frame, ontomap four-dimensional numpy array, ontotype
        dictionary of pandas data frame, and ontology pandas data frame are
        included in experiment notes that can be called using function of
        notes.
    """
    # Progress is reported in eight steps.
    pb = ProgressBar(8)
    tick = 0
    pb.start()

    # Unused leftover, kept as found:
    # Leibniz formula for pi
    # https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80
    # pi=1
    # for i in range(1,int(10e+6)):
    #   pi+=((-1)**i)*(1/(2*i+1))
    # pi=pi*4
    tick += 1
    pb.update(tick)  # step 1

    def rotate_2_col_mat(X, angle):
        # Multiply a two-column frame by the 2x2 rotation matrix built from
        # the negated angle (degrees converted to radians); index and column
        # names are preserved. Note that the parameter X shadows the outer
        # name X inside this helper.
        angle = (math.pi / 180 * angle) * -1
        M = np.array([
            math.cos(angle),
            math.sin(angle), -math.sin(angle),
            math.cos(angle)
        ])
        M = M.reshape(2, 2)
        M = np.dot(X.to_numpy(), M)
        M = pd.DataFrame(M,
                         index=X.index.values.tolist(),
                         columns=X.columns.values.tolist())
        return M

    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        # Assign every feature an integer (x, y, z) position: dim1/dim2 are
        # spread over 1..dims, rotated by `angle`, spread again to give x
        # and y, and z numbers the features sharing one (x, y) cell.
        data = pd.DataFrame.from_dict({
            'feature': similarity.index.values.tolist(),
            'dim1': mapping.iloc[:, 0].tolist(),
            'dim2': mapping.iloc[:, 1].tolist(),
            'dim3': mapping.iloc[:, 2].tolist()
        })
        if ranked:
            # Replace raw coordinates by their rank order.
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)
        # NOTE(review): np.round_ is an alias that newer NumPy releases no
        # longer provide; confirm against the pinned NumPy version.
        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)
        data2 = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        data2 = rotate_2_col_mat(data2, angle)
        data = data >> bind_cols(data2)
        del data2
        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round_(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)
        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round_(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)
        data = data >> arrange(X.dim3)
        # data2 is used as a scratch dict: 'Y' holds the distinct (x, y)
        # cells, 'X' a counter over them and 'Z' the full table.
        data2 = {}
        data2['X'] = data >> select(X.x, X.y)
        data2['Y'] = data2['X'].drop_duplicates()
        data2['X'] = np.arange(data2['Y'].shape[0])
        data2['Z'] = data
        for i in data2['X']:
            # NOTE(review): the cell is looked up in 'Z' by label i rather
            # than in the de-duplicated 'Y'; confirm this is intended.
            data2['result'] = data2['Z'] >> mask(X.x == data2['Z'].x[i])
            data2['result'] = data2['result'] >> mask(X.y == data2['Z'].y[i])
            data2['result.z'] = np.arange(data2['result'].shape[0]) + 1
            data2['result.z'] = data2['result.z'].tolist()
            data2['result'] = data2['result'] >> mutate(z=data2['result.z'])
            if i == 0:
                data2['results'] = data2['result']
            else:
                # NOTE(review): DataFrame.append is not available in
                # pandas 2.x; confirm against the pinned pandas version.
                data2['results'] = pd.DataFrame.append(data2['results'],
                                                       data2['result'])
        data = data2['results']
        del data2
        # Restore the feature order of `similarity`.
        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b
        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)
        return data

    def order_angle_by_channel(mapping,
                               similarity,
                               ranked=ranked,
                               dims=dims,
                               decreasing=False):
        # Build a feature map for every angle 1..360 and record the largest
        # z ("channel") each one needs, then sort the angles by that count.
        angles = np.arange(360) + 1
        for i in angles:
            if i == 1:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data = [np.max(data_['z'])]
            else:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data.append(np.max(data_['z']))
        data = pd.DataFrame.from_dict({
            'angle': angles,
            'channel': np.array(data)
        })
        data = data >> arrange(X.channel, ascending=decreasing == False)
        return data

    tick += 1
    pb.update(tick)  # step 2
    np.random.seed(seed_num)
    angle = order_angle_by_channel(mapping, similarity, ranked, dims,
                                   decreasing)
    # Keep the angles that need the fewest channels.
    angle = angle >> mask(X.channel == np.min(angle['channel']))
    angle = angle['angle'].values
    # NOTE(review): this draws a position in 0..len(angle)-1, not one of
    # the angle values themselves, and the drawn position (a length-1
    # array) is what gets passed on as the angle; confirm whether
    # angle[position] was intended.
    angle = np.random.choice(np.arange(angle.shape[0]).tolist(), 1, False)
    tick += 1
    pb.update(tick)  # step 3
    fmap = create_fmap(mapping, similarity, angle, ranked, dims)
    fval = value[fmap.index.values].to_numpy()
    fval = pd.DataFrame(fval,
                        index=value.index.values,
                        columns=value.columns.values)
    # Build the full grid of positions 1..max for x, y and z.
    fboth = fmap >> summarize_each([np.max], X.x, X.y, X.z)
    fboth = fboth.to_numpy()
    data = []
    for i in np.arange(fboth.shape[1]):
        data_ = np.arange(fboth[:, i]) + 1
        data.append(data_.tolist())
    del data_
    fboth = np.meshgrid(data[0], data[1], data[2])
    del data
    fboth = np.array(fboth).T.reshape(-1, 3)
    fboth = pd.DataFrame(fboth, columns=fmap.columns.values)
    fboth = fboth >> arrange(X.z, X.y, X.x)
    fboth = fboth >> left_join(fmap.reset_index(inplace=False),
                               by=['x', 'y', 'z'])
    # idx flags the grid positions that actually carry a feature.
    idx = []
    for i in fboth['feature'].values.tolist():
        idx.append(str(i) != 'nan')
    fval = fval[fboth['feature'][idx]].to_numpy()
    fval = np.matrix.transpose(fval)
    fval = pd.DataFrame(fval,
                        index=fboth['feature'][idx],
                        columns=value.index.values)
    fboth = fboth >> left_join(fval.reset_index(inplace=False), by='feature')
    # Assemble the position id by uniting the literal 'x' with the x, y and
    # z columns, using 'y' and 'z' as separators.
    fboth = fboth >> mutate(x_='x') >> unite(
        'x', ['x_', 'x'], remove=False, sep='')
    fboth = fboth >> select(~X.x_)
    fboth = fboth >> unite('pos_id', ['x', 'y'], remove=True, sep='y')
    fboth = fboth >> unite('pos_id', ['pos_id', 'z'], remove=False, sep='z')
    fboth = fboth >> select(~X.z)
    # Alias (not a copy) of the caller's ontology frame; its index is
    # overwritten further below.
    ori_ontology = ontology

    def str_detect(string, pattern):
        # Flags the entries containing 'ONT:'; the pattern argument is not
        # used by the body.
        match = []
        for i in string:
            match.append('ONT:' in i)
        return match

    # Repeats while any source is still an ontology name.
    # NOTE(review): inside the loop data2 serves both as the filter result
    # and as the accumulator, and the accumulation appends to `data` rather
    # than to `data2`, so only the last data_ survives on top of the
    # 'feature' rows. This looks unintended; confirm before changing.
    # DataFrame.append is also unavailable in pandas 2.x.
    while np.sum(str_detect(ontology['source'], 'ONT:')) > 0:
        data = ontology >> mask(X.relation == 'feature')
        for i in np.arange(ontology.shape[0]):
            if 'ONT:' in ontology['source'][i]:
                data2 = data >> mask(X.target == ontology['source'][i])
                if data2.shape[0] > 0:
                    data_ = pd.DataFrame.from_dict({
                        'source': data2['source'],
                        'target': ontology['target'][i],
                        'similarity': ontology['similarity'][i],
                        'relation': 'feature'
                    })
                else:
                    data_ = ontology.iloc[i, :]
            else:
                data_ = ontology.iloc[i, :]
            if i == 0:
                data2 = data_
            else:
                data2 = data.append(data_)
        ontology = data2
        del data_, data, data2
    tick += 1
    pb.update(tick)  # step 4
    # Expression table: positions x instances, empty positions filled with 0.
    adata = fboth >> select(~X.feature)
    adata = adata.set_index('pos_id')
    adata = adata.fillna(0)
    # Phenotype table: outcome followed by the mapped features.
    pdata = value >> mutate(outcome=outcome.astype(int))
    pdata = pdata >> select(X.outcome, fmap.index.values.tolist())
    # Feature table: position, feature name and one indicator column per
    # ontology target.
    fdata = fboth >> select(X.pos_id, X.feature)
    fdata2 = ontology >> select(X.source, X.target)
    fdata2 = fdata2.drop_duplicates()
    fdata2 = fdata2 >> separate(X.target, ['t1', 't2'])
    fdata2 = fdata2 >> mutate(t1='ONT') >> unite('target', ['t1', 't2'],
                                                 sep='')
    fdata2 = fdata2 >> mutate(included=1) >> spread(X.target, X.included)
    fdata2 = fdata2 >> rename(feature=X.source)
    fdata = fdata >> left_join(fdata2, by='feature')
    del fdata2
    fdata = fdata.set_index('pos_id')
    tick += 1
    pb.update(tick)  # step 5
    # Ontomap: reshape the expression values using the running maxima of
    # the numbers parsed out of the position ids.
    ontomap = adata.transpose()
    for i in np.arange(ontomap.columns.values.shape[0]):
        dim = re.split('x|y|z', ontomap.columns.values[i])
        if i > 0:
            dim[1] = np.max([int(dim[1]), int(dim_[1])])
            dim[2] = np.max([int(dim[2]), int(dim_[2])])
            dim[3] = np.max([int(dim[3]), int(dim_[3])])
        dim_ = dim
    del dim_
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])
    tick += 1
    pb.update(tick)  # step 6
    # Ontotype: for each ontology column, the positions of its features.
    ontotype = {}
    for i in np.arange(fdata.shape[1] - 1):
        data0 = fdata >> select(~X.feature)
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        # NOTE(review): the join key is given as `var=`, whereas the other
        # joins in this function use `by=`; confirm the keyword is honoured.
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 var='pos_id')
        data = data >> select(X.pos_id, X.feature)
        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')
        ontotype[data0.columns.values[i]] = data2
    # 'root' lists every position that carries a feature.
    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')
    # Re-order the similarity matrix to follow the feature map.
    data2a = fmap.reset_index(inplace=False)
    data2b = similarity[data2a['feature'].to_numpy().tolist()]
    data2b = data2b.reset_index(inplace=False)
    data2a = data2a >> rename(index=X.feature)
    similarity = data2a >> left_join(data2b, by='index')
    del data2a, data2b
    similarity = similarity >> select(~X.x, ~X.y, ~X.z)
    similarity = similarity.set_index('index')
    similarity.index.name = None
    tick += 1
    pb.update(tick)  # step 7
    adata.index.name = None
    fdata.index.name = None
    # Renumbers the index of the ontology frame that was passed in
    # (ori_ontology is the same object as the caller's argument).
    ori_ontology.index = pd.Index(np.arange(ori_ontology.shape[0]))
    output = ExpressionSet(assayData=adata.to_numpy(),
                           phenoData=AnnotatedDataFrame(pdata),
                           featureData=AnnotatedDataFrame(fdata),
                           experimentData=MIAME(
                               other={
                                   'similarity': similarity,
                                   'ontomap': ontomap,
                                   'ontotype': ontotype,
                                   'ontology': ori_ontology
                               }))
    tick += 1
    pb.update(tick)  # step 8
    return output
sns.boxplot(x="d2", y="val", data=dataL2)
# Save before showing: once show() returns the figure may already have been
# released, in which case savefig() would write an empty image.
plt.savefig(saveImg, dpi=600, bbox_inches='tight')
plt.show()

# Scatter plot of transaction amount against annual income
saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, '연소득당 거래금액 산점도')
makeScatterPlot(dataL2['meanCost'], dataL2['거래금액'], saveImg, 3500, 100000)

# *******************************************************
# Data analysis (applying data-analysis techniques)
# *******************************************************
# Regression analysis of the factors that determine housing prices
dataL4 = (
    dataL2
    >> dfply.select(dfply.X.건축년도, dfply.X.전용면적, dfply.X.층,
                    dfply.X.val2, dfply.X.d2, dfply.X.val)
    >> dfply.rename(면적당거래금액=dfply.X.val2, 연소득당거래금액=dfply.X.val)
)

# Pairwise relationships between the housing-price factors
saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName, '주택 가격 결정 요인을 위한 관계성')
sns.pairplot(dataL4)
# Same ordering as above: write the file first, then display.
plt.savefig(saveImg, dpi=600, bbox_inches='tight')
plt.show()

# +++++++++++++++++++++++++++++++++++++++++++++++
# All apartments
dataL5 = dataL4

# +++++++++++++++++++++++++++++++++++++++++++++++
# Multiple linear regression model on all variables
else: # can happen at the year beginning iso = "%04d-%02d-%02d" % (yy-1, int(mm), int(dd)) return iso sh = list() for d in os.listdir("data"): print(d) if os.path.isdir("data/%s" % d): f01 = "data/%s/chmi_manualmeasure.txt" % d if os.path.isfile(f01): d01 = pd.read_csv(f01, sep="|") y01 = d01 >> \ mutate(source = 'chmi_man', country = 'cz', date_valid = d) >> \ rename(snow = X.snowdepth_total) >> \ select(X.date_valid, X.source, X.country, X.station, X.snow) sh.append(y01) f02 = "data/%s/chmi_oah.txt" % d if os.path.isfile(f02): d02 = pd.read_csv(f02, sep="|") y02 = d02 >> \ mutate(source = 'chmi_oah', country = 'cz') y02['date_valid'] = [date_cz2iso(row['date']) for i, row in d02.iterrows()] y02 = y02 >> rename(snow = X.snowdepth_total) >> \ select(X.date_valid, X.source, X.country, X.station, X.snow) sh.append(y02) f03 = "data/%s/chmi_resorts.txt" % d if os.path.isfile(f03): d03 = pd.read_csv(f03, sep="|") y03 = d03 >> \ mutate(source = 'chmi_resorts', country = 'cz')
def ontology_df(hierarchy, value):
    """Turn a fitted hierarchical clustering into an ontology table.

    :param hierarchy: fitted clustering object exposing ``children_``,
        ``distances_`` and ``labels_``.
    :param value: data frame whose column names are indexed by
        ``hierarchy.labels_`` to name the leaves.
    :return: data frame with columns ``source``, ``target``, ``similarity``
        and ``relation`` ('feature' or 'is_a'), sorted by
        ``1 - similarity`` and then by ``relation``.
    """

    def linkage_matrix(hierarchy):
        # Columns: the two merged children, the merge distance and the
        # number of original samples below each merge.
        n_samples = len(hierarchy.labels_)
        counts = np.zeros(hierarchy.children_.shape[0])
        for row, children in enumerate(hierarchy.children_):
            size = 0
            for child in children:
                # Indices below n_samples are leaves; the others refer to
                # an earlier merge whose size is already known.
                size += 1 if child < n_samples else counts[child - n_samples]
            counts[row] = size
        columns = [hierarchy.children_, hierarchy.distances_, counts]
        return np.column_stack(columns).astype(float)

    labels = value.columns.values[hierarchy.labels_]
    linkage = linkage_matrix(hierarchy)
    tree = dendrogram(linkage)

    # Leaf names keyed by leaf index, once per side of a merge.
    left_names = pd.DataFrame(labels, columns=['A']) >> bind_cols(
        pd.DataFrame(tree['leaves'], columns=['i']))
    right_names = pd.DataFrame(labels, columns=['B']) >> bind_cols(
        pd.DataFrame(tree['leaves'], columns=['i2']))

    merges = pd.DataFrame(linkage,
                          columns=['i', 'i2', 'similarity', 'count'])
    ontology = merges >> left_join(left_names, by='i')
    ontology = ontology >> left_join(right_names, by='i2')
    ontology = ontology >> mutate(similarity=1 - X.similarity)

    # Every merge becomes an ontology term named ONT:1, ONT:2, ...
    n_terms = ontology.shape[0]
    ontology = ontology >> mutate(
        target=['ONT:' + str(k + 1) for k in range(n_terms)])
    # Shift the child indices so that values <= 0 denote leaves.
    ontology = ontology >> mutate(i=X.i - n_terms)
    ontology = ontology >> mutate(i2=X.i2 - n_terms)

    # A child is named after its feature when it is a leaf, otherwise
    # after the ontology term it refers to.
    left_idx = ontology['i'].values.astype(int)
    left_leaf = ontology['A'].values
    left_term = ['ONT:' + str(k) for k in left_idx]
    ontology = ontology >> mutate(
        A=np.where(left_idx <= 0, left_leaf, left_term))

    right_idx = ontology['i2'].values.astype(int)
    right_leaf = ontology['B'].values
    right_term = ['ONT:' + str(k) for k in right_idx]
    ontology = ontology >> mutate(
        B=np.where(right_idx <= 0, right_leaf, right_term))

    # One row per (child, parent) pair.
    ontology = pd.melt(ontology,
                       id_vars=['similarity', 'target', 'i', 'i2'],
                       value_vars=['A', 'B'],
                       var_name='key',
                       value_name='source')
    child_idx = np.where(ontology['key'] == 'A', ontology['i'],
                         ontology['i2'])
    relation = np.where(child_idx <= 0, 'feature', 'is_a')
    ontology = ontology >> mutate(relation=relation)
    ontology = ontology >> select(X.source, X.target, X.similarity,
                                  X.relation)
    ontology = ontology >> arrange(1 - X.similarity, X.relation)
    return ontology
# Load one order-log file; `i` and `file_names` come from code outside this
# fragment (presumably an enclosing loop over files -- TODO confirm).
df_org = pd.read_csv(file_names[i], header=0)
for j in np.arange(len(tickers)):
    # Rows of the current ticker only.
    df = df_org[df_org.SECCODE == tickers[j]]
    # NOTE(review): this filtered frame is overwritten by the next
    # statement without being used, so rows with PRICE == 0 still take part
    # in the aggregation below; confirm whether the pipeline was meant to
    # start from this frame.
    df0 = df.query('PRICE!=0')
    # Split VOLUME into one column per (ACTION, BUYSELL) combination:
    # s* for BUYSELL == "S", b* for BUYSELL == "B", suffix = ACTION code.
    # timeb / times hold NO - 1 for ACTION == 2 rows of each side, else 0.
    df0 = df >> pl.mutate(
        s1=np.where((df.ACTION == 1) & (df.BUYSELL == "S"), df.VOLUME, 0),
        s2=np.where((df.ACTION == 2) & (df.BUYSELL == "S"), df.VOLUME, 0),
        s0=np.where((df.ACTION == 0) & (df.BUYSELL == "S"), df.VOLUME, 0),
        b1=np.where((df.ACTION == 1) & (df.BUYSELL == "B"), df.VOLUME, 0),
        b2=np.where((df.ACTION == 2) & (df.BUYSELL == "B"), df.VOLUME, 0),
        b0=np.where((df.ACTION == 0) & (df.BUYSELL == "B"), df.VOLUME, 0),
        timeb=np.where(
            (df.ACTION == 2) & (df.BUYSELL == "B"), df.NO - 1, 0),
        times=np.where((df.ACTION == 2) & (df.BUYSELL == "S"), df.NO - 1,
                       0)) >> pl.select([
                           'PRICE', 'ORDERNO', 's1', 's2', 's0', 'b1', 'b2',
                           'b0', 'timeb', 'times'
                       ])
    # One row per (PRICE, ORDERNO): volumes are summed, the time markers
    # take their maximum.
    df0 = df0.groupby(['PRICE', 'ORDERNO']).aggregate({
        's1': np.sum,
        's2': np.sum,
        's0': np.sum,
        'b1': np.sum,
        'b2': np.sum,
        'b0': np.sum,
        'timeb': np.max,
        'times': np.max
    }).reset_index(level=["PRICE", "ORDERNO"])
    # Running maximum of PRICE over the ACTION == 2 / "S" rows, accumulated
    # from the last row backwards and then restored to index order.
    pricecum = df.query(
        '(ACTION==2)&(BUYSELL=="S")&(PRICE!=0)').sort_index(
            ascending=False).PRICE.cummax().sort_index()
    ind = pricecum.index.values
import pandas as pd
from pprint import pprint
import dfply as dpy
import nltk
import gensim
from sklearn.model_selection import train_test_split
import timeit
import matplotlib.pyplot as plt

# Load the reviews, keeping only the identifier and text columns
reviews = (
    pd.read_csv('C:/Users/straw/Desktop/stageM2/scripts-francesca/reviews.csv')
    >> dpy.select("Restaurant_ID", "Review_ID", "Review_TEXT")
)

# Restrict to restaurant 'FR0210153861525', then drop the restaurant column
reviews = (
    reviews[reviews['Restaurant_ID'] == 'FR0210153861525'].reset_index(
        level=0, drop=True)
    >> dpy.drop("Restaurant_ID")
)

# Use the review identifier as index instead of keeping it as a column
reviews.index = reviews['Review_ID']
reviews = reviews >> dpy.drop("Review_ID")

# Lower-case every word and collapse runs of whitespace to single spaces
reviews['Review_TEXT'] = reviews['Review_TEXT'].apply(
    lambda text: " ".join(word.lower() for word in text.split()))
reviews['Review_TEXT'].head()

# Tokenization function