def add_snv_boolean(
    df_input: DataFrame,
    column_sequence: str = "Sequence",
    column_aminoacid: str = "Aminoacid",
) -> DataFrame:
    """
    Add a column to the dataframe indicating whether each variant is a SNV.

    Parameters
    -----------
    df_input : pandas dataframe
        Dataframe containing DSM data. It is not modified.
    column_sequence : str, default "Sequence"
        Name of the column that contains the original amino acid.
    column_aminoacid : str, default "Aminoacid"
        Name of the column that contains the substitution.

    Returns
    --------
    DataFrame
        Copy of ``df_input`` with an extra ``"SNV?"`` column holding, for each
        row, the result of ``_aminoacids_snv`` (presumably True/False).
    """
    # Work on a copy so the caller's dataframe is left untouched, which is
    # what the documented contract ("returns copy") promises.
    df_output = df_input.copy()

    # Generate dictionary with aa and codon translation
    codon_table: Dict[str, List[str]] = _dict_codon_to_aa()

    # Evaluate every (original, substitution) pair row by row
    df_output["SNV?"] = df_output.apply(
        lambda row: _aminoacids_snv(
            row[column_sequence], row[column_aminoacid], codon_table),
        axis=1)
    return df_output
def apply(self, func, axis=0):
    """
    Apply ``func`` to the columns (Series) of this DataMatrix.

    The result is either a DataMatrix (if the function produces another
    series) or a Series indexed on the column names of the DataFrame (if
    the function produces a value).

    Parameters
    ----------
    func : function
        Function to apply to each column
    axis : int, default 0
        Forwarded to ``DataFrame.apply`` for non-ufunc functions.

    Examples
    --------

        >>> df.apply(numpy.sqrt) --> DataMatrix
        >>> df.apply(numpy.sum) --> Series

    N.B.: Do NOT use functions that might toy with the index.
    """
    # Nothing to transform when there are no columns.
    if len(self.cols()) == 0:
        return self

    # Anything that is not a NumPy ufunc takes the generic DataFrame path.
    if not isinstance(func, np.ufunc):
        return DataFrame.apply(self, func, axis=axis)

    # ufuncs are applied to the whole value block in a single call.
    transformed = func(self.values)
    return DataMatrix(data=transformed,
                      index=self.index,
                      columns=self.columns,
                      objects=self.objects)
def findOutliers(self, start=None, end=None, n_sigma=3, window=21, plot=True):
    """
    Flag returns that ``self.indentify_outliers`` marks as outliers.

    Parameters
    ----------
    start, end :
        Bounds passed to ``self.returns``. A falsy value falls back to the
        first / last label of ``self.stock_data.index``.
    n_sigma : int, default 3
        Not read anywhere in this method body.
    window : int, default 21
        Size of the rolling window used for the mean and std columns.
    plot : bool, default True
        When true, draw all returns as a line and the flagged ones as red
        points.

    Returns
    -------
    DataFrame
        The ``Return`` column restricted to rows whose ``outlier`` flag is 1.
    """
    first = start if start else self.stock_data.index[0]
    last = end if end else self.stock_data.index[-1]

    returns_frame = DataFrame(self.returns(start=first, end=last, add=False))

    # Rolling mean/std come back under a two-level header; drop the outer
    # level so the columns are simply "mean" and "std".
    rolling_stats = returns_frame[['Return']].rolling(window=window).agg(
        ['mean', 'std'])
    rolling_stats.columns = rolling_stats.columns.droplevel()

    scored = returns_frame.join(rolling_stats)
    scored['outlier'] = scored.apply(self.indentify_outliers, axis=1)
    flagged = scored.loc[scored['outlier'] == 1, ['Return']]

    if not plot:
        return flagged

    fig, ax = plt.subplots()
    ax.plot(scored.index, scored.Return, color='blue', label='Normal')
    ax.scatter(flagged.index, flagged.Return, color='red', label='Anomaly')
    ax.set_title("{}'s stock returns".format(self.symbol))
    ax.legend(loc='lower right')
    return flagged
def bingxing(filename):
    """
    Run OCR on a lab-report image and print a kidney-disease prediction.

    The image is sent to the Baidu iOCR custom-template endpoint, the
    recognised words are turned into a one-row table, the columns are renamed
    and the row is scored with the module-level ``model``.

    Parameters
    ----------
    filename : str
        Path of the image file to recognise.

    Returns
    -------
    None
        The prediction and the elapsed time are printed.

    Notes
    -----
    Relies on the module-level names ``rowname``, ``clf`` and ``model``
    (presumably the feature names and fitted estimators; verify against the
    rest of the file).
    """
    # time.clock() was removed in Python 3.8; perf_counter() is its
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()

    # NOTE(review): the client id/secret, API key and template signature are
    # hard-coded below. They should be loaded from configuration or the
    # environment and the committed values rotated.
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ppPj2gyINjoYiqkhsjAnyYDC&client_secret=2Q6tsZrbGsE60pXuoxg5o5AOUDCSMaLP'
    header = {'Content-Type': 'application/json; charset=UTF-8'}
    token_reply = json.loads(requests.post(host, headers=header).text)
    access_token = token_reply['access_token']

    # Read the image inside a context manager so the handle is always closed.
    with open(filename, 'rb') as image_file:
        encoded_image = base64.b64encode(image_file.read())

    data = {
        "image": encoded_image,
        "templateSign": "7dc32854acac2c3bac8d3bb599ceaeca"
    }
    ocr_host = 'https://aip.baidubce.com/rest/2.0/solution/v1/iocr/recognise?access_token=' + access_token
    ocr_header = {
        'Content-Type': 'application/x-www-form-urlencoded',
        "apikey": "ppPj2gyINjoYiqkhsjAnyYDC"
    }
    ocr_reply = json.loads(
        requests.post(ocr_host, headers=ocr_header, data=data).text)
    words = [item['word'] for item in ocr_reply["data"]["ret"]]

    # The recognised words alternate: even positions become the column
    # names, odd positions the values of the single row.
    testdata = DataFrame(words[1::2]).T
    testdata.columns = words[0::2]

    # Map the names found on the report to the names used by the model.
    testdata = testdata.rename(
        columns={
            '中性细胞比率': '中性粒细胞百分比',
            '淋巴细胞(%)': '淋巴细胞百分比',
            '嗜酸性粒细胞比': '嗜酸性粒细胞百分比',
            '中性细胞数': '中性粒细胞计数',
            '淋巴细胞值': '淋巴细胞数计数',
            '单核细胞百分比': '单核细胞',
            '嗜酸性粒细胞': '嗜酸性粒细胞计数',
            '嗜碱性粒细胞': '嗜碱性粒细胞计数',
            '红细胞平均体积': '平均红细胞体积',
            '平均血红蛋白量': '平均血红蛋白',
            '红细胞分布宽度': '红细胞分布宽度变异系数',
            '平均血小板体积': '血小板平均体积',
            '血小板分布宽度': '血小板平均分布宽度'
        })
    # NOTE(review): errors='ignore' is deprecated in recent pandas releases;
    # kept as-is so columns that cannot be parsed stay unchanged.
    testdata = testdata.apply(pd.to_numeric, errors='ignore')

    # Keep only the features whose importance in ``clf`` is at least 0.03.
    xtest = testdata[np.array(rowname)[clf.feature_importances_ >= 0.03]]
    prob = model.predict_proba(xtest).tolist()[0]
    if model.predict(xtest):
        print('该人得有肾病,概率为%f' % prob[1])
    else:
        print('该人未得肾病,概率为%f' % prob[0])

    end = time.perf_counter()
    print('运行时间为' + str(end - start) + '秒')
def create_dummy_variables(df: DataFrame, cat: str) -> None:
    """Expand categorical column ``cat`` into one 0/1 column per category.

    ``df`` is modified in place:

    * values of ``df[cat]`` that are ``'?'`` once stripped of whitespace are
      replaced by ``cat + '-unk'`` so they cannot collide later on;
    * one column named after each unique category is added, holding 1 where
      the row belongs to that category and 0 otherwise;
    * ``df[cat + '_vec']`` caches the dummy values of every row as a list,
      for fast vector lookup.
    """
    unknown_label = cat + '-unk'

    def _normalise(value):
        # Some sources use " ?" as a category; give it a per-column name.
        if value.strip() == '?':
            return unknown_label
        return value

    df[cat] = df[cat].apply(_normalise)

    categories = list(df[cat].unique())
    for category in categories:
        df[category] = df[cat].apply(
            lambda value, target=category: int(value == target))

    # Add a cache for fast vector lookup
    df[cat + '_vec'] = df.apply(lambda row: row[categories].tolist(), axis=1)
def _aa_to_codons_df(df_input: DataFrame, namecolumn: str) -> DataFrame:
    """
    Add a column with the codons returned by ``_aa_to_codons`` for each
    amino acid of a dataframe column.

    Parameters
    -----------
    df_input : pandas dataframe
        Input data. It is not modified.

    namecolumn : str
        Name of the column containing the amino acids.

    Returns
    --------
    Copy of ``df_input`` with an extra ``"Codons_" + namecolumn`` column
    holding the result of ``_aa_to_codons`` for every row.
    """
    codon_column = "Codons_" + namecolumn

    def _codons_for_row(row):
        return _aa_to_codons(row[namecolumn])

    # Leave the caller's dataframe untouched
    result = df_input.copy()
    result[codon_column] = result.apply(_codons_for_row, axis=1)
    return result
def _read_delimited_section(file_path, delimiter_pattern, section_index):
    """Split a text file on ``delimiter_pattern`` and parse one piece as CSV."""
    with open(file_path, 'r') as handle:
        raw = handle.read()
    sections = re.split(delimiter_pattern, raw)
    return pd.read_csv(StringIO(sections[section_index]))


def getfeature(mypath):
    """
    Collect mean/std statistics from every recognised file below ``mypath``.

    Each file is classified by the first tag of ``tag_order`` that occurs
    anywhere in its full path and is parsed with the reader for that tag.
    Frames of the same tag are concatenated, described with
    ``DataFrame.describe`` and the ``mean`` and ``std`` rows are accumulated.

    Parameters
    ----------
    mypath : str
        Root directory that is walked recursively.

    Returns
    -------
    dict
        ``{'mean': ..., 'std': ...}`` holding the transposed accumulation of
        the ``mean`` / ``std`` rows of every described group.

    Raises
    ------
    KeyError
        If a described group has no ``mean`` / ``std`` row (for example when
        it holds no numeric column).

    Notes
    -----
    Files that cannot be parsed are skipped after printing their position in
    the file list. FRQ, LOT and RET files are parsed but deliberately left
    out of the statistics. The elapsed time is printed before returning.
    """
    # Function-scope import: only ``walk`` is known to be imported at file
    # level. os.path.join replaces the hard-coded '\\' separator, which
    # produced unusable paths outside Windows.
    import os

    quoted_marker = '"<|\n"<|>"\n'
    bare_marker = '<|\n<|>\n'

    # Order matters: the first tag found as a substring of the path wins.
    tag_order = ('BLK', 'RET', 'PKG', 'LAS', 'FNL', 'BND', 'FRQ', 'LOG',
                 'QNT', 'LOT', 'XLS', 'CSV')
    frames_by_tag = {tag: [] for tag in tag_order}

    file_paths = [
        os.path.join(root, name)
        for root, _dirs, files in walk(mypath)
        for name in files
    ]

    t_start = time.time()
    for position, file_path in enumerate(file_paths):
        tag = next((t for t in tag_order if t in file_path), None)
        if tag is None:
            # Unrecognised file: nothing to parse.
            continue
        try:
            if tag in ('BLK', 'RET', 'PKG', 'LAS', 'FNL'):
                frame = _read_delimited_section(file_path, quoted_marker, 4)
            elif tag == 'BND':
                # NOTE(review): the original also ran a per-column
                # ``isinstance(x, str)`` strip here; apply() passes Series,
                # so it never stripped anything and has been dropped.
                frame = _read_delimited_section(file_path, quoted_marker, 4)
                frame = frame.replace('Null', np.nan)
                # convert_objects() no longer exists in pandas;
                # to_numeric(errors='coerce') is its replacement.
                numeric_cols = frame.columns[1:10]
                frame[numeric_cols] = frame[numeric_cols].apply(
                    pd.to_numeric, errors='coerce')
            elif tag == 'FRQ':
                with open(file_path, 'r') as handle:
                    raw = handle.read()
                frame = pd.DataFrame(
                    line.strip().split(',') for line in re.split('\n', raw))
            elif tag in ('LOG', 'QNT', 'LOT'):
                frame = pd.read_csv(file_path)
            elif tag == 'XLS':
                sheet = xlrd.open_workbook(file_path).sheet_by_index(0)
                # Row 11 holds the column titles; data starts at row 18.
                title = sheet.row_values(11)
                # A separate loop variable keeps the outer file position
                # intact (the original reused ``i`` here).
                rows = [
                    sheet.row_values(row_number)
                    for row_number in range(18, sheet.nrows)
                ]
                frame = DataFrame(rows)
                frame.columns = title
                # Same no-op strip as in the BND branch was dropped here.
                frame = frame.replace('', np.nan)
            else:  # 'CSV'
                frame = _read_delimited_section(file_path, bare_marker, 8)
        except Exception:
            # Best effort: report the position of the failing file and go on.
            # NOTE(review): the collapsed original does not show whether
            # this print ran only on errors or once per file; confirm.
            print(position)
            continue
        frames_by_tag[tag].append(frame)

    # FRQ, LOT and RET are excluded from the statistics.
    included_tags = ('BLK', 'PKG', 'LAS', 'FNL', 'BND', 'QNT', 'LOG', 'CSV',
                     'XLS')
    usefullist = []
    for tag in included_tags:
        group = frames_by_tag[tag]
        if len(group) > 1:
            usefullist.append(pd.concat(group))
        elif group:
            usefullist.append(group[0])

    summaries = []
    for df in usefullist:
        try:
            summaries.append(df.describe())
        except Exception:
            # Frames that cannot be described are left out of the result.
            continue

    # Pull the mean and std rows out of every summary.
    stats = {'mean': pd.DataFrame(), 'std': pd.DataFrame()}
    for summary in summaries:
        mean_row = summary.loc['mean']
        std_row = summary.loc['std']
        stats['mean'] = pd.concat([stats['mean'], mean_row])
        stats['std'] = pd.concat([stats['std'], std_row])
    stats['mean'] = stats['mean'].transpose()
    stats['std'] = stats['std'].transpose()

    print(time.time() - t_start)
    return stats
'嗜酸性粒细胞比': '嗜酸性粒细胞百分比', '中性细胞数': '中性粒细胞计数', '淋巴细胞值': '淋巴细胞数计数', '单核细胞百分比': '单核细胞', '嗜酸性粒细胞': '嗜酸性粒细胞计数', '嗜碱性粒细胞': '嗜碱性粒细胞计数', '红细胞平均体积': '平均红细胞体积', '平均血红蛋白量': '平均血红蛋白', '红细胞分布宽度': '红细胞分布宽度变异系数', '平均血小板体积': '血小板平均体积', '血小板分布宽度': '血小板平均分布宽度' }) # In[ ]: testdata = testdata.apply(pd.to_numeric, errors='ignore') # In[ ]: xtest = testdata[np.array(rowname)[clf.feature_importances_ >= 0.03]] # In[ ]: model.predict(xtest) # In[ ]: #从百度api调用ocr自定义模板识别 start = end = time.clock() host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ppPj2gyINjoYiqkhsjAnyYDC&client_secret=2Q6tsZrbGsE60pXuoxg5o5AOUDCSMaLP' header = {'Content-Type': 'application/json; charset=UTF-8'}
def __call__(self, *args, **kwargs):
    """Apply ``method`` to the wrapped object through ``DataFrame.apply``.

    Positional arguments are forwarded as ``args``. Keyword arguments are
    layered on top of ``decorator_kwargs``, so a value given at call time
    wins over the one captured from the enclosing scope.
    """
    merged_kwargs = dict(decorator_kwargs)
    merged_kwargs.update(kwargs)
    return DataFrame.apply(self._obj, method, args=args, **merged_kwargs)
def parse_csv(self):
    """
    Parse the survey export into one table of per-user, per-date records.

    Steps:

    1. Read ``self.in_filename`` and drop the first data row (a long string
       label row).
    2. Split the rows on ``QID20`` into "upon awakening" records and all
       other ("before bed") records.
    3. Build one frame per kind with the same column layout, leaving the
       columns that belong to the other kind blank.
    4. Merge both frames, sort by ``User`` and ``Date`` and store the result
       in ``self.survey_new_csv``; store the unique users in
       ``self.patient_ids``.
    5. Combine records sharing the same user/date/MULT key with
       ``self.combine_rows`` and append the combined records at the end.

    Returns
    -------
    None
        Results are stored on ``self``.

    Notes
    -----
    The cell converters (``to_ymdstr``, ``hour_to_mins``, ``process_smed``,
    ``process_awaken`` and so on) and the fill constants (``BLANK_E``,
    ``BLANK_NA``, ``MM_ZERO``) are defined elsewhere in the file.
    """
    awakening_label = '<strong>A. I want to record my sleep last night (please complete upon awakening).</strong>'

    def _duration_minutes(survey, hours_col, minutes_col):
        # Hours column (via hour_to_mins) plus minutes column (via
        # str_to_int); missing cells count as 0.
        return (survey[hours_col].fillna(0).apply(hour_to_mins) +
                survey[minutes_col].fillna(0).apply(str_to_int))

    def _clock_text(survey, hours_col, minutes_col):
        # "<hours>:<minutes>" text with the configured fill values.
        return (survey[hours_col].fillna(BLANK_NA) + ":" +
                survey[minutes_col].fillna(MM_ZERO))

    # Read the csv file, and skip the first row as it's a long string label
    survey_data = pd.read_csv(self.in_filename)[1:]

    # Before sleep survey data / upon awakening survey data
    bb_survey = survey_data.loc[survey_data['QID20'] != awakening_label]
    ab_survey = survey_data.loc[survey_data['QID20'] == awakening_label]

    # ---- Before sleep -----------------------------------------------------
    bb_df = DataFrame()
    bb_df['User'] = bb_survey['V3']
    bb_df['Date'] = bb_survey['V8'].apply(to_ymdstr)
    bb_df['Day'] = bb_survey['V8'].apply(find_weekday_ymdhms)
    # Create empty submission times first (fixes the column position),
    # fill it later
    bb_df['MULT'] = ''
    bb_df['NAPN'] = bb_survey['QID27'].fillna(BLANK_E)
    bb_df['NAPT'] = _duration_minutes(bb_survey, 'QID11#2_1', 'QID11#1_1')
    bb_df['ALN'] = bb_survey['QID15#3_1_1_TEXT'].fillna(BLANK_E)
    bb_df['ALT'] = _clock_text(bb_survey, 'QID15#2_1',
                               'QID15#1_1').apply(fill_for_hhmm)
    bb_df['CAFN'] = bb_survey['QID23#3_1_1_TEXT'].fillna(BLANK_E)
    bb_df['CAFT'] = _clock_text(bb_survey, 'QID23#2_1',
                                'QID23#1_1').apply(fill_for_hhmm)

    # Parse SMED: (name column, text source, hours source, minutes source)
    smed_sources = (
        ('SMED1', 'QID17#3_1_1_TEXT', 'QID17#2_1', 'QID17#1_1'),
        ('SMED2', 'QID17#3_2_1_TEXT', 'QID17#2_2', 'QID17#1_2'),
        ('SMED3', 'QID17#3_3_1_TEXT', 'QID17#2_3', 'QID17#1_3'),
    )
    smed_df = DataFrame()
    smed_df['SMED'] = bb_survey['QID18']
    for name, text_col, hours_col, minutes_col in smed_sources:
        smed_df[name] = bb_survey[text_col].fillna(BLANK_E)
        smed_df[name + 'T_HH'] = bb_survey[hours_col].fillna(BLANK_NA)
        smed_df[name + 'T_MM'] = bb_survey[minutes_col].fillna(MM_ZERO)
    smed_df = smed_df.apply(process_smed, axis=1)
    for column in ('SMED', 'SMED1', 'SMED1T', 'SMED2', 'SMED2T', 'SMED3',
                   'SMED3T'):
        bb_df[column] = smed_df[column]

    bb_df['NOTEBB'] = bb_survey['QID19'].fillna(BLANK_E)
    # Columns that only exist for awakening records stay blank here.
    for column in ('ATTEMPT', 'BT', 'LO', 'WT', 'RT', 'SOL', 'SNZ', 'TST',
                   'WASON', 'WASOT', 'EA', 'EAT', 'SQ', 'REST', 'NOTEWU',
                   'TIB', 'SE1', 'SE2'):
        bb_df[column] = ''

    # process MULT
    bb_df['MULT'] = self.process_mult(bb_df)

    # ---- Upon awakening ---------------------------------------------------
    ab_df = DataFrame()
    ab_df['User'] = ab_survey['V3']
    ab_df['Date'] = ab_survey['V8'].apply(reduce_one_day_ymdstr)
    ab_df['Day'] = ab_df['Date'].apply(find_weekday_ymd)
    # Submission times and the before-bed columns stay blank here.
    for column in ('MULT', 'NAPN', 'NAPT', 'ALN', 'ALT', 'CAFN', 'CAFT',
                   'SMED', 'SMED1', 'SMED1T', 'SMED2', 'SMED2T', 'SMED3',
                   'SMED3T', 'NOTEBB'):
        ab_df[column] = ''

    tmp_ab_df = DataFrame()
    tmp_ab_df['Date'] = ab_df['Date']
    tmp_ab_df['ATTEMPT'] = ab_survey['QID24'].fillna('Yes').apply(
        check_for_attempt)
    tmp_ab_df['BT'] = _clock_text(ab_survey, 'QID2#2_1', 'QID2#1_1')
    tmp_ab_df['LO'] = _clock_text(ab_survey, 'QID2#2_2', 'QID2#1_2')
    tmp_ab_df['WT'] = _clock_text(ab_survey, 'QID2#2_3', 'QID2#1_3')
    tmp_ab_df['RT'] = _clock_text(ab_survey, 'QID2#2_4', 'QID2#1_4')
    tmp_ab_df['SOL'] = _duration_minutes(ab_survey, 'QID3#2_1', 'QID3#1_1')
    tmp_ab_df['SNZ'] = _duration_minutes(ab_survey, 'QID3#2_2', 'QID3#1_2')
    tmp_ab_df['TST'] = _duration_minutes(ab_survey, 'QID3#2_3', 'QID3#1_3')
    tmp_ab_df['WASON'] = ab_survey['QID6#3_1_1_TEXT'].fillna(BLANK_E)
    tmp_ab_df['WASOT'] = _duration_minutes(ab_survey, 'QID6#2_1',
                                           'QID6#1_1')
    tmp_ab_df['EA'] = ab_survey['QID26'].fillna(BLANK_E)
    tmp_ab_df['EAT'] = _duration_minutes(ab_survey, 'QID7#2_1', 'QID7#1_1')
    tmp_ab_df['SQ'] = ab_survey['QID5'].apply(fill_for_rank)
    tmp_ab_df['REST'] = ab_survey['QID8'].apply(fill_for_rank)
    tmp_ab_df = tmp_ab_df.apply(process_awaken, axis=1)

    for column in ('ATTEMPT', 'BT', 'LO', 'WT', 'RT', 'SOL', 'SNZ', 'TST',
                   'WASON', 'WASOT', 'EA', 'EAT', 'SQ', 'REST'):
        ab_df[column] = tmp_ab_df[column]
    ab_df['NOTEWU'] = ab_survey['QID28'].fillna(BLANK_E)
    for column in ('TIB', 'SE1', 'SE2'):
        ab_df[column] = tmp_ab_df[column]

    # Process MULT
    ab_df['MULT'] = self.process_mult(ab_df)

    # ---- Merge --------------------------------------------------------------
    # DataFrame.append / DataFrame.sort are gone from current pandas;
    # pd.concat / sort_values are the equivalent calls.
    self.survey_new_csv = pd.concat([bb_df, ab_df], ignore_index=True)
    self.survey_new_csv = self.survey_new_csv.sort_values(
        ['User', 'Date'], ascending=[True, True])

    # Holds the combined version of every pair of duplicated records
    combined_duplicated_dfs = []
    # get all unique patient ids
    self.patient_ids = self.survey_new_csv.User.unique().tolist()

    # TODO: user 1504 was once dropped here as a temporary workaround for a
    # PDF generation error in R.
    for index, row in self.survey_new_csv.iterrows():
        key = '{}{}{}'.format(row['User'], row['Date'], row['MULT'])
        found_index = self.get_survey_data_from_dict(key)
        if found_index is None:
            self.set_survey_data_in_dict(key, index)
            continue

        # Same key seen before: pull both records out of the table and
        # keep their combination instead.
        duplicated_df = DataFrame(self.survey_new_csv,
                                  index=[found_index, index])
        self.survey_new_csv.drop([found_index, index], inplace=True)
        combined_duplicated_dfs.append(self.combine_rows(duplicated_df))

    if combined_duplicated_dfs:
        # Append the combined records at the end of the table.
        all_duplicated = pd.concat(combined_duplicated_dfs)
        self.survey_new_csv = pd.concat(
            [self.survey_new_csv, all_duplicated], ignore_index=True)
    else:
        # pd.concat([]) raises ValueError, so with no duplicates only the
        # index is renumbered, exactly as ignore_index does above.
        self.survey_new_csv = self.survey_new_csv.reset_index(drop=True)