def read_feature_label(filename):
    """Read a header-less CSV from ./data/ and return (features, labels).

    The last column holds the (string- or bool-encoded) defect label; all
    preceding columns are features.  Labels are binarised so the *minority*
    class is encoded as 1 and the majority class as 0: values in the "buggy"
    vocabulary count as defects, everything else (including unrecognised
    values) counts as clean — same convention as the original code.

    :param filename: CSV file name inside the ./data/ directory.
    :return: tuple (Xs, Ys) — 2-D numpy feature array and 1-D 0/1 label array.
    """
    s = pd.read_csv("./data/" + filename, header=None)
    n_cols = s.shape[1]
    labels = s.iloc[:, n_cols - 1]

    # Slice all feature columns at once: replaces the original O(n^2)
    # row-by-row np.vstack loop and the removed Series.as_matrix() call.
    Xs = s.iloc[:, :n_cols - 1].to_numpy()

    # Values that mark a row as defective.  The original also kept a
    # "not buggy" vocabulary, but its elif/else branches produced identical
    # results, so membership in this set is the only thing that matters.
    bug_values = {'false\r', 'buggy\n', 'buggy', 'False', 'yes', 'Y',
                  'yes\n', 'Y\n', 'false\n', 'FALSE\n', 'FALSE', False, '0'}

    is_bug = np.array([label in bug_values for label in labels])
    bugnum = int(is_bug.sum())

    # Minority class gets label 1.
    if bugnum > len(labels) - bugnum:
        Ys = np.where(is_bug, 0, 1)
    else:
        Ys = np.where(is_bug, 1, 0)
    return Xs, Ys
def calculate_lr_with_market_alpha(stock_data, market_data):
    """Compute each stock's alpha: the intercept of a linear regression of the
    stock's log returns against the market's log returns.

    :param stock_data: per-stock frame with 'stock_symbol' and 'log_return'
                       columns (validated by validate_stock_data).
    :param market_data: market frame with a 'log_return' column
                        (validated by validate_market_data).
    :return: dict mapping stock_symbol -> regression intercept (alpha).
    """
    stock_data = validate_stock_data(stock_data)
    market_data = validate_market_data(market_data)

    # The market series is the same regressor for every stock, so build the
    # design matrix once, outside the loop.  Series.as_matrix() was removed
    # in pandas 1.0 -> use to_numpy() instead.
    X = market_data['log_return'].to_numpy().reshape(-1, 1)

    alpha_factor = {}
    for symbol in stock_data['stock_symbol'].drop_duplicates():
        y = stock_data.loc[stock_data['stock_symbol'] == symbol,
                           'log_return'].to_numpy()
        lr = LinearRegression()
        lr.fit(X, y)
        alpha_factor[symbol] = lr.intercept_
    return alpha_factor
def aggregate_value(data: pd.Series, aggregation_function: str) -> float:
    """Aggregate a Pandas Series using the specified aggregation function.

    :param data: Series object to calculate the aggregated value on.
    :param aggregation_function: One of 'mean', 'max', 'min', 'median',
        'std', 'slope'.
    :return: Aggregated value ('slope' returns NaN when all values are NaN).
    :raises ValueError: For an unknown aggregation function.
    """
    if aggregation_function == 'mean':
        return data.mean(skipna=True)
    elif aggregation_function == 'max':
        return data.max(skipna=True)
    elif aggregation_function == 'min':
        return data.min(skipna=True)
    elif aggregation_function == 'median':
        return data.median(skipna=True)
    elif aggregation_function == 'std':
        return data.std(skipna=True)
    elif aggregation_function == 'slope':
        # Time points assume discrete steps with a fixed delta t.
        times = np.arange(len(data.index))
        # Series.as_matrix() was removed in pandas 1.0 -> to_numpy().
        values = data.to_numpy().astype(np.float32)
        mask = ~np.isnan(values)
        # All-NaN input: no regression is possible.
        if not mask.any():
            return np.nan
        slope, intercept, r_value, p_value, std_err = stats.linregress(
            times[mask], values[mask])
        return slope
    else:
        raise ValueError(
            f'Unknown aggregation function {aggregation_function}.')
def get_samples(index, y):
    """Extract a fixed-width window around each timestamp in `index` and turn
    it into a flat feature row with heartbeat summary statistics and label.

    NOTE(review): relies on module-level `data` (a time-indexed DataFrame with
    'mlii_mV' and 'v5_mV' columns) and on `get_delta` / `get_heartbeat`
    defined elsewhere in this file — confirm their contracts against callers.

    :param index: sequence of timestamps supporting +/- timedelta arithmetic.
    :param y: class label attached to every produced sample.
    :return: DataFrame with one flattened window + summary features per row.
    """
    delta_before = get_delta(microseconds=400000)
    delta_after = get_delta(microseconds=10000)
    starts = index - delta_before
    ends = index + delta_after

    # BUG FIX: zip() returns a lazy iterator in Python 3 and cannot be
    # sliced; materialise it before dropping the first interval (the
    # original `intervals[1:]` raised TypeError).
    intervals = list(zip(starts, ends))

    output = []
    for a, b in intervals[1:]:
        window = data.loc[a:b, ['mlii_mV', 'v5_mV']]
        mlii_heartbeat = get_heartbeat(window, 'mlii_mV')
        v5_heartbeat = get_heartbeat(window, 'v5_mV')
        # Flatten the raw signal into one row; as_matrix() was removed in
        # pandas 1.0 -> to_numpy().
        sample = Series(window.to_numpy().ravel())
        sample['mlii_heartbeat_max'] = mlii_heartbeat.max()
        sample['mlii_heartbeat_var'] = mlii_heartbeat.var()
        sample['v5_heartbeat_max'] = v5_heartbeat.max()
        sample['v5_heartbeat_var'] = v5_heartbeat.var()
        sample['y'] = y
        output.append(sample)
    return DataFrame(output)
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os

# BUG FIX: the original left this path as an unassigned string literal, so
# the pd.read_csv(label_file) call below raised NameError.
label_file = '/Users/WeiJoseph/mystuff/ihappy_lifeifei/handle/label2D.csv'
label2D_originFIle = pd.read_csv(label_file)
print(label2D_originFIle.info())
print(label2D_originFIle.head())

depression_level = label2D_originFIle['depression_level']
print(depression_level)
# Series.as_matrix() was removed in pandas 1.0 -> to_numpy().
depression_level_matrix = depression_level.to_numpy()
print(depression_level_matrix.shape)

# Inspect how many distinct depression levels exist.
depression_set = set(depression_level_matrix)
print(len(depression_set))

# Split into training and test sets: prefix every filename with its directory.
files = label2D_originFIle['filename'].to_numpy()
base_dir = '/Users/WeiJoseph/mystuff/ihappy_lifeifei/handle/'  # renamed: `dir` shadowed the builtin
for i in range(len(files)):
    files[i] = os.path.join(base_dir, files[i])
file_train = files[:8000]
from pandas import Series
import os
import skimage.transform
from PIL import Image
from time import time
from sklearn import metrics
from sklearn.metrics import classification_report

data_dir = '../input'
img_dir = os.path.join(data_dir, 'bee_imgs')
data_csv = os.path.join(data_dir, 'bee_data.csv')
data = pd.read_csv(data_csv)  # NOTE(review): assumes `pd` is imported earlier in the file

# Map each subspecies name to an integer class id.
# Series.as_matrix() was removed in pandas 1.0 -> to_numpy().
# BUG FIX: the hard-coded range(7) raised IndexError whenever fewer than 7
# subspecies were present (and silently dropped any beyond the seventh);
# enumerate() covers exactly the observed classes.
target = data['subspecies'].to_numpy()
target_list = list(set(target))
dic = {name: idx for idx, name in enumerate(target_list)}
data = data.replace({"subspecies": dic})


# Dataset wrapper around the bee images.
# NOTE(review): the class body appears truncated in this chunk; only the
# attributes visible here are reproduced.
class honeybee(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.img_dir = '../input/bee_imgs'
def ReadLabelFile(labelPath):
    """Read a label CSV and return a {image filename: level} mapping.

    :param labelPath: path to a CSV with 'image' and 'level' columns.
    :return: dict mapping each image name to its integer level.
    """
    # Series.as_matrix() was removed in pandas 1.0 -> to_numpy().
    csv = pd.read_csv(labelPath)
    level = csv['level'].to_numpy()
    files = csv['image'].to_numpy()
    return dict(zip(files, level))
# A list of lists — each inner list plays the role of one database row.
data = [[1, 'zhaokun'], [2, 'ycl'], [3, 'aimi']]

# Build a Series from the list.
ser = Series(data, index=['A', 'B', 'C'])
print(ser)
print(type(ser))

# A DataFrame is analogous to a database table:
# index labels the rows, columns labels the columns.
df = DataFrame(data, index=['A', 'B', 'C'], columns=['id', 'name'])
print(df)

# Convert the Series to an ndarray.
# as_matrix() was removed in pandas 1.0 -> use to_numpy().
n = ser.to_numpy()
print("n's type = ", type(n))
print(n)
print(n[0])

# Convert the DataFrame to an ndarray.
n = df.to_numpy()
# A column subset can be selected first: df[['id', 'name']].to_numpy()
print(n)

# .values is an older equivalent way to get the ndarray.
n = df.values
print(n)
# NOTE(review): Python 2 fragment (print statements). It depends on names
# defined earlier in the file: `linea` (the query string), `tfidf` (a
# document-by-term DataFrame), `tfquery` (the query term-count Series),
# `idf`, and `spatial` (presumably scipy.spatial).  The indentation below is
# reconstructed from a flattened source — TODO confirm the else-block nesting.
linea = linea.upper()
tokens = linea.split()
# Accumulate the query's raw term counts.
for word in tokens:
    if word in tfidf.columns:
        print word
    if word in tfquery:
        tfquery[word] = tfquery[word] + 1
    else :
        # Unseen term: register a zero column in the corpus matrix and
        # start its query count at 1.
        tfidf[word] = 0
        test = Series({word : 1})
        tfquery = tfquery.add(test, fill_value=0)
# Normalise counts to frequencies, then weight by inverse document frequency.
tfquery = tfquery/len(tokens)
tfquery = tfquery.multiply(idf,fill_value = 0)
print "TERMINO TFIDF"
# NOTE(review): as_matrix() was removed in pandas 1.0; this only runs on
# pandas < 1.0 (consistent with the Python 2 syntax).
vectorTFIDF = tfquery.as_matrix()
# Cosine similarity between the query vector and every document row.
distancias = []
for i , f in tfidf.iterrows():
    #distancias.append(dist(vectorTFIDF,f.as_matrix()))
    distancias.append(1 - spatial.distance.cosine(f.as_matrix(), vectorTFIDF))
#b = numpy.argsort(distancias)
# Highest similarities first.
distancias = sorted(distancias,reverse=True)
print distancias[0:100]
#tfquery.to_csv('query.csv')
# cache = []
# distancias = Series()
# for i, row in tfidf.iterrows():
#     d = cosine(row, tfquery)