Example #1
import numpy as np
import pandas as pd
from pandas import Series


def read_feature_label(filename):
    '''
    Given a file name, return the features and labels.
    :param filename: name of the CSV file under ./data/
    :return: feature array and label array
    '''
    s = pd.read_csv("./data/" + filename, header=None)
    lens = len(s.iloc[0, :])
    labels = s.iloc[:, lens - 1]

    # Build the feature matrix row by row.
    # Note: Series.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0; on modern pandas use .to_numpy() instead.
    feas = s.iloc[0, :lens - 1]
    arr = Series(feas).as_matrix()

    for i in range(1, len(s)):
        feas = s.iloc[i, :lens - 1]
        temp = Series(feas).as_matrix()
        arr = np.vstack((arr, temp))
    Xs = arr

    # Raw label spellings that mark an instance as buggy / clean.
    buglist = ['false\r', 'buggy\n', 'buggy', 'False', 'yes', 'Y', 'yes\n',
               'Y\n', 'false\n', 'FALSE\n', 'FALSE', False, '0']
    notbuglist = ['true\r', 'clean\n', 'clean', 'True', 'no\n', 'N', 'N\n',
                  'true\n', 'TRUE\n', 'TRUE', True]
    bugnum = 0
    notbugnum = 0
    for bug in labels:
        if bug in buglist:
            bugnum += 1
        else:
            # Unknown spellings are counted as not buggy.
            notbugnum += 1

    # Encode the minority class as 1 so that 1 always marks the rarer class.
    arrs = []
    if bugnum > notbugnum:
        for bug in labels:
            arrs.append(0 if bug in buglist else 1)
    else:
        for bug in labels:
            arrs.append(1 if bug in buglist else 0)
    Ys = np.array(arrs)

    return Xs, Ys
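Since as_matrix() was deprecated in pandas 0.23 and removed in 1.0, the row-by-row vstack loop above collapses to a single slice on modern pandas. A minimal sketch of the feature-loading half (read_feature_label_modern is an illustrative name; the label encoding above is unchanged):

import pandas as pd

def read_feature_label_modern(filename):
    # Same CSV layout as above: features in all columns but the last,
    # labels in the last column.
    s = pd.read_csv("./data/" + filename, header=None)
    Xs = s.iloc[:, :-1].to_numpy()  # replaces the Series.as_matrix()/vstack loop
    labels = s.iloc[:, -1]
    return Xs, labels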
Example #2
import pandas as pd
from pandas import Series
from sklearn.linear_model import LinearRegression


def calculate_lr_with_market_alpha(stock_data, market_data):
    stock_data = validate_stock_data(stock_data)
    market_data = validate_market_data(market_data)
    alpha_factor = {}
    # Market returns are the same regressor for every stock, so convert once.
    X = Series.as_matrix(market_data['log_return'])
    for symbol in stock_data['stock_symbol'].drop_duplicates():
        current_df = stock_data[stock_data['stock_symbol'] == symbol]
        y = Series.as_matrix(current_df['log_return'])
        # Regress stock returns on market returns; the intercept is the alpha.
        lr = LinearRegression()
        lr.fit(X.reshape(-1, 1), y)
        alpha_factor[symbol] = lr.intercept_
    return alpha_factor
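A quick usage sketch with synthetic data, assuming pandas < 1.0 (where Series.as_matrix still exists); the validate_* helpers are external in the source and are stubbed as identity functions here:

import numpy as np
import pandas as pd

def validate_stock_data(df):   # identity stub for this sketch
    return df

def validate_market_data(df):  # identity stub for this sketch
    return df

rng = np.random.default_rng(0)
market = pd.DataFrame({'log_return': rng.normal(0, 0.01, 100)})
stock = pd.DataFrame({
    'stock_symbol': ['AAA'] * 100,
    'log_return': 0.001 + 1.2 * market['log_return'] + rng.normal(0, 0.002, 100),
})
print(calculate_lr_with_market_alpha(stock, market))  # alpha close to 0.001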
Example #3
    def aggregate_value(data: pd.Series, aggregation_function: str) -> float:
        """
        Aggregate the values of a pandas Series using the specified aggregation function.

        :param data: Series object to calculate the aggregated value on.
        :param aggregation_function: Aggregation function to use. Supported values are 'mean', 'max', 'min', 'median',
            'std', 'slope'.
        :return: Aggregated value.
        :raises ValueError: In case of an unknown aggregation function.
        """

        # Compute the value depending on the aggregation function and return the result
        if aggregation_function == 'mean':
            return data.mean(skipna=True)
        elif aggregation_function == 'max':
            return data.max(skipna=True)
        elif aggregation_function == 'min':
            return data.min(skipna=True)
        elif aggregation_function == 'median':
            return data.median(skipna=True)
        elif aggregation_function == 'std':
            return data.std(skipna=True)
        elif aggregation_function == 'slope':
            # Create time points, assuming discrete time steps with a fixed delta t
            times = np.arange(len(data.index))
            data = data.as_matrix().astype(np.float32)
            # Mask out NaN values
            mask = ~np.isnan(data)
            # If the data contains nothing but NaNs, return NaN
            if len(data[mask]) == 0:
                return np.nan
            # Otherwise fit a line through the valid points and return its slope
            else:
                slope, intercept, r_value, p_value, std_err = stats.linregress(
                    times[mask], data[mask])
                return slope
        else:
            raise ValueError(
                f'Unknown aggregation function {aggregation_function}.')
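For reference, a self-contained sketch of what the 'mean' and 'slope' branches compute, using .to_numpy() in place of the removed as_matrix():

import numpy as np
import pandas as pd
from scipy import stats

s = pd.Series([1.0, np.nan, 3.0, 4.0])
# 'mean' branch equivalent:
print(s.mean(skipna=True))  # 2.666...
# 'slope' branch equivalent on modern pandas:
vals = s.to_numpy().astype(np.float32)
mask = ~np.isnan(vals)
print(stats.linregress(np.arange(len(s))[mask], vals[mask]).slope)  # 1.0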
Example #4
    def get_samples(index, y):
        delta_a = get_delta(microseconds=400000)
        delta_b = get_delta(microseconds=10000)
        a = index - delta_a
        b = index + delta_b

        # zip() returns an iterator in Python 3, so materialize it before slicing
        intervals = list(zip(a, b))
        output = []

        # Skip the first interval, as in the original
        for start, end in intervals[1:]:
            sample = data.loc[start:end, ['mlii_mV', 'v5_mV']]
            mlii_heartbeat = get_heartbeat(sample, 'mlii_mV')
            v5_heartbeat = get_heartbeat(sample, 'v5_mV')
            # Flatten the two-channel window into one feature row
            sample = Series(sample.as_matrix().ravel())
            sample['mlii_heartbeat_max'] = mlii_heartbeat.max()
            sample['mlii_heartbeat_var'] = mlii_heartbeat.var()
            sample['v5_heartbeat_max'] = v5_heartbeat.max()
            sample['v5_heartbeat_var'] = v5_heartbeat.var()
            sample['y'] = y

            output.append(sample)

        output = DataFrame(output)
        return output
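get_delta and get_heartbeat are external in the source; the interval arithmetic alone can be sketched on a plain DatetimeIndex with pd.Timedelta (all names here are illustrative):

import pandas as pd

# Hypothetical stand-in for the source's get_delta helper.
def get_delta(microseconds):
    return pd.Timedelta(microseconds=microseconds)

idx = pd.to_datetime(['2020-01-01 00:00:01', '2020-01-01 00:00:02'])
starts = idx - get_delta(microseconds=400000)
ends = idx + get_delta(microseconds=10000)
print(list(zip(starts, ends)))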
Example #5
#
# features_variable = Variable(torch.from_numpy(matrix1))
# print(features_variable)
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import os

label_file = '/Users/WeiJoseph/mystuff/ihappy_lifeifei/handle/label2D.csv'
label2D_originFIle = pd.read_csv(label_file)
print(label2D_originFIle.info())
print(label2D_originFIle.head())

depression_level = label2D_originFIle['depression_level']
print(depression_level)
depression_level_matrix = Series.as_matrix(depression_level)
print(depression_level_matrix.shape)

# Inspect the distinct depression categories
depression_set = set(depression_level_matrix)
print(len(depression_set))

# Split into training and test sets
files = Series.as_matrix(label2D_originFIle['filename'])
base_dir = '/Users/WeiJoseph/mystuff/ihappy_lifeifei/handle/'
for i in range(len(files)):
    files[i] = os.path.join(base_dir, files[i])
file_train = files[:8000]
Example #6
import os
from time import time

import pandas as pd
import skimage.transform
from pandas import Series
from PIL import Image
from sklearn import metrics
from sklearn.metrics import classification_report
from torch.utils.data import Dataset

data_dir = '../input'
img_dir = os.path.join(data_dir, 'bee_imgs')
data_csv = os.path.join(data_dir, 'bee_data.csv')
data = pd.read_csv(data_csv)

#set subspecies
target = data['subspecies']
target = Series.as_matrix(target)
target_list = set(target)
target_list = list(target_list)

# Map each subspecies name to an integer label (avoid hard-coding the count)
dic = {name: i for i, name in enumerate(target_list)}
data = data.replace({"subspecies": dic})

#define dataset


class honeybee(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.img_dir = '../input/bee_imgs'
        self.transform = transform
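    # The class body is cut off in the source. A minimal, hypothetical
    # completion sketch: assumes 'filename' and 'subspecies' columns in data
    # (the column names are illustrative, not confirmed by the source).
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img = Image.open(os.path.join(self.img_dir, row['filename']))
        if self.transform is not None:
            img = self.transform(img)
        return img, row['subspecies']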
Example #7
def ReadLabelFile(labelPath):
    csv = pd.read_csv(labelPath)
    level = Series.as_matrix(csv['level'])
    files = Series.as_matrix(csv['image'])
    # Map each image file name to its label
    return dict(zip(files, level))
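On pandas 1.0+ the same mapping can be built without as_matrix(); a minimal sketch (read_label_file is an illustrative name):

import pandas as pd

def read_label_file(label_path):
    csv = pd.read_csv(label_path)
    # .to_numpy() is the modern replacement for Series.as_matrix()
    return dict(zip(csv['image'].to_numpy(), csv['level'].to_numpy()))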
Example #8
from pandas import Series, DataFrame

# Define a list of lists; each inner list is like one row in a database table
data = [[1, 'zhaokun'], [2, 'ycl'], [3, 'aimi']]

# Convert the list to a Series (each element of the Series is an inner list)
ser = Series(data, index=['A', 'B', 'C'])

print(ser)
print(type(ser))

# A DataFrame is like a database table
# index: row labels, columns: column labels
df = DataFrame(data, index=['A', 'B', 'C'], columns=['id', 'name'])

print(df)

# Convert the Series to an ndarray (not a list); as_matrix() is removed
# in pandas 1.0+, where .to_numpy() does the same job
n = ser.as_matrix()
print("n's type = ", type(n))
print(n)
print(n[0])

# Convert the DataFrame to an ndarray
n = df.as_matrix()
# A subset of columns can be selected by name:
# n = df.as_matrix(columns=['id', 'name'])
print(n)

# .values also converts a DataFrame to an ndarray
n = df.values
print(n)
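For reference, the modern equivalents of the conversions above in one minimal sketch (pandas 0.24+):

import pandas as pd

df = pd.DataFrame([[1, 'zhaokun'], [2, 'ycl']], columns=['id', 'name'])
print(df.to_numpy())          # preferred since pandas 0.24
print(df['id'].to_numpy())    # Series -> ndarray
print(df[['id']].to_numpy())  # column subset, replacing as_matrix(columns=...)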
Example #9
from pandas import Series
from scipy import spatial

# Score a query line (linea, from the surrounding script) against the tf-idf matrix
linea = linea.upper()
tokens = linea.split()
for word in tokens:
    if word in tfidf.columns:
        print(word)
        if word in tfquery:
            tfquery[word] = tfquery[word] + 1
        else:
            tfidf[word] = 0
            test = Series({word: 1})
            tfquery = tfquery.add(test, fill_value=0)
tfquery = tfquery / len(tokens)
tfquery = tfquery.multiply(idf, fill_value=0)
print("TFIDF DONE")

vectorTFIDF = tfquery.as_matrix()
distancias = []
for i, f in tfidf.iterrows():
    # Cosine similarity between each document row and the query vector
    # distancias.append(dist(vectorTFIDF, f.as_matrix()))
    distancias.append(1 - spatial.distance.cosine(f.as_matrix(), vectorTFIDF))
# b = numpy.argsort(distancias)
distancias = sorted(distancias, reverse=True)
print(distancias[0:100])


# tfquery.to_csv('query.csv')
# cache = []
# distancias = Series()
# for i, row in tfidf.iterrows():
#     d = cosine(row, tfquery)
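A self-contained sketch of the same cosine-ranking step on modern pandas, with toy tf-idf values (all names and numbers here are illustrative):

import pandas as pd
from scipy import spatial

# Toy tf-idf matrix: rows are documents, columns are terms
tfidf = pd.DataFrame([[0.1, 0.0, 0.5],
                      [0.4, 0.2, 0.0]], columns=['CAT', 'DOG', 'FISH'])
query = pd.Series([0.2, 0.0, 0.4], index=tfidf.columns)

scores = [1 - spatial.distance.cosine(row.to_numpy(), query.to_numpy())
          for _, row in tfidf.iterrows()]
print(sorted(scores, reverse=True))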