Code example #1
#importing
import numpy as np
import pandas as pd
from IPython.display import display

#dataset
data = pd.read_csv("data/winequality-red.csv")

#display
display(data.head(n=5))
Code example #2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable


#importing the dataset
movies=pd.read_csv('ml-1m/movies.dat',sep='::',header=None,engine='python',encoding='latin-1')
users=pd.read_csv('ml-1m/users.dat',sep='::',header=None,engine='python',encoding='latin-1')
rating=pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,engine='python',encoding='latin-1')

#Preparing the training set and the test set
training_set=pd.read_csv('ml-100k/u1.base',delimiter='\t')
training_set=np.array(training_set,dtype='int')
test_set=pd.read_csv('ml-100k/u1.test',delimiter='\t')
test_set=np.array(test_set,dtype='int')


#Getting the number of users and movies
nb_users=int(max(max(training_set[:,0]),max(test_set[:,0])))
nb_movies=int(max(max(training_set[:,1]),max(test_set[:,1])))


#Converting the data into an array with users in rows and movies in columns
def convert(data):
	new_data=[]
	for id_users in range(1,nb_users+1):
		id_movies = data[:,1][data[:,0]==id_users]
		id_rating= data[:,2][data[:,0]==id_users]
		ratings=np.zeros(nb_movies)
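		# A minimal sketch of the rest of the conversion (assumed, following the
		# usual MovieLens preprocessing): place each rating at the index of the
		# rated movie and collect one list of ratings per user.
		ratings[id_movies-1]=id_rating
		new_data.append(list(ratings))
	return new_data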
Code example #3
File: csv stuff.py  Project: seomh81/tkek
# Reading a CSV file (f is assumed to have been opened above, with csv imported)
rdr = csv.reader(f)
for line in rdr:
    print(line)
f.close()

# Writing a CSV file
import csv

f = open('output.csv', 'w', encoding='utf-8')
wr = csv.writer(f)
wr.writerow([1, "Alice", True])
wr.writerow([2, "Bob", False])
f.close()

# Intermediate: a fancier-looking way to write it
'''
with open('./train.csv') as csvfile:
    rdr = csv.DictReader(csvfile)
    for i in rdr:
      print(i)
'''

# Advanced: reading with pandas
import pandas as pd

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# Take a look at the train data
train.describe(include="all")
Code example #4
import numpy as np
import pandas as pd
titanic_df = pd.read_csv('titanictrain.csv')
titanic_df['Survived'] = titanic_df['Survived'].map({0: 'Died', 1: 'Survived'})
Code example #5
File: graphs.py  Project: ksteve28/Fit-or-Faux
        pc.set_edgecolor('black')
        plt.tight_layout()
    for pc in violin_plot2['bodies']:
        pc.set_facecolor('teal')
        pc.set_edgecolor('black')
        plt.tight_layout()
    for pc in violin_plot3['bodies']:
        pc.set_facecolor('teal')
        pc.set_edgecolor('black')
        plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    
    obesity = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Obesity_in_Adults_-_CDPHE_Community_Level_Estimates_(Census_Tracts) .csv')
    overweight = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Overweight_and_Obese_Adults_-_CDPHE_Community_Level_Estimates_(Census_Tracts).csv')
    diabetes = pd.read_csv('/Users/Kelly/Desktop/Fit-or-Faux/Datasets/Diabetes_in_Adults_-_CDPHE_Community_Level_Estimates__Census_Tracts_.csv')

    fn.sort_census_estimate_highest(obesity)

    plot_sort_census_estimate_lowest(diabetes, 'Diabetes')
    plot_sort_census_estimate_lowest(overweight, 'Overweight')
    print(plot_sort_census_estimate_lowest(obesity, 'Obesity'))

    plot_sort_census_estimate_highest(obesity, 'Obesity')
    plot_sort_census_estimate_highest(overweight, 'Overweight')
    plot_sort_census_estimate_highest(diabetes, 'Diabetes')
    plot_census_estimate(obesity, 'Obesity')
    plot_census_estimate(overweight, 'Overweight')
    plot_census_estimate(diabetes, 'Diabetes')
Code example #6
import pandas as pd

def read_matrixfile(name):
    # numpy has no read_csv; use pandas to read the file
    m = pd.read_csv(name)
    return m
Code example #7
File: sketchy.py  Project: ducxion/tsketch
            return s
        else:
            msg = '%s is not a subset of "rgbk", please provide a string permutation of these four characters' % s
            raise argparse.ArgumentTypeError(msg)

    parser.add_argument(
        '-c', '--colors',
        dest='color',
        type=color_subset,
        default="rgbk",
        help='Specify the colors to process as a string of characters'
    )
    parser.add_argument(
        '-b', '--batch',
        dest='batch',
        type=str,
        default=None,
        help='Pass option to read an external csv which will batch process the entries'
    )
    sys.argv = ["sketchy.py", "Code/sketchseries/Validation/test1.jpg"]
    args = parser.parse_args()
    if args.batch is not None:
        # loop through configurations stored in a csv file
        # For guidance on the format of the csv, see
        # csv example
        batches = pd.read_csv(args.batch, sep=',')
        # Iterate over the configuration rows in the csv
        for _, batch in batches.iterrows():
            sketchy(batch)
    else:
        sketchy(args)
Code example #8
#Polynomial Regression

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

#Fitting Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)

#Fitting Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures()
X_poly = poly_reg.fit_transform(X)
lin_reg2 = LinearRegression()
lin_reg2.fit(X_poly, y)

#Visualising the Linear Regression Model
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='blue')
plt.title('Truth or Bluff')
plt.xlabel('Position')
plt.ylabel('Salary')
plt.show()

#Visualising the Polynomial Regression Model
Code example #9
which has already been explicitly provided for us, so we don't need to do any extraction.
"""

import numpy as np
import pandas as pd

import tensorflow as tf
from keras import Sequential
from keras.layers import Input, Dense
from keras.models import Model

from google.colab import drive
drive.mount('/content/drive')

filename = "drive/Team Drives/Deep Learning Project/events.csv"
df = pd.read_csv(filename)
columns = list(df.columns.values)

df.head()

attempted_shot = []
corner_kick = []
isFoul = []
isYellowCard1 = []
isYellowCard2 = []
straight_red_card = []
substitution = []
free_kick_awarded = []
off_sides = []
is_hand_ball = []
penalty_awarded = []
Code example #10
import cv2
import numpy as np
import pandas as pd

obj_list = pd.read_csv('code/csv_labels.csv')

print(obj_list[:3])
Code example #11
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Read the training data
train = pd.read_csv('train.csv')

#pull data into target (y) and predictors (X)
train_y = train.SalePrice

# Features we are using to predict the Sale Price of a house
predictors_cols = ['LotArea', 'OverallQual', 'YearBuilt', 'TotRmsAbvGrd']

# Create training predictors data
train_X = train[predictors_cols]

my_model = RandomForestRegressor()

# Fit the model: Capture patterns from provided data. This is the heart of modeling.
my_model.fit(train_X, train_y)

# Read test data
test = pd.read_csv('test.csv')

# Pull the same features/columns as training data from the test data
test_X = test[predictors_cols]

# Use the model to make predictions
predicted_prices = my_model.predict(test_X)

print(predicted_prices)
Code example #12
# Note you need to have these packages in your computer. To do this easily just download anaconda.
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Table is loaded using pandas but is essentially the same as the Table package that was designed. Note that the file was converted from xls to csv using an online converter. If you want, you can write a Python converter, but this will be slow. Note that the path will be different on Windows and depends on where your csv file is; I chose to put it in the Downloads folder. Also note that the filename has changed to 07423-0001-Data1.csv due to the conversion.
Table = pd.read_csv('~/Downloads/07423-0001-Data1.csv')

# This line gets the Table of slave numbers and their prices.
prices = Table['V14']

# These two lines get the Table of slave numbers and their prices, only including those that we deem to be relevant (i.e. the column V40 equal to 1 gives us the slaves that were sold with a guarantee, and 2 gives us those that were not).
guarantee = Table['V14'][Table.V40 == 1]
notG = Table['V14'][Table.V40 == 2]


# Assuming that the student has correctly implemented the KS distance function in lab 8, which we simulate using the python package scipy. Let us set our significance level for the hypothesis testing to be 0.05, meaning we reject the null hypothesis (that there is no difference between the prices) whenever the p-value falls below 0.05.
stats.ks_2samp(guarantee, notG)

# From running the line above, we get a p-value of about 1.55*10^-17, which leads us to the conclusion to reject the null hypothesis.
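# A minimal sketch of the decision rule at the significance level set above
# (alpha = 0.05); ks_2samp returns the KS statistic and the p-value.
alpha = 0.05
statistic, p_value = stats.ks_2samp(guarantee, notG)
if p_value < alpha:
    print("Reject the null hypothesis: the price distributions differ.")
else:
    print("Fail to reject the null hypothesis.")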

# The other variables can be taken into account by creating more queries to the table, or by writing more complex functions that loop over the data with boolean conditions for the different statistics (i.e. the colour of the person's skin).

# The price difference between males and females is also significant. From a hypothesis test using the KS distance, we get a p-value of 2.9943622527702614e-19.
male = Table['V14'][Table.V15 == 1]
female = Table['V14'][Table.V15 == 2]
stats.ks_2samp(male, female)

# Further analysis can be done by looking at the histograms of the two data, using the matplotlib.pyplot
plt.figure()
Code example #13
File: plt.py  Project: RuiWang3/firstproject
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("Data/py.csv")
lotArea = df[['lotArea']]
print(df['lotArea'].describe())
Code example #14
# coding: utf-8

# In[90]:


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# In[91]:


#Reading data
dataset = pd.read_csv('titanic_data.csv')
dataset.head(5)


# In[92]:


#Removing unnecessary columns
dataset.drop(['PassengerId','Name','Ticket','Fare','Embarked','Cabin'],axis = 'columns',inplace = True)
dataset.head(5)


# In[93]:


#Fill na values with mean
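# A minimal sketch (assumed, since the snippet is cut off here): fill the
# missing Age values with the column mean.
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
dataset.head(5)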
Code example #15
#  Generate trades for analysis

number_trades = 1000  #  number of days
mean_profit = 0.001  # 0.1% per trade
standard_dev = 0.003  # 0.3% per trade

#trades = make_trade_list(1000)
#print (trades)
##########
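#  A minimal sketch of make_trade_list (hypothetical -- the original helper is
#  not shown here): draw per-trade returns from a normal distribution with the
#  mean and standard deviation set above.
import numpy as np

def make_trade_list(n, mean=mean_profit, std=standard_dev):
    return np.random.normal(mean, std, n)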

########## Read a text file containing the list of trades
#
#  Read trades for analysis

filename = 'trades.csv'
trades = pd.read_csv(filename)  # requires: import pandas as pd
#print (trades)
##########

#  Set the parameters describing the personal risk tolerance
#  of the trader.

drawdown_tolerance = 0.10

desired_accuracy = 0.003

initial_capital = 100000.0

for rep in range(5):

    #  Fraction is initially set to use all available funds
Code example #16
Original file is located at
    https://colab.research.google.com/drive/1kH3mMxTT6-t5GYHiiYPLbgKsGScc-_Ts
"""

!pip install tqdm
import sys, os
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive')

# Load text from first half of each game
df = pd.read_csv('drive/Team Drives/Deep Learning Project/ken_cnn/text_dataset_60min.csv', sep='\t')
df.head(3)

df.shape

# Combine text comments into one long text for each game
texts = {}
duplicate_ids = []
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
  game_id = row['id_odsp']
  text = str(row['text'])
  
  if game_id not in texts:
    texts[game_id] = []
  else:
    duplicate_ids.append(idx)
Code example #17
Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1PkR06z-NeA0mRWvv0aiqRiOIOhuLs6G7
"""

import numpy as np
import pandas as pd

from google.colab import drive

drive.mount('/content/drive')

# Read dataset
events_fp = "drive/Team Drives/Deep Learning Project/events.csv"
events_df = pd.read_csv(events_fp, index_col="id_odsp")
events_df = events_df.iloc[:, 1:]

# Select only text from the first half of games
events_df = events_df[events_df['time'] <= 60]
events_df = events_df[['text']]
events_df.head(5)

# Read dataset
ginf_fp = "drive/Team Drives/Deep Learning Project/ginf.csv"
ginf_df = pd.read_csv(ginf_fp, index_col="id_odsp")
ginf_df = ginf_df.iloc[:, 1:]

# Select only games that have detail
ginf_df = ginf_df[ginf_df['adv_stats'] == True]
ginf_df = ginf_df[['ht', 'at', 'fthg',
Code example #18
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 999)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.expand_frame_repr', False)

# Carry out feature engineering work on the titanic, hitters and diabetes datasets

# Feature Engineering - Data Pre-Processing
# odds and ends


# ------------------------------------------------Titanic-----------------------------------------------------------#


df_titanic_ = pd.read_csv(r"\titanic.csv")
df_titanic = df_titanic_.copy()

# Feature engineering of new variables

# CABIN BOOL for rows whose Cabin is NA
df_titanic["NEW_CABIN_BOOL"] = df_titanic["Cabin"].isnull().astype("int")
# Name Letter Count
df_titanic["NEW_NAME_COUNT"] = df_titanic["Name"].str.len()
# Name Word Count
df_titanic["NEW_NAME_WORD_COUNT"] = df_titanic["Name"].apply(lambda x: len(str(x).split(" ")))
# isDoctor ?
df_titanic["NEW_NAME_DR"] = df_titanic["Name"].apply(lambda x: len([x for x in x.split() if x.startswith("Dr")]))
# Name Titles
df_titanic["NEW_TITLE"] = df_titanic.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Family Size
Code example #19
        y_current = (teta1*x_train)+teta0 
        cost = sum(np.array([data**2 for data in (y_train-y_current)])*np.array(weights))/N
        teta1_gradient = -(2.0/N)*sum(x_train*(y_train - y_current)*weights)
        teta0_gradient = -(2.0/N)*sum((y_train - y_current)*weights)
        teta1 = teta1 -(learningRate *teta1_gradient)
        teta0 = teta0 -(learningRate *teta0_gradient)
        
    predict_y = []
    for entry in x_test:
            predict = teta1*entry+teta0
            predict_y.extend(predict)    
     
    return np.array(predict_y)[:,np.newaxis]

    
micMatrix = np.matrix(pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledMicData1_22.csv"))
RNAMatrix = np.matrix(pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledRNAData1_22.csv"))
geneNumber= micMatrix.shape[1]

trainMic = micMatrix[0:95,]
trainRNA = RNAMatrix[0:95,]

testMic = micMatrix[95:104,]
testRNA = RNAMatrix[95:104,]


Nx = testRNA.shape[0]
Ny = testRNA.shape[1]
mylist= np.zeros((Nx,Ny)).tolist()
predictedMatrix = np.matrix(mylist)
predictedMatrixrid = np.matrix(mylist)
Code example #20
File: ballmodel.py  Project: des5ve/pyTorch
import pandas as pd
from pandas import DataFrame
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

teams = [
    "Virtus Bologna", "Banco di Sardegna Sassari", "Gemani Basket Brescia",
    "EA7 Emporio Armani Milano", "Enel Brindisi", "Vanoli Cremona",
    "Umana Venezia", "Dolomiti Energia Trento", "Fortituto Kontatto Bologna",
    "OpenJobMetis Varese", "Red October Cantu", "Grissin Bon Reggio Emilia",
    "Universo De'Longhi Treviso", "Virtus Roma", "Flexx Pistoia",
    "Pallacanestro Trieste", "Consultinvest VL Pesaro"
]

#Show Game Results
frame = pd.read_csv("serieAUpdated.csv")
columns = [
    'Team', 'PFH', 'PAH', 'PFA', 'PAA', 'eFGFH', 'eFGAH', 'eFGFA', 'eFGAA',
    'TOFH', 'TOAH', 'TOFA', 'TOAA', 'ORFH', 'ORAH', 'ORFA', 'ORAA', 'FTRFH',
    'FTRAH', 'FTRFA', 'FTRAA'
]
data = []
print(frame)

#Loop Through Game Results to Create Team Averages
for i in teams:
    row = []
    homeFrame = frame[frame.homeTeam.str.contains(i)]
    awayFrame = frame[frame.awayTeam.str.contains(i)]
    PFH = homeFrame["homeScore"].mean()
    PAH = homeFrame["awayScore"].mean()
Code example #21
from datetime import datetime
import numpy as np
import pandas as pd

# my_date is assumed to have been defined earlier, e.g. my_date = datetime(2016, 1, 1)
my_date.day

first_two = [datetime(2016, 1, 1), datetime(2016, 1, 2)]

# Datetime Index
dt_ind = pd.DatetimeIndex(first_two)

data = np.random.randn(2, 2)
cols = ['a', 'b']

df = pd.DataFrame(data, dt_ind, cols)
df.index.argmax()  # latest index (use min for first)
df.index.max()  # latest date

# Time Resampling
df = pd.read_csv(
    'data/walmart_stock.csv')  # parse_dates=True, index_col='Date'
df.info()

df['Date'] = pd.to_datetime(df['Date'], format='')
df['Date'] = df['Date'].apply(pd.to_datetime)

df.set_index('Date', inplace=True)
df.head()

df.resample(rule='A').mean()  # year-end frequency
# Q - quarterly
# BQ - business quarterly
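# A small usage sketch of the other rules listed above (same df assumed):
# quarterly and business-quarterly averages.
df.resample(rule='Q').mean()
df.resample(rule='BQ').mean()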


def first_day(entry):
    return entry[0]
Code example #22
File: keepparsing.py  Project: Hsieh-Cheng-Han/Work
cols = []
for term in [
        s for s in data.columns.values if ("理賠金額" in s) and not ('年理賠金額' in s)
]:
    cols.append((term, term.replace('理賠金額', '單位理賠金額')))

# Convert claim amounts ('理賠金額') to per-unit claim amounts ('單位理賠金額')
for col in cols:
    data[col[0]] = data[col[0]] / data['B5總保額']

# Feed into the model
######################################################################################
import numpy as np
import pandas as pd

data2016 = pd.read_csv('data2016(notfill).csv')
data2017 = pd.read_csv('data2017(notfill).csv')
data2018 = pd.read_csv('data2018(notfill).csv')
data2018 = data2018.drop([
    'Unnamed: 0', 'RANK', 'MIN_of_RANK', 'MIN_of_MIN_of_RANK', '婚姻居住狀況',
    '婚姻狀況_5', '役別', '被保人聾啞', '被保人肢體殘缺畸形_部位及程度', '被保人肥胖瘦弱'
],
                         axis=1,
                         errors='ignore')
#data2017.to_csv('data2017(notfill).csv',index=False)
ID_2018 = data2018['被保人ID']
data2016 = data2016.drop([
    '理賠金額_住院2016', '理賠金額_住院2017', '理賠金額_住院2018', '理賠金額_手術2017', '理賠金額_手術2018',
    '被保人ID', '近三年理賠金額_住院_13_15', '近五年理賠金額_住院_11_15', '近14年理賠金額_住院_02_15',
    '近三年理賠金額_手術_13_15', '近五年理賠金額_手術_11_15', '近14年理賠金額_手術_02_15'
],
Code example #23
"""
Created on Sun Dec 13 15:26:12 2020

@author: deger
"""

## Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the data

# We call pandas' read_csv method to read the csv files.
# Since the first parameter of read_csv() is the file path, we point pandas to the path of the file we want to load.
veriler = pd.read_csv('./veriler.csv')

#------------------Column names-------------------------------------------------
"""
When we read the data this way, unless we assign a different header value,
pandas always treats the topmost row as the column names.
"""
#-------------------Parameters of the pandas.read_csv() method:-------------------------------------
"""
pandas.read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer', 
names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, 
dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, 
skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, 
skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, 
dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', 
lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None,
Code example #24
    }
    predict_y = []
    for entry in test_x:
        bandwidth = adaptiveBandwidth(train_x, entry, 10)
        nks = [np.sum((j - entry)**2) / bandwidth for j in train_x]
        ks = [kernels[kernelType](i) for i in nks]
        dividend = sum([ks[i] * train_y[i] for i in range(len(ks))])
        divisor = sum(ks)
        predict = dividend / divisor
        predict_y.extend(predict)
    return np.array(predict_y)[:, np.newaxis]


#inputs are from files shuffledMicData and shuffledRNAData which are the outputs of ExtractingTrainTestCaseAndControlSamples.R
micMatrix = np.matrix(
    pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledMicData1_22.csv"))
RNAMatrix = np.matrix(
    pd.read_csv("/Users/Tina/Desktop/Thesis Code/shuffledRNAData1_22.csv"))
geneNumber = micMatrix.shape[1]

trainMic = micMatrix[0:95, ]
trainRNA = RNAMatrix[0:95, ]

testMic = micMatrix[95:104, ]
testRNA = RNAMatrix[95:104, ]

Nx = testRNA.shape[0]
Ny = testRNA.shape[1]
mylist = np.zeros((Nx, Ny)).tolist()
predictedMatrix = np.matrix(mylist)
predictedMatrixrid = np.matrix(mylist)
Code example #25
File: data_clean.py  Project: dxcv/LSTM_predict_trade
"""
Created on Thu May 11 10:35:05 2017

@author: wanjun
Data cleaning
"""
#%%
import numpy as np
import pandas as pd
from sklearn import preprocessing
import talib
from matplotlib import pyplot as plt
#%%
file = '/Users/wanjun/Desktop/LSTM模型/data/data_train_latest.csv'
file_1 = '/Users/wanjun/Desktop/LSTM模型/data/MINUTE_zhuli_IF_20170509.csv'
data = pd.read_csv(file)
data_1 = pd.read_csv(file_1, index_col=1, parse_dates=True)
data.index = data_1.index
data['datetime'] = data_1.index
#%%
data = data.sort_index()
data = data['2016-12-19':]
index = data.drop_duplicates('datetime').resample('D').mean().dropna().index
data_clean = pd.DataFrame(columns=['open', 'high', 'low', 'volume', 'close'])
lst_len = []
lst = []
for i in index:
    i = str(i)[:10]
    temp = data[i]
    start = i + ' 09:30:00'
    end = i + ' 15:01:00'