def scrape():
    browser = init_browser()
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    bs_version = BeautifulSoup(browser.html, 'html.parser')
    content_titles = bs_version.find_all(class_="content_title")
    paragraphs = bs_version.find_all(class_="article_teaser_body")
    featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    mars_image = browser.visit(featured_image_url)

    news_url = "https://twitter.com/marswxreport?lang=en"
    mars_news = browser.visit(news_url)
    bs_news = BeautifulSoup(browser.html, 'html.parser')

    facts_url = "https://space-facts.com/mars/"
    facts_tables = pd.read_html(facts_url)

    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    hemisphere_info = browser.visit(hemisphere_url)
    bs_hemisphere = BeautifulSoup(browser.html, 'html.parser')

    # Collect the link for each hemisphere item on the results page
    hemisphere_items = bs_hemisphere.find_all("div", class_="item")
    item_list = []
    for item in hemisphere_items:
        anchor_tag = item.find("a")
        href_attribute = anchor_tag["href"]
        item_list.append({"anchor_tag": hemisphere_url + href_attribute})

    return item_list
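
# The scrape() function above assumes an init_browser() helper that is not shown.
# A minimal sketch using splinter (the "chrome" driver and a chromedriver on PATH
# are assumptions, not part of the original snippet):
from splinter import Browser

def init_browser():
    # assumes chromedriver is available on PATH
    return Browser("chrome", headless=True)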
Example #2
from pandas import read_csv as read


def csv_reader():
    """
    Read a csv file.
    """
    data = read(u'Данные для задания 3 и 4.csv', sep=',', encoding='utf-8')
    clean_data(data)
Example #3
def load_file():
	"""
	Looks for the csv files in the local directory, reads in each 
	of the CSV files, and returns panda dataframes for each one.
	"""
	dirname, _ = os.path.split(os.path.abspath(sys.argv[0]))
	csv_files = [file for file in os.listdir(dirname) if re.match(r"[0-9]{4}\.csv$", file)]
	csv_dfs = [pd.read_csv(os.path.join(dirname, csv_file), names=column_names) for csv_file in csv_files]
	return csv_dfs
    def __init__(self):
        self.file = pd.read_csv('blog/static/breast-cancer-wisconsin-data.csv')
        label = self.file['breast_cancer']
        features = self.file[self.file.columns[:-1]]

        split_data = train_test_split(label, features, test_size=0.1)
        train_label, self.test_label, train_features, self.test_features = split_data

        self.dtree_algo = tree.DecisionTreeClassifier()
        self.knn_algo = KNeighborsClassifier(n_neighbors=5)
        self.trained_knn = self.knn_algo.fit(train_features, train_label)
        self.trained_dtree = self.dtree_algo.fit(train_features, train_label)
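
    # A minimal evaluation sketch for this class; the "accuracy" method name and
    # its body are assumptions, not part of the original snippet.
    def accuracy(self):
        knn_pred = self.trained_knn.predict(self.test_features)
        dtree_pred = self.trained_dtree.predict(self.test_features)
        return (knn_pred == self.test_label).mean(), (dtree_pred == self.test_label).mean()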
def load_adj_csv(input_dir):
    # Load all csv files in directory input_dir
    result = {}
    for f in os.listdir(input_dir):

        #Extract file extension
        ext = f.split(".")[1]
        if ext == "csv":

            #Extract subject id; e.g 12345_adj.csv
            name = f.split("_")[0]
            path = os.path.join(input_dir, f)
            result[name] = pd.read_csv(path, header=None, index_col=None)
    return result
Example #6
def read_preprocess_rasters(wtd_old_rst_fn, can_rst_fn, dem_rst_fn,
                            peat_type_rst_fn, peat_depth_rst_fn):
    """
    Deals with issues specific to each set of input data rasters.
    Returns: can_arr, wtd_old, dem, peat_type_arr, peat_depth_arr
    """
    with rasterio.open(wtd_old_rst_fn) as wtd_old:
        wtd_old = wtd_old.read(1)
    with rasterio.open(dem_rst_fn) as dem:
        dem = dem.read(1)
    with rasterio.open(can_rst_fn) as can:
        can_arr = can.read(1)
    with rasterio.open(peat_type_rst_fn) as pt:
        peat_type_arr = pt.read(1)
    with rasterio.open(peat_depth_rst_fn) as pdepth:
        peat_depth_arr = pdepth.read(1)

    #Some small changes to get mask of canals: 1 where canals exist, 0 otherwise
    can_arr[can_arr < 0.5] = 0
    can_arr[abs(can_arr) > 0.5] = 1
    can_arr = np.array(can_arr, dtype=int)

    # Convert from numpy no data to -9999.0
    dem[dem < -10] = -9999.0
    dem[np.where(np.isnan(dem))] = -9999.0
    dem[dem > 1e20] = -9999.0  # just in case

    # control nodata values
    peat_type_arr[peat_type_arr < 0] = -1
    # fill some nodata values to get same size as dem
    peat_type_arr[(dem > 0.1) & (peat_type_arr < 0.1)] = 1.

    # control nodata values
    peat_depth_arr[peat_depth_arr < 0] = -1

    peat_depth_arr = peat_depth_map(
        peat_depth_arr)  # translate number keys to depths

    # fill some nodata values to get same size as dem
    peat_depth_arr[(dem > 0.1) & (peat_depth_arr < 0.1)] = 1.

    # Eliminate rows and columns full of noData values.
    # dem = dem[7:-7, 5:-15] #old
    # wtd_old = wtd_old[7:-7, 5:-15]
    # can_arr = can_arr[7:-7, 5:-15]
    # peat_type_arr = peat_type_arr[7:-7, 5:-15]
    # peat_depth_arr = peat_depth_arr[7:-7, 5:-15]

    return can_arr, wtd_old, dem, peat_type_arr, peat_depth_arr
Example #7
def mageck_mle(design_matrix):
    """

    :param design_matrix:
    :return:
    """
    # check directory
    path_check('mageck_mle')

    # args parse
    if isinstance(design_matrix, pd.DataFrame):
        pass
    elif isinstance(design_matrix, str):
        if not os.path.exists(design_matrix):
            logging.error('Design matrix file not found: %s' % design_matrix)
        design_matrix = pd.read_excel(design_matrix, index_col=0)
    else:
        logging.error('Design matrix must be pd.DataFrame or EXCEL file')
    commands = []
    for control in design_matrix.index:
        path_check('mageck_mle/' + control)
        samples = design_matrix.loc[control][
            design_matrix.loc[control] != 0].index
        mageck_design = pd.DataFrame(np.eye(len(samples) + 1))
        columns = ['baseline']
        columns.extend(samples)
        index = [control]
        index.extend(samples)
        mageck_design.columns = columns
        mageck_design.index = index
        mageck_design.index.name = 'Samples'
        mageck_design['baseline'] = 1
        mageck_design = mageck_design.astype(dtype=int)
        mageck_design.to_csv('mageck_mle/%s/mageck_mle_design.txt' % control,
                             sep='\t')
        command = 'mageck mle -k mageck.count_normalized.txt -d mageck_mle/%s/mageck_mle_design.txt -n mageck_mle/%s/%s --norm-method none --threads %s' % (
            control, control, control, cpu)
        commands.append(command)
    return commands
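
# A minimal usage sketch; the Excel file name and the sequential subprocess calls
# are assumptions for illustration only:
import subprocess
for cmd in mageck_mle('design_matrix.xlsx'):
    subprocess.run(cmd, shell=True, check=True)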
Example #8
def get_ratings(new_ratings_path):
    all_ratings_path = "data/demo/recipes.csv"
    all_ratings_df = pd.read_csv(all_ratings_path)
    new_ratings_df = pd.read_csv(new_ratings_path)

    new_usernames = set(new_ratings_df.username.unique())
    old_usernames = set(all_ratings_df.username.unique())
    username = list(new_usernames - old_usernames)[0]

    all_ratings_df = pd.concat(
        [all_ratings_df, new_ratings_df])  # append new ratings to old
    all_ratings_df = all_ratings_df.drop_duplicates()
    all_ratings_df.to_csv(all_ratings_path)  # update the csv with all ratings

    k = 3
    best_k, worst_k = demo_reco(k)

    print("Top {} Recommended Recipes for {}: ".format(k, username))
    for i, recipe in enumerate(best_k):
        print("{}.) {}".format(i, recipe))

    print("Bottom {} Recommended Recipes for {}: ".format(k, username))
    for i, recipe in enumerate(worst_k):
        print("{}.) {}".format(i, recipe))
  session_count / room_count as avg_sessions_per_room,
  sum(attended_duration/60) as session_minutes_sum
from cdm_clb.session
where attended_duration > 0
"""

outfile = "timeSpentInCollab.csv"

ctx = snowflake.connector.connect(user=cfg.sfconcfg['user'],
                                  password=cfg.sfconcfg['password'],
                                  account=cfg.sfconcfg['account'],
                                  warehouse=cfg.sfconcfg['warehouse'],
                                  database=cfg.sfconcfg['database'],
                                  insecure_mode=cfg.sfconcfg['insecure_mode'])
cs = ctx.cursor()
try:
    cs.execute(query)

    # Fetch the result set from the cursor and deliver it as the Pandas DataFrame.
    # df = cs.fetch_pandas_all()
    # TODO: fix this later, switched to pd.read_sql as fetch_pandas_all() causes segfault on linux
    df = pd.read_sql(query, ctx)

    print(df.head())

    df.to_csv(outfile, index=cfg.sfconcfg['timeSpentInCollab']['index'])

finally:
    cs.close()
ctx.close()
Example #10
import os.path as path
import sys
from pylab import *  # rc, figure, plot, array, polyfit, etc. used below
from scipy import interpolate
from pandas import read_excel as read
rc('text', usetex=True)
rc('text.latex',
   preamble=[
       r'\usepackage[russian]{babel}', r'\usepackage{amsmath}',
       r'\usepackage{amssymb}'
   ])

rc('font', family='serif')

rec = path.abspath('..' + '\\rec\\rec.xlsx')
sheet_name = 'Лист1'
df = read(rec, sheet_name=sheet_name)

figure(sheet_name)
x = array(df['x, мм'])
y = log(array(df['А, мВ']))

g = polyfit(x[8:-1], y[8:-1], 1)
print(g[0])
f = poly1d(g)
t = linspace(x[0], x[-1], 1000)
plot(t, f(t), label='approximation', color='darkblue')

plot(x[8:-1], y[8:-1], 'r.', label='experiment')
t = linspace(x[0], x[-1], len(x))
plot(t[0:8], f(t[0:8]), 'r.', label='_nolabel_')
ylabel(r'$\ln{U}$, arb. units', fontsize=16)
Example #11
import os.path as path
import sys
from pylab import *  # rc, figure, plot, array, polyfit, etc. used below
from scipy import interpolate
from pandas import read_excel as read
rc('text', usetex=True)
rc('text.latex',
   preamble=[
       r'\usepackage[russian]{babel}', r'\usepackage{amsmath}',
       r'\usepackage{amssymb}', r'\usepackage{mathrsfs}',
       r'\usepackage{gensymb}'
   ])

rc('font', family='serif')

rec = path.abspath('grad.xlsx')

df = read(rec, sheet_name='1')
figure('Calibration curve')
x = array(df['x'])
y = array(df['y'])
g = polyfit(x, y, 4)
print(g)
t = linspace(x[0], x[-1], 1000)
f = poly1d(g)
plot(t, f(t), color='darkblue')

xlabel(r'$T_{\text{junction}}, ^{\circ}C$', fontsize=16)
ylabel(r'$\mathscr{E}_{\text{thermo}}$, mV', fontsize=16)
grid(which='major', linestyle='-')
grid(which='minor', linestyle=':')
minorticks_on()
savefig(path.abspath('..' + '\\img\\grad.pdf'))
Example #12
import autograd.numpy as np
from autograd import grad, jacobian
from scipy.optimize import minimize
from pandas import read_csv as read
import matplotlib.pyplot as plt

### import data
train = read('Depth5/mse_train.csv')
valid = read('Depth5/mse_valid.csv')
test  = read('Depth5/mse_test.csv')

train.columns = ['index', 'e_s', 'e_f']
valid.columns = ['index', 'e_s', 'e_f']
test.columns  = ['index', 'e_s', 'e_f']

### e_s, e_f and N are used as globals below; they are assumed here to come from
### the training split (the original snippet leaves their definition implicit)
e_s, e_f = train['e_s'].values, train['e_f'].values
N = len(train)

### compute the negative log-likelihood
def NLL(theta):
    
    a = theta[0]
    b = theta[1]
    c = theta[2]
    
    mu    = a * e_s + b
    sigma = c * e_s
    
    ll = -N/2 * np.log(2*np.pi) - np.sum(np.log(sigma)) - 0.5 * np.sum(((e_f-mu)/ sigma)**2)
    return -ll/N
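
### a minimal sketch of how the fit could proceed with the minimize import above;
### the starting point theta0 is an arbitrary assumption
theta0 = np.array([1.0, 0.0, 0.1])
res = minimize(NLL, theta0, method='Nelder-Mead')
print(res.x)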

### compute the negative log-likelihood on the validation set    
def NLL_valid(theta):
    
Example #13
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np



colnames=['label','id','date','query','user','text']

df_train = pd.read_csv('/Users/zenger/FileDoc/ISTD 2018 T6/ComputationalData/lab3/trainingandtestdata/training.1600000.processed.noemoticon.csv', header=None, names=colnames, encoding='windows-1252')
df_test = pd.read_csv('/Users/zenger/FileDoc/ISTD 2018 T6/ComputationalData/lab3/trainingandtestdata/testdata.manual.2009.06.14.csv', header=None, names=colnames, encoding='windows-1252')

df_train.shape
df_train.head()
df_train['label'].value_counts()
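
# A minimal sketch that puts the Pipeline / CountVectorizer / MultinomialNB
# imports above to use; the split ratio and default parameters are assumptions:
text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
X_tr, X_te, y_tr, y_te = train_test_split(df_train['text'], df_train['label'], test_size=0.2)
text_clf.fit(X_tr, y_tr)
print(accuracy_score(y_te, text_clf.predict(X_te)))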
Example #14
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd

from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

reviews = pd.read_csv("/home/dhruv/Desktop/cmpe239/data/bigdf.csv")
df = pd.concat([reviews], ignore_index=True)
print(df.shape[0])
# print reviews['user_id'].head()
# print "number of reviews: ",reviews.shape[0]
# print "number of users: ",reviews.user_id.unique().shape[0]," number of businesses: ",reviews.business_id.unique().shape[0]

# def recompute_frame(ldf):
#     """
#     takes a dataframe ldf, makes a copy of it, and returns the copy
#     with all averages and review counts recomputed
#     this is used when a frame is subsetted.
#     """
#     ldfu=ldf.groupby('user_id')
#     ldfb=ldf.groupby('business_id')
#     user_avg=ldfu.stars.mean()
#     user_review_count=ldfu.review_id.count()
#     business_avg=ldfb.stars.mean()
#     business_review_count=ldfb.review_id.count()
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import string
import pandas as pd

def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free = soup.get_text()
    return html_free
def remove_punctuation(text):
    no_punc = "".join([c for c in text if c not in string.punctuation])
    return no_punc
review_df = pd.read_csv('amazon_product_reviews.csv')
print(review_df.shape)

reviews = review_df['customer_reviews'].str.split("//",n=4,expand=True)

print(reviews.head())

review_df['review_title'] = reviews[0]
review_df['rating'] = reviews[1]
review_df['review_date'] = reviews[2]
review_df['customer_name'] = reviews[3]
review_df['review'] = reviews[4]

review_df.drop(columns='customer_reviews',inplace = True)

review_df['review'] = review_df['review'].apply(lambda x:remove_punctuation(x))
def get_coord(path):
    with open(path + "tester.csv", 'r+') as f:
        data = pd.read_csv(f)
        data["coordinates"] = np.vectorize(convert_to_lat_lng)(
            data['Zip'].values, data['zip'].values)
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import numpy as np

#preparing data
data = pd.read("E:/Thesis/Pandas_csv/All_data.csv", sep=",")

mldata = data[['TSS', 'TP', 'Chl-Conc']]

X = mldata.dropna()

#Initiating Kmeans instance
clustering = KMeans(n_clusters=3)
clustering.fit(X)

centroids = clustering.cluster_centers_
labels = clustering.predict(X)

#plotting clustering results
fig = plt.figure(figsize=(10, 10))
ax = Axes3D(fig)
ax.tick_params(labelsize=12)
ax.scatter(X['Chl-Conc'],
           X['TP'],
           X['TSS'],
           c=labels)
ax.set_zlabel('TSS', fontsize=20)
ax.set_xlabel('Chl-a', fontsize=20)
ax.set_ylabel('TP', fontsize=20)
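
# Render the 3-D scatter (the original snippet stops before displaying the figure):
plt.show()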
Example #18
#%% Initials imports
import pandas as pd

#%% Load the dataset and first exploratory analysis
iris = pd.read_csv('iris.csv',
                   names=[
                       'sepal_length', 'sepal_width', 'petal_length',
                       'petal_width', 'class'
                   ])
print(iris.head())
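
#%% A couple of quick exploratory calls to go with the comment above; assumes the
# csv has no header row, matching the explicit names= list
print(iris.describe())
print(iris['class'].value_counts())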
Example #19
#path = current+'/database/sidis/expdata/' # for Linux

os.chdir(path)
# Check 
newDir = os.getcwd() # get current dir
print "Working directory for data %s" % newDir # check current dir

## Grabbing list of files
data=os.listdir('./') # list all files in dir
data=[files for files in data if files.endswith('.xlsx') and files.startswith('1')] # list of COMPASS data
print "Data files retrieved %s" % data


## Reading and restructuring data

dat = df(read(data[0]))

# Calculate 
dat["delta"] = np.sqrt(dat["stat_u"]**2.0) # measurment error
dat["qT"] = dat["pT"]/dat["z"]
dat["qT2"] = dat["qT"]**2 #

##Binning data
xBin=[0.023,0.04,0.055,0.075,0.1,0.14,0.2,0.3,0.4,0.6]
Q2Bin=[1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 5.0, 15.0]
zBin= [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.1]

xClas=range(len(xBin)-1)
Q2Clas=range(len(Q2Bin)-1)
zClas=range(len(zBin)-1)
import matplotlib.pyplot as pt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("datasets/train.csv").values
clf = DecisionTreeClassifier()

#training data
xtrain = data[0:21000, 1:]
train_label = data[:21000, 0]

clf.fit(xtrain, train_label)

#testing data
xtest = data[21000:, 1:]
actual_label = data[21000:, 0]

#prediction and graphed output
d = xtest[8]
d.shape = (28, 28)
pt.imshow(255 - d, cmap='gray')
print(clf.predict([xtest[0]]))
pt.show()
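
# A quick check of overall accuracy on the held-out rows (added for illustration):
print(clf.score(xtest, actual_label))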
Example #21
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from pandas import read_csv as read

path = "wh.csv"
rows = 1000

data = read(path, nrows=rows, delimiter=",")

# sns.lmplot(x="Humidity", y="Temperature (C)", data=data)
# plt.savefig("Humidtity.png")

# sns.lmplot(x="Apparent Temperature (C)", y="Temperature (C)", data=data)
# plt.savefig("Apparent Temperature (C).png")

# sns.lmplot(x="Wind Speed (km/h)", y="Temperature (C)", data=data)
# plt.savefig("Wind Speed (kmh).png")

# sns.lmplot(x="Wind Bearing (degrees)", y="Temperature (C)", data=data)
# plt.savefig("Wind Bearing (degrees).png")

# sns.lmplot(x="Visibility (km)", y="Temperature (C)", data=data)
# plt.savefig("Visibility (km).png")

# sns.lmplot(x="Loud Cover", y="Temperature (C)", data=data)
# plt.savefig("Loud Cover.png")

# sns.lmplot(x="Pressure (millibars)", y="Temperature (C)", data=data)
# plt.savefig("Pressure (millibars).png")

# sns.distplot(data['Temperature (C)'])
Example #22
        # Fancy progressbar indicator
        pb = ProgressBar(maxval=line_count(data_path))
        pb.start()

        # Normalize and write all rows
        for i, record in enumerate(csv_dict_reader(data_path)):
            record.update(stores_map[record['Store']])
            normalize_record(record)
            writer.writerow(record)
            pb.update(i)

    pb.finish()

    print "One-hotting the remaining integer features..."
    df = pd.read_csv(output_path, compression='gzip')

    log_features = [
        "CompetitionDistance",
        "CompetitionOpenDays",
        "Promo2RunDays",
    ]

    integer_features = [
        "NowDayOfWeek",
        "NowDayOfMonth",
        "NowWeek",
        "NowMonth",
    ]
    df[integer_features] = df[integer_features].astype(int)
    for c in integer_features:
Example #23
import numpy as np
from pandas import read_csv as read

path = "new.csv"
rows = 2000000
cols = ['step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFraud','hour','newSender','newReceiver','merchant','fraudsEarly']
data = read(path, delimiter=",", nrows=rows, usecols=cols)

print(data.head())

X = data.loc[:, data.columns != 'isFraud'].values

y = data.loc[:, 'isFraud'].values


print(X[0])

y = y.astype('int') # not quite sure why this is needed: https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])

# from sklearn import preprocessing
# # normalize the data attributes
# normalized_X = preprocessing.normalize(X)
# # standardize the data attributes
# standardized_X = preprocessing.scale(X)

Example #24
import pandas as pd
import numpy as np
import os

# Read the April JData action data with pandas
train_action_04 = pd.read_csv('JData_Action_201604/JData_Action_201604.csv')
# unique() lists every user_id exactly once; the loop below saves each April user
# with interactions to a separate csv file named after the user ID
for user_id in train_action_04['user_id'].unique():
    train_action_04[train_action_04['user_id'] == user_id].to_csv(
        'JData_201604_User/' + str(user_id) + '.csv')

# Cleaning rule 1
# First list the file names under the JData_201604_User folder
pathDir = os.listdir('JData_201604_User')
# Start cleaning: remove users who never purchased, added to cart or followed category-8 items
for dir in pathDir:
    user = pd.read_csv('JData_201604_User/' + dir)
    # type 2, 5 and 4 are add-to-cart, follow and purchase respectively
    user_type_2 = user[user['type'] == 2]
    user_type_5 = user[user['type'] == 5]
    user_type_4 = user[user['type'] == 4]
    if (8 not in user_type_2['cate'].unique()) and (
            8 not in user_type_4['cate'].unique()) and (
                8 not in user_type_5['cate'].unique()):
        os.remove('JData_201604_User/' +
                  dir)  # the original approach deletes the file outright; the commented line below instead renames it, appending a .1 tag to mark the user as cleaned
# os.rename('JData_201604_User/'+dir,'JData_201604_User/'+dir.split('.')[0]+'.1.csv')
    else:
        continue

# Cleaning rule 2
Example #25
import csv
import pandas as pd
import plotly.express as px

fr = pd.read_csv("data.csv")
mean = fr.groupby(["student_id", "level"], as_index=False)["attempt"].mean()
fig = px.scatter(mean,
                 x="student_id",
                 y="level",
                 size="attempt",
                 color="attempt")
fig.show()
Example #26
y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

import pydotplus
from sklearn.tree import export_graphviz

dot = export_graphviz(classifier, out_file= None,filled = True,rounded = True)
graph = pydotplus.graph_from_dot_data(dot)
graph.write_png('sample.png')


ASSOCIATION RULES:
import pandas as pd
data = pd.read_csv('MLRSMBAEX2-DataSet.csV')
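
# A minimal association-rules sketch; it assumes mlxtend is installed and that
# the dataset has already been one-hot encoded into boolean item columns:
from mlxtend.frequent_patterns import apriori, association_rules

frequent_items = apriori(data, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_items, metric="confidence", min_threshold=0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence']].head())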


--------------------DIMENSIONALITY REDUCTION :----------------------------
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = datasets.load_iris()
x = dataset.data
x = pd.DataFrame(x)

y = dataset.target
y = pd.DataFrame(y)
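
# A minimal PCA sketch on the iris features loaded above; two components is an
# arbitrary choice for illustration:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
x_reduced = pca.fit_transform(x)
print(pca.explained_variance_ratio_)
plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=dataset.target)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()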
Example #27


# In[66]:


from sklearn.preprocessing import StandardScaler as ss
data = ['latitude', 'longitude', 'mag', 'depth']
scaler = ss()
print(scaler.fit(dataset[data]))


# In[67]:


dataset = pd.read_csv("Earthquake.csv")


# In[68]:


import pandas as pd
import numpy as np


# In[69]:


ds

Example #28
import numpy as np
from pandas import read_csv as read

path = "wh.csv"
rows = 10

dataX = read(path, delimiter=",", usecols=[1,2,4,5,6,7,8,9,10,11])
dataY = read(path, delimiter=",", usecols=[3])

X = dataX.values
y = dataY.values

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 9] = le.fit_transform(X[:, 9])

from sklearn.model_selection import train_test_split as train

X_train, X_test, y_train, y_test = train(X, y, test_size=0.3, shuffle=False)

# print(X_train[0])
# print(y_train[0])
# print(X_test[0])
# print(y_test[0])
# print(len(y_test))

from sklearn.linear_model import LinearRegression
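
# A minimal continuation sketch: fit the imported LinearRegression on the split
# above and report the R^2 score on the held-out rows (added for illustration):
model = LinearRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))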
Example #29
    data = pd.read_csv('cup1.txt', skiprows = 6, sep = " ", names = ['date', 'time', 'V'])

    data = data.V.dropna()

    probe_reading_cup1 = np.mean(data)

    f.write(str(probe_reading_cup1))

data2 = pd.read_csv('probe_calibration.txt', sep = " ", names = ['density', 'probe_reading'])

plt.plot(data2.density, data2.probe_reading)

slope, intercept = np.polyfit(np.array(data2.density), np.array(data2.probe_reading), 1)

inv_slope = 1/slope

inv_intercept = 1/intercept

  
data3 = pd.read_csv('exp160621220022.txt', skiprows = 6, sep = " ", names = ['date', 'time', 'steps', 'z', 'V'])
data3['rho'] = inv_slope * data3.V + inv_intercept
plt.plot(data3.rho, data3.z)

slope, intercept = np.polyfit(np.array(data3.rho.dropna()), np.array(data3.z.dropna()),1)
strat_freq = math.sqrt((1/slope)*980)
inv_slope = 1/slope
inv_intercept = 1/intercept

print(strat_freq)

Example #30
File: wine.py Project: skvoz/wine
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from pandas import read_csv as read

path = "wine.csv"
data = read(path, delimiter=",")

X = data.values[::, 1:14]
y = data.values[::, 0:1]

# print(data)
# print(X, y)

from sklearn.model_selection import train_test_split as train
X_train, X_test, y_train, y_test = train(X, y, test_size=0.6)

from sklearn.ensemble import RandomForestClassifier

print(X_train)
print(X_train[::, 0:2])

X_train_draw = scale(X_train[::, 0:2])
X_test_draw = scale(X_test[::, 0:2])

print(X_train_draw)

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
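
# A minimal continuation sketch (added for illustration): fit the forest on the
# full 13-feature training split and report test accuracy.
clf.fit(X_train, y_train.ravel())
print(clf.score(X_test, y_test.ravel()))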
Example #31
import os
import pandas as pd
import numpy as np
import datetime
import glob
import statsmodels as sm

# WEB IMPRESSIONS #
imp_17 = '/Users/mcnamarp/Downloads/Initiative_Files/MSG INI Weekly Reporting - 2017.12.17.xlsx'
newest = max(glob.iglob('/Users/mcnamarp/Downloads/Initiative_Files/*.xlsx'), key=os.path.getctime)
impressions = pd.read_excel(newest, sheet_name='Paid Pivot', converters={'Week Starting': pd.to_datetime})
impressions_17 = pd.read_excel(imp_17, sheet_name='Paid Pivot', converters={'Week Starting': pd.to_datetime})
impressions = pd.concat([impressions, impressions_17])

# ARGUS ANALYSIS #
argus_imp = impressions[impressions['Campaign Name'] == 'Knicks Indy']
argus_imp = argus_imp[argus_imp['Week Number'].isin(range(43,53))]
argus_imp = argus_imp[argus_imp['Placement_14'].isin(['DATA DRIVEN PROSPECTING','INDY CRM TARGETING'])]

drops = ['Year','Month Number','Week Number','%Viewable Impressions','Placement_1','Placement_2','Placement_3',
		'Placement_4','Placement_5','Placement_6','Placement_7','Placement_8','Placement_9','Placement_10','Placement_11','Placement_12',
		'Placement_13','Placement_14','Placement_15','Placement_16','Placement_17','Placement_18','Concert Creative_1','ConcertCreative_2',
		'Creative Concept','Unit Size','Placement','Placement ID','Site ID','Campaign ID',
		'Area','Media Category','Campaign Name','Publisher','Actualized Imps','Week Starting','Week Ending','Actualized Clicks','Actualized Cost',
		'Viewable Impression Distribution','Viewable Impressions','Media','Campaign','Site (Social)','DCM Imps','DCM Clicks','DCM Spend','Revenue']
		
argus_imp.drop(drops, axis = 1, inplace = True)


impressions.drop(drops, axis = 1, inplace = True)
impressions.drop(['Devices','Tactic','Segment'], axis = 1, inplace = True)
Example #32
from pylab import plot,show
from numpy import vstack,array
from numpy.random import rand
import numpy as np
from scipy.cluster.vq import kmeans,vq
import pandas as pd
import pandas_datareader as dr
from math import sqrt
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

#Load our data set and convert it into a list
data_table = pd.read_csv(our_Data_set)
names = data_table[0][1:][0].tolist()

#Iterate through the data and create a list with all of the stock "Names"
#i.e. 'GOOG', 'AAPL', etc...
prices_list = []

for name in names:
    try:
        # 'yahoo' is an assumed data source; the original line left it blank
        prices = dr.DataReader(name, 'yahoo', '01/01/2017')['Adj Close']
        prices = pd.DataFrame(prices)
        prices.columns = [name]
        prices_list.append(prices)
    except:
        pass
prices_df = pd.concat(prices_list, axis=1)
Example #33
def test_select_urls(sample_folder):
    df = pd.read_csv(sample_folder + 'mock_rental_data.csv')
    urls = get_urls_from_table(cursor, table_name, url_column_name)
    assert len(urls) == df.shape[0]
Example #34
import os.path as path
import sys
from pylab import *  # rc, plot, linspace, grid, etc. used below
from scipy import interpolate
from pandas import read_excel as read
rc('text', usetex=True)
rc('text.latex',
   preamble=[
       r'\usepackage[russian]{babel}', r'\usepackage{amsmath}',
       r'\usepackage{amssymb}'
   ])

rc('font', family='serif')

rec = path.abspath('..' + '\\rec\\rec.xlsx')

df = read(rec, sheet_name='Uотр=3В')
xlabel(r'$U_{\text{res}}$, V', fontsize=16)
ylabel(r'$I$, divisions', fontsize=16)
x, y = 3 * df['Uрез, В/3'], df['I, mA']
g = interpolate.interp1d(x, y, 'quadratic')
x = linspace(111, 126, 10)
y = g(x)
plot(x, y, 'r')
plot(3 * df['Uрез, В/3'], df['I, mA'], 'ro', label=r'$U_\text{отр}=3$ В')
grid(which='major', linestyle='-')
grid(which='minor', linestyle=':')
minorticks_on()

df = read(rec, sheet_name='Uотр=19.5В')
x, y = 3 * df['Uрез, В/3'], df['I, mA']
g = interpolate.interp1d(x, y, 'quadratic')
Example #35
import pandas as pd
import numpy as np
import glob

all_data = pd.DataFrame()
for f in glob.glob("../in/*.xlsx"):
    df = pd.read_excel(f)
    all_data = pd.concat([all_data, df], ignore_index=True)

all_data.describe()
### the best practice is to convert the data column to a date time object.
all_data['date'] = pd.to_datetime(all_data['date'])

## Combining the Data
## like the join function in ArcGIS Table

status = pd.read_excel("../custom-status.xlsx")
status

all_data_st = pd.merge(all_data,status,how='left') #like Excel's vlookup
all_data_st.head()

all_data_st['status'].fillna('bronze', inplace=True)
all_data_st.head()


## Using Categories
all_data_st["status"] = all_data_st["status"].astype("category")
all_data_st.head()
all_data_st.dtypes

all_data_st.sort_values(by=["status"]).head()
Example #36
import numpy as np
from pandas import read_csv as read

path = "data.csv"
data = read(path, delimiter=",", nrows=2000000)

senders = set()
receivers = set()
frauds = set()
merchants = set()

is_new_sender = list()
is_new_receiver = list()
merhants_list = list()
frauds_early = list()

X = data.values

for trans in X:
    sender = trans[3]
    receiver = trans[6]
    is_fraud = trans[9]

    is_new_sender_app = 0
    is_new_receiver_app = 0
    is_merchant_app = 0
    is_fraud_early_app = 0

    ## look for unique values
    
    if sender not in senders:
Example #37
import os.path as path
from pylab import *  # rc, array, linspace used below
from scipy.optimize import curve_fit
from pandas import read_excel as read

rc('text', usetex=True)
rc('text.latex',
   preamble=[
       r'\usepackage[russian]{babel}', r'\usepackage{amsmath}',
       r'\usepackage{amssymb}'
   ])

rc('font', family='serif')
grad = path.abspath('.\\data\\grad.xlsx')


def func(x, a, b, c, d):
    return a * x**4 + b * x**3 + c * x**2 + d * x


df = read(grad, sheet_name='1')
x = array(df['x'])
y = array(df['y'])

popt, pcov = curve_fit(func, y, x)
a, b, c, d = popt
EDS2T = lambda x: a * x**4 + b * x**3 + c * x**2 + d * x

rec = path.abspath('.' + '\\data\\rec.xlsx')
df = read(rec, sheet_name='1')

et = linspace(0, 10, 1000)
Et = array(df['ЭДС'])  # mV
T = EDS2T(Et)
# plot(T, Et)
# show()