def scrape():
    # Launch the Splinter browser session (init_browser must be called, not just referenced)
    browser = init_browser()

    # NASA Mars news page
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    bs_version = BeautifulSoup(browser.html, 'html.parser')
    content_titles = bs_version.find_all(class_="content_title")
    paragraphs = bs_version.find_all(class_="article_teaser_body")

    # JPL featured image
    featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    browser.visit(featured_image_url)

    # Mars weather Twitter feed
    news_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(news_url)
    bs_news = BeautifulSoup(browser.html, 'html.parser')

    # Mars facts table
    facts_url = "https://space-facts.com/mars/"
    facts_tables = pd.read_html(facts_url)

    # falling asleep, will continue tomorrow on Fri 10/18/2019
    # Mars hemisphere images
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)
    bs_hemisphere = BeautifulSoup(browser.html, 'html.parser')

    # watched Astros kick the Yankees ass in their own stadium last night
    hemisphere_items = bs_hemisphere.find_all("div", class_="item")
    item_list = []
    for item in hemisphere_items:
        anchor_tag = item.find("a")
        href_attribute = anchor_tag["href"]
        # hrefs on this page are relative, so prepend the site root
        item_list.append({"anchor_url": "https://astrogeology.usgs.gov" + href_attribute})
    return item_list
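# A minimal usage sketch for scrape(), assuming Splinter drives the browser and that
# init_browser is a small helper like the one below (the helper and the chromedriver
# path are illustrative assumptions, not part of the original script).
from splinter import Browser

def init_browser():
    # Assumed helper: launch Chrome through a local chromedriver.
    return Browser("chrome", executable_path="chromedriver", headless=True)

if __name__ == "__main__":
    hemisphere_links = scrape()
    print(hemisphere_links)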
def csv_reader():
    """Read a csv file."""
    f = open(u'Данные для задания 3 и 4.csv')
    data = read(f, sep=',', encoding='utf-8')
    f.close()
    clean_data(data)
def load_file():
    """
    Looks for the csv files in the local directory, reads in each of the
    CSV files, and returns pandas dataframes for each one.
    """
    dirname, _ = os.path.split(os.path.abspath(sys.argv[0]))
    csv_files = [file for file in os.listdir(dirname)
                 if re.match(r"[0-9]{4}\.csv$", file)]
    csv_dfs = [pd.read_csv(os.path.join(dirname, csv_file), names=column_names)
               for csv_file in csv_files]
    return csv_dfs
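# A short usage sketch; column_names is the module-level list that load_file() relies on,
# so the names below are placeholders for whatever the yearly CSVs actually contain.
import os
import re
import sys
import pandas as pd

column_names = ["station", "date", "value"]  # placeholder column names

if __name__ == "__main__":
    yearly_frames = load_file()
    print("loaded {} yearly files".format(len(yearly_frames)))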
def __init__(self):
    self.file = pd.read_csv('blog/static/breast-cancer-wisconsin-data.csv')
    label = self.file['breast_cancer']
    features = self.file[self.file.columns[:-1]]
    # train_test_split returns the splits in the same order as its inputs:
    # (label_train, label_test, features_train, features_test)
    split_data = train_test_split(label, features, test_size=0.1)
    train_label, self.test_label, train_features, self.test_features = split_data
    self.dtree_algo = tree.DecisionTreeClassifier()
    self.knn_algo = KNeighborsClassifier(n_neighbors=5)
    self.trained_knn = self.knn_algo.fit(train_features, train_label)
    self.trained_dtree = self.dtree_algo.fit(train_features, train_label)
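# A brief usage sketch; the enclosing class name is not shown in the excerpt, so
# "CancerClassifier" below is only a stand-in for whatever the class is actually called.
if __name__ == "__main__":
    model = CancerClassifier()
    # score both fitted estimators on the held-out 10% split
    print("kNN accuracy:", model.trained_knn.score(model.test_features, model.test_label))
    print("tree accuracy:", model.trained_dtree.score(model.test_features, model.test_label))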
def load_adj_csv(input_dir):
    """Load all csv files in directory input_dir into a dict keyed by subject id."""
    result = {}
    for f in os.listdir(input_dir):
        # Extract file extension
        ext = f.split(".")[-1]
        if ext == "csv":
            # Extract subject id, e.g. 12345_adj.csv -> 12345
            name = f.split("_")[0]
            path = os.path.join(input_dir, f)
            result[name] = pd.read_csv(path, header=None, index_col=None)
    return result
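# A quick usage sketch, assuming input_dir holds files named like 12345_adj.csv;
# the directory name below is illustrative.
adjacency = load_adj_csv("adjacency_matrices")
for subject_id, adj_df in adjacency.items():
    print(subject_id, adj_df.shape)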
def read_preprocess_rasters(wtd_old_rst_fn, can_rst_fn, dem_rst_fn, peat_type_rst_fn, peat_depth_rst_fn):
    """
    Deals with issues specific to each set of input data rasters and returns
    the cleaned-up arrays.
    """
    with rasterio.open(wtd_old_rst_fn) as wtd_old:
        wtd_old = wtd_old.read(1)
    with rasterio.open(dem_rst_fn) as dem:
        dem = dem.read(1)
    with rasterio.open(can_rst_fn) as can:
        can_arr = can.read(1)
    with rasterio.open(peat_type_rst_fn) as pt:
        peat_type_arr = pt.read(1)
    with rasterio.open(peat_depth_rst_fn) as pdepth:  # renamed so it does not shadow pandas' pd
        peat_depth_arr = pdepth.read(1)

    # Some small changes to get mask of canals: 1 where canals exist, 0 otherwise
    can_arr[can_arr < 0.5] = 0
    can_arr[abs(can_arr) > 0.5] = 1
    can_arr = np.array(can_arr, dtype=int)

    # Convert from numpy no data to -9999.0
    dem[dem < -10] = -9999.0
    dem[np.where(np.isnan(dem))] = -9999.0
    dem[dem > 1e20] = -9999.0  # just in case

    # control nodata values
    peat_type_arr[peat_type_arr < 0] = -1
    # fill some nodata values to get same size as dem (element-wise mask, not Python `and`)
    peat_type_arr[(dem > 0.1) & (peat_type_arr < 0.1)] = 1.

    # control nodata values
    peat_depth_arr[peat_depth_arr < 0] = -1
    peat_depth_arr = peat_depth_map(peat_depth_arr)  # translate number keys to depths
    # fill some nodata values to get same size as dem
    peat_depth_arr[(dem > 0.1) & (peat_depth_arr < 0.1)] = 1.

    # Eliminate rows and columns full of noData values.
    # dem = dem[7:-7, 5:-15]  # old
    # wtd_old = wtd_old[7:-7, 5:-15]
    # can_arr = can_arr[7:-7, 5:-15]
    # peat_type_arr = peat_type_arr[7:-7, 5:-15]
    # peat_depth_arr = peat_depth_arr[7:-7, 5:-15]

    return can_arr, wtd_old, dem, peat_type_arr, peat_depth_arr
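# A minimal call sketch; the raster file names are placeholders for whatever GeoTIFFs
# the project actually uses.
can_arr, wtd_old, dem, peat_type_arr, peat_depth_arr = read_preprocess_rasters(
    'wtd_old.tif', 'canals.tif', 'dem.tif', 'peat_type.tif', 'peat_depth.tif')
print(dem.shape, can_arr.sum())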
def mageck_mle(design_matrix):
    """
    Build MAGeCK MLE commands from a design matrix.

    :param design_matrix: pd.DataFrame, or path to an Excel file holding the design matrix
    :return: list of `mageck mle` shell commands
    """
    # check directory
    path_check('mageck_mle')

    # args parse
    if isinstance(design_matrix, pd.DataFrame):
        pass
    elif isinstance(design_matrix, str) and os.path.exists(design_matrix):
        design_matrix = pd.read_excel(design_matrix, index_col=0)
    else:
        logging.error('Design matrix must be pd.DataFrame or EXCEL file')

    commands = []
    for control in design_matrix.index:
        path_check('mageck_mle/' + control)
        samples = design_matrix.loc[control][design_matrix.loc[control] != 0].index
        mageck_design = pd.DataFrame(np.eye(len(samples) + 1))
        columns = ['baseline']
        columns.extend(samples)
        index = [control]
        index.extend(samples)
        mageck_design.columns = columns
        mageck_design.index = index
        mageck_design.index.name = 'Samples'
        mageck_design['baseline'] = 1
        mageck_design = mageck_design.astype(dtype=int)
        mageck_design.to_csv('mageck_mle/%s/mageck_mle_design.txt' % control, sep='\t')
        command = ('mageck mle -k mageck.count_normalized.txt '
                   '-d mageck_mle/%s/mageck_mle_design.txt -n mageck_mle/%s/%s '
                   '--norm-method none --threads %s' % (control, control, control, cpu))
        commands.append(command)
    return commands
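# A small usage sketch: a toy design matrix where rows are controls and nonzero columns
# mark the samples paired with each control. The values are illustrative; path_check and
# cpu are module-level helpers/settings that the excerpt above does not show.
import numpy as np
import pandas as pd

cpu = 4  # assumed value for the module-level thread-count setting

toy_design = pd.DataFrame(
    {"sampleA": [1, 0], "sampleB": [0, 1]},
    index=["control1", "control2"])

for cmd in mageck_mle(toy_design):
    print(cmd)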
def get_ratings(new_ratings_path):
    all_ratings_path = "data/demo/recipes.csv"
    all_ratings_df = pd.read_csv(all_ratings_path)
    new_ratings_df = pd.read_csv(new_ratings_path)
    new_usernames = set(new_ratings_df.username.unique())
    old_usernames = set(all_ratings_df.username.unique())
    username = list(new_usernames - old_usernames)[0]
    all_ratings_df = all_ratings_df.append(new_ratings_df)  # append new ratings to old
    all_ratings_df = all_ratings_df.drop_duplicates()
    all_ratings_df.to_csv(all_ratings_path)  # update the csv with all ratings

    k = 3
    best_k, worst_k = demo_reco(k)
    print("Top {} Recommended Recipes for {}: ".format(k, username))
    for i, recipe in enumerate(best_k, 1):
        print("{}.) {}".format(i, recipe))
    print("Bottom {} Recommended Recipes for {}: ".format(k, username))
    for i, recipe in enumerate(worst_k, 1):
        print("{}.) {}".format(i, recipe))
    session_count / room_count as avg_sessions_per_room,
    sum(attended_duration/60) as session_minutes_sum
    from cdm_clb.session
    where attended_duration > 0
    """
outfile = "timeSpentInCollab.csv"

ctx = snowflake.connector.connect(user=cfg.sfconcfg['user'],
                                  password=cfg.sfconcfg['password'],
                                  account=cfg.sfconcfg['account'],
                                  warehouse=cfg.sfconcfg['warehouse'],
                                  database=cfg.sfconcfg['database'],
                                  insecure_mode=cfg.sfconcfg['insecure_mode'])
cs = ctx.cursor()
try:
    cs.execute(query)
    # Fetch the result set from the cursor and deliver it as the Pandas DataFrame.
    # df = cs.fetch_pandas_all()
    # TODO: fix this later, switched to pd.read_sql as fetch_pandas_all() causes segfault on linux
    df = pd.read_sql(query, ctx)
    print(df.head())
    df.to_csv(outfile, index=cfg.sfconcfg['timeSpentInCollab']['index'])
finally:
    cs.close()
    ctx.close()
import os.path as path
import sys

from scipy import interpolate
from pandas import read_excel as read
from pylab import *  # rc, figure, plot, polyfit, etc. (assumed pylab-style imports)

rc('text', usetex=True)
rc('text.latex', preamble=[
    r'\usepackage[russian]{babel}',
    r'\usepackage{amsmath}',
    r'\usepackage{amssymb}'
])
rc('font', family='serif')

rec = path.abspath('..' + '\\rec\\rec.xlsx')
sheet_name = 'Лист1'  # "Sheet1"
df = read(rec, sheet_name=sheet_name)

figure(sheet_name)
x = array(df['x, мм'])       # column: x, mm
y = log(array(df['А, мВ']))  # column: A, mV
g = polyfit(x[8:-1], y[8:-1], 1)
print(g[0])
f = poly1d(g)
t = linspace(x[0], x[-1], 1000)
plot(t, f(t), label='аппроксимация', color='darkblue')  # label: "fit"
plot(x[8:-1], y[8:-1], 'r.', label='эксперимент')       # label: "experiment"
t = linspace(x[0], x[-1], len(x))
plot(t[0:8], f(t[0:8]), 'r.', label='_nolabel_')
ylabel(r'$\ln{U},\text{ у.е.}$', fontsize=16)  # ln(U), arbitrary units
import os.path as path
import sys

from scipy import interpolate
from pandas import read_excel as read
from pylab import *  # rc, figure, plot, polyfit, etc. (assumed pylab-style imports)

rc('text', usetex=True)
rc('text.latex', preamble=[
    r'\usepackage[russian]{babel}',
    r'\usepackage{amsmath}',
    r'\usepackage{amssymb}',
    r'\usepackage{mathrsfs}',
    r'\usepackage{gensymb}'
])
rc('font', family='serif')

rec = path.abspath('grad.xlsx')
df = read(rec, sheet_name='1')

figure('Градуировочный график')  # "Calibration curve"
x = array(df['x'])
y = array(df['y'])
g = polyfit(x, y, 4)
print(g)
t = linspace(x[0], x[-1], 1000)
f = poly1d(g)
plot(t, f(t), color='darkblue')
xlabel(r'$T_{\text{спая}}, ^{\circ}C$', fontsize=16)      # junction temperature, deg C
ylabel(r'$\mathscr{E}_{\text{термо}}$, мВ', fontsize=16)  # thermo-EMF, mV
grid(which='major', linestyle='-')
grid(which='minor', linestyle=':')
minorticks_on()
savefig(path.abspath('..' + '\\img\\grad.pdf'))
import autograd.numpy as np
from autograd import grad, jacobian
from scipy.optimize import minimize
from pandas import read_csv as read
import matplotlib.pyplot as plt

### import data
train = read('Depth5/mse_train.csv')
valid = read('Depth5/mse_valid.csv')
test = read('Depth5/mse_test.csv')
train.columns = ['index', 'e_s', 'e_f']
valid.columns = ['index', 'e_s', 'e_f']
test.columns = ['index', 'e_s', 'e_f']

### compute the negative log-likelihood
def NLL(theta):
    # e_s, e_f and N are module-level arrays/constants taken from the training data
    a = theta[0]
    b = theta[1]
    c = theta[2]
    mu = a * e_s + b
    sigma = c * e_s
    ll = -N/2 * np.log(2*np.pi) - np.sum(np.log(sigma)) - 0.5 * np.sum(((e_f - mu) / sigma)**2)
    return -ll/N

### compute the negative log-likelihood on the validation set
def NLL_valid(theta):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

colnames = ['label', 'id', 'date', 'query', 'user', 'text']
df_train = pd.read_csv('/Users/zenger/FileDoc/ISTD 2018 T6/ComputationalData/lab3/trainingandtestdata/training.1600000.processed.noemoticon.csv',
                       header=None, names=colnames, encoding='windows 1252')
df_test = pd.read_csv('/Users/zenger/FileDoc/ISTD 2018 T6/ComputationalData/lab3/trainingandtestdata/testdata.manual.2009.06.14.csv',
                      header=None, names=colnames, encoding='windows 1252')

df_train.shape
df_train.head()
df_train['label'].value_counts()
from collections import defaultdict
import json

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl

reviews = pd.read_csv("/home/dhruv/Desktop/cmpe239/data/bigdf.csv")
df = pd.concat([reviews], ignore_index=True)  # pd.concat expects a sequence of frames
print df.shape[0]
# print reviews['user_id'].head()
# print "number of reviews: ", reviews.shape[0]
# print "number of users: ", reviews.user_id.unique().shape[0], " number of businesses: ", reviews.business_id.unique().shape[0]

# def recompute_frame(ldf):
#     """
#     takes a dataframe ldf, makes a copy of it, and returns the copy
#     with all averages and review counts recomputed
#     this is used when a frame is subsetted.
#     """
#     ldfu = ldf.groupby('user_id')
#     ldfb = ldf.groupby('business_id')
#     user_avg = ldfu.stars.mean()
#     user_review_count = ldfu.review_id.count()
#     business_avg = ldfb.stars.mean()
#     business_review_count = ldfb.review_id.count()
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords   # needed by remove_stopwords
from bs4 import BeautifulSoup       # needed by remove_html
import string                       # needed by remove_punctuation
import pandas as pd


def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words


def remove_html(text):
    soup = BeautifulSoup(text, 'lxml')
    html_free = soup.get_text()
    return html_free


def remove_punctuation(text):
    no_punc = "".join([c for c in text if c not in string.punctuation])
    return no_punc


review_df = pd.read_csv('amazon_product_reviews.csv')
print(review_df.shape)

reviews = review_df['customer_reviews'].str.split("//", n=4, expand=True)
print(reviews.head())
review_df['review_title'] = reviews[0]
review_df['rating'] = reviews[1]
review_df['review_date'] = reviews[2]
review_df['customer_name'] = reviews[3]
review_df['review'] = reviews[4]
review_df.drop(columns='customer_reviews', inplace=True)
review_df['review'] = review_df['review'].apply(lambda x: remove_punctuation(x))
def get_coord(path):
    with open(path + "tester.csv", 'r+') as f:
        data = pd.read_csv(f)
    # 'Zip' and 'zip' are assumed to be two distinct columns in tester.csv that
    # convert_to_lat_lng expects as its two arguments
    data["coordinates"] = np.vectorize(convert_to_lat_lng)(
        data['Zip'].values, data['zip'].values)
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import numpy as np

# preparing data
data = pd.read_csv("E:/Thesis/Pandas_csv/All_data.csv", sep=",")
mldata = data[['TSS', 'TP', 'Chl-Conc']]
X = mldata.dropna()

# Initiating KMeans instance
clustering = KMeans(n_clusters=3)
clustering.fit(X)
centroids = clustering.cluster_centers_
labels = clustering.predict(X)

# plotting clustering results; colors per cluster (illustrative palette, the excerpt
# does not define color_theme), and the points must come from the fitted frame X
color_theme = np.array(['darkgray', 'lightsalmon', 'powderblue'])
fig = plt.figure(figsize=(10, 10))
ax = Axes3D(fig)
ax.tick_params(labelsize=12)
ax.scatter(X['Chl-Conc'], X['TP'], X['TSS'],
           c=color_theme[clustering.labels_])
ax.set_zlabel('TSS', fontsize=20)
ax.set_xlabel('Chl-a', fontsize=20)
ax.set_ylabel('TP', fontsize=20)
#%% Initial imports
import pandas as pd

#%% Load the dataset and first exploratory analysis
iris = pd.read_csv('iris.csv', names=[
    'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'
])
print(iris.head())
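#%% Optional follow-up checks on the freshly loaded frame (only names defined above are used)
print(iris.describe())
print(iris['class'].value_counts())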
# path = current + '/database/sidis/expdata/'  # for Linux
os.chdir(path)

# Check
newDir = os.getcwd()                            # get current dir
print "Working directory for data %s" % newDir  # check current dir

## Grabbing list of files
data = os.listdir('./')  # list all files in dir
data = [files for files in data
        if files.endswith('.xlsx') and files.startswith('1')]  # list of COMPASS data
print "Data files retrieved %s" % data

## Reading and restructuring data
dat = df(read(data[0]))

# Calculate
dat["delta"] = np.sqrt(dat["stat_u"]**2.0)  # measurement error
dat["qT"] = dat["pT"]/dat["z"]
dat["qT2"] = dat["qT"]**2

## Binning data
xBin = [0.023, 0.04, 0.055, 0.075, 0.1, 0.14, 0.2, 0.3, 0.4, 0.6]
Q2Bin = [1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 3.0, 5.0, 15.0]
zBin = [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.1]
xClas = range(len(xBin)-1)
Q2Clas = range(len(Q2Bin)-1)
zClas = range(len(zBin)-1)
import matplotlib.pyplot as pt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("datasets/train.csv").values
clf = DecisionTreeClassifier()

# training data: column 0 is the label, the remaining columns are pixel values
xtrain = data[0:21000, 1:]
train_label = data[0:21000, 0]
clf.fit(xtrain, train_label)

# testing data
xtest = data[21000:, 1:]
actual_label = data[21000:, 0]

# prediction and graphed output
d = xtest[8]
d.shape = (28, 28)
pt.imshow(255 - d, cmap='gray')
print(clf.predict([xtest[8]]))
pt.show()
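# A short follow-up sketch: score the fitted tree on the held-out rows
# (uses only the names already defined above).
print("held-out accuracy:", clf.score(xtest, actual_label))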
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from pandas import read_csv as read

path = "wh.csv"
rows = 1000
data = read(path, nrows=rows, delimiter=",")

# sns.lmplot(x="Humidity", y="Temperature (C)", data=data)
# plt.savefig("Humidity.png")
# sns.lmplot(x="Apparent Temperature (C)", y="Temperature (C)", data=data)
# plt.savefig("Apparent Temperature (C).png")
# sns.lmplot(x="Wind Speed (km/h)", y="Temperature (C)", data=data)
# plt.savefig("Wind Speed (kmh).png")
# sns.lmplot(x="Wind Bearing (degrees)", y="Temperature (C)", data=data)
# plt.savefig("Wind Bearing (degrees).png")
# sns.lmplot(x="Visibility (km)", y="Temperature (C)", data=data)
# plt.savefig("Visibility (km).png")
# sns.lmplot(x="Loud Cover", y="Temperature (C)", data=data)
# plt.savefig("Loud Cover.png")
# sns.lmplot(x="Pressure (millibars)", y="Temperature (C)", data=data)
# plt.savefig("Pressure (millibars).png")
# sns.distplot(data['Temperature (C)'])
# Fancy progressbar indicator
pb = ProgressBar(maxval=line_count(data_path))
pb.start()

# Normalize and write all rows
for i, record in enumerate(csv_dict_reader(data_path)):
    record.update(stores_map[record['Store']])
    normalize_record(record)
    writer.writerow(record)
    pb.update(i)
pb.finish()

print "One-hotting the remaining integer features..."

df = pd.read_csv(output_path, compression='gzip')

log_features = [
    "CompetitionDistance",
    "CompetitionOpenDays",
    "Promo2RunDays",
]
integer_features = [
    "NowDayOfWeek",
    "NowDayOfMonth",
    "NowWeek",
    "NowMonth",
]

df[integer_features] = df[integer_features].astype(int)
for c in integer_features:
import numpy as np
from pandas import read_csv as read

path = "new.csv"
rows = 2000000
cols = ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
        'newbalanceDest', 'isFraud', 'hour', 'newSender', 'newReceiver', 'merchant', 'fraudsEarly']
data = read(path, delimiter=",", nrows=rows, usecols=cols)
print(data.head())

X = data.loc[:, data.columns != 'isFraud'].values
y = data.loc[:, 'isFraud'].values
print(X[0])
y = y.astype('int')  # not entirely sure why this is needed: https://stackoverflow.com/questions/45346550/valueerror-unknown-label-type-unknown

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])

# from sklearn import preprocessing
# # normalize the data attributes
# normalized_X = preprocessing.normalize(X)
# # standardize the data attributes
# standardized_X = preprocessing.scale(X)
import pandas as pd
import numpy as np
import os

# Read in the April action data (JData_Action_201604) with pandas
train_action_04 = pd.read_csv('JData_Action_201604/JData_Action_201604.csv')

# unique() lists every user_id exactly once; the loop below saves each user that had
# interactions in April to its own csv file, named after the user ID
for user_id in train_action_04['user_id'].unique():
    train_action_04[train_action_04['user_id'] == user_id].to_csv(
        'JData_201604_User/' + str(user_id) + '.csv')

# Cleaning rule 1
# First list the file names under the JData_201604_User folder
pathDir = os.listdir('JData_201604_User')
# Start cleaning: remove users who never purchased, added to cart, or followed category-8 products
for dir in pathDir:
    user = pd.read_csv('JData_201604_User/' + dir)
    # types 2, 5 and 4 are add-to-cart, follow and purchase respectively
    user_type_2 = user[user['type'] == 2]
    user_type_5 = user[user['type'] == 5]
    user_type_4 = user[user['type'] == 4]
    if (8 not in user_type_2['cate'].unique()) and (
            8 not in user_type_4['cate'].unique()) and (
            8 not in user_type_5['cate'].unique()):
        os.remove('JData_201604_User/' + dir)
        # The original approach deletes the file outright; the alternative below instead renames it,
        # adding a '.1' tag to mark the user as cleaned
        # os.rename('JData_201604_User/'+dir, 'JData_201604_User/'+dir.split('.')[0]+'.1.csv')
    else:
        continue

# Cleaning rule 2
import csv
import pandas as pd
import plotly.express as px

fr = pd.read_csv("data.csv")
mean = fr.groupby(["student_id", "level"], as_index=False)["attempt"].mean()

fig = px.scatter(mean, x="student_id", y="level",
                 size="attempt", color="attempt")
fig.show()
y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

import pydotplus
from sklearn.tree import export_graphviz
dot = export_graphviz(classifier, out_file=None, filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot)
graph.write_png('sample.png')

ASSOCIATION RULES:

import pandas as pd
data = pd.read_csv('MLRSMBAEX2-DataSet.csV')

--------------------DIMENSIONALITY REDUCTION :----------------------------

from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = datasets.load_iris()
x = dataset.data
x = pd.DataFrame(x)
y = dataset.target
y = pd.DataFrame(y)
# In[66]:

from sklearn.preprocessing import StandardScaler as ss

data = ['latitude', 'longitude', 'mag', 'depth']
scaler = ss()  # instantiate the scaler; fit() expects numeric columns, not column names
print(scaler.fit(dataset[['latitude', 'longitude']]))

# In[67]:

dataset = pd.read_csv("Earthquake.csv")

# In[68]:

import pandas as pd
import numpy as np

# In[69]:

ds
import numpy as np
from pandas import read_csv as read

path = "wh.csv"
rows = 10

dataX = read(path, delimiter=",", usecols=[1, 2, 4, 5, 6, 7, 8, 9, 10, 11])
dataY = read(path, delimiter=",", usecols=[3])

X = dataX.values
y = dataY.values

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 9] = le.fit_transform(X[:, 9])

from sklearn.model_selection import train_test_split as train
X_train, X_test, y_train, y_test = train(X, y, test_size=0.3, shuffle=False)

# print(X_train[0])
# print(y_train[0])
# print(X_test[0])
# print(y_test[0])
# print(len(y_test))

from sklearn.linear_model import LinearRegression
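# A brief continuation sketch: fit the linear model on the training split and report
# the R^2 score on the held-out rows (only names already defined above are used).
reg = LinearRegression()
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))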
data = pd.read_csv('cup1.txt', skiprows=6, sep=" ", names=['date', 'time', 'V'])
data = data.V.dropna()
probe_reading_cup1 = np.mean(data)
f.write(str(probe_reading_cup1))  # f is the output file opened earlier in the script

data2 = pd.read_csv('probe_calibration.txt', sep=" ", names=['density', 'probe_reading'])
plt.plot(data2.density, data2.probe_reading)
slope, intercept = np.polyfit(np.array(data2.density), np.array(data2.probe_reading), 1)
inv_slope = 1/slope
inv_intercept = 1/intercept

data3 = pd.read_csv('exp160621220022.txt', skiprows=6, sep=" ",
                    names=['date', 'time', 'steps', 'z', 'V'])
data3['rho'] = inv_slope * data3.V + inv_intercept
plt.plot(data3.rho, data3.z)
slope, intercept = np.polyfit(np.array(data3.rho.dropna()), np.array(data3.z.dropna()), 1)
strat_freq = math.sqrt((1/slope)*980)
inv_slope = 1/slope
inv_intercept = 1/intercept
print(strat_freq)
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from pandas import read_csv as read

path = "wine.csv"
data = read(path, delimiter=",")

X = data.values[::, 1:14]
y = data.values[::, 0:1]
# print(data)
# print(X, y)

from sklearn.model_selection import train_test_split as train  # sklearn.cross_validation has been removed
X_train, X_test, y_train, y_test = train(X, y, test_size=0.6)

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale

print(X_train)
print(X_train[::, 0:2])

X_train_draw = scale(X_train[::, 0:2])
X_test_draw = scale(X_test[::, 0:2])
print(X_train_draw)

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
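# A brief continuation sketch: fit the forest on the full training split and report
# accuracy on the held-out rows; ravel() flattens the (n, 1) label column into a 1-D array.
clf.fit(X_train, y_train.ravel())
print(clf.score(X_test, y_test.ravel()))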
import os
import pandas as pd
import numpy as np
import datetime
import glob
import statsmodels as sm

# WEB IMPRESSIONS #
imp_17 = '/Users/mcnamarp/Downloads/Initiative_Files/MSG INI Weekly Reporting - 2017.12.17.xlsx'
newest = max(glob.iglob('/Users/mcnamarp/Downloads/Initiative_Files/*.xlsx'), key=os.path.getctime)

impressions = pd.read_excel(newest, sheetname='Paid Pivot',
                            converters={'Week Starting': pd.to_datetime})
impressions_17 = pd.read_excel(imp_17, sheetname='Paid Pivot',
                               converters={'Week Starting': pd.to_datetime})
impressions = impressions.append(impressions_17)

# ARGUS ANALYSIS #
argus_imp = impressions[impressions['Campaign Name'] == 'Knicks Indy']
argus_imp = argus_imp[argus_imp['Week Number'].isin(range(43, 53))]
argus_imp = argus_imp[argus_imp['Placement_14'].isin(['DATA DRIVEN PROSPECTING', 'INDY CRM TARGETING'])]

drops = ['Year', 'Month Number', 'Week Number', '%Viewable Impressions', 'Placement_1', 'Placement_2', 'Placement_3',
         'Placement_4', 'Placement_5', 'Placement_6', 'Placement_7', 'Placement_8', 'Placement_9', 'Placement_10',
         'Placement_11', 'Placement_12', 'Placement_13', 'Placement_14', 'Placement_15', 'Placement_16',
         'Placement_17', 'Placement_18', 'Concert Creative_1', 'ConcertCreative_2', 'Creative Concept', 'Unit Size',
         'Placement', 'Placement ID', 'Site ID', 'Campaign ID', 'Area', 'Media Category', 'Campaign Name',
         'Publisher', 'Actualized Imps', 'Week Starting', 'Week Ending', 'Actualized Clicks', 'Actualized Cost',
         'Viewable Impression Distribution', 'Viewable Impressions', 'Media', 'Campaign', 'Site (Social)',
         'DCM Imps', 'DCM Clicks', 'DCM Spend', 'Revenue']
argus_imp.drop(drops, axis=1, inplace=True)
impressions.drop(drops, axis=1, inplace=True)
impressions.drop(['Devices', 'Tactic', 'Segment'], axis=1, inplace=True)
from pylab import plot, show
from numpy import vstack, array
from numpy.random import rand
import numpy as np
from scipy.cluster.vq import kmeans, vq
import pandas as pd
import pandas_datareader as dr
from math import sqrt
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# Load our data set and convert it into a list
data_table = pd.read_csv(our_Data_set)
names = data_table[0][1:][0].tolist()

# Iterate through the data and create a list with all of the stock "Names"
# i.e. 'GOOG', 'AAPL', etc...
prices_list = []
for name in names:
    try:
        # the data-source argument was missing in the original; 'yahoo' is assumed here
        prices = dr.DataReader(name, 'yahoo', '01/01/2017')['Adj Close']
        prices = pd.DataFrame(prices)
        prices.columns = [name]
        prices_list.append(prices)
    except:
        pass
prices_df = pd.concat(prices_list, axis=1)
def test_select_urls(sample_folder):
    df = pd.read_csv(sample_folder + 'mock_rental_data.csv')
    urls = get_urls_from_table(cursor, table_name, url_column_name)
    assert len(urls) == df.shape[0]
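# A possible pytest fixture for sample_folder, sketched under the assumption that the mock
# CSV lives in a data/ directory next to the tests; cursor, table_name and url_column_name
# are module-level test constants that the excerpt above does not show.
import os
import pytest

@pytest.fixture
def sample_folder():
    # directory containing mock_rental_data.csv, with a trailing separator so that
    # string concatenation in the test builds a valid path
    return os.path.join(os.path.dirname(__file__), 'data') + os.sep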
import os.path as path
import sys

from scipy import interpolate
from pandas import read_excel as read
from pylab import *  # rc, plot, linspace, etc. (assumed pylab-style imports)

rc('text', usetex=True)
rc('text.latex', preamble=[
    r'\usepackage[russian]{babel}',
    r'\usepackage{amsmath}',
    r'\usepackage{amssymb}'
])
rc('font', family='serif')

rec = path.abspath('..' + '\\rec\\rec.xlsx')
df = read(rec, sheet_name='Uотр=3В')  # sheet: reflector voltage U_refl = 3 V

xlabel(r'$U_{\text{рез}}$, \text{В}', fontsize=16)  # resonator voltage, V
ylabel(r'$I$, деления', fontsize=16)                # current, scale divisions

x, y = 3 * df['Uрез, В/3'], df['I, mA']
g = interpolate.interp1d(x, y, 'quadratic')
x = linspace(111, 126, 10)
y = g(x)
plot(x, y, 'r')
plot(3 * df['Uрез, В/3'], df['I, mA'], 'ro', label=r'$U_\text{отр}=3$ В')
grid(which='major', linestyle='-')
grid(which='minor', linestyle=':')
minorticks_on()

df = read(rec, sheet_name='Uотр=19.5В')  # sheet: reflector voltage U_refl = 19.5 V
x, y = 3 * df['Uрез, В/3'], df['I, mA']
g = interpolate.interp1d(x, y, 'quadratic')
import pandas as pd
import numpy as np
import glob

all_data = pd.DataFrame()
for f in glob.glob("../in/*.xlsx"):
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)

all_data.describe()

### the best practice is to convert the date column to a date time object.
all_data['date'] = pd.to_datetime(all_data['date'])

## Combining the Data
## like the join function in an ArcGIS table
status = pd.read_excel("../custom-status.xlsx")
status
all_data_st = pd.merge(all_data, status, how='left')  # like Excel's vlookup
all_data_st.head()

all_data_st['status'].fillna('bronze', inplace=True)
all_data_st.head()

## Using Categories
all_data_st["status"] = all_data_st["status"].astype("category")
all_data_st.head()
all_data_st.dtypes
all_data_st.sort_values(by=["status"]).head()
import numpy as np
from pandas import read_csv as read

path = "data.csv"
data = read(path, delimiter=",", nrows=2000000)

senders = set()
receivers = set()
frauds = set()
merchants = set()

is_new_sender = list()
is_new_receiver = list()
merhants_list = list()
frauds_early = list()

X = data.values
for trans in X:
    sender = trans[3]
    receiver = trans[6]
    is_fraud = trans[9]

    is_new_sender_app = 0
    is_new_receiver_app = 0
    is_merchant_app = 0
    is_fraud_early_app = 0

    ## look for previously unseen values
    if sender not in senders:
import os.path as path

from scipy.optimize import curve_fit
from pandas import read_excel as read
from pylab import *  # rc, array, linspace, plot, etc. (assumed pylab-style imports)

rc('text', usetex=True)
rc('text.latex', preamble=[
    r'\usepackage[russian]{babel}',
    r'\usepackage{amsmath}',
    r'\usepackage{amssymb}'
])
rc('font', family='serif')

grad = path.abspath('.\\data\\grad.xlsx')


def func(x, a, b, c, d):
    return a * x**4 + b * x**3 + c * x**2 + d * x


df = read(grad, sheet_name='1')
x = array(df['x'])
y = array(df['y'])
popt, pcov = curve_fit(func, y, x)
a, b, c, d = popt
EDS2T = lambda x: a * x**4 + b * x**3 + c * x**2 + d * x  # EMF -> temperature calibration

rec = path.abspath('.' + '\\data\\rec.xlsx')
df = read(rec, sheet_name='1')
et = linspace(0, 10, 1000)
Et = array(df['ЭДС'])  # EMF column, mV
T = EDS2T(Et)
# plot(T, Et)
# show()