def get_id(sec):
    '''Scrape the internal HTML ID for the film.'''
    global sec_to_id
    if sec_to_id.empty:
        try:
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
        except OSError:
            print('Security -> ID table not found, making a new one')
            with open('hsx_security_to_id', 'w') as f:
                f.write('security,id\n')
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
    if sec not in sec_to_id:
        r = requests.get('http://www.hsx.com/security/view/{}'.format(sec))
        # extract the ID from the page's inline scripts
        soup = BeautifulSoup(r.text)
        try:
            script = soup.findAll('script')[4].text.split('\n')
            sec_id = script[3].split('=')[2]
            sec_id = sec_id.split('"')[0]
        except (IndexError, AttributeError):
            print("Cannot find id for {}".format(sec))
            return -1
        sec_to_id[sec] = int(sec_id)
        Series.to_csv(sec_to_id, 'hsx_security_to_id', header='security,id')
    return sec_to_id[sec]
def downloadDailies(movie_id, movie_name='', save=True):
    index = ['Day', 'Date', 'Rank', 'Gross', '% Change Prev Day',
             '% Change Prev Week', 'Theaters', 'Avg/Theater',
             'Gross-to-Date', 'Day #']
    r = requests.get('{0}/movies/?page=daily&view=chart&id={1}.htm'.format(base_url, movie_id))
    soup = BeautifulSoup(r.text, 'lxml')
    if not movie_name:
        try:
            movie_name = soup.find('font', attrs={'face': 'Verdana', 'size': '6'}).text
        except AttributeError:
            movie_name = soup.find('font', attrs={'face': 'Verdana', 'size': '5'}).text
    table = soup.find('table', attrs={'class': 'chart-wide'})
    if table is None:
        print('{} does not have daily box office numbers'.format(movie_id))
        return DataFrame()
    results = {}
    for tr in table.findAll('tr')[1:]:
        result = [td.text for td in tr.findAll('td')]
        if len(result) > 1:
            dt = datetime.strptime(result[1].replace('\t', '').replace('.', ''), '%b %d, %Y')
            results[dt] = result
    df = DataFrame(results, index=index).T
    # clean data
    df['Gross'] = df['Gross'].map(lambda x: int(x[1:].replace(',', '')))
    df['Gross-to-Date'] = df['Gross-to-Date'].map(lambda x: int(x[1:].replace(',', '')))
    df['Avg/Theater'] = df['Avg/Theater'].map(lambda x: int(x[1:].replace(',', '')))
    df['% Change Prev Day'] = df['% Change Prev Day'].map(
        lambda x: float('nan') if x == '-' else float(x[:-1].replace(',', '')))
    df['% Change Prev Week'] = df['% Change Prev Week'].map(
        lambda x: float('nan') if x == '-' else float(x[:-1].replace(',', '')))
    if save:
        df.to_csv('boxoffice/{}.csv'.format(movie_id))
        series = Series.from_csv('boxoffice/' + film_index_name)
        series[movie_name] = movie_id
        series.to_csv('boxoffice/' + film_index_name)
    return df
def test_to_csv_float_format(self):
    with ensure_clean() as filename:
        ser = Series([0.123456, 0.234567, 0.567567])
        ser.to_csv(filename, float_format='%.2f')

        rs = Series.from_csv(filename)
        xp = Series([0.12, 0.23, 0.57])
        assert_series_equal(rs, xp)
def test_from_csv_deprecation(self):
    # see gh-17812
    with ensure_clean() as path:
        self.ts.to_csv(path)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            ts = self.read_csv(path)
            depr_ts = Series.from_csv(path)
            assert_series_equal(depr_ts, ts)
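# The tests above call a `self.read_csv` helper that is not shown in this
# excerpt. A minimal sketch of what such a wrapper typically looks like
# (an assumption, written against the pandas-0.x-era `squeeze` keyword):
import pandas as pd

def read_csv(path, **kwargs):
    # mirror the defaults of the deprecated Series.from_csv: index in
    # column 0, no header row, dates parsed; squeeze the frame to a Series
    params = dict(index_col=0, header=None, parse_dates=True, squeeze=True)
    params.update(kwargs)
    return pd.read_csv(path, **params)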
def cleanFilmIndex():
    series = Series.from_csv('boxoffice/' + film_index_name)
    l = []
    for name, fid in series.items():
        df = loadDailies(fid)
        if df.empty:
            l.append(name)
    for name in l:
        del series[name]
    series.to_csv('boxoffice/' + film_index_name)
def test_to_csv_unicode_index(self):
    buf = StringIO()
    s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

    s.to_csv(buf, encoding='UTF-8')
    buf.seek(0)

    s2 = Series.from_csv(buf, index_col=0, encoding='UTF-8')
    assert_series_equal(s, s2)
def similarDay(price, day, count=0, above=0):
    '''Get a set of films with the most similar gross revenues
    on the given day since release.'''
    series = Series()
    films = Series.from_csv('boxoffice/' + film_index_name)
    for film in films:
        s = asSeries(loadDailies(film))
        if s is not None and day in s and s[day] > above:
            series[film] = s[day]
    series = (abs(series - price)).sort_values(ascending=True)
    series /= 1000000
    if count > 0:
        return series[:count]
    return series
def parse_old_logs(log_file, order_file=None, name=None):
    """Parse logs that have a single column of timepoints for each event."""
    df = DataFrame.from_csv(log_file, index_col=None, header=None)
    df = df.rename(columns={0: 'break'})
    if name:
        df['name'] = name
    elif 'name' not in df:
        raise Exception('log needs a name column')
    log = parse_splits(df)
    if order_file:
        order = Series.from_csv(order_file, header=None, index_col=None)
        log['order'] = order
    else:
        log['order'] = range(len(log))
    return log
def test_from_csv(self):
    with ensure_clean() as path:
        self.ts.to_csv(path)
        ts = self.read_csv(path)
        assert_series_equal(self.ts, ts, check_names=False)

        assert ts.name is None
        assert ts.index.name is None

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            depr_ts = Series.from_csv(path)
            assert_series_equal(depr_ts, ts)

        # see gh-10483
        self.ts.to_csv(path, header=True)
        ts_h = self.read_csv(path, header=0)
        assert ts_h.name == "ts"

        self.series.to_csv(path)
        series = self.read_csv(path)
        assert_series_equal(self.series, series, check_names=False)
        assert series.name is None
        assert series.index.name is None

        self.series.to_csv(path, header=True)
        series_h = self.read_csv(path, header=0)
        assert series_h.name == "series"

        outfile = open(path, "w")
        outfile.write("1998-01-01|1.0\n1999-01-01|2.0")
        outfile.close()

        series = self.read_csv(path, sep="|")
        check_series = Series({datetime(1998, 1, 1): 1.0,
                               datetime(1999, 1, 1): 2.0})
        assert_series_equal(check_series, series)

        series = self.read_csv(path, sep="|", parse_dates=False)
        check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0})
        assert_series_equal(check_series, series)
def test_from_csv(self):
    with ensure_clean() as path:
        self.ts.to_csv(path)
        ts = Series.from_csv(path)
        assert_series_equal(self.ts, ts, check_names=False)
        self.assertTrue(ts.name is None)
        self.assertTrue(ts.index.name is None)

        # GH10483
        self.ts.to_csv(path, header=True)
        ts_h = Series.from_csv(path, header=0)
        self.assertTrue(ts_h.name == 'ts')

        self.series.to_csv(path)
        series = Series.from_csv(path)
        self.assertIsNone(series.name)
        self.assertIsNone(series.index.name)
        assert_series_equal(self.series, series, check_names=False)

        self.series.to_csv(path, header=True)
        series_h = Series.from_csv(path, header=0)
        self.assertTrue(series_h.name == 'series')

        outfile = open(path, 'w')
        outfile.write('1998-01-01|1.0\n1999-01-01|2.0')
        outfile.close()

        series = Series.from_csv(path, sep='|')
        checkseries = Series({datetime(1998, 1, 1): 1.0,
                              datetime(1999, 1, 1): 2.0})
        assert_series_equal(checkseries, series)

        series = Series.from_csv(path, sep='|', parse_dates=False)
        checkseries = Series({'1998-01-01': 1.0, '1999-01-01': 2.0})
        assert_series_equal(checkseries, series)
import numpy as np
from pandas import Series
from pandas import DataFrame
from pandas import concat
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf

def mean_absolute_percentage_error(y_true, y_pred):
    # y_true, y_pred = check_arrays(y_true, y_pred)
    # Note: does not handle the mixed 1d representation
    # if _is_1d(y_true):
    #     y_true, y_pred = _check_1d_array(y_true, y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# series = Series.from_csv('daily-minimum-temperatures-in-me.csv', header=0)
series = Series.from_csv('wc98_workload_hour.csv', header=0)

# split dataset
X = series.values
print('X: ', len(X))
train_size = int(len(X) * 0.8)
train, test = X[1:train_size], X[train_size:]

# train autoregression
print(len(train))
model = AR(train)
model_fit = model.fit()
# model_fit = model.fit(10, ic='bic')
print('Lag: %s' % model_fit.k_ar)
# lag = round(12 * (len(train) / 100.) ** (1 / 4.))
# print('Lag value from this formula:', lag)
print('Coefficients: %s' % model_fit.params)
print(len(model_fit.params), len(test))
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)
    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a', 'b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0, 2, 3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12'])  # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)
    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)
    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h, v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = ''  # 'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])
        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content = """
<INDICATOR>
  <INDICATOR_SEQ>373889</INDICATOR_SEQ>
  <PARENT_SEQ></PARENT_SEQ>
  <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
  <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
  <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
  <PERIOD_YEAR>2011</PERIOD_YEAR>
  <PERIOD_MONTH>12</PERIOD_MONTH>
  <CATEGORY>Service Indicators</CATEGORY>
  <FREQUENCY>M</FREQUENCY>
  <DESIRED_CHANGE>U</DESIRED_CHANGE>
  <INDICATOR_UNIT>%</INDICATOR_UNIT>
  <DECIMAL_PLACES>1</DECIMAL_PLACES>
  <YTD_TARGET>97.00</YTD_TARGET>
  <YTD_ACTUAL></YTD_ACTUAL>
  <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
  <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root:  # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
# plot the forecasts in the context of the original dataset
def plot_forecasts(series, forecasts, n_test):
    # plot the entire dataset in blue
    pyplot.plot(series.values)
    # plot the forecasts in red
    for i in range(len(forecasts)):
        off_s = len(series) - n_test + i - 1
        off_e = off_s + len(forecasts[i]) + 1
        xaxis = [x for x in range(off_s, off_e)]
        yaxis = [series.values[off_s]] + forecasts[i]
        pyplot.plot(xaxis, yaxis, color='red')
    # show the plot
    pyplot.show()

series = Series.from_csv('ec2_cpu_utilization_2.csv', header=0)
n_lag = 1
n_test = 10
n_seq = 3
n_batch = 1
scaler, train, test = prepare_data(series, n_test, n_lag, n_seq)
model = load_model('cpu_model-multi.h5')
forecasts = make_forecasts(model, n_batch, train, test, n_lag, n_seq)
# inverse transform forecasts and test
forecasts = inverse_transform(series, forecasts, scaler, n_test + 2)
actual = [row[n_lag:] for row in test]
actual = inverse_transform(series, actual, scaler, n_test + 2)
# evaluate forecasts
evaluate_forecasts(actual, forecasts, n_lag, n_seq)
# plot forecasts
# monkey patch so the fitted ARIMA results can be pickled
def __getnewargs__(self):
    return ((self.endog), (self.k_lags, self.k_diff, self.k_ma))

ARIMA.__getnewargs__ = __getnewargs__

# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return diff

# load data
series = Series.from_csv('dataset.csv')
# prepare data
X = series.values
X = X.astype('float32')
# difference data
months_in_year = 12
diff = difference(X, months_in_year)
# fit model
model = ARIMA(diff, order=(0, 0, 1))
model_fit = model.fit(trend='nc', disp=0)
# bias constant, could be calculated from in-sample mean residual
bias = 165.904728
# save model
model_fit.save('model.pkl')
numpy.save('model_bias.npy', [bias])
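# The saved artifacts can be loaded back for forecasting. A minimal sketch,
# mirroring the loading snippet later in this section and assuming the same
# file names ('model.pkl', 'model_bias.npy'):
import numpy
from statsmodels.tsa.arima_model import ARIMAResults

model_fit = ARIMAResults.load('model.pkl')
bias = float(numpy.load('model_bias.npy')[0])
# one-step forecast on the seasonally differenced scale, bias-corrected;
# inverse_difference() would then map it back to the original scale
yhat = bias + float(model_fit.forecast()[0])
print('Forecast (differenced scale): %.3f' % yhat)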
print(df)

# omit the row and column labels
df = data.to_csv(sys.stdout, index=False, header=False)
print(df)

# write only selected columns
df = data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])
print(df)

# round-trip a time series through CSV
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')
df = Series.from_csv('tseries.csv', parse_dates=True)
print(df)

print('-----------------------------------')
print('-----------------------------------')

# handle delimited formats manually
import csv
f = open('d:data/ex7.csv')
reader = csv.reader(f)
for line in reader:
    print(line)

# dict handling, data alignment
# -*- coding: utf-8 -*-
# statistical test for the stationarity of the time series
"""
Created on Sun Oct  8 00:44:43 2017

@author: user
"""
# evaluate a manually configured ARIMA model
from pandas import Series
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from math import sqrt

# load data
series = Series.from_csv('DailyData.csv')
# prepare data
X = series.values
X = X.astype('float32')
train_size = int(len(X) * 0.66)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation
history = [x for x in train]
predictions = list()
for i in range(len(test)):
    # predict
    model = ARIMA(history, order=(1, 1, 5))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
from pandas import Series
from matplotlib import pyplot
from pandas.tools.plotting import lag_plot
from pandas import DataFrame
from pandas import concat
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
import timeit
import pandas
import numpy

series = Series.from_csv('/home/alex/Desktop/doulke_mikri/ML/important_doc.txt', header=0)
# split dataset
X = series.values
train, test = X[1:len(X) - 7], X[len(X) - 7:]
# train autoregression
t1 = timeit.default_timer()
model = AR(train)
model_fit = model.fit()
window = model_fit.k_ar
coef = model_fit.params
# walk forward over time steps in test
history = train[len(train) - window:]
history = [history[i] for i in range(len(history))]
predictions = list()
for t in range(len(test)):
import pandas as pd
from pandas import DataFrame
from pandas import Series
import re
import matplotlib
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

s1 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-25.csv')
s2 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-26.csv')
s3 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-27.csv')
s4 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-28.csv')
all_tweets = pd.concat([s1, s2, s3, s4])

twitter_handle_re = re.compile(r'@([A-Za-z0-9_]+)')
mention_counts = Series()
for item in all_tweets:
    mentions = twitter_handle_re.findall(item)
    for mention in mentions:
        if mention in mention_counts.keys():
            mention_counts[mention] += 1
        else:
            mention_counts[mention] = 1
mention_counts.sort(ascending=False)
# print mention_counts
from pandas import Series
from matplotlib import pyplot
from pandas import DataFrame
from pandas import TimeGrouper
from pandas import concat
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot

series = Series.from_csv(
    "/Users/richardcollins/Desktop/Time_Series/daily-min-temperatures.csv",
    header=0)
print(series.head())
print(len(series))

# Group data by years and by months (in 1990)
groups = series.groupby(TimeGrouper('A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
series_1990 = series['1990']
groups_1990 = series_1990.groupby(TimeGrouper('M'))
months = concat([DataFrame(x[1].values) for x in groups_1990], axis=1)
months = DataFrame(months)
months.columns = range(1, 13)

# Line plot
series.plot(linewidth=0.2)
pyplot.show()

# Line plot per year
years.plot(subplots=True, legend=False)
'''
* Time series forecasting applications
* As can be seen, the square-root transform gives a more linear, more even distribution.
* Coronavirus daily confirmed cases: square-root and log transforms
* Date: 03 April 2020
* Prepared by: Bilishim Siber Guvenlik ve Yapay Zeka
* These studies were prepared solely for R&D and learning purposes; they carry
  no official representation or binding force.
'''
from pandas import Series
from pandas import DataFrame
from numpy import sqrt
from matplotlib import pyplot
from numpy import log

series = Series.from_csv('corona-virus-istatistikleri-resampled.csv', header=0)
dataframe = DataFrame(series.values)
dataframe.columns = ['Gunluk Onayli Vaka']  # "Daily Confirmed Cases"
pyplot.figure("Gunluk Onayli Vaka")
# line plot
pyplot.subplot(211)
pyplot.plot(dataframe['Gunluk Onayli Vaka'])
# histogram
pyplot.subplot(212)
pyplot.hist(dataframe['Gunluk Onayli Vaka'])
pyplot.show()

# square-root transform
series = Series.from_csv('corona-virus-istatistikleri-resampled.csv', header=0)
dataframe = DataFrame(series.values)
dataframe.columns = ['Gunluk Onayli Vaka']
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
# Flask, request, redirect and url_for are used below but were missing
# from the original import list
from flask import Flask, render_template, request, redirect, url_for
import numpy as np

app = Flask(__name__)

start_day = 1
end_day = 100
your_prediction = 1
train = 1
test = 1

series = Series.from_csv('daily_curren_new.csv', header=0)
# split dataset
X = series.values

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        start_day = int(request.form['start'])
        end_day = int(request.form['end'])
        tr, te = X[start_day:end_day], X[end_day:end_day + 1]
        return redirect(url_for('predict', train=tr, test=te))
    return '''
        <form method="post">
            <p>Please enter start day for prediction:
            <p><input type=number name=start>
from pandas import Series
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot

series = Series.from_csv('dataset_training.csv')
groups = series['1964':'1970'].groupby(TimeGrouper('A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
years.boxplot()
pyplot.show()
#!/usr/bin/python
# -*- coding: utf-8 -*-

# import numpy
# from pandas import Series
# from pandas import DataFrame
# from pandas import TimeGrouper
# from matplotlib import pyplot
# from pandas.tools.plotting import lag_plot

# series = Series.from_csv('../data/oni/csv/nino3_4.csv', header=0)
# rolling = series.rolling(window=3)
# rolling_mean = rolling.mean()
# print(rolling_mean.head(10))
# # plot original and transformed dataset
# series.plot()
# rolling_mean.plot(color='red')
# pyplot.show()
# # zoomed plot original and transformed dataset
# series[:100].plot()
# rolling_mean[:100].plot(color='red')
# pyplot.show()

from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose

series = Series.from_csv('../data/oni/csv/nino3_4.csv', header=0)
result = seasonal_decompose(series, model="multiplicative")
result.plot()
pyplot.show()
# NOTE: the condition below is reconstructed; the excerpt began inside an
# if/else whose test is not shown (the caching logic implies a
# file-existence check on "coverages.csv")
if not os.path.exists("coverages.csv"):
    handle = open(file, "rU")
    for record in tqdm(SeqIO.parse(handle, "fasta")):
        seq = str(record.seq)
        l = len(seq)
        if 'coverage' not in locals():
            coverage = [0] * l
        for (i, c) in enumerate(seq):
            if c not in ['.', '-']:
                coverage[i] = coverage[i] + 1
    coverage = Series(coverage)
    coverage.to_csv("coverages.csv", index=False)
    handle.close()
else:
    print "import coverages"
    coverage = Series.from_csv("coverages.csv", header=-1, index_col=False)

print "compute median-ish things"
medians = []
means = []
maxs = []
mins = []
lens = []
left = []
right = []
unsure = []
handle = open(file, "rU")
positions = list(coverage[coverage > 500000].index)
l = len(positions)
for record in tqdm(SeqIO.parse(handle, "fasta")):
    seq = str(record.seq)
# model seasonality with a polynomial model
from pandas import Series
from matplotlib import pyplot
from numpy import polyfit

series = Series.from_csv('daily-minimum-temperatures.csv', header=0)
# fit polynomial: x^2*b1 + x*b2 + ... + bn
X = [i % 365 for i in range(0, len(series))]
y = series.values
degree = 4
coef = polyfit(X, y, degree)
print('Coefficients: %s' % coef)
# create curve
curve = list()
for i in range(len(X)):
    value = coef[-1]
    for d in range(degree):
        value += X[i]**(degree - d) * coef[d]
    curve.append(value)
# plot curve over original data
pyplot.plot(series.values)
pyplot.plot(curve, color='red', linewidth=3)
pyplot.show()
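# The manual evaluation loop above can be replaced by numpy.polyval, which
# evaluates coefficients from highest order down -- exactly the layout that
# polyfit returns. A minimal equivalent sketch, reusing X, y and degree:
from numpy import polyfit, polyval

coef = polyfit(X, y, degree)
# polyval computes coef[0]*x**degree + ... + coef[degree] for each x
curve = polyval(coef, X)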
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 04:14:53 2017

@author: user
"""
# create a heat map of monthly data
from pandas import Series
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot
from pandas import concat

series = Series.from_csv('TSData2.csv', header=0)
one_year = series['2017']
groups = one_year.groupby(TimeGrouper('M'))
months = concat([DataFrame(x[1].values) for x in groups], axis=1)
months = DataFrame(months)
months.columns = range(1, 6)
pyplot.matshow(months, interpolation=None, aspect='auto')
pyplot.show()
# split into a training and validation dataset
from pandas import Series

series = Series.from_csv('robberies.csv', header=0)
split_point = len(series) - 12
dataset, validation = series[0:split_point], series[split_point:]
print('Dataset %d, Validation %d' % (len(dataset), len(validation)))
dataset.to_csv('dataset.csv')
validation.to_csv('validation.csv')
# (tail of evaluate_arima_model(); the walk-forward loop above it is not shown)
    # calculate the error on the test sample
    error = mean_squared_error(test, predictions)
    return error

# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order, mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

# load dataset
series = Series.from_csv('daily-total-female-births.csv', header=0)
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)
import statsmodels.tsa.stattools
from pandas import Series
from pandas import read_csv
from pandas import datetime
from pandas import DataFrame
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from statsmodels.graphics.tsaplots import plot_pacf

timeSeriesName = Series.from_csv('file_path.csv', header=0)
# the (p, d, q) parameters are defined by the user; they might be chosen with
# an "ARIMA(p,d,q)_parameters" helper that picks the combination with the
# lowest MSE
arimaOrder = (p, d, q)
# Use a length "n" corresponding to the length of your base: the share of the
# past you want the model to see, strictly between 0 and 1.
# The share of points forecast into future periods will be 1 - n.
n = 0.66
# p = AR, which means 'autoregressive'
# d = I, which means 'integrated'
# q = MA, which means 'moving average'

timeSeriesName.describe()
timeSeriesName.plot()
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s RMSE=%.3f' % (order, mse))
                except:
                    continue
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))

# load dataset
series = Series.from_csv('final_dataset.csv')
# evaluate parameters
p_values = range(0, 7)
d_values = range(0, 3)
q_values = range(0, 7)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)
# After executing the program, we see that (1,0,0) fits best, with RMSE = 27.400
import numpy
from pandas import Series
from statsmodels.tsa.arima_model import ARIMAResults

# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return diff

# invert differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]

# load and prepare datasets
dataset = Series.from_csv('dataset.csv')
X = dataset.values.astype('float32')
history = [x for x in X]
months_in_year = 12
validation = Series.from_csv('validation.csv')
y = validation.values.astype('float32')
# load model
model_fit = ARIMAResults.load('model.pkl')
bias = numpy.load('model_bias.npy')
# make first prediction
predictions = list()
yhat = float(model_fit.forecast()[0])
yhat = bias + inverse_difference(history, yhat, months_in_year)
predictions.append(yhat)
history.append(y[0])
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
# calculate statistics of partitioned log-transformed time series data
from pandas import Series
from matplotlib import pyplot
from numpy import log

series = Series.from_csv('airline-passengers.csv', header=0)
X = series.values
X = log(X)
split = int(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
f = open('E:/建模/第5周/data/ex3.txt')
df = pd.read_table(f, sep='\s+')
print(df)

# select only the rows you need
f = open('E:/建模/第5周/data/ex4.csv')
df = pd.read_table(f, sep=',', skiprows=[0, 2, 3])  # skip the rows you don't want
print(df)

# handle missing values
f = open('E:/建模/第5周/data/ex5.csv')
df = pd.read_table(f, sep=',', na_values='world')  # any 'world' in the data is also treated as missing
print(df)

# read the file piece by piece
f = open('E:/建模/第5周/data/ex6.csv')
df = pd.read_table(f, sep=',', nrows=5)  # read only the first 5 rows
print(df)

# write DataFrame data to a CSV file
f = open('E:/建模/第5周/data/ex5.csv')
data = pd.read_csv(f)
data.to_csv('E:/建模/第5周/data/out.csv')  # write the DataFrame out to a CSV file
data.to_csv('E:/建模/第5周/data/out.csv', na_rep='ok')  # fill missing values with 'ok'
data.to_csv('E:/建模/第5周/data/out.csv', header=None)  # write without a header row
data.to_csv('E:/建模/第5周/data/out.csv', columns=['a', 'b'])  # write only the specified columns

# read a CSV file as a Series
f = open('E:/建模/第5周/data/tseries.csv')
series = Series.from_csv(f, parse_dates=True)
print(series)
# In[41]:
dates = pd.date_range('21/8/2017', periods=10)

# In[45]:
ts = Series(np.arange(10), index=dates)

# In[90]:
# use raw strings: a plain '...\tseries.csv' would turn the \t into a tab
ts.to_csv(r'D:\PythonDataAnalysis\ch6\tseries.csv')

# In[91]:
Series.from_csv(r'D:\PythonDataAnalysis\ch6\tseries.csv', parse_dates=True)

# ## 3. Handling delimited formats manually

# In[65]:
import csv

# In[66]:
f = open(r'D:\PythonDataAnalysis\ch6\ex7.csv')

# In[88]:
reader = csv.reader(f)
# split a univariate sequence into samples
# (function header reconstructed from the call below)
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence) - 1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

# define input sequence
# raw_seq = [10, 20, 30, 40, 50, 60, 70, 80, 90]
raw_seq = Series.from_csv('WL_Edmonton_1day.csv', header=0)
# choose a number of time steps
n_steps = 3
# split into samples
X, y = split_sequence(raw_seq, n_steps)
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
# define model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu',
                 input_shape=(n_steps, n_features)))
import os
import numpy as np
from pandas import Series
from matplotlib import pyplot
from pandas import DataFrame
from pandas import concat

folder_path = '/home/nguyen/spark-lab/spark-2.1.1-bin-hadoop2.7/abc'
# for file_name in os.listdir(folder_path):
#     print file_name + "\n"
series = Series.from_csv(
    '/home/nguyen/spark-lab/spark-2.1.1-bin-hadoop2.7/abc/part-00000-09243fc6-2d5e-4b7e-a4d7-146ad3399131.csv',
    header=0)
X = series.values
pyplot.plot(X)
pyplot.show()
# create a difference transform of the dataset
def difference(dataset):
    diff = list()
    for i in range(1, len(dataset)):
        value = dataset[i] - dataset[i - 1]
        diff.append(value)
    return numpy.array(diff)

# make a prediction given regression coefficients and lag obs
def predict(coef, history):
    yhat = coef[0]
    for i in range(1, len(coef)):
        yhat += coef[i] * history[-i]
    return yhat

series = Series.from_csv('../data/nifty.csv', header=0)
# split dataset
X = difference(series.values)
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:]
# train autoregression
model = AR(train)
model_fit = model.fit(maxlag=6, disp=False)
window = model_fit.k_ar
coef = model_fit.params
# walk forward over time steps in test
history = [train[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
    yhat = predict(coef, history)
    obs = test[t]
"""
America/Argentina/Cordoba    26
America/Argentina/Mendoza    55
dtype: int64
"""
count_subset = agg_counts.take(indexer)[-10:]
print count_subset
print type(count_subset)
"""
                     Not Windows  Windows
tz
America/Sao_Paulo           13.0     20.0
Europe/Madrid               16.0     19.0
Pacific/Honolulu             0.0     36.0
Asia/Tokyo                   2.0     35.0
Europe/London               43.0     31.0
America/Denver             132.0     59.0
America/Los_Angeles        130.0    252.0
America/Chicago            115.0    285.0
                           245.0    276.0
America/New_York           339.0    912.0
<class 'pandas.core.frame.DataFrame'>
"""
# write back
count_subset.to_csv('./333.csv')
print '----'
print Series.from_csv('./333.csv', parse_dates=True)
# print pd.read_csv(csv_path2, header=None)
# separate out a validation dataset
from pandas import Series

series = Series.from_csv('champagne.csv', header=0)
split_point = len(series) - 12
dataset, validation = series[0:split_point], series[split_point:]
print('Dataset %d, Validation %d' % (len(dataset), len(validation)))
dataset.to_csv('dataset.csv')
validation.to_csv('validation.csv')
filename = options.filename
sim_no = options.sim_no
if options.product is None:
    parser.error("Need to pass in a product")
product = create_product(options)
logger.info('filename: %s, no of sim: %s price %s', filename, sim_no, product)
forwards = pandas.read_csv(filename)
forwards = forwards.set_index('Dates')
forwards = forwards.dropna()
if options.cache_pca and \
        os.path.exists('eigenmat.csv') and \
        os.path.exists('eigenvecs.csv'):
    logger.info("Using previously calculated PCAs")
    eigenmat = DataFrame.from_csv('eigenmat.csv')
    eigenvecs = Series.from_csv('eigenvecs.csv')
else:
    logger.info('Calculating PCA')
    eigenvecs, eigenmat = calculate_pca(forwards)
    eigenmat.to_csv("eigenmat.csv")
    # write to the same name the cache check looks for
    # (was "eigenvec.csv", which would never hit the cache)
    eigenvecs.to_csv("eigenvecs.csv")
tenors = eigenvecs.index.astype("float64").tolist()
tenors.sort()
sqrt_lambdas = eigenvecs[:3].apply(sqrt)
eigenmat = eigenmat.transpose()
vols = eigenmat.apply(lambda x: numpy.asarray(x) * numpy.asarray(sqrt_lambdas))
factors = create_factors(vols, 3)
vol1fit, poly1 = curve_fit(vols.iloc[0], 1)
vol2fit, poly2 = curve_fit(vols.iloc[1])
vol3fit, poly3 = curve_fit(vols.iloc[2])
from sklearn.metrics import mean_squared_error
from math import sqrt

# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return diff

# invert a differenced value
def inverse_difference(history, yhat, interval=1):
    return yhat + history[-interval]

series = Series.from_csv('LSE-ABDP-dataset.csv')
X = series.values
X = X.astype('float32')
train_size = int(len(X) * 0.50)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation
history = [x for x in train]
predictions = list()
# bias = 165.904728
months_in_year = 12
diff = difference(history, months_in_year)
# predict
model = ARIMA(diff, order=(2, 0, 1))
for i in range(len(test)):
    model_fit = model.fit(trend='nc', disp=0)
print(data.to_csv(sys.stdout, index=False, header=False))
print('\n')
print(data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c']))
print('\n')

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('data/tseries.csv')
print('\n')
print(Series.from_csv('data/tseries.csv', parse_dates=True))
print('\n')
# evaluate combinations of p, d and q values for an ARIMA model
# (function header reconstructed from the commented call below)
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order, mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

# load dataset
def parser(x):
    return datetime.strptime('190' + x, '%Y-%m')

# series = read_csv('input/shampoo-sales.csv', header=0, parse_dates=[0],
#                   index_col=0, squeeze=True, date_parser=parser)
series = Series.from_csv('../input/shampoo-sales.csv', header=0)
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
# evaluate_models(series.values, p_values, d_values, q_values)
# Best: (6, 1, 0)
dataset = series.values.astype('float32')
evaluate_arima_model(dataset, (6, 1, 0))
# specify the number of rows to read
df10 = pd.read_csv('resources/ex5.csv', nrows=10)
# print df10

# write to csv: missing values written as 'Nan', row labels left out,
# column labels written as the header
df7.to_csv('resources/write1.csv', sep=',', na_rep='Nan', index=False)
# write specified columns in a specified order
df7.to_csv('resources/write1.csv', sep=',', na_rep='Nan', index=False, columns=['b', 'c', 'a'])

# reading and writing a Series
dates1 = pd.date_range('1/1/2000', '1/1/2016')
s = Series(dates1, index=np.arange(dates1.size))
# print s
s.to_csv('resources/write2.csv', sep=',')
s1 = Series.from_csv('resources/write2.csv')
# print s1

# csv
f = open('resources/write1.csv')
reader = csv.reader(f)
lines = list(reader)
header, values = lines[0], lines[1:]
data_dic = {k: v for k, v in zip(header, zip(*values))}
# print data_dic
    t_params = ['n', 'c', 't', 'ct']
    m_params = seasonal
    # create config instances
    for trend_elements in product(p_params, d_params, q_params):
        for t in t_params:
            for seasonal_elements in product(P_params, D_params, Q_params, m_params):
                cfg2 = [trend_elements, seasonal_elements, t]
                models.append(cfg2)
    return models

if __name__ == '__main__':
    # define dataset
    series = Series.from_csv(
        "/Users/richardcollins/Desktop/Time_Series/monthly-car-sales.csv",
        header=0, index_col=0)
    data = series.values
    print(data.shape)
    # data split
    n_test = 12
    # model configs
    cfg_list = sarima_configs(seasonal=[12])
    # grid search
    scores = grid_search(data, cfg_list, n_test, parallel=False)
    print("done")
    # list top 3 configs
    for cfg, error in scores[:3]:
        print(cfg, error)
    train, test = train_test_split(data, n_test)
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 01:59:11 2017

@author: user
"""
# calculate descriptive statistics
from pandas import Series

series = Series.from_csv('mdata2.csv')
print(series.describe())
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
import numpy

# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return numpy.array(diff)

# load dataset
series = Series.from_csv('dataset.csv', header=None)
# seasonal difference
X = series.values
days_in_year = 365
differenced = difference(X, days_in_year)
# fit model
model = ARIMA(differenced, order=(7, 0, 1))
model_fit = model.fit(disp=0)
# print summary of fit model
print(model_fit.summary())
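# A natural next step, not shown in this snippet, is a multi-step forecast on
# the differenced scale, inverted by adding back the observation from one
# seasonal cycle earlier. A minimal sketch, assuming the variables above:
forecast = model_fit.forecast(steps=7)[0]
history = [x for x in X]
for yhat in forecast:
    # invert the seasonal difference: add the value from one year earlier
    inverted = yhat + history[-days_in_year]
    print('Forecast: %.3f' % inverted)
    history.append(inverted)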
from pandas import Series
from pandas import DataFrame
from pandas import concat

series = Series.from_csv('daily-temp.csv', header=0)
temps = DataFrame(series.values)
print(temps.head())

# Sliding window of size 1
print('Sliding window of size 1')
dataframe = concat([temps.shift(1), temps], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

# Sliding window of size 3
print('Sliding window of size 3')
dataframe = concat([temps.shift(3), temps.shift(2), temps.shift(1), temps], axis=1)
dataframe.columns = ['t-3', 't-2', 't-1', 't']
print(dataframe.head(5))

# Creating a rolling window with means
# print('Creating a rolling window with mean')
shifted = temps.shift(1)
window = shifted.rolling(window=2)
print('++++++++++++++++')
print(window)
means = window.mean()
dataframe = concat([means, temps], axis=1)
import pandas as pd
from scipy import spatial
import gc
import math
from pandas import Series, DataFrame
from scipy.spatial.distance import cosine
import numpy

def dist(v1, v2):
    return numpy.linalg.norm(v1 - v2)

idf = Series.from_csv("idf.csv")
tfidf = pd.read_csv("tfidf.csv", index_col=0)
# print tfidf.head(5)
# print idf.size

tfquery = Series()
linea = "Armed Robbery Suspect Arrested w/ Handgun"  # query to evaluate
linea = linea.upper()
tokens = linea.split()
for word in tokens:
    if word in tfidf.columns:
        print word
        if word in tfquery:
            tfquery[word] = tfquery[word] + 1
        else:
            tfidf[word] = 0
            test = Series({word: 1})
            tfquery = tfquery.add(test, fill_value=0)
tfquery = tfquery / len(tokens)
tfquery = tfquery.multiply(idf, fill_value=0)
print "TFIDF finished"