Example #1
def get_id(sec):
    '''Scrape the internal HTML ID for the film'''
    global sec_to_id
    if sec_to_id.empty:
        try:
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
        except:
            print('Security -> ID table not found, making a new one')
            with open('hsx_security_to_id', 'w') as f:
                f.write('security,id')
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
    if sec not in sec_to_id:
        r = requests.get('http://www.hsx.com/security/view/{}'.format(sec))
        #extract from webpage
        soup = BeautifulSoup(r.text)
        try:
            script = soup.findAll('script')[4].text.split('\n')
            sec_id = script[3].split('=')[2]
            sec_id = sec_id.split('"')[0]
        except:
            print("Cannot find id for {}".format(sec))
            return -1
        sec_to_id[sec] = int(sec_id)
        Series.to_csv(sec_to_id,'hsx_security_to_id',header='security,id')
    return sec_to_id[sec]
Example #2
def downloadDailies(movie_id, movie_name='', save=True):
    index = ['Day', 'Date', 'Rank', 'Gross', '% Change Prev Day', '% Change Prev Week', 'Theaters', 'Avg/Theater', 'Gross-to-Date','Day #']
    r = requests.get('{0}/movies/?page=daily&view=chart&id={1}.htm'.format(base_url, movie_id))
    soup = BeautifulSoup(r.text,'lxml')

    if not movie_name:
        try:
            movie_name = soup.find('font', attrs={'face':'Verdana', 'size':'6'}).text
        except:
            movie_name = soup.find('font', attrs={'face':'Verdana', 'size':'5'}).text

    table = soup.find('table', attrs={'class':'chart-wide'})
    if table is None:
        print('{} does not have daily box office numbers'.format(movie_id))
        return DataFrame()
    results = {}
    for tr in table.findAll('tr')[1:]:
        result = [td.text for td in tr.findAll('td')]
        if len(result) > 1:
            dt = datetime.strptime(result[1].replace('\t','').replace('.',''), '%b %d, %Y')
            results[dt] = result
    df = DataFrame(results, index=index).T
    #clean data
    df['Gross'] = df['Gross'].map(lambda x: int(x[1:].replace(',','')))
    df['Gross-to-Date'] = df['Gross-to-Date'].map(lambda x: int(x[1:].replace(',','')))
    df['Avg/Theater'] = df['Avg/Theater'].map(lambda x: int(x[1:].replace(',','')))
    df['% Change Prev Day'] = df['% Change Prev Day'].map(lambda x: float('nan') if x == '-' else float(x[:-1].replace(',','')))
    df['% Change Prev Week'] = df['% Change Prev Week'].map(lambda x: float('nan') if x == '-' else float(x[:-1].replace(',','')))
    if save:
        df.to_csv('boxoffice/{}.csv'.format(movie_id))
        series = Series.from_csv('boxoffice/'+film_index_name)
        series[movie_name] = movie_id
        series.to_csv('boxoffice/'+film_index_name)
    return df
Example #3
    def test_to_csv_float_format(self):

        with ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format='%.2f')

            rs = Series.from_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            assert_series_equal(rs, xp)
Example #4
    def test_from_csv_deprecation(self):
        # see gh-17812
        with ensure_clean() as path:
            self.ts.to_csv(path)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                ts = self.read_csv(path)
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)
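The FutureWarning asserted here points at the replacement API: read_csv. A rough equivalent of the old Series.from_csv defaults (header=None, index_col=0, parse_dates=True), reusing the temporary path from the test above, is sketched below purely as an illustration; the keyword choices are an assumption to verify against your pandas version.

import pandas as pd

# approximate stand-in for the deprecated Series.from_csv(path)
ser = pd.read_csv(path, header=None, index_col=0,
                  parse_dates=True).squeeze('columns')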
Example #5
def cleanFilmIndex():
    series = Series.from_csv('boxoffice/'+film_index_name)
    l=[]
    for name,fid in series.items():
        df=loadDailies(fid)
        if df.empty:
            l.append(name)
    for name in l:
        del series[name]
    series.to_csv('boxoffice/'+film_index_name)
Example #6
    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

        s.to_csv(buf, encoding='UTF-8')
        buf.seek(0)

        s2 = Series.from_csv(buf, index_col=0, encoding='UTF-8')

        assert_series_equal(s, s2)
Example #7
def similarDay(price, day, count=0, above=0):
    '''Get a set of films with the most similar gross revenues on the given day since release.'''
    series = Series()
    films = Series.from_csv('boxoffice/'+film_index_name)
    for film in films:
        s = asSeries(loadDailies(film))
        if s is not None and day in s and s[day] > above:
            series[film] = s[day]
    series = (abs(series - price)).sort_values(ascending=True)
    series /= 1000000
    if count > 0:
        return series[:count]
    return series
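A hypothetical call, purely for illustration (the price argument is in raw dollars, matching the grosses that loadDailies returns; the numbers are made up):

# five films whose day-7 gross was closest to $25M, differences shown in millions
print(similarDay(25000000, 7, count=5))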
Example #8
def parse_old_logs(log_file, order_file=None, name=None):
    """Parse logs that have a single column of timepoints for each event"""
    df = DataFrame.from_csv(log_file, index_col=None, header=None)
    df = df.rename(columns={0:'break'})
    if name: df['name'] = name
    elif not 'name' in df: raise Exception('log needs a name column')

    log = parse_splits(df)
    if order_file:
        order = Series.from_csv(order_file, header=None, index_col=None)
        log['order'] = order
    else:
        log['order'] = range(len(log))

    return log
Example #9
    def test_from_csv(self):

        with ensure_clean() as path:
            self.ts.to_csv(path)
            ts = self.read_csv(path)
            assert_series_equal(self.ts, ts, check_names=False)

            assert ts.name is None
            assert ts.index.name is None

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                depr_ts = Series.from_csv(path)
                assert_series_equal(depr_ts, ts)

            # see gh-10483
            self.ts.to_csv(path, header=True)
            ts_h = self.read_csv(path, header=0)
            assert ts_h.name == "ts"

            self.series.to_csv(path)
            series = self.read_csv(path)
            assert_series_equal(self.series, series, check_names=False)

            assert series.name is None
            assert series.index.name is None

            self.series.to_csv(path, header=True)
            series_h = self.read_csv(path, header=0)
            assert series_h.name == "series"

            outfile = open(path, "w")
            outfile.write("1998-01-01|1.0\n1999-01-01|2.0")
            outfile.close()

            series = self.read_csv(path, sep="|")
            check_series = Series({datetime(1998, 1, 1): 1.0,
                                   datetime(1999, 1, 1): 2.0})
            assert_series_equal(check_series, series)

            series = self.read_csv(path, sep="|", parse_dates=False)
            check_series = Series({"1998-01-01": 1.0, "1999-01-01": 2.0})
            assert_series_equal(check_series, series)
Example #10
    def test_from_csv(self):

        with ensure_clean() as path:
            self.ts.to_csv(path)
            ts = Series.from_csv(path)
            assert_series_equal(self.ts, ts, check_names=False)
            self.assertTrue(ts.name is None)
            self.assertTrue(ts.index.name is None)

            # GH10483
            self.ts.to_csv(path, header=True)
            ts_h = Series.from_csv(path, header=0)
            self.assertTrue(ts_h.name == 'ts')

            self.series.to_csv(path)
            series = Series.from_csv(path)
            self.assertIsNone(series.name)
            self.assertIsNone(series.index.name)
            assert_series_equal(self.series, series, check_names=False)
            self.assertTrue(series.name is None)
            self.assertTrue(series.index.name is None)

            self.series.to_csv(path, header=True)
            series_h = Series.from_csv(path, header=0)
            self.assertTrue(series_h.name == 'series')

            outfile = open(path, 'w')
            outfile.write('1998-01-01|1.0\n1999-01-01|2.0')
            outfile.close()
            series = Series.from_csv(path, sep='|')
            checkseries = Series({datetime(1998, 1, 1): 1.0,
                                  datetime(1999, 1, 1): 2.0})
            assert_series_equal(checkseries, series)

            series = Series.from_csv(path, sep='|', parse_dates=False)
            checkseries = Series({'1998-01-01': 1.0, '1999-01-01': 2.0})
            assert_series_equal(checkseries, series)
Example #11
import numpy as np
from pandas import Series
from pandas import DataFrame
from pandas import concat
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
def mean_absolute_percentage_error(y_true, y_pred): 
    # y_true, y_pred = check_arrays(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true): 
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
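A quick sanity check of the metric with made-up numbers (it relies on the numpy import above):

# |100-110|/100 = 10%, |200-190|/200 = 5%, so the mean is 7.5%
print(mean_absolute_percentage_error(np.array([100.0, 200.0]),
                                     np.array([110.0, 190.0])))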
# series = Series.from_csv('daily-minimum-temperatures-in-me.csv', header=0)
series = Series.from_csv('wc98_workload_hour.csv', header=0) 
# split dataset
X = series.values
print('X:', len(X))
train_size = int(len(X) * 0.8)
train, test = X[1:train_size], X[train_size:]
# train autoregression
print(len(train))
model = AR(train)
model_fit = model.fit()
# model_fit = model.fit(10,ic='bic')
print('Lag: %s' % model_fit.k_ar)
# lag = round(12*(len(train)/100.)**(1/4.))
# print('Lag value from this rule of thumb:', lag)
print('Coefficients: %s' % model_fit.params)
print(len(model_fit.params), len(test))
Example #12
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)

    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a','b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0,2,3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)

    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)

    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h,v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
             {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = '' #'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])

        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content ="""
<INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <YTD_ACTUAL></YTD_ACTUAL>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
    <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
                   'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root: # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
Example #13
# plot the forecasts in the context of the original dataset
def plot_forecasts(series, forecasts, n_test):
    # plot the entire dataset in blue
    pyplot.plot(series.values)
    # plot the forecasts in red
    for i in range(len(forecasts)):
        off_s = len(series) - n_test + i - 1
        off_e = off_s + len(forecasts[i]) + 1
        xaxis = [x for x in range(off_s, off_e)]
        yaxis = [series.values[off_s]] + forecasts[i]
        pyplot.plot(xaxis, yaxis, color='red')
    # show the plot
    pyplot.show()


series = Series.from_csv('ec2_cpu_utilization_2.csv', header=0)
n_lag = 1
n_test = 10
n_seq = 3
n_batch = 1
scaler, train, test = prepare_data(series, n_test, n_lag, n_seq)

model = load_model('cpu_model-multi.h5')
forecasts = make_forecasts(model, n_batch, train, test, n_lag, n_seq)
# inverse transform forecasts and test
forecasts = inverse_transform(series, forecasts, scaler, n_test + 2)
actual = [row[n_lag:] for row in test]
actual = inverse_transform(series, actual, scaler, n_test + 2)
# evaluate forecasts
evaluate_forecasts(actual, forecasts, n_lag, n_seq)
# plot forecasts
plot_forecasts(series, forecasts, n_test)
Example #14
def __getnewargs__(self):
    return ((self.endog), (self.k_lags, self.k_diff, self.k_ma))


ARIMA.__getnewargs__ = __getnewargs__


# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return diff


# load data
series = Series.from_csv('dataset.csv')
# prepare data
X = series.values
X = X.astype('float32')
# difference data
months_in_year = 12
diff = difference(X, months_in_year)
# fit model
model = ARIMA(diff, order=(0, 0, 1))
model_fit = model.fit(trend='nc', disp=0)
# bias constant, could be calculated from in-sample mean residual
bias = 165.904728
# save model
model_fit.save('model.pkl')
numpy.save('model_bias.npy', [bias])
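The bias above is hard-coded, and the comment says it comes from the in-sample mean residual. A hedged sketch of how such a value might be derived, assuming a prior walk-forward run has produced parallel test and predictions lists (neither is part of this snippet):

# hypothetical derivation of the bias term from validation residuals
residuals = [test[i] - predictions[i] for i in range(len(test))]
bias = numpy.mean(residuals)  # the value that gets persisted to model_bias.npy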
Example #15
print(df)

# omit the row and column labels
data.to_csv(sys.stdout, index=False, header=False)

# write only selected columns
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

# write a time series to CSV
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')

df = Series.from_csv('tseries.csv', parse_dates=True)
print(df)

print('-----------------------------------')
print('-----------------------------------')
# handling delimiter formats manually
import csv

f = open('d:data/ex7.csv')

reader = csv.reader(f)

for line in reader:
    print(line)

# dict handling, data alignment
Example #16
# -*- coding: utf-8 -*-
# statistical test for the stationarity of the time series

"""
Created on Sun Oct  8 00:44:43 2017

@author: user

"""
#evaluate manually configured ARIMA model
from pandas import Series
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from math import sqrt
# load data
series = Series.from_csv('DailyData.csv')
# prepare data
X = series.values
X = X.astype('float32')
train_size = int(len(X) * 0.66)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation
history = [x for x in train]
predictions = list()
for i in range(len(test)):
    # predict
    model = ARIMA(history, order=(1,1,5))
    model_fit = model.fit(disp=0)
    yhat = model_fit.forecast()[0]
    predictions.append(yhat)
Example #17
from pandas import Series
from matplotlib import pyplot
from pandas.tools.plotting import lag_plot
from pandas import DataFrame
from pandas import concat
from matplotlib import pyplot
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
import timeit
import pandas
import numpy

series = Series.from_csv('/home/alex/Desktop/doulke_mikri/ML/important_doc.txt', header=0)

# split dataset
X = series.values
train, test = X[1:len(X)-7], X[len(X)-7:]
# train autoregression
t1=timeit.default_timer()
model = AR(train)
model_fit = model.fit()
window = model_fit.k_ar
coef = model_fit.params

# walk forward over time steps in test
history = train[len(train)-window:]
history = [history[i] for i in range(len(history))]
predictions = list()
for t in range(len(test)):
Example #18
import pandas as pd
from pandas import DataFrame
from pandas import Series
import re

import matplotlib
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

s1 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-25.csv')
s2 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-26.csv')
s3 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-27.csv')
s4 = Series.from_csv('/Users/cprinz/Developer/MIS375_TwitterProject/fakenews_2-28.csv')

all_tweets = pd.concat([s1,s2,s3,s4])

twitter_handle_re = re.compile(r'@([A-Za-z0-9_]+)')

mention_counts = Series()
for item in all_tweets:
    mentions = twitter_handle_re.findall(item)
    for mention in mentions:
        if mention in mention_counts.keys():
            mention_counts[mention] += 1
        else:
            mention_counts[mention] = 1

mention_counts.sort(ascending = False)
#print mention_counts
Example #19
from pandas import Series
from matplotlib import pyplot
from pandas import DataFrame
from pandas import TimeGrouper
from pandas import concat
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot

series = Series.from_csv(
    "/Users/richardcollins/Desktop/Time_Series/daily-min-temperatures.csv",
    header=0)
print(series.head())
print(len(series))

# Group data by years and by months (in 1990)
groups = series.groupby(TimeGrouper('A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
series_1990 = series['1990']
groups_1990 = series_1990.groupby(TimeGrouper('M'))
months = concat([DataFrame(x[1].values) for x in groups_1990], axis=1)
months = DataFrame(months)
months.columns = range(1, 13)

# Line plot
series.plot(linewidth=0.2)
pyplot.show()

# Line plot per year
years.plot(subplots=True, legend=False)
Example #20
'''
* Time Series Forecasting Applications
* As can be seen, the square-root transform gives a more linear and even distribution.
* Corona Virus Daily Confirmed Cases: Square-Root and Log Transform
* Date: 03 April 2020
* Prepared by: Bilishim Siber Güvenlik ve Yapay Zeka
* These studies were prepared solely for R&D and to further knowledge; they have no official standing and are not binding.
'''

from pandas import Series
from pandas import DataFrame
from numpy import sqrt
from matplotlib import pyplot
from numpy import log

series = Series.from_csv('corona-virus-istatistikleri-resampled.csv', header=0)
dataframe = DataFrame(series.values)
dataframe.columns = ['Gunluk Onayli Vaka']
pyplot.figure("Günlük Onaylı Vaka")
# line plot
pyplot.subplot(211)
pyplot.plot(dataframe['Gunluk Onayli Vaka'])
# histogram
pyplot.subplot(212)
pyplot.hist(dataframe['Gunluk Onayli Vaka'])
pyplot.show()

# Square-root transform
series = Series.from_csv('corona-virus-istatistikleri-resampled.csv', header=0)
dataframe = DataFrame(series.values)
dataframe.columns = ['Gunluk Onayli Vaka']
Example #21
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error
from flask import Flask, request, redirect, url_for, render_template
import numpy as np

app = Flask(__name__)

start_day = 1
end_day = 100
your_prediction = 1
train = 1
test = 1

series = Series.from_csv('daily_curren_new.csv', header=0)
# split dataset
X = series.values


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        start_day = int(request.form['start'])
        end_day = int(request.form['end'])
        tr, te = X[start_day:end_day], X[end_day:end_day + 1]
        return redirect(url_for('predict', train=tr, test=te))
    return '''
        <form method="post">
            <p>Please enter start day for prediction: 
            <p><input type=number name=start>
Example #22
from pandas import Series
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot

series = Series.from_csv('dataset_training.csv')
groups = series['1964':'1970'].groupby(TimeGrouper('A'))
years = DataFrame()
for name, group in groups:
    years[name.year] = group.values
years.boxplot()
pyplot.show()
Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-

# import numpy
# from pandas import Series
# from pandas import DataFrame
# from pandas import TimeGrouper
# from matplotlib import pyplot
# from pandas.tools.plotting import lag_plot

# series = Series.from_csv('../data/oni/csv/nino3_4.csv', header=0)

# rolling = series.rolling(window=3)
# rolling_mean = rolling.mean()
# print(rolling_mean.head(10))
# # plot original and transformed dataset
# series.plot()
# rolling_mean.plot(color='red')
# pyplot.show()
# # zoomed plot original and transformed dataset
# series[:100].plot()
# rolling_mean[:100].plot(color='red')
# pyplot.show()

from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
series = Series.from_csv('../data/oni/csv/nino3_4.csv', header=0)
result = seasonal_decompose(series, model="multiplicative")
result.plot()
pyplot.show()
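The DecomposeResult returned by seasonal_decompose also exposes the fitted components directly, which can be useful alongside the combined plot (a small illustrative addition, not part of the original snippet):

print(result.trend.head())
print(result.seasonal.head())
print(result.resid.head())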
Example #24
    handle = open(file, "rU")
    for record in tqdm(SeqIO.parse(handle, "fasta")) :
        seq = str(record.seq)
        l = len(seq)
        if 'coverage' not in locals():
            coverage = [0]*l

        for (i,c) in enumerate(seq):
            if c not in ['.','-']:
                coverage[i] = coverage[i] +1
    coverage=Series(coverage)
    coverage.to_csv("coverages.csv",index=False)
    handle.close()
else :
    print "import coverages"
    coverage = Series.from_csv("coverages.csv",header=-1, index_col=False)

print "compute median-ish things"
medians = []
means = []
maxs = []
mins = []
lens = []
left = []
right = []
unsure = []
handle = open(file, "rU")
positions=list(coverage[coverage > 500000].index)
l = len(positions)
for record in tqdm(SeqIO.parse(handle, "fasta")) :
    seq = str(record.seq)
Example #25
# model seasonality with a polynomial model
from pandas import Series
from matplotlib import pyplot
from numpy import polyfit

series = Series.from_csv('daily-minimum-temperatures.csv', header=0)
# fit polynomial: x^2*b1 + x*b2 + ... + bn
X = [i % 365 for i in range(0, len(series))]
y = series.values
degree = 4
coef = polyfit(X, y, degree)
print('Coefficients: %s' % coef)
# create curve
curve = list()
for i in range(len(X)):
    value = coef[-1]
    for d in range(degree):
        value += X[i]**(degree - d) * coef[d]
    curve.append(value)
# plot curve over original data
pyplot.plot(series.values)
pyplot.plot(curve, color='red', linewidth=3)
pyplot.show()
Example #26
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 04:14:53 2017

@author: user
"""

# create a heat map of monthly data
from pandas import Series
from pandas import DataFrame
from pandas import TimeGrouper
from matplotlib import pyplot
from pandas import concat
series = Series.from_csv('TSData2.csv', header=0)
one_year = series['2017']
groups = one_year.groupby(TimeGrouper('M'))
months = concat([DataFrame(x[1].values) for x in groups], axis=1)
months = DataFrame(months)
months.columns = range(1,6)
pyplot.matshow(months, interpolation=None, aspect='auto')
pyplot.show()
Example #27
# split into a training and validation dataset
from pandas import Series
series = Series.from_csv('robberies.csv', header=0)
split_point = len(series) - 12
dataset, validation = series[0:split_point], series[split_point:]
print('Dataset %d, Validation %d' % (len(dataset), len(validation)))
dataset.to_csv('dataset.csv')
validation.to_csv('validation.csv')
Example #28
    # calculate the error on the test sample
    error = mean_squared_error(test, predictions)
    return error


# evaluate combinations of p, d and q values for the ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order, mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))


# load dataset
series = Series.from_csv('daily-total-female-births.csv', header=0)
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)
Example #29
import statsmodels.tsa.stattools

from pandas import Series
from pandas import read_csv
from pandas import datetime
from pandas import DataFrame
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults
from statsmodels.graphics.tsaplots import plot_pacf

timeSeriesName = Series.from_csv('file_path.csv', header=0)
arimaOrder = None  # (p, d, q) parameters defined by the user; might be chosen with an "ARIMA(p,d,q)_parameters"-style function that picks the order with the lowest MSE

n = 0.66

# Use a fraction "n" of the length of your series for training: it is how much of
# the past you want to fit on, and it must be greater than 0 and less than 1.
# The remaining fraction of points, 1 - n, is forecast into future periods.
# (A split along these lines is sketched after this snippet.)

# p = AR, the autoregressive order
# d = I, the order of integration (differencing)
# q = MA, the moving-average order

timeSeriesName.describe()

timeSeriesName.plot()
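A sketch of the split described in the comments above, using the fraction n already defined; the names X, train_size, train and test are new here:

X = timeSeriesName.values.astype('float32')
train_size = int(len(X) * n)
train, test = X[0:train_size], X[train_size:]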

Example #30
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s RMSE=%.3f' % (order, mse))
                except:
                    continue
    print('Best ARIMA%s RMSE=%.3f' % (best_cfg, best_score))


# load dataset
series = Series.from_csv('final_dataset.csv')
# evaluate parameters
p_values = range(0, 7)
d_values = range(0, 3)
q_values = range(0, 7)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)

#After Executing the Program, we see that (1,0,0)  is best suited with RMSE= 27.400
Example #31
import numpy

# create a differenced series
def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
		value = dataset[i] - dataset[i - interval]
		diff.append(value)
	return diff

# invert differenced value
def inverse_difference(history, yhat, interval=1):
	return yhat + history[-interval]

# load and prepare datasets
dataset = Series.from_csv('dataset.csv')
X = dataset.values.astype('float32')
history = [x for x in X]
months_in_year = 12
validation = Series.from_csv('validation.csv')
y = validation.values.astype('float32')
# load model
model_fit = ARIMAResults.load('model.pkl')
bias = numpy.load('model_bias.npy')
# make first prediction
predictions = list()
yhat = float(model_fit.forecast()[0])
yhat = bias + inverse_difference(history, yhat, months_in_year)
predictions.append(yhat)
history.append(y[0])
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
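The snippet stops after the first step. A hedged sketch of how the walk-forward loop might continue, assuming the same ARIMA(0, 0, 1) configuration used when model.pkl was saved (see Example #14):

from statsmodels.tsa.arima_model import ARIMA

for i in range(1, len(y)):
	# difference, refit and forecast one step ahead
	diff = difference(history, months_in_year)
	model_fit = ARIMA(diff, order=(0, 0, 1)).fit(trend='nc', disp=0)
	yhat = bias + inverse_difference(history, float(model_fit.forecast()[0]), months_in_year)
	predictions.append(yhat)
	history.append(y[i])
	print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[i]))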
Example #32
# calculate statistics of partitioned log transformed time series data
from pandas import Series
from matplotlib import pyplot
from numpy import log
series = Series.from_csv('airline-passengers.csv', header=0)
X = series.values
X = log(X)
split = int(len(X) / 2)
X1, X2 = X[0:split], X[split:]
mean1, mean2 = X1.mean(), X2.mean()
var1, var2 = X1.var(), X2.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))
Example #33
f = open('E:/建模/第5周/data/ex3.txt')
df = pd.read_table(f, sep='\s+')
print(df)

# choose which rows to read, as needed
f = open('E:/建模/第5周/data/ex4.csv')
df = pd.read_table(f, sep=',', skiprows=[0, 2, 3])  # skip the rows we don't want to read
print(df)

# handle missing values
f = open('E:/建模/第5周/data/ex5.csv')
df = pd.read_table(f, sep=',', na_values='world')  # any 'world' in the data is also treated as missing
print(df)

# read the file row by row
f = open('E:/建模/第5周/data/ex6.csv')
df = pd.read_table(f, sep=',', nrows=5)  # read only the first 5 rows
print(df)

# write DataFrame data to a csv file
f = open('E:/建模/第5周/data/ex5.csv')
data = pd.read_csv(f)
data.to_csv('E:/建模/第5周/data/out.csv')  # write the DataFrame out to a csv file
data.to_csv('E:/建模/第5周/data/out.csv', na_rep='ok')  # fill missing values with 'ok'
data.to_csv('E:/建模/第5周/data/out.csv', header=None)  # do not write the header row
data.to_csv('E:/建模/第5周/data/out.csv', columns=['a', 'b'])  # write only the specified columns

# read a csv file as a Series
f = open('E:/建模/第5周/data/tseries.csv')
series = Series.from_csv(f, parse_dates=True)
print(series)
Example #34
# In[41]:

dates = pd.date_range('21/8/2017', periods=10)

# In[45]:

ts = Series(np.arange(10), index=dates)

# In[90]:

ts.to_csv(r'D:\PythonDataAnalysis\ch6\tseries.csv')

# In[91]:

Series.from_csv(r'D:\PythonDataAnalysis\ch6\tseries.csv', parse_dates=True)

# ## 3. Handling delimiter formats manually

# In[65]:

import csv

# In[66]:

f = open('D:\PythonDataAnalysis\ch6\ex7.csv')

# In[88]:

reader = csv.reader(f)
Example #35
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence) - 1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)


# define input sequence
#raw_seq = [10, 20, 30, 40, 50, 60, 70, 80, 90]
raw_seq = Series.from_csv('WL_Edmonton_1day.csv', header=0)

# choose a number of time steps
n_steps = 3
# split into samples
X, y = split_sequence(raw_seq, n_steps)
# reshape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))
# define model
model = Sequential()
model.add(
    Conv1D(filters=64,
           kernel_size=2,
           activation='relu',
           input_shape=(n_steps, n_features)))
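The model definition is cut off at this point. A minimal sketch of how such a 1D-CNN is typically finished and trained; the remaining layers, the epoch count and the plain keras import path are assumptions, not taken from the original example:

from keras.layers import MaxPooling1D, Flatten, Dense

model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=1000, verbose=0)
# predict from the last observed window
x_input = X[-1].reshape((1, n_steps, n_features))
print(model.predict(x_input, verbose=0))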
Example #36
import os
import numpy as np
from pandas import Series
from matplotlib import pyplot
from pandas import DataFrame
from pandas import concat

folder_path = '/home/nguyen/spark-lab/spark-2.1.1-bin-hadoop2.7/abc'
# for file_name in os.listdir(folder_path):
# 	print file_name +"\n"
series = Series.from_csv(
    '/home/nguyen/spark-lab/spark-2.1.1-bin-hadoop2.7/abc/part-00000-09243fc6-2d5e-4b7e-a4d7-146ad3399131.csv',
    header=0)
X = series.values
pyplot.plot(X)
pyplot.show()
Example #37
# create a difference transform of the dataset
def difference(dataset):
	diff = list()
	for i in range(1, len(dataset)):
		value = dataset[i] - dataset[i - 1]
		diff.append(value)
	return numpy.array(diff)

# Make a prediction given regression coefficients and lag obs
def predict(coef, history):
	yhat = coef[0]
	for i in range(1, len(coef)):
		yhat += coef[i] * history[-i]
	return yhat

series = Series.from_csv('../data/nifty.csv', header=0)
# split dataset
X = difference(series.values)
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:]
# train autoregression
model = AR(train)
model_fit = model.fit(maxlag=6, disp=False)
window = model_fit.k_ar
coef = model_fit.params
# walk forward over time steps in test
history = [train[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
	yhat = predict(coef, history)
	obs = test[t]
Example #38
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64
"""


count_subset = agg_counts.take(indexer)[-10:]
print count_subset
print type(count_subset)
"""
                     Not Windows  Windows
tz                                       
America/Sao_Paulo           13.0     20.0
Europe/Madrid               16.0     19.0
Pacific/Honolulu             0.0     36.0
Asia/Tokyo                   2.0     35.0
Europe/London               43.0     31.0
America/Denver             132.0     59.0
America/Los_Angeles        130.0    252.0
America/Chicago            115.0    285.0
                           245.0    276.0
America/New_York           339.0    912.0
<class 'pandas.core.frame.DataFrame'>
"""

##write back 
count_subset.to_csv('./333.csv')
print '----'
print Series.from_csv('./333.csv',parse_dates=True)
# print pd.read_csv(csv_path2,header=None)
Example #39
# separate out a validation dataset
from pandas import Series
series = Series.from_csv('champagne.csv', header=0)
split_point = len(series) - 12
dataset, validation = series[0:split_point], series[split_point:]
print('Dataset %d, Validation %d' % (len(dataset), len(validation)))
dataset.to_csv('dataset.csv')
validation.to_csv('validation.csv')
Example #40
File: hjm.py Project: alpmdog/CQF
 filename = options.filename
 sim_no = options.sim_no
 if options.product is None:
     parser.error("Need to pass in a product")
 product = create_product(options)
 logger.info('filename: %s, no of sim: %s price %s', filename, sim_no,
              product)
 forwards = pandas.read_csv(filename)
 forwards = forwards.set_index('Dates')
 forwards = forwards.dropna()
 if options.cache_pca and \
    os.path.exists('eigenmat.csv') and \
    os.path.exists('eigenvecs.csv'):
     logger.info("Using previously calculated PCAs")
     eigenmat = DataFrame.from_csv('eigenmat.csv')
     eigenvecs = Series.from_csv('eigenvecs.csv')
 else:
     logger.info('Calculating PCA')
     eigenvecs, eigenmat = calculate_pca(forwards)
     eigenmat.to_csv("eigenmat.csv")
     eigenvecs.to_csv("eigenvecs.csv")
 tenors = eigenvecs.index.astype("float64").tolist()
 tenors.sort()
 sqrt_lambdas = eigenvecs[:3].apply(sqrt)
 eigenmat = eigenmat.transpose()
 vols = eigenmat.apply(lambda x: numpy.asarray(x)
                       * numpy.asarray(sqrt_lambdas))
 factors = create_factors(vols, 3)
 vol1fit, poly1 = curve_fit(vols.iloc[0], 1)
 vol2fit, poly2 = curve_fit(vols.iloc[1])
 vol3fit, poly3 = curve_fit(vols.iloc[2])
Example #41
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt

# create a differenced series
def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
		value = dataset[i] - dataset[i - interval]
		diff.append(value)
	return diff

def inverse_difference(history, yhat, interval=1):
	return yhat + history[-interval]


series = Series.from_csv('LSE-ABDP-dataset.csv')
X = series.values
X = X.astype('float32')
train_size = int(len(X) * 0.50)
train, test = X[0:train_size], X[train_size:]
# walk-forward validation
history = [x for x in train]
predictions = list()
#bias = 165.904728
months_in_year = 12
diff = difference(history, months_in_year)
# predict
model = ARIMA(diff, order=(2,0,1))

for i in range(len(test)):
	model_fit = model.fit(trend='nc', disp=0)
Example #42
data.to_csv(sys.stdout, index=False, header=False)

print('\n')

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

print('\n')

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)

ts.to_csv('data/tseries.csv')

print('\n')

print(Series.from_csv('data/tseries.csv', parse_dates=True))

print('\n')
Example #43
	dataset = dataset.astype('float32')
	best_score, best_cfg = float("inf"), None
	for p in p_values:
		for d in d_values:
			for q in q_values:
				order = (p,d,q)
				try:
					mse = evaluate_arima_model(dataset, order)
					if mse < best_score:
						best_score, best_cfg = mse, order
					print('ARIMA%s MSE=%.3f' % (order,mse))
				except:
					continue
	print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
 
# load dataset
def parser(x):
	return datetime.strptime('190'+x, '%Y-%m')
#series = read_csv('input/shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
series = Series.from_csv('../input/shampoo-sales.csv', header=0)

# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)

#evaluate_models(series.values, p_values, d_values, q_values)

#Best: (6, 1, 0)
dataset = series.values.astype('float32')
evaluate_arima_model(dataset, (6, 1, 0))
Example #44
# specify the number of rows to read
df10 = pd.read_csv('resources/ex5.csv',nrows=10)
# print df10

# write to csv: missing values written as 'Nan', row labels not written to the file, column labels as the header
df7.to_csv('resources/write1.csv',sep=',',na_rep='Nan',index=False)

# write the specified columns in the given order
df7.to_csv('resources/write1.csv',sep=',',na_rep='Nan',index=False,columns=['b','c','a'])


# reading and writing Series
dates1 = pd.date_range('1/1/2000','1/1/2016')
s = Series(dates1,index=np.arange(dates1.size))
# print s
s.to_csv('resources/write2.csv',sep=',')
s1 = Series.from_csv('resources/write2.csv')
# print s1

# csv
f = open('resources/write1.csv')
reader = csv.reader(f)
lines = list(reader)
header,values = lines[0],lines[1:]
data_dic = {
    k:v for k,v in zip(header,zip(*values))
}
# print data_dic

Example #45
    t_params = ['n', 'c', 't', 'ct']
    m_params = seasonal
    # create config instances
    for trend_elements in product(p_params, d_params, q_params):
        for t in t_params:
            for seasonal_elements in product(P_params, D_params, Q_params,
                                             m_params):
                cfg2 = [trend_elements, seasonal_elements, t]
                models.append(cfg2)
    return (models)


if __name__ == '__main__':
    # define dataset
    series = Series.from_csv(
        "/Users/richardcollins/Desktop/Time_Series/monthly-car-sales.csv",
        header=0,
        index_col=0)
    data = series.values
    print(data.shape)
    # data split
    n_test = 12
    # model configs
    cfg_list = sarima_configs(seasonal=[12])
    # grid search
    scores = grid_search(data, cfg_list, n_test, parallel=False)
    print("done")
    # list top 3 configs
    for cfg, error in scores[:3]:
        print(cfg, error)

    train, test = train_test_split(data, n_test)
Example #46
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 17 01:59:11 2017

@author: user
"""

# calculate descriptive statistics
from pandas import Series
series = Series.from_csv('mdata2.csv')
print(series.describe())
Example #47
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
import numpy

# create a differenced series
def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
		value = dataset[i] - dataset[i - interval]
		diff.append(value)
	return numpy.array(diff)

# load dataset
series = Series.from_csv('dataset.csv', header=None)
# seasonal difference
X = series.values
days_in_year = 365
differenced = difference(X, days_in_year)
# fit model
model = ARIMA(differenced, order=(7,0,1))
model_fit = model.fit(disp=0)
# print summary of fit model
print(model_fit.summary())
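A possible next step, not shown in the snippet: forecast a week ahead and invert the seasonal differencing by adding back the observation from one year earlier, mirroring the inverse_difference helpers used elsewhere on this page:

history = [x for x in X]
forecast = model_fit.forecast(steps=7)[0]
day = 1
for yhat in forecast:
	inverted = yhat + history[-days_in_year]
	print('Day %d: %.3f' % (day, inverted))
	history.append(inverted)
	day += 1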
Example #48
from pandas import Series
from pandas import DataFrame
from pandas import concat

series = Series.from_csv('daily-temp.csv', header=0)
temps = DataFrame(series.values)

print(temps.head())

#Sliding window of size 1
print('Sliding window of size 1')
dataframe = concat([temps.shift(1), temps], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

#Sliding window of size 3
print('Sliding window of size 3')
dataframe = concat(
    [temps.shift(3), temps.shift(2),
     temps.shift(1), temps], axis=1)
dataframe.columns = ['t-3', 't-2', 't-1', 't']
print(dataframe.head(5))

#Creating a rolling window with means
#print('Creating a rolling window with mean')
shifted = temps.shift(1)
window = shifted.rolling(window=2)
print('++++++++++++++++')
print(window)
means = window.mean()
dataframe = concat([means, temps], axis=1)
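To finish the rolling-window illustration, the joined frame can be labelled and inspected; the column names below are illustrative only:

dataframe.columns = ['mean(t-2,t-1)', 't+1']
print(dataframe.head(5))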
Example #49
import pandas as pd
from scipy import spatial
import gc
import math
from pandas import Series,DataFrame
from scipy.spatial.distance import cosine
import numpy
def dist(v1,v2): 
	return numpy.linalg.norm(v1-v2)
idf = Series.from_csv("idf.csv")
tfidf = pd.read_csv("tfidf.csv",index_col = 0)
#print tfidf.head(5)
#print idf.size
tfquery = Series()

linea = "Armed Robbery Suspect Arrested w/ Handgun" # QUERY TO EVALUATE
linea = linea.upper()
tokens = linea.split()	
for word in tokens:
	if word in tfidf.columns:
		print word
		if word in tfquery:
			tfquery[word] = tfquery[word] + 1
		else :
			tfidf[word] = 0
			test = Series({word : 1})
			tfquery = tfquery.add(test, fill_value=0)
tfquery = tfquery/len(tokens)
tfquery = tfquery.multiply(idf,fill_value = 0)
print "FINISHED TFIDF"