Example #1
def add_bulk_data_values(session, series, dvs_size):
    """
    Load up exampleData.csv into a series' datavalues field
    """
    assert 10000 >= dvs_size > 0
    path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(path, 'example_files', 'exampleData.csv')
    df = pd.read_csv(filepath)
    # pd.Timestamp is a datetime.datetime subclass, so no further conversion is needed
    df['LocalDateTime'] = pd.to_datetime(df['LocalDateTime'])
    df['DateTimeUTC'] = pd.to_datetime(df['DateTimeUTC'])
    dvs = []
    for record in df.to_dict('records')[:dvs_size]:
        dv = DataValue()
        dv.data_value = record['DataValue']
        dv.local_date_time = record['LocalDateTime']
        dv.utc_offset = record['UTCOffset']
        dv.date_time_utc = record['DateTimeUTC']
        dv.site_id = series.site_id
        dv.variable_id = series.variable_id
        dv.censor_code = record['CensorCode']
        dv.method_id = series.method_id
        dv.source_id = series.source_id
        dv.quality_control_level_id = series.quality_control_level_id
        dvs.append(dv)
    series.data_values = dvs
    session.add_all(dvs)
    session.commit()
    return df
Example #2
    def parse(self, files):

        block_groups = self.group_files(files)

        for file_grp in block_groups.values():
            files, mdicts = zip(*file_grp)
            blk = Block()
            blk.name = self.get_name(mdicts[0]["name"])
            date = pd.to_datetime(mdicts[0]["date"], format="%y%m%d").date()
            time = pd.to_datetime(mdicts[0]["time"], format="%H%M%S").time()
            file_types = [m["file"] for m in mdicts]
            if "parameters" in file_types:
                fname = files[file_types.index("parameters")]
                blk.start, blk.first_peck, blk.end = self.parse_time_file(fname, date, time)
            else:
                # pd.datetime was removed; Timestamp.combine is the direct equivalent
                blk.start = pd.Timestamp.combine(date, time)

            if not blk.is_complete:
                continue

            if "timestamp" in file_types:
                fname = files[file_types.index("timestamp")]
                blk.data = self.get_block_data(fname, start=blk.start)
                if (blk.data is None) or (len(blk.data) <= 1):
                    continue
                blk.compute_statistics()

            blk.files = files
            self.blocks.append(blk)

        return self.blocks
    def process_df(self, df):
        print("df_before_processing = ")
        print(str(df))
        # df['Datetime'] = [datetime.strptime(date_string, '%d %b %Y %H:%M') for date_string in df['Date']]# datetime.strptime(df['Date'], "%d %m %Y %H:%M")
        df['FromDate'] = pd.to_datetime(df['FromDate'])
        df['Date'] = pd.to_datetime(df['Date'])
        # df.Date.dt.hour
        # whole hours between the two timestamps, expressed as fractional days
        df['DayDiff'] = ((df['Date'] - df['FromDate']) // pd.Timedelta(hours=1)) / 24
        df['HoursOfDay'] = df['Date'].dt.hour + df['Date'].dt.minute/60
        df['DayOfWeek'] = df['Date'].dt.weekday
        # df['MinutesOfDay'] = df['Date'].dt.minute
        # df['Minutes'] = [date_time.hours*60 + date_time.minutes
        #                  for date_time in df['Datetime']]
        # datestring = '01 Jan 2016 10:00'
        # my_datetime = datetime.strptime(datestring, '%d %b %Y %H:%M')
        # print("my_datetme = " + str(my_datetime))
        print(str(df))
        # date_df = df["Date"]
        # print('date_df = ')
        # print(str(date_df))
        return df

# To add in the from date:
# First, group by project and issue number, then sort by date,
# and within each section add a counter.
# Or can we use the index, which is already a counter?
    def _get_dollar_values(self, group=False):
        """Calculate the value of portfolio holdings using closing prices.
        Optionally aggregate the values into groups provided in config.
        """
        dates = sorted(self._config['dates'])

        # Copy dataframe and zero data before earliest portfolio date.
        dollar_values = self._daily['close'].copy()
        dollar_values.loc[
            dollar_values.index < pd.to_datetime(str(dates[0])), :] = 0.0

        # Loop thru dates and calculate each date range using bitmask index.
        for i, item in enumerate(dates):
            index = dollar_values.index >= pd.to_datetime(str(item))
            if i < (len(dates) - 1):
                index = index & (
                    dollar_values.index < pd.to_datetime(str(dates[i + 1])))
            for key in list(dollar_values.columns.values):
                value = self._config['dates'][item]['symbols'].get(key)
                if value is None:
                    dollar_values.loc[index, key] = 0.0
                else:
                    dollar_values.loc[index, key] *= value * self._config[
                        'value_ratio']

        if group is True:
            dollar_values = self._sum_symbol_groups(dollar_values)
        return dollar_values
Example #5
    def __init__(self, sids, fields, start=None, end=None, period=None, ignore_security_error=0,
                 ignore_field_error=0, period_adjustment=None, currency=None, override_option=None,
                 pricing_option=None, non_trading_day_fill_option=None, non_trading_day_fill_method=None,
                 max_data_points=None, adjustment_normal=None, adjustment_abnormal=None, adjustment_split=None,
                 adjustment_follow_DPDF=None, calendar_code_override=None, **overrides):

        Request.__init__(self, '//blp/refdata', ignore_security_error=ignore_security_error,
                         ignore_field_error=ignore_field_error)
        period = period or 'DAILY'
        assert period in ('DAILY', 'WEEKLY', 'MONTHLY', 'QUARTERLY', 'SEMI-ANNUAL', 'YEARLY')
        self.is_single_sid = is_single_sid = isinstance(sids, str)
        self.is_single_field = is_single_field = isinstance(fields, str)
        self.sids = [sids] if is_single_sid else list(sids)
        self.fields = [fields] if is_single_field else list(fields)
        self.end = end = pd.to_datetime(end) if end else pd.Timestamp.now()
        self.start = pd.to_datetime(start) if start else end - pd.DateOffset(years=1)
        self.period = period
        self.period_adjustment = period_adjustment
        self.currency = currency
        self.override_option = override_option
        self.pricing_option = pricing_option
        self.non_trading_day_fill_option = non_trading_day_fill_option
        self.non_trading_day_fill_method = non_trading_day_fill_method
        self.max_data_points = max_data_points
        self.adjustment_normal = adjustment_normal
        self.adjustment_abnormal = adjustment_abnormal
        self.adjustment_split = adjustment_split
        self.adjustment_follow_DPDF = adjustment_follow_DPDF
        self.calendar_code_override = calendar_code_override
        self.overrides = overrides
Example #6
def get_histdata(symbol,startDate="1990-01-01",endDate=None):
    """
    symbol: string
        AXS code for a stock
    startDate,endDate: date string or datetime object
        these can be omitted
    """
    startDate = pd.to_datetime(startDate)
    if endDate is not None:
        endDate = pd.to_datetime(endDate)
    else:
        endDate = pd.Timestamp.now()
    url = urlYahooFinance.format(
                symbol,startDate.month-1,startDate.day,startDate.year,
                endDate.month-1,endDate.day,endDate.year
        )
    # print url
    
    try:
        df = pd.read_csv(url)
        df.set_index("Date",inplace=True)
        df.index = pd.to_datetime(df.index)
        df.sort_index(inplace=True)
        return df
    except Exception:
        print("Couldn't find this stock at Yahoo Finance, or the request was blocked by Yahoo!")
        return None
def build_sonar_json(**kwargs):
    """Provide JSON file"""
    prod_file = conf['prod_data_dir'] + '/sonar_payload.json'
    keen_collection = 'sonar_pings_{}'.format(conf['env']).lower()

    sonar_data = get_keen_sonar(keen_collection, 'this_30_days')

    df = pd.DataFrame(sonar_data)
    df['keen_timestamp'] = df['keen'].map(lambda x: x['created_at'])
    df['exec_date'] = pd.to_datetime(df['exec_date'], format=conf['date_format_keen'])
    df['keen_timestamp'] = pd.to_datetime(df['keen_timestamp'], format=conf['date_format_keen'])
    df.sort_values(by=['exec_date', 'keen_timestamp'], ascending=False, inplace=True)
    df.drop_duplicates(subset='value_key', keep='first', inplace=True)
    df.reset_index(inplace=True)
    df.drop(['index', 'keen'], axis=1, inplace=True)
    df['exec_date'] = df['exec_date'].dt.strftime(conf['date_format_keen'])
    df['keen_timestamp'] = df['keen_timestamp'].dt.strftime(conf[
        'date_format_keen'])
    sonar_payload = json.loads(df.to_json(orient='records'))



    notify_keen(
        {
            'sonar_payload': sonar_payload
        },
        'sonar_payloads_{}'.format(conf['env']).lower(),
        raise_for_status=True)

    with open(prod_file, 'w') as outfile:
        json.dump(sonar_payload, outfile, indent=4, sort_keys=True)


    return sonar_payload
Example #8
def main():
    df = pd.read_csv(args.file)

    # convert column types
    df['time_x'], df['time_y'], df['duration'] = \
        pd.to_datetime(df['time_x']), \
        pd.to_datetime(df['time_y']), \
        pd.to_timedelta(df['duration'])

    df['duration'] = df['duration'] / np.timedelta64(1, 's')
    df['time_group'] = ((df['time_x'].dt.hour * 60 +
                         df['time_x'].dt.minute) / 5).apply(math.ceil)

    print("原始长度: {}".format(len(df)))
    grouped = df.groupby('time_group')
    statBefore = pd.DataFrame({'q1': grouped['duration'].quantile(.25),
                               'q3': grouped['duration'].quantile(.75)})
    df['outlier'] = df.apply(is_outlier, axis=1, args=(statBefore,))
    df = df[~(df.outlier)]
    del df['outlier']
    print("filtered长度: {}".format(len(df)))

    df_weekday = df[df['time_x'].dt.weekday < 5]   # Mon-Fri (dt.weekday: Mon=0 ... Sun=6)
    df_weekend = df[df['time_x'].dt.weekday >= 5]  # Sat-Sun
    draw_weekdays_plot(df_weekday)
    draw_weekdays_plot(df_weekend)

    df_weekday.boxplot(column='duration', by='time_group')
    plt.show()
    df_weekend.boxplot(column='duration', by='time_group')
    plt.show()
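The filter above depends on an is_outlier helper that is not shown in the excerpt. A minimal sketch of such a helper, assuming the usual 1.5 * IQR rule against the per-group quartiles stored in statBefore (this body is an illustration, not the original implementation):

def is_outlier(row, statBefore):
    # Quartiles computed earlier for this row's 5-minute time group.
    q1 = statBefore.loc[row['time_group'], 'q1']
    q3 = statBefore.loc[row['time_group'], 'q3']
    iqr = q3 - q1
    # Flag durations outside the conventional 1.5 * IQR fences.
    return (row['duration'] < q1 - 1.5 * iqr) or (row['duration'] > q3 + 1.5 * iqr)

This signature matches the call df.apply(is_outlier, axis=1, args=(statBefore,)) used above.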
Example #9
    def test_frame_add_datetime64_col_other_units(self):
        n = 100

        units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y']

        ns_dtype = np.dtype('M8[ns]')

        for unit in units:
            dtype = np.dtype('M8[%s]' % unit)
            vals = np.arange(n, dtype=np.int64).view(dtype)

            df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
            df[unit] = vals

            ex_vals = to_datetime(vals.astype('O')).values

            self.assertEqual(df[unit].dtype, ns_dtype)
            self.assertTrue((df[unit].values == ex_vals).all())

        # Test insertion into existing datetime64 column
        df = DataFrame({'ints': np.arange(n)}, index=np.arange(n))
        df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype)

        for unit in units:
            dtype = np.dtype('M8[%s]' % unit)
            vals = np.arange(n, dtype=np.int64).view(dtype)

            tmp = df.copy()

            tmp['dates'] = vals
            ex_vals = to_datetime(vals.astype('O')).values

            self.assertTrue((tmp['dates'].values == ex_vals).all())
Example #10
    def test_allow_exact_matches_and_tolerance2(self):
        # GH 13695
        df1 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob']})
        df2 = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.000',
                                    '2016-07-15 13:30:00.030']),
            'version': [1, 2]})

        result = pd.merge_asof(df1, df2, on='time')
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [2]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False)
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [1]})
        assert_frame_equal(result, expected)

        result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False,
                               tolerance=pd.Timedelta('10ms'))
        expected = pd.DataFrame({
            'time': pd.to_datetime(['2016-07-15 13:30:00.030']),
            'username': ['bob'],
            'version': [np.nan]})
        assert_frame_equal(result, expected)
Example #11
 def merge(self, weather, price):
     weather['date'] = pd.to_datetime(weather['date'])
     price['日期'] = pd.to_datetime(price['日期'])
     weather = weather.set_index('date')
     m = pd.merge(price, weather, left_on='日期',
                       right_index=True, how='left')
     return m
 def get_absolute_margin(exec_lot, req_price, soft_pnl, instrument, time):
     if exec_lot == 0.0:
         return np.nan
     ccy1 = instrument.split('/')[0]
     ccy2 = instrument.split('/')[1]
     ccy2_pnl = soft_pnl/exec_lot
     if (ccy2 == 'USD') or (ccy2 == 'USD_TOM') or (ccy2 == 'USD_TOD'):
         return ccy2_pnl
     if (ccy1 == 'USD'):
         return ccy2_pnl*req_price
     ccy_pair = ('/' + ccy2 + 'USD')
     swaped_ccy_pair = ('/' + 'USD' + ccy2)
     timestamp_end = pd.to_datetime(time)
     str_timestamp_end = str(timestamp_end)
     timestamp_start = pd.to_datetime(time) - timedelta(days=1)
     str_timestamp_start = str(timestamp_start)
     query = "index>Timestamp('" + str_timestamp_start + "') & index<Timestamp('" + str_timestamp_end +"')"
     if ccy_pair in instruments:
         for_price = prices_store.select(ccy_pair, where=query).tail(1)
         try:
             convert_price = (for_price['Bid'] + for_price['Ask']).values[0]/2
         except IndexError:
             return ccy2_pnl/req_price
         return ccy2_pnl/convert_price
     elif swaped_ccy_pair in instruments:
         for_price = prices_store.select(swaped_ccy_pair, where=query).tail(1)
         try:
             convert_price = (for_price['Bid'] + for_price['Ask']).values[0]/2
         except IndexError:
             return ccy2_pnl*req_price
         return ccy2_pnl*convert_price
     else:
         print(time, instrument)
         return ccy2_pnl
def clean_data():
    """Get the permits file from temp directory, clean it, and save it in Prod directory"""

    df = pd.read_csv(temp_permits)
    df.columns = [x.lower() for x in df.columns]
    df['approval_issue_dt'] = pd.to_datetime(
        df['approval_issue_dt'], errors='coerce')
    df['approval_close_dt'] = pd.to_datetime(
        df['approval_close_dt'], errors='coerce')

    df['proj_appl_date'] = pd.to_datetime(
        df['proj_appl_date'], errors='coerce')

    df['proj_deemed_cmpl_date'] = pd.to_datetime(
        df['proj_deemed_cmpl_date'], errors='coerce')

    df = df.sort_values(by='approval_issue_dt')

    logging.info('Writing all permits')

    general.pos_write_csv(
        df,
        prod_permits,
        date_format=conf['date_format_ymd_hms'])

    return 'Successfully cleaned data.'
Example #14
def test_asfreq_actual():
    a = pd.Series({pd.to_datetime('2010-02-27'): 100,
                   pd.to_datetime('2010-03-25'): 200})
    actual = a.asfreq_actual(freq='M', method='ffill')

    assert len(actual) == 1
    assert '2010-02-27' in actual
Example #15
def show_worst_drawdown_periods(returns, top=5):
    """
    Prints information about the worst drawdown periods.

    Prints peak dates, valley dates, recovery dates, and net drawdowns.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, non-cumulative.
    top : int, optional
        Amount of top drawdowns periods to plot (default 5).
    """

    print('\nWorst Drawdown Periods')
    drawdown_df = timeseries.gen_drawdown_table(returns, top=top)
    drawdown_df['peak date'] = pd.to_datetime(
        drawdown_df['peak date'],
        unit='D')
    drawdown_df['valley date'] = pd.to_datetime(
        drawdown_df['valley date'],
        unit='D')
    drawdown_df['recovery date'] = pd.to_datetime(
        drawdown_df['recovery date'],
        unit='D')
    drawdown_df['net drawdown in %'] = list(
        map(utils.round_two_dec_places, drawdown_df['net drawdown in %']))
    print(drawdown_df.sort_values('net drawdown in %', ascending=False))
Example #16
def get_time_delta(start_date, end_date, start_format, end_format):
    """
    Given strings representing time, returns a timedelta object
    representing the time difference between two dates
    """
    time_delta = pd.to_datetime(end_date, format=end_format) - pd.to_datetime(start_date, format=start_format)
    return time_delta
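A quick, purely illustrative call showing how the two format strings line up with their date strings (the values are made up):

# timedelta between two differently formatted timestamps
delta = get_time_delta('2021-01-01', '02/01/2021 12:00',
                       start_format='%Y-%m-%d', end_format='%d/%m/%Y %H:%M')
print(delta)  # 1 days 12:00:00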
def get_quote_yahoojp(code, start=None, end=None, interval='d'):
    base = 'http://info.finance.yahoo.co.jp/history/?code={0}.T&{1}&{2}&tm={3}&p={4}'
    start, end = web._sanitize_dates(start, end)
    start = 'sy={0}&sm={1}&sd={2}'.format(start.year, start.month, start.day)
    end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day)
    p = 1
    results = []

    if interval not in ['d', 'w', 'm', 'v']:
        raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'")

    while True:
        url = base.format(code, start, end, interval, p)
        tables = pd.read_html(url, header=0)
        if len(tables) < 2 or len(tables[1]) == 0:
            break
        results.append(tables[1])
        p += 1
    result = pd.concat(results, ignore_index=True)
    result.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    if interval == 'm':
        result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月')
    else:
        result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日')
    result = result.set_index('Date')
    result = result.sort_index()
    return result
Example #18
def make_url(
	symbol,
	start_date='2000-1-1',
	stop_date=dt.date.today(),
	freq='d'):
	# Create url to download raw CSV data from Yahoo! Finance.
	# start_date and stop_date can be any format recognized by pd.to_datetime().
	# freq must be one of ['d','w','m'] meaning daily, weekly, monthly.
	
	symbol 		= symbol.upper()
	start_date 	= pd.to_datetime(start_date)
	stop_date 	= pd.to_datetime(stop_date)
	
	params = dict()
	params['s'] = symbol
	params['a'] = start_date.month - 1
	params['b'] = start_date.day
	params['c'] = start_date.year
	params['d'] = stop_date.month - 1
	params['e'] = stop_date.day
	params['f'] = stop_date.year
	params['g'] = freq				
	params['y'] = str(0)
	params['z'] = str(30000)
	
	yurl = 'http://real-chart.finance.yahoo.com/x?'
	for key in sorted(params.keys()):
	    yurl += '&' + key + '=' + str(params[key])
	
	return yurl
Example #19
def draw(tick):
    #import numpy as np

    mydata = Quandl.get("WIKI/" + tick)


    #markers_on = np.array(['2013-02-26','2015-01-26','2016-02-26', '2016-04-01'], dtype='datetime64')
    #df3 = pd.DataFrame(markers_on)
    #df4 = df3.set_index(0)
    #df5 = df4.join(mydata,how='left')
    #df6 = df5['Adj. Close']
    #mynewdata = mydata.join(df6,how="left",lsuffix='_OG',rsuffix='_Mark')


    #get trading start

    def tradedetails(tradetype,tradevalue,minprice,maxprice,isofficer,ceo,cfo,isdir,is10,isother,stock):
        hf = pd.read_html("http://openinsider.com/screener?fd=0&fdr=&td=0&tdr=&s="+ stock + "&o=&t="+ tradetype + "&minprice=" + str(minprice) + "&maxprice=" + str(maxprice) + "&v="+ str(tradevalue) +"&isofficer=" + str(isofficer) + "&isceo=" + str(ceo) + "&iscfo=" + str(cfo) + "&isdirector=" + str(isdir) + "&istenpercent=" + str(is10) + "&isother=" + str(isother) + "&sicMin=&sicMax=&sortcol=1&maxresults=1000")
        return hf[5]

    def convertdate(x):
        return x[5:7] + "/" + x[8:10] + "/" + x[0:4]

    def converttime(x):
        return x[11:]

    def convertnumber(x):
        return x.replace("+","").replace("$","").replace(",","")

    def cleandataframe(df):
        df['Trade Date'] = df['Trade Date'].apply(convertdate)
        df['Filing Time'] = df['Filing Date'].apply(converttime)
        df['Filing Date'] = df['Filing Date'].apply(convertdate)
        #df['Shares Traded'] = df['Shares Traded'].apply(convertnumber)
        df['Value Traded'] = df['Value Traded'].apply(convertnumber)
        #df['Shares Owned'] = df['Shares Owned'].apply(convertnumber)
        return df

    def cleanerdataframe(df):
        df['Trade Date'] = df['Trade Date'].apply(convertdate)
        df['Filing Time'] = df['Filing Date'].apply(converttime)
        df['Filing Date'] = df['Filing Date'].apply(convertdate)
        df['Shares Traded'] = df['Shares Traded'].apply(convertnumber)
        df['Value Traded'] = df['Value Traded'].apply(convertnumber)
        #df['Shares Owned'] = df['Shares Owned'].apply(convertnumber)
        return df


    detail = tradedetails("p",25000,"","",0,1,1,0,0,0,tick)
    detail['Trade Date'] = pd.to_datetime(detail['Trade Date'])  # assign back so set_index uses real datetimes
    detail = detail.set_index('Trade Date')
    newdetail = detail.join(mydata,how='left')
    df6 = newdetail['Adj. Close']
    mynewdata = mydata.join(df6,how="left",lsuffix='_OG',rsuffix='_Mark')

    #get trading end

    plt.plot(mynewdata['Adj. Close_OG'])
    plt.plot(mynewdata['Adj. Close_Mark'],marker='o',color='r', markersize=11)
    plt.show()
Example #20
def do_charts(slicer, pdfpages):
    print "\n***Generating Charts***"
    
    fig, ax = plt.subplots(figsize=(7, 6), dpi=80)    

    start = pd.to_datetime('2010-12-13 13:54:10.5-05:00')
    end = pd.to_datetime('2010-12-13 13:54:11.5-05:00')
    
    window_sizes = [32, 64, 128]
    raw = slicer.series['raw'][start:end]
    raw.plot()
    
    for ws in window_sizes:
        slicer.extract_rolling_median(seriesname = 'raw', window_size = ws)
        rm = slicer.series['raw_rolling_median_' + str(ws)][start:end]
        rm.plot(xticks=[i for i in rm.index])
    
    plt.legend(['512Hz EEG']+[ 'Rolling Median %d window size' % ws \
                                for ws in window_sizes]
                                ,loc='best')
    plt.ylabel(r"Potential ($\mu$V)")
    plt.xlabel(r"Time ($\mu$Sec)")
    #plt.title('10 Hz rolling median, compared to 512Hz signal')
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%S.%f'))
    ax.set_ylim(ax.get_ylim()[::-1])
    pdfpages.savefig()
Example #21
def load(
	symbol_list,
	start_date='2000-1-1',
	stop_date=dt.date.today(),
	freq='d',
	verbose=True):
	# Get data for multiple symbols. Returns a dictionary of DataFrames.
	# Each DataFrame stores one variable (e.g 'TotalValue') for all symbols.
	# symbol_list should be a list of strings, e.g. ['SPY','AAPL','^GSPC']
	# Set verbose = False to disable printing to screen.
	
	start_date 	= pd.to_datetime(start_date)
	stop_date	= pd.to_datetime(stop_date)
	
	tables = dict()
	if verbose: print "Loading symbols",
	for symbol in symbol_list:
	    if verbose: print symbol,
	    tables[symbol] = get_table(symbol,start_date,stop_date,freq)
	if verbose: print "\nAll symbols loaded.\n"
	
	p = pd.Panel(tables)
	p = p.transpose(2,1,0)
	
	return dict(p)
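pd.Panel was removed in pandas 1.0 (and the print statements above are Python 2), so load() no longer runs on current stacks. A rough modern sketch that returns the same per-variable dictionary of DataFrames, assuming get_table returns one DataFrame per symbol indexed by date with one column per variable, as the original implies:

def load_v2(symbol_list, start_date='2000-1-1', stop_date=dt.date.today(), freq='d'):
    # Same idea as load(), without pd.Panel.
    start_date = pd.to_datetime(start_date)
    stop_date = pd.to_datetime(stop_date)
    tables = {sym: get_table(sym, start_date, stop_date, freq) for sym in symbol_list}
    combined = pd.concat(tables, axis=1)  # columns become (symbol, variable) pairs
    # Split back out so each returned DataFrame holds one variable for all symbols.
    return {var: combined.xs(var, axis=1, level=1)
            for var in combined.columns.get_level_values(1).unique()}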
Example #22
def load_yahoo_stock(sids, start=None, end=None, dvds=True):
    if hasattr(sids, '__iter__') and not isinstance(sids, str):
        return Instruments([load_yahoo_stock(sid, start=start, end=end, dvds=dvds) for sid in sids])
    else:
        sid = sids
        end = pd.to_datetime(end) if end else pd.Timestamp.now()
        start = pd.to_datetime(start) if start else end - pd.DateOffset(years=1)
        data = get_data_yahoo(sid, start=start, end=end)
        data = data.rename(columns=lambda c: c.lower())
        if dvds:
            d = get_dividends_yahoo(sid, start, end)
            d.columns = ['dvds']
            if not d.empty:
                # sanity check - not expected currently
                missing = d.index.difference(data.index)
                if len(missing) > 0:
                    raise Exception('dividends occur on non-business day, not expecting this')
                # another sanity check to ensure yahoo rolls dividends up, in case a special occurs on same day
                if not d.index.is_unique:
                    d = d.groupby(lambda x: x).sum()
                data = data.join(d)
            else:
                data['dvds'] = np.nan
        pxs = InstrumentPrices(data)
        return Instrument(sid, pxs, multiplier=1.)
def loadData_getColNames(data_columns):
    print "Here are the data columns of your file: "
    print data_columns

    # Find the column names for each of the 5 data streams
    colnames = ['EDA data','Temperature data','Acceleration X','Acceleration Y','Acceleration Z']
    new_colnames = ['','','','','']

    for i in range(len(new_colnames)):
        new_colnames[i] = input("Column name that contains "+colnames[i]+": ")
        while (new_colnames[i] not in data_columns):
            print("Column not found. Please try again")
            print("Here are the data columns of your file: ")
            print(data_columns)

            new_colnames[i] = input("Column name that contains "+colnames[i]+": ")

    # Get user input on sample rate
    sampleRate = input("Enter sample rate (must be an integer power of 2): ")
    while (sampleRate.isdigit()==False) or (np.log(int(sampleRate))/np.log(2) != np.floor(np.log(int(sampleRate))/np.log(2))):
        print("Not an integer power of two")
        sampleRate = input("Enter sample rate (must be an integer power of 2): ")
    sampleRate = int(sampleRate)

    # Get user input on start time
    startTime = pd.to_datetime(input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))
    while type(startTime)==str:
        print("Not a valid date/time")
        startTime = pd.to_datetime(input("Enter a start time (format: YYYY-MM-DD HH:MM:SS): "))


    return sampleRate, startTime, new_colnames
def read_correct_ch_dam_data(csv_file):
    """
    Function to read, calibrate and convert time format (day1 24:00:00
    to day 2 00:00:00) in check dam data
    :param csv_file:
    :return: calibrated and time corrected data
    """
    water_level = pd.read_csv(
        csv_file, skiprows=9, sep=",", header=0, names=["scan no", "date", "time", "raw value", "calibrated value"]
    )
    water_level["calibrated value"] = (water_level["raw value"] * coeff_cal[0]) + coeff_cal[1]  #  in cm
    water_level["calibrated value"] /= 100  # convert to metre
    water_level.rename(columns={"calibrated value": "stage(m)"}, inplace=True)
    # create date time index
    format = "%d/%m/%Y  %H:%M:%S"
    c_str = " 24:00:00"
    for index, row in water_level.iterrows():
        x_str = row["time"]
        if x_str == c_str:
            # convert string to datetime object
            r_date = pd.to_datetime(row["date"], format="%d/%m/%Y ")
            # add 1 day
            c_date = r_date + timedelta(days=1)
            # convert datetime to string
            c_date = c_date.strftime("%d/%m/%Y ")
            c_time = " 00:00:00"
            water_level["date"][index] = c_date
            water_level["time"][index] = c_time

    water_level["date_time"] = pd.to_datetime(water_level["date"] + water_level["time"], format=format)
    water_level.set_index(water_level["date_time"], inplace=True)
    # # drop unnecessary columns before datetime aggregation
    water_level.drop(["scan no", "date", "time", "raw value", "date_time"], inplace=True, axis=1)

    return water_level
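The row-wise loop above can also be written as a vectorised correction; a minimal sketch of the same 24:00:00-to-next-day rollover using a boolean mask (same column names and formats assumed):

# vectorised equivalent of the 24:00:00 -> next-day 00:00:00 correction
mask = water_level["time"] == " 24:00:00"
rolled = pd.to_datetime(water_level.loc[mask, "date"], format="%d/%m/%Y ") + timedelta(days=1)
water_level.loc[mask, "date"] = rolled.dt.strftime("%d/%m/%Y ")
water_level.loc[mask, "time"] = " 00:00:00"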
 def forecast(self , resampled_df ,data_freq = 52 , number_of_predctions = 5):
     
     # start and end date of the series
     start_date = pd.to_datetime(resampled_df.index[0]).date()
     end_date = pd.to_datetime(resampled_df.index[-1]).date()
     
     r_series = self.convert_to_r_series(resampled_df, start_date, data_freq)
 
     # fit the model
     log_r_series = self.base.log(r_series)
     holt_winter_fit = self.stats.HoltWinters(r_series)
     
     # forecast
     holt_winter_forecast = self.forecast_lib.forecast_HoltWinters(holt_winter_fit , \
                                                               h = number_of_predctions)
     # prepare and convert results to pandas dataframe
     reshaped_melted_results= self.reshape.melt(holt_winter_forecast) 
     if data_freq == 52:
         forecast_duration = self.base.as_Date(end_date.strftime('%Y-%m-%d')).ro +\
                             (self.base.seq(1,number_of_predctions).ro * 7)
         myxts = self.xts.xts(reshaped_melted_results, forecast_duration)
         results_field =  'value.value.Point.Forecast'
     elif  data_freq == 12:
         myxts =  holt_winter_forecast 
         results_field =  'value.Point.Forecast' 
         
     results_pd_df = com.convert_robj(self.r.melt(myxts)) 
     results_pd_ts  = results_pd_df[results_field ]
     
     return (results_pd_ts ,holt_winter_forecast)
Example #26
def clean(df):
    df.replace("-unknown-", np.nan, inplace=True)
    df.loc[df.age > 80, "age"] = np.nan
    df.loc[df.age < 18, "age"] = np.nan
    df["timestamp_first_active"] = pd.to_datetime(df.timestamp_first_active.astype(str), format="%Y%m%d%H%M%S")

    df["date_account_created"] = pd.to_datetime(df["date_account_created"])
Example #27
def test_resample_across_dst():
    # The test resamples a DatetimeIndex with values before and after a
    # DST change
    # Issue: 14682

    # The DatetimeIndex we will start with
    # (note that DST happens at 03:00+02:00 -> 02:00+01:00)
    # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00
    df1 = DataFrame([1477786980, 1477790580], columns=['ts'])
    dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s')
                         .dt.tz_localize('UTC')
                            .dt.tz_convert('Europe/Madrid'))

    # The expected DatetimeIndex after resampling.
    # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00
    df2 = DataFrame([1477785600, 1477789200], columns=['ts'])
    dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s')
                         .dt.tz_localize('UTC')
                            .dt.tz_convert('Europe/Madrid'))
    df = DataFrame([5, 5], index=dti1)

    result = df.resample(rule='H').sum()
    expected = DataFrame([5, 5], index=dti2)

    assert_frame_equal(result, expected)
Example #28
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in Pandas library and adjust a single parameter to get dividends
    from io import StringIO
    from urllib.request import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)
    url = ('http://ichart.finance.yahoo.com/table.csv?' + 's=%s' % sid +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v' +  # THE CHANGE
           '&ignore=.csv')

    with urlopen(url) as resp:
        lines = resp.read()
    rs = pd.read_csv(StringIO(lines.decode('utf-8')), index_col=0,
                     parse_dates=True, na_values='-')[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
Example #29
    def read(cls, rootdir):
        path = cls.metadata_path(rootdir)
        with open(path) as fp:
            raw_data = json.load(fp)

            try:
                version = raw_data['version']
            except KeyError:
                # Version was first written with version 1, assume 0,
                # if version does not match.
                version = 0

            first_trading_day = pd.Timestamp(
                raw_data['first_trading_day'], tz='UTC')
            market_opens = pd.to_datetime(raw_data['market_opens'],
                                          unit='m',
                                          utc=True)
            market_closes = pd.to_datetime(raw_data['market_closes'],
                                           unit='m',
                                           utc=True)
            ohlc_ratio = raw_data['ohlc_ratio']

            if version == 0:
                # version 0 always assumed US equities.
                minutes_per_day = US_EQUITIES_MINUTES_PER_DAY
            else:
                minutes_per_day = raw_data['minutes_per_day']

            return cls(
                first_trading_day,
                market_opens,
                market_closes,
                ohlc_ratio,
                minutes_per_day,
            )
Example #30
def read_data(verbose=False):
    """
    Read the files and return:
    Poids, Temps, Charge, Descr
    data+'Poids.csv'
    data+'Donnees_Ecometering_Temperature-par-site.csv'
    data+'charge.csv'
    data+'Donnees_Ecometering_Description_sites.csv'
    and a dataframe ready for Prediction
    """

    print('Retrieve the weights')
    Poids = read_csv(POIDS_CSV, sep=';')
    Poids.Date = Poids.Date.apply(change_format)
    
    print('Retrieve the temperature')
    Temps = read_csv(TEMPS_CSV, sep=';')
    Temps.Jour = Temps.Jour.apply(change_format)
    Temps.index = to_datetime(Temps['Jour'].values, format='%d/%m/%Y %H:%M')
    Temps = Temps.drop('Jour', axis=1)
    Temps = Temps.resample('10min').ffill()
    if verbose:
        print(echant(Temps, n=10, m=20))
    
    print('Retrieve the charge')
    Charg = read_csv(CHARGE_CSV, sep=';')
    Charg.index = to_datetime(Charg['DATE_LOCAL'].values, format='%d/%m/%Y %H:%M')
    Charg = Charg.drop('DATE_LOCAL', axis=1)
    if verbose:
        print(echant(Charg, n=10, m = 20))
        
    print('Retrieve the description')
    Descr = read_csv(DESCR_CSV, sep=';')

    return Poids, Temps, Charg, Descr
import pandas as pd

# load the dataset
df = pd.read_csv('https://dqlab-dataset.s3-ap-southeast-1.amazonaws.com/data_retail.csv', sep=';')

# print the first five rows
print(df.head())

# print dataset info
print(df.info())

# convert the transaction time columns to datetime format
df['First_Transaction']=pd.to_datetime(df['First_Transaction']/1000, unit='s', origin='1970-01-01')
df['Last_Transaction']=pd.to_datetime(df['Last_Transaction']/1000, unit='s', origin='1970-01-01')

# check the most recent transaction
print(max(df['Last_Transaction']))

# classify whether each customer has churned and store the flag in the is_churn column
df.loc[df['Last_Transaction']<='2018-08-01', 'is_churn'] = True
df.loc[df['Last_Transaction']>'2018-08-01', 'is_churn'] = False

# drop columns that are not needed
del df['no']
del df['Row_Num']

import matplotlib.pyplot as plt

# transaction year
df['Year_First_Transaction'] = df['First_Transaction'].dt.year
df['Year_Last_Transaction'] = df['Last_Transaction'].dt.year
Example #32
cols = ['ID', 'Vehicle', 'Group', 'Average Speed', 'Minimum Speed', 'Min Location', 'Maximum Speed', 'Max Location', 'Date', 'Estimated Start Time', 'Estimated End Time', 'Estimated Duration']

speeding_out = pd.DataFrame(columns=cols)

n = 0

tlist = []
llist = []

tlist.append(speeding.iloc[0]['Speed'])
llist.append(speeding.iloc[0]['Location'])

for index, row in islice(speeding.iterrows(), 1, None):

    if row['Rover'] == speeding.iloc[index-1]['Rover'] and int(pd.to_timedelta(pd.to_datetime(row['Stamp']) - pd.to_datetime(speeding.iloc[index - 1]['Stamp'])) / pd.Timedelta('1 minute')) < 1:

        tlist.append(row['Speed'])
        llist.append(row['Location'])

    else:

        speeding_out.loc[n] = pd.Series({'ID': n, 'Vehicle': speeding.iloc[index - 1]['Rover'], 'Group': speeding.iloc[index - 1]['name'], 'Average Speed': np.mean(tlist),
                                        'Minimum Speed': min(tlist), 'Min Location': llist[tlist.index(min(tlist))], 'Maximum Speed': max(tlist), 'Max Location': llist[tlist.index(max(tlist))], 'Date': pd.to_datetime(speeding.iloc[index - 1]['Stamp']).date(), 'Estimated Start Time': pd.to_datetime(pd.to_datetime(speeding.iloc[index - len(tlist)]['Stamp']) - datetime.timedelta(seconds=10)).time(), 'Estimated End Time': pd.to_datetime(speeding.iloc[index - 1]['Stamp']).time(),
                                        'Estimated Duration': int(pd.to_timedelta(pd.to_datetime(speeding.iloc[index - 1]['Stamp']) - (pd.to_datetime(speeding.iloc[index - len(tlist)]['Stamp']) - datetime.timedelta(seconds=10))) / pd.to_timedelta('1 second'))})
        n += 1

        del tlist[:]
        del llist[:]

        tlist.append(row['Speed'])
def parse_records(urls, admissionIDs):
    '''
    Input: a list of strings, and an array of integers
    Output: a pandas DataFrame
    Description: parse_records is a custom parser that queries MongoDB and
    extracts, for each admissionID, the data fields we want to show in the
    visualization application. The final output is a pandas DataFrame with one
    row per admissionID, with the columns specified in the admissionCols,
    comorbidCols and patientCols arrays.
    '''
    def load_data(dataLocation, collectionName, isURL=False):
        if isURL:
            response = requests.get(dataLocation)
            # assumes the service returns JSON keyed by collection name
            docsAsJSON = response.json()[collectionName]
            return pd.DataFrame(docsAsJSON)
        return pd.read_csv(dataLocation, delimiter='|')

    admissionsURL, comorbidsURL, patientsURL = urls[0], urls[1], urls[2]

    dfAdmissions = load_data(admissionsURL, None, isURL=False)

    dfComorbids = load_data(comorbidsURL, None,
                            isURL=False).rename(index=str,
                                                columns={
                                                    'DRG_MORTALITY':
                                                    'COMORBID_MORTALITY',
                                                    'DRG_SEVERITY':
                                                    'COMORBID_SEVERITY'
                                                })
    dfPatients = load_data(patientsURL, None, isURL=False)

    subjectIDs = dfAdmissions['SUBJECT_ID'][dfAdmissions.HADM_ID.isin(
        admissionIDs)]

    admissionCols = [
        'HADM_ID', 'SUBJECT_ID', 'ADMISSION_TYPE', 'DIAGNOSIS', 'INSURANCE',
        'ETHNICITY', 'LANGUAGE', 'MARITAL_STATUS', 'ADMITTIME', 'DISCHTIME'
    ]

    admissionInfo = dfAdmissions[admissionCols][dfAdmissions.HADM_ID.isin(
        admissionIDs)]

    patientCols = ['SUBJECT_ID', 'GENDER', 'DOB']

    patientInfo = dfPatients[patientCols][dfPatients.SUBJECT_ID.isin(
        subjectIDs)]

    comorbidCols = ['HADM_ID', 'COMORBID_MORTALITY', 'COMORBID_SEVERITY']

    grouped = dfComorbids[comorbidCols][dfComorbids.HADM_ID.isin(
        admissionIDs)].groupby('HADM_ID')
    intFrame = admissionInfo.join(grouped.mean(), how='left', on='HADM_ID')
    finalFrame = intFrame.merge(patientInfo, how='left', on='SUBJECT_ID')

    finalFrame['AGE'] = np.round((pd.to_datetime(finalFrame.ADMITTIME) - pd.to_datetime(finalFrame.DOB)) \
                        / np.timedelta64(365, 'D'))

    finalFrame['HADM_ID'] = finalFrame['HADM_ID'].astype(int)
    finalFrame['SUBJECT_ID'] = finalFrame['SUBJECT_ID'].astype(int)

    return finalFrame
Example #34
def load_vax_data(download = False):
    if download:
        download_data(data, "vaccine_doses_statewise.csv")
    vax = pd.read_csv(data/"vaccine_doses_statewise.csv").set_index("State").T
    vax.columns = vax.columns.str.title()
    return vax.set_index(pd.to_datetime(vax.index, format = "%d/%m/%Y"))
# for j in range(x):
#     df1['e_year'][j]=int(df1['end_date'][j][0:4])
#     df1['e_month'][j]=int(df1['end_date'][j][5:7])
#     df1['e_date'][j]=int(df1['end_date'][j][8:])
#     j=j+1
    

for k in range(x):
    start_date=datetime.date(int(df1['start_date'][k][0:4]),int(df1['start_date'][k][5:7]),int(df1['start_date'][k][8:]))
    end_date=datetime.date(int(df1['end_date'][k][0:4]),int(df1['end_date'][k][5:7]),int(df1['end_date'][k][8:])) 
    diff=end_date-start_date
    diff+= datetime.timedelta(days=(1))
    df1.loc[k, 'days'] = diff.days
    #df1['imp_days'][k]=int(df1['imp'][k])*int(df1['days'][k])
    k=k+1
df1['start_date']=pd.to_datetime(df1['start_date'])
df1['end_date']=pd.to_datetime(df1['end_date'])   
dff=df1.copy()
mask=dff['user_id']==usrl
global df_4
df_4= dff.loc[mask]
print(df_4)
#df1.info()
l_cat=list(df1.goal_cat_id.unique())
l_cat.sort()
cat={"carrer & education":1,"family":2,"finnaces":3,"friends & social life":4,"fun & recreation":5,"health & fitness":6,"love & relationships":7,"personal development":8}
'''for i in test_dict : 
    print(i, test_dict[i]) '''
list_cat = Listbox(root)
list_cat.pack()
def main_fun(sector_name, hold_time, if_only_long, time_para_dict):
    root_path = '/mnt/mfs/DAT_EQT'
    if_save = True
    if_new_program = True

    begin_date = pd.to_datetime('20100101')
    cut_date = pd.to_datetime('20160401')
    end_date = pd.to_datetime('20180901')
    lag = 2
    return_file = ''

    if_hedge = True
    # if_only_long = False
    if sector_name.startswith('market_top_300plus'):
        if_weight = 1
        ic_weight = 0

    elif sector_name.startswith('market_top_300to800plus'):
        if_weight = 0
        ic_weight = 1

    else:
        if_weight = 0.5
        ic_weight = 0.5

    main = FactorTestSector(root_path, if_save, if_new_program, begin_date, cut_date, end_date, time_para_dict,
                            sector_name, hold_time, lag, return_file, if_hedge, if_only_long,
                            if_weight, ic_weight)

    my_factor_list = ['lsgg_num_df_5',
                      'lsgg_num_df_20',
                      'lsgg_num_df_60',
                      'bulletin_num_df',
                      'news_num_df_5',
                      'news_num_df_20',
                      'news_num_df_60',
                      'staff_changes',
                      'funds',
                      'meeting_decide',
                      'restricted_shares',
                      'son_company',
                      'suspend',
                      'shares',
                      'bar_num_7_df',
                      'bar_num_12_df',
                      'sell_key_title_word',
                      'sell_summary_key_word',
                      'buy_key_title__word',
                      'buy_summary_key_word',
                      ]

    ratio_list = ['R_DebtAssets_QTTM',
                  'R_EBITDA_IntDebt_QTTM',
                  'R_EBITDA_sales_TTM_First',
                  'R_BusinessCycle_First',
                  'R_DaysReceivable_First',
                  'R_DebtEqt_First',
                  'R_FairVal_TotProfit_TTM_First',
                  'R_LTDebt_WorkCap_QTTM',
                  'R_OPCF_TotDebt_QTTM',
                  'R_OPCF_TotDebt_QTTM',
                  'R_OPEX_sales_TTM_First',
                  'R_SalesGrossMGN_QTTM',
                  'R_CurrentAssetsTurnover_QTTM',
                  'R_TangAssets_TotLiab_QTTM',
                  'R_NetROA_TTM_First',
                  'R_ROE_s_First',
                  'R_EBIT_sales_QTTM',
                  ]

    tech_list = [
        'ADX_40_20_10',
        'ADX_100_20_10',
        'ADX_200_20_10',
        'AROON_40_80',
        'AROON_200_80',
        'CMO_40_0',
        'CMO_200_0',
        'MFI_40_70_30',
        'MFI_140_70_30',
        'ADOSC_20_60_0',
        'ADOSC_60_120_0',
        'ATR_40_0.2',
        'ATR_140_0.2',
        'RSI_40_30',
        'RSI_140_30',
        'CCI_p150d_limit_12',
        'MACD_40_160',
        'bias_turn_p60d',
        'vol_p50d',
        'vol_p100d',
        'vol_p200d',
        'evol_p30d',
        'evol_p90d',
        'moment_p30200d',
        'moment_p50300d',
        'turn_p30d_0.24',
        'turn_p150d_0.18',
        'TVOL_p30d_col_extre_0.2',
        'TVOL_p90d_col_extre_0.2',
        'TVOL_row_extre_0.2',
        'aadj_r_p20d_col_extre_0.2',
        'aadj_r_p345d_continue_ud_pct',
        'aadj_r_p345d_continue_ud',
        'volume_moment_p1040d',
        'volume_moment_p20120d',
        'return_p30d_0.2',
        'return_p90d_0.2',
        'BBANDS_20_1.5',
        'BBANDS_40_1.5',
        'BBANDS_100_1.5',
        'BBANDS_200_1.5',
        'MACD_12_26_9',
        'MACD_20_60_18',
        'MA_LINE_10_5',
        'MA_LINE_60_20',
        'MA_LINE_120_60',
        'WILLR_10_30',
        'WILLR_40_30',
        'WILLR_100_20',
    ]

    pool_num = 25

    main.test_index_3_(my_factor_list, ratio_list, tech_list, pool_num, suffix_name='13')


time_para_dict = OrderedDict()

time_para_dict['time_para_1'] = [pd.to_datetime('20100101'), pd.to_datetime('20150101'),
                                 pd.to_datetime('20150401'), pd.to_datetime('20150701'),
                                 pd.to_datetime('20151001'), pd.to_datetime('20160101')]

time_para_dict['time_para_2'] = [pd.to_datetime('20110101'), pd.to_datetime('20160101'),
                                 pd.to_datetime('20160401'), pd.to_datetime('20160701'),
                                 pd.to_datetime('20161001'), pd.to_datetime('20170101')]

time_para_dict['time_para_3'] = [pd.to_datetime('20120601'), pd.to_datetime('20170601'),
                                 pd.to_datetime('20170901'), pd.to_datetime('20171201'),
                                 pd.to_datetime('20180301'), pd.to_datetime('20180601')]

time_para_dict['time_para_4'] = [pd.to_datetime('20130801'), pd.to_datetime('20180801'),
                                 pd.to_datetime('20181101'), pd.to_datetime('20181101'),
                                 pd.to_datetime('20181101'), pd.to_datetime('20181101')]
def clean_data(df_csv_datas):
    '''
    Data cleaning
    '''
    row_length_before = df_csv_datas.shape[0]

    print('\tCleanup: drop fields that are not needed')
    df_csv_datas = df_csv_datas.drop(['emmcid','上报时间','异常进程名','进程版本名','进程版本号','异常进程包名',
        '软件系统类型','国家','异常类型','MBN版本信息','异常次数','日志路径'],axis=1)

    df_csv_datas = df_csv_datas.fillna('-1')

    print('\tRegion code: keep China only')
    df_csv_datas = df_csv_datas.loc[df_csv_datas['地区码'] == 'china']
    df_csv_datas = df_csv_datas.drop(['地区码'],axis=1)

    print('\tCarrier: remove test PLMNs')
    fp = open(os.path.join(os.path.abspath('.'),'config','remove_test_plmn.txt'),'r')
    test_plmn_list = [plmn.strip() for plmn in fp.readlines()]
    df_csv_datas = df_csv_datas[-df_csv_datas['运营商'].isin(test_plmn_list)]
    fp.close()

    print('\tIMEI: remove test IMEIs')
    fp = open(os.path.join(os.path.abspath('.'),'config','remove_test_imei.txt'),'r')
    test_imei_list = [imei.strip() for imei in fp.readlines()]
    df_csv_datas = df_csv_datas[-df_csv_datas['imei'].isin(test_imei_list)]
    fp.close()

    print('\tHandle fields that are awkward to process')
    df_csv_datas['省直辖市'] = df_csv_datas['省/直辖市']
    df_csv_datas = df_csv_datas.drop(['省/直辖市'],axis=1)

    df_csv_datas['县区'] = df_csv_datas['县/区']
    df_csv_datas = df_csv_datas.drop(['县/区'],axis=1)

    print('\tField remapping')
    df_csv_datas['机型'] = df_csv_datas['外部机型'].str.cat(df_csv_datas['内部机型'],sep='/')
    df_csv_datas = df_csv_datas.drop(['外部机型','内部机型'],axis=1)

    print('\tOccurrence time: extract the hour of occurrence')
    df_csv_datas['发生时间1'] = pd.to_datetime(df_csv_datas['发生时间'],infer_datetime_format=True)
    df_csv_datas['发生时间h'] = df_csv_datas['发生时间1'].apply(get_hour)
    df_csv_datas = df_csv_datas.drop(['发生时间','发生时间1'],axis=1)

    df_csv_datas['ImsRat'] = df_csv_datas['保留字段一']
    df_csv_datas = df_csv_datas.drop(['保留字段一'],axis=1)

    df_csv_datas['ExtraCode_ErroMSG'] = df_csv_datas['保留字段二']
    df_csv_datas['ExtraCode'] = df_csv_datas['保留字段二'].apply(get_ExtraCode)
    df_csv_datas['ErroMSG'] = df_csv_datas['保留字段二'].apply(get_ErroMSG)
    df_csv_datas = df_csv_datas.drop(['保留字段二'],axis=1)

    df_csv_datas['RSRP'] = df_csv_datas['log信息'].apply(get_rsrp)
    df_csv_datas['RSRQ'] = df_csv_datas['log信息'].apply(get_rsrq)
    df_csv_datas = df_csv_datas.drop(['log信息'],axis=1)

    df_csv_datas['运营商1'] = df_csv_datas['运营商'].apply(get_plmn1)
    df_csv_datas['运营商2'] = df_csv_datas['运营商'].apply(get_plmn2)
    df_csv_datas = df_csv_datas.drop(['运营商'],axis=1)

    print('\tCombine fields that are analyzed together')
    df_csv_datas['开始基站位置'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['起呼位置码'],sep='/').str.cat(df_csv_datas['起呼基站编号'],sep='/')
    df_csv_datas['结束基站位置'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['结束位置码'],sep='/').str.cat(df_csv_datas['结束基站编号'],sep='/')
    df_csv_datas = df_csv_datas.drop(['起呼位置码','起呼基站编号'],axis=1)
    df_csv_datas = df_csv_datas.drop(['结束位置码','结束基站编号'],axis=1)

    df_csv_datas['电话网络']=df_csv_datas['起呼电话网络'].str.cat(df_csv_datas['结束电话网络'],sep='/')
    df_csv_datas['数据网络']=df_csv_datas['开始数据网络'].str.cat(df_csv_datas['结束数据网络'],sep='/')
    df_csv_datas['网络']=df_csv_datas['电话网络'].str.cat(df_csv_datas['数据网络'],sep='/')
    df_csv_datas = df_csv_datas.drop(['起呼电话网络','结束电话网络'],axis=1)
    df_csv_datas = df_csv_datas.drop(['开始数据网络','结束数据网络'],axis=1)

    df_csv_datas['运营商_电话网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['电话网络'], sep='/')
    df_csv_datas['运营商_数据网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['数据网络'], sep='/')
    df_csv_datas['运营商_网络'] = df_csv_datas['运营商2'].str.cat(df_csv_datas['网络'], sep='/')

    df_csv_datas['运营商_网络_原因'] = df_csv_datas['运营商_网络'].str.cat(df_csv_datas['ExtraCode'], sep='/')

    df_csv_datas['省市'] = df_csv_datas['省直辖市'].str.cat(df_csv_datas['市'],sep='/')
    df_csv_datas['省市县区'] = df_csv_datas['省市'].str.cat(df_csv_datas['县区'],sep='/')
    df_csv_datas = df_csv_datas.drop(['市'],axis=1)
    df_csv_datas = df_csv_datas.drop(['县区'],axis=1)

    row_length_after = df_csv_datas.shape[0]
    print('\tRows after/before cleaning='+str(row_length_after)+'/'+str(row_length_before)+', retention ratio='+str(row_length_after*100/row_length_before)+'%')

    return df_csv_datas
Example #39
def datechunk_to_date(date_chunk):
    return pd.to_datetime(date_chunk[0][2:])
Example #40
def test_iter_readonly():
    # GH#28055 ints_to_pydatetime with readonly array
    arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")])
    arr.setflags(write=False)
    dti = pd.to_datetime(arr)
    list(dti)
Example #41
    def create_full_tearsheet(self, results, heatmap_2d=True):
        """
        Create a full tear sheet of param scan results.

        Parameters
        ----------
        results : DataFrame
            multi-index (Field, StrategyOrDate) DataFrame of param scan results,
            with param vals as (possibly multi-level) columns

        heatmap_2d : bool
            use heat maps for 2-parameter scans; if False, use bar charts

        Returns
        -------
        None
        """
        returns = results.loc["AggReturn"]
        returns.index = pd.to_datetime(returns.index)
        returns.index.name = "Date"

        summary = OrderedDict()
        if results.columns.nlevels == 2:
            param1, param2 = results.columns.names
            summary["Parameter 1"] = param1
            summary["Parameter 2"] = param2
            params_title = " / ".join(results.columns.names)
        else:
            summary["Parameter"] = results.columns.name
            params_title = results.columns.name

        summary["Start Date"] = returns.index.min().date().isoformat()
        summary["End Date"] = returns.index.max().date().isoformat()

        with sns.axes_style("white", {'axes.linewidth': 0}):

            fig = plt.figure("Parameter Scan Summary")

            axis = fig.add_subplot(111)
            axis.get_xaxis().set_visible(False)
            axis.get_yaxis().set_visible(False)

            table = axis.table(cellText=[[v] for v in summary.values()],
                               rowLabels=list(summary.keys()),
                               loc="center")

            table.scale(1, 2)
            table.set_fontsize("large")
            fig.tight_layout()

        # Plot 1d bar charts or 2d heat maps
        if results.columns.nlevels == 2 and heatmap_2d:
            self._create_2d_heatmaps(results)
        else:
            self._create_1d_bar_charts(results)

        # Plot performance plots
        performance = DailyPerformance(returns)

        # cut height in half since only one chart per figure
        width, height = self.figsize
        figsize = width, height / 2

        self._create_returns_plots(performance,
                                   subplot=111,
                                   extra_label=" (Aggregate)",
                                   figsize=figsize,
                                   legend_title=params_title)

        self._save_or_show()
#change the number to move the right bound left and right if needed
NumOfDaysToMoveBackFromToday = time() - api.DAY*0

#period of candlesticks to receive: 24, 4, 2, 0.5, 0.25, or 0.083
period = api.HOUR * 24

#api call
raw = api.returnChartData(currencyToGet, period=period, start=time() - api.DAY*numOfDaysToGet, end= NumOfDaysToMoveBackFromToday)

#load dataframe with info from the API call
df = pd.DataFrame(raw)


#create date column and convert epoch time from api call to date
df['date'] = pd.to_datetime(df["date"], unit='s')
df['date']= df['date'].map(dt.datetime.toordinal)

#calculate Hui-Heubel liquidity ratio
df['liquidity'] = ((df['high'] - df['low']) / df['low']) / (df['volume'] / (df['weightedAverage'] * df['quoteVolume']))


#Calculates a relative strength index using an exponential moving average, since EMA better tracks price movements (tortoise vs. hare)
close = df['close']
delta = close.diff()
delta = delta[1:]
up, down = delta.copy(), delta.copy()
up[up < 0] = 0
down[down > 0] = 0
# pd.stats.moments.ewma was removed; Series.ewm(...).mean() is the replacement
roll_up1 = up.ewm(com=windowLength).mean()
roll_down1 = down.abs().ewm(com=windowLength).mean()
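The excerpt stops after the smoothed gains and losses; the usual completion of the RSI from those two series would look roughly like this (variable names follow the snippet above; the 'RSI' column name is illustrative):

# relative strength and RSI on a 0-100 scale
RS1 = roll_up1 / roll_down1
RSI1 = 100.0 - (100.0 / (1.0 + RS1))
df['RSI'] = RSI1  # aligns on index; the first row stays NaN because delta dropped it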
                        pos = 0
                pos_lst.append(pos)

            position['pos' + str(i)] = pos_lst
            position = position.reset_index(drop=True)[[
                'trade_date', 'pos' + str(i)
            ]]
            pos_df = pos_df.merge(position, on=['trade_date'], how='outer')
        pos_df = pos_df.fillna(value=0)
        pos_df = pos_df.sort_values(['trade_date']).set_index(['trade_date'])
        pos_df['position'] = pos_df.sum(axis=1) / len(para_lst)
        pos_df = pos_df.reset_index(drop=False).merge(index_hq, on=['trade_date']).sort_values(['trade_date']) \
            .assign(close_1=lambda df: df.close.shift(1)).dropna()
        fig, ax = plt.subplots(1, 1, figsize=(9, 6))
        ax1 = ax.twinx()
        pos_df.index = pd.to_datetime(pos_df['trade_date'])

        pos_df[['close']].plot(ax=ax1,
                               figsize=(9, 6),
                               kind='line',
                               style=['k-'])
        pos_df[['position']].plot(kind='area',
                                  grid=True,
                                  ax=ax,
                                  figsize=(9, 7),
                                  rot=60,
                                  style=['y'])
        pos_df[['trade_date', 'position'
                ]].to_csv(fold_pos + 'pos_ymjh_' + index_code[:6] + '.csv',
                          encoding='gbk',
                          header=None)
Example #44
def lookup_sources_for_observation(fits_files=None,
                                   filename=None,
                                   force_new=False,
                                   cursor=None,
                                   use_intersection=False,
                                   **kwargs):

    if force_new:
        print(f'Forcing a new source file')
        with suppress(FileNotFoundError):
            os.remove(filename)

    try:
        print(f'Using existing source file: {filename}')
        observation_sources = pd.read_csv(filename, parse_dates=True)
        observation_sources['obstime'] = pd.to_datetime(
            observation_sources.obstime)

    except FileNotFoundError:
        if not cursor:
            cursor = get_cursor(port=5433, db_name='v702', db_user='******')

        print(f'Looking up sources in {len(fits_files)} files')
        observation_sources = None

        # Lookup the point sources for all frames
        for fn in tqdm(fits_files):
            point_sources = lookup_point_sources(fn,
                                                 force_new=force_new,
                                                 cursor=cursor,
                                                 **kwargs)
            header = fits_utils.getheader(fn)
            obstime = Time(pd.to_datetime(os.path.basename(fn).split('.')[0]))
            exptime = header['EXPTIME'] * u.second

            obstime += (exptime / 2)

            point_sources['obstime'] = obstime.datetime
            point_sources['exptime'] = exptime
            point_sources['airmass'] = header['AIRMASS']
            point_sources['file'] = os.path.basename(fn)
            point_sources['picid'] = point_sources.index

            print(f'Combining sources with previous observations')
            if observation_sources is not None:
                if use_intersection:
                    print(f'Getting intersection of sources')

                    idx_intersection = observation_sources.index.intersection(
                        point_sources.index)
                    print(
                        f'Num sources in intersection: {len(idx_intersection)}'
                    )
                    observation_sources = pd.concat([
                        observation_sources.loc[idx_intersection],
                        point_sources.loc[idx_intersection]
                    ],
                                                    join='inner')
                else:
                    observation_sources = pd.concat(
                        [observation_sources, point_sources])
            else:
                observation_sources = point_sources

        print(f'Writing sources out to file')
        observation_sources.to_csv(filename)

    observation_sources.set_index(['obstime'], inplace=True)
    return observation_sources
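#Hedged usage sketch (editor's addition; the paths and the glob call are placeholders, not from the original snippet):
#from glob import glob
#sources = lookup_sources_for_observation(fits_files=glob('/path/to/observation/*.fits'),
#                                         filename='/path/to/observation/point_sources.csv')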
Beispiel #45
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 27 15:23:07 2019

@author: student
"""

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
from statsmodels.tsa.arima_model import ARIMA


plt.style.use('fivethirtyeight')

df=pd.read_csv("/home/student/Desktop/Python/dataset/movavg.csv",
               index_col='Date')
df.index=pd.to_datetime(df.index)

model=ARIMA(df.Price,order=(1,2,0))
model_fit=model.fit(disp=0)
print(model_fit.summary())
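#Hedged forecast sketch (editor's addition; it stays on the legacy statsmodels arima_model API used above,
#whose forecast() returns point forecasts, standard errors and confidence intervals):
forecast, stderr, conf_int = model_fit.forecast(steps=10)
plt.plot(forecast)
plt.title('10-step ahead ARIMA forecast')
plt.show()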
Beispiel #46
0
import argparse
import requests
import pandas as pd
import matplotlib.pyplot as plt


def getdata():
    response = requests.get("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
    with open('data.csv', 'wb') as fp:
        fp.write(response.content)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--logarithmic", action='store_true')
    args = parser.parse_args()

    getdata()
    df = pd.read_csv('data.csv')
    dfg = df.groupby(by='Country/Region').sum()
    dfg.sort_values(by=dfg.columns[-1], ascending=False, inplace=True)
    dfg.drop(labels=['Lat', 'Long'], axis=1, inplace=True)
    dfg.columns = pd.to_datetime(dfg.columns)
    dfplot = dfg.iloc[:10].T.plot(logy=args.logarithmic, title="Covid-19 deaths", grid=True)
    plt.minorticks_on()
    plt.show()
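    #Hedged addition (editor's sketch): difference the cumulative table along the date axis
    #to plot daily new deaths for the same top-10 countries instead of totals.
    daily = dfg.diff(axis=1).fillna(0)
    daily.iloc[:10].T.plot(title="Covid-19 daily deaths", grid=True)
    plt.show()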

Beispiel #47
0
def test_list_of_datetime_time_roundtrip():
    # ARROW-4135
    times = pd.to_datetime(
        ['09:00', '09:30', '10:00', '10:30', '11:00', '11:30', '12:00'])
    df = pd.DataFrame({'time': [times.time]})
    _roundtrip_pandas_dataframe(df, write_kwargs={})
Beispiel #48
0
server = app.server

style = {
    'background': 'white',
    'color': '#E50914',
    'fontFamily': 'Montserrat',
    'fontSize': '20px',
    'text_color': '#564d4d',
    'second_color': '#94A3BC',
    'third_color': '#C1666B'
}

# assume you have a "long-form" data frame
# see https://plotly.com/python/px-arguments/ for more options
df = pd.read_csv('data/viewedHistory_George.csv', sep=';', error_bad_lines=False, engine='python')
df['dateStr'] = pd.to_datetime(df['dateStr'], utc=True)
df_2020 = df[df['dateStr']>'2019-12-31']
df_2019 = df[df['dateStr']<'2020-01-01']
hours_2020 = df_2020.duration.sum()/3600
days_2020 = str(round(hours_2020/24, 2))

df_2020['weekDay'] = df_2020['dateStr'].dt.day_name()
df_2020['weekDayCount'] = df_2020['dateStr'].apply(lambda x: x.weekday())
df_2020['durationM'] = df_2020['duration'].apply(lambda x: x/60)
df_2020['durationH'] = df_2020['duration'].apply(lambda x: x/3600)
weekday = df_2020[['durationM', 'weekDay', 'weekDayCount']].groupby('weekDay').mean().reset_index().sort_values('weekDayCount')
weekday['isWeekend'] = weekday['weekDay'].apply(lambda x: 'yes' if x =='Saturday' or x == 'Sunday' else 'no')
weekday_plot = px.bar(weekday, x="weekDay", y="durationM", color="isWeekend", 
	title="Average number of minutes spent per day of week", labels={'durationM':'Avg. Minutes'}, height=350, width=450,
	color_discrete_map={
                "yes": style['color'], "no": style['second_color']
Beispiel #49
0
from pandas.tseries.offsets import BMonthEnd
from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory, GoodFriday
from datetime import datetime, time, timedelta
from py_vollib import black_scholes
import calendar
import pandas
import zipfile
from private import settings
import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline as interpol

years = ([0.0, 1/360, 1/52, 1/12, 2/12, 3/12, 6/12, 12/12])
functions_dict = {}

df_yields = pandas.read_csv(settings.path_to_libor_csv)
cols = ['date', 'ON', 'w1', 'm1', 'm2', 'm3', 'm6', 'm12']
df_yields.columns = cols
df_yields['date'] = pandas.to_datetime(df_yields['date'])
df_yields.set_index('date',inplace=True)

c = calendar.Calendar(firstweekday=calendar.SUNDAY)
offset = BMonthEnd()
entries = []

ratio = 100
lower_ul = 1
upper_ul = 1000000
dividend = 0
commissions = 1.25

interest = 0.0225
yeartradingdays = 252
Beispiel #50
0
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import log_loss

from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB  # Naive Bayes (Bernoulli)
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.ensemble import RandomForestClassifier  # random forest

train = pd.read_csv('C:\\Users\\49210\\Desktop\\22.csv')
test = pd.read_csv('C:\\Users\\49210\\Desktop\\22.csv')

train.head()
le = preprocessing.LabelEncoder()
crime_type_encode = le.fit_transform(train['EVENT_TYPE'])


MONTH = pd.to_datetime(train['EVENT_DATE']).dt.month
MONTH = pd.get_dummies(MONTH)  # one-hot month features (training set)
number = pd.get_dummies(train['事件数'])
SHENFEN = pd.get_dummies(train['ADMIN1'])  # one-hot province features (training set)
train_set = pd.concat([MONTH, number, SHENFEN], axis=1)
train_set['crime_type'] = crime_type_encode

#one-hot encode the features for the test samples
MONTH_t = pd.to_datetime(test['EVENT_DATE']).dt.month
MONTH_t = pd.get_dummies(MONTH_t)
number_t = pd.get_dummies(test['事件数'])
SHENFEN_t = pd.get_dummies(test['ADMIN1'])
test_set = pd.concat([MONTH_t, number_t, SHENFEN_t], axis=1)

x = train_set.loc[:,train_set.columns!='crime_type']
y = train_set['crime_type']
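#Hedged continuation sketch (editor's addition, not part of the original snippet):
#fit the imported classifiers on the one-hot features and compare their log-loss on the training data.
lr = LogisticRegression(max_iter=1000)
lr.fit(x, y)
print('logistic regression log-loss:', log_loss(y, lr.predict_proba(x)))

nb = BernoulliNB()
nb.fit(x, y)
print('Bernoulli naive Bayes log-loss:', log_loss(y, nb.predict_proba(x)))

rf = RandomForestClassifier(n_estimators=100)
rf.fit(x, y)
print('random forest log-loss:', log_loss(y, rf.predict_proba(x)))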
Beispiel #51
0
def start_db(db_dir: str = 'investments_database.db', start_year: int = 2005, target_funds: list = []):
    """Starts a SQLite database with 3 tables: daily_quotas (funds data), ibov_returns (ibovespa index data) and selic_rates (the base interest rate for the brazilian economy).\n 

    <b>Parameters:</b>\n
    db_dir (str): The path of the dabatabse file to be created. Defaults to 'investments_database.db', creating the file in the current working directory.\n
    start_year (int): Opitional (Defaults to 2005). Starting year for the data collection. . Can be use to reduce the size of the database.\n
    target_funds (list): Opitional (Defaults to []). List of target funds CNPJs. Only funds with CNPJs contained in this list will be included in the database. Can be used to radically reduce the size of the database. If none is specified, all funds will be included.\n

    <b>Returns:</b>\n
    Theres no return from the function.

   """
    ##STEP 1:
    #starts the new database
    print (f'creating SQLite database: {db_dir} \n')
    con = sqlite3.connect(db_dir)


    ##STEP 2:
    #downloads each report from the CVM website and pushes it to the daily_quotas table in the SQL database
    print('downloading daily reports from the CVM website... \n')

    #for each year between start_year and now
    for year in tqdm(range(start_year, datetime.date.today().year + 1), position = 0, leave=True): 
        for mth in range(1, 13): #for each month
            #loop structure for years equal or after 2017
            if year>=2017: 
                informe = cvm_informes(str(year), mth)

                try:
                    if target_funds: #if the target funds list is not empty, uses it to filter the result set
                        informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
                    #appends information to the sql database
                    informe.to_sql('daily_quotas', con , if_exists = 'append', index=False)
                except AttributeError:
                    pass
            
            elif year<2017: #loop structure to handle years before 2017 (they have a different file structure)
                #only executes the download function once every year to avoid duplicates (unique file for each year)       
                if mth == 12:
                    informe = cvm_informes(str(year), mth)

                    try:
                        if target_funds: #if the target funds list is not empty, uses it to filter the result set
                            informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
                        #appends information to the sql database
                        informe.to_sql('daily_quotas', con , if_exists = 'append', index=False)
                    except AttributeError:
                        pass

    #pushes target funds to sql for use when updating the database
    if target_funds:
        target_df = pd.DataFrame({'targets':target_funds})
        target_df.to_sql('target_funds', con , index=False)                    
    ##STEP 3:                    
    #creates an index on the daily_quotas table to make future select queries faster.
    #tradeoff: the database update process will be slower.
    print('creating sql index on "CNPJ_FUNDO", "DT_COMPTC" ... \n')
    index = '''
    CREATE INDEX "cnpj_date" ON "daily_quotas" (
        "CNPJ_FUNDO" ASC,
        "DT_COMPTC" ASC
    )'''

    cursor = con.cursor()
    cursor.execute(index)
    con.commit()

    cursor.close()

    
    ##STEP 4:
    #downloads cadastral information about the funds from CVM and pushes it to the database
    print('downloading cadastral information from cvm...\n')
    info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep = ';', encoding='latin1',
                           dtype = {'RENTAB_FUNDO': object,'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object, 'ENTID_INVEST': object,
                                    'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object, 'DIRETOR': object, 'CNPJ_CONTROLADOR': object,
                                    'CONTROLADOR': object}
                            )
    if target_funds:
        info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)]
    info_cad.to_sql('info_cadastral_funds', con, index=False)


    ##STEP 5:
    #downloads daily ibovespa prices from investing.com and pushes it to the database
    print('downloading ibovespa index prices from investing.com ...\n')
    today = (datetime.date.today() + datetime.timedelta(1)).strftime('%Y-%m-%d')
    ibov = pd.DataFrame(YahooFinancials('^BVSP').get_historical_price_data('1990-09-15', today, 'daily')['^BVSP']['prices'])
    ibov = ibov.drop(columns=['date', 'close']).rename(columns={'formatted_date':'date', 'adjclose':'close'}).iloc[:,[5,0,1,2,3,4]]
    ibov['date'] = pd.to_datetime(ibov['date'])
    ibov.columns = [i.capitalize() for i in ibov.columns] #capitalizes columns to keep consistency with previous format (investpy)
    ibov.to_sql('ibov_returns', con, index=False) 


    ##STEP 6:
    #downloads daily selic returns (the basic interest rate of the Brazilian economy)
    #from the Brazilian Central Bank and pushes it to the database
    print('downloading selic rates from the Brazilian Central Bank website...\n')
    selic = pd.read_json('http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(11))
    selic['data'] = pd.to_datetime(selic['data'], format = '%d/%m/%Y')
    selic['valor'] = selic['valor']/100 #calculates decimal rate from the percentual value

    #calculates asset "price" considering day 0 price as 1
    selic.loc[0,'price'] = 1 * (1 + selic.loc[0,'valor'])
    for i in range(1, len(selic)):
        selic.loc[i, 'price'] = selic.loc[i-1, 'price'] * (1 + selic.loc[i,'valor'])

    selic.rename(columns = {'data':'date', 'valor':'rate'}, inplace = True)
    selic.to_sql('selic_rates', con , index=False)  


    ##STEP 7:
    #creates a table with a log of the execution timestamps of the script
    print('creating the log table...\n')
    update_log = pd.DataFrame({'date':[datetime.datetime.now()], 'log':[1]})
    update_log.to_sql('update_log', con, if_exists = 'append', index=False)


    ##STEP 8
    #closes the connection with the database
    con.close()
    print('connection with the database closed! \n')

    print(f'Success: database created in {db_dir} !\n')
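#Hedged usage sketch (editor's addition; the CNPJ below is a placeholder, not real data):
#start_db(db_dir='investments_database.db', start_year=2010,
#         target_funds=['00.000.000/0001-00'])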
def mypreditions_to_database():
    with open("./utils/make_predicitions/X_train_o3", "rb") as X_scalar_file:
        X_train_scalar_o3 = pk.load(X_scalar_file)

    with open("./utils/make_predicitions/o3_api_model", "rb") as model_file:
        model_o3 = pk.load(model_file)

    with open("./utils/make_predicitions/X_train_pm25", "rb") as X_scalar_file:
        X_train_scalar_pm25 = pk.load(X_scalar_file)

    with open("./utils/make_predicitions/pm25_api_model", "rb") as model_file:
        model_pm25 = pk.load(model_file)

    try:
        predict_start_time = pd.to_datetime(input("-- 请输入预报起始时间: "))
        predict_time_len = int(input("-- 请输入预报时长: "))
        pollutant = input("-- 请输入预报污染物(O3/PM25): ")
        # predict_start_time = pd.to_datetime("20201214 13:00:00")
        # predict_time_len = 24
        # pollutant = "O3"
    except:
        print("-- 请按指定格式重新输入数据.")

    time_delta = timedelta(hours=predict_time_len)
    predict_end_time = predict_start_time + time_delta

    print("\n")
    print("-- 预报污染物: {}".format(pollutant))
    print("-- 预报时长{} to {}".format(predict_start_time, predict_end_time))

    df = pd.read_excel("./utils/make_predicitions/wuxi_meteos.xlsx", index_col="time_point")

    df = df[predict_start_time:predict_end_time]

    #write the predictions to the database
    mloutDB = MloutDB()

    if pollutant == "O3":
        features = ['precipitation', 'temperature', 'ws', 'wd', 'humidity', 'cloudrate', 'pressure', 'visibility',
                    'dswrf']

        X = df[df["position_name"] == "东亭"][features]
        X_std = (X - X_train_scalar_o3.min(axis=0)) / (X_train_scalar_o3.max(axis=0) - X_train_scalar_o3.min(axis=0))
        predictions = pd.Series(model_o3.predict(X_std), index=X_std.index)
        list_value = []
        for time_point, value in predictions.items():
            tempList = [time_point, '东亭', 2, value]
            list_value.append(tempList)
        count = mloutDB.insertMany(list_value)
        print(count)
    elif pollutant == "PM25":
        features = ['precipitation', 'temperature', 'ws', 'wd', 'humidity', 'cloudrate', 'pressure', 'visibility',
                    'dswrf', 'pm25']

        X = df[df["position_name"] == "东亭"][features]
        X_std = (X - X_train_scalar_pm25.min(axis=0)) / (
                    X_train_scalar_pm25.max(axis=0) - X_train_scalar_pm25.min(axis=0))
        list_value = []
        predictions = pd.Series(model_pm25.predict(X_std), index=X_std.index)
        for time_point, value in predictions.items():
            tempList = [time_point, '东亭', 2, value]
            list_value.append(tempList)
        count = mloutDB.insertMany(list_value)
        print(count)
Beispiel #53
0
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


def min_transform(x):
    #print(x)
    scaler = MinMaxScaler()
    #scaler.fit(x)
    return scaler.fit_transform([x])[0]


#-------------------- reading the dataset -----------------------------------
#

df_mobility = pd.read_csv('Global_Mobility_Report.csv',
                          encoding='iso-8859-1',
                          low_memory=False)

df_mobility['date'] = pd.to_datetime(df_mobility['date'])

df_mobility = df_mobility[df_mobility['date'] == pd.to_datetime('2020-02-29')]

df_mobility_temp = pd.DataFrame()

df_mobility_temp['country_region_code'] = df_mobility['country_region_code']
df_mobility_temp['retail_and_recreation'] = df_mobility[
    'retail_and_recreation_percent_change_from_baseline']
df_mobility_temp['grocery_and_pharmacy'] = df_mobility[
    'grocery_and_pharmacy_percent_change_from_baseline']
df_mobility_temp['parks'] = df_mobility['parks_percent_change_from_baseline']
df_mobility_temp['transit_stations'] = df_mobility[
    'transit_stations_percent_change_from_baseline']
df_mobility_temp['workplaces'] = df_mobility[
    'workplaces_percent_change_from_baseline']
Beispiel #54
0
def load_data():
    data = pd.read_csv(DATA_URL)
    data['tweet_created'] = pd.to_datetime(data['tweet_created'])
    return data
Beispiel #55
0
def deal_tradeinfo(tradeinfo, pricefname, n=10, intraday=False):
    """ 根据交易信号和数据文件,处理数据. 
    return data['high_profits', 'low_profit', 'exit_profit', 'period', 'return', 
                'entry_nbar_bests', 'entry_nbar_worsts', 'exit_nbar_bests',
                'exit_nbar_worsts', 'islong', 'entry_n', 'exit_n'
            ]
    """
    PRICE = 'close'
    data = pd.DataFrame(tradeinfo.ix[:, 0:2])
    price_data = csv2frame(pricefname)
    high_profits = []
    low_profits = []
    exit_profits = []

    periods = []
    entry_nbar_bests = []
    entry_nbar_worsts = []
    exit_nbar_bests = []
    exit_nbar_worsts = []
    islongs = []
    returns = []
    entry_Nlist = []
    exit_Nlist = []
    for i in range(len(data)):
        startt = tradeinfo.index[i]
        startpos = price_data.index.searchsorted(startt)
        endt = tradeinfo.ix[i, ['exit_datetime']][0]
        endpos = price_data.index.searchsorted(endt)
        tradingdf = price_data.truncate(before=tradeinfo.index[i], after=endt)

        onetrade = tradeinfo.ix[i, :]
        # high/low
        if len(tradingdf) > 1:
            hp = tradingdf.ix[:-1, :][PRICE].max()
            lp = tradingdf.ix[:-1, :][PRICE].min()
            t = tradingdf.ix[:-1, :][PRICE].tolist()
            t.append(float(onetrade['exit_price']))
            returns.append(max_return(t, onetrade['islong']))
        else:
            hp = tradingdf.ix[:, :][PRICE].max()
            lp = tradingdf.ix[:, :][PRICE].min()
            if onetrade['islong']:
                returns.append(
                    max(onetrade['entry_price'] - onetrade['exit_price'], 0))
            else:
                returns.append(
                    max(onetrade['exit_price'] - onetrade['entry_price'], 0))
        hp = onetrade['exit_price'] if onetrade['exit_price'] > hp else hp
        hp = onetrade['entry_price'] if onetrade['entry_price'] > hp else hp
        lp = onetrade['exit_price'] if onetrade['exit_price'] < lp else lp
        lp = onetrade['entry_price'] if onetrade['entry_price'] < lp else lp
        hp = hp - onetrade['entry_price']
        lp = lp - onetrade['entry_price']
        high_profits.append(hp if onetrade['islong'] else 0 - hp)
        low_profits.append(lp if onetrade['islong'] else 0 - lp)
        # exit
        ep = onetrade['exit_price'] - onetrade['entry_price']
        exit_profits.append(ep if onetrade['islong'] else 0 - ep)
        # period
        periods.append(endpos - startpos + 1)

        # nbar  todo
        entry_begin = startpos
        exit_begin = endpos + 1
        if intraday:
            day_entry_end = price_data.index.searchsorted(
                (pd.to_datetime(startt) +
                 dt.timedelta(days=1)).strftime("%Y-%m-%d"))
            day_exit_end = price_data.index.searchsorted(
                (pd.to_datetime(endt) +
                 dt.timedelta(days=1)).strftime("%Y-%m-%d"))
            entry_end = min(startpos + n + 1, day_entry_end)
            exit_end = min(endpos + 1 + n, day_exit_end)
        else:
            entry_end = startpos + n + 1
            exit_end = endpos + 1 + n
        entry_Nlist.append(entry_end - entry_begin)
        exit_Nlist.append(exit_end - exit_begin)
        islongs.append(onetrade['islong'])
        if onetrade['islong']:
            entry_nbar_bests.append(price_data.ix[entry_begin:entry_end,
                                                  PRICE].max() -
                                    onetrade['entry_price'])
            entry_nbar_worsts.append(price_data.ix[entry_begin:entry_end,
                                                   PRICE].min() -
                                     onetrade['entry_price'])
            exit_nbar_bests.append(price_data.ix[exit_begin:exit_end,
                                                 PRICE].max() -
                                   onetrade['entry_price'])
            exit_nbar_worsts.append(price_data.ix[exit_begin:exit_end,
                                                  PRICE].min() -
                                    onetrade['entry_price'])
        else:
            entry_nbar_bests.append(onetrade['entry_price'] -
                                    price_data.ix[entry_begin:entry_end,
                                                  PRICE].min())
            entry_nbar_worsts.append(onetrade['entry_price'] -
                                     price_data.ix[entry_begin:entry_end,
                                                   PRICE].max())
            exit_nbar_bests.append(onetrade['entry_price'] -
                                   price_data.ix[exit_begin:exit_end,
                                                 PRICE].min())
            exit_nbar_worsts.append(onetrade['entry_price'] -
                                    price_data.ix[exit_begin:exit_end,
                                                  PRICE].max())

    data['high_profit'] = high_profits
    data['low_profit'] = low_profits
    data['exit_profit'] = exit_profits
    data['period'] = periods
    data['return'] = returns
    data['entry_nbar_best'] = entry_nbar_bests
    data['entry_nbar_worst'] = entry_nbar_worsts
    data['exit_nbar_best'] = exit_nbar_bests
    data['exit_nbar_worst'] = exit_nbar_worsts
    data['islong'] = islongs
    data['entry_n'] = entry_Nlist
    data['exit_n'] = exit_Nlist
    print "Data Preprocessing Done!"
    #data.to_csv("d:\\rst.csv")
    return data
Beispiel #56
0
def update_db(db_dir: str = r'investments_database.db'):
    """Updates the database.\n

    <b>Parameters:</b>\n
    db_dir (str): The path of the database file to be updated. Defaults to 'investments_database.db'.\n

    <b>Returns:</b>\n
    There is no return from the function.

   """
    ##STEP 1
    #connects to the database
    print(f'connected with the database {db_dir}\n')
    con = sqlite3.connect(db_dir)


    ##STEP 2
    #calculates relevant date limits to the update process
    Cal=Brazil() #initializes the Brazilian calendar
    today = datetime.date.today()

    #queries the last update from the log table
    last_update = pd.to_datetime(pd.read_sql('select MAX(date) from update_log', con).iloc[0,0])

    last_quota = Cal.sub_working_days(last_update, 2) #date of the last published CVM report
    num_months = (today.year - last_quota.year) * 12 + (today.month - last_quota.month) + 1


    ##STEP 3
    #delete information that will be updated from the database tables
    print('deleting redundant data from the database... \n')
    tables = {'daily_quotas' : ['DT_COMPTC',last_quota.strftime("%Y-%m-01")],
              'ibov_returns' : ['Date',last_update.strftime("%Y-%m-%d")]}
    
    cursor = con.cursor()
    
    #sql delete statement to the database
    cursor.execute('delete from daily_quotas where DT_COMPTC >= :date', {'date': last_quota.strftime("%Y-%m-01")})
    cursor.execute('delete from ibov_returns where Date >= :date', {'date': last_update.strftime("%Y-%m-%d")})
        
    con.commit()  
    cursor.close()


    ##STEP 4
    #Pulls new data from CVM, investpy and the brazilian central bank
    #and pushes it to the database

    try:#tries to read target funds if they were specified when starting the database
        target_funds = pd.read_sql('select targets from target_funds', con).targets.to_list()
    except DatabaseError:
        target_funds = []
    
    print('downloading new daily reports from the CVM website...\n')
    # downloads the daily CVM report for each month between the last update and today
    for m in range(num_months+1): 
        data_alvo = last_quota + relativedelta(months=+m) 
        informe = cvm_informes(data_alvo.year, data_alvo.month)
        if target_funds:
            informe = informe[informe.CNPJ_FUNDO.isin(target_funds)]
        try:
            informe.to_sql('daily_quotas', con , if_exists = 'append', index=False)
        except AttributeError:
            pass 

    #downloads cadastral information about the funds from CVM and pushes it to the database
    print('downloading updated cadastral information from cvm...\n')
    info_cad = pd.read_csv('http://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv', sep = ';', encoding='latin1',
                           dtype = {'RENTAB_FUNDO': object,'FUNDO_EXCLUSIVO': object, 'TRIB_LPRAZO': object, 'ENTID_INVEST': object,
                                    'INF_TAXA_PERFM': object, 'INF_TAXA_ADM': object, 'DIRETOR': object, 'CNPJ_CONTROLADOR': object,
                                    'CONTROLADOR': object}
                            )
    if target_funds: #filters target funds if they were specified when building the database.
        info_cad = info_cad[info_cad.CNPJ_FUNDO.isin(target_funds)]
    info_cad.to_sql('info_cadastral_funds', con, if_exists='replace', index=False)

    #updates daily interest returns (selic)
    print('updating selic rates...\n')
    selic = pd.read_json('http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(11))
    selic['data'] = pd.to_datetime(selic['data'], format = '%d/%m/%Y')
    selic['valor'] = selic['valor']/100 #calculates decimal rate from the percentual value

    #calculates asset "price" considering day 0 price as 1
    selic.loc[0,'price'] = 1 * (1 + selic.loc[0,'valor'])
    for i in range(1, len(selic)):
        selic.loc[i, 'price'] = selic.loc[i-1, 'price'] * (1 + selic.loc[i,'valor'])

    selic.rename(columns = {'data':'date', 'valor':'rate'}, inplace = True)

    #filters only new data
    selic = selic[selic.date>=(last_update + datetime.timedelta(-1))]
    selic.to_sql('selic_rates', con , if_exists = 'append', index=False) 

    #updates ibovespa data
    print('updating ibovespa returns...\n')
    today = (datetime.date.today() + datetime.timedelta(1)).strftime('%Y-%m-%d')
    ibov = pd.DataFrame(YahooFinancials('^BVSP').get_historical_price_data(last_update.strftime('%Y-%m-%d'), today, 'daily')['^BVSP']['prices'])
    ibov = ibov.drop(columns=['date', 'close']).rename(columns={'formatted_date':'date', 'adjclose':'close'}).iloc[:,[5,0,1,2,3,4]]
    ibov['date'] = pd.to_datetime(ibov['date'])
    ibov.columns = [i.capitalize() for i in ibov.columns] #capitalizes columns to keep consistency with previous format (investpy)
    ibov.to_sql('ibov_returns', con , if_exists = 'append', index=False)

    ##STEP 5
    #updates the log in the database
    print('updating the log...\n')
    update_log = pd.DataFrame({'date':[datetime.datetime.now()], 'log':[1]})
    update_log.to_sql('update_log', con, if_exists = 'append', index=False)


    ##STEP 6
    #closes the connection with the database
    con.close()
    print('connection with the database closed!\n')

    print(f'database {db_dir} updated!\n')
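#Hedged usage sketch (editor's addition):
#update_db('investments_database.db')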
    for x1 in posts_json['data']:
        commentlst.append(x1.get('message').encode('utf-8').strip())
        datelst.append(x1.get('created_time'))
    next_page = ""
    try:
        next_page = posts_json['paging']['next']
        url = next_page
    except:
        break
    if not next_page: break
    print ("Count: %s,  Next Page: %s" % ( len(commentlst), url))

print ("\nGenerating JSON File")

df = pd.DataFrame({'comment': commentlst, 'dates': datelst})
df['dates'] = pd.to_datetime(df['dates'])
df['day_of_week'] = df['dates'].dt.day_name()
df['year'] = df['dates'].dt.year
df['month'] = df['dates'].dt.month
df['count'] = 1 

df.to_json('comment_data.json')
#Generate Sentimental Results
import requests 
import json
from google.cloud import language, exceptions

client = language.Client()
# export GOOGLE_APPLICATION_CREDENTIALS environment variable 

with open('comment_data.json') as data_file:
Beispiel #58
0
def tick2period(code, period, start, end):
    """ get tick data from tushare and resample to certain period data
    selected by input: period
    """
    import tushare as ts
    import numpy as np
    import pandas as pd
    dfout = None
    #get valid trade date
    valid_dates = ts.get_hist_data(code, start=start, end=end).index
    for date in valid_dates:
        #date=date.strftime('%Y-%m-%d')
        rng = pd.date_range(
            date + ' 9:30:00', date + ' 15:00', closed='right',
            freq=period)  #setup trade time grid by period selected
        sr = pd.Series(np.nan, index=rng)
        df = ts.get_tick_data(code, date=date)
        df.loc[df.time < '09:30:00',
               'time'] = '09:30:01'  #process open call auction
        df.loc[df.time > '15:00:00',
               'time'] = '14:59:59'  #process close call auction
        df['time'] = date + ' ' + df['time']
        df = df.rename(columns={'time': 'datetime'})
        df['datetime'] = pd.to_datetime(df['datetime'])
        df = df.set_index('datetime').sort()
        df2 = df['volume'].resample(period,
                                    how='sum',
                                    closed='right',
                                    label='right')
        df2, dummy = df2.align(sr, axis=0)  #align to standard time
        df3 = df2.truncate(before=date + ' 13:00:01', after=date + ' 15:00')
        df2 = df2.truncate(before=date + ' 9:30:01',
                           after=date + ' 11:30')  #remove non-trade time
        df2 = df2.append(df3).fillna(
            0)  #fill with 0 for period without valid deal
        df1 = df['price'].resample(period,
                                   how='ohlc',
                                   closed='right',
                                   label='right')
        df1, dummy = df1.align(sr, axis=0)  #align to standard time
        df3 = df1.truncate(before=date + ' 13:00:01', after=date + ' 15:00')
        df1 = df1.truncate(before=date + ' 9:30:01',
                           after=date + ' 11:30')  #remove non-trade time
        df1 = df1.append(df3)
        if np.isnan(
                df1.ix[0, 'close']
        ):  #use last day's close as initial price if there is no deal after open
            from datetime import timedelta, datetime
            aDay = timedelta(
                days=-10
            )  #get enough days to ensure at least one trading day is involved
            pre = (pd.to_datetime(date) + aDay).strftime('%Y-%m-%d')
            df1.ix[0, 'close'] = ts.get_hist_data(code, start=pre,
                                                  end=date).ix[-2, 'close']
        df1['close'].fillna(
            method='pad', inplace=True
        )  #use price before if there is no deal during current period
        df1.fillna(
            method='bfill', inplace=True, axis=1
        )  #use close as open,high,low if there  is no deal during current period
        df1['volume'] = df2.values
        dfout = pd.concat([dfout, df1])
    return dfout
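#Hedged usage sketch (editor's addition; the ticker code and dates are placeholders,
#and the snippet relies on a legacy tushare/pandas API):
#bars_5min = tick2period('600848', '5min', start='2017-01-03', end='2017-01-06')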
Beispiel #59
0
def get_course_assignments(course_id):

    sql = f"""select assign.*,sub.avg_score from
            (select ifnull(assignment_id, 0) as assignment_id ,name,assign_grp_name,grp_id,due_date,points_possible,group_points,weight,drop_lowest,drop_highest from
            (select a.id as assignment_id,a.assignment_group_id, a.local_date as due_date,a.name,a.points_possible from assignment as a  where a.course_id =%(course_id)s) as app right join
            (select id, name as assign_grp_name, id as grp_id, group_points, weight,drop_lowest,drop_highest from assignment_groups where course_id=%(course_id)s) as ag on ag.id=app.assignment_group_id) as assign left join
            (select distinct assignment_id,avg_score from submission where course_id=%(course_id)s) as sub on sub.assignment_id = assign.assignment_id
            """

    assignments_in_course = pd.read_sql(sql,
                                        conn,
                                        params={'course_id': course_id},
                                        parse_dates={'due_date': '%Y-%m-%d'})
    # No assignments found in the course
    if assignments_in_course.empty:
        logger.info('The course %s does not seem to have assignment data' %
                    course_id)
        return assignments_in_course

    assignments_in_course['due_date'] = pd.to_datetime(
        assignments_in_course['due_date'], unit='ms')
    assignments_in_course[['points_possible',
                           'group_points']] = assignments_in_course[[
                               'points_possible', 'group_points'
                           ]].fillna(0)
    assignments_in_course[['points_possible', 'group_points',
                           'weight']] = assignments_in_course[[
                               'points_possible', 'group_points', 'weight'
                           ]].astype(float)
    consider_weight = is_weight_considered(course_id)
    df2 = assignments_in_course[['weight', 'group_points',
                                 'grp_id']].drop_duplicates()
    hidden_assignments = are_weighted_assignments_hidden(course_id, df2)
    total_points = assignments_in_course['points_possible'].sum()
    # if an assignment group is weighted but has no assignments added yet, the assignment name will be empty, so handle that specific case
    if hidden_assignments:
        assignments_in_course['name'] = assignments_in_course['name'].fillna(
            assignments_in_course['assign_grp_name'] +
            ' Group Unavailable Assignments')
    assignments_in_course['towards_final_grade'] = assignments_in_course.apply(
        lambda x: percent_calculation(consider_weight, total_points,
                                      hidden_assignments, x),
        axis=1)
    assignments_in_course['calender_week'] = assignments_in_course[
        'due_date'].dt.week
    assignments_in_course['calender_week'] = assignments_in_course[
        'calender_week'].fillna(0).astype(int)
    min_week = find_min_week(course_id)
    max_week = assignments_in_course['calender_week'].max()
    week_list = [x for x in range(min_week, max_week + 1)]
    assignments_in_course['week'] = assignments_in_course[
        'calender_week'].apply(lambda x: 0
                               if x == 0 else week_list.index(x) + 1)
    assignments_in_course.sort_values(by='due_date', inplace=True)
    assignments_in_course['current_week'] = assignments_in_course[
        'calender_week'].apply(lambda x: find_current_week(x))
    assignments_in_course['due_date_mod'] = assignments_in_course[
        'due_date'].astype(str).apply(lambda x: x.split()[0])
    assignments_in_course['due_dates'] = pd.to_datetime(
        assignments_in_course['due_date_mod']).dt.strftime('%m/%d')
    assignments_in_course['due_dates'].replace('NaT', 'N/A', inplace=True)
    return assignments_in_course
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
import datetime

# Gather the data
file_path = "data/sales_708_3M.csv"
df = pd.read_csv(file_path)

# Parse date column, set index, drop redundant col
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

# Making sure there are no duplicated data
# If there are some duplicates we average the data during those duplicated days
df = df.groupby('Date', as_index=False)['Sales'].mean()

# Sorting the values
df.sort_values('Date', inplace=True)

# Set Params
data = df               # data

Y_var = 'Sales'         # target variable

lag = 7                 # number of lags used for modelling

LSTM_layer_depth = 100  # number of neurons in the LSTM layer

batch_size = 72         # size of the data sample used for gradient descent,