Example #1
def get_time_range_from_files(files):
    """Get time range from input files."""
    times = [re.findall(r'\d\d\d\d\d\d+', fname) for fname in files]
    t_1 = [t1 for t1, t2 in times]
    t_2 = [t2 for t1, t2 in times]
    t_min, t_max = min(t_1), max(t_2)
    #times = np.array(times, 'i4').reshape(len(times), 2)
    #t_min, t_max = str(times.min()), str(times.max())
    y1, m1 = int(t_min[:4]), int(t_min[4:6])
    y2, m2 = int(t_max[:4]), int(t_max[4:6])
    # seasonal
    if len(t_min) > 6 and len(t_max) > 6:
        y1, m1 = get_season(y1, m1, get_month=1)
        y2, m2 = get_season(y2, m2, get_month=1)
        start = '%d/%d/%d' % (y1, m1, 15)
        end = '%d/%d/%d' % (y2, m2, 15)
        return pn.DateRange(start,
                            end,
                            offset=pn.datetools.DateOffset(months=3))
    # monthly
    else:
        print t_min
        start = '%d/%d/%d' % (y1, m1, 15)
        end = '%d/%d/%d' % (y2, m2, 15)
        return pn.DateRange(start,
                            end,
                            offset=pn.datetools.DateOffset(months=1))
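DateRange was deprecated in pandas 0.8 (in favor of DatetimeIndex) and later removed; pd.date_range with a DateOffset is the usual modern replacement. A minimal sketch of the monthly and seasonal ranges above, with illustrative dates:

import pandas as pd

# mid-month anchors, stepping one month or one season (three months) at a time
monthly = pd.date_range('2000-01-15', '2000-12-15', freq=pd.DateOffset(months=1))
seasonal = pd.date_range('2000-01-15', '2000-12-15', freq=pd.DateOffset(months=3))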
Example #2
    def analysis(self, year, month, force=False):
        if year and month:
            self.range = pandas.DateRange(datetime.datetime(year, month, 1, 12),
                                          periods=1,
                                          offset=pandas.DateOffset(months=1))
        else:
            self.range = pandas.DateRange(self.start_date,
                                          end=self.end_date,
                                          offset=pandas.DateOffset(months=1))
        mr_dict = self.map_reduce(force=force)
        return self.cohort_analysis(mr_dict)
Example #3
def signups(db, end=None, period='hours', start=None):
    if not end: end = now()
    end = datetime.datetime.fromtimestamp(end)
    end = end.replace(minute=0, second=0, microsecond=0)
    if not start:
        start = end - pandas.DateOffset(hours=48)
    else:
        start = datetime.datetime.fromtimestamp(start)
        start = start.replace(minute=0, second=0, microsecond=0)

    if period != 'hours': start = start.replace(hour=8)

    hourly = pandas.DateRange(start=start, end=end, offset=pandas.DateOffset(**{period: 1}))
    start_epoch = datetime_to_int(hourly[0])
    end_epoch = datetime_to_int(hourly[-1] + hourly.offset)
    spec = {'created': {'$gt': start_epoch, '$lt': end_epoch}}

    def group_data(cursor):
        contact_times = sorted([datetime.datetime.utcfromtimestamp(c['created']) for c in cursor])
        data = pandas.Series(1, contact_times)
        data = pandas.Series(data.groupby(hourly.asof).sum())
        return data.reindex(index=hourly, fill_value=0)

    cursor = db.contact_log.find(spec, {'created': True})
    contacts = group_data(cursor)

    spec.update({'user_created': {'$exists': True}})
    cursor = db.contact_log.find(spec, {'created': True})
    created_contacts = group_data(cursor)

    return {  'index': [time.mktime(x.timetuple()) for x in hourly.tolist()]
            , 'total': contacts.values.tolist()
            , 'active': created_contacts.values.tolist()
            , 'ratio': (created_contacts / contacts).values.tolist()
            }
Example #4
def load_time_series(symbol,
                     start_date=None,
                     end_date=None,
                     downsample_days=1):
    log.info("loading %s for %s to %s" % (symbol, start_date, end_date))
    filename = "%s.csv" % symbol
    if not os.path.exists(filename):
        fetch.fetch_data(symbol)
    data = pandas.read_csv(filename, parse_dates=True, index_col=0)
    data = data.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
    data = data.rename(columns={"Adj Close": symbol})
    data = data.sort_index()
    if start_date is not None and data.index[0] > start_date:
        log.warning("no data for %s before %s" % (symbol, data.index[0]))
        return None

    data = data.truncate(before=start_date, after=end_date)
    log.info("%d rows after truncating" % len(data))

    # downsample if necessary
    if downsample_days > 1:
        drange = pandas.DateRange(start_date,
                                  end_date,
                                  offset=downsample_days * datetools.day)
        grouped = data.groupby(drange.asof)
        means = grouped.mean()
        log.info("%d rows after downsampling" % len(means))
        return means
    else:
        return data
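For the downsampling step, modern pandas expresses the groupby-on-asof pattern directly with resample. A minimal sketch on synthetic data (the index, column name, and 5-day bucket are illustrative):

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=30, freq='D')
data = pd.DataFrame({'AAPL': np.arange(30.0)}, index=idx)
means = data.resample('5D').mean()  # one mean per 5-day bucket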
Example #5
    def fetch_price_frame(self, startdate, enddate):

        dts = ps.DateRange(start=startdate, end=enddate, offset=bday)

        dividends = []
        prices = []

        for dt in dts:
            irs = InterestRate.objects.filter(date__lte=dt,
                                              investment=self).order_by('-date')
            if len(irs) == 0:
                return None
            ir = irs[0]

            prices.append(Decimal(1.0))
            dividends.append(ir.annualrate / 365)

        data = {
            'price': prices,
            'dividend': dividends,
        }
        df = ps.DataFrame(data, index=dts)

        return df
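The bday offset above walks business days only; in modern pandas the equivalent Monday-to-Friday index comes from pd.bdate_range. A sketch with illustrative dates:

import pandas as pd

dts = pd.bdate_range('2020-01-01', '2020-01-31')  # weekdays only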
Example #6
def table_33():
    path = os.path.join(data_path, 'Table3.3.data.txt')
    sep = r'\s+'

    lines = [re.split(sep, l.strip()) for l in open(path)]

    y_data = []
    f_data = []
    saw_f = False
    for line in lines:
        if line[0] == 'Y':
            continue
        elif line[0] == 'F':
            saw_f = True
            continue

        # drop year
        if saw_f:
            f_data.extend(line[1:])
        else:
            y_data.extend(line[1:])

    y_data = np.array(y_data, dtype=float)
    f_data = np.array(f_data, dtype=float)

    dates = pn.DateRange(datetime(1975, 1, 1),
                         periods=len(y_data),
                         timeRule='Q@MAR')

    Y = pn.Series(y_data, index=dates)
    F = pn.Series(f_data, index=dates)

    return Y, F
Example #7
def download_hist(sleept=10):
    """
    download historical data from 1999/01/31 to 2014/05/31
    """
    for i in pd.DateRange(dt.datetime(1999, 1, 1),
                          dt.datetime(2014, 6, 1),
                          offset=pd.datetools.MonthEnd()):
        download(i)
        time.sleep(sleept)
Example #8
def sse_hist(start=dt.datetime(2006, 8, 7), end=dt.datetime.today()):
    fail_list = []
    for date in pd.DateRange(start, end):
        print date
        fail_list.append(sse_lhb(date))
        time.sleep(10)
    print [i for i in fail_list if i]
Example #9
def _trade_dates(dt_start, dt_end, s_period):
    '''
    @summary: Generate dates on which we need to trade
    @param dt_start: Start date
    @param dt_end: End date
    @param s_period: Time rule string (a 'BW' prefix means biweekly)
    '''

    ldt_timestamps = getNYSEdays(dt_start, dt_end, dt.timedelta(hours=16))


    # Use pandas reindex method instead.
    # Note: dates serve as both index and values; we select based on the index
    # but return the values, since the result is a numpy array of datetimes
    # rather than a pandas-specific type.
    ts_dates = pd.TimeSeries(index=ldt_timestamps, data=ldt_timestamps)

    # These are the dates we want
    if s_period[:2] == 'BW':
        # special case for biweekly

        dr_range = pd.DateRange(dt_start, dt_end,
                                timeRule=s_period[1:])
        dr_range = np.asarray(dr_range)
        li_even = np.array(range(len(dr_range)))
        dr_range = dr_range[li_even[li_even % 2 == 0]]
    else:
        dr_range = pd.DateRange(dt_start, dt_end,
                                timeRule=s_period)
        dr_range = np.asarray(dr_range)


    # Warning: we MUST copy the date range; if we modify it, it will be
    # returned in its modified form the next time we use it.
    dr_range = np.copy(dr_range)
    dr_range += pd.DateOffset(hours=16)
    ts_dates = ts_dates.reindex(dr_range, method='bfill')
    ldt_dates = ts_dates[ts_dates.notnull()].values

    #Make unique
    sdt_unique = set()
    ldt_dates = [x for x in ldt_dates
                 if x not in sdt_unique and not sdt_unique.add(x)]

    return ldt_dates
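The even-index filtering above works around the old timeRule strings having no biweekly option; modern frequency aliases accept a multiplier, so the 'BW' special case collapses to one call. A sketch (the dates and Friday anchor are illustrative):

import pandas as pd

# every second Friday between the two dates
biweekly = pd.date_range('2020-01-01', '2020-06-30', freq='2W-FRI')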
Example #10
def download_hist(sleept=10):
    """
    download historical data from 2014-06-01
    """
    for i in pd.DateRange(dt.datetime(2014, 6, 1),
                          dt.datetime.today(),
                          offset=pd.datetools.MonthEnd()):
        downloader_new(i)
        time.sleep(sleept)
Example #11
    def discrete_boxcar_average(self, seconds=1):
        """Computes a discrete boxcar average for the DataFrame"""
        date_range = pandas.DateRange(self.data.index[0],
                                      self.data.index[-1],
                                      offset=pandas.datetools.Second(seconds))
        grouped = self.data.groupby(date_range.asof)
        subsampled = grouped.mean()

        return LightCurve(subsampled, self.header.copy())
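In modern pandas the same discrete boxcar average is a resample followed by mean. A minimal sketch on synthetic data (the index, column name, and 1-second window are illustrative):

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=600, freq='100ms')
lc = pd.DataFrame({'flux': np.random.rand(600)}, index=idx)
subsampled = lc.resample('1s').mean()  # 1-second boxcar average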
Example #12
    def collect_year(self, year=None, station_name=None, exact_station=False,
                     location_WMO=None, location_WBAN=None, country=None,
                     state=None, internet_connected=True):
        """ Process a request for data for a given year, optionally at a given
        location.

        Inputs:
        - year, int. If no year is passed, choose the current one.
        - station_name, str. (Part of) name of the station to collect data at.
        The station names are searched for in the ish-history txt file stored
        in self.location_db.
        - exact_station, bool. If false, all station names are searched and the
          ones containing the string station_name are selected.
        - location WMO code and/or WBAN code, int, int. If no location is
        selected, collect the yearly data for all locations.

        Output:
        - pandas data structure: 2D (DataFrame) if only one location is
        requested, 3D (Panel) if multiple locations are requested
        """
        if year is None:
            year = datetime.datetime.today().year
            warnings.warn("No year was provided: using the current one (%s)" 
                          % year)
            
        no_location = (location_WMO is None and location_WBAN is None
                       and station_name is None and country is None and
                       state is None)
        if no_location:
            # All data for the year, at all locations, was requested. Returns
            # a panel if it can fit in memory, and None if not. In the latter
            # case, the data files are still stored locally.
            return collect_year(year)
        else:
            filtered = search_station(self.location_db, self.location_dict,
                                      station_name, exact_station, location_WMO, location_WBAN,
                                      country, state)
            if len(filtered) == 1:
                result = collect_year_at_loc(year, location_WMO=filtered['USAF'][0],
                                             location_WBAN=filtered['WBAN'][0],
                                             internet_connected=internet_connected)
            else:
                data = {}
                for _, layer in filtered.iterrows():
                    df = collect_year_at_loc(year, layer['USAF'], layer['WBAN'],
                                             internet_connected=internet_connected)
                    # reindex over the entire year in case there are missing values
                    if df is None:
                        continue
                    df = df.reindex(pandas.DateRange(start='1/1/%s' % year,
                                                     end='12/31/%s' % year,
                                                     offset=pandas.datetools.day))
                    key = "%s-%s" % (layer['USAF'], layer['WBAN'])
                    data[key] = df
                result = pandas.Panel(data)
            return result
Example #13
    def dataframe(self):

        bs = self.budgets
        ts = self.timeseries

        if len(ts) == 0:
            return None

        for c in self.child.all():
            cts = c.timeseries
            if cts:
                ts = ts.combine(cts, np.sum, 0)

        if bs:
            bts = bs[0].timeseries
            if len(bs) > 1:
                for b in range(1, len(bs)):
                    bts = bts.combine(bs[b].timeseries, np.sum, 0)
        else:
            bts = ps.TimeSeries([0], index=[ts.index[0]])

        startdate = min(ts.index[0], bts.index[0])
        enddate = max(ts.index[-1], bts.index[-1])

        dates = ps.DateRange(startdate, enddate, offset=ps.DateOffset(days=1))

        bts = bts.reindex(dates)
        ts = ts.reindex(dates)

        df = ps.DataFrame({'actual': ts, 'budget': bts})
        df = df.fillna(0)
        df['actual'] = df['actual'].apply(Decimal)
        df['budget'] = df['budget'].apply(Decimal)
        df['vsbudget'] = (df['actual'] - df['budget']).apply(Decimal)

        # these columns are constant across the whole frame
        df['type'] = self.account_type
        df['account'] = self.guid
        df['depth'] = self.depth

        return df
Example #14
    def _generateCashflow(self):
        monthly_offset = 12 / self.frequency
        #offset = int(365.2425/self.frequency)
        dr = pd.DateRange(self.startdate,
                          self.maturitydate,
                          offset=pdt.DateOffset(months=monthly_offset))
        cf = Cashflow(cf_times=dr,
                      cf_amounts=self.couponrate * 100 / self.frequency)
        cf.cf[0] = -self.price
        cf.cf[cf.cf.count() - 1] += 100.0
        return cf
Example #15
def contacts_per_day(db, end=None):
    # evaluate the default timestamp at call time, not at import time
    if end is None:
        end = now()
    end = datetime.datetime.fromtimestamp(end)
    end = end.replace(hour=12, minute=0, second=0, microsecond=0)
    hourly = pandas.DateRange(end=end,
                              offset=pandas.DateOffset(hours=24),
                              periods=120)
    contacts = db.contact_log.find(
        {'created': {'$gt': time.mktime(hourly[0].timetuple())}},
        {'created': True})
    contact_times = sorted([datetime.datetime.utcfromtimestamp(c['created']) for c in contacts])
    data = pandas.Series(1, contact_times)
    data = pandas.Series(data.groupby(hourly.asof).sum())

    return {  'times': [time.mktime(x.timetuple()) for x in data.index.tolist()]
            , 'values': data.values.tolist()
            }
Example #16
def parse_lutkepohl_data(path): # pragma: no cover
    """
    Parse data files from Lutkepohl (2005) book

    Source for data files: www.jmulti.de
    """

    from collections import deque
    from datetime import datetime
    import pandas
    import pandas.core.datetools as dt
    import re

    regex = re.compile(r'<(.*) (\w)([\d]+)>.*')
    lines = deque(open(path))

    to_skip = 0
    while '*/' not in lines.popleft():
        to_skip += 1

    while True:
        to_skip += 1
        line = lines.popleft()
        m = regex.match(line)
        if m:
            year, freq, start_point = m.groups()
            break

    data = np.genfromtxt(path, names=True, skip_header=to_skip+1)

    n = len(data)

    # generate the corresponding date range (using pandas for now)
    start_point = int(start_point)
    year = int(year)

    offsets = {
        'Q' : dt.BQuarterEnd(),
        'M' : dt.BMonthEnd(),
        'A' : dt.BYearEnd()
    }

    offset = offsets[freq]

    inc = offset * (start_point - 1)
    start_date = offset.rollforward(datetime(year, 1, 1)) + inc

    date_range = pandas.DateRange(start_date, offset=offset, periods=n)

    return data, date_range
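The business-period offsets used here live on in modern pandas under pandas.tseries.offsets, and pd.date_range accepts them in place of DateRange. A sketch for the quarterly case (the 1960 start is illustrative):

import datetime
import pandas as pd
from pandas.tseries.offsets import BQuarterEnd

offset = BQuarterEnd()
start_date = offset.rollforward(datetime.datetime(1960, 1, 1))
date_range = pd.date_range(start_date, periods=8, freq=offset)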
Example #17
    def _execute(self, period):
        def actions_per_user_per_day():
            map1 = """
                function() {
                    date = new Date((this.created - 12*3600) * 1000);
                    day = Date.UTC(date.getFullYear(), date.getMonth(), date.getDate(), 12);
                    emit({name: this.user_name, date: day/1000}, 1);
                }"""

            reduce = """
                function(key, values) {
                    var total=0;
                    for (var i=0; i < values.length; i++) {
                        total += values[i];
                    }
                    return total;
                }"""

            mr1_name = 'mr.actions_per_user_per_day'
            mr1 = self.db.mdb[mr1_name]
            latest = mr1.find_one(sort=[('_id.date', -1)])['_id']['date']
            # The following line performs incremental map reduce, but depends on mongodb version >= 1.8
            return self.db.ActionLog._col.map_reduce(
                map1,
                reduce,
                mr1_name,
                merge_output=True,
                query={'created': {
                    '$gt': latest - 24 * 3600
                }})

        mr_col = actions_per_user_per_day()
        mr_col.ensure_index('_id.date')
        offset = pandas.DateOffset(days=period)
        start = newhive.utils.time_u(
            mr_col.find_one(sort=[('_id.date', 1)])['_id']['date'])
        index = pandas.DateRange(start=start + offset,
                                 end=datetime.datetime.now(),
                                 offset=pandas.DateOffset(days=1))

        def users_active_on(date):
            cursor = mr_col.find(
                {'_id.date': dates_to_spec(date - offset, date)})
            return len(cursor.distinct('_id.name'))

        data = pandas.DataFrame(
            index=index,
            data={'Active{}'.format(period): index.map(users_active_on)})
        return data
Example #18
def downloadBTCDateRange(start, end):
    for d in pd.DateRange(start, end):
        addDateToPayload(d)
        req2 = s.post(BTCurl, headers=headers, data=payload)

        res = req2.headers.get('content-disposition')
        if pd.isnull(res):
            print "no data on {}".format(d)
        elif 'filename' in res:
            filename = res.split('=')[1]
            print filename
            filepath = '{}/BTC-OpenPositions/{}'.format(outdir, filename)
            with open(filepath, "wb") as f:
                f.write(req2.content)
        # pause 5 secs
        pause()
Example #19
def foo():
    path = os.path.join(data_path, 'Table11.1.data.txt')
    sep = r'\s+'

    lines = [re.split(sep, l.strip()) for l in open(path)]

    datad = {}
    for start in [0]:
        name = lines[start][0]
        time_rule = lines[start + 1][0]
        start_date = lines[start + 2][0]
        data = np.concatenate(lines[start + 3:start + 9]).astype(float)
        dates = pn.DateRange(start_date, periods=len(data), timeRule=time_rule)
        datad[name] = pn.Series(data, index=dates)

    return pn.DataFrame(datad)
Example #20
def parse_table_22():
    path = os.path.join(data_path, 'Table2.2.data.txt')
    sep = r'\s+'

    lines = [re.split(sep, l.strip()) for l in open(path)]

    data = []
    for line in lines:
        # drop year
        data.extend(line[1:])

    data = np.array(data, dtype=float) / 100
    dates = pn.DateRange(datetime(1975, 1, 1),
                         periods=len(data),
                         timeRule='EOM')

    return pn.Series(data, index=dates)
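The 'EOM' timeRule denotes calendar month ends; in modern pandas that is the 'M' frequency alias ('ME' from pandas 2.2 on). A sketch:

import pandas as pd
from datetime import datetime

dates = pd.date_range(datetime(1975, 1, 1), periods=12, freq='M')  # month ends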
Example #21
    def budgetpanel(self, startdate, enddate):

        analysis_dates = ps.DateRange(startdate,
                                      enddate,
                                      offset=ps.DateOffset(days=1))

        data = {}
        for a in self:
            if not isinstance(a, str):
                df = a.dataframe
                if df is not None:
                    df = df.reindex(analysis_dates)
                    data[a] = df

        p = ps.Panel(data, major_axis=analysis_dates)

        return p
Example #22
def loadfrets(model, start_date, stop_date):
    '''
    load axioma factor returns data over a given window
    
    Parameters:
    model : str
        name of axioma risk model
    start_date : datetime
        start date of factor returns
    stop_date : datetime
        stop date of factor returns

    returns DataFrame
    '''

    tmp = pandas.DataFrame()
    for dt in pandas.DateRange(start_date, stop_date):
        _tmp = loadfret(model, dt)
        tmp = tmp.append(_tmp, ignore_index=True)

    return tmp
Example #23
def backfill_b_alphas2(startdate=datetime.datetime(2005, 1, 1),
                       enddate=datetime.datetime.today() -
                       pandas.datetools.day,
                       ncpus=8):
    """
    this function is to backfill the bucket signals defined in the production env for 1 day.
    """
    print "going to generate production def. bucket alphas"
    job_server = pp.Server(ncpus)
    jobs = []
    for date in pandas.DateRange(startdate,
                                 enddate,
                                 offset=pandas.datetools.day):
        jobs.append(
            job_server.submit(backfill_b_1d, (date, ), (),
                              ('pandas', 'datetime')))

    for job in jobs:
        job()
    job_server.print_stats()
    job_server.destroy()
Example #24
def create_full_record(p_series_list):
    '''Tiles records together.

    Creates a daily record spanning the minimum to maximum dates in the
    series list.
    '''

    #--find the min and max dates
    min_date = datetime(year=3012, month=1, day=1)
    max_date = datetime(year=1512, month=1, day=1)
    for p in p_series_list:
        if p.index.min() < min_date:
            min_date = p.index.min()
        if p.index.max() > max_date:
            max_date = p.index.max()
    #print min_date,max_date

    #--create new pandas date range inclusive of the whole record
    d_range = pandas.DateRange(start=min_date,
                               end=max_date,
                               offset=pandas.core.datetools.day)
    full_series = pandas.TimeSeries(np.ones(len(d_range)) * np.nan, d_range)
    #print d_range

    for dt, val in full_series.iteritems():
        #--try to find an entry in one of the series for this day
        #  (note: x != np.nan is always True, so test with np.isnan instead)
        v = np.nan
        for p in p_series_list:
            #print p.head()
            try:
                if not np.isnan(p[dt]):
                    v = p[dt]
            except KeyError:
                pass
            if not np.isnan(v):
                full_series[dt] = v
                #break
    return full_series
Example #25
    def mdf_evalto(self, parameter_s=""):
        """
        Advances the current context to the end date and return a pandas
        dataframe of nodes evaluated on each timestep.

        %mdf_evalto <end_date> [nodes...]

        eg: %mdf_evalto 2020-01-01 <my node 1> <my node 2>
        """
        args = tokenize(parameter_s)

        cur_ctx = _get_current_context()
        root_ctx = cur_ctx.get_parent() or cur_ctx
        end_date, nodes = args[0], args[1:]
        end_date = _parse_datetime(end_date, self.shell.user_global_ns,
                                   self.shell.user_ns)
        nodes = map(
            lambda x: eval(x, self.shell.user_global_ns, self.shell.user_ns),
            nodes)

        df_ctx = root_ctx
        if len(nodes) > 0 and isinstance(nodes[-1], (dict, list, tuple)):
            shift_sets = _get_shift_sets(args[-1], nodes.pop())
            assert len(
                shift_sets) <= 1, "Only one shift set allowed for %mdf_evalto"
            if shift_sets:
                unused, shift_set = shift_sets[0]
                df_ctx = df_ctx.shift(shift_set=shift_set)

        df_builder = DataFrameBuilder(nodes, filter=True)
        date_range = pd.DateRange(cur_ctx.get_date(),
                                  end_date,
                                  offset=self.__timestep)
        for dt in date_range:
            root_ctx.set_date(dt)
            df_builder(dt, df_ctx)
        return df_builder.get_dataframe(df_ctx)
Example #26
def downloadBTCDateRange(start, end):
    fns = []
    for d in pd.DateRange(start, end):
        addDateToPayload(d)
        req2 = s.post(BTCurl, headers=headers, data=payload)

        res = req2.headers.get('content-disposition')
        if pd.isnull(res):
            print "no data on {}".format(d)
        elif 'filename' in res:
            filename = res.split('=')[1]
            print filename
            filepath = '{}/BTC-OpenPositions/{}'.format(outdir, filename)
            foldern = outdir + '/BTC-OpenPositions'
            if not os.path.exists(foldern):
                os.makedirs(foldern)
            print filepath
            with open(filepath, 'wb') as f:
                f.write(req2.content)
            fns.append(filepath)
        # pause 5 secs
        time.sleep(5)

    return fns
Example #27
def main(startdate=None, enddate=None, sleeptime=0):
    """

    """
    if startdate is None:
        startdate = dt.datetime.today()

    if enddate is None:
        enddate = dt.datetime.today()

    for date in pd.DateRange(startdate, enddate, offset=pd.datetools.day):
        print date.strftime('%Y-%m-%d')
        fn = download_report(date)
        df = parser(fn)
        sql_del = """delete from %(table)s where datadate='%(date)s'""" % {
            'table': TABLE,
            'date': date.strftime('%Y-%m-%d')
        }
        dbo.cursor.execute(sql_del)
        dbo.commit()
        print "len(df) = %s" % len(df)
        print "start to upload to table %s" % TABLE
        sql_io.write_frame(df, TABLE, if_exists='append', bulk='off')
        time.sleep(sleeptime)
Example #28
def run(start_date, end_date=None, forceall=False):

    if end_date is None:
        end_date = start_date

    bd_list = pd.DateRange(start_date, end_date, offset=pd.datetools.bday)
    d_str_l = [re.search(r'\d+', i).group() for i in os.listdir(output_dir)]

    for bd in bd_list:

        print bd
        d_str = bd.strftime('%Y%m%d')

        if (not forceall) and (d_str in d_str_l):
            continue

        c1 = requests.get(url_base % d_str).content
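        # the byte string below is the UTF-8 encoding of a Chinese notice
        # (roughly "location has been changed") that the server returns in
        # place of a zip file when the report has moved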
        if '\xe4\xbd\x8d\xe7\xbd\xae\xe5\xb7\xb2\xe6\x9b\xb4\xe6\x94\xb9' in c1:
            continue

        with open(output_dir + '/DTOP_O_%s.zip' % d_str, 'wb') as f:
            f.write(c1)

        time.sleep(60)
Example #29
    def _magic_dataframe(self, parameter_s, widepanel=False, single_df=True):
        """Implementation for magic_dataframe and magic_widepanel"""
        # the first two arguments are dates, and after that it's a list of nodes
        # with some optional keyword args, ie %mdf_df <start> <end> node, node, node, shifts=[{x:1}, {x:2}]
        args = arg_names = tokenize(parameter_s)
        args = [
            _try_eval(x, self.shell.user_global_ns, self.shell.user_ns)
            for x in args
        ]
        args = list(zip(arg_names, args))

        start = None
        if len(args) > 0:
            arg_name, arg = args.pop(0)
            start = _parse_datetime(arg_name, self.shell.user_global_ns,
                                    self.shell.user_ns)

        end = None
        if len(args) > 0:
            arg_name, arg = args.pop(0)
            end = _parse_datetime(arg_name, self.shell.user_global_ns,
                                  self.shell.user_ns)

        # the final argument can be the number of processes to use
        num_processes = 0
        if len(args) > 0:
            arg_name, arg = args[-1]
            if isinstance(arg, basestring) and arg.startswith("||"):
                arg_name, arg = args.pop()
                num_processes = int(arg[2:])

        # the next to last parameter may be a shift set or list of
        # shift sets.
        has_shifts = False
        shift_sets = [{}]  # always have at least one empty shift set
        shift_names = ["_0"]
        arg_name, arg = args[-1] if len(args) > 0 else (None, None)
        if not isinstance(arg, MDFNode):
            arg_name, arg = args.pop()
            named_shift_sets = _get_shift_sets(arg_name, arg)
            if named_shift_sets:
                shift_names, shift_sets = zip(*named_shift_sets)
                has_shifts = True

        # any remaining arguments are the nodes
        nodes = []
        node_var_names = []
        for arg_name, node in args:
            assert isinstance(node, MDFNode), "%s is not a node" % arg_name
            nodes.append(node)
            node_var_names.append(arg_name)

        curr_ctx = _get_current_context()
        ctxs = [None] * len(nodes)

        if not nodes:
            # get the selected nodes from the viewer
            if _viewer_imported:
                selected = viewer.get_selected()
                ctxs, nodes = zip(*selected)
                for i, (ctx, node) in enumerate(selected):
                    assert ctx.is_shift_of(curr_ctx), \
                        "selected node '%s' is not in the current context" % node.name

                    # replace any contexts that are simply the current context with None
                    # so that shifting works correctly
                    if ctx is curr_ctx:
                        ctxs[i] = None

        # if there are shifts then all the contexts have to be None otherwise the
        # shifts won't work correctly. This could be relaxed later if it causes problems,
        # but for now this makes the code simpler.
        if has_shifts:
            assert np.array([x is None for x in ctxs]).all(), \
                "Can't apply shifts when contexts are explicitly specified"

        # list of df_builders, one per node or group of nodes
        callbacks = []
        df_builders = []
        if widepanel or not single_df:
            # build multiple dataframes
            for node, ctx in zip(nodes, ctxs):
                if ctx is None:
                    df_builder = DataFrameBuilder([node], filter=True)
                else:
                    df_builder = DataFrameBuilder([node],
                                                  contexts=[ctx],
                                                  filter=True)
                df_builders.append(df_builder)
        else:
            # build a single dataframe
            if np.array([x is None for x in ctxs]).all():
                df_builder = DataFrameBuilder(nodes, filter=True)
            else:
                df_builder = DataFrameBuilder(nodes,
                                              contexts=ctxs,
                                              filter=True)
            df_builders.append(df_builder)

        # add all the dataframe builders to the callbacks
        callbacks.extend(df_builders)

        root_ctx = curr_ctx.get_parent() or curr_ctx
        date_range = pd.DateRange(start, end, offset=self.__timestep)

        # Add a progress bar to the callbacks
        callbacks.append(ProgressBar(date_range[0], date_range[-1]))

        shifted_ctxs = run(date_range,
                           callbacks,
                           ctx=root_ctx,
                           shifts=shift_sets,
                           num_processes=num_processes)

        if not has_shifts:
            shifted_ctxs = [root_ctx]

        # when returning a list of results because multiple shifts have been specified
        # use a named tuple with the items being the names of the shifts
        tuple_ctr = tuple
        if has_shifts:
            # Currying hell yeah
            tuple_ctr = partial(ShiftedResultsTuple, shift_names)

        if widepanel:
            wps = []
            for shift_name, shift_set, shifted_ctx in zip(
                    shift_names, shift_sets, shifted_ctxs):
                wp_dict = {}
                for node_var_name, df_builder in zip(node_var_names,
                                                     df_builders):
                    wp_dict[node_var_name] = df_builder.get_dataframe(
                        shifted_ctx)
                wp = pd.WidePanel.from_dict(wp_dict)

                if has_shifts:
                    wp = WidePanelWithShiftSet(wp, shift_name, shift_set)
                wps.append(wp)

            if len(wps) == 1:
                return wps[0]
            return tuple_ctr(*wps)

        # build a list of lists of dataframes
        # [[dfs for one shift set], [dfs for next shift set], ...]
        df_lists = []
        for shift_name, shift_set, shifted_ctx in zip(shift_names, shift_sets,
                                                      shifted_ctxs):
            dfs = []
            for df_builder in df_builders:
                df = df_builder.get_dataframe(shifted_ctx)
                if has_shifts:
                    df = DataFrameWithShiftSet(df, shift_name, shift_set)
                dfs.append(df)
            df_lists.append(dfs)

        if single_df:
            # flatten into a single list (there should be one dataframe per shift)
            dfs = reduce(operator.add, df_lists, [])
            if len(dfs) == 1:
                return dfs[0]
            return tuple_ctr(*dfs)

        if len(df_lists) == 1:
            return df_lists[0]
        return tuple_ctr(*df_lists)
Example #30
#    mod_tb = tb.lpc(y, 2)
#    t_end = timer()
#    print str(t_end - t) + " seconds for talkbox.lpc"
#    print """For higher lag lengths ours quickly fills up memory and starts
#thrashing the swap.  Should we include talkbox C code or Cythonize the
#Levinson recursion algorithm?"""

    ## Try with a pandas series
    import pandas
    import scikits.timeseries as ts
    d1 = ts.Date(year=1700, freq='A')
    #NOTE: have to have yearBegin offset for annual data until parser rewrite
    #should this be up to the user, or should it be done in TSM init?
    #NOTE: not anymore, it's end of year now
    ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
    pandas_dr = pandas.DateRange(start=d1.datetime,
                                 periods=len(sunspots.endog), timeRule='A@DEC')
    #pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

    dates = np.arange(1700, 1700 + len(sunspots.endog))
    dates = ts.date_array(dates, freq='A')
    #sunspots = pandas.Series(sunspots.endog, index=dates)

    #NOTE: pandas only does business days for dates it looks like
    import datetime
    dt_dates = np.asarray(lmap(datetime.datetime.fromordinal,
                              ts_dr.toordinal().astype(int)))
    sunspots = pandas.Series(sunspots.endog, index=dt_dates)

    #NOTE: pandas can't handle pre-1900 dates
    mod = AR(sunspots, freq='A')
    res = mod.fit(method='mle', maxlag=9)