Ejemplo n.º 1
0
 def test_repr(self):
     expected = "<QuarterEnd: startingMonth=3>"
     assert repr(QuarterEnd()) == expected
     expected = "<QuarterEnd: startingMonth=3>"
     assert repr(QuarterEnd(startingMonth=3)) == expected
     expected = "<QuarterEnd: startingMonth=1>"
     assert repr(QuarterEnd(startingMonth=1)) == expected
Ejemplo n.º 2
0
 def get_max_dt():
     """Return the funds that still have portfolio quarters left to fetch.

     Joins ``fund.Description`` (left join) with the latest
     ``fund.PortfolioAsset.end_date`` per ``wind_code``. When a fund has no
     portfolio rows, ``max_dt`` falls back to its ``setup_date``.

     Returns:
         DataFrame ordered by ``is_initial`` descending, restricted to rows
         where ``max_dt`` rolled forward to a quarter end is earlier than
         ``maturity_date`` (capped at ``get_last_td()``) moved back by one
         quarter end.
     """
     with get_session() as ss:
         g = ss.query(
             fund.PortfolioAsset.wind_code,
             sa.func.max(
                 fund.PortfolioAsset.end_date).label('max_dt')).group_by(
                     fund.PortfolioAsset.wind_code).subquery('g')
         fund_list = pd.DataFrame(
             ss.query(
                 fund.Description.wind_code,
                 fund.Description.setup_date,
                 fund.Description.maturity_date,
                 fund.Description.is_initial,
                 g.c.max_dt,
             ).join(
                 g,
                 fund.Description.wind_code == g.c.wind_code,
                 isouter=True  # left join
             ).order_by(fund.Description.is_initial.desc()))
     for c in ('setup_date', 'maturity_date', 'max_dt'):
         # Replace the column outright. Assigning through ``.loc[:, c]`` writes
         # into the existing (object) column on newer pandas, so the values
         # would not end up with a datetime64 dtype.
         fund_list[c] = pd.to_datetime(fund_list[c])
     fund_list = fund_list.fillna({'max_dt': fund_list['setup_date']})
     # First quarter end at or after the latest stored date.
     next_missing = fund_list['max_dt'] + QuarterEnd(n=0)
     # Last quarter end that can be expected to have data.
     last_needed = fund_list['maturity_date'].clip(
         upper=get_last_td()) - QuarterEnd(n=1)
     return fund_list.loc[next_missing < last_needed]
Ejemplo n.º 3
0
    def run(self, *args, **kwargs):
        """Backfill fund portfolio assets, patch them from Wind, then de-duplicate.

        Step 1: for every fund returned by ``self.get_max_dt()`` whose
        ``max_dt`` is before 2018-12-31, pull ``asset_portfolio`` data via
        ``self.get_tushare_data`` and insert the rows from one quarter end
        before ``max_dt`` onwards. Rows with a positive ``bond_value`` are
        also inserted into ``fund.PortfolioAssetBond``.

        Step 2: for funds flagged ``is_initial == 1``, query Wind (``wsd``)
        from the quarter end after ``max_dt`` and insert the result.

        Step 3: remove duplicates keyed on (wind_code, end_date) from both
        tables. ``*args`` / ``**kwargs`` are accepted but not used.
        """
        pending = self.get_max_dt()
        pending = pending.loc[pending['max_dt'] < '2018-12-31']
        for _, fund_row in pending.iterrows():
            ts_data = self.get_tushare_data('asset_portfolio',
                                            ts_code=fund_row['wind_code'])
            since = fund_row['max_dt'] - QuarterEnd(n=1)
            ts_data = ts_data.loc[ts_data['end_date'] >= since]
            self.insert_data(ts_data, fund.PortfolioAsset)
            with_bonds = ts_data.loc[ts_data['bond_value'].gt(0)]
            self.insert_data(with_bonds, fund.PortfolioAssetBond)

        # Temporary patch for known-bad data.
        initial_funds = self.get_max_dt()
        initial_funds = initial_funds.loc[initial_funds['is_initial'].eq(1)]
        mapping = get_wind_conf('crawler_mf_prf')
        for _, fund_row in initial_funds.iterrows():
            w_data = self.query_wind(
                api_name='wsd',
                codes=fund_row['wind_code'],
                fields=mapping['fields'].keys(),
                col_mapping=mapping['fields'],
                beginTime=fund_row['max_dt'] + QuarterEnd(n=1),
                endTime=pd.Timestamp.now() - QuarterEnd(n=1),
                options=mapping['options'])
            w_data = w_data.assign(wind_code=fund_row['wind_code'])
            w_data['end_date'] = w_data.index
            self.insert_data(w_data,
                             fund.PortfolioAsset,
                             msg=fund_row['wind_code'])

        asset_keys = [fund.PortfolioAsset.wind_code,
                      fund.PortfolioAsset.end_date]
        self.clean_duplicates(fund.PortfolioAsset, asset_keys)
        bond_keys = [fund.PortfolioAssetBond.wind_code,
                     fund.PortfolioAssetBond.end_date]
        self.clean_duplicates(fund.PortfolioAssetBond, bond_keys)
Ejemplo n.º 4
0
def findHolder(mkey):
    """Find holders whose code contains ``mkey`` in recent reports.

    Only reports dated on or after the quarter end at or before
    "today minus 18 months" are considered.

    Args:
        mkey: Substring matched against ``holder_code`` (SQL ``LIKE %mkey%``,
            passed as a bound parameter).

    Returns:
        DataFrame with one row per ``holder_code``: the first row of each
        group (the query orders by ``report_date`` descending), plus a
        ``hold_size`` column holding the number of matching rows, sorted by
        ``report_date`` descending.
    """
    today = datetime.now().date()
    d = today - pd.DateOffset(months=18)
    submit_date = QuarterEnd().rollback(d)

    tdf = pd.read_sql_query(
        "select sh.holder_name,sh.holder_code,sh.holder_type,sh.report_date from stock_holder sh "
        "where sh.holder_code like %(mkey)s and report_date >= %(submit_date)s order by report_date desc",
        db.engine,
        params={'mkey': '%' + mkey + '%', 'submit_date': submit_date.strftime('%Y-%m-%d')})
    gtdf = tdf.groupby(['holder_code'])
    bdf = gtdf.first()
    bdf['hold_size'] = gtdf.size()
    # ``sort_index(by=...)`` is no longer available in pandas; sorting by a
    # column is spelled ``sort_values``.
    bdf = bdf.reset_index().sort_values(by='report_date', ascending=False)

    return bdf
Ejemplo n.º 5
0
def findHolder(mkey):
    """Return the most recent report row per holder code matching ``mkey``.

    The search window starts at the quarter end on or before the date
    18 months ago.

    Args:
        mkey: Fragment of a holder code; matched with SQL ``LIKE`` as
            ``%mkey%`` through a bound parameter.

    Returns:
        DataFrame, one row per ``holder_code`` (first row of each group from a
        query ordered by ``report_date`` descending), with an added
        ``hold_size`` row count, sorted by ``report_date`` descending.
    """
    today = datetime.now().date()
    d = today - pd.DateOffset(months=18)
    submit_date = QuarterEnd().rollback(d)

    tdf = pd.read_sql_query(
        "select sh.holder_name,sh.holder_code,sh.holder_type,sh.report_date from stock_holder sh "
        "where sh.holder_code like %(mkey)s and report_date >= %(submit_date)s order by report_date desc",
        db.engine,
        params={'mkey': '%' + mkey + '%', 'submit_date': submit_date.strftime('%Y-%m-%d')})
    gtdf = tdf.groupby(['holder_code'])
    bdf = gtdf.first()
    bdf['hold_size'] = gtdf.size()
    # The ``by=`` form of ``sort_index`` is gone from pandas; ``sort_values``
    # is the supported way to order rows by a column.
    bdf = bdf.reset_index().sort_values(by='report_date', ascending=False)

    return bdf
Ejemplo n.º 6
0
def format_date(curr_date, prev_date, freq):
    """Convert a period label into its report date.

    Args:
        curr_date: Period label. For ``freq == '6'`` the year prefix is read
            from ``curr_date[:5]`` and the quarter number from
            ``curr_date[6:]`` (presumably ``"YYYY/QN"`` -- confirm with the
            caller). For the other frequencies the label must parse as
            ``"%Y/%m"`` once a day is appended.
        prev_date: Previously processed label; only consulted for
            ``freq == '9'``.
        freq: Frequency code: ``'6'`` (quarterly), ``'8'`` (month end) or
            ``'9'`` (mid-month on first sight of a label, month end when the
            label repeats the previous one).

    Returns:
        The report date: a ``datetime`` for the mid-month case, otherwise the
        result of adding a pandas ``QuarterEnd``/``MonthEnd`` offset.

    Raises:
        ValueError: If ``freq`` is not one of the supported codes. The
            original code failed later with ``UnboundLocalError`` instead.
    """
    if freq == '6':
        # Quarterly: map the quarter number to the first day of that quarter;
        # anything other than 1-3 is treated as the fourth quarter.
        quarter_starts = {1: '01/01', 2: '04/01', 3: '07/01'}
        first_day = curr_date[:5] + quarter_starts.get(
            int(curr_date[6:]), '10/01')
        return datetime.strptime(first_day, "%Y/%m/%d") + QuarterEnd(1)

    if freq == '8':
        return datetime.strptime(curr_date + '/01', "%Y/%m/%d") + MonthEnd(1)

    if freq == '9':
        if not prev_date or prev_date != curr_date:
            return datetime.strptime(curr_date + '/15', "%Y/%m/%d")
        return datetime.strptime(curr_date + '/01', "%Y/%m/%d") + MonthEnd(1)

    raise ValueError(
        "unsupported freq {!r}; expected '6', '8' or '9'".format(freq))
Ejemplo n.º 7
0
    def _time_format(self, start: str, end: str, freq='d'):
        '''轉換日期格式'''
        if freq == 'd':
            pass

        elif freq == 'm':
            start = pd.to_datetime(start, format='%Y%m') + MonthEnd(1)
            end = pd.to_datetime(end, format='%Y%m') + MonthEnd(1)

        elif freq == 'q':
            start = start[0:4] + start[4:6].replace('0', 'Q')
            start = pd.to_datetime(start) + QuarterEnd(1)

            end = end[0:4] + end[4:6].replace('0', 'Q')
            end = pd.to_datetime(end) + QuarterEnd(1)

        elif freq == 'y':
            start = pd.to_datetime(start) + YearEnd(1)
            end = pd.to_datetime(end) + YearEnd(1)
        return start, end
Ejemplo n.º 8
0
 def __init__(self,
              n=1,
              normalize=False,
              weekmask='Mon Tue Wed Thu Fri',
              holidays=None,
              calendar=None,
              **kwds):
     """Initialise the offset and the helper offsets it is built from.

     Args:
         n: Stored as ``self.n``.
         normalize: Stored as ``self.normalize`` and forwarded to the
             internal ``CustomBusinessDay``.
         weekmask: Forwarded to ``CustomBusinessDay``.
         holidays: Forwarded to ``CustomBusinessDay`` and stored as
             ``self.holidays``.
         calendar: Forwarded to ``CustomBusinessDay``.
         **kwds: Merged into ``self.kwds``; ``offset`` (default
             ``timedelta(0)``) and ``startingMonth`` (default 3) are read
             from it.
     """
     self.n = n
     self.normalize = normalize
     # ``self.kwds`` is not created here; it must already exist on the
     # instance or class -- presumably provided by the base class, confirm.
     self.kwds.update(kwds)
     self.offset = kwds.get('offset', timedelta(0))
     self.startingMonth = kwds.get('startingMonth', 3)
     # One-custom-business-day step, always built with n=1 regardless of
     # ``n``.
     self.cbday = CustomBusinessDay(n=1,
                                    normalize=normalize,
                                    weekmask=weekmask,
                                    holidays=holidays,
                                    calendar=calendar)
     # Use the calendar resolved by CustomBusinessDay, not the raw argument.
     self.calendar = self.cbday.calendar
     self.holidays = holidays
     # NOTE(review): self-assignment; looks like a no-op -- confirm it can go.
     self.startingMonth = self.startingMonth
     # NOTE(review): built with the default startingMonth, not
     # ``self.startingMonth`` -- confirm this is intended.
     self.q_offset = QuarterEnd(1)
Ejemplo n.º 9
0
def _get_start_dates(sess, code, class_):
    """Return the quarter-end date from which data for ``code`` is still needed.

    Args:
        sess: Database session used for both queries.
        code: Security code to look up.
        class_: Mapped table class exposing ``date`` and ``code`` columns.

    Returns:
        The next quarter end after the resume date, as a ``date``. The resume
        date is the day after the latest stored ``class_.date`` or, when
        nothing is stored, the listing date from ``Issue``. Returns ``None``
        when there is no listing date, or when either the resume date or the
        resulting quarter end lies in the future.
    """
    last_date = sess.query(func.max(
        class_.date)).filter(class_.code == code).scalar()
    if last_date is None:
        start = sess.query(Issue.A004_上市日期).filter(Issue.code == code).scalar()
    else:
        # Resume on the day after the latest stored date.
        start = last_date + timedelta(days=1)

    # No listing date available.
    if start is None:
        return None

    today = pd.Timestamp('today').date()
    if start > today:
        return None

    # Move the start date to a quarter end. ``DateOffset.apply`` was
    # deprecated and later removed from pandas; adding the offset is the
    # supported equivalent.
    start = (start + QuarterEnd()).date()
    return None if start > today else start
Ejemplo n.º 10
0
class CustomBusinessQuaterEnd(QuarterOffset):
    """Quarter-end offset that lands on the last custom business day.

    Combines a calendar ``QuarterEnd`` (``q_offset``) with a
    ``CustomBusinessDay`` (``cbday``): quarter ends are computed on the
    calendar and then rolled back onto a custom business day.
    """
    _cacheable = False
    _prefix = 'CBQE'
    # Extends the base-class attribute set with the calendar-related fields.
    _attributes = frozenset({'holidays', 'calendar'}
                            | set(QuarterOffset._attributes))

    def __init__(self,
                 n=1,
                 normalize=False,
                 weekmask='Mon Tue Wed Thu Fri',
                 holidays=None,
                 calendar=None,
                 **kwds):
        """Store the parameters and build the helper offsets.

        ``weekmask``, ``holidays`` and ``calendar`` are forwarded to
        ``CustomBusinessDay``; ``offset`` (default ``timedelta(0)``) and
        ``startingMonth`` (default 3) are read from ``kwds``.
        """
        self.n = n
        self.normalize = normalize
        # ``self.kwds`` is not created here -- presumably supplied by the
        # base class; confirm.
        self.kwds.update(kwds)
        self.offset = kwds.get('offset', timedelta(0))
        self.startingMonth = kwds.get('startingMonth', 3)
        # Always a single-day step, independent of ``n``.
        self.cbday = CustomBusinessDay(n=1,
                                       normalize=normalize,
                                       weekmask=weekmask,
                                       holidays=holidays,
                                       calendar=calendar)
        self.calendar = self.cbday.calendar
        self.holidays = holidays
        # NOTE(review): self-assignment; looks like a no-op -- confirm.
        self.startingMonth = self.startingMonth
        # NOTE(review): uses the default startingMonth rather than
        # ``self.startingMonth`` -- confirm this is intended.
        self.q_offset = QuarterEnd(1)

    @apply_wraps
    def apply(self, other):
        """Return the custom-business quarter end ``self.n`` steps from ``other``."""
        n = self.n
        # Calendar quarter end at or after ``other`` and the custom business
        # day it rolls back to.
        cur_qend = self.q_offset.rollforward(other)
        cur_cqend = self.cbday.rollback(cur_qend)

        # n == 0 off the anchor is bumped to 1; the next check then takes it
        # back to 0 when ``other`` is still before the anchor.
        if n == 0 and other != cur_cqend:
            n += 1
        # Moving forward from before the anchor: reaching ``cur_qend`` already
        # counts as one step.
        if other < cur_cqend and n >= 1:
            n -= 1
        # Moving backward from after the anchor: ``cur_qend`` is already one
        # step back relative to the following quarter.
        if other > cur_cqend and n <= -1:
            n += 1

        new = cur_qend + n * self.q_offset
        result = self.cbday.rollback(new)
        return result

    def onOffset(self, dt):
        """Return True when ``dt`` is the last custom business day of its quarter."""
        if self.normalize and not _is_normalized(dt):
            return False
        if not self.cbday.onOffset(dt):
            return False
        # On the anchor exactly when the next custom business day falls in a
        # different quarter.
        return (dt + self.cbday).quarter != dt.quarter
Ejemplo n.º 11
0
def period_backward(dates, quarter=None, back_nyear=1, back_nquarter=None):
    """Shift integer report dates (``YYYYMMDD``) back in time.

    By default the same period ``back_nyear`` years earlier is returned.

    Args:
        dates: Array-like of integer dates. Lists and tuples are converted to
            a NumPy array; other inputs are used as given.
        quarter: Optional quarter number(s) 1-4 (int or array-like). When
            given, the result month is ``quarter * 3`` instead of the month
            of ``dates``.
        back_nyear: Number of years to go back.
        back_nquarter: When not ``None``, go back this many quarter ends
            instead; ``quarter`` and ``back_nyear`` are then ignored.

    Returns:
        Integer dates; the day is always the last day of the result month.
    """
    if isinstance(dates, (list, tuple)):
        dates = np.asarray(dates)
    if back_nquarter is not None:
        shifted = pd.to_datetime(dates.astype('str')) - QuarterEnd(back_nquarter)
        return np.asarray(shifted.strftime("%Y%m%d")).astype('int')
    year = dates // 10000
    month = dates % 10000 // 100
    if quarter is not None:
        if isinstance(quarter, (int, np.integer)):
            month = np.full(len(dates), quarter * 3, dtype='int')
        else:
            # A plain list times 3 would repeat the list rather than multiply
            # its elements, so convert it first.
            if isinstance(quarter, (list, tuple)):
                quarter = np.asarray(quarter)
            month = quarter * 3
    day = np.asarray([calendar.monthrange(y - back_nyear, m)[1]
                      for y, m in zip(year, month)])
    return (year - back_nyear) * 10000 + month * 100 + day
Ejemplo n.º 12
0
def add_quarter_data(df, label):
    """Add a ``quarter_<label>`` column holding single-quarter values, in place.

    ``df[label]`` is treated as cumulative within the year. Rows whose
    ``end_date`` ends in ``0331`` copy the value unchanged. Every other row
    gets its value minus the value of the row dated one quarter end earlier;
    if no such row exists the new column stays ``None``. When several rows
    share the earlier date, the last one is used.

    Args:
        df: DataFrame with a string ``end_date`` column (``YYYYMMDD``) and a
            ``label`` column. Modified in place; nothing is returned.
        label: Name of the cumulative column.
    """
    target_col = 'quarter_' + label
    df[target_col] = None
    for row_idx, row in df.iterrows():
        # The first quarter needs no subtraction.
        if row['end_date'][4:] == '0331':
            df.loc[row_idx, target_col] = row[label]
            continue
        current_end = pd.Timestamp(row['end_date'])
        prior_key = (current_end - QuarterEnd(n=1)).strftime("%Y%m%d")
        prior_rows = df[df.end_date == prior_key]
        if prior_rows.empty:
            continue
        df.loc[row_idx, target_col] = row[label] - prior_rows.iloc[-1][label]
Ejemplo n.º 13
0
 def set_current(self):
     """Refresh ``self.newest_date`` with the latest period end per frequency.

     Keys written, relative to today's date:
     ``'M'``/``'Q'``/``'Y'`` -- today rolled back to a month/quarter/year end;
     ``'D'`` -- yesterday;
     ``'H'`` -- the latest of Dec 31 last year, Jun 30 and Dec 31 this year
     that is not after today.
     """
     today = datetime.date.today()
     month_end, quarter_end = MonthEnd(), QuarterEnd()
     self.newest_date['M'] = month_end.rollback(today)
     self.newest_date['Q'] = quarter_end.rollback(today)
     self.newest_date['D'] = today - timedelta(days=1)
     self.newest_date['Y'] = YearEnd().rollback(today)

     mid_year = datetime.date(today.year, 6, 30)
     year_end = datetime.date(today.year, 12, 31)
     if today >= year_end:
         self.newest_date['H'] = year_end
     elif today >= mid_year:
         self.newest_date['H'] = mid_year
     else:
         self.newest_date['H'] = datetime.date(today.year - 1, 12, 31)
Ejemplo n.º 14
0
def ecos(code='021Y125',
         item1='?',
         item2='?',
         item3='?',
         freq='Q',
         first='1900',
         last='2100',
         N='10000',
         detail=True,
         col=None):
    '''Retrieve a daily, monthly, quarterly or annual time series from ECOS.

    Run ``open_ecosapi()`` to explore the ECOS API codes.

    Args:
        code: Statistic code placed in the request URL.
        item1, item2, item3: Item codes placed in the request URL.
        freq: One of ``'D'``, ``'M'``, ``'Q'``, ``'Y'``; doubled (``'QQ'``
            etc.) before being sent.
        first, last: Start and end of the requested period (URL segments).
        N: Upper row bound in the request URL (rows ``1`` to ``N``).
        detail: When true, print the statistic name, unit, period and first
            item name of the response.
        col: Column name given to the returned single-column frame.

    Returns:
        DataFrame indexed by ``DATE`` with the float ``DATA_VALUE`` column.
        Monthly, quarterly and yearly indexes are shifted forward to the
        period end. ``None`` (after printing a message) for an unsupported
        ``freq``; note the request has already been made by then.
    '''
    # NOTE(review): the API key is embedded in this URL; consider loading it
    # from configuration instead of source control.
    ecos_key = "http://ecos.bok.or.kr/api/StatisticSearch/390S6FIOF95M7MHASMEA"
    freq_str = {'QQ': 'Q', 'MM': '-'}
    freq += freq  # Y, Q, M, D -> YY, QQ, MM, DD
    url = f"{ecos_key}/json/kr/1/{N}/{code}/{freq}/{first}/{last}/{item1}/{item2}/{item3}/"
    # Close the HTTP response as soon as the payload has been read.
    with urlopen(url) as result:
        data = json.loads(result.read())
    data = data["StatisticSearch"]["row"]
    df = pd.DataFrame(data)
    if detail:
        print(
            f"통계: {df.loc[0, 'STAT_NAME']}",
            f"단위: {df.loc[0, 'UNIT_NAME']}",
            f"기간: {df.loc[0, 'TIME']} - {df.loc[df.index[-1], 'TIME']}",
            f"항목: {df.loc[0, 'ITEM_NAME1']}",
        )
    df = df.set_index("TIME")
    df.index.names = ['DATE']
    if freq == 'MM':
        # 'YYYYMM' -> 'YYYY-MM', then move to the month end.
        df.index = pd.DatetimeIndex(df.index.str[:4] + freq_str[freq] +
                                    df.index.str[4:])
        df.index = df.index + MonthEnd()
    elif freq == 'QQ':
        # 'YYYYn' -> 'YYYYQn', then move to the quarter end.
        df.index = pd.DatetimeIndex(df.index.str[:4] + freq_str[freq] +
                                    df.index.str[4:])
        df.index = df.index + QuarterEnd()
    elif freq == 'YY':
        df.index = pd.DatetimeIndex(df.index)
        df.index = df.index + YearEnd()
    elif freq == 'DD':
        df.index = pd.DatetimeIndex(df.index)
    else:
        # The accepted annual code is 'Y' (the old message said 'A').
        print('frequency is not one of D, M, Q, Y.')
        return
    df["DATA_VALUE"] = df["DATA_VALUE"].astype("float")
    return df['DATA_VALUE'].to_frame(col)
Ejemplo n.º 15
0
def join_crsp_and_funda(crsp, funda, offset=QuarterEnd(2)):
    """Attach lagged ``funda`` rows to ``crsp`` and forward-fill within each permno.

    Each ``crsp`` row is matched (left join on ``permno``) with the ``funda``
    row whose ``time_idx`` equals the ``crsp`` ``time_idx`` minus ``offset``.
    Gaps are then forward-filled per ``permno`` in ``time_idx`` order, and
    rows that still contain missing values are dropped.

    Args:
        crsp: DataFrame with ``permno`` and ``time_idx`` columns; not modified
            (a copy is taken).
        funda: DataFrame with ``permno`` and ``time_idx`` columns.
        offset: Lag subtracted from ``crsp['time_idx']`` before matching.

    Returns:
        The merged, forward-filled DataFrame. Record counts are printed as a
        side effect.
    """
    crsp = crsp.copy()
    crsp['time_idx_p'] = crsp['time_idx'] - offset
    joined = pd.merge(crsp,
                      funda,
                      left_on=['permno', 'time_idx_p'],
                      right_on=['permno', 'time_idx'],
                      how='left',
                      suffixes=('', '_d')).drop('time_idx_d', axis=1)
    joined.sort_values(['permno', 'time_idx'], inplace=True)
    # ``groupby(...).fillna(method='ffill')`` is deprecated in pandas;
    # ``groupby(...).ffill()`` is the documented replacement.
    joined = joined.groupby('permno', as_index=False).ffill().dropna()
    print(f'CRSP recrods: {crsp.shape[0]}')
    # Guard the ratio so an empty ``crsp`` does not raise ZeroDivisionError.
    ratio = joined.shape[0] / crsp.shape[0] if crsp.shape[0] else 0.0
    print(f'Merged recrods: {joined.shape[0]} ({ratio:.2%})')
    return joined
Ejemplo n.º 16
0
    def __init__(self, start_year=1993):
        """Connect to the FTP server and set up the quarterly period range and cache path.

        Args:
            start_year: First year of the quarterly period range.

        Sets ``ftp`` (an open ``ftplib.FTP`` connection to ``FTP_ADDR``),
        ``periods`` (quarterly range from ``start_year`` up to one quarter end
        before now), the ``_start``/``_end`` labels (``"<year>Q<quarter>"``),
        the ``CACHE`` path under ``ER_CORPUS_DIR``, and an empty ``archive``
        frame with columns ``COLUMNS``.
        """
        self.start_year = start_year
        self.ftp = ftplib.FTP(self.FTP_ADDR)

        self._last_quarter = Timestamp('now') - QuarterEnd(1)
        self.periods = period_range(self.start_year,
                                    self._last_quarter,
                                    freq='Q')

        label_template = '{year}Q{qtr}'
        first_period = self.periods[0]
        self._start = label_template.format(year=first_period.year,
                                            qtr=first_period.quarter)
        last_period = self.periods[-1]
        self._end = label_template.format(year=last_period.year,
                                          qtr=last_period.quarter)

        corpus_dir = self.ER_CORPUS_DIR
        cache_name = self.CACHE_FORMAT.format(start=self._start,
                                              end=self._end)
        self.CACHE = os.path.join(corpus_dir, cache_name)

        self.archive = DataFrame(columns=self.COLUMNS)
Ejemplo n.º 17
0
def findStocksByHolder(mkey):
    """Return holdings of holders whose name contains ``mkey``, with holding value.

    The latest ``report_date`` for matching holders is looked up first
    (rows with holder type ``'自然人股'`` are excluded; 2000-06-30 is used
    when there is none). Holdings are then loaded from the quarter end on or
    before the day preceding that date.

    Args:
        mkey: Substring matched against ``holder_name`` (SQL ``LIKE %mkey%``,
            passed as a bound parameter).

    Returns:
        The holdings left-joined on ``code`` with ``dbs.get_global_data()``,
        plus a ``holder_amt`` column computed as ``t_cap * rate / 100``.
    """
    sql = "select max(report_date) from stock_holder sh where sh.holder_name like :mkey and sh.holder_type != '自然人股'"
    resultProxy = db.session.execute(text(sql), {'mkey': '%' + mkey + '%'})
    _max_date = resultProxy.scalar()
    # Identity check: ``== None`` can be overridden by the operand's ``__eq__``.
    if _max_date is None:
        _max_date = pd.to_datetime('2000-06-30')
    _next_date = QuarterEnd().rollback(_max_date - DateOffset(days=1))

    bdf = pd.read_sql_query(
        "select sh.* from stock_holder sh "
        "where sh.holder_name like %(mkey)s and sh.report_date >= %(mdate)s and sh.holder_type != '自然人股'",
        db.engine,
        params={'mkey': '%' + mkey + '%', 'mdate': _next_date})

    df3 = dbs.get_global_data()

    df = pd.merge(bdf, df3, how='left', on='code')
    df['holder_amt'] = df['t_cap'] * df['rate'] / 100

    return df
Ejemplo n.º 18
0
 def __init__(self,
              n=1,
              normalize=False,
              weekmask='Mon Tue Wed Thu Fri',
              holidays=None,
              calendar=None,
              **kwds):
     """Initialise the offset, writing attributes via ``object.__setattr__``.

     Every attribute is set with ``object.__setattr__`` -- presumably because
     the class blocks normal attribute assignment; confirm against the base
     class.

     Args:
         n: Stored as ``n``.
         normalize: Stored (see review note below) and forwarded to the
             internal ``CustomBusinessDay``.
         weekmask: Forwarded to ``CustomBusinessDay``.
         holidays: Forwarded to ``CustomBusinessDay`` and stored as
             ``holidays``.
         calendar: Forwarded to ``CustomBusinessDay``.
         **kwds: Merged into ``self.kwds``; ``offset`` (default
             ``timedelta(0)``) and ``startingMonth`` (default 3) are read
             from it.
     """
     object.__setattr__(self, "n", n)
     # NOTE(review): the attribute is named "normalized", not "normalize" --
     # possibly a typo; confirm which name the rest of the class reads.
     object.__setattr__(self, "normalized", normalize)
     # ``self.kwds`` is not created here; it must already exist.
     self.kwds.update(kwds)
     object.__setattr__(self, "offset", kwds.get('offset', timedelta(0)))
     object.__setattr__(self, "startingMonth", kwds.get('startingMonth', 3))
     # One-custom-business-day step, always n=1 regardless of ``n``.
     object.__setattr__(
         self, "cbday",
         CustomBusinessDay(n=1,
                           normalize=normalize,
                           weekmask=weekmask,
                           holidays=holidays,
                           calendar=calendar))
     # Use the calendar resolved by CustomBusinessDay, not the raw argument.
     object.__setattr__(self, "calendar", self.cbday.calendar)
     object.__setattr__(self, "holidays", holidays)
     # NOTE(review): built with the default startingMonth, not the value
     # stored above -- confirm this is intended.
     object.__setattr__(self, "q_offset", QuarterEnd(1))
Ejemplo n.º 19
0
def period_backward(dates, quarter=None, back_nyear=1, back_nquarter=None):
    """Return the same report period N years (or N quarters) earlier.

    Parameters:
    -----------
    dates: array-like of int date
        Original dates as integers in ``YYYYMMDD`` form. Lists and tuples
        are converted to a NumPy array; other inputs are used as given.
    quarter: int or array-like of int, Default None
        Quarter number(s) (1, 2, 3, 4). When given, the result is the last
        day of that quarter ``back_nyear`` years earlier.
    back_nyear: int
        Number of years to go back.
    back_nquarter: int, default None
        Number of quarter ends to go back. When this is not None,
        ``quarter`` and ``back_nyear`` are ignored.

    Examples:
    ----------
    >>> period_backward([20101231], back_nyear=2)
    array([20081231])
    >>> period_backward([20101231], back_nquarter=2)
    array([20100630])
    >>> period_backward([20101231], quarter=1)
    array([20090331])
    """
    if isinstance(dates, (list, tuple)):
        dates = np.asarray(dates)
    if back_nquarter is not None:
        shifted = pd.to_datetime(dates.astype('str')) - QuarterEnd(back_nquarter)
        return np.asarray(shifted.strftime("%Y%m%d")).astype('int')
    year = dates // 10000
    month = dates % 10000 // 100
    if quarter is not None:
        if isinstance(quarter, (int, np.integer)):
            month = np.full(len(dates), quarter * 3, dtype='int')
        else:
            # Lists have no ``astype``; convert them so the documented
            # "list of int" input works.
            if isinstance(quarter, (list, tuple)):
                quarter = np.asarray(quarter)
            month = quarter.astype('int') * 3
    # The day is the last day of the target month in the target year.
    day = np.asarray(
        [calendar.monthrange(y - back_nyear, m)[1] for y, m in zip(year, month)])
    return (year - back_nyear) * 10000 + month * 100 + day
Ejemplo n.º 20
0
    def run(self, *args, **kwargs):
        """Fetch quarterly fund portfolio data from Wind and store it.

        Starts from the latest stored ``end_date`` (capped at one quarter end
        before now), or from 2009-12-30 when the table is empty. For every
        quarter end up to now, the ``is_initial == 1`` funds set up before
        that date and not yet matured are queried through the ``wss`` API in
        chunks of 1499 codes and inserted into ``fund.PortfolioAsset``.
        Duplicates keyed on (wind_code, end_date) are removed at the end.
        ``*args`` / ``**kwargs`` are accepted but not used.
        """
        mapping = get_wind_conf('crawler_mf_prf')
        with get_session() as ss:
            max_dt, = ss.query(sa.func.max(fund.PortfolioAsset.end_date)).one()
            if max_dt is None:
                max_dt = pd.Timestamp('2009-12-30')
            else:
                max_dt = min((pd.to_datetime(max_dt),
                              pd.Timestamp.now() - QuarterEnd(n=1)))

            quarter_ends = pd.date_range(max_dt, pd.Timestamp.now(), freq='Q')
            for quarter_end in quarter_ends:
                code_rows = ss.query(fund.Description.wind_code).filter(
                    fund.Description.setup_date < quarter_end,
                    fund.Description.maturity_date >= quarter_end,
                    fund.Description.is_initial == 1,
                ).all()
                fund_list = [row[0] for row in code_rows]

                chunks = utils.chunk(fund_list, 1499)
                for i, funds in enumerate(chunks, start=1):
                    data = self.query_wind(api_name='wss',
                                           codes=funds,
                                           fields=mapping['fields'].keys(),
                                           col_mapping=mapping['fields'])
                    labelled = data.assign(end_date=quarter_end,
                                           wind_code=data.index)
                    # Rough progress against an assumed total of 8000 funds.
                    progress = min(i * 1499 / 8000, 1) * 100
                    self.insert_data(labelled,
                                     fund.PortfolioAsset,
                                     msg=f'{quarter_end} - {progress:.2f}%')
        dedup_keys = [fund.PortfolioAsset.wind_code,
                      fund.PortfolioAsset.end_date]
        self.clean_duplicates(fund.PortfolioAsset, dedup_keys)
Ejemplo n.º 21
0
    'BAS'     : BYearBegin(month=1),
    'BAS-FEB' : BYearBegin(month=2),
    'BAS-MAR' : BYearBegin(month=3),
    'BAS-APR' : BYearBegin(month=4),
    'BAS-MAY' : BYearBegin(month=5),
    'BAS-JUN' : BYearBegin(month=6),
    'BAS-JUL' : BYearBegin(month=7),
    'BAS-AUG' : BYearBegin(month=8),
    'BAS-SEP' : BYearBegin(month=9),
    'BAS-OCT' : BYearBegin(month=10),
    'BAS-NOV' : BYearBegin(month=11),
    'BAS-DEC' : BYearBegin(month=12),

    # Quarterly - Calendar
    # 'Q'     : QuarterEnd(startingMonth=3),
    'Q-JAN' : QuarterEnd(startingMonth=1),
    'Q-FEB' : QuarterEnd(startingMonth=2),
    'Q-MAR' : QuarterEnd(startingMonth=3),
    'Q-APR' : QuarterEnd(startingMonth=4),
    'Q-MAY' : QuarterEnd(startingMonth=5),
    'Q-JUN' : QuarterEnd(startingMonth=6),
    'Q-JUL' : QuarterEnd(startingMonth=7),
    'Q-AUG' : QuarterEnd(startingMonth=8),
    'Q-SEP' : QuarterEnd(startingMonth=9),
    'Q-OCT' : QuarterEnd(startingMonth=10),
    'Q-NOV' : QuarterEnd(startingMonth=11),
    'Q-DEC' : QuarterEnd(startingMonth=12),

    # Quarterly - Calendar (Start)
    # 'QS'     : QuarterBegin(startingMonth=1),
    'QS-JAN' : QuarterBegin(startingMonth=1),
Ejemplo n.º 22
0
def create_data():
    """Create the pickle/msgpack data.

    Builds a fixed set of pandas objects (series, frames, indexes, scalars,
    categoricals, timestamps, offsets and sparse structures) and returns them
    in a nested dict keyed by kind. Some entries are only added, or are built
    differently, depending on ``_loose_version``.
    """

    # Raw column data shared by several of the series and frames below.
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    # Index types imported only when the running version is new enough.
    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(
                                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one',
                                                                'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(
                      date_range('20130101', periods=5, tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    # Frame with mixed dtypes and a duplicated column label ('A' twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(float=DataFrame({
        'A': series['float'],
        'B': series['float'] + 1
    }),
                 int=DataFrame({
                     'A': series['int'],
                     'B': series['int'] + 1
                 }),
                 mixed=DataFrame({k: data[k]
                                  for k in ['A', 'B', 'C', 'D']}),
                 mi=DataFrame(
                     {
                         'A': np.arange(5).astype(np.float64),
                         'B': np.arange(5).astype(np.int64)
                     },
                     index=MultiIndex.from_tuples(tuple(
                         zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                               ['one', 'two', 'one', 'two', 'three']])),
                                                  names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame({'A': Categorical(['foo', 'bar'])}),
                 cat_and_float=DataFrame({
                     'A':
                     Categorical(['foo', 'bar', 'baz']),
                     'B':
                     np.arange(3).astype(np.int64)
                 }),
                 mixed_dup=mixed_dup_df,
                 dt_mixed_tzs=DataFrame(
                     {
                         'A': Timestamp('20130102', tz='US/Eastern'),
                         'B': Timestamp('20130603', tz='CET')
                     },
                     index=range(5)),
                 dt_mixed2_tzs=DataFrame(
                     {
                         'A': Timestamp('20130102', tz='US/Eastern'),
                         'B': Timestamp('20130603', tz='CET'),
                         'C': Timestamp('20130603', tz='UTC')
                     },
                     index=range(5)))

    # Categoricals named after the code width their category count
    # presumably requires -- confirm.
    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    # Versions before 0.19.2 take the frequency through ``offset=``; later
    # ones through ``freq=``.
    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01',
                                      tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', freq='M')

    # One instance per offset class, several with non-default parameters.
    off = {
        'DateOffset': DateOffset(years=1),
        'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
        'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
        'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
        'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
        'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
        'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
        'MonthBegin': MonthBegin(1),
        'MonthEnd': MonthEnd(1),
        'QuarterBegin': QuarterBegin(1),
        'QuarterEnd': QuarterEnd(1),
        'Day': Day(1),
        'YearBegin': YearBegin(1),
        'YearEnd': YearEnd(1),
        'Week': Week(1),
        'Week_Tues': Week(2, normalize=False, weekday=1),
        'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
        'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
        'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        'Easter': Easter(),
        'Hour': Hour(1),
        'Minute': Minute(1)
    }

    # Sparse entries come from helper functions that are not visible here.
    return dict(series=series,
                frame=frame,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
Ejemplo n.º 23
0
 def test_offset_corner_case(self):
     # corner
     offset = QuarterEnd(n=-1, startingMonth=1)
     assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)
Ejemplo n.º 24
0
 def test_isAnchored(self):
     assert QuarterEnd(startingMonth=1).isAnchored()
     assert QuarterEnd().isAnchored()
     assert not QuarterEnd(2, startingMonth=1).isAnchored()
Ejemplo n.º 25
0
class TestQuarterEnd(Base):
    """Tests for the QuarterEnd offset: repr, anchoring, arithmetic and onOffset."""
    # Offset class under test; presumably consumed by ``Base`` -- confirm.
    _offset = QuarterEnd

    def test_repr(self):
        """repr() shows the startingMonth, which defaults to 3."""
        expected = "<QuarterEnd: startingMonth=3>"
        assert repr(QuarterEnd()) == expected
        expected = "<QuarterEnd: startingMonth=3>"
        assert repr(QuarterEnd(startingMonth=3)) == expected
        expected = "<QuarterEnd: startingMonth=1>"
        assert repr(QuarterEnd(startingMonth=1)) == expected

    def test_isAnchored(self):
        """The offset is anchored for the default n, but not for n=2."""
        assert QuarterEnd(startingMonth=1).isAnchored()
        assert QuarterEnd().isAnchored()
        assert not QuarterEnd(2, startingMonth=1).isAnchored()

    def test_offset_corner_case(self):
        """A backward step from just after an anchor lands on that anchor."""
        # corner
        offset = QuarterEnd(n=-1, startingMonth=1)
        assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31)

    # Each entry is (offset, {base datetime: expected datetime}); consumed by
    # ``test_offset`` below.
    offset_cases = []
    # n=1 with quarters ending in Jan/Apr/Jul/Oct.
    offset_cases.append((QuarterEnd(startingMonth=1), {
        datetime(2008, 1, 1): datetime(2008, 1, 31),
        datetime(2008, 1, 31): datetime(2008, 4, 30),
        datetime(2008, 2, 15): datetime(2008, 4, 30),
        datetime(2008, 2, 29): datetime(2008, 4, 30),
        datetime(2008, 3, 15): datetime(2008, 4, 30),
        datetime(2008, 3, 31): datetime(2008, 4, 30),
        datetime(2008, 4, 15): datetime(2008, 4, 30),
        datetime(2008, 4, 30): datetime(2008, 7, 31)}))

    # n=1 with quarters ending in Feb/May/Aug/Nov.
    offset_cases.append((QuarterEnd(startingMonth=2), {
        datetime(2008, 1, 1): datetime(2008, 2, 29),
        datetime(2008, 1, 31): datetime(2008, 2, 29),
        datetime(2008, 2, 15): datetime(2008, 2, 29),
        datetime(2008, 2, 29): datetime(2008, 5, 31),
        datetime(2008, 3, 15): datetime(2008, 5, 31),
        datetime(2008, 3, 31): datetime(2008, 5, 31),
        datetime(2008, 4, 15): datetime(2008, 5, 31),
        datetime(2008, 4, 30): datetime(2008, 5, 31)}))

    # n=0: dates already on a quarter end are left unchanged.
    offset_cases.append((QuarterEnd(startingMonth=1, n=0), {
        datetime(2008, 1, 1): datetime(2008, 1, 31),
        datetime(2008, 1, 31): datetime(2008, 1, 31),
        datetime(2008, 2, 15): datetime(2008, 4, 30),
        datetime(2008, 2, 29): datetime(2008, 4, 30),
        datetime(2008, 3, 15): datetime(2008, 4, 30),
        datetime(2008, 3, 31): datetime(2008, 4, 30),
        datetime(2008, 4, 15): datetime(2008, 4, 30),
        datetime(2008, 4, 30): datetime(2008, 4, 30)}))

    # n=-1: step back to the previous quarter end.
    offset_cases.append((QuarterEnd(startingMonth=1, n=-1), {
        datetime(2008, 1, 1): datetime(2007, 10, 31),
        datetime(2008, 1, 31): datetime(2007, 10, 31),
        datetime(2008, 2, 15): datetime(2008, 1, 31),
        datetime(2008, 2, 29): datetime(2008, 1, 31),
        datetime(2008, 3, 15): datetime(2008, 1, 31),
        datetime(2008, 3, 31): datetime(2008, 1, 31),
        datetime(2008, 4, 15): datetime(2008, 1, 31),
        datetime(2008, 4, 30): datetime(2008, 1, 31),
        datetime(2008, 7, 1): datetime(2008, 4, 30)}))

    # n=2: two quarter ends forward.
    offset_cases.append((QuarterEnd(startingMonth=1, n=2), {
        datetime(2008, 1, 31): datetime(2008, 7, 31),
        datetime(2008, 2, 15): datetime(2008, 7, 31),
        datetime(2008, 2, 29): datetime(2008, 7, 31),
        datetime(2008, 3, 15): datetime(2008, 7, 31),
        datetime(2008, 3, 31): datetime(2008, 7, 31),
        datetime(2008, 4, 15): datetime(2008, 7, 31),
        datetime(2008, 4, 30): datetime(2008, 10, 31)}))

    @pytest.mark.parametrize('case', offset_cases)
    def test_offset(self, case):
        """Check every base -> expected pair of one ``offset_cases`` entry."""
        offset, cases = case
        for base, expected in compat.iteritems(cases):
            assert_offset_equal(offset, base, expected)

    # Each entry is (offset, datetime, expected flag); consumed by
    # ``test_onOffset`` below. Grouped by startingMonth 1, 2, then 3.
    on_offset_cases = [
        (QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False),
        (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False),
        (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)]

    @pytest.mark.parametrize('case', on_offset_cases)
    def test_onOffset(self, case):
        """Check one ``on_offset_cases`` entry via ``assert_onOffset``."""
        offset, dt, expected = case
        assert_onOffset(offset, dt, expected)
Ejemplo n.º 26
0
    offset = cls(n=n)
    rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
    ser = pd.Series(rng)

    res = rng + offset
    res_v2 = offset.apply_index(rng)
    assert (res == res_v2).all()
    assert res[0] == rng[0] + offset
    assert res[-1] == rng[-1] + offset
    res2 = ser + offset
    # apply_index is only for indexes, not series, so no res2_v2
    assert res2.iloc[0] == ser.iloc[0] + offset
    assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(),
                                    BQuarterBegin(), BQuarterEnd()])
def test_on_offset(offset):
    """Cross-check the on-offset predicate against an add/subtract round trip.

    For a spread of dates in Q4 2016 (first and last few days of October,
    November and December), the offset's own on-offset check must agree
    with the "slow" definition: a date is on the offset exactly when
    adding and then subtracting the offset returns the same date.

    Parameters
    ----------
    offset : DateOffset
        The quarterly offset instance under test.
    """
    # ``onOffset`` was renamed to ``is_on_offset`` (the old spelling was
    # deprecated in pandas 1.0 and later removed).  Prefer the new name and
    # fall back to the legacy one so the check runs on either API.
    on_offset = getattr(offset, 'is_on_offset', None)
    if on_offset is None:
        on_offset = offset.onOffset

    dates = [datetime(2016, m, d)
             for m in [10, 11, 12]
             for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)]
    for date in dates:
        res = on_offset(date)
        slow_version = date == (date + offset) - offset
        assert res == slow_version


# --------------------------------------------------------------------
# Months

class TestMonthBegin(Base):
Ejemplo n.º 27
0
    ser = pd.Series(rng)

    res = rng + offset
    assert res.freq is None  # not retained
    res_v2 = offset.apply_index(rng)
    assert (res == res_v2).all()
    assert res[0] == rng[0] + offset
    assert res[-1] == rng[-1] + offset
    res2 = ser + offset
    # apply_index is only for indexes, not series, so no res2_v2
    assert res2.iloc[0] == ser.iloc[0] + offset
    assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize(
    "offset", [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()]
)
def test_on_offset(offset):
    """Verify ``is_on_offset`` agrees with an add-then-subtract round trip.

    Probes the first three and last four calendar days of October,
    November and December 2016.  A date counts as "on offset" by the slow
    definition when ``(date + offset) - offset`` lands back on the date.
    """
    for month in (10, 11, 12):
        for day in (1, 2, 3, 28, 29, 30, 31):
            if month == 11 and day == 31:
                # November has no 31st.
                continue
            stamp = datetime(2016, month, day)
            fast = offset.is_on_offset(stamp)
            round_trip = (stamp + offset) - offset
            assert fast == (stamp == round_trip)


# --------------------------------------------------------------------
Ejemplo n.º 28
0
    ser = pd.Series(rng)

    res = rng + offset
    res_v2 = offset.apply_index(rng)
    assert (res == res_v2).all()
    assert res[0] == rng[0] + offset
    assert res[-1] == rng[-1] + offset
    res2 = ser + offset
    # apply_index is only for indexes, not series, so no res2_v2
    assert res2.iloc[0] == ser.iloc[0] + offset
    assert res2.iloc[-1] == ser.iloc[-1] + offset


@pytest.mark.parametrize(
    'offset', [QuarterBegin(),
               QuarterEnd(),
               BQuarterBegin(),
               BQuarterEnd()])
def test_on_offset(offset):
    """Check the on-offset predicate against its slow round-trip definition.

    Dates sampled are the first three and last four days of October,
    November and December 2016.  For each one, the offset's on-offset
    check must equal ``date == (date + offset) - offset``.

    Parameters
    ----------
    offset : DateOffset
        The quarterly offset instance under test.
    """
    # The predicate is spelled ``is_on_offset`` in current pandas; the
    # original ``onOffset`` name was deprecated in pandas 1.0 and later
    # removed.  Resolve whichever one this pandas provides.
    predicate = getattr(offset, 'is_on_offset', None)
    if predicate is None:
        predicate = offset.onOffset

    dates = [
        datetime(2016, m, d) for m in [10, 11, 12]
        for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)
    ]
    for date in dates:
        res = predicate(date)
        slow_version = date == (date + offset) - offset
        assert res == slow_version


# --------------------------------------------------------------------
# Months
Ejemplo n.º 29
0
def time_for_next_update(last_time, freq='D', num=9, is_end=False):
    """Return the next scheduled update time following ``last_time``.

    Arguments:
        last_time {Timestamp} -- time of the previous update.  A null value
            short-circuits to ``pd.Timestamp(MARKET_START)``.

    Keyword Arguments:
        freq {str} -- update period, compared case-insensitively; one of
            'MIN', 'H', 'D', 'B', 'W', 'M', 'Q', 'Y' (default: {'D'})
        num {int} -- number of periods for 'MIN'/'H'; hour of the day for
            'D'/'B'; ignored for the longer periods (default: {9})
        is_end {bool} -- for 'W'/'M'/'Q'/'Y', target the end of the period
            instead of its start (default: {False})

    Raises:
        AssertionError: ``last_time`` is not a ``pd.Timestamp`` or lies in
            the future
        ValueError: ``freq`` is not one of the recognised periods

    Returns:
        Timestamp -- the next update time

    Notes:
        1. freq below a day ('MIN', 'H')
            ``last_time`` is floored to the unit and moved forward ``num``
            units; period start/end is not considered.
        2. freq 'D' or 'B'
            ``num`` is an hour of the day.  If ``last_time`` is earlier
            than ``num`` o'clock on its own date (and, for 'B', that date
            is a weekday), that same-day time is returned.  Otherwise the
            result is ``num`` o'clock on the next (business) day.
        3. freq above a day ('W', 'M', 'Q', 'Y')
            ``num`` is ignored; only ``is_end`` selects between the start
            and the end of the period.
    """
    # 'Y' is handled below, so it belongs in the list quoted by the error.
    valid_freq = ('B', 'D', 'W', 'M', 'Q', 'Y', 'H', 'MIN')
    if pd.isnull(last_time):
        return pd.Timestamp(MARKET_START)
    assert isinstance(
        last_time, pd.Timestamp), f'类型错误,希望Timestamp,实际为{type(last_time)}'
    now = pd.Timestamp.now(tz=last_time.tz)
    assert last_time <= now, '过去时间必须小于当前时间'
    freq = freq.upper()

    # ``ts + offset`` is used throughout instead of ``offset.apply(ts)``:
    # it is the same operation, and ``apply`` is deprecated in newer pandas.
    if freq == 'MIN':
        # Lower-case aliases are used for flooring; the upper-case spellings
        # are not accepted by every pandas release.
        return last_time.floor('min') + Minute(n=num)
    if freq == 'H':
        return last_time.floor('h') + Hour(n=num)
    if freq == 'D':
        midnight = last_time.floor(freq)
        # The hour of day matters here.
        limit = midnight.replace(hour=num)
        if last_time < limit:
            return limit
        return (midnight + Day()).replace(hour=num)
    if freq == 'B':
        midnight = last_time.normalize()
        # Monday..Friday: today's slot is still usable if not yet reached.
        if last_time.weekday() < 5:
            limit = midnight.replace(hour=num)
            if last_time < limit:
                return limit
        # Slot already passed, or a weekend: move to the next business day.
        return (midnight + BDay()).replace(hour=num)
    if freq == 'W':
        # Same weekday, one week later.
        nw = last_time.normalize() + pd.Timedelta(weeks=1)
        if is_end:
            # One nanosecond before the Monday that follows that week.
            return nw + pd.Timedelta(days=7-nw.weekday()) - pd.Timedelta(nanoseconds=1)
        else:
            # Monday 00:00 of that week.
            return nw - pd.Timedelta(days=nw.weekday())
    if freq == 'M':
        midnight = last_time.normalize()
        if is_end:
            res = midnight + MonthEnd(n=2)
            if last_time.is_month_end:
                # Starting exactly on a month end, n=2 overshoots by one
                # month.  Step back explicitly: ``rollback`` would be a
                # no-op because ``res`` is already a month end.  This
                # mirrors the 'Q' and 'Y' branches.
                res = res + MonthEnd(n=-1)
            return res
        else:
            return midnight + MonthBegin()
    if freq == 'Q':
        if is_end:
            res = last_time + QuarterEnd(n=2, startingMonth=3, normalize=True)
            if last_time.is_quarter_end:
                # Undo the one-quarter overshoot when starting on a quarter end.
                res = res + QuarterEnd(n=-1, startingMonth=3, normalize=True)
            return res
        else:
            return last_time + QuarterBegin(n=1, normalize=True, startingMonth=1)
    if freq == 'Y':
        if last_time.year == now.year:
            # NOTE(review): for is_end=False this returns 1 January of the
            # current year, which is not after ``last_time`` -- confirm
            # that callers expect this.
            if is_end:
                return last_time.normalize().replace(year=now.year, month=12, day=31)
            else:
                return last_time.normalize().replace(year=now.year, month=1, day=1)
        if is_end:
            res = last_time + YearEnd(normalize=True, month=12, n=2)
            if last_time.is_year_end:
                # Undo the one-year overshoot when starting on a year end.
                res = res + YearEnd(n=-1, month=12, normalize=True)
            return res
        else:
            return last_time + YearBegin(normalize=True, month=1, n=1)
    raise ValueError('不能识别的周期类型,仅接受{}。实际输入为{}'.format(
        valid_freq, freq))
def create_data():
    """Build the nested dict of sample pandas objects for the pickle/msgpack data.

    The result maps a category name (``series``, ``frame``, ``index``,
    ``scalars``, ``mi``, ``sp_series``, ``sp_frame``, ``cat``,
    ``timestamp``, ``offsets``) to a dict of named example objects.
    """

    # Raw columns reused by several of the Series/DataFrame samples below.
    data = dict(
        A=[0.0, 1.0, 2.0, 3.0, np.nan],
        B=[0, 1, 0, 1, 0],
        C=["foo1", "foo2", "foo3", "foo4", "foo5"],
        D=date_range("1/1/2009", periods=5),
        E=[0.0, 1, Timestamp("20100101"), "foo", 2.0],
    )

    scalars = {
        "timestamp": Timestamp("20130101"),
        "period": Period("2012", "M"),
    }

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30T", periods=10),
        "range": RangeIndex(10),
    }

    # The interval index is only added when the version check passes.
    if _loose_version >= LooseVersion("0.21"):
        from pandas import interval_range

        index["interval"] = interval_range(0, periods=10)

    reg2_levels = [
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    ]
    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(zip(*reg2_levels)),
            names=["first", "second"],
        ),
    }

    series_mi_index = MultiIndex.from_tuples(
        tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
        names=["one", "two"],
    )
    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(
            np.arange(10).astype(np.int64),
            index=date_range("20130101", periods=10),
        ),
        "mi": Series(np.arange(5).astype(np.float64), index=series_mi_index),
        "dup": Series(
            np.arange(5).astype(np.float64),
            index=["A", "B", "C", "D", "A"],
        ),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    # Frame whose column labels contain a duplicate ("A" twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame_mi_levels = [
        ["bar", "bar", "baz", "baz", "baz"],
        ["one", "two", "one", "two", "three"],
    ]
    frame = {
        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {
                "A": np.arange(5).astype(np.float64),
                "B": np.arange(5).astype(np.int64),
            },
            index=MultiIndex.from_tuples(
                tuple(zip(*frame_mi_levels)),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64),
            columns=["A", "B", "A"],
        ),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
        "freq": Timestamp("2011-01-01", freq="D"),
        "both": Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M"),
    }

    off = dict(
        DateOffset=DateOffset(years=1),
        DateOffset_h_ns=DateOffset(hour=6, nanoseconds=5824),
        BusinessDay=BusinessDay(offset=timedelta(seconds=9)),
        BusinessHour=BusinessHour(normalize=True, n=6, end="15:14"),
        CustomBusinessDay=CustomBusinessDay(weekmask="Mon Fri"),
        SemiMonthBegin=SemiMonthBegin(day_of_month=9),
        SemiMonthEnd=SemiMonthEnd(day_of_month=24),
        MonthBegin=MonthBegin(1),
        MonthEnd=MonthEnd(1),
        QuarterBegin=QuarterBegin(1),
        QuarterEnd=QuarterEnd(1),
        Day=Day(1),
        YearBegin=YearBegin(1),
        YearEnd=YearEnd(1),
        Week=Week(1),
        Week_Tues=Week(2, normalize=False, weekday=1),
        WeekOfMonth=WeekOfMonth(week=3, weekday=4),
        LastWeekOfMonth=LastWeekOfMonth(n=1, weekday=3),
        FY5253=FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        Easter=Easter(),
        Hour=Hour(1),
        Minute=Minute(1),
    )

    # The sparse samples are produced by helper functions of this module.
    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {
            "float": _create_sp_series(),
            "ts": _create_sp_tsseries(),
        },
        "sp_frame": {"float": _create_sp_frame()},
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
            vectorizer, topic_model = train_lda_model(df_reports, train_last,
                                                      n_dims)
        pickle.dump((vectorizer, topic_model), open(pickle_name, "wb"))

# Slice the reports down to the training window; its index is what the
# training loop iterates over.
df_reports_train = df_reports.loc[train_first:train_last]
train_range = df_reports_train.index

# Empty lists, created before the training loop starts.
train_x, train_y, mean_sims = [], [], []

if model_train_sample == "whole":
    for date in train_range:

        returns_stop = date + QuarterEnd(startingMonth=3,
                                         n=time_horizon_quarters)

        print("training for period:")
        print(date + pd.DateOffset(days=1))
        print(returns_stop)

        # loading reports and returns for period, finding the companies in which both datapoints exist
        reports = get_reports_for_date(df_reports, date)
        returns = get_returns_for_period(df_returns,
                                         date + pd.DateOffset(days=1),
                                         returns_stop)
        returns, reports = find_column_intersection([returns, reports])

        # covariance and correlation matrix for period
        cov = predict_cov_sample(returns)
        cor = predict_cov_sample(returns, True)