def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    his = None
    data = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        print "Please check the dates. Data might not be available. 404 returned"

        # 404 due to data yet not available
    if data:
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        # convert_objects() is deprecated; coerce the price columns to numeric instead
        his = his.apply(pd.to_numeric, errors='coerce')
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as dataframe too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')   # removed since the db dependency was dropped
        his.columns = getColumnsNoSql('stock_quote_historical')

    daily = ystockquote.get_all(sym)
    # print daily
    # persist(his, daily, sym, end)

    return his, daily
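A minimal call sketch, assuming FROM_DATE and CURRENT_DATE are module-level 'YYYY-MM-DD' strings and that ystockquote can still reach its (long-retired) Yahoo endpoint:

# Hypothetical usage of fetchQuotes above.
his, daily = fetchQuotes('AAPL', start='2013-01-01', end='2013-06-30')
if his is not None:
    print(his.head())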
Example #2
 def compute_commit_periods(self, ticket_frame: pd.DataFrame):
     commit_dates = ticket_frame.CommitDate
     commit_periods = self.compute_periods(commit_dates)
     commit_periods = pd.concat(
         [pd.Series(data=[pd.Timedelta(days=0)]),
          commit_periods]).reset_index(drop=True)
     ticket_frame.insert(8, 'CommitPeriod', commit_periods.dt.days)
     return ticket_frame
Example #3
    def test_left_join_multi_index(self, left, right, sort):
        icols = ['1st', '2nd', '3rd']

        def bind_cols(df):
            iord = lambda a: 0 if a != a else ord(a)
            f = lambda ts: ts.map(iord) - ord('a')
            return (f(df['1st']) + f(df['3rd']) * 1e2 +
                    df['2nd'].fillna(0) * 1e4)

        def run_asserts(left, right, sort):
            res = left.join(right, on=icols, how='left', sort=sort)

            assert len(left) < len(res) + 1
            assert not res['4th'].isna().any()
            assert not res['5th'].isna().any()

            tm.assert_series_equal(
                res['4th'], - res['5th'], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res['4th'], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(
                    res, res.sort_values(icols, kind='mergesort'))

            out = merge(left, right.reset_index(), on=icols,
                        sort=sort, how='left')

            res.index = np.arange(len(res))
            tm.assert_frame_equal(out, res)

        lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
        left = DataFrame(np.random.choice(lc, (5000, 2)),
                         columns=['1st', '3rd'])
        left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()

        left['4th'] = bind_cols(left)
        right['5th'] = - bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

        # inject some nulls
        left.loc[1::23, '1st'] = np.nan
        left.loc[2::37, '2nd'] = np.nan
        left.loc[3::43, '3rd'] = np.nan
        left['4th'] = bind_cols(left)

        i = np.random.permutation(len(left))
        right = left.iloc[i, :-1]
        right['5th'] = - bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)
Example #4
def get_result(result):
    result = result.round()
    if len(result.shape) == 1:
        df = DataFrame(result, columns=[0])
    else:
        df = DataFrame(result, columns=['col_' + str(i) for i in range(result.shape[1])])
    df.insert(0, 'shop_id', [i for i in range(1, 2001)])
    df = pd.merge(df, df, on='shop_id')
    return df
Example #5
def getNgrams(query, corpus, startYear, endYear, smoothing):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing)
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    response = req.text  # use text, not bytes, so the str regex below matches
    res = re.findall('var data = (.*?);', response)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear+1))
    return req.url, params['content'], df
Example #6
def getNgrams(query, corpus="eng_2012", startYear=1980, endYear=2000, smoothing=10, caseInsensitive=True):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear+1))
    return list(df.columns.values)[2:]
Example #7
    def _standardize_index(self,
                           df_in: pd.DataFrame,
                           symbol: str = None,
                           datatype: str = None,
                           barsize: str = None,
                           tz: str = None):
        """Normalize input DataFrame index to MarketDataBlock standard.
        """
        # Add or standardize index names in the input.
        if isinstance(df_in.index, pd.MultiIndex):
            df_in.reset_index(inplace=True)

        # Rename ambiguous column names.
        df_in.columns = [
            col_rename.get(col.strip().lower(),
                           col.strip().lower()) for col in df_in.columns
        ]

        # Insert Symbol, DataType, Barsize columns from arguments if not
        # found in the input dataframe.
        for col in MarketDataBlock.data_index:
            if col not in df_in.columns:
                if locals().get(col.lower(), None) is None:
                    raise KeyError(
                        'No {0} argument and no {0} column in the DataFrame.'.
                        format(col))
                df_in.insert(0, col, locals()[col.lower()])

        # Convert datetime strings to pandas DatetimeIndex
        df_in['TickerTime'] = pd.DatetimeIndex(df_in['TickerTime'].apply(
            pd.Timestamp))

        # Standardize BarSize strings
        df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

        # Set index to class-defined MultiIndex
        df_in.set_index(MarketDataBlock.data_index, inplace=True)

        # Set time zone so all DatetimeIndex are tz-aware
        df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
        if df_in_tz is None or isinstance(df_in_tz, timezone) or \
           isinstance(df_in_tz, pytz._FixedOffset):
            # Input df has naive time index, or tzinfo is not pytz.timezone()
            if tz is None:
                raise ValueError(
                    'Argument tz=None, and TickerTime.tzinfo is None (naive), '
                    'datetime.timezone, or pytz._FixedOffset.')
            if df_in_tz is None:
                df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
            else:
                df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

        return df_in
Example #8
 def filter_faculty(data: pd.DataFrame) -> pd.DataFrame:
     data['单位'] = [self._keep_zh(x) for x in data['单位'].tolist()]
     data.insert(
         1, "Faculty",
         [re.search('(.*学院)(.*)', x).groups()[0] for x in data['单位']])
     data.insert(
         2, "Major",
         [re.search(r'(.*学院)(.*)', x).groups()[1] for x in data['单位']])
     data['Faculty'] = [
         x.replace("国际学院国际学院", "国际学院") for x in data['Faculty']
     ]
     return data
Example #9
    def test_fillna_columns(self):
        df = DataFrame(np.random.randn(10, 10))
        df.values[:, ::2] = np.nan

        result = df.fillna(method="ffill", axis=1)
        expected = df.T.fillna(method="pad").T
        tm.assert_frame_equal(result, expected)

        df.insert(6, "foo", 5)
        result = df.fillna(method="ffill", axis=1)
        expected = df.astype(float).fillna(method="ffill", axis=1)
        tm.assert_frame_equal(result, expected)
Example #10
    def test_fillna_columns(self):
        df = DataFrame(np.random.randn(10, 10))
        df.values[:, ::2] = np.nan

        result = df.fillna(method='ffill', axis=1)
        expected = df.T.fillna(method='pad').T
        assert_frame_equal(result, expected)

        df.insert(6, 'foo', 5)
        result = df.fillna(method='ffill', axis=1)
        expected = df.astype(float).fillna(method='ffill', axis=1)
        assert_frame_equal(result, expected)
Example #11
def exam_period(flattened_student_data: pd.DataFrame) -> pd.DataFrame:
    flattened_student_data.insert(loc=0,
                                  column='exam_period',
                                  value=deepcopy(flattened_student_data.index))

    non_exam_mask = (flattened_student_data['exam_period'] < definitions.MIDTERM_START_DATE) | (
            flattened_student_data['exam_period'] > definitions.MIDTERM_END_DATE)

    flattened_student_data['exam_period_inferred'] = 1
    flattened_student_data.loc[non_exam_mask, 'exam_period_inferred'] = 0

    return flattened_student_data.drop(columns='exam_period')
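A hedged usage sketch for exam_period, assuming definitions.MIDTERM_START_DATE and definitions.MIDTERM_END_DATE are pd.Timestamp-comparable values and the frame is indexed by date:

# Hypothetical data; the index plays the role of the exam-period date.
idx = pd.date_range('2020-03-01', periods=5, freq='D')
students = pd.DataFrame({'score': range(5)}, index=idx)
labeled = exam_period(students)
print(labeled['exam_period_inferred'])  # 1 inside the midterm window, else 0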
Example #12
    def test_left_join_multi_index(self, left, right, sort):
        icols = ["1st", "2nd", "3rd"]

        def bind_cols(df):
            iord = lambda a: 0 if a != a else ord(a)
            f = lambda ts: ts.map(iord) - ord("a")
            return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4

        def run_asserts(left, right, sort):
            res = left.join(right, on=icols, how="left", sort=sort)

            assert len(left) < len(res) + 1
            assert not res["4th"].isna().any()
            assert not res["5th"].isna().any()

            tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res["4th"], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))

            out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")

            res.index = np.arange(len(res))
            tm.assert_frame_equal(out, res)

        lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
        left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"])
        left.insert(1, "2nd", np.random.randint(0, 1000, len(left)))

        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()

        left["4th"] = bind_cols(left)
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

        # inject some nulls
        left.loc[1::23, "1st"] = np.nan
        left.loc[2::37, "2nd"] = np.nan
        left.loc[3::43, "3rd"] = np.nan
        left["4th"] = bind_cols(left)

        i = np.random.permutation(len(left))
        right = left.iloc[i, :-1]
        right["5th"] = -bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)
Example #14
    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")
Example #15
def move_columns(
    df: _pd.DataFrame,
    colname_idx: dict,
    inplace: bool = False,
) -> _pd.DataFrame:
    """ For a given DataFrame, move given column_name keys into the given index values

    Parameters
    ----------
    df : pd.DataFrame, mandatory

    colname_idx: dict, mandatory
        { "colname1" : 2, "colname10" : 1 }

    Returns
    -------
    pd.DataFrame with updated columns
    """

    if not inplace:
        df = df.copy()

    if not isinstance(df, _pd.DataFrame):
        raise TypeError("df should be a pandas DataFrame")

    if not isinstance(colname_idx, dict):
        raise TypeError("colname_idx should be a dict")

    for column_name in colname_idx.keys():
        if not isinstance(column_name, str):
            raise TypeError(f"key={column_name}, should be an str type")

        if column_name not in df.columns:
            raise ValueError(
                f"key={column_name} is not a column of the given DataFrame")

        idx = colname_idx[column_name]

        if not isinstance(idx, int):
            raise TypeError(
                f"key={column_name}, val={idx} should be an integer type")

        if df.columns.get_loc(column_name) == idx:
            continue  # already at the requested position

        tmp_col = df[column_name]

        df.drop(labels=[column_name], axis=1, inplace=True)

        df.insert(loc=idx, column=column_name, value=tmp_col)

    return df
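A quick sanity check of move_columns with hypothetical data, assuming pandas is imported as _pd as the signature suggests:

df = _pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
moved = move_columns(df, {'c': 0, 'a': 2})
print(moved.columns.tolist())  # ['c', 'b', 'a']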
Example #16
def update_dataframe():
    df = DataFrame({
        'goods': ['cola', 'egg', 'cookie', 'apple', 'banana', 'milk'],
        'quantity': [4, 5, 6, 10, 11, 12],
        'color': ['B', 'Y', 'Y', 'R', 'Y', 'W'],
        'price': [13, 11, 14, 21, 20, 22]
    })
    df['total'] = df['quantity'] * df['price']
    print('Add column:\r\n', df)
    df['isQualified'] = True
    print('Add column with a fixed value:\r\n', df)
    df.insert(2, 'allQuantity', [100, 50, 30, 100, 50, 30])
    print('Insert column:\r\n', df)
    df.loc[6] = ['shampoo', 'B', 13, 100, 50, 650, True]
    print('Insert row:\r\n', df)
    df.append({
        'goods': 'pear',
        'quantity': 30,
        'price': 12
    },
              ignore_index=True)  # must be a dict, not a list
    print('Row appended (append is not in-place):\r\n', df)
    print(
        'append returns a new frame:\r\n',
        df.append({
            'goods': 'pear',
            'quantity': 30,
            'price': 12
        },
                  ignore_index=True))
    df.drop(labels='isQualified', axis=1, inplace=True)
    print('Drop one column:\r\n', df)
    df.drop(['allQuantity', 'total'], axis=1, inplace=True)
    print('Drop several columns:\r\n', df)
    df.drop(6, axis=0, inplace=True)
    print('Drop one row:\r\n', df)
    df.drop([4, 5], axis=0, inplace=True)
    print('Drop several rows:\r\n', df)
    df.loc[3] = ['orange', 'O', 5, 12]  # added if the label does not exist
    print('Modify one row:\r\n', df)
    df.loc[:, 'quantity'] = 10
    print('Modify one column:\r\n', df)
    df.loc[df.price > 15, 'price'] = 15
    print('Modify matching cells:\r\n', df)
    print('-' * 30, 'statistics')
    print('np sum:', np.sum(df['quantity']))
    print('df sum:', df['quantity'].sum())
    print('df describe (descriptive statistics):\r\n', df[['quantity', 'price']].describe())
    print('value counts:\r\n', df['color'].value_counts())
    # convert the dtype to category
    df['color'] = df['color'].astype('category')
    print('category describe:\r\n', df['color'].describe())
Example #17
def write_tfs(tfs_path: str,
              data_frame: DataFrame,
              headers_dict: dict = None,
              save_index: Union[str, bool] = False,
              colwidth: int = DEFAULT_COLUMN_WIDTH):
    """
    Writes the DataFrame into tfs_path with the headers_dict as
    headers dictionary. If you want to keep the order of the headers, use collections.OrderedDict.

    Args:
        tfs_path: path to the output TFS file
        data_frame: TfsDataFrame or pandas.DataFrame to save
        headers_dict: Headers of the data_frame, if empty tries to use data_frame.headers
        save_index: bool or string. Default: False
            If True, saves the index of the data_frame to a column identifiable by INDEX_ID.
            If string, it saves the index of the data_frame to a column named by string.
        colwidth: Column width
    """
    _validate(data_frame, f"to be written in {tfs_path:s}")
    if save_index:
        if isinstance(save_index, str):
            # saves index into column by name given
            idx_name = save_index
        else:
            # saves index into column, which can be found by INDEX_ID
            try:
                idx_name = INDEX_ID + data_frame.index.name
            except TypeError:
                idx_name = INDEX_ID
        data_frame.insert(0, idx_name, data_frame.index)
    LOGGER.debug(
        f"Attempting to write file: {basename(tfs_path)} in {dirname(tfs_path)}"
    )

    if headers_dict is None:  # Tries to get headers from TfsDataFrame
        try:
            headers_dict = data_frame.headers
        except AttributeError:
            headers_dict = {}

    colwidth = max(MIN_COLUMN_WIDTH, colwidth)
    headers_str = _get_headers_str(headers_dict)
    colnames_str = _get_colnames_str(data_frame.columns, colwidth)
    coltypes_str = _get_coltypes_str(data_frame.dtypes, colwidth)
    data_str = _get_data_str(data_frame, colwidth)
    with open(tfs_path, "w") as tfs_data:
        tfs_data.write("\n".join(
            (headers_str, colnames_str, coltypes_str, data_str)))

    if save_index:
        # remove the inserted column again
        data_frame.drop(data_frame.columns[0], axis=1, inplace=True)
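A hedged usage sketch, assuming the module constants (INDEX_ID, MIN_COLUMN_WIDTH, DEFAULT_COLUMN_WIDTH) and the private _get_*_str helpers are defined as above; the file name, headers, and df are hypothetical:

# Keep header order with OrderedDict and save the index under column 'NAME'.
headers = collections.OrderedDict([("TITLE", "example table")])
write_tfs("out.tfs", data_frame=df, headers_dict=headers, save_index="NAME")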
Example #18
def write_df(df: pd.DataFrame,
             worksheet: str,
             loc: str = "A1",
             columns: bool = True,
             worksheet_loc: int = 6):
    """Write a dataframe to a sheet without changing it"""
    worksheet = get_worksheet(worksheet, worksheet_loc)
    df = df.copy()
    df.insert(0, df.index.name, df.index)
    df.replace([0, np.inf, np.nan, float("-inf")], "", inplace=True)
    values = ([df.columns.to_list()] + df.values.tolist()
              if columns else df.values.tolist())
    worksheet.update(loc, values, raw=False)
Example #19
    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')
Example #20
def calc_grade(df: pd.DataFrame, periods=2):
    """
    计算坡度
    :param df: pd.DataFrame
    :param periods: int 计算时的偏移
    """
    diff_altitude = df['altitude'].diff(periods=periods)
    diff_distance = df['distance'].diff(periods=periods)
    grade = round(diff_altitude / diff_distance * 100, 2)
    # print(grade)
    grade[:periods] = 0.0
    # df['grade'] = grade
    df.insert(5, 'grade', grade)
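A worked example for calc_grade with hypothetical data (the frame needs at least five columns, since the grade is inserted at position 5): 30 m of climb over 2000 m of distance at periods=2 gives a 1.5 % grade.

df = pd.DataFrame({'lat': [0.0] * 3, 'lon': [0.0] * 3, 'time': [0, 1, 2],
                   'altitude': [100.0, 110.0, 130.0],
                   'distance': [0.0, 1000.0, 2000.0]})
calc_grade(df, periods=2)
print(df['grade'].tolist())  # [0.0, 0.0, 1.5]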
Example #22
    def _read_one_data(self, symbol):
        """ read one data from specified URL """
        url = "https://query1.finance.yahoo.com/v8/finance/chart/{}=X".format(symbol)
        params = self._get_params(symbol)

        resp = self._get_response(url, params=params)
        jsn = json.loads(resp.text)

        data = jsn["chart"]["result"][0]
        df = DataFrame(data["indicators"]["quote"][0])
        df.insert(0, "date", to_datetime(Series(data["timestamp"]), unit="s").dt.date)
        df.columns = map(str.capitalize, df.columns)
        return df
Example #23
from typing import List, Tuple

def DataFrameColumnCopy(df_from: pd.DataFrame, df_to: pd.DataFrame, columns: List[str]) -> Tuple[int, int]:
    copied = uncopied = 0
    for c in columns:
        if c not in df_from.columns.tolist():
            uncopied += 1
            continue
        col = df_from[c]
        if c not in df_to.columns.tolist():
            df_to.insert(len(df_to.columns), c, col.copy())
        else:
            df_to[c] = col.copy()
        copied += 1
    return copied, uncopied
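A small check of the copy/uncopy counters with hypothetical frames: 'a' is copied into the target, while 'x' is absent from the source and counts as uncopied.

src = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
dst = pd.DataFrame({'b': [0, 0]})
print(DataFrameColumnCopy(src, dst, ['a', 'x']))  # (1, 1)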
Example #24
 def coordinator(self, rasa_chatlog_df: pd.DataFrame):
     rasa_chatlog_df.insert(2, "use_case", "")
     rasa_chatlog_df.insert(3, "outcome", "")
     conversation_ids = rasa_chatlog_df["conversation_id"].drop_duplicates(keep="first").to_list()
     for id in conversation_ids:
         chatlog_sub_df = rasa_chatlog_df[rasa_chatlog_df["conversation_id"] == id]
         item_index, use_case = self.set_uc1_and_uc2_for_conversations_2(chatlog_sub_df)
         if item_index and use_case:
             rasa_chatlog_df.at[item_index, "use_case"] = use_case
         item_index, outcome = self.specify_conversation_outcome_2(chatlog_sub_df)
         if item_index and outcome:
             rasa_chatlog_df.at[item_index, "outcome"] = outcome
     return rasa_chatlog_df
Example #25
def indexcnName(dftopn: pd.DataFrame):
    # insert the Chinese names
    codetop = list(dftopn.index.levels[1])
    indexList = qa.QA_fetch_index_list_adv()
    # for c in codetop:
    #     # print the index code and its Chinese name
    #     print(c, indexList.loc[c]['name'])
    print("Number of top-ranked entries: {}".format(len(codetop)))
    # insert the Chinese name column
    dftopn.reset_index(inplace=True)
    #
    # dftopn['name'] = dftopn['code'].apply(lambda x: indexList.loc[x]['name'])
    dftopn.insert(1, 'name', dftopn['code'].apply(lambda x: indexList.loc[x]['name']))
    return dftopn.set_index(['date', 'code'])
Example #26
def save_training_fitness_information(g_best_dict, number_tasks, name_mha,
                                      name_paras, results_folder_path):
    results_path = f'{results_folder_path}/optimize_process/{name_mha}/{name_paras}'
    Path(results_path).mkdir(parents=True, exist_ok=True)
    fitness_file_path = f'{results_path}/training_{number_tasks}_tasks.csv'
    fit_list = array([[0, 0, 0, 0]])
    for key, value in g_best_dict.items():
        value = insert(value, 0, key, axis=1)
        fit_list = concatenate((fit_list, value), axis=0)
    # name the columns up front so the 'Epoch' filter below works
    fitness_df = DataFrame(fit_list, columns=["Epoch", "Power", "Latency", "Cost"])
    fitness_df = fitness_df.iloc[1:]
    fitness_df.to_csv(fitness_file_path,
                      index=False,
                      header=["Epoch", "Power", "Latency", "Cost"])

    if Config.METRICS_NEED_MIN_OBJECTIVE_VALUES:
        fitness_df.drop(fitness_df[fitness_df['Epoch'] != key].index,
                        inplace=True)
        fitness_df.insert(0, 'Name Paras', name_paras)
        fitness_df.insert(0, 'Name MHA', name_mha)
        fitness_df.insert(0, 'N Tasks', number_tasks)
        fitness_df.insert(0, 'Metric', Config.METRICS)
        fitness_df.to_csv(f'{Config.RESULTS_DATA}/summary.csv',
                          mode='a',
                          header=False)
Example #27
def convert_meas_to_mW(dataframe: pd.DataFrame, settings):
    for name, setting in settings.items():
        voltage = setting[1] if setting[1] is not None else dataframe[
            "Voltage [V]"]
        dataframe[name] = dataframe[name] * voltage / setting[2]
        dataframe = dataframe.rename(columns={name: setting[0]})
    dataframe.insert(
        dataframe.columns.get_loc("SoC After DCDC [mW]") + 1,
        "Efficiency DCDC",
        dataframe["SoC After DCDC [mW]"] / dataframe["SoC Before DCDC [mW]"])
    dataframe.insert(
        dataframe.columns.get_loc("Latency [ms]") + 1, "Energy [uJ]",
        dataframe["Latency [ms]"] * dataframe["SoC Before DCDC [mW]"])
    return dataframe
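The settings mapping this function appears to expect is {raw_column: (renamed_label, fixed_voltage_or_None, gain_divisor)}; a hedged sketch with hypothetical column names:

settings = {
    "I_before [raw]": ("SoC Before DCDC [mW]", 3.3, 100),  # fixed 3.3 V rail
    "I_after [raw]": ("SoC After DCDC [mW]", None, 100),   # fall back to measured "Voltage [V]"
}
dataframe = convert_meas_to_mW(dataframe, settings)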
Example #28
def MakeExcelFile():

    # check that the CYBOS connection is up
    objCpCybos = win32com.client.Dispatch("CpUtil.CpCybos")

    # build the DataFrame
    data1 = MakeDataBase(1)
    df_1st = DataFrame(data1, columns=['날짜', '시가', '고가', '저가', '종가'])
    df_1st = df_1st.sort_index(axis=0, ascending=False)  # sort_index returns a copy

    # append moving-average (MA) columns to the existing data
    MovingAverage_3Days = round(df_1st['종가'].rolling(window=3).mean())
    df_1st.insert(len(df_1st.columns), "3일이평", MovingAverage_3Days)

    MovingAverage_5Days = round(df_1st['종가'].rolling(window=5).mean())
    df_1st.insert(len(df_1st.columns), "5일이평", MovingAverage_5Days)

    MovingAverage_10Days = round(df_1st['종가'].rolling(window=10).mean())
    df_1st.insert(len(df_1st.columns), "10일이평", MovingAverage_10Days)

    MovingAverage_20Days = round(df_1st['종가'].rolling(window=20).mean())
    df_1st.insert(len(df_1st.columns), "20일이평", MovingAverage_20Days)

    # write to Excel
    df_1st.to_excel('./MyDB.xlsx', encoding='euc_KR')
    print('DB creation complete!')
Example #29
class InsertColumns:
    def setup(self):
        self.N = 10 ** 3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
Example #30
def change_stock_1(df: pd.DataFrame):
    """
    规范化因子和股票数据信息,将时间作为index
    :param df: 数据
    :return: 新的数据
    """
    if "trade_date" in df.columns:
        date = pd.to_datetime(df["trade_date"], format='%Y%m%d')
        df.insert(df.shape[1], 'date', date)
        df.drop("trade_date")
        return df
    else:
        print("There is no DataFrame named trade_date")
        return 0
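A quick check with hypothetical data, where trade_date arrives as YYYYMMDD strings:

df = pd.DataFrame({'trade_date': ['20200102', '20200103'], 'close': [10.0, 10.5]})
df = change_stock_1(df)
print(df.columns.tolist())  # ['close', 'date']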
Example #31
def _order_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Orders the columns of the dataframe as: date, region, observations."""
    df.insert(0, _COLUMNS.DATE.value, df.pop(_COLUMNS.DATE.value))
    reg_columns = []
    obs_columns = []
    for col in df.columns[1:]:
        if col.startswith(constants.REGION_PREFIX):
            reg_columns.append(col)
        elif col.startswith(constants.OBSERVATION_PREFIX):
            obs_columns.append(col)
        else:
            raise ValueError(f"Unknown column: '{col}'")
    columns = [_COLUMNS.DATE.value] + reg_columns + obs_columns
    return df[columns]
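A hedged check of _order_columns, assuming constants.REGION_PREFIX == 'region_', constants.OBSERVATION_PREFIX == 'obs_', and _COLUMNS.DATE.value == 'date':

df = pd.DataFrame({'region_id': [1], 'date': ['2020-01-01'], 'obs_cases': [5]})
print(_order_columns(df).columns.tolist())  # ['date', 'region_id', 'obs_cases']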
Example #32
def _build_data_frame(inst,
                      data,
                      picks,
                      long_format,
                      mindex,
                      index,
                      default_index,
                      col_names=None,
                      col_kind='channel'):
    """Build DataFrame from MNE-object-derived data array."""
    # private function; pandas already checked in calling function
    from pandas import DataFrame
    from ..source_estimate import _BaseSourceEstimate
    # build DataFrame
    if col_names is None:
        col_names = [inst.ch_names[p] for p in picks]
    df = DataFrame(data, columns=col_names)
    for i, (k, v) in enumerate(mindex):
        df.insert(i, k, v)
    # build Index
    if long_format:
        df.set_index(default_index, inplace=True)
        df.columns.name = col_kind
    elif index is not None:
        df.set_index(index, inplace=True)
        if set(index) == set(default_index):
            df.columns.name = col_kind
    # long format
    if long_format:
        df = df.stack().reset_index()
        df.rename(columns={0: 'value'}, inplace=True)
        # add column for channel types (as appropriate)
        ch_map = (None if isinstance(inst, _BaseSourceEstimate) else dict(
            zip(
                np.array(inst.ch_names)[picks],
                np.array(inst.get_channel_types())[picks])))
        if ch_map is not None:
            col_index = len(df.columns) - 1
            ch_type = df['channel'].map(ch_map)
            df.insert(col_index, 'ch_type', ch_type)
        # restore index
        if index is not None:
            df.set_index(index, inplace=True)
        # convert channel/vertex/ch_type columns to factors
        to_factor = [
            c for c in df.columns.tolist() if c not in ('time', 'value')
        ]
        _set_pandas_dtype(df, to_factor, 'category')
    return df
Example #33
    def getNgrams(self, query, corpus, startYear, endYear, smoothing,
                  caseInsensitive):
        params = dict(content=query,
                      year_start=startYear,
                      year_end=endYear,
                      corpus=corpora[corpus],
                      smoothing=smoothing,
                      case_insensitive=caseInsensitive)
        if params['case_insensitive'] is False:
            params.pop('case_insensitive')
        if '?' in params['content']:
            params['content'] = params['content'].replace('?', '*')
        if '@' in params['content']:
            params['content'] = params['content'].replace('@', '=>')

        retry_wait_time = 20

        while True:
            try:
                req = requests.get('http://books.google.com/ngrams/graph',
                                   params=params)
            except Exception:
                # wait a minute for the connection to recover
                print("Connection error, waiting 60 seconds")
                sleep(60)
                continue

            if req.status_code == 200:
                res = re.findall('var data = (.*?);\\n', req.text)

                if res:
                    data = {
                        qry['ngram']: qry['timeseries']
                        for qry in literal_eval(res[0])
                    }
                    df = DataFrame(data)
                    df.insert(0, 'year', list(range(startYear, endYear + 1)))
                else:
                    df = DataFrame()

                break

            if req.status_code != 200:
                retry_wait_time += 1
                print("Response %s received, waiting %s seconds" %
                      (req.status_code, retry_wait_time))
                sleep(retry_wait_time)

        return req.url, params['content'], df
Example #34
    def _standardize_index(
            self, df_in: pd.DataFrame, symbol: str=None, datatype: str=None,
            barsize: str=None, tz: str=None):
        """Normalize input DataFrame index to MarketDataBlock standard.
        """
        # Add or standardize index names in the input.
        if isinstance(df_in.index, pd.MultiIndex):
            df_in.reset_index(inplace=True)

        # Rename ambiguous column names.
        df_in.columns = [
            col_rename.get(col.strip().lower(), col.strip().lower())
            for col in df_in.columns]

        # Insert Symbol, DataType, Barsize columns from arguments if not
        # found in the input dataframe.
        for col in MarketDataBlock.data_index:
            if col not in df_in.columns:
                if locals().get(col.lower(), None) is None:
                    raise KeyError(
                        'No {0} argument and no {0} column in the DataFrame.'
                        .format(col))
                df_in.insert(0, col, locals()[col.lower()])

        # Convert datetime strings to pandas DatetimeIndex
        df_in['TickerTime'] = pd.DatetimeIndex(
            df_in['TickerTime'].apply(pd.Timestamp))

        # Standardize BarSize strings
        df_in['BarSize'] = df_in['BarSize'].map(timedur_standardize)

        # Set index to class-defined MultiIndex
        df_in.set_index(MarketDataBlock.data_index, inplace=True)

        # Set time zone so all DatetimeIndex are tz-aware
        df_in_tz = df_in.index.levels[self.__class__.dtlevel].tz
        if df_in_tz is None or isinstance(df_in_tz, timezone) or \
           isinstance(df_in_tz, pytz._FixedOffset):
            # Input df has naive time index, or tzinfo is not pytz.timezone()
            if tz is None:
                raise ValueError(
                    'Argument tz=None, and TickerTime.tzinfo is None (naive), '
                    'datetime.timezone, or pytz._FixedOffset.')
            if df_in_tz is None:
                df_in = df_in.tz_localize(tz, level=self.__class__.dtlevel)
            else:
                df_in = df_in.tz_convert(tz, level=self.__class__.dtlevel)

        return df_in
Example #35
    def _read_one_data(self, symbol):
        """ read one data from specified URL """
        url = 'https://query1.finance.yahoo.com/v8/finance/chart/{}=X'\
            .format(symbol)
        params = self._get_params(symbol)

        resp = self._get_response(url, params=params)
        jsn = json.loads(resp.text)

        data = jsn['chart']['result'][0]
        df = DataFrame(data['indicators']['quote'][0])
        df.insert(0, 'date',
                  to_datetime(Series(data['timestamp']), unit='s').dt.date)
        df.columns = map(str.capitalize, df.columns)
        return df
Example #36
def _add_last_seen_column(df: pd.DataFrame) -> pd.DataFrame:
    """Adds last seen time ago column to a dataframe of clients.

  Args:
    df: Dataframe of clients.

  Returns:
    Dataframe with a column added.
  """
    if 'last_seen_at' not in df.columns:
        return df

    seen_ago = [client_textify.last_seen(tm) for tm in df['last_seen_at']]
    df.insert(0, 'last_seen_ago', pd.Series(seen_ago))
    return df
Example #37
 def replace_time(data: pd.DataFrame):
     """
         Replace the complete datetime with its (day of week, hour)
     :param data:
     :return:
     """
     date = pd.to_datetime(data[const.TIME],
                           format=const.T_FORMAT,
                           utc=True)
     data.insert(loc=0, column='hour', value=date.dt.hour)
     data.insert(loc=0,
                 column='dayofweek',
                 value=(date.dt.dayofweek + 1) %
                 7)  # 0: monday -> 0: sunday
     data.drop(columns=[const.TIME], inplace=True)
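A hedged usage sketch, assuming replace_time is callable as a plain function, const.TIME == 'time', and const.T_FORMAT is a strptime pattern such as '%Y-%m-%d %H:%M:%S':

data = pd.DataFrame({'time': ['2020-01-06 09:30:00'], 'value': [1.0]})
replace_time(data)
print(data.columns.tolist())  # ['dayofweek', 'hour', 'value']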
Example #38
    def test_setitem_clear_caches(self):
        # see GH#304
        df = DataFrame(
            {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3]
        )
        df.insert(2, "z", np.nan)

        # cache it
        foo = df["z"]
        df.loc[df.index[2:], "z"] = 42

        expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")

        assert df["z"] is not foo
        tm.assert_series_equal(df["z"], expected)
Example #39
def add_gene_symbols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds columns of gene names at column_index = 0.
    Assumes index is gene ids.
    """

    #     if 'symbol' not in df.columns:
    gene_ids = df.index.to_list()

    conv = GeneConverter()
    gene_symbols = conv.convert_to_symbols(gene_ids)

    df.insert(loc=0, column='symbol', value=gene_symbols)

    return df
Example #40
 def getNgrams( query, corpus, startYear, endYear, smoothing, caseInsensitive ):
     params = dict( content = query, year_start = startYear, year_end = endYear,
                    corpus = _corpora[ corpus ], smoothing = smoothing,
                    case_insensitive = caseInsensitive )
     if params[ 'case_insensitive' ] is False:
         params.pop( 'case_insensitive' )
     if '?' in params[ 'content' ]:
         params[ 'content' ] = params[ 'content' ].replace( '?', '*' )
     if '@' in params[ 'content' ]:
         params[ 'content' ] = params[ 'content' ].replace( '@', '=>' )
     req = requests.get( 'http://books.google.com/ngrams/graph', params = params )
     res = regex.findall( 'var data = (.*?);\\n', req.text )
     data = { qry[ 'ngram' ]: qry[ 'timeseries' ] for qry in literal_eval( res[ 0 ] ) }
     df = DataFrame( data )
     df.insert( 0, 'year', range( startYear, endYear + 1 ) )
     return req.url, params[ 'content' ], df
Example #41
class InsertColumns:

    def setup(self):
        self.N = 10**3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N),
                           allow_duplicates=True)

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
Example #42
def export(params, path, paramsToGroupBySize, hasCycles):
    """Formats extracted data and exports to Data.xlsv"""
    paramToUnit, Files = extractFolder(params, path,
                                       paramsToGroupBySize, hasCycles)
    channelToFiles = groupFilesByChannel(Files)
    writer = ExcelWriter(path + 'Data.xlsx')  # Needed to save multiple sheets

    # Iterate through channels
    currentChannelIndex = 1
    numOfChannels = len(channelToFiles)
    for channel in channelToFiles:
        extractedValues = {p: [] for p in params}
        names = []
        cyclesColumn = []

        # Obtain list of values and names from files in channel
        for File in channelToFiles[channel]:
            if hasCycles:
                appendFileInfoCycles(File, params, extractedValues,
                                     names, cyclesColumn)
            else:
                appendFileInfo(File, params, extractedValues, names)

        # Create table / DataFrame
        table = {'{} ({})'.format(p, paramToUnit[p]): extractedValues[p]
                 for p in params}
        df = DataFrame(table)
        df.insert(0, 'File Name', names)
        if hasCycles:
            df.insert(1, 'Cycle', cyclesColumn)
        sheet = 'Ch. ' + channel

        # Add sheets and autofit column dimensions
        df.to_excel(writer, sheet_name=sheet, index=False)
        writer.sheets[sheet].column_dimensions['A'].width = len(
            max(names, key=len))

        # Message
        print('--Successfully extracted '
              'from {} ({} of {})'.format(sheet,
                                          currentChannelIndex,
                                          numOfChannels))
        currentChannelIndex += 1

    # Export
    writer.save()
    print('')
Example #43
class InsertColumns(object):

    goal_time = 0.2

    def setup(self):
        self.N = 10**3
        self.df = DataFrame(index=range(self.N))

    def time_insert(self):
        np.random.seed(1234)
        for i in range(100):
            self.df.insert(0, i, np.random.randn(self.N))

    def time_assign_with_setitem(self):
        np.random.seed(1234)
        for i in range(100):
            self.df[i] = np.random.randn(self.N)
Example #44
    def test_insert(self):
        df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                       columns=['c', 'b', 'a'])

        df.insert(0, 'foo', df['a'])
        tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
        tm.assert_series_equal(df['a'], df['foo'], check_names=False)

        df.insert(2, 'bar', df['c'])
        tm.assert_index_equal(df.columns,
                              Index(['foo', 'c', 'bar', 'b', 'a']))
        tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

        # diff dtype

        # new item
        df['x'] = df['a'].astype('float32')
        result = Series(dict(float32=1, float64=5))
        assert (df.get_dtype_counts().sort_index() == result).all()

        # replacing current (in different block)
        df['a'] = df['a'].astype('float32')
        result = Series(dict(float32=2, float64=4))
        assert (df.get_dtype_counts().sort_index() == result).all()

        df['y'] = df['a'].astype('int32')
        result = Series(dict(float32=2, float64=4, int32=1))
        assert (df.get_dtype_counts().sort_index() == result).all()

        with pytest.raises(ValueError, match='already exists'):
            df.insert(1, 'a', df['b'])
        msg = "cannot insert c, already exists"
        with pytest.raises(ValueError, match=msg):
            df.insert(1, 'c', df['b'])

        df.columns.name = 'some_name'
        # preserve columns name field
        df.insert(0, 'baz', df['c'])
        assert df.columns.name == 'some_name'

        # GH 13522
        df = DataFrame(index=['A', 'B', 'C'])
        df['X'] = df.index
        df['X'] = ['x', 'y', 'z']
        exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
        assert_frame_equal(df, exp)
Example #45
def file_prep(file):
    df = DataFrame(read_csv(file, sep='\t'))
    df.drop(df[df.apply(allele_count, axis=1) != 2].index, inplace=True)
    major_freqs = df.apply(major_prop_find, axis=1)
    major_alleles = df.apply(major_find, axis=1)
    df.insert(3, 'major_freqs', major_freqs)
    df.insert(3, 'major_alleles', major_alleles)
    df = df.transpose()

    chrom, chrom_idx = np.unique(df.loc['chrom'], return_index=True)

    super_missing_df = df == '.'

    chromosome_dict = {}
    for number in np.unique(df.loc['chrom']):
        chromosome_dict[number] = df.loc['chrom'][df.loc['chrom'] == number].index
    return df, super_missing_df, chromosome_dict
Example #46
def get_fingerprint_from_DataFrame(chem_smile, fpfunc):
    molsmitmp = [Chem.MolFromSmiles(x) for x in chem_smile['smiles']]
    i = 0
    molsmi = []
    for x in molsmitmp:
        if x is not None:
            x.SetProp("_Name", chem_smile['compound'][i])
            molsmi.append(x)
        i += 1
    fps = [fpfunc(x) for x in molsmi]
    # Note above: multiple parameters can be used to generate E/FCFP.
    fpsmat = np.matrix(fps)
    df = DataFrame(fpsmat, index=[x.GetProp("_Name") for x in molsmi])
    df.insert(0, 'chembl', df.index)
    df.insert(1, 'smiles', [Chem.MolToSmiles(x) for x in molsmi])
    #df['SMILES'] = [Chem.MolToSmiles(x) for x in molsmi]
    #df['CHEMBL'] = df.index
    return df
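A hedged usage sketch with RDKit's Morgan fingerprint generator (ECFP4-like: radius 2, 2048 bits); the input frame is hypothetical:

from rdkit.Chem import AllChem
chem_smile = DataFrame({'compound': ['benzene', 'ethanol'],
                        'smiles': ['c1ccccc1', 'CCO']})
fpfunc = lambda m: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=2048)
fp_df = get_fingerprint_from_DataFrame(chem_smile, fpfunc)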
Example #47
    def test_insert_column_bug_4032(self):

        # GH4032, inserting a column and renaming causing errors
        df = DataFrame({'b': [1.1, 2.2]})
        df = df.rename(columns={})
        df.insert(0, 'a', [1, 2])

        result = df.rename(columns={})
        str(result)
        expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)
        df.insert(0, 'c', [1.3, 2.3])

        result = df.rename(columns={})
        str(result)

        expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
                             columns=['c', 'a', 'b'])
        assert_frame_equal(result, expected)
Example #48
def data_prep(input_file, bad_samples_file, freq_dict=None):
    '''prepare the ibdhmm file by removing sites that are too close to each other, and calculating the major and minor alleles.
    If specified, freq_dict should be a json file that contains the frequencies. It is created by freq_parse.py'''
    min_snpD = 10
    tri_allele= 0
    
    output_file = ('.').join(input_file.split('.')[0:-2]) + '_cleaned.txt'
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    bad_samples = [sample.strip() for sample in open(bad_samples_file)]                                              
    df = DataFrame(read_csv(input_file, sep = '\t'))
    #remove bad samples
    df.drop(bad_samples, inplace = True, axis =1)
    #remove non-biallelic alleles
    #df.drop(df[df.apply(allele_count, axis = 1) != 2].index, inplace = True)
    
    
    #relaxing conditions because we only have 3000 SNPs to begin with
    '''#remove SNPs that are too close to one another
    df['diff'] = df.groupby('chrom')['pos'].diff()
    df.fillna('first', inplace = True)
    #df.to_csv('test_df.txt', sep = '\t')
    # BUG NOTE MUST FIX THE DAISY CHAIN PROBLEM
    df = df.query('diff > 10 or diff == "first"')
    df.drop('diff', axis = 1, inplace = True)'''
    
    if not freq_dict:
        #calculate the major and minor allele
        major = df.apply(major_find, axis =1 )
        minor = df.apply(minor_find, axis =1 )
        major_prop = df.apply(major_prop_find, axis =1 )
        minor_prop = df.apply(minor_prop_find, axis = 1)
    else:
        snp_dict = json.load(open(freq_dict))
        df['keys'] = df['chrom'].map(str) +':'+ df['pos'].map(str)        
        major = df['keys'].apply(lambda x : snp_dict[x]['major'])
        major_prop = df['keys'].apply(lambda x : snp_dict[x]['major_freq'])
        minor = df['keys'].apply(lambda x : snp_dict[x]['minor'])
        minor_prop = df['keys'].apply(lambda x : snp_dict[x]['minor_freq'])
        
        df.drop('keys', inplace= True, axis = 1)
               
        
        
    #inserting this stuff into dataframe for future use
    df.insert(3, 'minor_prop', minor_prop)
    df.insert(3, 'minor', minor)
    df.insert(3, 'major_prop', major_prop)
    df.insert(3, 'major', major)
    
    df.to_csv(output_file, sep = '\t', index= False)
    return df
Example #49
def average_by_cell_line():
    datasets = load_datasets()
    ess_train_data = datasets['ess_train_data']

    lines_board = load_cell_lines(CELL_LINES_LEADERBOARD_PH1)
    lines_train = load_cell_lines(CELL_LINES_TRAINING_PH1)

    data = {}

    for line in lines_board.index:
        site = lines_board.at[line, 'Site_primary']
        matches = lines_train.index[lines_train['Site_primary'] == site]
        if matches.size > 0:
            data[line] = ess_train_data.loc[:, matches].mean(1).tolist()
        else:
            data[line] = ess_train_data.mean(1).tolist()

    ess_avg_data = DataFrame(data, index=ess_train_data.index, columns=lines_board.index)
    ess_avg_data.insert(0, 'Description', ess_train_data.index)
    save_gct_data(ess_avg_data, 'avg_per_line.gct')
Example #50
class InfoTable(DataFrameWidget):
    def __init__(self, samples=None):
        self.initVars()
        super(InfoTable, self).__init__(self.table)

    def initVars(self):
        """Initialises variables."""
        self.columns = ["Plate ID", "Plate Name", "Plate Kea", "Well"]
        self.table = DataFrame(columns=self.columns)

    ########################################################################
    def update(self):
        plateID = self.table["Plate ID"]
        plateName = self.table["Plate Name"]
        plateKea = self.table["Plate Kea"]
        well = self.table["Well"]
        self.table = self.table.drop(labels=["Plate ID", "Plate Name", "Plate Kea", "Well"], axis=1)
        self.table.insert(0, "Plate ID", plateID)
        self.table.insert(1, "Plate Name", plateName)
        self.table.insert(2, "Plate Kea", plateKea)
        self.table.insert(3, "Well", well)
        self.setDataFrame(self.table)

    def append(self, appendage):
        self.table = self.table.append(appendage, ignore_index=True)
        self.update()

    def editPlates(self, edits):
        self.table = self.table.set_index("Plate ID")
        edits = edits.set_index("ID")
        self.table.update(edits)
        self.table = self.table.reset_index()

    def importPlateData(self, plateData, key):
        plateData = plateData.set_index(key)
        self.table = self.table.set_index(key)
        self.table.update(plateData)
        self.table = self.table.reset_index()

    def importSampleData(self, sampleData, tableKey, importKey):
        sampleData[tableKey] = sampleData[importKey]
        sampleData = sampleData.set_index(tableKey)
        self.table = self.table.set_index(tableKey)
        self.table = self.table.join(sampleData, rsuffix="_new")
        self.table = self.table.reset_index()

    def getKeaSexTestingData(self):
        table = self.table[["Plate ID", "Well", "Sample ID", "Plant Alt Names"]]
        table = table.set_index(["Plate ID", "Well"])
        table.rename(columns={"Plant Alt Names": "Plant AltName"}, inplace=True)
        return table
Example #51
def ingest(app, path=''):
    c = ConfigParser.ConfigParser()
    for f in 'ref_perf.txt', 'auto_perf.txt', 'naive_perf.txt', 'sweep_perf.txt':
        c.read(os.path.join(path, app, f))
    df = DataFrame([dict(c.items(s)) for s in c.sections()])

    # coerce types
    for col in df.columns:
        try:
            ints = df[col].astype(int)
            df[col] = ints
        except:
            try:
                floats = df[col].astype(float)
                df[col] = floats
            except:
                pass # keep as string

    # coerce old data names if present
    df = df.rename(columns={'nthreads':'threads'})

    app_name = app.replace('_', ' ').title()

    df.insert(0, 'app', app)
    df.insert(0, 'app_name_pretty', app_name)
    assert(len(df))
    df['throughput'] = 1000.0 / df.runtime # runs/sec
    df['speedup'] = 0.0

    # this is a little bullshit, but DataFrame slice indexing gets confusing
    ref = df[df.version == 'ref']#.set_index('threads')
    def compute_speedup(row):
        r = ref[ref.threads == row.threads].runtime.iloc[0] #FFFfffuuu
        row.speedup = r / row.runtime
        return row
    df = df.apply(compute_speedup, axis=1)

    df['runtime_norm'] = df.runtime / max(df.runtime)
    df['throughput_norm'] = df.throughput / max(df.throughput)

    return df
Example #52
def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')
    print "here"
    # Google blocked the god damn api without ssl...
    #req = requests.get('https://books.google.com/ngrams/graph', params=params)
    param_list='&'.join([(x+"="+str('+'.join(str(params[x]).split(' ')))) for x in params.keys()])
    return_html = urllib2.urlopen("https://books.google.com/ngrams/graph?"+param_list).read()
    print len(return_html)
    res = re.findall('var data = (.*?);\\n', return_html)
    data = {qry['ngram']: qry['timeseries'] for qry in literal_eval(res[0])}
    print data
    df = DataFrame(data)
    df.insert(0, 'year', range(startYear, endYear+1))
    return req.url, params['content'], df
Example #53
def getNgrams(query, corpus, startYear, endYear, smoothing, caseInsensitive):
    params = dict(content=query, year_start=startYear, year_end=endYear,
                  corpus=corpora[corpus], smoothing=smoothing,
                  case_insensitive=caseInsensitive)
    if params['case_insensitive'] is False:
        params.pop('case_insensitive')
    if '?' in params['content']:
        params['content'] = params['content'].replace('?', '*')
    if '@' in params['content']:
        params['content'] = params['content'].replace('@', '=>')
    req = requests.get('http://books.google.com/ngrams/graph', params=params)
    res = re.findall('var data = (.*?);\\n', req.text)
    if res:
        data = {qry['ngram'].strip().replace(" ", "_"): qry['timeseries']
                for qry in literal_eval(res[0])}
        df = DataFrame(data)
        df.insert(0, 'year', list(range(startYear, startYear+len(df))))
        df.set_index('year', inplace=True)
        if len(df.columns) > 1:
            df = df[list(filter(lambda x:"(All)" in x, df.columns))]
        df = df.rename(columns={x:x.split("_(All)")[0].strip().replace(" ", "_") for x in df.columns})
    else:
        df = DataFrame()
    return req.url, params['content'], df
Example #54
    def test_insert(self):
        df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                       columns=['c', 'b', 'a'])

        df.insert(0, 'foo', df['a'])
        self.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
        tm.assert_series_equal(df['a'], df['foo'], check_names=False)

        df.insert(2, 'bar', df['c'])
        self.assert_index_equal(df.columns,
                                Index(['foo', 'c', 'bar', 'b', 'a']))
        tm.assert_almost_equal(df['c'], df['bar'], check_names=False)

        # diff dtype

        # new item
        df['x'] = df['a'].astype('float32')
        result = Series(dict(float64=5, float32=1))
        self.assertTrue((df.get_dtype_counts() == result).all())

        # replacing current (in different block)
        df['a'] = df['a'].astype('float32')
        result = Series(dict(float64=4, float32=2))
        self.assertTrue((df.get_dtype_counts() == result).all())

        df['y'] = df['a'].astype('int32')
        result = Series(dict(float64=4, float32=2, int32=1))
        self.assertTrue((df.get_dtype_counts() == result).all())

        with assertRaisesRegexp(ValueError, 'already exists'):
            df.insert(1, 'a', df['b'])
        self.assertRaises(ValueError, df.insert, 1, 'c', df['b'])

        df.columns.name = 'some_name'
        # preserve columns name field
        df.insert(0, 'baz', df['c'])
        self.assertEqual(df.columns.name, 'some_name')

        # GH 13522
        df = DataFrame(index=['A', 'B', 'C'])
        df['X'] = df.index
        df['X'] = ['x', 'y', 'z']
        exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
        assert_frame_equal(df, exp)
Example #55
    def test_insert(self):
        df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                       columns=['c', 'b', 'a'])

        df.insert(0, 'foo', df['a'])
        self.assert_numpy_array_equal(df.columns, ['foo', 'c', 'b', 'a'])
        assert_almost_equal(df['a'], df['foo'])

        df.insert(2, 'bar', df['c'])
        self.assert_numpy_array_equal(df.columns,
                                      ['foo', 'c', 'bar', 'b', 'a'])
        assert_almost_equal(df['c'], df['bar'])

        # diff dtype

        # new item
        df['x'] = df['a'].astype('float32')
        result = Series(dict(float64=5, float32=1))
        self.assertTrue((df.get_dtype_counts() == result).all())

        # replacing current (in different block)
        df['a'] = df['a'].astype('float32')
        result = Series(dict(float64=4, float32=2))
        self.assertTrue((df.get_dtype_counts() == result).all())

        df['y'] = df['a'].astype('int32')
        result = Series(dict(float64=4, float32=2, int32=1))
        self.assertTrue((df.get_dtype_counts() == result).all())

        with assertRaisesRegexp(ValueError, 'already exists'):
            df.insert(1, 'a', df['b'])
        self.assertRaises(ValueError, df.insert, 1, 'c', df['b'])

        df.columns.name = 'some_name'
        # preserve columns name field
        df.insert(0, 'baz', df['c'])
        self.assertEqual(df.columns.name, 'some_name')
Example #56
File: RfCv.py Project: rishy/Kaggle
rf_fit = rf.fit(train_X, train_y, sample_weight = balance_weights(train_y))

# <codecell>

rf_fit

# <codecell>

rf_prob = rf_fit.score(val_X, val_y)

# <codecell>

rf_prob

# <codecell>

rf_prob = rf_fit.predict_proba(test_X)

# <codecell>

df = DataFrame(rf_prob, columns = np.unique(train_y))
df.insert(loc = 0, column = "id", value = test["id"])

# <codecell>

df.to_csv("RfWeightedSolution.csv", index = False)

# <codecell>


Example #57
    def add_var2(self, varname, target=None, source='free'):
        """
        Add a variable in the dataframe

        Parameters
        ----------

        varname : str
                  name of the variable
        target : float
                 target for the margin of the variable
        source : str, default 'free'
                 database source
        """
        w_init = self.weights_init*self.champm
        w = self.weights*self.champm
        inputs = self.simulation.survey
        output_table = self.simulation.output_table

        varcol = self.simulation.get_col(varname)
        entity = self.entity
        enum = inputs.column_by_name.get('qui'+self.entity).enum
        people = [x[1] for x in enum]

        if varname in inputs.column_by_name:
            value = inputs.get_value(varname, index = idx)
        elif output_table is not None and varname in output_table.column_by_name:
            value = output_table.get_value(varname, index = idx, opt = people, sum_ = True)

        label = varcol.label
        # TODO: rewrite this using pivot table
        items = [ ('marge'    , w[self.champm]  ), ('marge initiale' , w_init[self.champm] )]
        if varcol.__class__  in MODCOLS:
            items.append(('mod',   value[self.champm]))
            df = DataFrame(dict(items))  # DataFrame.from_items() was removed from pandas
            res = df.groupby('mod', sort= True).sum()
        else:
            res = DataFrame(index = ['total'],
                            data = {'marge' : (value*w).sum(),
                                    'marge initiale' : (value*w_init).sum()  } )
        res.insert(0, u"modalités",u"")
        res.insert(2, "cible", 0)
        res.insert(2, u"cible ajustée", 0)
        res.insert(4, "source", source)
        mods = res.index

        if target is not None:
            if len(mods) != len(target.keys()):
                drop_indices = [ (varname, mod) for mod in target.keys()]
                if source == 'input':
                    self.input_margins_df.drop(drop_indices, inplace=True)
                    self.input_margins_df.index.names = ['var','mod']
                if source == 'output':
                    self.output_margins_df.drop(drop_indices, inplace=True)
                    self.output_margins_df.index.names = ['var','mod']
                return

        if isinstance(varcol, EnumCol):
            if varcol.enum:
                enum = varcol.enum
                res[u'modalités'] = [enum._vars[mod] for mod in mods]
                res['mod'] = mods
            else:
                res[u'modalités'] = list(mods)
                res['mod'] = mods
        elif isinstance(varcol, BoolCol):
            res[u'modalités'] = [bool(mod) for mod in mods]  # bool(mods) on the whole index was a bug
            res['mod'] = mods
        elif isinstance(varcol, (IntCol, AgeCol)):
            res[u'modalités'] = mods
            res['mod'] = mods
        else:
            res[u'modalités'] = "total"
            res['mod'] = 0

        if label is not None:
            res['variable'] = label
        else:
            res['variable'] = varname
        res['var'] = varname

        if target is not None:
            for mod, margin in target.iteritems():
                if mod == varname:    # dirty hack to handle non-categorical data
                    res['cible'][0] = margin
                else:
                    res['cible'][mod] = margin

        if self.frame is None:
            self.frame = res
        else:
            self.frame = concat([self.frame, res])

        self.frame = self.frame.reset_index(drop=True)
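
A minimal usage sketch (hypothetical names; the calibration instance and the
survey/output tables come from the surrounding project):

# Register the margin of a salary variable against a 1e9 target taken from
# the input database; the result is appended to calib.frame, one row per
# modality, with 'marge', 'cible' and 'source' columns:
# calib.add_var2('sali', target={'sali': 1e9}, source='input')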
Example #58
0
#SVC  (assumption: the SVC fit line was truncated from this excerpt; model_svc
#      mirrors the model_rfc / model_knc / model_lr pattern below)
probas = model_svc.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc  = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('SVC', roc_auc))
#RandomForestClassifier
probas = model_rfc.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc  = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('RandomForest', roc_auc))
#KNeighborsClassifier
probas = model_knc.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc  = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('KNeighborsClassifier',roc_auc))
#LogisticRegression
probas = model_lr.fit(ROCtrainTRN, ROCtrainTRG).predict_proba(ROCtestTRN)
fpr, tpr, thresholds = roc_curve(ROCtestTRG, probas[:, 1])
roc_auc  = auc(fpr, tpr)
pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('LogisticRegression',roc_auc))
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.legend(loc=0, fontsize='small')
pl.show()

model_rfc.fit(train, target)
result.insert(1, 'Survived', model_rfc.predict(test))
result.to_csv('Kaggle_Titanic/Result/test.csv', index=False)

Example #59
0
class MyTuShare(object):
  """Rank tushare earnings-forecast data by weighted column criteria."""
  # Assumed module-level helpers, not shown in this excerpt: a `Test` flag
  # selecting the canned data below, and a Column enum exposing Range and
  # ReportDate (the original hint was: Column = Enum("Range")).
  def __init__(self):
    TestArr = [["601366", "利群股份", "预升", "2018-04-16", "0.1700", "30.33"],
               ["000018", "神州长城", "预增", "2018-04-16", "0.0583", "140%~170%"],
               ["000626", "远大控股", "预亏", "2018-04-16", "0.1602", "0"],
               ["000430", "张家界", "预亏", "2018-04-14", "0.0330", "-241.78%~-237.06%"],
               ["300107", "张家界2", "预亏", "2018-04-19", "0.0320", "-30%"]]
    if Test:
      self.forecastDF = DataFrame(TestArr, columns=["code", "name", "type", "report_date", "pre_eps", "range"])
    else:
      self.forecastDF = ts.forecast_data(2018, 1)

    self.currentPrice = None
    
    # forecastDF = forecastDF[0:59]

#        code   name type report_date  pre_eps              range
# 0     601366   利群股份   预升  2018-04-16   0.1700              30.33
# 1     000018   神州长城   预增  2018-04-16   0.0583          140%~170%
# 2     000626   远大控股   预亏  2018-04-16   0.1602                  0
# 3     000430    张家界   预亏  2018-04-14   0.0330  -241.78%~-237.06%
# 4     000690   宝新能源   预增  2018-04-14   0.0100    120.89%~170.41%
# 5     000825   太钢不锈   预增  2018-04-14   0.0560    335.12%~381.74%
# 6     000883   湖北能源   预增  2018-04-14   0.0680      51.83%~63.16%
# 7     000760    斯太尔   减亏  2018-04-14  -0.0600                  0

  # def getClosePrice1(self, code):
  #   today = "2018-04-20"
  #   self.currentPrice = ts.get_hist_data('300107', start=today)
  #   return self.currentPrice.loc[today, "close"]

  def getClosePrice(self, code):
    today = "2018-04-20"
    todayKdata = ts.get_hist_data(code, start=today)
    # print todayKdata
    # if(todayKdata != None):
    return todayKdata.loc[today, "close"]


  def insertClosePriceColumn(self):
    prices = []
    for code in self.forecastDF['code']:
      print code
      price = self.getClosePrice(code)
      prices.append(price)

    self.forecastDF.insert(self.forecastDF.columns.size, "close", prices)

  def initCurrentPrice(self):
    # self.currentPrice = ts.get_today_all()

    #get previous day close price
    self.currentPrice = ts.get_hist_data('300107', start="2018-04-20")
    print self.currentPrice.loc["2018-04-20", "close"]

    #get today's count
    # df = ts.get_today_ticks('300107', retry_count=20)
    # self.currentPrice = df.head(10)

    #get volume > 500
    # self.currentPrice = ts.get_sina_dd('300107', date='2018-04-20', vol=500)  # only trades of at least 500 lots
  
  def searchString(self, regExpr, line):
    patternAck = re.compile(regExpr)
    # print ">>>>" + str(regExpr) +"=>" + str(line)
    match = patternAck.search(line)
    return match

  def sort(self, columns):
    """Sort by each requested criterion and accumulate a weighted 'myrank'.

    columns: dict mapping Column members (Range, ReportDate) to the weight
    each criterion contributes to the score.
    """
    weight = 0
    for column in columns:

      # sort
      if column == Column.Range:
        # print str(self.forecastDF.index)
        avgs = []
        for rowNum in self.forecastDF.index:
          rangeStr = self.forecastDF.loc[rowNum]['range']
          avg = 0
          match1 = self.searchString(r'(-*\d+\.*\d*)%~(-*\d+\.*\d*)%', rangeStr)
          match2 = self.searchString(r'(-*\d+\.*\d*)%', rangeStr)
          if match1:
            low = float(match1.group(1))
            high = float(match1.group(2))
            avg = (low + high) / 2
          elif(match2):
            avg = float(match2.group(1))
          else:
            avg = float(rangeStr)
          # print avg
          avgs.append(avg)
        # print avgs
        self.forecastDF.insert(self.forecastDF.columns.size, "avgRange", avgs)
        self.forecastDF = self.forecastDF.sort_values(by=["avgRange"], axis = 0, ascending = False)
        # df.sort_values(by=['col1'])
        # print self.forecastDF.at(3)
      elif(column == Column.ReportDate):
        self.forecastDF = self.forecastDF.sort_values(by=["report_date"], axis = 0, ascending = False)
      else:
        pass

      self.forecastDF = self.forecastDF.reindex()  # NB: reindex() with no arguments returns an unchanged copy
      print "*" * 20 + "After Sort"
      print self.forecastDF

      numRow = self.forecastDF.index.size
      if ("myrank" in self.forecastDF.columns.tolist() ):
        # if(myrank == None and myrank.size != self.forecastDF.index.size):
        #   raise Exception("wrong myrank size" + str(myrank))
        myrankIndex = self.forecastDF.columns.tolist().index("myrank")
        for i in range(0, numRow):
          print "1.>>>>" + str(self.forecastDF.iloc[i, myrankIndex]) + ":" + str(columns[column]) + "=>" +str(i)
          self.forecastDF.iloc[i, myrankIndex] += (numRow - i) * columns[column] 
      else:
        arr = range(1, numRow + 1)
        arr.reverse()
        for i in range(0, len(arr)):
          arr[i] = columns[column] * arr[i]
        self.forecastDF.insert(self.forecastDF.columns.size, "myrank", arr)

      print "*" * 20 + "add myrank"
      print self.forecastDF
      weight += columns[column]
    myrankIndex = self.forecastDF.columns.tolist().index("myrank")
    for i in range(0, self.forecastDF.index.size):
      self.forecastDF.iloc[i, myrankIndex] = round( float(self.forecastDF.iloc[i, myrankIndex]) / weight , 2)
    print "*" * 30 + "final"
    self.forecastDF = self.forecastDF.sort_values(by=["myrank"], axis = 0, ascending = False)
    print self.forecastDF
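
A minimal usage sketch (assuming the module-level Test flag and Column enum
noted above):

tu = MyTuShare()
tu.insertClosePriceColumn()
# Rank mostly by the forecast range, with report date as a lighter criterion:
tu.sort({Column.Range: 0.7, Column.ReportDate: 0.3})
print tu.forecastDF.head()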
Example #60
0
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex with duplicate column labels is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)