def classifyTestData(testFilePath,modelRoot):
    """
    Calls traverseDecisionTreeModel() to classify the test data against the trained model, then builds the confusion matrix and reports the classification error at the given depth.
    :param testFilePath: Path to the test file
    :param modelRoot: Root node of the decision tree of the trained model

    """
    correctlyClassifiedInstances=0
    incorrectlyClassifiedInstances=0
    testDataList=[]
    inputFile=open(testFilePath,'rU')
    csvObject=csv.reader(inputFile)
    label = featureList[len(featureList) -1]
    classLabels = featureAndValueMapping.get(label)
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for x in range(int(classLabelCount))] for x in range(int(classLabelCount))]
    for row in csvObject:
        predictedLabel=traverseDecisionTreeModel(row,modelRoot)
        ConfusionMatrix[int(row[len(row)- 1]) - 1][int(predictedLabel) - 1] += 1

        if predictedLabel==row[len(row)-1]:
            correctlyClassifiedInstances+=1
        else:
            incorrectlyClassifiedInstances+=1
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels

    print "Confusion Matrix :: \n"
    print df
    print "Correctly Classified Instance ",correctlyClassifiedInstances
    print "Incorrectly Classified Instance ",incorrectlyClassifiedInstances
Example #2
def _unstack_frame(obj, level):
    from pandas.core.internals import BlockManager, make_block

    if obj._is_mixed_type:
        unstacker = _Unstacker(np.empty(obj.shape, dtype=bool),  # dummy
                               obj.index, level=level,
                               value_columns=obj.columns)
        new_columns = unstacker.get_new_columns()
        new_index = unstacker.get_new_index()
        new_axes = [new_columns, new_index]

        new_blocks = []
        mask_blocks = []
        for blk in obj._data.blocks:
            bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
                                    value_columns=blk.items)
            new_items = bunstacker.get_new_columns()
            new_values, mask = bunstacker.get_new_values()

            mblk = make_block(mask.T, new_items, new_columns)
            mask_blocks.append(mblk)

            newb = make_block(new_values.T, new_items, new_columns)
            new_blocks.append(newb)

        result = DataFrame(BlockManager(new_blocks, new_axes))
        mask_frame = DataFrame(BlockManager(mask_blocks, new_axes))
        return result.ix[:, mask_frame.sum(0) > 0]
    else:
        unstacker = _Unstacker(obj.values, obj.index, level=level,
                               value_columns=obj.columns)
        return unstacker.get_result()
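
# Illustration (not from the original source): a minimal example of the public
# unstack behaviour that _unstack_frame implements internally, shown on a small
# MultiIndex Series.
import pandas as pd

idx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['letter', 'number'])
s = pd.Series([10, 20, 30, 40], index=idx)
print(s.unstack(level='number'))   # rows 'a'/'b', columns 1/2
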
 def test_read_empty_dta(self):
     empty_ds = DataFrame(columns=['unit'])
     # GH 7369, make sure can read a 0-obs dta file
     with tm.ensure_clean() as path:
         empty_ds.to_stata(path,write_index=False)
         empty_ds2 = read_stata(path)
         tm.assert_frame_equal(empty_ds, empty_ds2)
def feature_engineering(raw_data):
    input_data = raw_data[['Date','AdjClose','AdjVolume']].dropna()
    train_ratio = 0.8
    
    savedata= DataFrame(input_data)
    savedata.to_csv('/home/peng/workspace/datafortrainCao.csv', header=0)
    #===========================================================================
    # Vol_5 = index_cal().VOL_n(input_data, 5)
    # Vol_10 = index_cal().VOL_n(input_data, 10)
    # Vol_15 = index_cal().VOL_n(input_data, 15)
    # Vol_20 = index_cal().VOL_n(input_data, 20)
    # RDV_5 = index_cal().RDV_n(input_data, 5)
    # RDV_10 = index_cal().RDV_n(input_data, 10)
    # RDV_15 = index_cal().RDV_n(input_data, 15)
    # RDV_20 = index_cal().RDV_n(input_data, 20)
    #===========================================================================
    
    EMA15 = index_cal().EMAn(input_data, 15)
    RDP_5 = index_cal().RDP_n(input_data, 5)
    RDP_10 = index_cal().RDP_n(input_data, 10)
    RDP_15 = index_cal().RDP_n(input_data, 15)
    RDP_20 = index_cal().RDP_n(input_data, 20)
    RDP_plus_5 = index_cal().RDP_plus_n(input_data, 5)
    
    all_data = mergeColumnByDate(RDP_5,RDP_10,RDP_15,RDP_20,EMA15,RDP_plus_5)
    features = all_data[['RDP-5','RDP-10','RDP-15','RDP-20','EMA15']]
    features = PCA().fit_transform(features.values)
    (x_train, x_test) = divideTrainTest(features, train_ratio)
    objectives = all_data['RDP+5'].values
    (y_train,y_real) = divideTrainTest(objectives, train_ratio)
    
    return (x_train,y_train,x_test,y_real)
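
# Sketch (assumption, not from the original source): divideTrainTest is not
# defined in this snippet. A plausible minimal version for ordered time-series
# data simply cuts the array at the given ratio without shuffling; the name
# divide_train_test_sketch is hypothetical.
import numpy as np

def divide_train_test_sketch(values, train_ratio):
    """Split an array-like into a leading train part and a trailing test part."""
    values = np.asarray(values)
    cut = int(len(values) * train_ratio)
    return values[:cut], values[cut:]

x_tr, x_te = divide_train_test_sketch(np.arange(10), 0.8)  # 8 train rows, 2 test rows
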
Example #5
    def get_daily_normals(self, start_date = None, end_date = None, stamp_year = 2001):
        """
        :type start_date: datetime.datetime
        :type end_date: datetime.datetime
        :rtype : list , list
        """
        self.stamp_day_dates = pandas.DatetimeIndex(start = datetime(stamp_year,1,1), end = date(stamp_year, 12, 31),
            freq = pandas.datetools.offsets.Day())

        if start_date is None:
            start_date = self.time[0]

        if end_date is None:
            end_date = self.time[-1]


        di = pandas.DatetimeIndex(data = self.time)
        df = DataFrame(data = self.data, index = di, columns=["values",])


        df = df.select( lambda d: start_date <= d <= end_date )
        df_mean = df.groupby(by = lambda d: (d.day, d.month)).mean()


        return self.stamp_day_dates, df_mean.ix[[ (d.day, d.month) for d in self.stamp_day_dates] ,"values"]
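
# Illustration (assumption, not from the original source): the same daily
# climatology can be computed with a plain groupby over (month, day) of a
# DatetimeIndex; df.select and pandas.datetools used above are older APIs.
import numpy as np
import pandas as pd

dates = pd.date_range('2000-01-01', '2002-12-31', freq='D')
df = pd.DataFrame({'values': np.random.rand(len(dates))}, index=dates)
daily_normals = df.groupby([df.index.month, df.index.day]).mean()
print(daily_normals.head())
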
 def getList(self,week, club, colList, filename):
     s = pd.read_csv(filename)
     df2 = DataFrame(s)
     df3 = DataFrame(s)
     
     columns = df2.columns
     xlist = list()
     for c in columns:
         if c.upper().find("PRICE ADJUSTMENT") == -1:
             if  c.find(week) != -1:
                 xlist.append(str(c))
                     
     indexList = list()
     for xcolumn in xlist:
         colist = list()
         colist.append(xcolumn)
         df4 = DataFrame(df3, columns=colist)[~df3[xcolumn].isnull()]
         for row in df4.iterrows():
             if row[1][0] == club:
                 indexList.append(row[0])
     fin = DataFrame(df2, index=indexList, columns=colList)
     if fin.empty:
         return
     fin["Camp"] = club
     fin["Week"] = week
     return fin
Example #7
    def plot(self):
        """
            Plots two graphs: one for the N-period moving average with lower and upper bands,
            and one for P/L and position.
        """

        columns = {"Upper Bands": self.upper_bands,
                   "Lower Bands": self.lower_bands,
                   "Moving Means": self.moving_means,
                   "Opening Prices": self.prices}
        df = DataFrame(columns, index=self.dates)
        df.plot()

        fig = plt.figure(num=None, figsize=(18, 10), dpi=80, facecolor='w', edgecolor='k')
        fig.add_subplot(121)
        trans_dates = [tran.date for tran in self.transactions]
        # we negate the value here to show profit/loss
        trans = Series([-tran.value() for tran in self.transactions], index=trans_dates)
        position = Series([tran.units for tran in self.transactions], index=trans_dates)

        position.cumsum().plot(label="Position")
        plt.xlabel("Date")
        plt.ylabel("Position")
        plt.title("Position over Time")
        plt.legend(loc="best")

        fig.add_subplot(122)
        trans.cumsum().plot(label="P/L")
        plt.xlabel("Date")
        plt.ylabel("Profit/Loss")
        plt.title("Profit and Loss over Time")
        plt.legend(loc="best")

        plt.show()
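
# Sketch (assumption, not from the original source): the upper/lower bands and
# moving means plotted above are attributes of the strategy object; one common
# way to derive such series from prices is a rolling mean plus/minus two
# rolling standard deviations.
import numpy as np
import pandas as pd

prices = pd.Series(np.random.randn(100).cumsum() + 100,
                   index=pd.date_range('2020-01-01', periods=100))
moving_means = prices.rolling(window=20).mean()
stds = prices.rolling(window=20).std()
upper_bands = moving_means + 2 * stds
lower_bands = moving_means - 2 * stds
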
Example #8
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals])
    try:
        mindex._verify_integrity()
    except Exception:
        raise Exception("duplicate index/column pairs!")

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
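
# Illustration (not from the original source): what the public DataFrame.pivot
# referenced in the docstring does on a small long-format frame.
import pandas as pd

long_df = pd.DataFrame({'date': ['d1', 'd1', 'd2', 'd2'],
                        'ticker': ['A', 'B', 'A', 'B'],
                        'price': [1.0, 2.0, 3.0, 4.0]})
wide_df = long_df.pivot(index='date', columns='ticker', values='price')
print(wide_df)  # rows d1/d2, columns A/B
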
Example #9
def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, nw_lags, nobs, df, nw_overlap):
    from pandas.core.frame import group_agg

    xx_inv = math.inv(xx)

    yv = y.values

    if cluster_axis is None:
        if nw_lags is None:
            return xx_inv * (rmse ** 2)
        else:
            resid = yv - np.dot(x.values, beta)
            m = (x.values.T * resid).T

            xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap)

            return np.dot(xx_inv, np.dot(xeps, xx_inv))
    else:
        Xb = np.dot(x.values, beta).reshape((len(x.values), 1))
        resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=["resid"])

        if cluster_axis == 1:
            x = x.swaplevel(0, 1).sortlevel(0)
            resid = resid.swaplevel(0, 1).sortlevel(0)

        m = group_agg(x.values * resid.values, x.index._bounds, lambda x: np.sum(x, axis=0))

        if nw_lags is None:
            nw_lags = 0

        xox = 0
        for i in range(len(x.index.levels[0])):
            xox += math.newey_west(m[i : i + 1], nw_lags, nobs, df, nw_overlap)

        return np.dot(xx_inv, np.dot(xox, xx_inv))
Example #10
    def test_missing_value_generator(self):
        types = ('b','h','l')
        df = DataFrame([[0.0]],columns=['float_'])
        with tm.ensure_clean() as path:
            df.to_stata(path)
            with StataReader(path) as rdr:
                valid_range = rdr.VALID_RANGE
        expected_values = ['.' + chr(97 + i) for i in range(26)]
        expected_values.insert(0, '.')
        for t in types:
            offset = valid_range[t][1]
            for i in range(0,27):
                val = StataMissingValue(offset+1+i)
                self.assertTrue(val.string == expected_values[i])

        # Test extremes for floats
        val = StataMissingValue(struct.unpack('<f',b'\x00\x00\x00\x7f')[0])
        self.assertTrue(val.string == '.')
        val = StataMissingValue(struct.unpack('<f',b'\x00\xd0\x00\x7f')[0])
        self.assertTrue(val.string == '.z')

        # Test extremes for doubles
        val = StataMissingValue(struct.unpack('<d',b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
        self.assertTrue(val.string == '.')
        val = StataMissingValue(struct.unpack('<d',b'\x00\x00\x00\x00\x00\x1a\xe0\x7f')[0])
        self.assertTrue(val.string == '.z')
Example #11
    def export_converted_values(self):
        """
        This function is called initially to convert per-100g values to per serving values
        Once this function is invoked, new file is generated which serves as Database
        This function will need to be called only one time
        :return:
        """
        file_converted = self.file_converted_values
        data_file = self.file_database
        data = self.read_csv(data_file)
        converted_data = list()
        import math

        for item in data.values:
            converted_list = list(item[0:2])
            sub_item = item[2:50]
            for nutrient in sub_item:
                if math.isnan(nutrient):
                    nutrient = 0
                converted_list.append(nutrient * sub_item[47] / 100)
            converted_list.append(item[50])
            converted_data.append(converted_list)
        if len(self.cols) == 0:
            for col_name in list(data._info_axis._data):
                self.cols.append(col_name)
        df = DataFrame(data=converted_data, columns=self.cols)
        df.to_csv(file_converted, index=False)
        print 'File has been exported'
Example #12
    def get_result(self):
        if self._is_series:
            if self.axis == 0:
                new_data = com._concat_compat([x.get_values() for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat')
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                if columns is not None:
                    tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat')
    def test_read_write_dta12(self):
        original = DataFrame(
            [(1, 2, 3, 4, 5, 6)],
            columns=[
                "astringwithmorethan32characters_1",
                "astringwithmorethan32characters_2",
                "+",
                "-",
                "short",
                "delete",
            ],
        )
        formatted = DataFrame(
            [(1, 2, 3, 4, 5, 6)],
            columns=[
                "astringwithmorethan32characters_",
                "_0astringwithmorethan32character",
                "_",
                "_1_",
                "_short",
                "_delete",
            ],
        )
        formatted.index.name = "index"
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                original.to_stata(path, None)
                tm.assert_equal(len(w), 1)  # should get a warning for that format.

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
Example #14
    def _wrap_aggregated_output(self, output, mask, comp_ids):
        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        if len(output) == len(agg_labels):
            output_keys = agg_labels
        else:
            output_keys = list(output)
            try:
                output_keys.sort()
            except Exception:  # pragma: no cover
                pass

            if isinstance(agg_labels, MultiIndex):
                output_keys = MultiIndex.from_tuples(output_keys,
                                                     names=agg_labels.names)

        if not self.as_index:
            result = DataFrame(output, columns=output_keys)
            group_levels = self._get_group_levels(mask, comp_ids)
            for i, (name, labels) in enumerate(group_levels):
                result.insert(i, name, labels)
            result = result.consolidate()
        else:
            index = self._get_multi_index(mask, comp_ids)
            result = DataFrame(output, index=index, columns=output_keys)

        if self.axis == 1:
            result = result.T

        return result
Example #15
    def test_missing_value_generator(self):
        types = ("b", "h", "l")
        df = DataFrame([[0.0]], columns=["float_"])
        with tm.ensure_clean() as path:
            df.to_stata(path)
            with StataReader(path) as rdr:
                valid_range = rdr.VALID_RANGE
        expected_values = ["." + chr(97 + i) for i in range(26)]
        expected_values.insert(0, ".")
        for t in types:
            offset = valid_range[t][1]
            for i in range(0, 27):
                val = StataMissingValue(offset + 1 + i)
                self.assertTrue(val.string == expected_values[i])

        # Test extremes for floats
        val = StataMissingValue(struct.unpack("<f", b"\x00\x00\x00\x7f")[0])
        self.assertTrue(val.string == ".")
        val = StataMissingValue(struct.unpack("<f", b"\x00\xd0\x00\x7f")[0])
        self.assertTrue(val.string == ".z")

        # Test extremes for doubles
        val = StataMissingValue(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0])
        self.assertTrue(val.string == ".")
        val = StataMissingValue(struct.unpack("<d", b"\x00\x00\x00\x00\x00\x1a\xe0\x7f")[0])
        self.assertTrue(val.string == ".z")
Example #16
def stack_sparse_frame(frame):
    """
    Only makes sense when fill_value is NaN
    """
    lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)]
    nobs = sum(lengths)

    # this is pretty fast
    minor_labels = np.repeat(np.arange(len(frame.columns)), lengths)

    inds_to_concat = []
    vals_to_concat = []
    # TODO: Figure out whether this can be reached.
    # I think this currently can't be reached because you can't build a SparseDataFrame
    # with a non-np.NaN fill value (fails earlier).
    for _, series in compat.iteritems(frame):
        if not np.isnan(series.fill_value):
            raise TypeError('This routine assumes NaN fill value')

        int_index = series.sp_index.to_int_index()
        inds_to_concat.append(int_index.indices)
        vals_to_concat.append(series.sp_values)

    major_labels = np.concatenate(inds_to_concat)
    stacked_values = np.concatenate(vals_to_concat)
    index = MultiIndex(levels=[frame.index, frame.columns],
                       labels=[major_labels, minor_labels],
                       verify_integrity=False)

    lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
                   columns=['foo'])
    return lp.sortlevel(level=0)
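
# Illustration (not from the original source): the dense analogue of
# stack_sparse_frame is DataFrame.stack, which likewise drops missing values
# and returns a Series indexed by (row, column).
import numpy as np
import pandas as pd

frame = pd.DataFrame({'x': [1.0, np.nan], 'y': [np.nan, 4.0]}, index=['r1', 'r2'])
stacked = frame.stack()   # MultiIndex (row, column) -> value, NaNs omitted
print(stacked)
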
Example #17
def stack_sparse_frame(frame):
    """
    Only makes sense when fill_value is NaN
    """
    lengths = [s.sp_index.npoints for _, s in frame.iteritems()]
    nobs = sum(lengths)

    # this is pretty fast
    minor_labels = np.repeat(np.arange(len(frame.columns)), lengths)

    inds_to_concat = []
    vals_to_concat = []
    for _, series in frame.iteritems():
        if not np.isnan(series.fill_value):
            raise Exception('This routine assumes NaN fill value')

        int_index = series.sp_index.to_int_index()
        inds_to_concat.append(int_index.indices)
        vals_to_concat.append(series.sp_values)

    major_labels = np.concatenate(inds_to_concat)
    stacked_values = np.concatenate(vals_to_concat)
    index = MultiIndex(levels=[frame.index, frame.columns],
                       labels=[major_labels, minor_labels])

    lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
                   columns=['foo'])
    return lp.sortlevel(level=0)
Example #18
    def test_read_write_dta12(self):
        original = DataFrame([(1, 2, 3, 4, 5, 6)],
                             columns=['astringwithmorethan32characters_1',
                                      'astringwithmorethan32characters_2',
                                      '+',
                                      '-',
                                      'short',
                                      'delete'])
        formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
                              columns=['astringwithmorethan32characters_',
                                       '_0astringwithmorethan32character',
                                       '_',
                                       '_1_',
                                       '_short',
                                       '_delete'])
        formatted.index.name = 'index'
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                original.to_stata(path, None)
                tm.assert_equal(len(w), 1)  # should get a warning for that format.

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
Example #19
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals],
                                    names=[index, columns])

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
Example #20
    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
        if len(keys) == 0:
            return Series([])

        key_names = [ping.name for ping in self.groupings]

        if isinstance(values[0], Series):
            if not_indexed_same:
                data_dict = dict(zip(keys, values))
                result = DataFrame(data_dict).T
                if len(self.groupings) > 1:
                    result.index = MultiIndex.from_tuples(keys, names=key_names)
                return result
            else:
                cat_values = np.concatenate([x.values for x in values])
                cat_index = values[0].index
                if len(values) > 1:
                    cat_index = cat_index.append([x.index for x in values[1:]])
                return Series(cat_values, index=cat_index)
        elif isinstance(values[0], DataFrame):
            # possible that Series -> DataFrame by applied function
            return self._wrap_frames(keys, values,
                                     not_indexed_same=not_indexed_same)
        else:
            if len(self.groupings) > 1:
                index = MultiIndex.from_tuples(keys, names=key_names)
                return Series(values, index)
            else:
                return Series(values, keys)
 def testEnsembleForecastWeightCombinesForecasts(self):
     result = self.weight(self.forecasts)
     self.assertIsInstance(result, Forecast)
     for i in [0, 1, 5, 10, 19]:
         expected = [fcst.mean.iloc[i] for fcst in self.forecasts]
         expected = DataFrame(expected)
         self.assertTrue(result.mean.iloc[i].equals(expected.mean()))
 def getCamps(self,week):
     s = pd.read_csv("D:/UTDallasStudy/summer.csv")
     df2 = DataFrame(s)
     df3 = DataFrame(s)
     
     columns = df2.columns
     x = set()
     xlist = list()
     ylist = list()
     for c in columns:
         if c.upper().find("PRICE ADJUSTMENT") == -1:
             if  c.find(week) != -1:
                 x.add(re.split(r"\.", str(c))[0])
                 ylist.append(c)
             else:
                 x.add(str(c))
                 
     for x1 in x: xlist.append(x1)
     xlist.sort()
     campset = set()
     for y in ylist:
         clist = list()
         clist.append(y)
         res = DataFrame(df3, columns=clist)[~df3[y].isnull()]
         for row in res.iterrows():
             campName = row[1][0]
             campset.add(campName)
     camplist = list()
     for camp in campset: camplist.append(camp)
     camplist.sort()
     return camplist
Example #23
        def generate_input_df(self, n_topics, vocab_size, document_length, n_docs, 
                              previous_vocab=None, vocab_prefix=None, 
                              df_outfile=None, vocab_outfile=None, 
                              n_bags=1):
                        
            print "Generating input DF"
                        
            # word_dists is the topic x document_length matrix
            word_dists = self.generate_word_dists(n_topics, vocab_size, document_length)                        
            
            # generate each document x terms vector
            docs = np.zeros((vocab_size, n_docs), dtype=int64)
            for i in range(n_docs):
                docs[:, i] = self.generate_document(word_dists, n_topics, vocab_size, document_length)
                
            if previous_vocab is not None:
                width = vocab_size/n_topics
                high = int(document_length / width)                
                # randomly initialises the previous_vocab part
                additional = np.random.randint(high, size=(len(previous_vocab), n_docs))
                docs = np.vstack((additional, docs))
                
            df = DataFrame(docs)
            df = df.transpose()
            print df.shape            
            if self.make_plot:            
                self._plot_nicely(df, 'Documents X Terms', 'Terms', 'Docs')
            
            if df_outfile is not None:
                df.to_csv(df_outfile)        

            print "Generating vocabularies"
            
            # initialises vocab to either previous vocab or a blank list
            if previous_vocab is not None:
                vocab = previous_vocab.tolist()
            else:
                vocab = []

            # add new words
            for n in range(vocab_size):
                if vocab_prefix is None:
                    word = "word_" + str(n)
                else:
                    word = vocab_prefix + "_word_" + str(n)
                # if more than one bag, then initialise word type too
                if n_bags > 1:
                    word_type = np.random.randint(n_bags)
                    tup = (word, word_type)
                    vocab.append(tup)
                else:
                    vocab.append(word)
            
            # save to txt
            vocab = np.array(vocab)
            if vocab_outfile is not None:
                np.savetxt(vocab_outfile, vocab, fmt='%s')
            
            return df, vocab
Example #24
    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                new_data = com._concat_compat([x._values for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return (Series(new_data, index=self.new_axes[0],
                               name=name,
                               dtype=new_data.dtype)
                        .__finalize__(self, method='concat'))

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                # Check whether the columns variable already holds valid column
                # names (i.e. they were set via the 'keys' argument of the
                # 'concat' call); if not, fall back to the Series names as
                # column names.
                if (columns.equals(Index(np.arange(len(self.objs)))) and
                        not self.ignore_index):
                    columns = np.array([data[i].name
                                        for i in range(len(data))],
                                       dtype='object')
                    indexer = isnull(columns)
                    if indexer.any():
                        columns[indexer] = np.arange(len(indexer[indexer]))
                tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes,
                concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return (self.objs[0]._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))
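
# Illustration (not from the original source): the two Series branches handled
# above correspond to the public pd.concat behaviour along axis 0 (stacking)
# and axis 1 (combining as columns of a DataFrame).
import pandas as pd

s1 = pd.Series([1, 2], name='a')
s2 = pd.Series([3, 4], name='b')
print(pd.concat([s1, s2]))           # one longer Series
print(pd.concat([s1, s2], axis=1))   # DataFrame with columns 'a' and 'b'
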
 def test_write_missing_strings(self):
     original = DataFrame([["1"], [None]], columns=["foo"])
     expected = DataFrame([["1"], [""]], columns=["foo"])
     expected.index.name = "index"
     with tm.ensure_clean() as path:
         original.to_stata(path)
         written_and_read_again = self.read_dta(path)
         tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
 def test_no_index(self):
     columns = ["x", "y"]
     original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
     original.index.name = "index_not_written"
     with tm.ensure_clean() as path:
         original.to_stata(path, write_index=False)
         written_and_read_again = self.read_dta(path)
         tm.assertRaises(KeyError, lambda: written_and_read_again["index_not_written"])
class XLSDataFrameWriter(object):
    def __init__(self, records, columns):
        self.dataframe = DataFrame(records, columns=columns)

    def write_to_excel(self, excel_writer, sheet_name, header=False,
                       index=False):
        self.dataframe.to_excel(excel_writer, sheet_name, header=header,
                index=index)
Example #28
 def test_column_order_plus_index(self):
     query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
     col_order = ['STRING_3', 'STRING_2']
     result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col='STRING_1', col_order=col_order)
     correct_frame = DataFrame({'STRING_1' : ['a'], 'STRING_2' : ['b'], 'STRING_3' : ['c']})
     correct_frame.set_index('STRING_1', inplace=True)
     correct_frame = correct_frame[col_order]
     tm.assert_frame_equal(result_frame, correct_frame)
Example #29
 def test_column_order_plus_index(self):
     query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3"
     col_order = ["STRING_3", "STRING_2"]
     result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col="STRING_1", col_order=col_order)
     correct_frame = DataFrame({"STRING_1": ["a"], "STRING_2": ["b"], "STRING_3": ["c"]})
     correct_frame.set_index("STRING_1", inplace=True)
     correct_frame = correct_frame[col_order]
     tm.assert_frame_equal(result_frame, correct_frame)
Example #30
 def test_excessively_long_string(self):
     str_lens = (1, 244, 500)
     s = {}
     for str_len in str_lens:
         s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
     original = DataFrame(s)
     with tm.assertRaises(ValueError):
         with tm.ensure_clean() as path:
             original.to_stata(path)
def rpy2py_dataframe(obj):
    items = OrderedDict((k, rpy2py(v) if isinstance(v, Sexp) else v)
                        for k, v in obj.items())
    res = PandasDataFrame.from_dict(items)
    res.index = obj.rownames
    return res
Example #32
    def makeNewsDataCsv(cls,
                        cur=None,
                        start_date=None,
                        end_date=None,
                        basic_path=None,
                        word_trend_file=None,
                        news_file=None,
                        output_file=None,
                        stock_id=None):
        if cur == None or start_date == None or end_date == None or word_trend_file is None or output_file == None or stock_id == None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        news_path = os.path.join(basic_path, news_file)
        word_trend_path = os.path.join(basic_path, word_trend_file)
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ] + ["news_pos_num", "news_neg_num"]
        data = {}
        for k in columns:
            data[k] = []
        pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

        word_trend = {}
        word_trend_temp = pd.read_csv(word_trend_path)
        for k in word_trend_temp["0"].keys():
            word_trend[word_trend_temp["0"][k]] = [
                word_trend_temp["1"][k], word_trend_temp["2"][k]
            ]
        p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] +
                                               word_trend['total_words'][1])
        p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] +
                                                 word_trend['total_words'][1])

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]

        skip = 100
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else
                   slimit - 1, skip if slimit - 1 < 0 else skip + 1))
            slimit += skip
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([
                    int(h[0]),
                    float(h[1]),
                    float(h[2]),
                    float(h[3]),
                    float(h[4]),
                    float(h[5]),
                    float(h[6]),
                    float(h[7]),
                    float(h[8]),
                    str(h[9])
                ])
            del history_tt

            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'opening': history_temp[1],
                'closing': history_temp[2],
                'difference': history_temp[3],
                'percentage_difference': history_temp[4],
                'lowest': history_temp[5],
                'highest': history_temp[6],
                'volume': history_temp[7],
                'amount': history_temp[8],
                'date': history_temp[9]
            }
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%, keep two decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            '''
            '''
            sdate = str(history['date'][history['date'].keys()[0]])
            edate = str(history['date'][history['date'].keys()[-1]])
            # sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d')
            # sdate = (sdate - datetime.timedelta(days=0)).strftime('%Y-%m-%d')
            cur.execute(
                "SELECT GROUP_CONCAT(id  SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time"
                % (sdate, edate))
            news_temp = cur.fetchall()
            news_by_date = {}
            news_by_id = {}
            for n in news_temp:
                news_by_date[str(n[1])] = n[0].split(",")
                for nid in news_by_date[str(n[1])]:
                    news_by_id[nid] = None
            del news_temp

            nid_len = len(news_by_id)
            reader = pd.read_csv(news_path, chunksize=1000)
            for sentences in reader:
                if nid_len > 0:
                    for k in sentences['1'].keys():
                        nid = str(sentences['0'][k])
                        if nid in news_by_id and news_by_id[nid] == None:
                            news_by_id[nid] = str(sentences['1'][k]).split(" ")
                            wp_up = p_up
                            wp_down = p_down
                            for w in news_by_id[nid]:
                                if w not in word_trend:
                                    wp_up *= (1 / word_trend['total_words'][0])
                                    wp_down *= (1 /
                                                word_trend['total_words'][1])
                                else:
                                    if word_trend[w][0] > 0:
                                        wp_up *= word_trend[w][0]
                                    else:
                                        wp_up *= (1 /
                                                  word_trend['total_words'][0])

                                    if word_trend[w][1] > 0:
                                        wp_down *= word_trend[w][1]
                                    else:
                                        wp_down *= (
                                            1 / word_trend['total_words'][1])
                                while True:
                                    if wp_up < 1 and wp_down < 1:
                                        wp_up *= 10
                                        wp_down *= 10
                                    else:
                                        break

                            news_by_id[nid] = [
                                wp_up / (wp_up + wp_down),
                                -1 * wp_down / (wp_up + wp_down)
                            ]
                            nid_len -= 1
                            if nid_len <= 0:
                                break
                else:
                    break
            reader.close()
            del reader, sentences

            for d in news_by_date:
                sumn = [0, 0]
                for nid in news_by_date[d]:
                    sumn[0] += news_by_id[nid][0]
                    sumn[1] += news_by_id[nid][1]
                le = len(news_by_date[d])
                if le > 0:
                    sumn[0] /= le
                    sumn[1] /= le
                news_by_date[d] = sumn
                print(d)

            history['news_pos_num'] = 0
            history['news_neg_num'] = 0
            for i in history.index:
                history.loc[i, 'rate'] = str(
                    np.round(float(history['rate'][i]), 2))
                if str(history['date'][i]) in news_by_date:
                    history.loc[i, 'news_pos_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][0]),
                            2))
                    history.loc[i, 'news_neg_num'] = str(
                        np.round(
                            float(news_by_date[str(history['date'][i])][1]),
                            2))
                else:
                    history.loc[i, 'news_pos_num'] = "0"
                    history.loc[i, 'news_neg_num'] = "0"

            # Convert the normalized data into a form the training and test sets can accept
            def func_train_data(data_stock):
                if cls.groupby_skip == False:
                    cls.groupby_skip = True
                    return None
                print("正在处理的股票代码:%06s" % data_stock.name)

                data = {}
                for k in columns:
                    data[k] = []
                for i in range(len(data_stock) - 1):
                    for k in data:
                        data[k].append(data_stock.iloc[i][k])
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          columns=columns)

            g_stock = history.groupby(by=["stock_id"])
            # Clear the file at the output path and initialize the column names
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
    if lista_nombres[i][-2] == lista_nombres[i][1]:
        n2 = ""

    medico = clases.Medico(n1, n2, ap1, ap2, lista_ruts[i], lista_edad[i],
                           lista_emails[i], lista_numero[i],
                           lista_especialidad[i])

    lista_medicos.append(medico)
clinica_objeto = clases.Clinica("Clinica de la Salud", "Público",
                                "Avenida Verdadera #123, Rancagua", "",
                                lista_medicos, lista_pacientes)

lista_citas = []
cita_vacia = clases.Cita("", "", "", "")
cita_csv = pd.read_csv('./datos/Citas.csv')
cita_csv = DataFrame(cita_csv)
codigo = cita_csv["codigo"].values
rut_paciente = cita_csv["rut paciente"].values
rut_medico = cita_csv["rut medico"].values
fecha_citada = cita_csv["fecha citada"].values
fecha_creacion = cita_csv["fecha de creacion"].values
modalidad = cita_csv["modalidad"].values
prestacion = cita_csv["prestacion"].values
confirmada = cita_csv["confirmada"].values
tiempo_restante = cita_csv["tiempo restante"].values

for i in range(len(codigo)):
    cita_vacia.setCodigo(codigo[i])
    cita_vacia.setPaciente(clinica_objeto.buscarPaciente(rut_paciente[i])[0])
    cita_vacia.setMedico(clinica_objeto.buscarMedico(rut_medico[i])[0])
    cita_vacia.setFechaCitada(parser.parse(fecha_citada[i]))
Example #34
model.fit(x=X_train,
          y=y_train,
          epochs=3,
          batch_size=128,
          verbose=2,
          validation_split=0.1)
# Predict
y_predict = model.predict(X_test)
# Convert the prediction results
y_predict_label = label2tag(predictions=y_predict, y=y)
# Compute the accuracy
Y_test = label2tag(predictions=y_test, y=y)
print(
    sum([y_predict_label[i] == Y_test[i]
         for i in range(len(y_predict))]) / len(y_predict))

# Load another test set for prediction and export the results
filename = 'xiaomi5a.csv'
test_data = pd.read_csv(filename)
x = test_data['comment']
X_cut = cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
X_seq = text2seq(texts_cut=X_cut, maxlen=maxlen, tokenizer=tokenizer)
X_seq = np.array(X_seq)
y_predict = model.predict(X_seq)
y_predict_label = label2tag(predictions=y_predict, y=y)
# Convert the Series to a DataFrame
out_x = x.to_frame(name=None)
out_y = DataFrame(y_predict_label)
out_x.to_csv('x.csv')
out_y.to_csv('y.csv')
#model
starttime = datetime.datetime.now()  #Calculate time
sample_model = KMeans(n_clusters=10).fit(images_train_sample)  #K-Means
endtime = datetime.datetime.now()  #Calculate time
scikit_learn_execution_time = (endtime - starttime).seconds
print('scikit-learn execution time:',
      scikit_learn_execution_time)  #Calculate time 429s

#objective function value
cluster = sample_model.labels_
objective_function_value = sample_model.inertia_  #394810072745.4526
print('scikit-learn objective function value:', objective_function_value)

#accuracy
crosstable_data = {'label': labels_train_sample, 'cluster': list(cluster)}
df = DataFrame(crosstable_data)
crosstable = pd.crosstab(index=df['label'], columns=df['cluster'])
scikit_accuracy = sum(crosstable.max(axis=0)) / sum(crosstable.sum())  #0.22124
print('scikit-learn accuracy:', scikit_accuracy)

#PART3 my kmeans
######
#####
####
###
##
#

#model
starttime_2 = datetime.datetime.now()  #Calculate start time
cluster_center, cluster_assign = Kmeans(array(images_train_sample), 10)
Example #36
 def test_stata_doc_examples(self):
     with tm.ensure_clean() as path:
         df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
         df.to_stata(path)
Example #37
target = cl.astype('int')
print (target)
# Split into training and test data
train_X, test_X, train_y, test_y = train_test_split(data, target, train_size = 0.9, random_state = 42)
print (train_y)

# Build the classifier
clf = neighbors.KNeighborsClassifier(n_neighbors = 25)
data_clf = clf.fit(train_X, train_y)

# Predict
test_y_predicted = data_clf.predict(test_X)
"""print (test_y_predicted)

# Ground-truth answers
print (test_y)"""

# Performance metrics
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print "accuracy : ", accuracy
precision = metrics.precision_score(test_y, test_y_predicted, average='macro')
print "precision : ", precision
recall = metrics.recall_score(test_y, test_y_predicted, average='macro')
print "recall : ", recall
f_measure = 2 * (precision * recall / (precision + recall))
print "f_measure : ", f_measure

output = {'click' : test_y_predicted}
output = DataFrame(output)
output.to_csv('output.csv', sep=',', index = 0)
Example #38
def predict_role(ps):
    fd = pd.read_csv('player_label.csv')

    df_obj = fd.label
    fd.label = df_obj.apply(lambda x: str(x).strip())
    print(fd.label)
    test_set = fd[['label']]
    train_set = fd[[
        'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing',
        'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
        'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration',
        'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
        'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
        'interceptions', 'positioning', 'vision', 'penalties', 'marking',
        'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling',
        'gk_kicking', 'gk_positioning', 'gk_reflexes'
    ]]
    train_set = train_set[1:]
    test_set = test_set[1:]

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(train_set,
                                                        test_set,
                                                        test_size=0.33,
                                                        random_state=12)

    from sklearn.naive_bayes import MultinomialNB
    clf_NB = MultinomialNB().fit(x_train, y_train)
    predicted = clf_NB.predict(x_test)
    import numpy as np
    from sklearn import metrics
    print("#################### NB ######################")
    confusion_matrix_NB = metrics.confusion_matrix(y_test, predicted)
    print(confusion_matrix_NB)
    accuracy_NB = metrics.accuracy_score(y_test, predicted)
    print(accuracy_NB)
    #	print(metrics.classification_report(y_test, predicted))

    print("##############################################")

    from sklearn import tree
    clf_tree = tree.DecisionTreeClassifier().fit(x_train, y_train)
    predicted = clf_tree.predict(x_test)
    print("#################### Decision Tree ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_DT = metrics.accuracy_score(y_test, predicted)
    print(accuracy_DT)
    #	print(metrics.classification_report(y_test, predicted))
    print("##############################################")

    from sklearn.linear_model import SGDClassifier
    clf_SGD = SGDClassifier().fit(x_train, y_train)
    predicted = clf_SGD.predict(x_test)
    print("#################### SGD Classifier ######################")
    print(metrics.confusion_matrix(y_test, predicted))
    accuracy_SGD = metrics.accuracy_score(y_test, predicted)
    print(accuracy_SGD)
    print("##############################################")

    from pandas.core.frame import DataFrame
    predict_data = DataFrame(ps)

    print(
        "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------"
    )
    print(predict_data)
    print(
        "----------------------&&&&&&&&&&&&&&&&&&&&&&&&&&&&&-----------------------"
    )

    predict_data = predict_data.iloc[:, 7:]
    print(
        "---------------------- become 38  ----------------------------------------"
    )
    print(predict_data)
    print(
        "---------------------- become 38  ----------------------------------------"
    )

    accuracy_list = [accuracy_NB, accuracy_DT, accuracy_SGD]
    if max(accuracy_list) == accuracy_NB:
        clf_model = clf_NB
    elif max(accuracy_list) == accuracy_DT:
        clf_model = clf_tree
    elif max(accuracy_list) == accuracy_SGD:
        clf_model = clf_SGD
    predicted = clf_model.predict(predict_data)
    print("************* model selection ****************")
    print(clf_model)
    pd.value_counts(predicted)
    print(predicted)
    print(type(predicted))
    return predicted.tolist()
Example #39
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 default_kind=None,
                 default_fill_value=None,
                 dtype=None,
                 copy=False):

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data,
                                      index,
                                      columns,
                                      dtype=dtype,
                                      fill_value=default_fill_value)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            mgr = self._init_dict(data.to_frame(),
                                  data.index,
                                  columns=None,
                                  dtype=dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data,
                                 axes=dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif data is None:
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan,
                                          index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   'for data argument')
            raise TypeError(msg.format(data_type=type(data).__name__))

        generic.NDFrame.__init__(self, mgr)
Example #40
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False,
                    dtype=None):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=dtype)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=dtype)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
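
# Illustration (not from the original source): the public entry point for
# _get_dummies_1d is pd.get_dummies; a small dense example.
import pandas as pd

s = pd.Series(['red', 'blue', 'red', None])
print(pd.get_dummies(s))                                  # the NaN row is all zeros
print(pd.get_dummies(s, dummy_na=True, drop_first=True))  # NaN gets its own column, first level dropped
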
Example #41
 def get_empty_frame(data) -> DataFrame:
     if isinstance(data, Series):
         index = data.index
     else:
         index = np.arange(len(data))
     return DataFrame(index=index)
Example #42
    def makeBindexDataCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          basic_path=None,
                          output_file=None,
                          word_count=20,
                          stock_id=None,
                          ranking_type='tfidf'):
        if cur == None or start_date == None or end_date == None or output_file == None or stock_id == None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        if word_count < 0:
            word_count = 20
        if ranking_type not in ["tfidf", "textrank"]:
            ranking_type = "tfidf"
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        # pass the validated word_count through instead of a hard-coded 20
        words = cls.getImportVocab(cur, count=word_count, ranking_type=ranking_type)
        word_count = len(words)
        for i in range(len(words)):
            words[i] = "'" + words[i] + "'"
        words_str = ",".join(words)
        del words

        word_key_list = []
        for i in range(1, word_count + 1):
            word_key_list.append("word%s" % i)
        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ] + word_key_list
        data = {}
        for k in columns:
            data[k] = []
        pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

        cur.execute(
            "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        count = cur.fetchall()
        count = count[0][0]

        skip = 50
        slimit = 0
        while slimit < count:
            cur.execute(
                "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d "
                % (stock_id, start_date, end_date, 0 if slimit - 1 < 0 else
                   slimit - 1, skip if slimit - 1 < 0 else skip + 1))
            slimit += skip
            history_tt = cur.fetchall()
            history_t = []
            for h in history_tt:
                history_t.append([
                    int(h[0]),
                    float(h[1]),
                    float(h[2]),
                    float(h[3]),
                    float(h[4]),
                    float(h[5]),
                    float(h[6]),
                    float(h[7]),
                    float(h[8]),
                    str(h[9])
                ])
            del history_tt

            sdate = str(history_t[0][9])
            edate = str(history_t[-1][9])
            sdate = datetime.datetime.strptime(sdate, '%Y-%m-%d')
            sdate = (sdate - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            cur.execute(
                "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc"
                % (words_str, sdate, edate))
            bindex = cur.fetchall()
            bindex_t = []
            bindex_vec = 0
            cur_date = None
            if len(bindex) > 0:
                cur_date = str(bindex[0][2])
            bix = []
            bix_item = [cur_date]
            if len(bindex) > 0:
                for bi in bindex:
                    if str(bi[2]) != cur_date:
                        cur_date = str(bi[2])
                        bix.append(bix_item)
                        bix_item = [cur_date]
                    bix_temp = json.loads(bi[1])
                    bix_item.append(bix_temp['all']['0'])
                bix.append(bix_item)
            del bindex

            bindex = {}
            for k in range(1, len(bix)):
                b_t = []
                for kk in range(1, len(bix[k])):
                    if int(bix[k][kk]) != 0 and int(bix[k - 1][kk]) != 0:
                        b_t.append(
                            str(
                                np.round(
                                    float(100 * (int(bix[k][kk]) /
                                                 int(bix[k - 1][kk]) - 1)),
                                    2)))
                    else:
                        b_t.append(str(0.01))
                bindex[bix[k][0]] = b_t
            del bix

            for i in range(len(history_t)):
                history_t[i] += bindex[history_t[i][9]]
            history_temp = []
            for h in zip(*history_t):
                history_temp.append(h)
            history = {
                'stock_id': history_temp[0],
                'opening': history_temp[1],
                'closing': history_temp[2],
                'difference': history_temp[3],
                'percentage_difference': history_temp[4],
                'lowest': history_temp[5],
                'highest': history_temp[6],
                'volume': history_temp[7],
                'amount': history_temp[8],
                'date': history_temp[9]
            }
            for i in range(10, 10 + word_count):
                history["word%s" % (i - 9)] = history_temp[i]
            del history_t, history_temp
            history = DataFrame(history)
            g_history = history.groupby(by=['stock_id'])
            # 0.01 -> 1%; keep 2 decimal places
            history['rate'] = 100 * (g_history.shift(0)["closing"] /
                                     g_history.shift(1)["closing"] - 1)
            history.dropna(axis=0,
                           how='any',
                           thresh=None,
                           subset=None,
                           inplace=True)
            for i in history.index:
                history.loc[i, 'rate'] = str(
                    np.round(float(history['rate'][i]), 2))

            # Reshape the processed rows into the form the training and test sets expect
            def func_train_data(data_stock):
                if not cls.groupby_skip:
                    # skip the extra first call that groupby.apply can make on the first group
                    cls.groupby_skip = True
                    return None
                print("Processing stock code: %06s" % data_stock.name)

                data = {}
                for k in columns:
                    data[k] = []
                for i in range(len(data_stock) - 1):
                    for k in data:
                        data[k].append(data_stock.iloc[i][k])
                pd.DataFrame(data).to_csv(output_path,
                                          index=False,
                                          header=False,
                                          mode="a",
                                          columns=columns)

            g_stock = history.groupby(by=["stock_id"])
            # Reset the skip flag; the output CSV and its column header were already written above
            cls.groupby_skip = False
            g_stock.apply(func_train_data)
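
# Standalone sketch (made-up values) of the day-over-day percent-change step applied to
# the Baidu index rows above: each word column becomes 100 * (today / yesterday - 1),
# rounded to 2 decimals, with a 0.01 placeholder whenever either day is zero.
import numpy as np

bix = [["2020-01-01", 100, 50],   # first row is only used as the baseline
       ["2020-01-02", 110, 0],
       ["2020-01-03", 121, 60]]
bindex = {}
for k in range(1, len(bix)):
    b_t = []
    for kk in range(1, len(bix[k])):
        if int(bix[k][kk]) != 0 and int(bix[k - 1][kk]) != 0:
            b_t.append(str(np.round(float(100 * (int(bix[k][kk]) / int(bix[k - 1][kk]) - 1)), 2)))
        else:
            b_t.append(str(0.01))
    bindex[bix[k][0]] = b_t
print(bindex)  # {'2020-01-02': ['10.0', '0.01'], '2020-01-03': ['10.0', '0.01']}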
Beispiel #43
0
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = [f"{prefix}{prefix_sep}{level}" for level in levels]

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
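
# Hedged sketch of the sparse branch via the public API: with sparse=True the dummy
# columns are backed by SparseArray, so the all-zero entries stay compressed. The
# concrete dtype strings (e.g. Sparse[uint8, 0] vs Sparse[bool, False]) vary by
# pandas version and are an assumption here.
import pandas as pd

s = pd.Series(["a", "b", "a", None])
print(pd.get_dummies(s).dtypes)               # dense indicator columns
print(pd.get_dummies(s, sparse=True).dtypes)  # sparse-backed indicator columns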
Beispiel #44
0
                 start_days)
    else:
        fitCurve(US_data[start_days:], title, 0, fill_dates, start_days)

    pass


confirmed_cases_since_Jan_22 = [
    1, 1, 2, 2, 5, 5, 5, 5, 5, 7, 8, 8, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
    13, 13, 13, 13, 13, 13, 13, 13, 15, 15, 15, 51, 51, 57, 58, 60, 68, 74, 98,
    118, 149, 217, 262, 402, 518, 583, 959, 1281, 1663, 2179, 2727, 3499, 4632,
    6421, 7783, 13677, 19100, 25489, 33276, 43847, 53740, 65778, 83836, 101657,
    121478, 140886, 161807, 188172, 213372, 243453, 275586, 308850, 337072,
    366614, 396263
]  # daily values through April 7, 2020
US_data = DataFrame({0: confirmed_cases_since_Jan_22})
ori_len = len(US_data)

start_days = 0
last_date = datetime.datetime(2020, 1,
                              22) + datetime.timedelta(days=(len(US_data) - 1))
last_date_str = last_date.strftime('%Y-%m-%d')

plotConfirmedCases(US_data, start_days, last_date_str)

fill_dates = 0
ratio = 1.0
US_data = fillData(US_data, fill_dates, ratio)
title = "US_best_scenario_pred"
go(start_days, fill_dates, ratio, US_data, title)
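
# Quick consistency check (assuming the series above is daily and gap-free): the list
# has 77 entries, so counting from 2020-01-22 the last observation lands on 2020-04-07,
# matching the "April 7" note and the last_date computed above.
import datetime

n = 77  # len(confirmed_cases_since_Jan_22)
print((datetime.datetime(2020, 1, 22) + datetime.timedelta(days=n - 1)).strftime('%Y-%m-%d'))  # 2020-04-07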
Beispiel #45
0
    def makeOriginDataCsv(cls,
                          cur=None,
                          start_date=None,
                          end_date=None,
                          basic_path=None,
                          output_file=None,
                          stock_id=None):
        # Initialize the source and output file paths
        if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
            return None
        if basic_path is None:
            basic_path = os.path.dirname(os.path.abspath(__file__))
        output_path = os.path.join(basic_path, output_file)
        VTool.makeDirs(files=[output_path])

        data = cur.execute(
            "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' "
            % (stock_id, start_date, end_date))
        data = cur.fetchall()
        if len(data) == 0:
            return None

        res = []
        for d in data:
            res.append([
                int(d[0]),
                int(d[1]),
                str(d[2]),
                float(d[3]),
                float(d[4]),
                float(d[5]),
                float(d[6]),
                float(d[7]),
                float(d[8]),
                float(d[9]),
                float(d[10])
            ])
        new_data = []
        for d in zip(*res):
            new_data.append(d)
        origin_data = {
            'id': new_data[0],
            'stock_id': new_data[1],
            'date': new_data[2],
            'opening': new_data[3],
            'closing': new_data[4],
            'difference': new_data[5],
            'percentage_difference': new_data[6],
            'lowest': new_data[7],
            'highest': new_data[8],
            'volume': new_data[9],
            'amount': new_data[10]
        }

        # Load the raw data, keeping only the columns that will be used
        total_data = DataFrame(origin_data)
        total_data.sort_values(by=['stock_id', 'date'], inplace=True)
        # Group by stock code
        g_stock_num = total_data.groupby(by=["stock_id"])
        total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                    g_stock_num.shift(1)["closing"] - 1)
        for i in total_data.index:
            total_data.loc[i, 'rate'] = str(
                np.round(float(total_data['rate'][i]), 2))
        # Reorder the columns in preparation for building the input/output form
        columns = [
            "stock_id", "date", "opening", "closing", "difference",
            "percentage_difference", "lowest", "highest", "volume", "amount",
            "rate"
        ]
        total_data = total_data[columns]

        def func_train_data(data_one_stock_num):
            if not cls.groupby_skip:
                # skip the extra first call that groupby.apply can make on the first group
                cls.groupby_skip = True
                return None
            print("Processing stock code: %06s" % data_one_stock_num.name)
            data = {
                "stock_id": [],
                "date": [],
                "opening": [],
                "closing": [],
                "difference": [],
                "percentage_difference": [],
                "lowest": [],
                "highest": [],
                "volume": [],
                "amount": [],
                "rate": []
            }
            for i in range(len(data_one_stock_num.index) - 1):
                for k in data:
                    data[k].append(data_one_stock_num.iloc[i][k])
            pd.DataFrame(data).to_csv(output_path,
                                      index=False,
                                      columns=columns)

        total_data1 = total_data.dropna()
        total_data2 = total_data1.drop(
            total_data1[(total_data1.rate == 'nan')].index)
        g_stock_num = total_data2.groupby(by=["stock_id"])
        # Reset the skip flag; func_train_data recreates the output file with the column names
        cls.groupby_skip = False
        g_stock_num.apply(func_train_data)
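
# Standalone sketch (made-up prices) of the per-stock daily "rate" computed above:
# 100 * (closing_t / closing_{t-1} - 1), grouped by stock_id via groupby().shift().
import pandas as pd

df = pd.DataFrame({
    "stock_id": [1, 1, 1, 2, 2],
    "closing": [10.0, 11.0, 9.9, 100.0, 95.0],
})
g = df.groupby(by=["stock_id"])
df["rate"] = 100 * (g.shift(0)["closing"] / g.shift(1)["closing"] - 1)
# The first row of each stock has no previous close, so its rate is NaN and would be
# dropped by the dropna() step above.
print(df)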