Example #1
0
    def test_dataframe(self, orient, numpy):
        """Round-trip a small DataFrame through ujson and compare to the source.

        The expected frame is patched first, because some orients drop the
        index and/or column labels during encoding.
        """
        if numpy and orient == "records":
            pytest.skip("Not idiomatic pandas")

        df = DataFrame([[1, 2, 3], [4, 5, 6]],
                       index=["a", "b"],
                       columns=["x", "y", "z"])

        encode_kwargs = dict(orient=orient) if orient is not None else {}
        decode_kwargs = dict(numpy=numpy) if numpy is not None else {}

        decoded = ujson.decode(ujson.encode(df, **encode_kwargs),
                               **decode_kwargs)

        # Rebuild a DataFrame from the decoded payload.
        if orient == "split":
            output = DataFrame(**_clean_dict(decoded))
        else:
            output = DataFrame(decoded)

        # Align labels on the expected frame for orients that lose them.
        if orient == "values":
            df.columns = [0, 1, 2]
            df.index = [0, 1]
        elif orient == "records":
            df.index = [0, 1]
        elif orient == "index":
            df = df.transpose()

        tm.assert_frame_equal(output, df, check_dtype=False)
Example #2
0
    def testDataFrame(self):
        """Round-trip a DataFrame through ujson for every supported orient.

        Each orient drops different metadata, so the decoded frame is patched
        (index reattached, or transposed) before comparing to the source.
        """
        df = DataFrame([[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"])

        # column indexed (default orient keeps both axes' labels)
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        # "split" round-trips index/columns/data as separate entries.
        outp = DataFrame(**ujson.decode(ujson.encode(df, orient="split")))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        # "records" loses the index; restore it before comparing.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        # "values" loses both axes' labels; compare raw values only.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        # "index" produces the transposed layout.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
Example #3
0
def svd_agg(m_rna, mi_rna, targets_matrix, c=1):
    """Aggregate each mRNA's expression with its targeting miRNAs via SVD.

    For every mRNA present in both the expression data and the target matrix,
    the targeting miRNAs are summarized by the first row of Vh from an SVD,
    and that summary is folded into the mRNA column via a second SVD.

    NOTE(review): uses the long-removed pandas ``.ix`` indexer (legacy
    pandas).  Parameter ``c`` is unused in this function.
    """
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append('/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg')
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    # Normalize each column to [0, 1]; miRNA values are also inverted (1 - x).
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0)
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0)
    # NOTE(review): aggregate_data aliases mRNA_data (no copy) — it is
    # mutated in place inside the loop below.
    aggregate_data = mRNA_data
    # Genes present both in the expression data and in the target matrix.
    common_mRNAs =  Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))
    #
    for mRNA in common_mRNAs:
        # Wrap the single label so .ix receives an Index.
        mRNA = Index([mRNA])
        # miRNAs flagged (== 1) as targeting this mRNA.
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]]==1, mRNA].index
        #
        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs].T
        # Need at least two targeting miRNAs to summarize; otherwise skip.
        if len(selected_miRNA.index) > 1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.columns
        else:
            continue
        # Combine the mRNA profile with the miRNA summary via a second SVD.
        new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:, mRNA[0]], first_comp ]))[2]).ix[0, :]
        new_rep.index = aggregate_data.index
        aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
Example #4
0
def svd_agg_train(m_rna, mi_rna, targets_matrix, hide_columns=Index([])):
    """Train-time variant of ``svd_agg``: aggregate each mRNA's expression
    with its targeting miRNAs on the non-hidden samples only.

    Parameters
    ----------
    m_rna, mi_rna : DataFrame of expression values (samples x genes).
    targets_matrix : DataFrame where 1 marks "miRNA (row) targets mRNA (col)".
    hide_columns : Index of sample labels to exclude from training.

    NOTE(review): relies on the long-removed pandas ``.ix`` indexer and on
    Index difference via ``-`` (legacy pandas).
    """
    # Samples to use: everything except the hidden ones.
    sample_indexes = m_rna.index - hide_columns
    # Normalize each column to [0, 1]; miRNA values are also inverted (1 - x).
    mRNA_data = m_rna.apply(lambda x: 1.0*x/max(x), axis=0).ix[sample_indexes, :]
    miRNA_data = mi_rna.apply(lambda x: 1-1.0*x/max(x), axis=0).ix[sample_indexes, :]
    # NOTE(review): aggregate_data aliases mRNA_data (no copy) — it is
    # mutated in place inside the loop below.
    aggregate_data = mRNA_data
    # Genes present both in the expression data and in the target matrix.
    common_mRNAs = Index(set(mRNA_data.columns) & set(targets_matrix.columns))
    common_miRNAs = Index(set(miRNA_data.columns) & set(targets_matrix.index))

    for mRNA in common_mRNAs:
        # Wrap the single label so .ix receives an Index.
        mRNA = Index([mRNA])
        # miRNAs flagged (== 1) as targeting this mRNA.
        targetting_miRNAs = targets_matrix.ix[targets_matrix[mRNA[0]]==1, mRNA].index

        selected_miRNA = miRNA_data.ix[:, targetting_miRNAs]
        # Need at least two targeting miRNAs to summarize with an SVD.
        if len(selected_miRNA.columns) > 1:
            first_comp = DataFrame(np.linalg.svd(selected_miRNA)[2]).ix[0, :]
            first_comp.index = selected_miRNA.index
        else:
            # BUG FIX: previously there was no else-branch, so `first_comp`
            # was either unbound (NameError on the first such mRNA) or stale
            # from an earlier iteration. Skip, mirroring svd_agg.
            continue
        # Combine the mRNA profile with the miRNA summary via a second SVD.
        new_rep = DataFrame(np.linalg.svd(DataFrame([aggregate_data.ix[:, mRNA[0]], first_comp]).transpose())[2]).ix[0, :]
        new_rep.index = aggregate_data.index
        aggregate_data.ix[:, mRNA[0]] = new_rep
    return aggregate_data
Example #5
0
    def testDataFrame(self):
        """Round-trip a DataFrame through ujson for every supported orient,
        patching the decoded output where an orient drops axis metadata."""
        df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z'])

        # column indexed (default orient keeps both axes' labels)
        outp = DataFrame(ujson.decode(ujson.encode(df)))
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        # "split" round-trips index/columns/data; _clean_dict normalizes keys.
        dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split")))
        outp = DataFrame(**dec)
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)
        assert_array_equal(df.index, outp.index)

        # "records" loses the index; restore it before comparing.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="records")))
        outp.index = df.index
        self.assertTrue((df == outp).values.all())
        assert_array_equal(df.columns, outp.columns)

        # "values" loses both axes' labels; compare raw values only.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="values")))
        outp.index = df.index
        self.assertTrue((df.values == outp.values).all())

        # "index" produces the transposed layout.
        outp = DataFrame(ujson.decode(ujson.encode(df, orient="index")))
        self.assertTrue((df.transpose() == outp).values.all())
        assert_array_equal(df.transpose().columns, outp.columns)
        assert_array_equal(df.transpose().index, outp.index)
Example #6
0
def set2df(sets, column_names, index=None, sort=True):
    """Build a DataFrame from an iterable of rows, optionally sorted by all
    columns.

    After sorting, the original index labels (or a fresh 0..n-1 range when no
    index was given) are re-applied, so only the row order changes.
    """
    frame = DataFrame(list(sets), columns=column_names, index=index)
    if not sort:
        return frame
    # Legacy pandas API: DataFrame.sort (pre-0.20) sorts by the given columns.
    frame = frame.sort(column_names)
    frame.index = index if index else range(len(frame))
    return frame
Example #7
0
def test_sort_datetimelike():
    # GH10505
    """groupby(sort=...) on a datetime-valued Categorical key: when the
    categorical is ordered, groups come out in category order regardless of
    the ``sort`` flag; when unordered, ``sort=False`` preserves appearance
    order."""

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                           datetime(2011, 2, 1), datetime(2011, 5, 1),
                           datetime(2011, 2, 1), datetime(2011, 1, 1),
                           datetime(2011, 5, 1)],
                    'foo': [10, 8, 5, 6, 4, 1, 7],
                    'bar': [10, 20, 30, 40, 50, 60, 70]},
                   columns=['dt', 'foo', 'bar'])

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    # Expected groups in chronological (category) order.
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

    # Expected groups in first-appearance order.
    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt', ordered=True)

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(
        result_sort, df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df['dt'] = Categorical(df['dt'], ordered=False)
    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 7, 1)]
    result_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt')

    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
             datetime(2011, 5, 1), datetime(2011, 1, 1)]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index,
                                           name='dt')

    col = 'dt'
    assert_frame_equal(
        result_sort, df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(
        result_nosort, df.groupby(col, sort=False, observed=False).first())
Example #8
0
    def test_grouper_index_types(self):
        """Grouping by an external key list must work regardless of the
        frame's index type (related GH5375: float-like index misbehaved)."""
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
        keys = list('abcde')
        index_makers = [tm.makeFloatIndex, tm.makeStringIndex,
                        tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex,
                        tm.makePeriodIndex]
        for make_index in index_makers:
            df.index = make_index(len(df))
            df.groupby(keys).apply(lambda x: x)

            # Also exercise a reversed (unsorted) index of the same type.
            df.index = list(reversed(df.index.tolist()))
            df.groupby(keys).apply(lambda x: x)
Example #9
0
    def deserialize(self, item, force_bytes_to_unicode=False):
        """Reconstruct a DataFrame from a serialized numpy record array.

        Parameters
        ----------
        item : numpy structured array whose ``dtype.metadata`` carries the
            index field names ('index'), the column labels ('columns') and an
            optional 'multi_column' entry for MultiIndex columns.
        force_bytes_to_unicode : bool
            When True, decode ``bytes`` values, index labels and column labels
            to unicode (py2 -> py3 migration aid, see arctic issue #598).
        """
        index = self._index_from_records(item)
        # Data columns are every record field that is not part of the index.
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            # Empty payload: build an empty frame, preserving column structure.
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed due to 'str' type in py2 when read back in py3 is 'bytes' which breaks the workflow
            # of people migrating to py3. # https://github.com/manahl/arctic/issues/598
            # This should not be used for a normal flow, and you should instead of writing unicode strings
            # if you want to work with str in py3.,

            for c in df.select_dtypes(object):
                # The conversion is not using astype similar to the index as pandas has a bug where it tries to convert
                # the data columns to a unicode string, and the object in this case would be bytes, eg. b'abc'
                # which is converted to u"b'abc'" i.e it includes the b character as well! This generally happens
                # when there is a str conversion without specifying the encoding. eg. str(b'abc') -> "b'abc'" and the
                # fix for this is to tell it the encoding to use: i.e str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            # BUG FIX: this previously assigned df.index.astype('unicode') to
            # df.columns, clobbering the column labels with index values.
            if type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df
Example #10
0
    def test_basic(self, sparse, dtype):
        """get_dummies on a plain list, a Series, and a labelled Series must
        all yield the same one-hot frame (sparse or dense per the flag)."""
        s_list = list('abc')
        s_series = Series(s_list)
        s_series_index = Series(s_list, list('ABC'))

        expected = DataFrame({'a': [1, 0, 0],
                              'b': [0, 1, 0],
                              'c': [0, 0, 1]},
                             dtype=self.effective_dtype(dtype))

        # Plain list input.
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        if sparse:
            tm.assert_sp_frame_equal(result,
                                     expected.to_sparse(kind='integer',
                                                        fill_value=0))
        else:
            assert_frame_equal(result, expected)

        # Series input.
        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        if sparse:
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)

        # Series with a custom index: the dummies inherit that index.
        expected.index = list('ABC')
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        if sparse:
            # BUG FIX: the conversion result was previously discarded (a bare
            # expression statement); assign it like the branch above.
            expected = expected.to_sparse(kind='integer', fill_value=0)
        assert_frame_equal(result, expected)
Example #11
0
    def test_merge_datetime_index(self, box):
        """Self-merge keyed on a vector derived from a DatetimeIndex
        (optionally wrapped by `box`) must name the key columns correctly
        (see gh-19038)."""
        df = DataFrame([1, 2, 3],
                       ["2016-01-01", "2017-01-01", "2018-01-01"],
                       columns=["a"])
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        # Optionally wrap the join vector (parametrized by `box`).
        if box is not None:
            on_vector = box(on_vector)

        # The derived key shows up as "key_1" next to the shared column "a".
        expected = DataFrame(
            OrderedDict([
                ("a", [1, 2, 3]),
                ("key_1", [2016, 2017, 2018]),
            ])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        # Merging only on the derived key: duplicated column gets suffixed.
        expected = DataFrame(
            OrderedDict([
                ("key_0", [2016, 2017, 2018]),
                ("a_x", [1, 2, 3]),
                ("a_y", [1, 2, 3]),
            ])
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)
Example #12
0
    def test_wls_panel(self):
        """Weighted least squares on a Panel must match WLS on the stacked
        (long-format) frames: same betas, residuals and fitted values.

        NOTE(review): relies on long-removed pandas APIs (Panel, ``.ix``,
        MultiIndex.get_tuple_index) from a legacy version.
        """
        y = tm.makeTimeDataFrame()
        x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()})

        # Inject NaNs so the weighted fit has holes to handle.
        y.ix[[1, 7], "A"] = np.nan
        y.ix[[6, 15], "B"] = np.nan
        y.ix[[3, 20], "C"] = np.nan
        y.ix[[5, 11], "D"] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack()) for k, v in x.iteritems()))

        weights = x.std("items")
        stack_weights = weights.stack()

        # Flatten the MultiIndex so both formulations share plain indexes.
        stack_y.index = stack_y.index.get_tuple_index()
        stack_x.index = stack_x.index.get_tuple_index()
        stack_weights.index = stack_weights.index.get_tuple_index()

        result = ols(y=y, x=x, weights=1 / weights)
        expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ["resid", "y_fitted"]:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #13
0
    def predict(self, tree):
        """
        Naive-Bayes-style prediction for one tree: multiply the a-priori
        label frequency by each rule's conditional frequency (smoothed when a
        rule is unseen in a label's group) and return the row of the most
        probable label.

        TODO Should take an array and predict every item. A score can be stored.
        It would follow the guidelines set by scikit-learn.

        NOTE(review): uses legacy pandas APIs (.ix, DataFrame.append, df.sort).
        """
        tree_rules = self.extract_rules(tree)
        df = DataFrame(columns=['label', 'prob'])
        gb = self.posteriori.groupby('label')

        # Compute one posterior probability per label.
        for key, indexes in gb.groups.items():
            apriori_prob = self.apriori[self.apriori.label == key]['freq'].values[0]
            prob = apriori_prob

            group_df, missing_prob = self.apply_smoothing(self.posteriori.ix[indexes], tree_rules)

            # Multiply in the evidence for every rule, falling back to the
            # smoothed probability when the rule is absent from the group.
            for rule in tree_rules:
                prob_evidence = group_df[group_df.rule == rule]['freq']
                if len(prob_evidence) == 0:
                    prob_evidence = missing_prob
                else:
                    prob_evidence = prob_evidence.values[0]
                prob *= prob_evidence

            post = DataFrame({'label':[key], 'prob':[prob]})
            df = df.append(post)

        # Re-number rows, sort by probability, return the best row.
        df.index = np.arange(df.index.size)
        df = df.sort(columns='prob', ascending=False)
        return df.ix[df['prob'].idxmax()]
Example #14
0
    def test_wls_panel(self):
        """Weighted least squares on a Panel must match WLS on the stacked
        (long-format) frames: same betas, residuals and fitted values.

        NOTE(review): relies on long-removed pandas APIs (Panel, ``.ix``,
        MultiIndex.get_tuple_index) from a legacy version.
        """
        y = tm.makeTimeDataFrame()
        x = Panel({'x1' : tm.makeTimeDataFrame(),
                   'x2' : tm.makeTimeDataFrame()})

        # Inject NaNs so the weighted fit has holes to handle.
        y.ix[[1, 7], 'A'] = np.nan
        y.ix[[6, 15], 'B'] = np.nan
        y.ix[[3, 20], 'C'] = np.nan
        y.ix[[5, 11], 'D'] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack())
                                  for k, v in x.iteritems()))

        weights = x.std('items')
        stack_weights = weights.stack()

        # Flatten the MultiIndex so both formulations share plain indexes.
        stack_y.index = stack_y.index.get_tuple_index()
        stack_x.index = stack_x.index.get_tuple_index()
        stack_weights.index = stack_weights.index.get_tuple_index()

        result = ols(y=y, x=x, weights=1/weights)
        expected = ols(y=stack_y, x=stack_x, weights=1/stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ['resid', 'y_fitted']:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #15
0
def make_plot():
    """Build a Bokeh figure of six months of Quandl WIKI prices for the
    ticker submitted in the request form.

    NOTE(review): uses the removed pandas ``.ix`` indexer and performs live
    network I/O; failures from the Quandl API are not handled.
    """
    types = request.form.getlist("type")

    ticker = request.form["ticker"]
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=180)).strftime("%Y-%m-%d")  # six - month timeframe

    URL = (
        "https://www.quandl.com/api/v3/datasets/WIKI/"
        + ticker
        + ".json?start_date="
        + start_date
        + "&end_date="
        + end_date
        + "&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o"
    )
    r = requests.get(URL)
    df_handle = DataFrame(r.json())

    # 'data' holds the rows, 'column_names' the header of the dataset.
    df = DataFrame(df_handle.ix["data", "dataset"], columns=df_handle.ix["column_names", "dataset"])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(["date"])
    df.index = to_datetime(df.index)

    p = figure(x_axis_type="datetime")

    # One line per requested price series.
    if "open" in types:
        p.line(df.index, df["open"], color="blue", legend="opening price")
    if "high" in types:
        p.line(df.index, df["high"], color="red", legend="highest price")
    if "close" in types:
        p.line(df.index, df["close"], color="green", legend="closing price")
    return p
Example #16
0
 def data_frame(self):
     """Return the processed knockouts as a DataFrame sorted by solution
     size, with a fresh 0..n-1 integer index."""
     if self._processed_knockouts is None:
         self._process_knockouts()
     frame = DataFrame(self._processed_knockouts)
     frame.sort_values("size", inplace=True)
     frame.index = [i for i in range(len(frame))]
     return frame
def twitter_count(keys,d,strdb):
	"""Count per-cdpid documents in the Mongo collection `strdb` for day d[-1].

	Aggregates documents whose timestamp (epoch ms) falls within that day and
	returns a DataFrame indexed by cdpid with one count column named `strdb`.

	NOTE(review): Python 2 code (print statement); 'mentions' documents are
	$unwind-ed on cdpid before grouping, plain tweets are not.
	"""
	#Mongo
	connection = pymongo.MongoClient(keys['db']['host'])
	dbtm = connection[keys['db']['name']]
	db = dbtm[strdb]
	#MongoDB Query - Mentions
	#The Day Of: window is [start of d[-1], start of the next day) in epoch ms
	upper_bound_start_ts = float(calendar.timegm(d[-1].utctimetuple())*1000); 
	upper_bound_end = d[-1] + timedelta(days=1); 
	upper_bound_end_ts = float(calendar.timegm(upper_bound_end.utctimetuple())*1000)
	#upper_bound_end_ts = float(calendar.timegm(d[-1].utctimetuple())*1000); upper_bound_start = d[-1] - timedelta(days=1); upper_bound_start_ts = float(calendar.timegm(upper_bound_start.utctimetuple())*1000)
	# #Retrieve Tweeets that are not authored by the user itself. 
	if strdb in 'mentions':
		tr = 	db.aggregate([
								{'$match': {'timestamp':{'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
								{'$unwind':'$cdpid'},
								{'$group':{'_id':'$cdpid',strdb:{'$sum':1}}}])
	#Tweets collection does not need unwind unlike mentions collection. 
	else:
		tr = 	db.aggregate([
							{'$match': {'timestamp':{'$gt': upper_bound_start_ts, '$lt': upper_bound_end_ts}}},
							{'$group':{'_id':'$cdpid',strdb:{'$sum':1}}}])
	# Index the result frame by cdpid and sort for stable output.
	tr = DataFrame(tr['result']); 
	tr.index = tr._id;  tr=tr.drop('_id',axis=1); tr = tr.sort_index();
	#mts['Date'] = Period(d[-2],'D')
	print '%s for ' %(strdb), d[-1], ' processed'
	return(tr)
Example #18
0
def plotting():
    """Render a Bokeh line plot of the last month's stock prices from Quandl.

    Reads the ticker and the checked series ('open'/'high'/'close') from the
    submitted form, fetches ~30 days of WIKI data and returns the figure.

    NOTE(review): uses the removed pandas ``.ix`` indexer and performs live
    network I/O; failures from the Quandl API are not handled.
    """
    # get list of the checked features
    features = request.form.getlist('feature')
    # user's input
    ticker = request.form['ticker']

    # 30-day window ending today
    now = datetime.now()
    # BUG FIX: end_date was never defined, so building the URL raised a
    # NameError at runtime; derive it from the current date like start_date.
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')

    # fetch the dataset
    URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=WVEFZw8uyJzuvHE3VsQW'
    r = requests.get(URL)

    # pass to pandas dataframe
    raw_data = DataFrame(r.json())
    # 'data' holds the rows and 'column_names' the header (legacy .ix access)
    df = DataFrame(raw_data.ix['data','dataset'] , columns = raw_data.ix['column_names','dataset'])
    # set the column names with lower case
    df.columns = [x.lower() for x in df.columns]
    # set the index to the date column and convert it to datetime
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create the plot: one line per requested series
    p = figure(x_axis_type = "datetime")
    if 'open' in features:
        p.line(df.index, df['open'], color='blue', legend='opening price')
    if 'high' in features:
        p.line(df.index, df['high'], color='red', legend='highest price')
    if 'close' in features:
        p.line(df.index, df['close'], color='green', legend='closing price')
    return p
def fetchQuotes(sym, start=FROM_DATE, end=CURRENT_DATE):
    """Fetch historical daily prices plus current stats for one symbol.

    Returns (his, daily): `his` is a DataFrame of historical quotes indexed
    by date (None when the dates are invalid / data unavailable); `daily` is
    the dict returned by ystockquote.get_all.

    NOTE(review): Python 2 code (print statement) and the deprecated pandas
    convert_objects API.
    """
    his = None
    data = None
    try:
        # print start, end
        data = ystockquote.get_historical_prices(sym, start, end)
    except Exception:
        print "Please check the dates. Data might not be available. 404 returned"

        # 404 due to data yet not available
    if data:
        # Sort rows by date, then transpose so dates become the index.
        his = DataFrame(collections.OrderedDict(sorted(data.items()))).T
        his = his.convert_objects(convert_numeric=True)
        his.index = pd.to_datetime(his.index)
        his.insert(0, 'symbol', sym, allow_duplicates=True)
        # insert the date as dataframe too
        his.insert(1, 'date', his.index)
        # his.columns = getColumns('stock_quote_historical')   # Removing as db dependency is removed
        his.columns = getColumnsNoSql('stock_quote_historical')

    daily = ystockquote.get_all(sym)
    # print daily
    # persist(his, daily, sym, end)

    return his, daily
Example #20
0
def make_plot():
    """Build a Bokeh plot of the last month's prices from Quandl WIKI data.

    Reads the checked features and the ticker from the submitted form,
    fetches ~30 days of data and returns the figure.

    NOTE(review): uses the removed pandas ``.ix`` indexer and performs live
    network I/O; tab indentation normalized to spaces.
    """
    # get list of the checked features
    features = request.form.getlist('feature')

    # capture the ticker input from the user
    ticker = request.form['ticker']

    # calculate one month time period from now
    now = datetime.now()
    # BUG FIX: end_date was commented out but still used in the URL below,
    # which raised a NameError at runtime; define it as today's date.
    end_date = now.strftime('%Y-%m-%d')
    start_date = (now - timedelta(days=30)).strftime('%Y-%m-%d')

    # fetch the appropriate dataset via API
    URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
    # URL = 'https://www.quandl.com/api/v3/datasets/WIKI/'+ticker+'.json?start_date=2015-08-01&end_date=2015-09-01&order=asc&api_key=eFoXAcyvLhyuB3Rsvg6o'
    r = requests.get(URL)

    # convert into a pandas dataframe; 'data' holds rows, 'column_names'
    # the header (legacy .ix label access)
    request_df = DataFrame(r.json())
    df = DataFrame(request_df.ix['data','dataset'], columns = request_df.ix['column_names','dataset'])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create a Bokeh plot from the dataframe: one line per requested series
    p = figure(x_axis_type = "datetime")
    if 'open' in features:
        p.line(df.index, df['open'], color='blue', legend='opening price')
    if 'high' in features:
        p.line(df.index, df['high'], color='red', legend='highest price')
    if 'close' in features:
        p.line(df.index, df['close'], color='green', legend='closing price')
    return p
 def get_regression_table(self):
     """Return a per-feature table of betas sorted by absolute normalized
     effect.

     beta_normalized = beta * std_X puts coefficients on a comparable scale;
     'effect' is its magnitude and drives the descending sort.

     NOTE(review): sort_index(by=...) is a legacy pandas API (now
     sort_values).
     """
     regression_table = DataFrame({"beta": self.coef, "std_X": self.std_X})
     regression_table.index = self.features
     regression_table['beta_normalized'] = regression_table.beta * regression_table.std_X
     regression_table['effect'] = np.fabs(regression_table['beta_normalized'])
     regression_table = regression_table.sort_index(by='effect', ascending=False)
     return regression_table
Example #22
0
def output():
    """Plot last-month stock prices for the user-selected ticker via Quandl.

    Reads the checked options and the ticker symbol from the submitted form,
    fetches ~30 days of WIKI data and returns a Bokeh figure.

    NOTE(review): the original mixed tabs and spaces for indentation (a
    TabError on Python 3); normalized to spaces without changing the logic.
    Uses the removed pandas ``.ix`` indexer and performs live network I/O.
    """
    # getting user set options from the index2.html page
    options = request.form.getlist('feature')
    stock = request.form['stock']
    stock = stock.upper()

    # requesting data from Quandl: a 30-day window ending today
    nw = datetime.now()
    start_date = (nw - timedelta(days=30)).strftime('%Y-%m-%d')
    end_date = nw.strftime('%Y-%m-%d')
    req_url = 'https://www.quandl.com/api/v3/datasets/WIKI/'+stock+'.json?start_date='+start_date+'&end_date='+end_date+'&order=asc&api_key=3bkydVzcH_PPsy5zzAPn'
    r = requests.get(req_url)

    # pandas in action: 'data' holds rows, 'column_names' the header
    request_df = DataFrame(r.json())
    df = DataFrame(request_df.ix['data','dataset'], columns = request_df.ix['column_names','dataset'])
    df.columns = [x.lower() for x in df.columns]
    df = df.set_index(['date'])
    df.index = to_datetime(df.index)

    # create plot - PLAY AROUND WITH THIS TO MAKE IT GENUINE
    #output_file("output.html", title="Stock prices changes for last month")
    p = figure(x_axis_type = "datetime")
    if 'open' in options:
        p.line(df.index, df['open'], color='black', legend='Opening price')
    if 'high' in options:
        p.line(df.index, df['high'], color='red', legend='Highest price')
    if 'close' in options:
        p.line(df.index, df['close'], color='blue', legend='Closing price')
    return p
Example #23
0
    def test_wls_panel(self):
        """Weighted least squares on a Panel must match WLS on the stacked
        (long-format) frames: same betas, residuals and fitted values.

        NOTE(review): relies on long-removed pandas APIs (Panel, ``.ix``,
        MultiIndex._tuple_index) from a legacy version; ols itself emits a
        deprecation FutureWarning.
        """
        y = tm.makeTimeDataFrame()
        x = Panel({"x1": tm.makeTimeDataFrame(), "x2": tm.makeTimeDataFrame()})

        # Inject NaNs so the weighted fit has holes to handle.
        y.ix[[1, 7], "A"] = np.nan
        y.ix[[6, 15], "B"] = np.nan
        y.ix[[3, 20], "C"] = np.nan
        y.ix[[5, 11], "D"] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack()) for k, v in compat.iteritems(x)))

        weights = x.std("items")
        stack_weights = weights.stack()

        # Flatten the MultiIndex so both formulations share plain indexes.
        stack_y.index = stack_y.index._tuple_index
        stack_x.index = stack_x.index._tuple_index
        stack_weights.index = stack_weights.index._tuple_index

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, weights=1 / weights)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ["resid", "y_fitted"]:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #24
0
 def bdib(self, ticker, fld_list, startDateTime, endDateTime, eventType='TRADE', interval = 1):
     """
     Get one ticker (Only one ticker available per call); eventType (TRADE, BID, ASK,..etc); interval (in minutes)
             ; fld_list (Only [open, high, low, close, volumne, numEvents] availalbe)
     return pandas dataframe with return Data

     NOTE(review): Python 2 code (print statement); blocks until a RESPONSE
     event arrives from the Bloomberg session.
     """
     # Create and fill the request for the historical data
     request = self.refDataService.createRequest("IntradayBarRequest")
     request.set("security", ticker)
     request.set("eventType", eventType)
     request.set("interval", interval)  # bar interval in minutes        
     request.set("startDateTime", startDateTime)
     request.set("endDateTime", endDateTime)
     
     print "Sending Request:", request
     # Send the request
     self.session.sendRequest(request)
     # defaultdict - later convert to pandas; layout: field -> {bar time -> value}
     data = defaultdict(dict)
     # Process received events
     while(True):
         # We provide timeout to give the chance for Ctrl+C handling:
         ev = self.session.nextEvent(500)
         for msg in ev:
             barTickData = msg.getElement('barData').getElement('barTickData')
             # Element 0 of each bar is its timestamp, used as the row key.
             for i in range(barTickData.numValues()) :
                 for j in range(len(fld_list)) :
                     data[(fld_list[j])][barTickData.getValue(i).getElement(0).getValue()] = barTickData.getValue(i).getElement(fld_list[j]).getValue()
     
         if ev.eventType() == blpapi.Event.RESPONSE:
             # Response completly received, so we could exit
             break
     data = DataFrame(data)
     data.index = pd.to_datetime(data.index)
     return data
Example #25
0
    def test_unstack_fill_frame(self):
        """unstack(fill_value=...) fills the holes created by missing
        (index, column) combinations, keeping the dtype when compatible and
        upcasting when the fill value requires it."""

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        # Missing combinations become -1; int32 dtype is preserved.
        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value:
        # the int frame is upcast to float to hold 0.5.
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
    def __init__(self, outcomes, texts, parameters_display, verbose=False):
        """Fit the bag-of-words ridge pipeline (via the parent class) and
        cache the summary statistics used by the display layer.

        Parameters
        ----------
        outcomes : sequence of outcome values (y).
        texts : sequence of raw documents.
        parameters_display : stored as-is for later display.
        verbose : bool, forwarded to share_correct.
        """
        options = {"lowercase": True, "lemmatize": True, "remove-stopwords": True}
        super(DisplayTextModel, self).__init__(outcomes, texts, 'bag-of-words', options)

        data = DataFrame({"y": outcomes, "text": texts})

        # Storing whether the outcome is a dummy:
        if set(data.y) == set([0, 1]):
            self.is_dummy_outcome = True

        N = data.shape[0]
        self.number_of_observations = N

        # Use string row labels "0".."N-1".
        data.index = [str(x) for x in range(N)]
        data['y_hat'] = self.pipe.predict(texts)

        ridge = self.pipe.named_steps['ridge_model']

        self.std_X = ridge.std_X
        self.parameters_display = parameters_display
        self.mean_outcome_in_groups = mean_outcome_in_groups(data.y, data.y_hat)
        self.percent_correct = share_correct(data.y, data.y_hat, verbose=verbose)
        self.outcome_summary = get_summary(outcomes)

        self.coef = ridge.coef_
        self.number_of_features = len(self.coef)
        # Feature names look like "featurizer__token"; keep the token part.
        features = self.pipe.named_steps['featurizer'].get_feature_names()
        self.features = [f.split("__")[1] for f in features]
Example #27
0
    def test_nunique(self):
        """groupby(...).nunique(): distinct counts per group, with and
        without counting missing values."""
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        # as_index=False keeps the grouping column as ordinary data.
        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        tm.assert_frame_equal(
            df.groupby('A', as_index=False).nunique(), expected)

        # Default as_index=True moves the group keys into the index.
        expected.index = list('abc')
        expected.index.name = 'A'
        tm.assert_frame_equal(df.groupby('A').nunique(), expected)

        # dropna=False counts an injected NaN as its own distinct value,
        # matching the counts for the original 'x' entries.
        with_na = df.replace({'x': None})
        tm.assert_frame_equal(
            with_na.groupby('A').nunique(dropna=False), expected)

        # Default dropna=True ignores the injected missing values.
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        tm.assert_frame_equal(with_na.groupby('A').nunique(), expected)
Example #28
0
    def test_wls_panel(self):
        """Weighted least squares on a Panel must match WLS on the stacked
        (long-format) frames: same betas, residuals and fitted values.

        NOTE(review): relies on long-removed pandas APIs (Panel,
        MultiIndex._tuple_index) from a legacy version; ols itself emits a
        deprecation FutureWarning.
        """
        y = tm.makeTimeDataFrame()
        x = Panel({'x1': tm.makeTimeDataFrame(),
                   'x2': tm.makeTimeDataFrame()})

        # Inject NaNs so the weighted fit has holes to handle.
        y.iloc[[1, 7], y.columns.get_loc('A')] = np.nan
        y.iloc[[6, 15], y.columns.get_loc('B')] = np.nan
        y.iloc[[3, 20], y.columns.get_loc('C')] = np.nan
        y.iloc[[5, 11], y.columns.get_loc('D')] = np.nan

        stack_y = y.stack()
        stack_x = DataFrame(dict((k, v.stack())
                                 for k, v in x.iteritems()))

        weights = x.std('items')
        stack_weights = weights.stack()

        # Flatten the MultiIndex so both formulations share plain indexes.
        stack_y.index = stack_y.index._tuple_index
        stack_x.index = stack_x.index._tuple_index
        stack_weights.index = stack_weights.index._tuple_index

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, weights=1 / weights)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            expected = ols(y=stack_y, x=stack_x, weights=1 / stack_weights)

        assert_almost_equal(result.beta, expected.beta)

        for attr in ['resid', 'y_fitted']:
            rvals = getattr(result, attr).stack().values
            evals = getattr(expected, attr).values
            assert_almost_equal(rvals, evals)
Example #29
0
def test_fenci():
    """Load token-frequency pickles, chart the top keywords per class, export to Excel.

    For classes 0..8, reads ``Data/ftags_{i}.pkl``, builds a keyword/count
    DataFrame sorted by count, saves a horizontal bar chart of the 30 most
    frequent keywords to ``Data/{class}.png``, and finally writes every
    DataFrame to its own sheet of ``Data/keys.xlsx``.

    Relies on module-level ``classifies`` (class names) and ``plt``.
    """
    dfs = []

    for i in range(0, 9):
        # Context manager guarantees the handle is closed even if
        # pickle.load raises (the original used the removed file() builtin
        # and leaked the handle on error).
        with open('Data/ftags_{}.pkl'.format(i), 'rb') as f:
            fdist = pickle.load(f)
        # fdist.plot(50)

        df = DataFrame(list(fdist.items()), columns=['关键词', '计数'])
        # sort_values(by=...) replaces the long-removed sort_index(by=...).
        df = df.sort_values(by='计数', ascending=False)
        df.index = range(len(df))

        # Top 30 keywords, reversed so the largest bar renders at the top.
        df_plt = df[:30]
        df_plt = df_plt[::-1]
        print(df_plt.head())
        # plot(x=...) expects a column label, not a Series.
        df_plt.plot(kind='barh', x='关键词', title=classifies[i])

        # plt.show()

        # Python 3 str is already Unicode; no .decode() needed for the path.
        file_path = 'Data/{}.png'.format(classifies[i])
        plt.savefig(file_path, dpi=100)

        dfs.append((classifies[i], df))

    print('end')

    with pd.ExcelWriter('Data/keys.xlsx') as writer:
        for key, frame in dfs:
            print(key)
            frame.to_excel(writer, sheet_name=key, index=False)
Example #30
0
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)