Example #1
0
	def scrab_one_user(self, uid, num):
		## Log in
		weiboLogin = WeiboLogin.WeiboLogin(self.username, self.pwd, self.header)
		weiboLogin.Login()

		## Start fetching pages
		WBmsg = GetWeiboPage.getWeiboPage()
		WBmsg.body['uid'] = uid

		## Build the storage structure for the Weibo data
		wb_detail = []
		wb_all = {}
		wb_all = wb_all.fromkeys(wb_detail, [])
		wb_frame = DataFrame(wb_all, index=[])
		for n in range(1, num):
			# build the page URL
			url = 'http://weibo.com/' + uid + '?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=' + str(n)

			# print(WBmsg.get_firstpage(url, n))
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_firstpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_secondpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)
			all_weibo = PagePlyr.parse_page(PagePlyr.get_json_content(WBmsg.get_thirdpage(url, n)))
			wb_frame = wb_frame.append(all_weibo, ignore_index=True)

			print(n)
		return wb_frame
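DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the page loop above is usually rewritten by collecting the per-page frames and concatenating once. A minimal sketch, assuming PagePlyr.parse_page returns a DataFrame and the WBmsg page getters behave as in the example (the helper name is illustrative):

import pandas as pd

def collect_user_pages(WBmsg, PagePlyr, uid, num):
    # Hedged sketch: gather each parsed page, then build the frame with one concat call.
    pages = []
    for n in range(1, num):
        url = ('http://weibo.com/' + uid +
               '?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=' + str(n))
        for getter in (WBmsg.get_firstpage, WBmsg.get_secondpage, WBmsg.get_thirdpage):
            pages.append(PagePlyr.parse_page(PagePlyr.get_json_content(getter(url, n))))
    return pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()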
Example #2
0
    def get_endpoint_timeframe(self):
        result_df = DataFrame()
        res_len = self._make_req(self.chunk_start, self.chunk_end)
        self._wait_for_rate_limit()
        # if the result count is close to max_results there may be more data than one request returned;
        # keep halving the timeframe until the count drops below ~90% of max_results or the window is one day
        if res_len / self.max_results > .90:
            delta = self.chunk_end - self.chunk_start
            step_size = math.floor(delta.days / 2)
            self.chunk_end = self.chunk_start + timedelta(days=step_size)
            # if the halved window is still at least a day wide, make another request
            if self.chunk_start != self.chunk_end:
                self.get_endpoint_timeframe()
            # if the window cannot shrink further, advance by one day and save the data
            else:
                self.chunk_start = self.chunk_end + timedelta(days=1)
                self.chunk_end = self.chunk_end + timedelta(days=1)
                self.get_endpoint_timeframe()
                # parse & append results to dataframe
                df = self._parse_json()
                result_df = result_df.append(df)

        # pick up where we left off from chunking
        elif self.chunk_end != self.end_dt:
            self.chunk_start = self.chunk_end
            self.chunk_end = self.end_dt
            self.get_endpoint_timeframe()
            # parse & append results to dataframe
            df = self._parse_json()
            result_df = result_df.append(df)
        return result_df
Example #3
0
def getIndexChangeRate(startDate, endDate):
    df_result = DataFrame()
    df = ts.get_hist_data('sh', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'sh'
    df_result = df_result.append(df)

    df = ts.get_hist_data('sz', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'sz'
    df_result = df_result.append(df)

    df = ts.get_hist_data('zxb', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'zxb'
    df_result = df_result.append(df)

    df = ts.get_hist_data('cyb', start=startDate, end=endDate).reset_index()
    df['gap'] = df['high'] - df['low']
    df['gap_rate'] = df['gap'] / df['close'] * 100
    df['mkt'] = 'cyb'
    df_result = df_result.append(df)

    fileName = r'D:\stock\index_changeRate_' + startDate + '_' + endDate + '.csv'
    df_result = df_result.loc[:, ['date', 'mkt', 'close', 'volume', 'price_change', 'p_change', 'gap', 'gap_rate']]
    df_result = df_result.sort_values(by='date', ascending=False)
    df_result.to_csv(fileName, index=False)
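On pandas 2.0+, where DataFrame.append no longer exists, the four per-market blocks above are typically folded into a loop that concatenates once. A hedged sketch reusing the same (legacy) tushare call from the example:

import pandas as pd
import tushare as ts

def get_index_change_rate(startDate, endDate):
    # Hedged sketch: one frame per market code, combined with a single concat.
    frames = []
    for mkt in ['sh', 'sz', 'zxb', 'cyb']:
        df = ts.get_hist_data(mkt, start=startDate, end=endDate).reset_index()
        df['gap'] = df['high'] - df['low']
        df['gap_rate'] = df['gap'] / df['close'] * 100
        df['mkt'] = mkt
        frames.append(df)
    result = pd.concat(frames, ignore_index=True)
    cols = ['date', 'mkt', 'close', 'volume', 'price_change', 'p_change', 'gap', 'gap_rate']
    return result.loc[:, cols].sort_values(by='date', ascending=False)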
Example #4
0
class Append(object):

    goal_time = 0.2

    def setup(self):
        self.df1 = DataFrame(np.random.randn(10000, 4),
                             columns=['A', 'B', 'C', 'D'])
        self.df2 = self.df1.copy()
        self.df2.index = np.arange(10000, 20000)
        self.mdf1 = self.df1.copy()
        self.mdf1['obj1'] = 'bar'
        self.mdf1['obj2'] = 'bar'
        self.mdf1['int1'] = 5
        try:
            with warnings.catch_warnings(record=True):
                self.mdf1.consolidate(inplace=True)
        except:
            pass
        self.mdf2 = self.mdf1.copy()
        self.mdf2.index = self.df2.index

    def time_append_homogenous(self):
        self.df1.append(self.df2)

    def time_append_mixed(self):
        self.mdf1.append(self.mdf2)
Example #5
0
    def test_append_empty_dataframe(self):

        # Empty df append empty df
        df1 = DataFrame([])
        df2 = DataFrame([])
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-empty df append empty df
        df1 = DataFrame(np.random.randn(5, 2))
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Empty df with columns append empty df
        df1 = DataFrame(columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)

        # Non-Empty df with columns append empty df
        df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
        df2 = DataFrame()
        result = df1.append(df2)
        expected = df1.copy()
        assert_frame_equal(result, expected)
Example #6
0
class Record(object):
    def __init__(self):
        self.trade_history = DataFrame()
        self.position_history = DataFrame()
        self.portfolio_value_history = DataFrame()

    def update_trade(self, date, trade_type, symbol, amount, price):
        newtrade = DataFrame(
            {"Date": [date], "Trade_type": [trade_type], "Symbol": [symbol], "Amount": [amount], "Price": [price]}
        )
        self.trade_history = self.trade_history.append(newtrade, ignore_index=True)

    def update_position(self, date, p):
        newposition = DataFrame(
            {
                "Date": [date],
                "Symbol": [p.symbol],
                "Amount": [p.amount],
                "Avg_price": [p.avg_price],
                "Position_value": [p.position_value],
            }
        )
        self.position_history = self.position_history.append(newposition, ignore_index=True)

    def update_portfolio_value(self, date, port, pos, cash):
        newport = DataFrame({"Date": [date], "Portfolio_value": [port], "Position_value": [pos], "Cash": [cash]})
        self.portfolio_value_history = self.portfolio_value_history.append(newport, ignore_index=True)
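Each update above appends a one-row frame, which copies the full history on every call; with DataFrame.append removed in pandas 2.0, a common alternative is to buffer rows as plain dicts and build the frame on demand. A minimal sketch (class and attribute names are illustrative, shown for the trade history only):

import pandas as pd

class RecordBuffered:
    # Hedged sketch: store trades as dicts and materialize a DataFrame only when needed.
    def __init__(self):
        self._trades = []

    def update_trade(self, date, trade_type, symbol, amount, price):
        self._trades.append({"Date": date, "Trade_type": trade_type,
                             "Symbol": symbol, "Amount": amount, "Price": price})

    @property
    def trade_history(self):
        return pd.DataFrame(self._trades)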
Example #7
0
    def handle_community(self, community, **options):
        #from mpl_toolkits.mplot3d import Axes3D
        canvas = pyplot.figure().gca(projection='3d')

        clothing_vectors = numpy.array([
            cast_elements_to_floats(individual["vectors"]) for individual in community.kernel.individual_set.all()
        ])
        centroids, labels = kmeans2(clothing_vectors, 10, minit="points")

        clothing_frame = DataFrame()
        clothing_by_cluster = sorted(zip(labels, clothing_vectors), key=itemgetter(0))
        current_label = None
        for label, vector in clothing_by_cluster:
            if label != current_label:
                current_label = label
                clothing_frame = clothing_frame.append(Series(data=centroids[current_label]), ignore_index=True)
            clothing_frame = clothing_frame.append(Series(data=vector), ignore_index=True)

        #centroids_frame = DataFrame(centroids)
        #centroids_frame.T.plot()
        #centroids_frame.drop(range(20, 4096), axis=1, inplace=True)
        #print(centroids_frame.head())

        self.plot_data(canvas, clothing_frame, 'b')
        pyplot.show()
Example #8
0
def get_sex_type():
    file_name = 'data/info_train.csv'
    y = pd.read_csv(file_name,header=None,index_col=0)
    male_id = y[y[1]<7].index
    m = DataFrame([0]*male_id.size,index=male_id,columns=['sex'])
    female_id = y[y[1]>6].index
    f = DataFrame([1]*female_id.size,index=female_id,columns=['sex'])
    m.append(f).to_csv('data/train_sex.csv')
Example #9
0
    def get_topwords(self, countries, thresh=10, tf_idf=False):
        tw = DataFrame()
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    tw = tw.append(self.tf_idf.loc[r, :])
                else:
                    tw = tw.append(self.df.loc[r, :])

        return tw.mean().order(ascending=False)[:thresh]
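Series.order was removed long ago (sort_values is its replacement), and the row-by-row append can be replaced by a single boolean mask. A hedged method sketch, assuming self.tf_idf has one row per row of self.df:

import pandas as pd

def get_topwords(self, countries, thresh=10, tf_idf=False):
    # Hedged sketch: select the matching rows with one mask instead of appending row by row.
    mask = self.df['country_id'].isin(countries)
    source = self.tf_idf if tf_idf else self.df
    tw = source[mask.values]                       # assumes tf_idf rows align with df rows
    return tw.mean(numeric_only=True).sort_values(ascending=False)[:thresh]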
Example #10
0
    def test_append(self):
        begin_index = self.frame.index[:5]
        end_index = self.frame.index[5:]

        begin_frame = self.frame.reindex(begin_index)
        end_frame = self.frame.reindex(end_index)

        appended = begin_frame.append(end_frame)
        assert_almost_equal(appended['A'], self.frame['A'])

        del end_frame['A']
        partial_appended = begin_frame.append(end_frame)
        self.assertIn('A', partial_appended)

        partial_appended = end_frame.append(begin_frame)
        self.assertIn('A', partial_appended)

        # mixed type handling
        appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
        assert_frame_equal(appended, self.mixed_frame)

        # what to test here
        mixed_appended = self.mixed_frame[:5].append(self.frame[5:])
        mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:])

        # all equal except 'foo' column
        assert_frame_equal(
            mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
            mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))

        # append empty
        empty = DataFrame({})

        appended = self.frame.append(empty)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        appended = empty.append(self.frame)
        assert_frame_equal(self.frame, appended)
        self.assertIsNot(appended, self.frame)

        # overlap
        self.assertRaises(ValueError, self.frame.append, self.frame,
                          verify_integrity=True)

        # new columns
        # GH 6129
        df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
        row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
                             'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
        result = df.append(row)
        assert_frame_equal(result, expected)
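In current pandas, the row-as-Series append at the end of this test (the GH 6129 case) is written with concat; a minimal sketch of the same idea:

import pandas as pd

df = pd.DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
row = pd.Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
# Hedged equivalent of df.append(row): turn the named Series into a one-row frame first.
result = pd.concat([df, row.to_frame().T])
print(result)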
Example #11
0
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print(len(db))

    print(db[0].keys())
    print(db[0]['nutrients'][0])

    nutrients = DataFrame(db[0]['nutrients'])
    print(nutrients[:7])

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print(info[:5])

    print(pd.value_counts(info.group)[:10])

    print("Now process the nutrients for every record")
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "なんか重複多い"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "infoとnutrients両方にdescriptionとgroupがあるから変えよう"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print(ndata.loc[30000])

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print(max_foods.loc['Amino Acids']['food'])
Example #12
0
def concatenate_years_data():
    '''This function combines all of the dataframes
    for each year into one dataframe consisting
    of the births data from all of the years.'''
    years = np.array(range(1880,2011))  # These are all of the years for which we have data
    current_directory=os.getcwd()
    if current_directory!='C:\\Users\\Jormak\\PycharmProjects\\PANDAS_Book\\pydata-book\\ch02\\names':
        os.chdir('C:\\Users\\Jormak\\PycharmProjects\\PANDAS_Book\\pydata-book\\ch02\\names')
    all_years = []
    for year in years:
        one_year = pd.read_csv('yob'+str(year)+'.txt', names=['name','sex','births'])  # note that read_csv can read .txt files too
        all_years.append(one_year)  # collect the per-year frames in a list; DataFrame.append would discard them
    names = pd.concat(all_years, ignore_index=True)
    return names
Example #13
0
    def test_append_missing_cols(self):
        # GH22252
        # exercise the conditional branch in append method where the data
        # to be appended is a list and does not contain all columns that are in
        # the target DataFrame
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [{'foo': 9}, {'bar': 10}]
        with tm.assert_produces_warning(None):
            result = df.append(dicts, ignore_index=True, sort=True)

        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
Example #14
0
def _extract_data(file_name, filters, fields=None, summary=None,
                  classname='Table', mode='walk', hash=''):
    '''
    Not meant for direct use.  This is broken out of :func:`extract_data` so we
    can wrap the code in a caching decorator to speed up loading of data from
    disk.  The hash is created by :func:`extract_data` to ensure that the cache
    is cleared if the last modified time changes.  Note that if you move the
    file to a different folder, this does not clear the cache.
    '''
    log.info('... No cached copy of data found, reloading data')
    with tables.openFile(file_name, 'r') as h:
        data = DataFrame()
        if mode == 'walk':
            iterator = walk_nodes(h.root, filters, classname)
        elif mode == 'pattern':
            iterator = p_iter_nodes(h.root, filters)
        else:
            raise ValueError('Unsupported mode {}'.format(mode))

        for node in iterator:
            log.info('... Found node %s', node._v_pathname)
            if type(node) == tables.Table:
                frame = extract_node_data(node, fields, summary)
                data = data.append(frame, ignore_index=True)
            else:
                raise NotImplementedError
    return data
Example #15
0
    def test_append_list_of_series_dicts(self):
        df = DataFrame(np.random.randn(5, 4),
                       columns=['foo', 'bar', 'baz', 'qux'])

        dicts = [x.to_dict() for idx, x in df.iterrows()]

        result = df.append(dicts, ignore_index=True)
        expected = df.append(df, ignore_index=True)
        assert_frame_equal(result, expected)

        # different columns
        dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
                 {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
        result = df.append(dicts, ignore_index=True, sort=True)
        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
        assert_frame_equal(result, expected)
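The list-of-dicts case above maps directly onto building a frame from the dicts and concatenating; a short runnable sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(2, 4), columns=['foo', 'bar', 'baz', 'qux'])
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
         {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
# Hedged equivalent of df.append(dicts, ignore_index=True, sort=True):
result = pd.concat([df, pd.DataFrame(dicts)], ignore_index=True, sort=True)
print(result)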
Example #16
0
def read_data(features, feat_path='out'):
    frame = DataFrame()

    for data_path, data_ids in data_paths_and_ids:
        frame = frame.append(frame_for_id(features, feat_path, data_ids, data_path))

    return frame
Example #17
0
def frame_for_id(features, feat_path='out', data_ids=sts.sts12.train_ids, data_dir='STS2012-train'):
    frame = DataFrame()

    for data_id in data_ids:
        data = {}

        for feat_id in features:
            data_id_dir = data_id[9:] if data_id.startswith("surprise.") else data_id
            feat_fn = os.path.join(feat_path, data_dir, data_id_dir, "%s.txt" % feat_id)

            data[feat_id] = series_from_feat(feat_fn)

        new_frame = DataFrame(data)
        new_frame['data_id'] = data_id

        gs_fn = os.path.join(repos_dir, 'data', data_dir, "STS.gs.%s.txt" % data_id)

        if os.path.exists(gs_fn):
            new_frame['gs'] = Series(loadtxt(gs_fn))
        else:
            new_frame['gs'] = None

        frame = frame.append(new_frame)

    frame['data_set'] = data_dir

    return frame
Example #18
0
class matchbox:
    def __init__(self, articlepaths):
        self.num_exports = 0
        self.num_articles_total = len(articlepaths)
        self.num_articles_matched = 0
        self.num_matches = 0
        self.dataframe = DataFrame()
        self.init_time = time.strftime("%Y-%m-%d_%H-%M-%S_")

    def update(self, matches):
        self.dataframe = self.dataframe.append(matches, ignore_index=True)
        self.num_articles_matched += 1
        self.num_matches += len(matches)
        print('Matched {} places in article {} of {} ({:.2%} complete). '
              'Total: {}.'.format(len(matches),
                                          self.num_articles_matched,
                                          self.num_articles_total,
                                          self.num_articles_matched / self.num_articles_total,
                                          self.num_matches))

    def empty_into_csv(self):
        self.num_exports += 1
        outname = outdir + self.init_time + 'pubs_aegypti_' + str(self.num_exports) + '.csv'
        self.dataframe.to_csv(outname, encoding='utf-8')
        print('Wrote matches from chunk {} to {}.'.format(self.num_exports, outname))
        del self.dataframe
        self.dataframe = DataFrame()
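Appending on every update() call copies the accumulated frame each time. A hedged variant buffers matches in a list and builds the DataFrame only when writing the CSV (the names are illustrative, and matches is assumed to be an iterable of dicts as in the example):

import pandas as pd

class MatchBuffer:
    # Hedged sketch: O(1) work per update, one DataFrame construction per export.
    def __init__(self):
        self._rows = []

    def update(self, matches):
        self._rows.extend(matches)          # matches: iterable of per-place dicts

    def empty_into_csv(self, outname):
        pd.DataFrame(self._rows).to_csv(outname, encoding='utf-8')
        self._rows = []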
Example #19
0
    def test_append_length0_frame(self):
        df = DataFrame(columns=['A', 'B', 'C'])
        df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
        df5 = df.append(df3)

        expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
        assert_frame_equal(df5, expected)
Example #20
0
def main():

    logger = get_root_logger()
    get_header(logger, 'LOADING PROJECTIONS')

    client = APIClient()

    # grab dataframe shape from a trial run
    data = client.get_data('weekly-projections', 'json', 'QB')
    test_df = json_normalize(data['Projections'])

    # get DF structure from columns in test_df
    cols = test_df.columns
    df = DataFrame(columns=cols)

    # grab current week
    current_week = test_df.week.values[0]

    # loop through all weeks up to current week
    for wk in [str(x) for x in range(int(current_week))]:
        logger.info('Processing projections for week {0}'.format(int(wk) + 1))
        # loop through all positions
        for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']:
            tmp_data = client.get_data('weekly-projections', 'json', pos, wk)
            tmp_df = json_normalize(tmp_data['Projections'])
            df = df.append(tmp_df)

    # import this df directly to PG DB
    conn = DBClient()
    conn.load(df, 'projections', schema='raw', if_exists='replace')
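The nested week/position loop above grows df with repeated appends; on pandas 2.0+ the usual pattern is to collect the normalized frames and concatenate once. A sketch assuming the same client.get_data call and the pandas json_normalize used in the example:

import pandas as pd
from pandas import json_normalize

def collect_projections(client, current_week):
    # Hedged sketch: one json_normalize frame per (week, position), combined at the end.
    frames = []
    for wk in [str(x) for x in range(int(current_week))]:
        for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']:
            data = client.get_data('weekly-projections', 'json', pos, wk)
            frames.append(json_normalize(data['Projections']))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()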
Example #21
0
    def test_append_dtype_coerce(self):

        # GH 4993
        # appending with datetime will incorrectly convert datetime64
        import datetime as dt
        from pandas import NaT

        df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                            dt.datetime(2013, 1, 2, 0, 0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
                                             dt.datetime(2013, 1, 3, 6, 10)],
                                            [dt.datetime(2013, 1, 4, 0, 0),
                                             dt.datetime(2013, 1, 4, 7, 10)]],
                        columns=['start_time', 'end_time'])

        expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10),
                                   dt.datetime(2013, 1, 4, 7, 10)],
                                  name='end_time'),
                           Series([dt.datetime(2013, 1, 1, 0, 0),
                                   dt.datetime(2013, 1, 2, 0, 0),
                                   dt.datetime(2013, 1, 3, 0, 0),
                                   dt.datetime(2013, 1, 4, 0, 0)],
                                  name='start_time')], axis=1)
        result = df1.append(df2, ignore_index=True)
        assert_frame_equal(result, expected)
Example #22
0
def getFeatures(filename):
    csvfile = pd.read_csv(filename)  # Read the .csv file containing tweets.
    tweet_ids = csvfile["id_str"]  # Copy the 'id_str' column values.
    length = len(tweet_ids)  # Number of tweet ids.

    df = DataFrame(d, index=[0])  # Create a DataFrame from the feature dict d (defined elsewhere)

    twitter = Twython(APP_KEY, APP_SECRET, oauth_version=2)
    ACCESS_TOKEN = twitter.obtain_access_token()
    twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
    # Generating Access Token

    for i in range(0, length):
        status = twitter.show_status(id=tweet_ids[i])
        d["id"] = status["id_str"].encode("utf-8")
        d["created_at"] = status["created_at"].encode("utf-8")
        d["from_user"] = status["user"]["screen_name"].encode("utf-8")
        d["followers_count"] = status["user"]["followers_count"]
        d["friends_count"] = status["user"]["friends_count"]
        d["statuses_count"] = status["user"]["statuses_count"]
        d["verified"] = status["user"]["verified"]
        d["location"] = 0 if (len(status["user"]["location"].encode("utf-8")) == 0) else 1
        d["text"] = status["text"].encode("utf-8")
        d["retweet_count"] = status["retweet_count"]
        d["favorite_count"] = status["favorite_count"]
        d["hashtag_count"] = len(status["entities"]["hashtags"])
        d["url_count"] = len(status["entities"]["urls"])
        d["mentions_count"] = len(status["entities"]["user_mentions"])
        if len(status["entities"]["urls"]) > 0:
            for x in range(0, len(status["entities"]["urls"])):
                d["links"] += status["entities"]["urls"][x]["expanded_url"].encode("utf-8") + "  "
        df = df.append(d, ignore_index=True)
        df.to_csv("NSamples.csv")  # Saving file to disk
        d["links"] = ""
    print "\nAll Done!"
Example #23
0
def prepare_modeling_results(model, all_data, expnoList, path, exptitle):
	results_table = DataFrame(columns=['h','X','X model','S', 'S model', 'P', 'P model', 'expno', 'XnOBS', 'XnPRED', 'SnOBS', 'SnPRED', 'PnOBS', 'PnPRED'])
	for dataset1,expno1 in zip(all_data,expnoList):
		results_table1 = model.simulation(dataset1, expno=expno1)
		# calculate normalized values
		# XnOBS, XnPRED = feature_scaling(results_table1['X'].values,results_table1['X model'].values)
		# SnOBS, SnPRED = feature_scaling(results_table1['S'].values,results_table1['S model'].values)
		# PnOBS, PnPRED = feature_scaling(results_table1['P'].values,results_table1['P model'].values)
		# standardization
		XnOBS, XnPRED = zero_mean_variance(results_table1['X'].values,results_table1['X model'].values)
		SnOBS, SnPRED = zero_mean_variance(results_table1['S'].values,results_table1['S model'].values)
		PnOBS, PnPRED = zero_mean_variance(results_table1['P'].values,results_table1['P model'].values)

		
		# and add them to the table as new columns
		results_table1['XnOBS']  = XnOBS
		results_table1['XnPRED'] = XnPRED
		results_table1['SnOBS']  = SnOBS
		results_table1['SnPRED'] = SnPRED
		results_table1['PnOBS']  = PnOBS
		results_table1['PnPRED'] = PnPRED
		# now add the current experiment to the big table of all experiments
		results_table = results_table.append(results_table1)
	results_table.to_html("{0}results_table_{1}.html".format(path,exptitle))
	return results_table
Example #24
0
def train_data_construct(bins, train_set, iteration, realtime = False):
    train_bins = defaultdict(tuple)

    print('start to construct the train data bins')
    if realtime:
        idx = 0
        for bin in bins:
            if len(bin) > 0:
                feature_bin = DataFrame()
                lable_bin = Series()
                for uid in bin:
                    tmp = train_set[train_set['product_uid'] == int(uid)]
                    if not tmp.empty:
                        feature_bin = feature_bin.append(tmp)
                        # should drop the relevance data here
                        lable_bin = lable_bin.append(tmp['relevance'])
                train_bins[idx] = (feature_bin,lable_bin)
                print(len(train_bins[idx][0]), 'entries in bin', idx)
                # if idx == 0:
                #     feature_bin.to_csv('feature_bin.csv')
                idx += 1
        f1 = open('../data/train_bins'+str(iteration)+'.pkl','wb')
        pk.dump(train_bins,f1)
    else:
        f1 = open('../data/train_bins'+str(iteration)+'.pkl','rb')
        train_bins=pk.load(f1)
    print('finish constructing training bins')

    return train_bins
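Series.append was removed alongside DataFrame.append, so both accumulations in the bin loop above can be expressed with pd.concat. A minimal sketch of the per-bin step, assuming train_set has the product_uid and relevance columns used in the example:

import pandas as pd

def build_bin(train_set, bin_uids):
    # Hedged sketch: one selection per uid, combined with a single concat for features and labels.
    parts = [train_set[train_set['product_uid'] == int(uid)] for uid in bin_uids]
    parts = [p for p in parts if not p.empty]
    if not parts:
        return pd.DataFrame(), pd.Series(dtype=float)
    feature_bin = pd.concat(parts)
    label_bin = feature_bin['relevance']
    return feature_bin, label_bin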
Example #25
0
def test(dataset, overshoot_threshold):
	from numpy import where, zeros
	from sklearn.neighbors.kde import KernelDensity
	folder = make_issue_specific_figure_folder('108 cluster after removing outliers', dataset)
	fit, par, gen, ids = IO.load_pickled_generation_dataframe(dataset)
	o = where(fit.overshoot > overshoot_threshold)[0]
	#not_o = where(fit.overshoot <= overshoot_threshold)[0]
	par = par.drop(o)
	fit = fit.drop(o)
	g1 = list(par.groupby('ssmm_nAgents').groups.keys())
	g2 = list(par.groupby('ssmm_latency_mu').groups.keys())
	#stdev_mean = zeros((len(g1), len(g2)))
	data = DataFrame(columns=['ssmm_nAgents', 'ssmm_latency_mu', 'stdev_mean'])
	for a, ssmm_nAgents in enumerate(g1):
		print(ssmm_nAgents)
		for l, ssmm_latency_mu in enumerate(g2):
			row = dict()
			try:
				row['stdev_mean'] = fit[(par['ssmm_latency_mu'] == ssmm_latency_mu) & (par['ssmm_nAgents'] == ssmm_nAgents)]['stdev'].mean()
				row['ssmm_nAgents'] = ssmm_nAgents
				row['ssmm_latency_mu'] = ssmm_latency_mu
				#print row
				data = data.append(row, ignore_index = True)
			except TypeError:
				print "ARGHS"

	X, Y = np.meshgrid(g1, g2)  # g1 and g2 are already lists of group keys
	xy = np.vstack([Y.ravel(), X.ravel()]).T
	return data
Example #26
0
 def OnRtnTrade(self, Trade):
     """成交回报"""
     # print('OnRtnTrade:', Trade)
     print('OnRtnTrade:\n', Utils.code_transform(Trade))
     PyCTP_Trader_API.dfOnRtnTrade = DataFrame.append(PyCTP_Trader_API.dfOnRtnTrade,
                                                      other=Utils.code_transform(Trade),
                                                      ignore_index=True)
Example #27
0
def handleBi5(infile, fileDataFrame):

    if os.path.getsize(infile) == 0:
        return fileDataFrame

    array = infile.split('/')
    print(array)
    alen = len(array)

    dateWithoutHour = int(datetime(int(array[alen-4]),int(array[alen-3]),int(array[alen-2])).strftime("%s"))
    dateWithoutMilisec = (dateWithoutHour+int(array[alen-1].split('_')[0].split('h')[0])*3600)*1000
    subprocess.call("xz -dkc --suffix=bi5 " + infile + ">tmp.bin", shell=True)


    hdfDir = "./hdf/" + infile.split('/')[2]
    if not os.path.exists(hdfDir):
        os.makedirs(hdfDir)
    cvsFileName = hdfDir + "/" + infile.split('/')[3]

    if fileDataFrame.empty:
        if os.path.exists(cvsFileName):
            fileDataFrame =	read_csv(cvsFileName, index_col=0)
        else:
            fileDataFrame = DataFrame()

    fileDataFrame = fileDataFrame.append(processBinFile("tmp.bin", dateWithoutMilisec))

    print(fileDataFrame.iloc[0])
    return fileDataFrame
Example #28
0
def convertToPutJson(csv_file):
    df = cleanColumns(read_csv(csv_file))
    putColumns = ["method", "recordId", "body"]
    putDf = DataFrame(columns = putColumns)

    for recordId in df.index:
        print "Converting data for recordId {recordId}...".format(recordId = recordId)
        body = {}
        
        for col in df.columns:
            body[str(col).strip()] = [str(df[col][recordId]).strip()]
        
        putDfRow = DataFrame([["PUT", str(recordId), body]], columns = putColumns)
        putDf = putDf.append(putDfRow)
    
    json_file = sub("csv|txt", "json", csv_file)
    putDf.to_json(json_file, orient="records")

    with open(json_file, 'r') as target:
        putData = target.read()

    target = open(json_file, 'w')
    putData = putData.replace("},{", "}\n\n{")[1:-1]
    target.write(putData)
    target.close()

    print "Successfully created put data!"
    return json_file
Example #29
0
    def test_append_concat(self):
        rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
        ts = Series(np.random.randn(len(rng)), rng)
        df = DataFrame(np.random.randn(len(rng), 4), index=rng)

        result = ts.append(ts)
        result_df = df.append(df)
        ex_index = DatetimeIndex(np.tile(rng.values, 2))
        tm.assert_index_equal(result.index, ex_index)
        tm.assert_index_equal(result_df.index, ex_index)

        appended = rng.append(rng)
        tm.assert_index_equal(appended, ex_index)

        appended = rng.append([rng, rng])
        ex_index = DatetimeIndex(np.tile(rng.values, 3))
        tm.assert_index_equal(appended, ex_index)

        # different index names
        rng1 = rng.copy()
        rng2 = rng.copy()
        rng1.name = 'foo'
        rng2.name = 'bar'
        assert rng1.append(rng1).name == 'foo'
        assert rng1.append(rng2).name is None
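Of the three append flavours exercised here, only Index.append survives in pandas 2.x; Series.append and DataFrame.append map to pd.concat. A short sketch of the same setup:

import numpy as np
import pandas as pd

rng = pd.date_range('5/8/2012 1:45', periods=10, freq='5min')
ts = pd.Series(np.random.randn(len(rng)), rng)
df = pd.DataFrame(np.random.randn(len(rng), 4), index=rng)

result = pd.concat([ts, ts])        # replaces ts.append(ts)
result_df = pd.concat([df, df])     # replaces df.append(df)
appended = rng.append(rng)          # DatetimeIndex.append is still available in pandas 2.x
print(len(result), len(result_df), len(appended))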
Example #30
0
def parse_page(html):
	# parse the HTML
	soup = BeautifulSoup(html,"lxml")
	# extract the post text
	text=soup.find_all(attrs={"node-type":"feed_list_content","class":"WB_text W_f14"})
	# extract the repost counts
	forward=soup.find_all(attrs={"node-type":"forward_btn_text"})
	# extract the comment counts
	comment=soup.find_all(attrs={"node-type":"comment_btn_text"})
	# extract the dates
	date=soup.find_all(attrs={"node-type":"feed_list_item_date"})
	# extract the source platforms
	source=soup.find_all(attrs={"action-type":"app_source"})
	# extract the like counts (title="赞" means "Like")
	like=soup.select('li a[title="赞"]')
	# drop irrelevant entries (filtering into new lists avoids skipping items while removing during iteration)
	date = [each for each in date if each.has_attr('suda-data')]
	source = [each for each in source if each.has_attr('suda-uatrack')]
	like = [each for each in like if each.has_attr('suda-uatrack')]
	# build the data dictionary
	wb_de=[]
	wb_al={}
	wb_al=wb_al.fromkeys(wb_de,[])
	wb_fr=DataFrame(wb_al,index=[])
	for i in range(len(text)):
		all_weibo={"text":text_list(text)[i],"date":text_list(date)[i],"source":text_list(source)[i],"forward":text_list(forward)[i],"comment":text_list(comment)[i],"like":text_list(like)[i]}
		wb_fr=wb_fr.append(all_weibo,ignore_index=True)
	    
	return wb_fr
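The per-post dict rows built in the final loop can be collected in a list and turned into a frame in one step; a hedged sketch assuming the same text_list helper from this module:

import pandas as pd

def build_post_frame(text, date, source, forward, comment, like):
    # Hedged sketch: one dict per post, materialized with a single DataFrame call.
    # Assumes the text_list helper defined alongside parse_page.
    rows = [{"text": text_list(text)[i], "date": text_list(date)[i],
             "source": text_list(source)[i], "forward": text_list(forward)[i],
             "comment": text_list(comment)[i], "like": text_list(like)[i]}
            for i in range(len(text))]
    return pd.DataFrame(rows)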