Code Example #1
File: MysqlGetUrl.py Project: robihidayat/Python
# imports needed by this snippet
import re
import urllib2
from pandas import DataFrame
from sqlalchemy import create_engine

def get_link(url):
	link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
	links = []
	
	# open web content
	f = urllib2.urlopen(url)
	content = f.read()
	
	# BeautifulSoup version: find all urls and save to links
	# soup = BeautifulSoup(content, "lxml")
	# for a in soup.find_all('a', href=True):
	# 	if "detik.com" in a['href']:
	# 		if "http:" not in a['href']:
	# 			a['href'] = "http:" + a['href']
	# 		print "Found the URL:", a['href']
	# 		links.append(a['href'])
			
	# regex version: find all urls and save to links
	for link in link_exr.findall(content):
		if "detik.com" in link[0]:
			link_detik = link[0]
			if "http:" not in link_detik:
				link_detik = "http:" + link_detik
			links.append(link_detik)
	
	# save to DataFrame
	df = DataFrame(links, columns=['detik url'])
	df = df.drop_duplicates()  # drop_duplicates returns a new frame

	print df.head()

	# create and save to a MySQL database
	detik_db = create_engine("mysql://*****:*****@localhost/data_detik")
	df.to_sql('url_detik', detik_db, if_exists='replace')
Code Example #2
File: views.py Project: uddhavpgautam/mining
    def post(self):
        post = json.loads(self.request.body)

        MyClient = riak.RiakClient(protocol=RIAK_PROTOCOL,
                                   http_port=RIAK_HTTP_PORT,
                                   host=RIAK_HOST)

        MyAdminBucket = MyClient.bucket(ADMIN_BUCKET_NAME)

        connection = None
        for c in MyAdminBucket.get('connection').data:
            if c['slug'] == post.get('connection', None):
                connection = c['connection']

        sql = """SELECT * FROM ({}) AS CUBE LIMIT 10;""".format(
            post.get('sql', None))

        e = create_engine(connection)
        connection = e.connect()
        try:
            resoverall = connection.execute(text(sql))
        except Exception:
            self.write({'sql': '', 'msg': 'Error!'})
            self.finish()
            return  # stop here on a failed query

        df = DataFrame(resoverall.fetchall())
        if df.empty:
            self.finish()
            return  # nothing to send back for an empty result

        df.columns = resoverall.keys()
        df.head()

        self.write({'sql': df.to_json(orient='records'), 'msg': 'Success!'})
        self.finish()
Code Example #3
File: mining.py Project: uddhavpgautam/mining
def run(cube_slug=None):
    mc = memcache.Client(['127.0.0.1:11211'], debug=0)
    for cube in MyAdminBucket.get('cube').data:
        try:
            slug = cube['slug']

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube['sql'])
            for c in MyAdminBucket.get('connection').data:
                if c['slug'] == cube['connection']:
                    connection = c['connection']

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str('{}-columns'.format(slug)))

            MyBucket.new(slug, data='').store()
            MyBucket.new(u'{}-columns'.format(slug), data='').store()
            MyBucket.new(u'{}-connect'.format(slug), data='').store()
            MyBucket.new(u'{}-sql'.format(slug), data='').store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()

            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print '[warning] Empty cube: {}!!'.format(cube)
                continue  # skip this cube and keep processing the rest
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype='records'))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-columns'.format(slug),
                         data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-connect'.format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-sql'.format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except Exception:
            pass  # ignore failures for this cube and move on to the next

    print "## FINISH"
    return True
Code Example #4
    def test_setitem_chained_setfault(self):

        # GH6026
        # setfaults under numpy 1.7.1 (ok on 1.8)
        data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout']
        mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none']

        df = DataFrame({'response': np.array(data)})
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata}))

        recarray = np.rec.fromarrays([data], names=['response'])
        df = DataFrame(recarray)
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata}))

        df = DataFrame({'response': data, 'response1': data})
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata,
                                             'response1': data}))

        # GH 6056
        expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar']))
        df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
        df['A'].iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)

        df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
        df.A.iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)
Code Example #5
class MetrilyxAnalyticsSerie(MetrilyxSerie, BasicAnalyticsSerie):

    def __init__(self, serie, graphType="line", dataCallback=None):
        super(MetrilyxAnalyticsSerie, self).__init__(serie, dataCallback)

        self.graphType = graphType
        self._istruct = None

        if not self.error:
            self._istruct = self.__getInternalStruct()
            self.__applyTransform()
            

    def __getInternalStruct(self):
        out = []
        for d in self._serie['data']:
            out.append((d['uuid'], Series([d['dps'][k] for k in sorted(d['dps'].keys())],
                index=to_datetime([int(ts) for ts in sorted(d['dps'].keys())], unit='s'))))
        return DataFrame(dict(out))

    def __getSerieMetadata(self, serie):
        return dict([(k,v) for k,v in serie.items() if k != 'dps'])

    def data(self, ts_unit='ms'):
        if self.error: return { 'error': self.error }

        out = []
        for s in self._serie['data']:
            md = self.__getSerieMetadata(s)
            logger.error("HERE %s" % (s['uuid']))
            datapoints = self._getDataSerieDps(self._serie['query']['aggregator'],
                                                self._istruct[s['uuid']], ts_unit)

            error = self._dataHasErrors(datapoints)
            if not error:
                md['dps'] = datapoints
                md['uuid'] = SerieUUID(s).uuid
            else:
                #s = {'error': error}
                logger.warning("Error assembling data: %s" % (str(error)))
            out.append(md)

        return out

    def __applyTransform(self):
        if self._serie['yTransform'] != "":
            try:
                self._istruct = eval("%s" %(self._serie['yTransform']))(self._istruct)
                
                if isinstance(self._istruct, Series):
                    logger.error("Need to handle 'Series'")
                    logger.error(self._serie['alias'])
                    self._istruct = DataFrame({self._serie['alias']: self._istruct})
                    print self._istruct.head()

            except Exception,e:
                logger.warn("Could not apply yTransform: %s" %(str(e)))
Code Example #6
def main():
    loc_char_count = create_data_array()
    # print loc_char_count
    count_sum = loc_char_count.sum(axis=0)
    # print count_sum
    count_sum[count_sum==0] = 1
    # print count_sum

    test = loc_char_count/count_sum
    df = DataFrame(test)
    df.to_csv('./data/char_loc_freq.csv')
    print df.head(4)
    print df.ix[3][2]  # .ix is an indexer, not a callable
Code Example #7
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i=0
    while not retrieved:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT*15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
            pass
        if i > 14:
            print "Giving up"
            return None
        
    frame = DataFrame(posts)
    ##Later, maybe output this frame for further study
    
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes']  = frame['id'].map(fmtLikes)
    postData['Date']   = frame['created_time'].map(fmtDate)
    
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    postData = postData.fillna(value=0)  # fillna returns a new frame unless inplace=True
    return postData
Code Example #8
File: gonzales.py Project: MazenAly/datamining
def gonzales(data , k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:] , index = data[ : , 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 =     points_list.sample(n=1 , random_state = randint(0,100) , axis=0)
    center0 =     points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance' , 'center'] , axis = 1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times; the extra center added in the last cycle is dropped below, leaving k centers
    for k_cycle in range(1,k+1):
        # variables to save the next center, chosen as the point with the maximum distance to its cluster center
        max_distance = 0 
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables used to choose the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0 ,1]) , p.as_matrix(columns=[0 ,1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index   ])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
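
A minimal usage sketch (hypothetical toy data; it assumes the imports the function relies on — numpy as np, pandas DataFrame, math, and scipy.spatial — are already in scope):

# each row of the input array is (id, x, y)
data = np.array([[1, 0.0, 0.0],
                 [2, 1.0, 0.5],
                 [3, 9.0, 9.0],
                 [4, 8.5, 9.5]])
centers = gonzales(data, 2)  # request k=2 centers
print(centers)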
Code Example #9
File: join_merge.py Project: bashtage/pandas
    def setup(self, axis):
        N = 1000
        s = Series(N, index=tm.makeStringIndex(N))
        self.series = [s[i:- i] for i in range(1, 10)] * 50
        self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
        df = DataFrame({'A': range(N)},
                       index=date_range('20130101', periods=N, freq='s'))
        self.empty_left = [DataFrame(), df]
        self.empty_right = [df, DataFrame()]
        self.mixed_ndims = [df, df.head(N // 2)]
Code Example #10
def select_best(clstruct,
        scorenames=['sensitivity', 'mmr', 'aupr', 'cliqueness_3_20',
                    'nonov_iter', 'n_proteins', 'n_complexes_3_20'],
        rfunc=operator.add, use_norm=False, dispn=15, score_factors=None,
        use_ranks=True, output_ranks=False, print_ranks=False,
        require_scores=None):
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm: stats = norm_columns(stats)
        if score_factors: stats = rescale_columns(stats, score_factors)
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        for req_name,thresh in require_scores:
            thresh = (np.median(clstruct.stats[req_name]) if thresh is None
                    else thresh)
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)
    def filt_params(s):
        return " ".join([p[:2]+p.split('=')[1] for p in s.split(',')])
    show_columns = (scorenames if require_scores is None else
            scorenames+ut.i0(require_scores))
    d = DataFrame(clstruct.stats[inds[:dispn]][show_columns],
            index=["#%s: %sc %si %s" %
                (i,len(clusts[i]),len(cxstructs[i].cxppis),
                    filt_params(cxstructs[i].params)) for i in inds[:dispn]])
    print d.head(dispn)
    for i in inds[:dispn]: 
        #print (i, ["%0.4f " % s for s in clstruct.stats[i]], len(clusts[i]), 
                #len(cxstructs[i].cxppis), cxstructs[i].params)
        if print_ranks:
            print i, [nstats-s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
Code Example #11
def download_data(order_set, save_set):
    import numpy as np  # import once rather than on every loop iteration
    for i in range(len(table_name_set)):
        cursor.execute(order_set[i])
        rows = cursor.fetchall()
        #print(len(rows))
        #print(cursor.description)
        rows = np.array(rows).reshape(len(rows), len(rows[0]))
        df = DataFrame(rows, columns=[i[0] for i in cursor.description])
        #print([i[0] for i in cursor.description])
        df.to_csv(save_set[i], index=None, encoding='GB2312')
        print(df.head())
        cnxn.commit()
Code Example #12
File: tests.py Project: ggstuart/compost
class TestPerfectData(unittest.TestCase):
    """what happens with nice data"""

    def setUp(self):
        index = date_range('1/1/2015', periods=365)
        self.df = DataFrame(list(range(len(index))), index=index, columns=['value'])
        self.dataset = Dataset(self.df, 60*60*24, cumulative=False)

    def test_validates(self):
        self.assertTrue(self.dataset.validate())

    def test_partial_validates(self):
        """cut the data up and it still works"""
        d = Dataset(self.df.head(100), 60*60*24, cumulative=False)
        self.assertTrue(d.validate())

    def test_short_raises(self):
        """single value datasets raise an error"""
        d = Dataset(self.df.head(1), 60*60*24, cumulative=False)
        self.assertRaises(ShortDatasetError, d.validate)

    def test_interpolate_skipped(self):
        d2 = self.dataset.interpolate()
        self.assertEqual(self.dataset, d2)
Code Example #13
File: parameter_search.py Project: AntHar/xgboost
def make_submission(path, params, threshold_ratio):

    X_train, w_train, y_train = load_training_data()
    indexes_test, X_test = load_test_data()
    y_out = fit_predict(X_train, w_train, y_train, X_test, params)
    y_pred, rank = get_y_pred_rank(y_out, threshold_ratio)

    submission = DataFrame({'EventId': indexes_test, 'RankOrder': rank, 'Class': y_pred},
        columns=['EventId', 'RankOrder', 'Class'])
    submission['Class'] = submission['Class'].apply(lambda x: 's' if x else 'b')

    submission.to_csv(path, index=False)
    print('--------------------- Submission')
    print(submission.head())
    print(path)
    return submission
Code Example #14
    def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
        if (self._args is not None and len(self._args) > 2) or \
           (len(self._args) != 0 and not isinstance(self._args[0], QueryFunction)):
            raise ValueError('Invalid argument to absolute value function')

        # get the data
        data = input if len(self._args) == 0 else self._args[0].process(start_time, end_time, input)

        ret = None

        # go through each column and take the absolute value of every row
        for col in data.columns:
            abs = data[col].abs()  # get the absolute value for each value in the column
            abs.name = 'abs ' + col  # update the name

            if ret is None:
                ret = DataFrame(abs)
            else:
                ret = ret.combine_first(DataFrame(abs))  # add it to our return value

        print(ret.head())

        return ret
Code Example #15
File: MathFunction.py Project: Metrink/metrink-fe
    def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
Code Example #16
# In[11]:


#Calculating Moving averages for Gold
ma = DataFrame(values['Date'],columns=['Date'])
ma['Date']=pd.to_datetime(ma['Date'],format='%Y-%b-%d')
ma['15SMA'] = (values['Gold']/(values['Gold'].rolling(window=15).mean()))-1
ma['30SMA'] = (values['Gold']/(values['Gold'].rolling(window=30).mean()))-1
ma['60SMA'] = (values['Gold']/(values['Gold'].rolling(window=60).mean()))-1
ma['90SMA'] = (values['Gold']/(values['Gold'].rolling(window=90).mean()))-1
ma['180SMA'] = (values['Gold']/(values['Gold'].rolling(window=180).mean()))-1
ma['90EMA'] = (values['Gold']/(values['Gold'].ewm(span=90,adjust=True,ignore_na=True).mean()))-1
ma['180EMA'] = (values['Gold']/(values['Gold'].ewm(span=180,adjust=True,ignore_na=True).mean()))-1
ma = ma.dropna(axis=0)
print(ma.shape)
ma.head()


# In[12]:


#Merging Moving Average values to the feature space
print(data.shape)
data['Date']=pd.to_datetime(data['Date'],format='%Y-%b-%d')
data = pd.merge(left=data,right=ma,how='left',on='Date')
print(data.shape)
data.isna().sum()


# That was all about features. Now we need to create targets, i.e. what we want to predict. Since we are predicting returns, we need to pick a horizon over which to predict them. I have chosen 14-day and 22-day horizons because shorter horizons tend to be very volatile and lack predictive power. One can, however, experiment with other horizons as well.
# 
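# As a rough sketch of what such targets could look like (a hypothetical cell reusing the 'Date' and 'Gold' columns of the values frame from above; the target column names are illustrative), forward returns over the two horizons can be built with shift():

# In[13]:


#Creating targets: forward 14-day and 22-day returns for Gold
targets = DataFrame(values['Date'],columns=['Date'])
targets['Date']=pd.to_datetime(targets['Date'],format='%Y-%b-%d')
targets['Gold-T+14'] = (values['Gold'].shift(-14)/values['Gold'])-1
targets['Gold-T+22'] = (values['Gold'].shift(-22)/values['Gold'])-1
targets = targets.dropna(axis=0)
print(targets.shape)
targets.head()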
Code Example #17
    def file_load(self, datatype):
        '''
        sales_dict and ads_dict map each store/country to the directories
        holding its advertising data and sales data files.

        datatype=True opens advertising data; False opens sales data.

        start and end are datetimes and may be None; for now they are only
        used when reading sales data.

        Reading sales data: os.listdir locates the month folder of the
        data (e.g. 2017.03), and the files matching the given date range
        are read.

        Returns a DataFrame.
        '''

        ads_dict = {
            'SXDE': '/data/SX/EU/Ads/DE/ads report/',
            'SXES': '/data/SX/EU/Ads/ES/ads report/',
            'SXFR': '/data/SX/EU/Ads/FR/ads report/',
            'SXIT': '/data/SX/EU/Ads/IT/ads report/',
            'SXUK': '/data/SX/EU/Ads/UK/ads report/',
            'SXJP': '/data/SX/Japan/Ads/',
            'SXCA': '/data/SX/North America/Ads/CA/ads report/',
            'SXUS': '/data/SX/North America/Ads/USA/ads report/',
            'HYYDE': '/data/HYY/EU/ads/DE/',
            'HYYES': '/data/HYY/EU/ads/ES/',
            'HYYFR': '/data/HYY/EU/ads/FR/',
            'HYYIT': '/data/HYY/EU/ads/IT/',
            'HYYUK': '/data/HYY/EU/ads/UK/',
            'HYYJP': '/data/HYY/Japan/Ads/',
            'HYYUS': '/data/HYY/North America/ads/USA/ads report/',
            'TXHLDE': '/data/TXHL/EU/ads/DE/',
            'TXHLES': '/data/TXHL/EU/ads/ES/',
            'TXHLFR': '/data/TXHL/EU/ads/FR/',
            'TXHLIT': '/data/TXHL/EU/ads/IT/',
            'TXHLUK': '/data/TXHL/EU/ads/UK/',
            'TXHLJP': '/data/TXHL/Japan/ads/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        sales_dict = {
            'SXDE': '/data/SX/EU/business report/DE/',
            'SXES': '/data/SX/EU/business report/ES/',
            'SXFR': '/data/SX/EU/business report/FR/',
            'SXIT': '/data/SX/EU/business report/IT/',
            'SXUK': '/data/SX/EU/business report/UK/',
            'SXJP': '/data/SX/Japan/business report/',
            'SXCA': '/data/SX/North America/business report/CA/',
            'SXUS': '/data/SX/North America/business report/USA/',
            'HYYDE': '/data/HYY/EU/business report/DE/',
            'HYYES': '/data/HYY/EU/business report/ES/',
            'HYYFR': '/data/HYY/EU/business report/FR/',
            'HYYIT': '/data/HYY/EU/business report/IT/',
            'HYYUK': '/data/HYY/EU/business report/UK/',
            'HYYJP': '/data/HYY/Japan/business report/',
            'HYYCA': '/data/HYY/North America/business report/CA/',
            'HYYUS': '/data/HYY/North America/business report/USA/',
            'TXHLDE': '/data/TXHL/EU/business report/DE/',
            'TXHLES': '/data/TXHL/EU/business report/ES/',
            'TXHLFR': '/data/TXHL/EU/business report/FR/',
            'TXHLIT': '/data/TXHL/EU/business report/IT/',
            'TXHLUK': '/data/TXHL/EU/business report/UK/',
            'TXHLJP': '/data/TXHL/Japan/business report/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        if datatype:
            ad_campaign = DataFrame()
            path = 'F:/PycharmFile' + ads_dict[self.store + self.country]   # path to the advertising data files
            file_fold = self.end.strftime('%Y') + '.' + self.end.strftime('%m')
            # TODO: build the folder and file names directly; open the file if it exists, otherwise search for it
            if os.path.isdir(path + file_fold):  # month folder found
                file_name = "ADs_" + self.store + self.country + "_" + str(self.end.year) + "-" \
                            + str(self.end.month) + "-" + str(self.end.day) + ".txt"
                if self.country == "JP":
                    ad_campaign = pd.read_table(path + file_fold + "/" + file_name, sep='\t', encoding='Shift-JIS')
                else:
                    ad_campaign = pd.read_table(path + file_fold + "/" + file_name, sep='\t', encoding='utf-8')

            return ad_campaign
        else:
            sales_df = DataFrame()
            path = 'F:/PycharmFile' + sales_dict[self.store + self.country]     # path to the sales data files
            delta = (self.end - self.start).days
            for i in range(delta+1):
                date = (self.start + timedelta(days=i))
                file_name = self.store + self.country + '-' + date.strftime('%y') + '-' + str(date.month)\
                + '-' + str(date.day) + '.csv'
                for root, subdirs, files, in os.walk(path):
                    for name in files:
                        if name == file_name:
                            print name
                            file_path = root + '/' + name
                            df = pd.read_csv(file_path, encoding='utf8')
                            sales_df = pd.concat([sales_df, df])

            sales_df = sales_df[[u'(子)ASIN', u'商品名称', u'买家访问次数', u'买家访问次数百分比',u'页面浏览次数',
                                 u'页面浏览次数百分比',
                                 u'购买按钮页面浏览率', u'已订购商品数量', u'订单商品数量转化率', u'已订购商品销售额',
                                 u'订单商品种类数']]
            print sales_df.head()

            return sales_df
Code Example #18
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            dfnu.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)
Code Example #19
def create_feature_map(features):  
    outfile = open('xgb.fmap', 'w')  
    i = 0  
    for feat in features:  
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))  
        i = i + 1  
    outfile.close()  

create_feature_map(feature_name)
import operator
xgb_importance = xgb_model.get_fscore(fmap='xgb.fmap')  
xgb_importance = sorted(xgb_importance.items(), key=operator.itemgetter(1))  
xgb_importance = DF(xgb_importance, columns=['name', 'fscore'])
print(xgb_importance)

online_xgb_set = xgb.DMatrix(online_train[feature_name],label=online_train['label'])
online_xgb_model = xgb.train(params,online_xgb_set,num_boost_round=xgb_model.best_iteration)
ans_xgb = online_xgb_model.predict(xgb.DMatrix(online_data[feature_name]))
submit_xgb = DF()
submit_xgb['id'] = online_data['user_id']
from sklearn.preprocessing import MinMaxScaler
st = MinMaxScaler()
submit_xgb['score'] = st.fit_transform(ans_xgb.reshape(-1,1)) # RANK
# submit_xgb['score'] = ans_xgb # Binary
print(submit_xgb.head(10))
print(submit_xgb['score'].describe())
submit_xgb.to_csv('Submit XGB.txt',index=False,header=False)


Code Example #20
re.findall(r'\W+', test_phrase3) # sequences of non-alphanumeric characters

#Except for the metacharacters (+ ? . * ^ $ ( ) [ ] { } | \), all characters match themselves.
# You can escape a metacharacter by preceding it with a backslash.
# In that case you should also use raw strings, otherwise you need a double backslash.
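# A quick sketch of that escaping rule in action:
re.findall(r'\$\d+', 'item costs $15 or $20')   # escaped '$' matches a literal dollar sign -> ['$15', '$20']
re.findall('\\$\\d+', 'item costs $15 or $20')  # same pattern without a raw string needs double backslashes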
#==============================================================================
# Categoricals
#==============================================================================

s = pd.Series(['a', 'b', 'c', 'a'], dtype="category")

#this is a great pattern for auto-creation of e.g. age ranges
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})
labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ]
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)
#==============================================================================
# McKinney's C2 - intro examples
#==============================================================================
path = '/Users/stevegoodman/Documents/Dev/pydata-book-master'
 
import json
with open(path+'/ch02/usagov_bitly_data2012-03-16-1331923249.txt','rb') as f:
    records = [json.loads(line) for line in f]
#records is a list of dicts
Code Example #21
def activity_forecast(activity_daily_stats: DataFrame) -> DataFrame:
    return activity_daily_stats.head(100)
Code Example #22
File: Data.py Project: DL1039/VBFM
rcParams['figure.figsize'] = 10, 8
sb.set_style('whitegrid')

#Load NN Data from mat file into dict.
mat=scipy.io.loadmat(r'/home/dl2020/Python/NeuralNetwork/NNData.mat')
print(mat.keys())

#returns the NNData value from the dict.
NNData=mat["NNData"]
print(NNData.shape)

#convert the array to a pandas DataFrame
df=DataFrame(NNData)

df.columns=['dev._stage','dimple_ang.','radii_ratio','orientation_ang.','area','force']
print(df.head(10))

#Checking for missing values
print(df.isnull().sum())

#print data information
print(df.info())

#Converting categorical variables to dummy indicators
stage=pd.get_dummies(df['dev._stage'],drop_first=False)
print(stage.head(10))
df.drop(['dev._stage'],axis=1,inplace=True)
df=pd.concat([df,stage],axis=1)
print(df.head())

#rename new columns
Code Example #23
File: app-reviews.py Project: AugustMD/app-review
review_data.loc[review_data["Title"] == "0", "Content"] = "0"
print(review_data[review_data["Content"] == "0"].shape)


# In[500]:


print(review_data[review_data["Content"] != "0"].shape)


# In[501]:


review_data_except_purchase = DataFrame(review_data[review_data["Content"] != "0"], columns=["Source","Date","Name","Title","Content","Rating"])
print(review_data_except_purchase.shape)
review_data_except_purchase.head()


# In[502]:


def clean_exchange_text(content):
    content = str(content)
    word1 = "환불"  # Korean for "refund"
    if word1 in content:
        return 0
    else:
        return content


# In[503]:
Code Example #24
def main():
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print(' Plotting the classification based on Quartiles of File 04.01')

    print('')
    print('------------------------------------------------------------')
    print(' IMPORTANT MEMO: Dependencies:')
    print(' Have you run:')
    print('   1) the file 01.01 to get the csv files?')
    print('   2) the file 04.01 to get the classification?')

    print('')
    print('------------------------------------------------------------')
    print(' Reading the start, end date and the period:')
    input_file = '00.00.PARAMETERS.txt'

    f_in = open(input_file, 'r')
    lines = f_in.readlines()[1:]  # read from the second line on; each line is stored as a string
    print(lines)
    # print (lines[0].split('=', 1)[0])
    start_date = lines[0].split(' ', 1)[0]
    stop_date = lines[1].split(' ', 1)[0]
    period = int(lines[2].split(' ', 1)[0])

    print(' Using the following:')
    print(' start_date (closest to us) = ' + start_date)
    print(' stop_date (the most in the past) = ' + stop_date)
    print(' period (days) = ' + str(period))
    print('')

    if period != 30 and period != 7 and period != 1:
        print(' ERROR: period must be 1, 7 or 30')
        quit()

    f_in.close()

    start = datetime.strptime(start_date, "%Y-%m-%d")
    stop = datetime.strptime(stop_date, "%Y-%m-%d")
    day = start.strftime('%Y-%m-%d')

    print('')
    print('------------------------------------------------------------')
    print(' Folder of the Outputs : ')
    folder_RES = '04.02.RES-Quartiles-Plots-e-Graphs'
    print(folder_RES)
    # creating the folder if it does not exist
    if not os.path.exists(folder_RES):
        os.makedirs(folder_RES)

    #quit()

    print('')
    print(' ---------------------------------')
    print(' Where taking the csv file from')
    folder_input = '04.01.RES-Quartiles'
    print(folder_input)
    csv_file_type = folder_input

    print('')
    print(' ---------------------------------')
    print(
        ' Saving the columns x = log10 audience; y = log10 revenues , color = reach'
    )
    # name of x, y, color to take from var_df
    List_name_col = {
        'x': 'log_audience',
        'y': 'log_sum_revenue_euro',
        'color': 'reach'
    }
    print(List_name_col)

    while start > stop:
        # print (day)  # start.strftime('%Y-%m-%d'))
        # tbl = 'Data.' + day  # name of the table to be written

        print('')
        print(
            '/////////////////////////////////////////////////////////////////////////////////////////////////////'
        )
        #MYFILE = csv_file_type + day + '.csv'  # 'TABLE_v02.02_No-duplicate-funnel.csv'
        #print ('Table now (MYFILE) = ' + MYFILE)
        print(' +++ Date now = ' + day)
        name_df = folder_input + '/RES.Date-' + day + '.period-' + str(
            period) + '.clients_id-idxColorReach-idxYRev.01-Unsorted.csv'
        print(' name_df = ' + name_df)

        print('')
        print(' -------------------------------------')
        print(' Reading the database of 04.01 and changing names of columns')
        df_input = pd.read_csv(name_df,
                               na_values=['None'],
                               skiprows=3,
                               skip_blank_lines=True,
                               thousands=',',
                               index_col=False)
        print(df_input.head())
        #quit()

        print('')
        print(' -------------------------------------')
        print(' Changing the names of the columns for practical purposes')
        df_input.rename(columns={
            'index_' + List_name_col['color']:
            'index_quartile_reach'
        },
                        inplace=True)
        df_input.rename(columns={
            'index_' + List_name_col['y']:
            'index_quartile_log_rev_eur'
        },
                        inplace=True)
        print(df_input.head())

        #quit()

        # ---------------------------------------------------------------------------------------------------
        print('')
        print('-------------------------------------')
        print(' Selecting the clients that are Tier1 according to Criteo')

        # save all coord for plots of all points
        x_all = df_input['log_audience']
        y_all = df_input['log_sum_revenue_euro']

        x_tier_1 = []
        y_tier_1 = []

        l_print_tier1 = False

        for index, row in df_input.iterrows():
            if l_print_tier1: print(index),
            if df_input['is_tier_1'][index] == 1:
                if l_print_tier1: print(df_input['is_tier_1'][index]),
                # here we are ok
                x_tier_1.append(df_input['log_audience'][index])
                y_tier_1.append(df_input['log_sum_revenue_euro'][index])
            if l_print_tier1: print('')

        print('Total number of clients now = ' + str(len(x_all)))
        print('Out of which are Tier 1 = ' + str(len(x_tier_1)))

        print('')
        print('-------------------------------------')
        print(' Making a figure with all clients, without Tier 1 highlighted')

        # plot 2d: # 5: Audience vs CPC & color = reach
        fig1 = plt.figure()
        plt.title('Clients Segmentation: Date ' + day + ', Period ' +
                  str(period) + '\n number of total clients = ' +
                  str(len(x_all)))
        #+ ' (' + str(int(len(x_tier_1) * 100 / len(x_all))) + '%)')
        ax1 = fig1.add_subplot(1, 1, 1)

        plt.xlabel('log_audience')
        plt.ylabel('log_sum_revenue_euro')

        plt.xlim([2, 8])
        plt.ylim([-2, 6.5])

        # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only

        # multiple series
        # all points
        sc = ax1.scatter(x_all,
                         y_all,
                         c=df_input['reach'],
                         cmap='rainbow',
                         vmin=0.0,
                         vmax=1.0,
                         marker="o",
                         label="All Points")
        fig1.colorbar(sc)  # colorbar is a Figure method and needs the mappable
        # points that are tier 1
        #ax1.scatter(x_tier_1, y_tier_1, c='black', marker="1", label="Tier1", s=100)
        plt.legend(loc='upper left')

        plt.grid(True)

        subfolder_now = '/01.01.graph-All-vs-Tier1'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)

        name_fig1 = folder_RES + subfolder_now + '/Fig.Date-' + day + '.period-' + str(
            period) + '.All.png'
        plt.savefig(name_fig1, format='png')
        print('  ==> Figure now = ' + name_fig1)
        # plt.show() # caution: it stops the flow of the program
        plt.draw()  # draws without stopping the flow of the program
        plt.clf()  # clear figure
        plt.close()  # close the figure window and continue

        print('')
        print('-------------------------------------')
        print(' Making a figure to compare the Tier 1 vs non-tier 1')

        # plot 2d: # 5: Audience vs CPC & color = reach
        fig1 = plt.figure()
        plt.title('Clients Segmentation: Date ' + day + ', Period ' +
                  str(period) + '\n number of total clients = ' +
                  str(len(x_all)) + ', of which tier 1 = ' +
                  str(len(x_tier_1)) + ' (' +
                  str(int(len(x_tier_1) * 100 / len(x_all))) + '%)')
        ax1 = fig1.add_subplot(1, 1, 1)

        plt.xlabel('log_audience')
        plt.ylabel('log_sum_revenue_euro')

        plt.xlim([2, 8])
        plt.ylim([-2, 6.5])

        # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only

        # multiple series
        # all points
        ax1.scatter(x_all,
                    y_all,
                    c=df_input['reach'],
                    cmap='rainbow',
                    vmin=0.0,
                    vmax=1.0,
                    marker="o",
                    label="All Points")
        # points that are tier 1
        ax1.scatter(x_tier_1,
                    y_tier_1,
                    c='black',
                    marker="1",
                    label="Tier1",
                    s=100)
        plt.legend(loc='upper left')

        plt.grid(True)

        subfolder_now = '/01.01.graph-All-vs-Tier1'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)

        name_fig1 = folder_RES + subfolder_now + '/Fig.Date-' + day + '.period-' + str(
            period) + '.All-vs-Tier-1.png'
        plt.savefig(name_fig1, format='png')
        print('  ==> Figure now = ' + name_fig1)
        # plt.show() # caution: it stops the flow of the program
        plt.draw()  # draws without stopping the flow of the program
        plt.clf()  # clear figure
        plt.close()  # close the figure window and continue

        #---------------------------------------------------------------------------------------------------
        print('')
        print('-------------------------------------')
        print(' Plotting the Clients highlighted on the graph for each sector')

        # it works for 4 groups

        # for test purposes
        #coord_chosen = [df_input['log_audience'][0], df_input['log_sum_revenue_euro'][0]]
        #print (coord_chosen)
        #x_chosen = coord_chosen[0]
        #x2 = x_chosen
        #y_chosen = coord_chosen[1]
        #y2 = y_chosen

        # cycle on the points
        for i_reach_now in range(0, 4):
            # add 1 because the quartile index starts from 1
            index_reach_now = i_reach_now + 1
            for i_rev_now in range(0, 4):
                index_rev_now = i_rev_now + 1

                #print ('++++ reach, rev = ' + str(index_reach_now) + ', ' + str(index_rev_now))

                # initialize the vectors
                # declaration of the coordinates of the points in the group
                x_chosen = []
                y_chosen = []

                l_print = False

                #print ('---')
                #print (' Analyzing line by line for idx_reach = ' + str(index_reach_now) + ', index_log_rev = ' + str(index_rev_now))
                # searching in the lines
                for index, row in df_input.iterrows():
                    if l_print: print(index),
                    if df_input['index_quartile_reach'][
                            index] == index_reach_now:
                        if l_print:
                            print(df_input['index_quartile_reach'][index]),
                        if df_input['index_quartile_log_rev_eur'][
                                index] == index_rev_now:
                            if l_print:
                                print(df_input['index_quartile_log_rev_eur']
                                      [index])
                            # here we are ok
                            x_chosen.append(df_input['log_audience'][index])
                            y_chosen.append(
                                df_input['log_sum_revenue_euro'][index])
                    if l_print: print('')

                #print ('---')
                #print (' Plotting the Clients highlighted on the graph')
                # check
                #print (x_chosen)
                #print (y_chosen)
                x2 = x_chosen
                y2 = y_chosen

                number_clients = len(x_chosen)

                # plot 2d: # 5: Audience vs CPC & color = reach
                fig10 = plt.figure()
                plt.title('Clients Segmentation: Date ' + day + ', Period ' +
                          str(period) + '\nIdxReach' + str(index_reach_now) +
                          '-IdxRev' + str(index_rev_now) + '; num clients = ' +
                          str(number_clients))
                ax10 = fig10.add_subplot(1, 1, 1)

                plt.xlabel('log_audience')
                # plt.xlim([-2, 2])
                # ax5.set_xscale('log')  # log scale
                # y = clean_clust_df[POI[4]]*100
                # plt.ylabel(POI[4] + '*100')
                # log => no x 100

                plt.ylabel('log_sum_revenue_euro')

                plt.xlim([2, 8])
                plt.ylim([-2, 6.5])

                # select the x and y -> x2, y2 -> made above: ATTENTION: for this example only

                # multiple series
                # all points
                ax10.scatter(x_all,
                             y_all,
                             c=df_input['reach'],
                             cmap='rainbow',
                             vmin=0.0,
                             vmax=1.0,
                             marker="o",
                             label="All Points")
                # points in the group
                #ax10.scatter(x2, y2, c='black', marker="s", label="In the sector", s=100)

                ax10.scatter(x2,
                             y2,
                             facecolors='none',
                             edgecolors='black',
                             marker="s",
                             label="In the sector",
                             s=100)
                # points that are tier 1
                ax10.scatter(x_tier_1,
                             y_tier_1,
                             c='black',
                             marker="1",
                             label="Tier1",
                             s=100)
                plt.legend(loc='upper left')

                # plt.ylim([-2, 40])
                # plt.ylim([1, 9])
                # ax5.set_yscale('log')  # log scale
                # plt.colorbar(ax.imshow(image, interpolation='nearest'))
                # plt.scatter(x, y, c=var_df[POI_here[2]], cmap='rainbow', vmin=0.0, vmax=1.0)
                # ax10.colorbar()
                plt.grid(True)
                # cbar = plt.colorbar()
                # cbar.set_label('Reach', rotation=270)
                # plt.scatter(x, y, c=clean_clust_df[POI[2]], cmap=plt.cm.bwr_r)
                # cmap = sns.diverging_palette(5, 250, as_cmap=True)
                subfolder_now = '/02.01.graph-All-vs-Groups'
                # creating the folder if it does not exist
                directory = folder_RES + subfolder_now
                if not os.path.exists(directory):
                    os.makedirs(directory)
                name_fig10 = folder_RES + subfolder_now + '/Fig.Date-'+ day + '.period-' + str(period) +'.IdxReach' + str(index_reach_now) \
                             + '-IdxRev' + str(index_rev_now) + '.png'
                plt.savefig(name_fig10, format='png')
                print('  ==> Figure now = ' + name_fig10)
                # plt.show() # caution: it stops the flow of the program
                plt.draw()  # draws without stopping the flow of the program
                plt.clf()  # clear figure
                plt.close()  # close the figure window and continue

                del x_chosen
                del y_chosen

                #print (' End of cycle on the rows')

        print('')
        print(
            '----------------------------------------------------------------------------'
        )
        print(
            ' Initialization for counting of how many elements in each of the groups and how many of these are tier 1'
        )

        num_sectors_reach = 4  # x
        num_sectors_log_rev = 4  # y

        # number of clients in each sector: first index = x = reach, second index = y = log_rev
        num_client_in_group = [[0] * num_sectors_log_rev
                               for x in xrange(num_sectors_reach)]
        print(' Check: must be 4x4 null: '),
        print(num_client_in_group)

        # number of clients in each sector that are tier_1: first index = x = reach, second index = y = log_rev
        num_client_in_group_tier1 = [[0] * num_sectors_log_rev
                                     for x in xrange(num_sectors_reach)]
        print(' Check: must be 4x4 null: '),
        print(num_client_in_group_tier1)

        print('')
        print('------------------------------------------')
        print(' Reading the elements in each group from RES-04.01 *03-Count')
        #print (' +++ Date now = ' + day)
        name_df = folder_input + '/RES.Date-' + day + '.period-' + str(
            period) + '.clients_id-idxColorReach-idxYRev.03-Count.csv'
        print(' name_df = ' + name_df)
        df_input_3 = pd.read_csv(name_df,
                                 na_values=['None'],
                                 skiprows=6,
                                 skip_blank_lines=True,
                                 thousands=',',
                                 index_col=False)
        print(df_input_3.head())

        #quit()

        for index, row in df_input_3.iterrows():
            index_of_reach_in_vector = df_input_3['index_reach'][index] - 1
            index_of_logrev_in_vector = df_input_3['index_revenues'][index] - 1
            num_client_in_group[index_of_reach_in_vector][
                index_of_logrev_in_vector] = df_input_3[
                    'number_clients_in_sector'][index]
            num_client_in_group_tier1[index_of_reach_in_vector][
                index_of_logrev_in_vector] = df_input_3[
                    'number_clients_tier1'][index]

        # for index, row in df_input.iterrows():
        #     # counting number of elements in each group and how many are tier 1 -> plot of heatmap
        #     # make plus 1
        #     index_of_reach_in_vector = df_input['index_quartile_reach'][index] - 1
        #     index_of_logrev_in_vector = df_input['index_quartile_log_rev_eur'][index] - 1
        #     num_client_in_group[index_of_reach_in_vector][index_of_logrev_in_vector] += 1
        #     # check
        #     if df_input['is_tier_1'][index] == 1:
        #         num_client_in_group_tier1[index_of_reach_in_vector][index_of_logrev_in_vector] += 1
        #
        #     # print ('print (index, index_of_reach_in_vector, index_of_logrev_in_vector, df_input[is_tier_1][index]) = '),
        #     # print (index, index_of_reach_in_vector, index_of_logrev_in_vector, df_input['is_tier_1'][index])
        #     # print ('num_client_in_group       : '),
        #     # print (num_client_in_group)
        #     #  print ('num_client_in_group_tier1 : '),
        #     # print (num_client_in_group_tier1)
        #     # print ('----')

        print('Final count:')
        print('num_client_in_group       : '),
        print(num_client_in_group)
        print('num_client_in_group_tier1 : '),
        print(num_client_in_group_tier1)

        del df_input_3

        print('')
        print('------------------------------------------')
        print(' Plotting the Heatmap of number of clients')
        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot_index = [1, 2, 3, 4]  #
        dfplot_cols = [1, 2, 3, 4]  #
        dfplot = DataFrame(num_client_in_group,
                           index=dfplot_index,
                           columns=dfplot_cols)
        print(dfplot.head())

        ax3 = sns.heatmap(dfplot, annot=True, fmt='g')
        # for t in ax3.texts: t.set_text(t.get_text() + " AllClients")  # add percentage in notation

        plt.title('Clients: Date ' + day + ', Period ' + str(period) +
                  '\nHow many clients in each sector?')
        plt.xlabel('Index of Log Revenue Euro')
        plt.yticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)

        plt.tight_layout()
        subfolder_now = '/03.01.heatmap-groups-all'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name = folder_RES + subfolder_now + '/Heatmap-01-AllClients.Date-' + day + '.period-' + str(
            period) + '.png'
        print('   ==> Figure = ' + fig_name)
        plt.savefig(fig_name)
        plt.clf()
        del dfplot  # clean dataframe per plot

        print('')
        print(
            '----------------------------------------------------------------------------'
        )
        print(
            ' Plotting Graph with numbers of clients that are classified as tier1'
        )

        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot2_index = [1, 2, 3, 4]  #
        dfplot2_cols = [1, 2, 3, 4]  #
        dfplot2 = DataFrame(num_client_in_group_tier1,
                            index=dfplot2_index,
                            columns=dfplot2_cols)
        print(dfplot2.head())

        ax2 = sns.heatmap(dfplot2, annot=True, fmt='g')
        # for t in ax2.texts: t.set_text(t.get_text() + " Tier1") # add percentage in notation

        # NOTE: x and y are reversed here wrt to the graphs
        plt.title(
            'Clients: Date ' + day + ', Period ' + str(period) +
            '\nHow many clients in each sector are classified as Tier1 by Criteo?'
        )
        plt.xlabel('Index of Log Revenue Euro')
        plt.yticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)

        plt.tight_layout()
        subfolder_now = '/04.01.heatmap-groups-tier-1-absolute'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name2 = folder_RES + subfolder_now + '/Heatmap-02-Tier1Abs.Date-' + day + '.period-' + str(
            period) + '.png'
        print(' ==> Figure = ' + fig_name2)
        plt.savefig(fig_name2)
        plt.clf()

        del dfplot2

        print('')
        print(
            '----------------------------------------------------------------------------'
        )
        print(
            ' Plotting Graph with % of clients that are tier 1 in each sector')

        # create a dataframe to plot
        # https://stackoverflow.com/questions/12286607/python-making-heatmap-from-dataframe
        dfplot3_index = [1, 2, 3, 4]  #
        dfplot3_cols = [1, 2, 3, 4]  #

        # compute percentage
        perc_clients_tier_1 = [[0] * num_sectors_log_rev
                               for x in xrange(num_sectors_reach)]
        #print (perc_clients_tier_1)
        for i_reach in range(0, 4):
            for j_rev in range(0, 4):
                if num_client_in_group[i_reach][j_rev] == 0:
                    perc_clients_tier_1[i_reach][j_rev] = 0.0
                else:
                    perc_clients_tier_1[i_reach][j_rev] = float(
                        num_client_in_group_tier1[i_reach][j_rev]
                    ) * 100.0 / float(num_client_in_group[i_reach][j_rev])
        #print (perc_clients_tier_1)

        # quit()

        dfplot3 = DataFrame(perc_clients_tier_1,
                            index=dfplot3_index,
                            columns=dfplot3_cols)
        print(dfplot3.head())

        ax3 = sns.heatmap(dfplot3, annot=True, fmt='.0f', vmin=0,
                          vmax=100)  # fmt='g')
        for t in ax3.texts:
            t.set_text(t.get_text() + " %")  # add percentage in notation

        # NOTE: x and y are reversed here with respect to the graphs
        plt.title(
            'Clients: Date ' + day + ', Period ' + str(period) +
            '\nWhat percentage of clients in each sector is classified as Tier1 by Criteo?'
        )
        plt.xlabel('Index of Log Revenue Euro')
        plt.xticks(rotation=0)
        plt.ylabel('Index of Reach')
        plt.yticks(rotation=0)

        plt.tight_layout()
        subfolder_now = '/05.01.heatmap-groups-tier-1-percentage'
        # creating the folder if it does not exist
        directory = folder_RES + subfolder_now
        if not os.path.exists(directory):
            os.makedirs(directory)
        fig_name3 = folder_RES + subfolder_now + '/Heatmap-03-Tier1Perc.Date-' + day + '.period-' + str(
            period) + '.png'
        print(' ==> Figure = ' + fig_name3)
        plt.savefig(fig_name3)
        plt.clf()

        del dfplot3

        print('')
        print('----------------------------------------')
        print(' Cleaning the dataframe for this period')
        del df_input

        #quit() # 1 date only

        # cycles
        start = start - timedelta(days=period)
        day = start.strftime('%Y-%m-%d')

        # clean the df
        #del clean_clust_df

    quit()
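A minimal self-contained sketch of the tier-1 percentage heatmap built above (toy 2x2 counts; the names below are illustrative, not from the original script):

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame

# toy counts: clients per sector, and how many of them are Tier 1
total_counts = np.array([[10, 0], [4, 8]])
tier1_counts = np.array([[5, 0], [1, 2]])

# guard against empty sectors before dividing, as the loop above does
perc = np.where(total_counts == 0, 0.0,
                tier1_counts * 100.0 / np.maximum(total_counts, 1))

ax = sns.heatmap(DataFrame(perc), annot=True, fmt='.0f', vmin=0, vmax=100)
for t in ax.texts:
    t.set_text(t.get_text() + ' %')  # append a percent sign to each annotation
plt.show()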
コード例 #25
0
from pandas import DataFrame


def add_state_names(my_df):

    #
    ##breakpoint()

    new_df = my_df.copy()
    names_map = {"CA": "Cali", "CO": "Colo", "CT": "Conn"}
    new_df["name"] = new_df["abbrev"].map(names_map)
    return new_df


if __name__ == "__main__":

    df = DataFrame({"abbrev": ["CA", "CO", "CT", "DC", "TX"]})
    #breakpoint()
    print(df.columns)  # property
    print(df.head())  # method

    df2 = add_state_names(df)
    print(df2.head())

    #df3 = DataFrame({"a":[1,2,3,4]})
    #print(df3.head())
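Note that abbreviations absent from names_map ("DC" and "TX" above) come out as NaN, since Series.map leaves unmatched keys unfilled; a follow-up fillna gives them a default:

df2["name"] = df2["name"].fillna("Unknown")  # default for unmapped abbreviations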
コード例 #26
0
ファイル: 01-csv.py プロジェクト: GunterOdimm/python
print_df(df.isnull().sum())

# add a column with the average score
df['평균'] = df.mean(axis=1)

conditions = [
    (df['평균'] >= 90),
    (df['평균'] >= 80),
    (df['평균'] >= 70),
    (df['평균'] < 70),
]

grade = ['A', 'B', 'C', 'F']

df['학점'] = numpy.select(conditions, grade)
print_df(df.head(5))

# save the generated result as a csv
NT = dt.datetime.now().strftime("%y%m%d_%H%M%S")
filename = "grade" + NT + ".csv"

df.to_csv(filename,
          encoding='euc-kr',
          na_rep='NaN',
          index_label='이름',
          header=['국', '영', '수', '과', '평균', '학점'])

# data visualization
cnt = df['학점'].value_counts()
result_df = DataFrame(cnt)
print_df(result_df)
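For reference, a self-contained sketch of the numpy.select grading pattern used above (toy scores; the column names are illustrative):

import numpy
from pandas import DataFrame

df = DataFrame({'score': [95, 84, 71, 60]})
conditions = [df['score'] >= 90, df['score'] >= 80,
              df['score'] >= 70, df['score'] < 70]
# numpy.select picks the choice for the first condition that is True
df['grade'] = numpy.select(conditions, ['A', 'B', 'C', 'F'])
print(df)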
コード例 #27
0
print BabyDataSet[:10]

df = DataFrame(data=BabyDataSet, columns=["Names", "Births"])

print df[:10]

df.to_csv('births1880.txt', index=False, header=False)

Location = r'births1880.txt'

df = read_csv(Location)

print df

print df.head()

df = read_csv(Location, header=None)

print df

print df.tail()

df = read_csv(Location, names=['Names', 'Births'])

print df.head()

import os

os.remove(Location)
コード例 #28
0
def editor(interrogation,
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           span_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05,
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime

    # STRINGTYPE normally comes from corpkit's module scope; on Python 3 it is str
    STRINGTYPE = str

    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    # new ipython error
    except AttributeError:
        have_ipython = False
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator,
                          STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum:
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')

                if kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results

            outdict[k] = v.results.edit(**locs)
        if print_info:

            thetime = strftime("%H:%M:%S", localtime())
            print(
                "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r'):
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results

    elif isinstance(interrogation, Concordance) or \
                        all(x in list(interrogation.columns) for x in ['l', 'm', 'r']):
        return_conc = True
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack

    if checkstack('pythontex'):
        print_info = False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """
        Mash df and df2 together in appropriate way
        """
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(
                        df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print(
                                'Removing %d entries below threshold:\n    %s'
                                % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' %
                                  (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2

            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {
                        '%': datum.div,
                        '*': datum.mul,
                        '/': datum.div,
                        '+': datum.add,
                        '-': datum.sub
                    }

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def skip_keep_merge_span(df):
        """
        Do all skipping, keeping, merging and spanning
        """
        from corpkit.dictionaries.process_types import Wordlist
        if skip_entries:
            if isinstance(skip_entries, (list, Wordlist)):
                df = df.drop(list(skip_entries), axis=1, errors='ignore')
            else:
                df = df.loc[:, ~df.columns.str.contains(skip_entries)]
        if just_entries:
            if isinstance(just_entries, (list, Wordlist)):
                je = [i for i in list(just_entries) if i in list(df.columns)]
                df = df[je]
            else:
                df = df.loc[:, df.columns.str.contains(just_entries)]
        if merge_entries:
            for newname, crit in merge_entries.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    cr = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[cr].sum(axis=1)
                    df = df.drop(list(cr), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
        if span_entries:
            df = df.iloc[:, span_entries[0]:span_entries[1]]
        if skip_subcorpora:
            if isinstance(skip_subcorpora, (list, Wordlist)):
                df = df.drop(list(skip_subcorpora), axis=0, errors='ignore')
            else:
                df = df[~df.index.str.contains(skip_subcorpora)]
        if just_subcorpora:
            if isinstance(just_subcorpora, (list, Wordlist)):
                js = [i for i in list(just_subcorpora) if i in list(df.index)]
                df = df.loc[js]
            else:
                df = df[df.index.str.contains(just_subcorpora)]
        if merge_subcorpora:
            df = df.T
            for newname, crit in merge_subcorpora.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[list(crit)].sum(axis=1)
                    df = df.drop(list(crit), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
            df = df.T
        if span_subcorpora:
            df = df.iloc[span_subcorpora[0]:span_subcorpora[1], :]
        return df

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [
                    word for index, word in enumerate(list(df))
                    if index in the_input
                ]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [
                        word for word in the_input if word in df.columns
                    ]
                except AttributeError:  # if series
                    parsed_input = [
                        word for word in the_input if word in df.index
                    ]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for synset in wn.synsets(w, pos=pos):
                    for lemma in synset.lemma_names():
                        syns.append(lemma)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' %
                          (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [
                re.sub(to_find, replacement, l) for l in list(df.columns)
            ]
        df = merge_duplicates(df, print_info=False)
        return df

    def newname_getter(df,
                       parsed_input,
                       newname='combine',
                       prinf=True,
                       merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()),
                              key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' %
                          (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' %
                          (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def lingres(ser, index):
        from scipy.stats import linregress
        from pandas import Series
        ix = ['slope', 'intercept', 'r', 'p', 'stderr']
        return Series(linregress(index, ser.values), index=ix)

    def do_stats(df):
        """do linregress and add to df"""

        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' %
                  thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))

        stats = df.apply(lingres, axis=0, index=x)
        df = df.append(stats)
        df = df.replace([np.inf, -np.inf], 0.0)

        return df

    def resort(df, sort_by=False, keep_stats=False):
        """
        Sort results, potentially using scipy's linregress
        """

        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least', 'reverse']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total',
                                      ascending=sort_by != 'total',
                                      axis=1)

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False

        if isinstance(df, Series):
            if stats_done:
                stats = df.ix[range(-5, 0)]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            elif sort_by == 'reverse':
                df = df[::-1]
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(
                df.sum().sort_values(ascending=sort_by != 'total').index)]

        elif sort_by == 'reverse':
            df = df.T[::-1].T
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T

        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2.iloc[:, 0])
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
            #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:
            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    df = skip_keep_merge_span(df)
    try:
        df2 = skip_keep_merge_span(df2)
    except:
        pass

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df,
                                df2,
                                operation=operation,
                                threshold=tshld,
                                prinf=print_info)

    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df,
                      df2,
                      selfdrop=selfdrop,
                      threshold=threshold,
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = '%s: keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'
                pass

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'

    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        if df.empty:
            datatype = 'object'
        else:
            datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    lns = None
    if isinstance(getattr(interrogation, 'concordance', None), Concordance):
        try:
            col_crit = interrogation.concordance['m'].map(
                lambda x: x in list(df.columns))
            ind_crit = interrogation.concordance['c'].map(
                lambda x: x in list(df.index))
            lns = interrogation.concordance[col_crit]
            lns = lns.loc[ind_crit]
            lns = Concordance(lns)
        except ValueError:
            lns = None

    output = Interrogation(results=df,
                           totals=total,
                           query=locs,
                           concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
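A minimal sketch of calling editor() directly on a plain DataFrame of counts (assuming corpkit and its dependencies are installed; the data and names below are toy values, not from the original):

from pandas import DataFrame

counts = DataFrame({'cat': [2, 4], 'dog': [8, 6]}, index=['1990', '2000'])
edited = editor(counts, '%', counts.sum(axis=1))
print(edited.results)  # each row rescaled so that it sums to 100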
コード例 #29
0
X=iris.data


##### Now we will apply kmeans for each value of k from 1 to 10 

# In[195]:

KM=[kmeans(X,k) for k in K]
print type(KM),len(KM)


# In[196]:

KM_df=DataFrame(KM)
print KM_df.head(1)


# In[197]:

print KM_df.tail(1)


# In[198]:

KM_df.shape


# In[199]:

KM_v1=KM_df[0]
コード例 #30
0
                               "2011-04-12 10:30:00.0000000",
                               "2012-04-12 10:30:00.0000000")
    # table, timestamps, columns, data = service.sample_data('MAC000246', '2012-04-12 10:30:00.0000000', '2012-05-12 10:30:00.0000000')
    # table, timestamps, labels, data = service.hhourly_reading("MAC000246", "2011-04-12 10:30:00.0000000", "2012-04-12 10:30:00.0000000")

    # Solar
    # table, timestamps, labels, data = service.get_readings("control", "", "2015-04-15 18:00:00", "2015-04-15 19:16:18")

    table, timestamps, labels, data = service.parse_response(
        res, DailyReadingSchema())

    sample = DataFrame(data=data, columns=labels)
    sample['timestamp'] = timestamps

    print(sample.describe())
    print(sample.head())

    sample.tss.format_index(timestamps, INFLUX_TS_FMT)

    subsample = sample[2:4]
    print(len(subsample))

    sample.tss.day_of_week_class()

    # stationality = sample.tss.stationality('energy_mean')
    #
    # auto_corr = sample.tss.autocorrelation('energy_mean')

    stationality = sample.tss.stationality('power')

    auto_corr = sample.tss.autocorrelation('power')
コード例 #31
0
ファイル: example.py プロジェクト: id774/sandbox
from pandas import DataFrame
import pandas as pd
import xlrd  # needed when reading xls files
import numpy as np
import sqlite3

# create a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# accessing elements of the DataFrame
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.loc['one']
frame2.describe()  # summary
print(frame2.describe())

# load data
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; xls files also work
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)
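With modern pandas, the two-step ExcelFile/parse above collapses into a single call (same hypothetical file name):

data = pd.read_excel('stock_px.xlsx', sheet_name='stock_px')
print(data.head())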
コード例 #32
0
# Create dataframe
aonao = DataFrame({'AO': AO, 'NAO': NAO})

# Plot data
aonao.plot(subplots=True)

# Reference data by column name or method of dataframe variable
print(aonao['NAO'])
print(aonao.NAO)

# Add column to dataframe
aonao['Diff'] = aonao['AO'] - aonao['NAO']

# Show first several lines of new dataframe
print(aonao.head())

# Remove column from dataframe
del aonao['Diff']

# Show last few lines of dataframe
print(aonao.tail())

# Show slice from dataframe
print(aonao['1981-01':'1981-03'])

# Complex indexing example
import datetime

aonao.loc[(aonao.AO > 0) & (aonao.NAO < 0) &
          (aonao.index > datetime.datetime(1980, 1, 1)) &
コード例 #33
0
}
df = DataFrame(data).set_index("poly_range")
df

plt.plot(poly_range, df["mse_lr"], label="lr")
plt.plot(poly_range, df["mse_ridge"], label="ridge")
plt.plot(poly_range, df["mse_lasso"], label="lasso")
plt.legend()
plt.show()

df.min()
df["mse_ridge"].sort_values().head()

## Exercise
df = pd.read_csv("./K-MOOC_machine_learning/ch4/yield.csv", sep="\t")
df.head()

X = df["Temp"]
y = df["Yield"]
X = X.values.reshape(-1, 1)

mse = []
poly_range = range(2, 10)
for poly_value in poly_range:
    poly_features = PolynomialFeatures(degree=poly_value)

    X_poly = poly_features.fit_transform(X)

    lr = LinearRegression()
    lr.fit(X_poly, y)
コード例 #34
0
    def file_load(self, datatype):
        '''
        sales_dict and ads_dict map each store+country code to the file
        directories of its advertising data and sales data.

        datatype=True opens the ads data; False opens the sales data.

        start and end are the time bounds and may be None; for now they are
        only used when reading the sales data.

        How sales data is read: os.listdir locates the monthly data folder
        (e.g. 2017.03), file names are matched against the requested time
        range, and the data inside that range is read.

        Returns a DataFrame object.

        '''

        ads_dict = {
            'SXDE': '/data/SX/EU/Ads/DE/ads report/',
            'SXES': '/data/SX/EU/Ads/ES/ads report/',
            'SXFR': '/data/SX/EU/Ads/FR/ads report/',
            'SXIT': '/data/SX/EU/Ads/IT/ads report/',
            'SXUK': '/data/SX/EU/Ads/UK/ads report/',
            'SXJP': '/data/SX/Japan/Ads/',
            'SXCA': '/data/SX/North America/Ads/CA/ads report/',
            'SXUS': '/data/SX/North America/Ads/USA/ads report/',
            'HYYDE': '/data/HYY/EU/ads/DE/',
            'HYYES': '/data/HYY/EU/ads/ES/',
            'HYYFR': '/data/HYY/EU/ads/FR/',
            'HYYIT': '/data/HYY/EU/ads/IT/',
            'HYYUK': '/data/HYY/EU/ads/UK/',
            'HYYJP': '/data/HYY/Japan/Ads/',
            'HYYUS': '/data/HYY/North America/ads/USA/ads report/',
            'TXHLDE': '/data/TXHL/EU/ads/DE/',
            'TXHLES': '/data/TXHL/EU/ads/ES/',
            'TXHLFR': '/data/TXHL/EU/ads/FR/',
            'TXHLIT': '/data/TXHL/EU/ads/IT/',
            'TXHLUK': '/data/TXHL/EU/ads/UK/',
            'TXHLJP': '/data/TXHL/Japan/ads/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        sales_dict = {
            'SXDE': '/data/SX/EU/business report/DE/',
            'SXES': '/data/SX/EU/business report/ES/',
            'SXFR': '/data/SX/EU/business report/FR/',
            'SXIT': '/data/SX/EU/business report/IT/',
            'SXUK': '/data/SX/EU/business report/UK/',
            'SXJP': '/data/SX/Japan/business report/',
            'SXCA': '/data/SX/North America/business report/CA/',
            'SXUS': '/data/SX/North America/business report/USA/',
            'HYYDE': '/data/HYY/EU/business report/DE/',
            'HYYES': '/data/HYY/EU/business report/ES/',
            'HYYFR': '/data/HYY/EU/business report/FR/',
            'HYYIT': '/data/HYY/EU/business report/IT/',
            'HYYUK': '/data/HYY/EU/business report/UK/',
            'HYYJP': '/data/HYY/Japan/business report/',
            'HYYCA': '/data/HYY/North America/business report/CA/',
            'HYYUS': '/data/HYY/North America/business report/USA/',
            'TXHLDE': '/data/TXHL/EU/business report/DE/',
            'TXHLES': '/data/TXHL/EU/business report/ES/',
            'TXHLFR': '/data/TXHL/EU/business report/FR/',
            'TXHLIT': '/data/TXHL/EU/business report/IT/',
            'TXHLUK': '/data/TXHL/EU/business report/UK/',
            'TXHLJP': '/data/TXHL/Japan/business report/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        if datatype:
            ad_campaign = DataFrame()
            path = 'F:/PycharmFile' + ads_dict[self.store +
                                               self.country]  # path to the ads data files
            file_fold = self.end.strftime('%Y') + '.' + self.end.strftime('%m')
            # TODO: build the folder and file names directly; open the file if it exists, otherwise search for it
            if os.path.isdir(path + file_fold):  # monthly folder found
                file_name = "ADs_" + self.store + self.country + "_" + str(self.end.year) + "-" \
                            + str(self.end.month) + "-" + str(self.end.day) + ".txt"
                if self.country == "JP":
                    ad_campaign = pd.read_table(path + file_fold + "/" +
                                                file_name,
                                                sep='\t',
                                                encoding='Shift-JIS')
                else:
                    ad_campaign = pd.read_table(path + file_fold + "/" +
                                                file_name,
                                                sep='\t',
                                                encoding='utf-8')

            return ad_campaign
        else:
            sales_df = DataFrame()
            path = 'F:/PycharmFile' + sales_dict[self.store +
                                                 self.country]  # path to the sales data files
            delta = (self.end - self.start).days
            for i in range(delta + 1):
                date = (self.start + timedelta(days=i))
                file_name = self.store + self.country + '-' + date.strftime('%y') + '-' + str(date.month)\
                + '-' + str(date.day) + '.csv'
                for root, subdirs, files, in os.walk(path):
                    for name in files:
                        if name == file_name:
                            print name
                            file_path = root + '/' + name
                            df = pd.read_csv(file_path, encoding='utf8')
                            sales_df = pd.concat([sales_df, df])

            sales_df = sales_df[[
                u'(子)ASIN', u'商品名称', u'买家访问次数', u'买家访问次数百分比', u'页面浏览次数',
                u'页面浏览次数百分比', u'购买按钮页面浏览率', u'已订购商品数量', u'订单商品数量转化率',
                u'已订购商品销售额', u'订单商品种类数'
            ]]
            print sales_df.head()

            return sales_df
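A compact alternative sketch for the per-day concatenation above, assuming the same file naming scheme and directory layout (the pattern below is illustrative):

import glob
import pandas as pd

pattern = 'F:/PycharmFile/data/SX/EU/business report/UK/**/SXUK-*.csv'
frames = [pd.read_csv(p, encoding='utf8')
          for p in glob.glob(pattern, recursive=True)]
sales_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()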
コード例 #35
0
                                              "popRanking wrap"}).text.strip()
        #print(site_name)
        #print(site_name,site_city_parent,site_link,site_rating,site_category,site_speciality,sep='\n')
        site = [
            site_name, site_city_parent, site_link, site_rating, site_category,
            site_speciality
        ]
        sites.append(site)
    #print(sites[-1])
    #print(len(sites))
    return (sites)


#listing_info=soup.find_all("div",attrs={"class":"listing_info"})
listing_info = soup.find_all("div", attrs={"class": "listing_details"})
sites = extract_details(listing_info)
site_headers = [
    "site_name", "site_city_parent", "site_link", "site_rating",
    "site_category", "site_speciality"
]
len(sites)
sites_df = DataFrame(sites)
sites_df.columns = site_headers
print(sites_df.head())
#site_link='https://www.tripadvisor.in/'+'Attraction_Review-g668046-d2441213-Reviews-Double_Decker_Living_Root_Bridge-Cherrapunjee_East_Khasi_Hills_District_Meghalaya.html'
##fetch_hours_GPS(site_link)
##page=urlopen(site_link)
##soup = BeautifulSoup(page,'html.parser')
##print(len(soup))
##site_address=s.find("div",attrs={"class","detail_section address"}).text
コード例 #36
0
# In[ ]:

bizframe = DataFrame(bizrecords)
userframe = DataFrame(userrecords)
revframe = DataFrame(revrecords)
checkinframe = DataFrame(checkinrecords)


# In[ ]:

users_df = DataFrame(userrecords, columns=['user_id','yelping_since','review_count', 'average_stars', 'variance_in_rating'])


# In[ ]:

users_df.head(n=10)


# In[ ]:

bizframe_sub = DataFrame(bizrecords, columns=['business_id', 'name', 'categories', 'review_count', 'stars'])


# In[ ]:

bizframe_sub.head(n=10)


# In[ ]:

bsort = bizframe_sub.sort_values(by='review_count', ascending=False)
コード例 #37
0
df_roles = DataFrame(list(job_db.roles.find({})))

# In[7]:

#COUNTING THE USER JOB EXPERIENCE
df_UserJobExperience = DataFrame(list(job_db.UserJobExperience.find({})))
df_UserJobExperience['start_date'] = pd.to_datetime(
    df_UserJobExperience['start_date'], format='%d-%b-%Y %H:%M:%S')
df_UserJobExperience['end_date'] = pd.to_datetime(
    df_UserJobExperience['end_date'], format='%d-%b-%Y %H:%M:%S')
df_UserJobExperience['Experience in Months'] = (
    df_UserJobExperience.end_date.dt.year -
    df_UserJobExperience.start_date.dt.year) * 12 + (
        df_UserJobExperience.end_date.dt.month -
        df_UserJobExperience.start_date.dt.month)
df_UserJobExperience.head()

# # Merging dataframes

# In[8]:

df1 = pd.merge(df_PostJob,
               df_JobSkills,
               left_on='_id',
               right_on='job_id',
               how='left')
df1.head().transpose()

# In[9]:

df2_jobs = pd.merge(df1, df_Company, on='company_id', how='left')
コード例 #38
0
    result = np.append(result, model.predict(test.iloc[i]))


# In[69]:

len(result)


# In[70]:

submission=DataFrame({'Id':np.arange(1,9001,1),'Solution':result})


# In[71]:

submission.head()


# In[75]:

submission.to_csv('submit1_KMeans.csv',index=False)


##### We have to also convert the solution column to int type from float else a 0 score is obtained
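A one-line conversion consistent with that note (column name as above):

submission['Solution'] = submission['Solution'].astype(int)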

# In[82]:

submission.info()


# In[91]:
コード例 #39
0
ファイル: 0703.py プロジェクト: xiaoshenkejiushu/chapter07
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./ml-1m/movies.dat',
                       sep='::',
                       header=None,
                       names=mnames)
movies.head()


genre_iter = (set(x.split('|')) for x in movies.genres)
print(genre_iter)
genres = sorted(set.union(*genre_iter))
print(genres)


dummies = DataFrame(np.zeros((len(movies),len(genres))),columns = genres)
print(dummies.head())


for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # tag each movie with its genres

print(dummies.head())

movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.head())
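Note: newer pandas can build the same genre indicator matrix in one step with Series.str.get_dummies:

dummies2 = movies.genres.str.get_dummies('|')  # equivalent to the loop above
print(dummies2.head())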


values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

print(pd.cut(values,bins))
コード例 #40
0
import pandas as pd


def df_to_string(df: pd.DataFrame, name: str) -> str:
    """
    Display relevant information about the DF: name, shape, missing-value count, and head
    """
    return f"{name} {df.shape} ({df.isna().sum().sum()} missing values) :\n" + str(
        df.head())
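Example usage (a toy frame with one missing value):

import numpy as np

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, np.nan]})
print(df_to_string(df, 'toy'))  # -> "toy (2, 2) (1 missing values) :" plus the head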
コード例 #41
0
pd.set_option('display.width', None)  # set the display width
pd.set_option('display.max_rows', None)  # show all rows
pd.set_option('display.max_columns', None)  # show all columns


client = pymongo.MongoClient('localhost', 27017)
futures = client.futures2
market = futures.position



# market = DataFrame(list(market.find({'date': {'$gte': '20190601'}})))

# find the date of the most recently stored record
begin = DataFrame(list(market.find({}).sort([('_id', -1)]).limit(1)))
print(begin.head())
begin = begin['date'][0]
print("lastdate: " + begin)







from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from datetime import datetime
import time
# dr=['2001-1-1','2030-1-1']
コード例 #42
0
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {'test': [11, 9],
             'test1': [7., 6],
             'other': ['d', 'c']}, index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]}, index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                              'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                              'other': [np.nan, np.nan, np.nan,
                                        'd', 'c', np.nan]},
                             index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            dfnu.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame(
            {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)
コード例 #43
0
    # placeholder generator
    # replace with your own code
    for k in []:
        yield k


# <codecell>

# use this code to run your code
# I recommend replacing the None in islice with a small number to make sure you're on
# the right track

r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype("int")

places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1)

print "number of places", len(places_df)
print "total pop", places_df.P0010001.sum()
places_df.head()

# <codecell>

# if you've done this correctly, the following asserts should stop complaining

assert places_df.P0010001.sum() == 228457238
# number of places in 2010 Census
assert len(places_df) == 29261
コード例 #44
0
    df1['entropy5'] = df1[['Asian','Black','Hispanic','White','Other']].apply(entropy,axis=1)
    df1['entropy4'] = df1[['Asian','Black','Hispanic','White']].apply(entropy,axis=1)
    return df1
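
# The `entropy` helper applied above is not shown in this excerpt; a minimal
# sketch of a Shannon-entropy implementation it might resemble:
import numpy as np

def entropy(counts):
    """Shannon entropy (natural log) of a row of group counts; zero counts are skipped."""
    p = np.asarray(counts, dtype=float)
    p = p / p.sum()
    p = p[p > 0]
    return -(p * np.log(p)).sum()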

# <codecell>


msa_list = list(islice(msas(P005_vars_with_name), None))
len(msa_list)

# <codecell>

msa_list
dr = DataFrame(msa_list)
dr = convert_P005_to_int(dr)
dr.head()

grouped = dr.groupby('metropolitan statistical area/micropolitan statistical area').sum()
grouped.head()

# <codecell>


df_diversity = diversity(grouped)

# <codecell>


#'p_Asian', 'p_Black', 'p_Hispanic', 'p_Other','p_White'
df_diversity['p_Asian'] = df_diversity['Asian']/df_diversity['Total']
df_diversity['p_Black'] = df_diversity['Black']/df_diversity['Total']
コード例 #45
0
import statsmodels.api as sm
from mtkirc import wd
from pandas import DataFrame, Series, read_csv

# -- Imports
# rna-seq
trans = read_csv('%s/data/kirc_transcriptomics_filtered_voom_de.txt' % wd, sep='\t', index_col=0)

# targets
tf_targets = read_csv('%s/tables/tfs_targets.txt' % wd, sep='\t', index_col=0)


# -- Activities
def calc_activity(c):
    ys = trans.ix[tf_targets.index, c].dropna()

    xs = tf_targets.ix[ys.index]
    xs['const'] = 1

    lm = sm.OLS(ys, xs).fit()
    print lm.summary()

    return lm.tvalues.drop('const').to_dict()

tf_activity = DataFrame({c: calc_activity(c) for c in trans})
print tf_activity.head()

# -- Export
tf_activity.to_csv('%s/tables/tf_activity.txt' % wd, sep='\t')
print '[INFO] Done'
コード例 #46
0
import pandas as pd
from pandas import DataFrame

ReadCsv = pd.read_csv(r'gloss_entryID_synonyms.csv', sep=';', header='infer')
df_glosses = DataFrame(ReadCsv)
df_glosses.head()

new_df_glosses = df_glosses.drop_duplicates()
new_df_glosses.to_csv('gloss_entryID_synonyms.csv', sep=',', index=False)  # write the deduplicated frame, not the original
コード例 #47
0
# <codecell>

windowed_df = DataFrame(vpr_window_results)

# <codecell>

windowed_df.head()

# <codecell>

import dendropy

fixtrees = glob.glob('newdomaintrees/*.nwk')
for f in fixtrees:
    if 'Equal' not in f:
        continue
    with open(f) as handle:
        tree = dendropy.Tree.get_from_stream(handle, 'nexus')  # reuse the handle opened above
        
    tree.deroot()
    rmnodes = [tree.prune_subtree(t, update_splits = True) for t in tree.leaf_nodes() if t.get_node_str().endswith("copy'")]
    #tree.prune_taxa(rmnodes)
コード例 #48
0
            'CO': 'Colo',
            'CT': 'Conn',
            'DC': 'District of Columbia'
        }

        # create a new column which maps the existing column using our names map
        self.df['name'] = self.df['abbrev'].map(names_map)  # abbrevs missing from the map (e.g. 'TX') become NaN

    def inspect_columns(self):
        print(self.df.columns)


if __name__ == "__main__":

    # Run example
    df = DataFrame({'abbrev': ['CA', 'CO', 'CT', 'DC', 'TX']})
    print(df.head())

    # Initialize a new wrangler object, passing our df into the class and storing it as part of
    # the new wrangler object
    wrangler = Wrangler(df)

    # Call the inspect method
    wrangler.inspect_columns()

    # Call the add_state_names method; this adds the new column to the df stored on the wrangler object
    wrangler.add_state_names()

    # Print by calling the method on the df inside the wrangler object
    print(wrangler.df.head())
コード例 #49
0
ファイル: lesson2.py プロジェクト: gokulvanan/Learning
#print random_names[:10]
#print births[:10]

dataset = zip(random_names,births)

df = DataFrame(data=dataset, columns=['Names','Births'])
#print df[:10]

df.to_csv("births1880.txt",index=False,header=False)

df = read_csv(r'./births1880.txt',names=["Names","Births"])
print "df.info over all info of df"
print df.info()
print "df.head - first 5 rows"
print df.head()
import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()

print "df['names'].unique()"
print uqNames

print "df.names.describe()"
print df['Names'].describe()

df = df.groupby("Names")  #group by name
print df
df = df.sum() # applys sum to each groupBy obj
print df
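
# A possible next step (not in the original lesson): after the groupby-sum,
# the most frequent name is the row with the largest Births total.
print df['Births'].idxmax()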
コード例 #50
0
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('emails/ham', 'ham'))

print(data.head())

vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

examples = ['Free Smartphones now!!!', 'Hello customers here you can get new watches for least cost']
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
print(predictions)
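
# A possible validation step (a sketch, not part of the original script):
# hold out a fifth of the messages and score the same pipeline on them.
from sklearn.model_selection import train_test_split

train_msgs, test_msgs, train_y, test_y = train_test_split(
    data['message'].values, data['class'].values, test_size=0.2, random_state=0)
held_out = MultinomialNB().fit(vectorizer.fit_transform(train_msgs), train_y)
print(held_out.score(vectorizer.transform(test_msgs), test_y))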
コード例 #51
0
def do():
    train_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_train_input.csv'
    )
    test_data = pd.read_csv(
        'D:/testFiles/for_excute_folder/activity_blFreight_2017_5_test_input.csv'
    )

    # test_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blFreight_2017_5_train_input.csv')
    # train_data = pd.read_csv('D:/testFiles/for_excute_folder/activity_blFreight_2017_5_test_input.csv')

    # drop_col_names = ['Global-SystemAdmin']

    train_data = train_data.drop(train_data.columns[0], axis=1)
    test_data = test_data.drop(test_data.columns[0], axis=1)

    train_data = train_data[train_data["TIME_USED"] <= 1000]
    test_data = test_data[test_data["TIME_USED"] <= 1000]

    # train_data = train_data[train_data["ASSIGN_COUNT"] <= 1]
    # test_data = test_data[test_data["ASSIGN_COUNT"] <= 1]

    # train_data = train_data.drop(drop_col_names, axis=1)
    # test_data = test_data.drop(drop_col_names, axis=1)

    train_data['TIME_USED'] = train_data['TIME_USED'] / 60
    test_data['TIME_USED'] = test_data['TIME_USED'] / 60

    train_data['TIME_USERD_MEDIAN_S2'] = train_data['TIME_USERD_MEDIAN']**2
    test_data['TIME_USERD_MEDIAN_S2'] = test_data['TIME_USERD_MEDIAN']**2

    #bkgOffice_median_by_task_type

    train_data['TIME_USERD_MEDIAN_S3'] = train_data[
        'TIME_USERD_MEDIAN'] * train_data['bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S3'] = test_data[
        'TIME_USERD_MEDIAN'] * test_data['bkgOffice_median_by_task_type']

    train_data['TIME_USERD_MEDIAN_S4'] = train_data[
        'bkgOffice_mean_by_task_type'] * train_data[
            'bkgOffice_median_by_task_type']
    test_data['TIME_USERD_MEDIAN_S4'] = test_data[
        'bkgOffice_mean_by_task_type'] * test_data[
            'bkgOffice_median_by_task_type']

    # train_data = train_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate', 'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT',
    #      'AWAY_MEAN', 'TIME_USED_BY_REGION' ,'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]
    # test_data = test_data[
    #     ['TIME_USED', 'TIME_USERD_MEDIAN', 'Freight_Type=Semi Auto', 'Freight_Type=No rate', 'TIME_USERD_COUNT', 'TIME_USERD_VAR', 'AWAY_COUNT',
    #      'AWAY_MEAN', 'TIME_USED_BY_REGION' ,'COUNT_BY_REGION', 'TIME_USED_VAR_BY_REGION']]

    print(test_data.head())

    # print(train_data.describe())

    y_train = train_data['TIME_USED'].values.tolist()
    X_train = train_data.drop(['TIME_USED'], axis=1).values.tolist()

    y_test = test_data['TIME_USED'].values.tolist()
    X_test = test_data.drop(['TIME_USED'], axis=1).values.tolist()

    # pick a model

    # regressor = SGDRegressor(l1_ratio=0.1)
    # regressor = Ridge()
    # regressor = Lasso()
    # regressor = SVR()
    # regressor = RandomForestRegressor(n_estimators=400, n_jobs=-1, max_features='sqrt')
    # regressor = AdaBoostRegressor()
    # regressor = GradientBoostingRegressor(n_estimators=400)
    # regressor = BaggingRegressor()
    regressor = XGBRegressor(n_estimators=400,
                             learning_rate=0.02,
                             colsample_bytree=0.1,
                             seed=2017)
    # regressor = LGBMRegressor(n_estimators=400, learning_rate=0.02, seed=2017, colsample_bytree=1)

    # cross-validate on the training set
    # scores = cross_val_score(regressor, X_train, y_train, cv=4, scoring='neg_mean_absolute_error', n_jobs=-1)
    #
    # print('cross-validation R2 values:', scores)
    # print('mean cross-validation R2:', np.mean([scores]))

    # fit the model on the training set
    regressor.fit(X_train, y_train)
    # score the model on the test set; the scoring method is again R2
    print('test-set R2:', regressor.score(X_test, y_test))

    # compare the predictions with the actual values
    y_predict = regressor.predict(X_test)
    df = DataFrame()
    df['predict'] = y_predict
    df['real'] = y_test
    df['diff'] = y_predict - y_test
    df['diff_abs'] = abs(df['diff'])

    df.sort_values(by='diff_abs', ascending=False, inplace=True)

    print(df.head(20))

    print(df['diff_abs'].describe(percentiles=np.arange(0.1, 1, 0.1)))

    print('MAE =  ', mean_absolute_error(y_test, y_predict))
    print('MSE =  ', mean_squared_error(y_test, y_predict))
    print('R2 =  ', r2_score(y_test, y_predict))

    print('feature_importances\n')
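
    # The original file is truncated here; a plausible continuation (hypothetical,
    # not from the source) would pair each feature name with its importance score:
    feature_names = train_data.drop(['TIME_USED'], axis=1).columns
    for name, score in sorted(zip(feature_names, regressor.feature_importances_),
                              key=lambda pair: pair[1], reverse=True):
        print(name, score)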
コード例 #52
0
plt.ylabel('# of houses')

# In[5]:

plt.scatter(boston.data[:, 5], boston.target)
plt.ylabel('Price in 1000s')
plt.xlabel('# of rooms')

# In[6]:

boston_df = DataFrame(boston.data)
boston_df.columns = boston.feature_names

# In[7]:

boston_df.head()

# In[8]:

boston_df['Price'] = boston.target

# In[9]:

boston_df.head()

# In[10]:

sns.lmplot('RM', 'Price', data=boston_df)

# In[11]:
コード例 #53
0
from aggregate_ranking_representation.models import RankingName, RawRankingRecord
qs_name = RankingName.objects.filter(short_name='QS')
qs_name = RankingName.objects.filter(short_name='QS')[0]
the_name = RankingName.objects.filter(short_name='THE')[0]
from aggregate_ranking_representation.models import RawRankingRecord, RankingName
from pandas import Series, DataFrame
import pandas as pd
qs_raw_records = qs_name.rawrankingrecord_set.all()
qs_raw_records
qs_raw_record0 = qs_raw_records[0]
qs_raw_record0
qs_raw_records = qs_name.rawrankingrecord_set.all().values()
qs_raw_records
qs_df = DataFrame(qs_raw_records)
qs_df = DataFrame(list(qs_raw_records))
qs_df.head()
the_name = RankingName.objects.filter(short_name='THE')
the_name = RankingName.objects.filter(short_name='THE')[0]
the_raw_records = the_name.rawrankingrecord_set.all()
the_df = DataFrame(list(the_raw_records.values()))
the_df.head()
the_df[:-1]
the_df.index
the_df.drop(0, axis=0)
the_df.head()
the_df.drop(0)
the_df.head()
the_df = DataFrame(list(the_raw_records.values()))
the_df.head()
the_df[0]
the_df[1]
コード例 #54
0
df = DataFrame(data)
print(df)

print('-------------------------')

bbqurl = "https://www.bbq.co.kr/menu/menuList.asp"
bbq = req.urlopen(bbqurl)
print(bbq)

soup = bs4.BeautifulSoup(bbq, 'lxml')

datas = []
info = soup.select('div.info')  # menu name

for i in info:
    tempPrice = i.select('p.pay')[0].text
    price = ''
    # keep only the digit characters (drops commas and any currency marker)
    for j in tempPrice:
        try:
            int(j)
            price += j  # 1 19 190 ...
            #print(price)
        except:
            pass
    datas += [[i.select('p.name')[0].text, int(price)]]

df2 = DataFrame(datas, columns=['메뉴', '가격'])
print(df2.head())
print('price mean :', df2['가격'].mean())
print('price std :', df2['가격'].std())
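
# The digit-collecting loop above can also be written with a regex; an
# equivalent sketch (hypothetical helper, not in the original crawler):
import re

def parse_price(text):
    # strip everything but digits, e.g. '19,000원' -> 19000
    digits = re.sub(r'[^0-9]', '', text)
    return int(digits) if digits else 0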
コード例 #55
0
# <codecell>

from pandas import DataFrame
import numpy as np
from census import Census
from us import states

import settings

c = Census(settings.CENSUS_KEY)

r = c.sf1.get(('NAME', 'P0010001'), {'for': 'state:*'})
df1 = DataFrame(r)

df1.head()

# <codecell>

len(df1)

# <markdowncell>

# **Q21**: Why does `df` have 52 items? Please explain

# <markdowncell>

# **A21**:
# 
# When queried for "states", the US Census API returns data for the 50 states, the District of Columbia, and Puerto Rico: (50+1+1 = 52 entities).
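
# <codecell>

# A quick sanity check of the answer above (a sketch; uses the df1 built earlier):
assert len(df1) == 52  # 50 states + DC + Puerto Rico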
コード例 #56
0
def determine_classification(dataFrame):
    df = dataFrame
    df['Classification'] = df.apply(lambda row: calculate_classification(row),
                                    axis=1)
    df.head(15)
    return df
コード例 #57
0
ファイル: plotter.py プロジェクト: nkhuyu/corpkit
def plotter(title,
            df,
            kind = 'line',
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Accent',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass   
    
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    
    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
        cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image."""
        import os

        def urlify(s):
            "Turn title into filename"
            import re
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s
        # name as 
        if not ext.startswith('.'):
            ext = '.' + ext
        if type(save) == str:
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        #this 'else' is redundant now that title is obligatory
        else:
            if title:
                filename = urlify(title) + ext
                savename = os.path.join(imagefolder, filename)

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if was_series:
                    perc = dataframe.T[w][0]
                else:
                    the_labs.append(w)
                    continue
                if using_tex:
                    the_labs.append('%s (%.2f\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                if using_tex:
                    the_labs.append('%s (n=%d)' % (w, score))
                else:
                    the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if type(input) == str or type(input) == int:
            input = [input]
        if type(input) == list:
            for i in input:
                if type(i) == str:
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    # check if we're doing subplots
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get('draggable', False)
    kwargs.pop('draggable', None)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            mpl.rc('font', family='sans-serif')
            mpl.rc('font', serif='Helvetica Neue')
            mpl.rc('text', usetex=False)
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
            
    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe, 
                                        kwargs['explode'], 
                                        was_series = was_series, 
                                        num_to_plot = num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    if val < 0.001:
                        if not using_tex:
                            return 'p < 0.001'
                        else:
                            return r'p $<$ 0.001'
                    else:
                        return 'p = %s' % format(val, '.3f')

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ['pie', 'line', 'area']:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == 'Default':
                            colours = 'Paired'
                        kwargs['colormap'] = colours
        #else:

            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
        else:
            if num_to_plot > 0:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours
    
    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5


    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass
    
    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
                #if kwargs['kind'] in ['barh', 'area']:
                    #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title
        
    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7  # ncol must be an integer
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}    

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] = 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)   
    
    def filler(df):
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None
        def __exit__(self, one, two, three):
            return False

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize = figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize = figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1),
                bbox_transform = plt.gcf().transFigure )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)
                
        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) // 2:]
                        labels = labels[-len(labels) // 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """check if str can be can be made into float/int"""
        try:
            float(s) # for int, long and float
        except ValueError:
            try:
                complex(s) # for complex
            except ValueError:
                return False
        return True

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter()) 
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter()) 
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'
    
    def suplabel(axis,label,label_prop=None,
                 labelpad=5,
                 ha='center',va='center'):
        ''' Add super ylabel or xlabel to the figure
        Similar to matplotlib.suptitle
        axis       - string: "x" or "y"
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")
        '''
        fig = plt.gcf()
        xmin = []
        ymin = []
        for ax in fig.axes:
            xmin.append(ax.get_position().xmin)
            ymin.append(ax.get_position().ymin)
        xmin,ymin = min(xmin),min(ymin)
        dpi = fig.dpi
        if axis.lower() == "y":
            rotation=90.
            x = xmin-float(labelpad)/dpi
            y = 0.5
        elif axis.lower() == 'x':
            rotation = 0.
            x = 0.5
            y = ymin - float(labelpad)/dpi
        else:
            raise Exception("Unexpected axis: x or y")
        if label_prop is None: 
            label_prop = dict()
        plt.gcf().text(x,y,label,rotation=rotation,
                   transform=fig.transFigure,
                   ha=ha,va=va,
                   **label_prop)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            #suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)
        
        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)


    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)
    
    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')        

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()


    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
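
# A minimal usage sketch (hypothetical data; not an example from the corpkit docs):
if __name__ == '__main__':
    import pandas as pd
    freq = pd.DataFrame({'risk': [10, 20, 30], 'danger': [5, 15, 25]},
                        index=[1990, 2000, 2010])
    plotter('Example frequencies', freq, kind='line', num_to_plot=2)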
コード例 #58
0
ファイル: test_indexing.py プロジェクト: 701789262a/arbobotti
    def test_dups_fancy_indexing(self):

        # GH 3455

        df = tm.makeCustomDataframe(10, 3)
        df.columns = ["a", "a", "b"]
        result = df[["b", "a"]].columns
        expected = Index(["b", "a", "a"])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        result.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
            index=["A", "A", "B", "C"],
        )
        rows = ["C", "B"]
        expected = DataFrame(
            {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
        )
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ["C", "B", "E"]
        with pytest.raises(KeyError, match="with any missing labels"):
            df.loc[rows]

        # see GH5553, make sure we use the right indexer
        rows = ["F", "G", "H", "C", "B", "E"]
        with pytest.raises(KeyError, match="with any missing labels"):
            df.loc[rows]

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        with pytest.raises(
            KeyError,
            match=re.escape(
                "\"None of [Index(['E'], dtype='object')] are in the [index]\""
            ),
        ):
            dfnu.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with pytest.raises(KeyError, match="with any missing labels"):
            df.loc[[0, 8, 0]]

        df = DataFrame({"A": list("abc")})
        with pytest.raises(KeyError, match="with any missing labels"):
            df.loc[[0, 8, 0]]

        # non unique with non unique selector
        df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
        with pytest.raises(KeyError, match="with any missing labels"):
            df.loc[["A", "A", "E"]]
コード例 #59
0
ファイル: test_indexing.py プロジェクト: cpcloud/pandas
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {'test': [11, 9],
             'test1': [7., 6],
             'other': ['d', 'c']}, index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]}, index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                              'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                              'other': [np.nan, np.nan, np.nan,
                                        'd', 'c', np.nan]},
                             index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame(
            {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(
            np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A'])

        expected = pd.concat(
            [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'],
                                              index=df.index)], axis=1)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b'])

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)
コード例 #60
0
ファイル: K-Means.py プロジェクト: TheDxr/MLcode
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from sklearn.cluster import KMeans
from sklearn.cluster import Birch

# read the input file
datafile = 'MachineLearning\\DataSet\\go_track_trackspoints.csv'
outfile = 'out.csv'
data = pd.read_csv(datafile, usecols=["latitude", "longitude"])
d = DataFrame(data)
d.head()
# ---------------------------------- clustering -------------------------------------------

mod = KMeans(n_clusters=3, n_jobs=4, max_iter=500)  # 3 clusters, 4 parallel jobs, at most 500 iterations
mod.fit_predict(d)  # fit and predict; the cluster labels are also kept on mod.labels_

# count how many points fall in each of the 3 clusters and fetch the cluster centers
r1 = pd.Series(mod.labels_).value_counts()
r2 = pd.DataFrame(mod.cluster_centers_)
r = pd.concat([r2, r1], axis=1)
r.columns = list(d.columns) + ["Clustering"]
print(r)

# tag every row with the cluster it was assigned to
r = pd.concat([d, pd.Series(mod.labels_, index=d.index)], axis=1)
r.columns = list(d.columns) + ["Clustering"]
print(r.head())
r.to_csv(outfile)  # write this line if the result needs to be saved locally
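
# Birch is imported above but never used; swapping it in for KMeans would be a
# one-line change (a sketch, same three-cluster setup):
# mod = Birch(n_clusters=3)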