Example #1
def run(cube_slug=None):
    mc = memcache.Client(['127.0.0.1:11211'], debug=0)
    for cube in MyAdminBucket.get('cube').data:
        try:
            slug = cube['slug']

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube['sql'])
            for c in MyAdminBucket.get('connection').data:
                if c['slug'] == cube['connection']:
                    connection = c['connection']
                    break  # keep c pointing at the matched connection record

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str('{}-columns'.format(slug)))

            MyBucket.new(slug, data='').store()
            MyBucket.new(u'{}-columns'.format(slug), data='').store()
            MyBucket.new(u'{}-connect'.format(slug), data='').store()
            MyBucket.new(u'{}-sql'.format(slug), data='').store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()

            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print '[warning] Empty cube: {}!!'.format(slug)
                continue
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype='records'))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-columns'.format(slug),
                         data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-connect'.format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u'{}-sql'.format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except Exception:
            # ignore failures for this cube and move on to the next one
            pass

    print "## FINISH"
    return True
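
A hedged usage sketch (names from the snippet above; assumes the module's memcache and Riak setup is already configured):

run()            # rebuild every cube found in the admin bucket
run('my-cube')   # rebuild only the cube whose slug is 'my-cube' (an illustrative slug)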
Example #2
    def post(self):
        post = json.loads(self.request.body)

        MyClient = riak.RiakClient(protocol=RIAK_PROTOCOL,
                                   http_port=RIAK_HTTP_PORT,
                                   host=RIAK_HOST)

        MyAdminBucket = MyClient.bucket(ADMIN_BUCKET_NAME)

        connection = None
        for c in MyAdminBucket.get('connection').data:
            if c['slug'] == post.get('connection', None):
                connection = c['connection']

        sql = """SELECT * FROM ({}) AS CUBE LIMIT 10;""".format(
            post.get('sql', None))

        e = create_engine(connection)
        connection = e.connect()
        try:
            resoverall = connection.execute(text(sql))
        except Exception:
            self.write({'sql': '', 'msg': 'Error!'})
            self.finish()
            return

        df = DataFrame(resoverall.fetchall())
        if df.empty:
            self.finish()
            return
        df.columns = resoverall.keys()
        df.head()

        self.write({'sql': df.to_json(orient='records'), 'msg': 'Success!'})
        self.finish()
Example #3
def get_link(url):
	link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
	links = []
	
	# open web content
	f = urllib2.urlopen(url)
	content = f.read()
	
	# versi find html tag : find all url and save to links
	# soup = BeautifulSoup(content, "lxml")
	# for a in soup.find_all('a', href=True):
	# 	if "detik.com" in a['href']:
	# 		if "http:" not in a['href']:
	# 			a['href'] = "http:" + a['href']
	# 		print "Found the URL:", a['href']
	# 		links.append(a['href'])
			
	# versi regex : find all url and save to links			
	for link in link_exr.findall(content):
		if "detik.com" in link[0]:
			link_detik = link[0]
			if "http:" not in link_detik:
				link_detik = "http:" + link_detik
			links.append(link_detik)
	
	# save to DataFrame
	df = DataFrame(links, columns=['detik url'])
	df = df.drop_duplicates()

	print df.head(0)
	# create the MySQL engine and save the DataFrame to the data_detik database
	detik_db = create_engine("mysql://root:root@localhost/data_detik")
	df.to_sql('url_detik', detik_db, if_exists='replace')
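
A hedged usage sketch (assumes the imports this snippet relies on: re, urllib2, pandas.DataFrame, and sqlalchemy's create_engine, plus a reachable data_detik MySQL database; the URL is illustrative):

get_link("http://www.detik.com")   # collects detik.com links and writes them to the url_detik table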
Example #4
    def test_setitem_chained_setfault(self):

        # GH6026
        # setfaults under numpy 1.7.1 (ok on 1.8)
        data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout']
        mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none']

        df = DataFrame({'response': np.array(data)})
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata}))

        recarray = np.rec.fromarrays([data], names=['response'])
        df = DataFrame(recarray)
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata}))

        df = DataFrame({'response': data, 'response1': data})
        mask = df.response == 'timeout'
        df.response[mask] = 'none'
        tm.assert_frame_equal(df, DataFrame({'response': mdata,
                                             'response1': data}))

        # GH 6056
        expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar']))
        df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
        df['A'].iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)

        df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
        df.A.iloc[0] = np.nan
        result = df.head()
        tm.assert_frame_equal(result, expected)
Example #5
class MetrilyxAnalyticsSerie(MetrilyxSerie, BasicAnalyticsSerie):

    def __init__(self, serie, graphType="line", dataCallback=None):
        super(MetrilyxAnalyticsSerie, self).__init__(serie, dataCallback)

        self.graphType = graphType
        self._istruct = None

        if not self.error:
            self._istruct = self.__getInternalStruct()
            self.__applyTransform()
            

    def __getInternalStruct(self):
        out = []
        for d in self._serie['data']:
            out.append((d['uuid'], Series([d['dps'][k] for k in sorted(d['dps'].keys())],
                index=to_datetime([int(ts) for ts in sorted(d['dps'].keys())], unit='s'))))
        return DataFrame(dict(out))

    def __getSerieMetadata(self, serie):
        return dict([(k,v) for k,v in serie.items() if k != 'dps'])

    def data(self, ts_unit='ms'):
        if self.error: return { 'error': self.error }

        out = []
        for s in self._serie['data']:
            md = self.__getSerieMetadata(s)
            logger.error("HERE %s" % (s['uuid']))
            datapoints = self._getDataSerieDps(self._serie['query']['aggregator'],
                                                self._istruct[s['uuid']], ts_unit)

            error = self._dataHasErrors(datapoints)
            if not error:
                md['dps'] = datapoints
                md['uuid'] = SerieUUID(s).uuid
            else:
                #s = {'error': error}
                logger.warning("Error assembling data: %s" % (str(error)))
            out.append(md)

        return out

    def __applyTransform(self):
        if self._serie['yTransform'] != "":
            try:
                self._istruct = eval("%s" %(self._serie['yTransform']))(self._istruct)
                
                if isinstance(self._istruct, Series):
                    logger.error("Need to handle 'Series'")
                    logger.error(self._serie['alias'])
                    self._istruct = DataFrame({self._serie['alias']: self._istruct})
                    print self._istruct.head()

            except Exception,e:
                logger.warn("Could not apply yTransform: %s" %(str(e)))
Example #6
def main():
    loc_char_count = create_data_array()
    # print loc_char_count
    count_sum = loc_char_count.sum(axis=0)
    # print count_sum
    count_sum[count_sum==0] = 1
    # print count_sum

    test = loc_char_count / count_sum
    df = DataFrame(test)
    df.to_csv('./data/char_loc_freq.csv')
    print df.head(4)
    print df.ix[3][2]
Example #7
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i=0
    while not retrieved:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT*15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
        if i > 14:
            print "Giving up"
            return None
        
    frame = DataFrame(posts)
    ##Later, maybe output this frame for further study
    
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes']  = frame['id'].map(fmtLikes)
    postData['Date']   = frame['created_time'].map(fmtDate)
    
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    postData = postData.fillna(value=0)
    return postData
Example #8
import math
import numpy as np
from pandas import DataFrame
from scipy import spatial

def gonzales(data, k):
    #transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    #adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    #choosing a random point as the first center

    #center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    #===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    #===========================================================================
    #loop k times; one extra center is collected and then dropped at the end
    for k_cycle in range(1,k+1):
        # variables to save the next center to be chosen, based on the maximum distance a point has from its closest center
        max_distance = 0
        next_cluster = np.nan
        #loop on all the points to assign them to their closest center 
        for indexp, p in points_list.iterrows():
            #variables used to choose the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0, 1]), p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster               
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp 
            
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        #=======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        #=======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1 ,inplace=True)


    #===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1 , c='r'   )
    # points_list.plot(kind='scatter', x=0, y=1 , c='center' , s= points_list['center'] *2   )
    # plt.show()
    #===========================================================================

    #print(points_list)
    return centers_list.as_matrix(columns=[0 ,1])
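
A hedged usage sketch (assumes the imports added above; the data layout matches what the function expects: an id in column 0 and coordinates in the remaining columns):

data = np.column_stack([np.arange(100), np.random.rand(100, 2)])
centers = gonzales(data, 5)   # a 5 x 2 array of center coordinates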
Example #9
    def setup(self, axis):
        N = 1000
        s = Series(N, index=tm.makeStringIndex(N))
        self.series = [s[i:-i] for i in range(1, 10)] * 50
        self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
        df = DataFrame({'A': range(N)},
                       index=date_range('20130101', periods=N, freq='s'))
        self.empty_left = [DataFrame(), df]
        self.empty_right = [df, DataFrame()]
        self.mixed_ndims = [df, df.head(N // 2)]
Example #10
def select_best(clstruct,
        scorenames=['sensitivity','mmr','aupr','cliqueness_3_20','nonov_iter','n_proteins','n_complexes_3_20'],
        rfunc=operator.add, use_norm=False, dispn=15, score_factors=None,
        use_ranks=True, output_ranks=False, print_ranks=False,
        require_scores=None):
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm: stats = norm_columns(stats)
        if score_factors: stats = rescale_columns(stats, score_factors)
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        for req_name,thresh in require_scores:
            thresh = (np.median(clstruct.stats[req_name]) if thresh is None
                    else thresh)
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)
    def filt_params(s):
        return " ".join([p[:2]+p.split('=')[1] for p in s.split(',')])
    show_columns = (scorenames if require_scores is None else
            scorenames+ut.i0(require_scores))
    d = DataFrame(clstruct.stats[inds[:dispn]][show_columns],
            index=["#%s: %sc %si %s" %
                (i,len(clusts[i]),len(cxstructs[i].cxppis),
                    filt_params(cxstructs[i].params)) for i in inds[:dispn]])
    print d.head(dispn)
    for i in inds[:dispn]: 
        #print (i, ["%0.4f " % s for s in clstruct.stats[i]], len(clusts[i]), 
                #len(cxstructs[i].cxppis), cxstructs[i].params)
        if print_ranks:
            print i, [nstats-s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
Example #11
import numpy as np

def download_data(order_set, save_set):
    for i in range(len(table_name_set)):
        cursor.execute(order_set[i])
        rows = cursor.fetchall()
        #print(len(rows))
        #print(cursor.description)
        rows = np.array(rows).reshape(len(rows), len(rows[0]))
        # use a distinct comprehension variable so it does not shadow the loop index
        df = DataFrame(rows, columns=[col[0] for col in cursor.description])
        #print([col[0] for col in cursor.description])
        df.to_csv(save_set[i], index=None, encoding='GB2312')
        print(df.head())
        cnxn.commit()
Example #12
class TestPerfectData(unittest.TestCase):
    """what happens with nice data"""

    def setUp(self):
        index = date_range('1/1/2015', periods=365)
        self.df = DataFrame(list(range(len(index))), index=index, columns=['value'])
        self.dataset = Dataset(self.df, 60*60*24, cumulative=False)

    def test_validates(self):
        self.assertTrue(self.dataset.validate())

    def test_partial_validates(self):
        """cut the data up and it still works"""
        d = Dataset(self.df.head(100), 60*60*24, cumulative=False)
        self.assertTrue(d.validate())

    def test_short_raises(self):
        """single value datasets raise an error"""
        d = Dataset(self.df.head(1), 60*60*24, cumulative=False)
        self.assertRaises(ShortDatasetError, d.validate)

    def test_interpolate_skipped(self):
        d2 = self.dataset.interpolate()
        self.assertEqual(self.dataset, d2)
Example #13
def make_submission(path, params, threshold_ratio):

    X_train, w_train, y_train = load_training_data()
    indexes_test, X_test = load_test_data()
    y_out = fit_predict(X_train, w_train, y_train, X_test, params)
    y_pred, rank = get_y_pred_rank(y_out, threshold_ratio)

    submission = DataFrame({'EventId': indexes_test, 'RankOrder': rank, 'Class': y_pred},
        columns=['EventId', 'RankOrder', 'Class'])
    submission['Class'] = submission['Class'].apply(lambda x: 's' if x else 'b')

    submission.to_csv(path, index=False)
    print('--------------------- Submission')
    print(submission.head())
    print(path)
    return submission
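
A hedged usage sketch (the path and threshold are illustrative; params is whatever the underlying model's fit_predict expects):

submission = make_submission('submission.csv', params, threshold_ratio=0.15)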
Example #14
    def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
        if (self._args is not None and len(self._args) > 2) or \
           (len(self._args) != 0 and not isinstance(self._args[0], QueryFunction)):
            raise ValueError('Invalid argument to absolute value function')

        # get the data
        data = input if len(self._args) == 0 else self._args[0].process(start_time, end_time, input)

        ret = None

        # go through each column, take the absolute value of each entry, and collect the results
        for col in data.columns:
            abs_col = data[col].abs()  # absolute value of every value in the column
            abs_col.name = 'abs ' + col  # update the name

            if ret is None:
                ret = DataFrame(abs_col)
            else:
                ret = ret.combine_first(DataFrame(abs_col))  # merge it into our return value

        print(ret.head())

        return ret
Example #15
    def process(self, start_time: datetime, end_time: datetime, input:DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
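
A minimal standalone illustration of the column-pairwise naming used above (plain pandas, outside the class API; the column names are illustrative):

from pandas import DataFrame
left = DataFrame({'a': [1.0, 2.0]})
right = DataFrame({'b': [3.0, 4.0]})
t = DataFrame(left['a'] + right['b'])  # element-wise sum of the two columns
t.columns = ['a' + '+' + 'b']          # result column is named 'a+b', as in the loop above
print(t.head())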
Example #16
from aggregate_ranking_representation.models import RankingName, RawRankingRecord
qs_name = RankingName.objects.filter(short_name='QS')
qs_name = RankingName.objects.filter(short_name='QS')[0]
the_name = RankingName.objects.filter(short_name='THE')[0]
from aggregate_ranking_representation.models import RawRankingRecord, RankingName
from pandas import Series, DataFrame
import pandas as pd
qs_raw_records = qs_name.rawrankingrecord_set.all()
qs_raw_records
qs_raw_record0 = qs_raw_records[0]
qs_raw_record0
qs_raw_records = qs_name.rawrankingrecord_set.all().values()
qs_raw_records
qs_df = DataFrame(qs_raw_records)
qs_df = DataFrame(list(qs_raw_records))
qs_df.head()
the_name = RankingName.objects.filter(short_name='THE')
the_name = RankingName.objects.filter(short_name='THE')[0]
the_raw_records = the_name.rawrankingrecord_set.all()
the_df = DataFrame(list(the_raw_records.values()))
the_df.head()
the_df[:-1]
the_df.index
the_df.drop(0, axis=0)
the_df.head()
the_df.drop(0)
the_df.head()
the_df = DataFrame(list(the_raw_records.values()))
the_df.head()
the_df[0]
the_df[1]
Example #17
# In[ ]:

bizframe = DataFrame(bizrecords)
userframe = DataFrame(userrecords)
revframe = DataFrame(revrecords)
checkinframe = DataFrame(checkinrecords)


# In[ ]:

users_df = DataFrame(userrecords, columns=['user_id','yelping_since','review_count', 'average_stars', 'variance_in_rating'])


# In[ ]:

users_df.head(n=10)


# In[ ]:

bizframe_sub = DataFrame(bizrecords, columns=['business_id', 'name', 'categories', 'review_count', 'stars'])


# In[ ]:

bizframe_sub.head(n=10)


# In[ ]:

bsort = bizframe_sub.sort_values(by='review_count', ascending=False)
Example #18
    df1['entropy5'] = df1[['Asian','Black','Hispanic','White','Other']].apply(entropy,axis=1)
    df1['entropy4'] = df1[['Asian','Black','Hispanic','White']].apply(entropy,axis=1)
    return df1

# <codecell>


msa_list = list(islice(msas(P005_vars_with_name), None))
len(msa_list)

# <codecell>

msa_list
dr = DataFrame(msa_list)
dr = convert_P005_to_int(dr)
dr.head()

grouped = dr.groupby('metropolitan statistical area/micropolitan statistical area').sum()
grouped.head()

# <codecell>


df_diversity = diversity(grouped)

# <codecell>


#'p_Asian', 'p_Black', 'p_Hispanic', 'p_Other','p_White'
df_diversity['p_Asian'] = df_diversity['Asian']/df_diversity['Total']
df_diversity['p_Black'] = df_diversity['Black']/df_diversity['Total']
Example #19
#print random_names[:10]
#print births[:10]

dataset = zip(random_names,births)

df = DataFrame(data=dataset, columns=['Names','Births'])
#print df[:10]

df.to_csv("births1880.txt",index=False,header=False)

df = read_csv(r'./births1880.txt',names=["Names","Births"])
print "df.info over all info of df"
print df.info()
print "df.head - first 5 rows"
print df.head()
import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()

print "df['names'].unique()"
print uqNames

print "df.names.describe()"
print df['Names'].describe()

df = df.groupby("Names")  #group by name
print df
df = df.sum() # applys sum to each groupBy obj
print df
Example #20
from pandas import DataFrame
import pandas as pd
import xlrd  # needed when reading xls files
import numpy as np
import sqlite3

# Create a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# Accessing elements of the DataFrame
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# Load the data
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; xls files also work
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)
Example #21
# <codecell>

windowed_df = DataFrame(vpr_window_results)

# <codecell>

windowed_df.head()

# <codecell>

import dendropy

fixtrees = glob.glob('newdomaintrees/*.nwk')
for f in fixtrees:
    if 'Equal' not in f:
        continue
    with open(f) as handle:
        tree = dendropy.Tree.get_from_stream(open(f), 'nexus')
        
    tree.deroot()
    rmnodes = [tree.prune_subtree(t, update_splits = True) for t in tree.leaf_nodes() if t.get_node_str().endswith("copy'")]
    #tree.prune_taxa(rmnodes)
Example #22
X=iris.data


##### Now we will apply kmeans for each value of k from 1 to 10 

# In[195]:

KM=[kmeans(X,k) for k in K]
print type(KM),len(KM)


# In[196]:

KM_df=DataFrame(KM)
print KM_df.head(1)


# In[197]:

print KM_df.tail(1)


# In[198]:

KM_df.shape


# In[199]:

KM_v1=KM_df[0]
Example #23
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {'test': [11, 9],
             'test1': [7., 6],
             'other': ['d', 'c']}, index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]}, index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                              'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                              'other': [np.nan, np.nan, np.nan,
                                        'd', 'c', np.nan]},
                             index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            dfnu.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame(
            {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)
Example #24
    result=np.append(result,model.predict(test.ix[i]))


# In[69]:

len(result)


# In[70]:

submission=DataFrame({'Id':np.arange(1,9001,1),'Solution':result})


# In[71]:

submission.head()


# In[75]:

submission.to_csv('submit1_KMeans.csv',index=False)


##### We also have to convert the Solution column from float to int, otherwise a 0 score is obtained

# In[82]:

submission.info()
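
# In[ ]:

# A hedged sketch of that conversion (assumes the submission frame built above; astype(int) is standard pandas):
submission['Solution'] = submission['Solution'].astype(int)
submission.to_csv('submit1_KMeans.csv', index=False)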


Example #25
# <codecell>

from pandas import DataFrame
import numpy as np
from census import Census
from us import states

import settings

c = Census(settings.CENSUS_KEY)

r = c.sf1.get(('NAME', 'P0010001'), {'for': 'state:*'})
df1 = DataFrame(r)

df1.head()

# <codecell>

len(df1)

# <markdowncell>

# **Q21**: Why does `df1` have 52 items? Please explain

# <markdowncell>

# **A21**:
# 
# When queried for "states", the US Census API returns data for the 50 states, the District of Columbia, and Puerto Rico: (50+1+1 = 52 entities).
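
# <codecell>

# A quick check of that answer (a hedged sketch: assumes df1 as built above, with the 'NAME' column from the API call)
len(df1)  # expect 52
df1[df1['NAME'].isin(['District of Columbia', 'Puerto Rico'])]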
Example #26
    # placeholder generator
    # replace with your own code
    for k in []:
        yield k


# <codecell>

# use this code to run your code
# I recommend replacing the None in islice to a small number to make sure you're on
# the right track

r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype("int")

places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1)

print "number of places", len(places_df)
print "total pop", places_df.P0010001.sum()
places_df.head()

# <codecell>

# if you've done this correctly, the following asserts should stop complaining

assert places_df.P0010001.sum() == 228457238
# number of places in 2010 Census
assert len(places_df) == 29261
Example #27
def plotter(title,
            df,
            kind = 'line',
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Accent',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass   
    
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    
    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
        'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
        cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image."""
        import os

        def urlify(s):
            "Turn title into filename"
            import re
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s
        # build the save name
        if not ext.startswith('.'):
            ext = '.' + ext
        if type(save) == str:
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        #this 'else' is redundant now that title is obligatory
        else:
            if title:
                filename = urlify(title) + ext
                savename = os.path.join(imagefolder, filename)

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if was_series:
                    perc = dataframe.T[w][0]
                else:
                    the_labs.append(w)
                    continue
                if using_tex:
                    the_labs.append('%s (%.2f\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                if using_tex:
                    the_labs.append('%s (n=%d)' % (w, score))
                else:
                    the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if type(input) == str or type(input) == int:
            input = [input]
        if type(input) == list:
            for i in input:
                if type(i) == str:
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    # check if we're doing subplots
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get('draggable', False)
    kwargs.pop('draggable', None)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            mpl.rc('font', family='sans-serif')
            mpl.rc('font', serif='Helvetica Neue')
            mpl.rc('text', usetex=False)
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
            
    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe, 
                                        kwargs['explode'], 
                                        was_series = was_series, 
                                        num_to_plot = num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    if val < 0.001:
                        if not using_tex:
                            return 'p < 0.001'
                        else:
                            return r'p $<$ 0.001'
                    else:
                        return 'p = %s' % format(val, '.3f')

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ['pie', 'line', 'area']:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == 'Default':
                            colours = 'Paired'
                        kwargs['colormap'] = colours
        #else:

            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
    
    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5


    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass
    
    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
                #if kwargs['kind'] in ['barh', 'area']:
                    #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title
        
    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}    

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] = 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)   
    
    def filler(df):
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None
        def __exit__(self, one, two, three):
            return False

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize = figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize = figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1),
                bbox_transform = plt.gcf().transFigure )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)
                
        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) // 2:]
                        labels = labels[-len(labels) // 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0]  # get the first index entry
            try:
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except (TypeError, ValueError):
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """check if a str can be made into a float/int"""
        try:
            float(s) # for int, long and float
        except ValueError:
            try:
                complex(s) # for complex
            except ValueError:
                return False
        return True
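    # e.g. is_number('1984') -> True, is_number('3.5e-2') -> True,
    #      is_number('Group') -> False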

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter()) 
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter()) 
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'
    
    def suplabel(axis,label,label_prop=None,
                 labelpad=5,
                 ha='center',va='center'):
        ''' Add super ylabel or xlabel to the figure
        Similar to matplotlib.suptitle
        axis       - string: "x" or "y"
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")
        '''
        fig = plt.gcf()
        xmin = []
        ymin = []
        for ax in fig.axes:
            xmin.append(ax.get_position().xmin)
            ymin.append(ax.get_position().ymin)
        xmin,ymin = min(xmin),min(ymin)
        dpi = fig.dpi
        if axis.lower() == "y":
            rotation=90.
            x = xmin-float(labelpad)/dpi
            y = 0.5
        elif axis.lower() == 'x':
            rotation = 0.
            x = 0.5
            y = ymin - float(labelpad)/dpi
        else:
            raise Exception("Unexpected axis: x or y")
        if label_prop is None: 
            label_prop = dict()
        plt.gcf().text(x,y,label,rotation=rotation,
                   transform=fig.transFigure,
                   ha=ha,va=va,
                   **label_prop)
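    # e.g. suplabel('y', 'Absolute frequency', labelpad=3) would label the
    # shared y axis across all subplots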

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            #suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)
        


    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
                a.set_title(titletext)
            except IndexError:
                pass
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)
    
    # add totals to bar and pie graphs
    # (the logic below is duplicated for the series and frame cases for now)

    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')        

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()


    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Example #28
0
    def file_load(self, datatype):
        '''
        ads_dict and sales_dict map each store/country to the directory
        holding its advertising and sales data files.

        datatype=True opens the advertising data; False opens the sales data.

        start and end are datetimes and may be None; for now they are only
        used when reading the sales data.

        Reading sales data: use os.listdir to locate the monthly data folder
        (e.g. 2017.03), match files against the requested date range, and
        read the data that falls inside it.

        Returns a DataFrame.
        '''
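        # Expected layout, inferred from the path logic below, e.g. for
        # store 'SX' in 'DE':
        #   ads:   F:/PycharmFile/data/SX/EU/Ads/DE/ads report/2017.03/ADs_SXDE_2017-3-15.txt
        #   sales: F:/PycharmFile/data/SX/EU/business report/DE/.../SXDE-17-3-15.csv (one per day)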

        ads_dict = {
            'SXDE': '/data/SX/EU/Ads/DE/ads report/',
            'SXES': '/data/SX/EU/Ads/ES/ads report/',
            'SXFR': '/data/SX/EU/Ads/FR/ads report/',
            'SXIT': '/data/SX/EU/Ads/IT/ads report/',
            'SXUK': '/data/SX/EU/Ads/UK/ads report/',
            'SXJP': '/data/SX/Japan/Ads/',
            'SXCA': '/data/SX/North America/Ads/CA/ads report/',
            'SXUS': '/data/SX/North America/Ads/USA/ads report/',
            'HYYDE': '/data/HYY/EU/ads/DE/',
            'HYYES': '/data/HYY/EU/ads/ES/',
            'HYYFR': '/data/HYY/EU/ads/FR/',
            'HYYIT': '/data/HYY/EU/ads/IT/',
            'HYYUK': '/data/HYY/EU/ads/UK/',
            'HYYJP': '/data/HYY/Japan/Ads/',
            'HYYUS': '/data/HYY/North America/ads/USA/ads report/',
            'TXHLDE': '/data/TXHL/EU/ads/DE/',
            'TXHLES': '/data/TXHL/EU/ads/ES/',
            'TXHLFR': '/data/TXHL/EU/ads/FR/',
            'TXHLIT': '/data/TXHL/EU/ads/IT/',
            'TXHLUK': '/data/TXHL/EU/ads/UK/',
            'TXHLJP': '/data/TXHL/Japan/ads/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        sales_dict = {
            'SXDE': '/data/SX/EU/business report/DE/',
            'SXES': '/data/SX/EU/business report/ES/',
            'SXFR': '/data/SX/EU/business report/FR/',
            'SXIT': '/data/SX/EU/business report/IT/',
            'SXUK': '/data/SX/EU/business report/UK/',
            'SXJP': '/data/SX/Japan/business report/',
            'SXCA': '/data/SX/North America/business report/CA/',
            'SXUS': '/data/SX/North America/business report/USA/',
            'HYYDE': '/data/HYY/EU/business report/DE/',
            'HYYES': '/data/HYY/EU/business report/ES/',
            'HYYFR': '/data/HYY/EU/business report/FR/',
            'HYYIT': '/data/HYY/EU/business report/IT/',
            'HYYUK': '/data/HYY/EU/business report/UK/',
            'HYYJP': '/data/HYY/Japan/business report/',
            'HYYCA': '/data/HYY/North America/business report/CA/',
            'HYYUS': '/data/HYY/North America/business report/USA/',
            'TXHLDE': '/data/TXHL/EU/business report/DE/',
            'TXHLES': '/data/TXHL/EU/business report/ES/',
            'TXHLFR': '/data/TXHL/EU/business report/FR/',
            'TXHLIT': '/data/TXHL/EU/business report/IT/',
            'TXHLUK': '/data/TXHL/EU/business report/UK/',
            'TXHLJP': '/data/TXHL/Japan/business report/',
            'TXHLCA': '',
            'TXHLUS': '',
        }

        if datatype:
            ad_campaign = DataFrame()
            path = 'F:/PycharmFile' + ads_dict[self.store + self.country]   # path to the advertising data files
            file_fold = self.end.strftime('%Y') + '.' + self.end.strftime('%m')
            # TODO: build the folder and file name directly; open the file if
            # it exists, otherwise fall back to searching for it
            if os.path.isdir(path + file_fold):  # locate the month folder (e.g. 2017.03)
                file_name = "ADs_" + self.store + self.country + "_" + str(self.end.year) + "-" \
                            + str(self.end.month) + "-" + str(self.end.day) + ".txt"
                if self.country == "JP":
                    ad_campaign = pd.read_table(path + file_fold + "/" + file_name, sep='\t', encoding='Shift-JIS')
                else:
                    ad_campaign = pd.read_table(path + file_fold + "/" + file_name, sep='\t', encoding='utf-8')

            return ad_campaign
        else:
            sales_df = DataFrame()
            path = 'F:/PycharmFile' + sales_dict[self.store + self.country]     # path to the sales data files
            delta = (self.end - self.start).days
            for i in range(delta+1):
                date = (self.start + timedelta(days=i))
                file_name = self.store + self.country + '-' + date.strftime('%y') + '-' + str(date.month)\
                + '-' + str(date.day) + '.csv'
                for root, subdirs, files in os.walk(path):
                    for name in files:
                        if name == file_name:
                            print name
                            file_path = root + '/' + name
                            df = pd.read_csv(file_path, encoding='utf8')
                            sales_df = pd.concat([sales_df, df])

            # keep only the report's Chinese columns: (child) ASIN, product name,
            # sessions, session %, page views, page view %, buy box %, units ordered,
            # unit session %, ordered product sales, total order items
            sales_df = sales_df[[u'(子)ASIN', u'商品名称', u'买家访问次数', u'买家访问次数百分比',u'页面浏览次数',
                                 u'页面浏览次数百分比',
                                 u'购买按钮页面浏览率', u'已订购商品数量', u'订单商品数量转化率', u'已订购商品销售额',
                                 u'订单商品种类数']]
            print sales_df.head()

            return sales_df
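
A standalone sketch of the file-naming scheme file_load expects; the store, country, and dates below are hypothetical, but the formats mirror the code above.

from datetime import date, timedelta

store, country = 'SX', 'DE'
end = date(2017, 3, 15)

# ads: one report per month folder named YYYY.MM
month_folder = end.strftime('%Y') + '.' + end.strftime('%m')   # '2017.03'
ads_name = 'ADs_%s%s_%d-%d-%d.txt' % (store, country, end.year, end.month, end.day)
print month_folder + '/' + ads_name   # 2017.03/ADs_SXDE_2017-3-15.txt

# sales: one csv per day in [start, end], located via os.walk
start = date(2017, 3, 13)
for i in range((end - start).days + 1):
    d = start + timedelta(days=i)
    print '%s%s-%s-%d-%d.csv' % (store, country, d.strftime('%y'), d.month, d.day)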
Example #29
0
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {'test': [11, 9],
             'test1': [7., 6],
             'other': ['d', 'c']}, index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]}, index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                              'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                              'other': [np.nan, np.nan, np.nan,
                                        'd', 'c', np.nan]},
                             index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame(
            {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(
            np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A'])

        expected = pd.concat(
            [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'],
                                              index=df.index)], axis=1)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b'])

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)
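
An aside on the FutureWarning asserted throughout this test: on pandas of this era, .loc with a partially missing list indexer warned and reindexed with NaN; on pandas >= 1.0 the same lookup raises KeyError, with reindex as the supported replacement. A minimal sketch:

import pandas as pd

df = pd.DataFrame({'test': [5, 7, 9]}, index=['A', 'B', 'C'])
try:
    print df.loc[['A', 'E']]      # old pandas: FutureWarning, NaN row for 'E'
except KeyError:
    print df.reindex(['A', 'E'])  # pandas >= 1.0 equivalent: NaN row for 'E'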
Example #30
0
import statsmodels.api as sm
from mtkirc import wd
from pandas import DataFrame, Series, read_csv

# -- Imports
# rna-seq
trans = read_csv('%s/data/kirc_transcriptomics_filtered_voom_de.txt' % wd, sep='\t', index_col=0)

# targets
tf_targets = read_csv('%s/tables/tfs_targets.txt' % wd, sep='\t', index_col=0)


# -- Activities
def calc_activity(c):
    ys = trans.ix[tf_targets.index, c].dropna()

    xs = tf_targets.ix[ys.index]
    xs['const'] = 1

    lm = sm.OLS(ys, xs).fit()
    print lm.summary()

    return lm.tvalues.drop('const').to_dict()

tf_activity = DataFrame({c: calc_activity(c) for c in trans})
print tf_activity.head()

# -- Export
tf_activity.to_csv('%s/tables/tf_activity.txt' % wd, sep='\t')
print '[INFO] Done'
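
To see what the t-value recipe in Example #30 measures, here is a synthetic check under stated assumptions: a made-up binary TF-target matrix and expression driven by one TF (none of this is the KIRC data).

import numpy as np
import statsmodels.api as sm
from pandas import Series, DataFrame

np.random.seed(0)
# hypothetical binary TF-target matrix: 50 genes x 3 TFs
targets = DataFrame(np.random.binomial(1, .3, (50, 3)), columns=['TF1', 'TF2', 'TF3'])
# expression driven by TF1's targets plus noise
ys = Series(2. * targets['TF1'] + np.random.randn(50))

xs = targets.copy()
xs['const'] = 1

lm = sm.OLS(ys, xs).fit()
print lm.tvalues.drop('const')   # TF1 should carry the largest t-value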