Example #1
0
    def post(self):
        """Run the posted SQL (limited to 10 rows) against the selected
        connection and return the rows as JSON.

        Expects a JSON request body with keys 'connection' (a slug) and
        'sql' (the query to wrap).
        """
        post = json.loads(self.request.body)

        MyClient = riak.RiakClient(protocol=RIAK_PROTOCOL, http_port=RIAK_HTTP_PORT, host=RIAK_HOST)

        MyAdminBucket = MyClient.bucket(ADMIN_BUCKET_NAME)

        # Resolve the connection string for the requested slug.
        connection = None
        for c in MyAdminBucket.get("connection").data:
            if c["slug"] == post.get("connection", None):
                connection = c["connection"]

        sql = """SELECT * FROM ({}) AS CUBE LIMIT 10;""".format(post.get("sql", None))

        e = create_engine(connection)
        connection = e.connect()
        try:
            resoverall = connection.execute(text(sql))
        except Exception:
            # Report the failure and stop here: the original fell
            # through after finish() and raised NameError on
            # 'resoverall', then wrote to a finished request.
            self.write({"sql": "", "msg": "Error!"})
            self.finish()
            return

        df = DataFrame(resoverall.fetchall())
        if df.empty:
            # Nothing to return; end the request instead of continuing
            # with an empty frame (the original fell through here too).
            self.finish()
            return
        df.columns = resoverall.keys()

        self.write({"sql": df.to_json(orient="records"), "msg": "Success!"})
        self.finish()
Example #2
0
def run(cube_slug=None):
    mc = memcache.Client(["127.0.0.1:11211"], debug=0)
    for cube in MyAdminBucket.get("cube").data:
        try:
            slug = cube["slug"]

            if cube_slug and cube_slug != slug:
                continue

            sql = """SELECT * FROM ({}) AS CUBE;""".format(cube["sql"])
            for c in MyAdminBucket.get("connection").data:
                if c["slug"] == cube["connection"]:
                    connection = c["connection"]

            print "\n# CLEAN MEMCACHE/RIAK: {}".format(slug)
            mc.delete(str(slug))
            mc.delete(str("{}-columns".format(slug)))

            MyBucket.new(slug, data="").store()
            MyBucket.new(u"{}-columns".format(slug), data="").store()
            MyBucket.new(u"{}-connect".format(slug), data="").store()
            MyBucket.new(u"{}-sql".format(slug), data="").store()

            print "# CONNECT IN RELATION DATA BASE: {}".format(slug)
            e = create_engine(connection)
            connection = e.connect()

            resoverall = connection.execute(text(sql))

            print "# LOAD DATA ON DATAWAREHOUSE: {}".format(slug)
            df = DataFrame(resoverall.fetchall())
            if df.empty:
                print "[warnning]Empty cube: {}!!".format(cube)
                return
            df.columns = resoverall.keys()
            df.head()

            pdict = map(fix_render, df.to_dict(outtype="records"))

            print "# SAVE DATA (JSON) ON RIAK: {}".format(slug)
            MyBucket.new(slug, data=pdict).store()

            print "# SAVE COLUMNS ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-columns".format(slug), data=json.dumps([c for c in df.columns])).store()

            print "# SAVE CONNECT ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-connect".format(slug), data=c).store()

            print "# SAVE SQL ON RIAK: {}".format(slug)
            MyBucket.new(u"{}-sql".format(slug), data=sql).store()

            print "# CLEAN MEMORY: {}\n".format(slug)
            del pdict, df
            gc.collect()
        except:
            pass

    print "## FINISH"
    return True
Example #3
0
def select_best(
    clstruct,
    scorenames=["sensitivity", "mmr", "aupr", "cliqueness_3_20", "nonov_iter", "n_proteins", "n_complexes_3_20"],
    rfunc=operator.add,
    use_norm=False,
    dispn=15,
    score_factors=None,
    use_ranks=True,
    output_ranks=False,
    print_ranks=False,
    require_scores=None,
):
    """Rank the clusterings in `clstruct` by combining the selected score
    columns with `rfunc` (ranks by default) and print the top `dispn`.

    Returns the full sorted index list when `output_ranks` is True,
    otherwise a tuple (best clusters, best cxppis, best index).

    NOTE(review): the mutable list default for `scorenames` is shared
    across calls -- safe here only because it is never mutated.
    """
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    # Fall back to every available column when scorenames is empty/None.
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm:
            stats = norm_columns(stats)
        if score_factors:
            stats = rescale_columns(stats, score_factors)
    # Combine the selected columns and sort candidate indices best-first.
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        # Drop candidates failing any (name, threshold) requirement; a
        # None threshold means "above the median of that column".
        for req_name, thresh in require_scores:
            thresh = np.median(clstruct.stats[req_name]) if thresh is None else thresh
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)

    def filt_params(s):
        # Abbreviate "key=value" pairs to "ke<value>" for compact labels.
        return " ".join([p[:2] + p.split("=")[1] for p in s.split(",")])

    show_columns = scorenames if require_scores is None else scorenames + ut.i0(require_scores)
    d = DataFrame(
        clstruct.stats[inds[:dispn]][show_columns],
        index=[
            "#%s: %sc %si %s" % (i, len(clusts[i]), len(cxstructs[i].cxppis), filt_params(cxstructs[i].params))
            for i in inds[:dispn]
        ],
    )
    print d.head(dispn)
    for i in inds[:dispn]:
        # print (i, ["%0.4f " % s for s in clstruct.stats[i]], len(clusts[i]),
        # len(cxstructs[i].cxppis), cxstructs[i].params)
        if print_ranks:
            print i, [nstats - s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
Example #4
0
def gonzales(data, k):
    """Greedy farthest-first (Gonzalez) selection of k centers.

    `data` is a 2-D array whose first column is a point id and whose
    remaining columns are coordinates (columns 0 and 1 are used for
    distances). Returns a (k, 2) array of center coordinates.
    """
    import pandas as pd

    # Points keyed by id, with per-point bookkeeping columns.
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")

    # Deterministically seed with the first point as center 0.
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(["distance", "center"], axis=1))
    centers_list["color"] = "r"
    colors = "bgcmykw"

    # Each cycle adds the point farthest from its closest center; one
    # extra center is added and trimmed off after the loop.
    for k_cycle in range(1, k + 1):
        max_distance = 0
        next_cluster = np.nan
        for point_id, point in points_list.iterrows():
            # Find this point's nearest existing center.
            min_cluster_distance = math.inf
            closest_cluster = None
            for center_id, center in centers_list.iterrows():
                # .as_matrix() was removed from pandas; select the two
                # coordinate columns and convert explicitly instead.
                dis = spatial.distance.euclidean(
                    center[[0, 1]].to_numpy(dtype=float),
                    point[[0, 1]].to_numpy(dtype=float),
                )
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = center_id
            # These write into the iterrows() copy only (as before);
            # the frame's distance/center columns are not updated.
            point["distance"] = min_cluster_distance
            point["center"] = closest_cluster
            # Track the farthest point overall; it becomes the next center.
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = point_id

        # Append the new center (coordinate columns only). DataFrame.append,
        # .ix and .set_value from the original were all removed in modern
        # pandas; concat/.iloc/.at are the supported equivalents.
        new_center = points_list.loc[[next_cluster]].iloc[:, :distance_column_index]
        centers_list = pd.concat([centers_list, new_center])
        # Cycle through the palette instead of crashing when k > 6.
        centers_list.at[next_cluster, "color"] = colors[k_cycle % len(colors)]
    # Drop the surplus center added by the final cycle.
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(["color"], axis=1, inplace=True)

    return centers_list[[0, 1]].to_numpy()
def make_submission(path, params, threshold_ratio):
    """Fit on the training data, predict the test set, and write a
    submission CSV (EventId, RankOrder, Class) to `path`.

    Returns the submission DataFrame.
    """
    X_train, w_train, y_train = load_training_data()
    indexes_test, X_test = load_test_data()

    y_out = fit_predict(X_train, w_train, y_train, X_test, params)
    y_pred, rank = get_y_pred_rank(y_out, threshold_ratio)

    column_order = ["EventId", "RankOrder", "Class"]
    submission = DataFrame(
        {"EventId": indexes_test, "RankOrder": rank, "Class": y_pred},
        columns=column_order,
    )
    # Encode the prediction: truthy -> "s", falsy -> "b".
    submission["Class"] = submission["Class"].map(lambda flag: "s" if flag else "b")

    submission.to_csv(path, index=False)
    print("--------------------- Submission")
    print(submission.head())
    print(path)
    return submission
Example #6
0
class CubeProcess(object):
    """Extracts one cube's data from its relational source and stores the
    rows, columns, connection string and SQL in Riak, tracking the run
    state of the cube document in MongoDB.
    """

    def __init__(self, _cube):
        """Set up Mongo/Riak clients for the cube dict `_cube`."""
        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"], http_port=conf("riak")["http_port"], host=conf("riak")["host"]
        )

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()
        # Drop Mongo's _id so the dict can be written back cleanly.
        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        """Mark the cube as running and fetch its rows from the source DB."""
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        # Strip a trailing ';' before wrapping the query as a subselect.
        _sql = self.cube["sql"]
        if _sql[-1] == ";":
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        # Anything other than 'relational' is treated as the query itself.
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        # Setter used when rows come from somewhere other than load().
        self.data = data

    def _keys(self, keys):
        # Setter used when columns come from somewhere other than load().
        self.keys = keys

    def frame(self):
        """Build the DataFrame and the JSON-ready row dicts from the rows."""
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        # NOTE(review): relies on Python 2 map() returning a list.
        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def clean(self):
        """Blank out this cube's existing Riak keys before a fresh save."""
        log_it("CLEAN DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")

        self.MyBucket.new(self.slug, data="").store()
        self.MyBucket.new(u"{}-columns".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-connect".format(self.slug), data="").store()
        self.MyBucket.new(u"{}-sql".format(self.slug), data="").store()

    def save(self):
        """Persist data/columns/connection/SQL to Riak and mark the cube done."""
        self.clean()

        log_it("SAVE DATA (JSON) ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(self.slug, data=self.pdict, content_type="application/json").store()

        log_it("SAVE COLUMNS ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-columns".format(self.slug), data=json.dumps(self.keys)).store()

        log_it("SAVE CONNECT ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-connect".format(self.slug), data=self.connection).store()

        log_it("SAVE SQL ON RIAK: {}".format(self.slug), "bin-mining")
        self.MyBucket.new(u"{}-sql".format(self.slug), data=self.sql).store()

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
alameda_county_tracts_df.head()

# <codecell>

# Total population (P0010001) across Alameda County tracts.
alameda_county_tracts_df.P0010001.sum()

# <codecell>

# Cafe Milano is in tract 4228
MILANO_TRACT_ID = '422800'
alameda_county_tracts_df[alameda_county_tracts_df.tract==MILANO_TRACT_ID]

# <headingcell level=1>

# Using Generators to yield all the tracts in the country

# <markdowncell>

# http://www.jeffknupp.com/blog/2013/04/07/improve-your-python-yield-and-generators-explained/

# <codecell>

import time
import us

from itertools import islice

def census_tracts(variable=('NAME','P0010001'), sleep_time=1.0):
    """Yield census-tract records for every US state, pausing
    `sleep_time` seconds between states.
    """
    for state in us.states.STATES:
        print state
        for tract in c.sf1.get(variable, 
                    geo={'for':"tract:*", 
                        'in':'state:{state_fips}'.format(state_fips=state.fips)
                        }):
            yield tract
        # don't hit the API more than once a second
        time.sleep(sleep_time)

# limit the number of tracts we crawl for until we're ready to get all of them
tracts_df = DataFrame(list(islice(census_tracts(), 100)))
tracts_df['P0010001'] = tracts_df['P0010001'].astype('int')

# <codecell>

tracts_df.head()
# Counting Time Zones with pandas

# <markdowncell>

# Recall what `records` is

# <codecell>

len(records)

# <codecell>

# list of dict -> DataFrame

frame = DataFrame(records)
frame.head()

# <headingcell level=1>

# movielens dataset

# <markdowncell>

# PDA p. 26 
# 
# http://www.grouplens.org/node/73 --> there's also a 10 million ratings dataset -- would be interesting to try out to test scalability
# of running IPython notebook on laptop
# 

# <codecell>
# Pull the raw ranking records for each ranking system.
qs_name = RankingName.objects.filter(short_name='QS')[0]
the_name = RankingName.objects.filter(short_name='THE')[0]
leiden_name = RankingName.objects.filter(short_name='Leiden')[0]
qs_raw_records = qs_name.rawrankingrecord_set.all().values()
qs_raw_records = list(qs_name.rawrankingrecord_set.all().values())
the_raw_records = list(the_name.rawrankingrecord_set.all().values())
leiden_raw_records = list(leiden_name.rawrankingrecord_set.all().values())
len(qs_raw_records)
len(the_raw_records)
len(leiden_raw_records)
qs_df = DataFrame(qs_raw_records)
the_df = DataFrame(the_raw_records)
leiden_df = DataFrame(leiden_raw_records)
the_df.head()
# Drop the first row of THE, then shift its ranking numbers down by one.
the_df = the_df.drop(0, axis=0)
the_df.head()
f = lambda x: x - 1
the_df['number_in_ranking_table'] = the_df['number_in_ranking_table'].map(f)
the_df.head()
# NOTE(review): this first assignment reads from the_df, not qs_df; it
# is immediately overwritten two lines below -- presumably a slip.
qs_names_df = the_df[['university_name', 'country']]
qs_names_df.head()
qs_names_df = qs_df[['university_name', 'country']]
the_names_df = the_df[['university_name', 'country']]
leiden_names_df = leiden_df[['university_name', 'country']]
leiden_names_df.head()
qs_the_names_df = pd.merge(qs_names_df, the_names_df, on='university_name', how='inner', suffixes=('_qs', '_the'))
qs_the_names_df
qs_the_names_df[:5]
qs_the_names_df_inner = pd.merge(qs_names_df, the_names_df, on='university_name', how='inner', suffixes=('_qs', '_the'))
qs_the_names_df_outer = pd.merge(qs_names_df, the_names_df, on='university_name', how='outer', suffixes=('_qs', '_the'))
qs_the_names_df_outer[:5]
# NOTE(review): the expression below is broken -- 'country_qs' == NaN is
# always False (and & binds tighter than ==), so it cannot select rows.
qs_the_names_df_outer['country_qs' == NaN & 'country_the' == NaN]
    
    
    
    
    

    
    

# <codecell>

# Wrap the windowed VPR results in a DataFrame for inspection.
windowed_df = DataFrame(vpr_window_results)

# <codecell>

windowed_df.head()

# <codecell>
import dendropy

# Deroot the 'Equal' trees and collect the "copy" leaves for pruning.
fixtrees = glob.glob('newdomaintrees/*.nwk')
for f in fixtrees:
    if 'Equal' not in f:
        continue
    # Use the managed handle: the original opened the file a *second*
    # time inside the `with` block (get_from_stream(open(f), ...)) and
    # leaked that handle.
    with open(f) as handle:
        tree = dendropy.Tree.get_from_stream(handle, 'nexus')
        # NOTE(review): files are *.nwk but parsed as 'nexus' -- confirm.

    tree.deroot()
    rmnodes = [tree.prune_subtree(t, update_splits = True) for t in tree.leaf_nodes() if t.get_node_str().endswith("copy'")]
    #tree.prune_taxa(rmnodes)
df.head()

# <codecell>

# First rows of the 'titles1' column (built by the CSW crawl below).
df.ix[:3]["titles1"]

# <codecell>

# This Cell will examine the areas of interest under the Important Bird Areas Identified in the following document,
# ak.audubon.org/sites/default/files/documents/marine_ibas_report_final_sep_2012.pdf (at Section 1.3 Study Areas)

# Establish daterange definitions
def dateRange(start_date="1900-01-01", stop_date="2100-01-01", constraint="overlaps"):
    """Build a (begin, end) pair of FES temporal-extent filters.

    'overlaps' keeps records whose extent intersects the window;
    'within' keeps records whose extent lies inside it.
    """
    if constraint == "overlaps":
        begin_filter = fes.PropertyIsLessThanOrEqualTo(propertyname="apiso:TempExtent_begin", literal=stop_date)
        end_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname="apiso:TempExtent_end", literal=start_date)
    elif constraint == "within":
        begin_filter = fes.PropertyIsGreaterThanOrEqualTo(propertyname="apiso:TempExtent_begin", literal=start_date)
        end_filter = fes.PropertyIsLessThanOrEqualTo(propertyname="apiso:TempExtent_end", literal=stop_date)
    return begin_filter, end_filter


# Establish bounding box filter for Geographic Range of IBAs
bbox = fes.BBox([-130.5, 47.9, 167.6, 74.7])

# <codecell>

# Query the MMI ontology for temperature-related IOOS parameters.
sparql = SPARQLWrapper("http://mmisw.org/sparql")
queryString = """
PREFIX ioos: <http://mmisw.org/ont/ioos/parameter/>
SELECT DISTINCT ?parameter ?definition ?unit ?property ?value 
WHERE {?parameter a ioos:Parameter .
       ?parameter ?property ?value .
       ?parameter ioos:Term ?term . 
       ?parameter ioos:Definition ?definition . 
       ?parameter ioos:Units ?unit .
       FILTER (regex(str(?property), "(exactMatch|closeMatch)", "i") && regex(str(?value), "temperature", "i") )
      } 
ORDER BY ?parameter
"""

sparql.setQuery(queryString)
sparql.setReturnFormat(JSON)
j = sparql.query().convert()

j.keys()

j["head"]["vars"]

# <codecell>

j
# NOTE(review): 'dict' shadows the builtin here; rename if this grows.
dict = j
print j

# <codecell>

# This Cell will access the catalogs for the variables pertinent to the PMEL Models within the set Geographic Range

variables = ["sea_surface_temperature", "sea_water temperature", "fish", "river", "currents", "bathymetry", "wind"]
variables1 = []
records1 = []
titles1 = []
lenrecords1 = []
lentitles1 = []

# Query only the first three endpoints for the first two variables.
for endpoint in endpoints[:3]:
    csw = CatalogueServiceWeb(endpoint, timeout=60)
    for v in variables[:2]:
        try:
            csw.getrecords(keywords=[v], maxrecords=60, esn="full")
            records1.append(csw.results)
        except Exception, ex1:
            # Record a sentinel in place of the results on failure.
            records1.append("Error")
        try:
            for rec in csw.records:
                titles1.append(csw.records[rec].title)
        except Exception, ex1:
            titles1.append("Error")
    lentitles1.append(len(titles1[-1]))
    lenrecords1.append(len(records1[-1]))

# NOTE(review): zip truncates to the shortest list, so titles line up
# only loosely with their endpoints here.
zipvar1 = zip(endpoints, records1, lenrecords1, titles1, lentitles1)
df = DataFrame(data=zipvar1, columns=["endpoints", "records1", "lenrecords1", "titles1", "lentitles1"])
df.head()
df.head()

# <codecell>

import re

def sort_label(label):
    (l1, l2, l3) = re.search("([A-Z,a-z]+)(\d+)([A-Z,a-z]*)\.",label).groups()
    return l1 + " " + "{l2:03d}".format(l2=int(l2)) + l3

# Add a sortable label column derived from each raw label.
df['sort_label'] = df.label.apply(sort_label)

# <codecell>

df[df.label.str.startswith("P5")]

# <codecell>

# let's go right for the variables and generate a dict, DF

from lxml import etree
from itertools import islice
from collections import OrderedDict

SF1_XML_PATH  = "/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/data/sf1.xml"

doc = etree.parse(SF1_XML_PATH)
variables = doc.findall("//variable")

# name -> {'concept': ..., 'text': ...} for every <variable> element.
variables_dict = OrderedDict([(v.attrib['name'], 
                               {'concept':v.attrib['concept'],
                                'text': v.text
                                }) for v in variables])


# <codecell>

variables_dict['P0050001']

# <codecell>

def P005_range(n0, n1):
    """Return the census variable names P005xxxx for i in [n0, n1)."""
    # range() works on both Python 2 and 3; xrange is Python 2 only.
    return tuple('P005' + "{i:04d}".format(i=i) for i in range(n0, n1))

P005_vars = P005_range(1,18)
P005_vars_str = ",".join(P005_vars)

# Pair each P005 variable with its descriptive text.
[(v,variables_dict[v]['text']) for v in P005_vars]

# <codecell>

variables_df = DataFrame(variables_dict)
variables_df.head()
df1.head()

# <codecell>

len(df1)

# <markdowncell>

# **Q21**: Why does `df1` have 52 items? Please explain

# <markdowncell>

# **A21**:
#
# When queried for "states", the US Census API returns data for the 50 states, the District of Columbia, and Puerto Rico: (50+1+1 = 52 entities).

# <markdowncell>

# Consider the two following expressions:

# <codecell>

print df1.P0010001.sum()
print
print df1.P0010001.astype(int).sum()

# <markdowncell>

# **Q22**: Why is `df1.P0010001.sum()` different from `df1.P0010001.astype(int).sum()`?

# <markdowncell>

# **A22**:
# The data type of `df1.P0010001` is a string.  Hence, performing `sum` on it concatenates the string representation of populations into a longer string.  In contrast, once `df1.P0010001` is converted into integers via `df1.P0010001.astype(int)`, a `sum` operation adds up all the populations into a single integer.

# <codecell>

df1.P0010001 = df1.P0010001.astype(int)
df1[["NAME", "P0010001"]].sort("P0010001", ascending=True).head()

# <markdowncell>

# **Q23**: Describe the output of the following:
#
# ```Python
# df1.P0010001 = df1.P0010001.astype(int)
# df1[['NAME','P0010001']].sort('P0010001', ascending=True).head()
# ```

# <markdowncell>

# **A23**:
# A DataFrame (with 5 rows and 2 columns (NAME, P0010001)) listing the 5 least populous states in ascending order by population.

# <codecell>

df1.set_index("NAME", inplace=True)
df1.ix["Nebraska"]

# <markdowncell>

# **Q24**: After running:
#
# ```Python
#     df1.set_index('NAME', inplace=True)
# ```
#
# how would you access the Series for the state of Nebraska?
#
# 1. `df1['Nebraska']`
# 1. `df1[1]`
# 1. `df1.ix['Nebraska']`
# 1. `df1[df1['NAME'] == 'Nebraska']`

# <markdowncell>

# **A24**:
# <pre>
# 3
# </pre>

# <codecell>

len(states.STATES)

# <markdowncell>

# **Q25**. What is `len(states.STATES)`?

# <markdowncell>

# **A25**:
# <pre>
# 51
# </pre>

# <codecell>

len(df1[np.in1d(df1.state, [s.fips for s in states.STATES])])

# <markdowncell>

# **Q26**. What is
#
# ```Python
# len(df1[np.in1d(df1.state, [s.fips for s in states.STATES])])
# ```

# <markdowncell>

# **A26**:
# <pre>
# 51
# </pre>

# <markdowncell>

# In the next question, we will make use of the negation operator `~`.  Take a look at a specific example

# <codecell>

~Series([True, True, False, True])

# <codecell>

list(df1[~np.in1d(df1.state, [s.fips for s in states.STATES])].index)[0]

# <markdowncell>

# **Q27**. What is
#
# ```Python
#     list(df1[~np.in1d(df1.state, [s.fips for s in states.STATES])].index)[0]
# ```

# <markdowncell>

# **A27**:
# <pre>
# Puerto Rico
# </pre>

# <markdowncell>

# Consider `pop1` and `pop2`:

# <codecell>

pop1 = df1["P0010001"].astype("int").sum()
pop2 = df1[np.in1d(df1.state, [s.fips for s in states.STATES])]["P0010001"].astype("int").sum()

pop1 - pop2

# <markdowncell>

# **Q28**. What does `pop1 - pop2` represent?

# <markdowncell>

# **A28**:
# The population of Puerto Rico in the 2010 Census.

# <headingcell level=1>

# Generator and range

# <codecell>

sum(range(1, 101))

# <markdowncell>

# **Q29**. Given that
#
#     range(10)
#
# is
#
#     [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#
# How to get the total of every integer from 1 to 100?
#
# 1. `sum(range(1, 101))`
# 1. `sum(range(100))`
# 1. `sum(range(1, 100))`
# 1. None of the above

# <markdowncell>

# **A29**:
# <pre>
# 1
# </pre>

# <codecell>

# itertools is a great library
# http://docs.python.org/2/library/itertools.html#itertools.count
# itertools.count(start=0, step=1):
# "Make an iterator that returns evenly spaced values starting with step."

from itertools import islice, count

c = count(0, 1)
# Python 2 iterator protocol: .next() (next(c) in Python 3).
print c.next()
print c.next()

# <markdowncell>

# **Q30**. What output is produced from
#
# ```Python
# # itertools is a great library
# # http://docs.python.org/2/library/itertools.html#itertools.count
# # itertools.count(start=0, step=1):
# # "Make an iterator that returns evenly spaced values starting with step."
#
# from itertools import islice, count
# c = count(0, 1)
# print c.next()
# print c.next()
# ```

# <markdowncell>

# **A30**:
# <pre>
# 0
# 1
# </pre>

# <codecell>

(2 * Series(np.arange(101))).sum()

# <markdowncell>

# **Q31**. Recalling that
#
#     1+2+3+...+100 = 5050
#
# what is:
#
# ```Python
# (2*Series(np.arange(101))).sum()
# ```

# <markdowncell>

# **A31**:
# <pre>
# 10100
# </pre>

# <headingcell level=1>

# Census Places

# <markdowncell>

# Consider the follow generator that we used to query for census places.

# <codecell>

import pandas as pd
from pandas import DataFrame

import census
import settings
import us

from itertools import islice

c = census.Census(settings.CENSUS_KEY)


def places(variables="NAME"):
    """Generator over every census place, walking the states one by one."""
    for state in us.states.STATES:
        state_geo = {"for": "place:*", "in": "state:{s_fips}".format(s_fips=state.fips)}
        for record in c.sf1.get(variables, geo=state_geo):
            yield record


# <markdowncell>

# Now we compute a DataFrame for the places: `places_df`

# <codecell>

# Crawl every place (no limit) and build the DataFrame.
r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype("int")

print "number of places", len(places_df)
print "total pop", places_df.P0010001.sum()
places_df.head()
Example #14
0
print df[:10]

# Round-trip the frame through a headerless CSV file.
df.to_csv("births1880.txt", index=False, header=False)

Location = r"births1880.txt"

df = read_csv(Location)

print df

print df.head()

# Without header=None the first data row is consumed as the header.
df = read_csv(Location, header=None)

print df

print df.tail()

df = read_csv(Location, names=["Names", "Births"])

print df.head()

import os

os.remove(Location)

print df["Names"].unique()

for x in df["Names"].unique():
    print x
Example #15
0
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt


# ---------load test data---------#
df = pd.read_table("test_items.txt", header=None, names=["item_id"])
print df.head()
print "the row numbers of test set", len(df)

# --------- generate a helper column to match 'item_id' ----------#

# Index-aligned helper columns, one row per test item.
trytest = DataFrame([i for i in range(len(df))], columns=["item_list"])

print trytest.head()

trytest1 = DataFrame([i for i in range(len(df))], columns=["item_list1"])

trytest2 = DataFrame([i for i in range(len(df))], columns=["item_list2"])


# ===================generate test data===================#
testData0 = pd.merge(df, trytest, right_index=True, left_index=True)  # merge ,only with two columns.

testData1 = pd.merge(trytest1, trytest2, right_index=True, left_index=True)


# Combine all helper columns into the final test frame.
testData = pd.merge(testData0, testData1, right_index=True, left_index=True)
print testData.head()
Example #16
0
len(stations)

# <codecell>

from collections import OrderedDict
# Preview: one station element as an ordered tag -> text mapping.
OrderedDict([(child.tag, child.text) for child in stations[0].iterchildren()])

# <codecell>

from pandas import DataFrame

def station_to_ordereddict(station):
    """Map a station XML element to an OrderedDict of child tag -> text."""
    pairs = ((child.tag, child.text) for child in station.iterchildren())
    return OrderedDict(pairs)

# One row per station, columns taken from the XML child tags.
stations_df = DataFrame([station_to_ordereddict(station) for station in stations])
stations_df.head()

# <codecell>

for s in stations_df.T.to_dict().values():
    print s['name'], float(s['gtfs_latitude']), float(s['gtfs_longitude'])

# <codecell>

# plot the maps using folium --- something to do...
# time perhaps to try out leaflet.js widget that Brian working on
# https://github.com/ellisonbg/leaftletwidget

# http://nbviewer.ipython.org/gist/bburky/7763555/folium-ipython.ipynb

from IPython.display import HTML
plt.plot(avg_c21)


# In[255]:

df_c21 = DataFrame({"c21": c21, "Avg": avg_c21})


# In[256]:

# NOTE(review): DataFrame.sort was removed in later pandas; sort_values
# is the modern equivalent.
df_c21 = df_c21.sort(columns="Avg")


# In[257]:

df_c21.head(2)


# In[258]:

plt.plot(df_c21["Avg"])


# In[259]:

print df_c21["Avg"].max(), df_c21["Avg"].min()


# In[261]:

df_c21.tail(2)
Example #18
0
def transition():
    """Run the IPL projection for the four high/low ('haut'/'bas')
    population scenarios, recording the yearly IPL series and a
    convergence-precision column per scenario, then export everything
    to an Excel file.
    """
    levels = ["haut", "bas"]

    taxes_list = ["tva", "tipp", "cot", "irpp", "impot", "property"]
    payments_list = ["chomage", "retraite", "revsoc", "maladie", "educ"]

    year_length = 250
    year_min = 1996
    year_max = year_min + year_length - 1

    # One row per projection year (60 years from year_min).
    arrays = arange(year_min, year_min + 60)
    record = DataFrame(index=arrays)

    simulation = Simulation()

    for param1 in levels:
        for param2 in levels:

            population_scenario = "projpop0760_FEC" + param1 + "ESP" + param2 + "MIG" + param1
            simulation.load_population(population_filename, population_scenario)

            # Adding missing population data between 1996 and 2007 :
            store_pop = HDFStore(
                os.path.join(SRC_PATH, "countries", country, "sources", "Carole_Bonnet", "pop_1996_2006.h5")
            )
            corrected_pop = store_pop["population"]
            simulation.population = concat([corrected_pop, simulation.population])

            simulation.load_profiles(profiles_filename)

            simulation.year_length = year_length
            # Discount rate, growth rate, population growth rate.
            r = 0.03
            g = 0.01
            n = 0.00
            net_gov_wealth = -3217.7e09
            year_gov_spending = (1094) * 1e09

            # Loading simulation's parameters :
            simulation.set_population_projection(year_length=year_length, method="stable")
            simulation.set_tax_projection(method="per_capita", rate=g)
            simulation.set_growth_rate(g)
            simulation.set_discount_rate(r)
            simulation.set_population_growth_rate(n)
            simulation.set_gov_wealth(net_gov_wealth)
            simulation.set_gov_spendings(year_gov_spending, default=True, compute=True)

            # Two output columns per scenario: the IPL and its precision.
            record[population_scenario] = NaN
            col_name2 = population_scenario + "_precision"
            record[col_name2] = NaN

            simulation.create_cohorts()
            simulation.cohorts.compute_net_transfers(
                name="net_transfers", taxes_list=taxes_list, payments_list=payments_list
            )
            simulation.create_present_values(typ="net_transfers")

            for year in range(year_min, year_min + 60):

                # Truncate the dataframe progressively as years go by.
                try:
                    simulation.aggregate_pv = simulation.aggregate_pv.drop(labels=year - 1, level="year")
                except:
                    # NOTE(review): bare except -- hides anything beyond
                    # the expected missing-label error.
                    print "except path"
                    pass
                simulation.aggregate_pv = AccountingCohorts(simulation.aggregate_pv)

                #                     imbalance = simulation.compute_gen_imbalance(typ='net_transfers')
                ipl = simulation.compute_ipl(typ="net_transfers")

                # Compute the IPL residual to check convergence
                # (looking late into the projection).
                precision_df = simulation.aggregate_pv
                print precision_df.head().to_string()

                year_min_ = array(list(precision_df.index.get_level_values(2))).min()
                year_max_ = array(list(precision_df.index.get_level_values(2))).max() - 1
                #         age_min = array(list(self.index.get_level_values(0))).min()
                age_max_ = array(list(precision_df.index.get_level_values(0))).max()
                print "CALIBRATION CHECK : ", year_min_, year_max_

                # Cumulated transfers of generations already alive at the
                # start of the projection.
                past_gen_dataframe = precision_df.xs(year_min_, level="year")
                past_gen_dataframe = past_gen_dataframe.cumsum()
                past_gen_transfer = past_gen_dataframe.get_value((age_max_, 1), "net_transfers")
                #                     print '    past_gen_transfer = ', past_gen_transfer

                # Cumulated transfers of future generations (age 0 rows).
                future_gen_dataframe = precision_df.xs(0, level="age")
                future_gen_dataframe = future_gen_dataframe.cumsum()
                future_gen_transfer = future_gen_dataframe.get_value((1, year_max_), "net_transfers")
                #                     print '    future_gen_transfer =', future_gen_transfer

                # Note : do not forget to eliminate values counted twice
                last_ipl = (
                    past_gen_transfer
                    + future_gen_transfer
                    + net_gov_wealth
                    - simulation.net_gov_spendings
                    - past_gen_dataframe.get_value((0, 0), "net_transfers")
                )
                last_ipl = -last_ipl

                print last_ipl, ipl
                # Relative gap between the recomputed IPL and the model's.
                precision = (ipl - last_ipl) / ipl
                print "precision = ", precision

                record.loc[year, population_scenario] = ipl
                record.loc[year, col_name2] = precision
            print record.head().to_string()
    xls = (
        "C:/Users/Utilisateur/Documents/GitHub/ga/src/countries/france/sources/Carole_Bonnet/"
        + "ipl_evolution"
        + ".xlsx"
    )
    print record.head(30).to_string()
    record.to_excel(xls, "ipl")
Example #19
0
def plotter(
    title,
    df,
    x_label=None,
    y_label=None,
    style="ggplot",
    figsize=(8, 4),
    save=False,
    legend_pos="best",
    reverse_legend="guess",
    num_to_plot=7,
    tex="try",
    colours="Paired",
    cumulative=False,
    pie_legend=True,
    partial_pie=False,
    show_totals=False,
    transparent=False,
    output_format="png",
    interactive=False,
    black_and_white=False,
    show_p_val=False,
    indices="guess",
    **kwargs
):
    """plot interrogator() or editor() output.

    **kwargs are for pandas first, which can then send them through to matplotlib.plot():

    http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html
    http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot

    pie_legend: False to label slices rather than give legend
    show_totals: where to show percent/abs frequencies: False, 'plot', 'legend', or 'both'

    """

    import corpkit
    import os
    import matplotlib as mpl

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    from matplotlib import rc
    import pandas
    import pandas as pd
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from corpkit.tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    tk = check_t_kinter()

    running_python_tex = check_pytex()
    # incorrect spelling of spider on purpose
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Clip a colourmap to [minval, maxval] so extreme values (e.g. pure white) are dropped."""
        import matplotlib.colors as colors
        import numpy as np

        clipped_name = "trunc({n},{a:.2f},{b:.2f})".format(n=cmap.name, a=minval, b=maxval)
        sampled_colours = cmap(np.linspace(minval, maxval, n))
        return colors.LinearSegmentedColormap.from_list(clipped_name, sampled_colours)

    def get_savename(imagefolder, save=False, title=False, ext="png"):
        """Build the image path ``<imagefolder>/<slug><ext>``.

        If *save* is a string it is slugified and used as the filename;
        otherwise *title* is slugified.  Raises ValueError when neither
        yields a name (the original fell through with ``savename`` unbound
        and crashed with UnboundLocalError).
        """
        import os

        def urlify(s):
            "Turn title into filename"
            import re

            s = s.lower()
            s = re.sub(r"[^\w\s-]", "", s)
            s = re.sub(r"\s+", "-", s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", "-", s)
            return s

        # normalise extension to ".ext"
        if not ext.startswith("."):
            ext = "." + ext
        if isinstance(save, str):
            savename = os.path.join(imagefolder, urlify(save) + ext)
        elif title:
            savename = os.path.join(imagefolder, urlify(title) + ext)
        else:
            raise ValueError("Either save or title must provide a filename.")

        # remove an accidentally doubled extension ("name.png.png")
        if savename.endswith(ext + ext):
            savename = savename.replace(ext + ext, ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True):
        """Append totals to entry labels (abs, rel, keyness).

        Absolute counts become ``name (n=10)``; percentages become
        ``name (x.xx %)`` (percent sign TeX-escaped when *using_tex*).
        For a one-column frame (*was_series*) the frame is rebuilt with the
        renamed labels as its index and a single "Total" column.
        """
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if was_series:
                    perc = dataframe.T[w][0]
                else:
                    # per-column percentages can't be totalled meaningfully;
                    # keep the label unchanged
                    the_labs.append(w)
                    continue
                if using_tex:
                    the_labs.append("%s (%.2f\%%)" % (w, perc))
                else:
                    the_labs.append("%s (%.2f %%)" % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                # the tex/non-tex branches here were byte-identical:
                # "n=" needs no TeX escaping, so one append suffices
                the_labs.append("%s (n=%d)" % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pd.DataFrame(vals, index=the_labs)
            dataframe.columns = ["Total"]
        return dataframe

    def auto_explode(dataframe, input, was_series=False, num_to_plot=7):
        """Translate entry names/positions into a pie-chart explode list.

        *input* may be a single label/position or a list of them; matched
        slices get an offset of 0.1, everything else stays at 0.
        """
        labels = list(dataframe.index) if was_series else list(dataframe.columns)
        exploded = [0] * num_to_plot
        if type(input) == str or type(input) == int:
            input = [input]
        if type(input) == list:
            for entry in input:
                position = labels.index(entry) if type(entry) == str else entry
                exploded[position] = 0.1
        return exploded

    # are we doing subplots?
    sbplt = False
    if "subplots" in kwargs:
        if kwargs["subplots"] is True:
            sbplt = True

    if colours is True:
        colours = "Paired"

    styles = ["dark_background", "bmh", "grayscale", "ggplot", "fivethirtyeight"]
    if style not in styles:
        raise ValueError("Style %s not found. Use %s" % (style, ", ".join(styles)))

    if "savepath" in kwargs.keys():
        mpl.rcParams["savefig.directory"] = kwargs["savepath"]
        del kwargs["savepath"]

    mpl.rcParams["savefig.bbox"] = "tight"

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams["font.family"] = "sans-serif"
    mpl.rcParams["text.latex.unicode"] = True

    if tex == "try" or tex is True:
        try:
            rc("text", usetex=True)
            rc("font", **{"family": "serif", "serif": ["Computer Modern"]})
            using_tex = True
        except:
            # Fall back to sans-serif without TeX.  The original called bare
            # `matplotlib.rc`, but this module only binds `mpl` and `rc`, so
            # the fallback itself raised NameError instead of degrading.
            mpl.rc("font", family="sans-serif")
            mpl.rc("font", serif="Helvetica Neue")
            mpl.rc("text", usetex="false")
            rc("text", usetex=False)
    else:
        rc("text", usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = "none"

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    if "kind" not in kwargs:
        kwargs["kind"] = "line"

    if interactive:
        if kwargs["kind"].startswith("bar"):
            interactive_types = [3]
        elif kwargs["kind"] == "area":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "line":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "pie":
            interactive_types = None
            warnings.warn("Interactive plotting not yet available for pie plots.")
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if "kind" in kwargs:
        if kwargs["kind"] == "pie":
            piemode = True
            # always the best spot for pie
            # if legend_pos == 'best':
            # legend_pos = 'lower left'
            if show_totals.endswith("plot") or show_totals.endswith("both"):
                kwargs["pctdistance"] = 0.6
                if using_tex:
                    kwargs["autopct"] = r"%1.1f\%%"
                else:
                    kwargs["autopct"] = "%1.1f%%"

    # if piemode:
    # if partial_pie:
    # kwargs['startangle'] = 180

    kwargs["subplots"] = sbplt

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series:
        for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
            dataframe = dataframe.drop(name, axis=ax, errors="ignore")
    else:
        dataframe = dataframe.drop("tkintertable-order", errors="ignore")
        dataframe = dataframe.drop("tkintertable-order", axis=1, errors="ignore")

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == "guess":

            def isint(x):
                """Return True when x parses as a float with an integral value."""
                try:
                    a = float(x)
                    b = int(a)
                # The original `except ValueError or OverflowError` evaluates
                # the expression to just ValueError, so OverflowError (e.g.
                # int(float("inf"))) escaped; catch both in a tuple.
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = "all"
            dataframe = dataframe.T
            if y_label is None:
                y_label = "Percentage of all matches"
            if x_label is None:
                x_label = ""

    # set backend?
    output_formats = ["svgz", "ps", "emf", "rgba", "raw", "pdf", "svg", "eps", "png", "pgf"]
    if output_format not in output_formats:
        raise ValueError("%s output format not recognised. Must be: %s" % (output_format, ", ".join(output_formats)))

    # don't know if these are necessary
    if "pdf" in output_format:
        plt.switch_backend(output_format)
    if "pgf" in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == "all":
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if "explode" in kwargs:
        if not piemode:
            del kwargs["explode"]
    if piemode:
        if "explode" in kwargs:
            if not sbplt:
                kwargs["explode"] = auto_explode(
                    dataframe, kwargs["explode"], was_series=was_series, num_to_plot=num_to_plot
                )

    if "legend" in kwargs:
        legend = kwargs["legend"]
    else:
        legend = True

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != "Total":
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == "Total":
            plotting_a_totals_column = True
            if not "legend" in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ["slope", "intercept", "r", "p", "stderr"]
    try:
        dataframe = dataframe.drop(statfields, axis=1)
    except:
        pass
    try:
        dataframe.ix["p"]
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]["p"]
                newname = "%s (p=%s)" % (col, format(pval, ".5f"))
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True)
        else:
            warnings.warn(
                "No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values."
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True)

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if "kind" in kwargs:
                if kwargs["kind"] in ["pie", "line", "area"]:
                    if colours:
                        if not plotting_a_totals_column:
                            if colours == "Default":
                                colours = "Paired"
                            kwargs["colormap"] = colours
            # else:
            if colours:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == "Default":
                colours = "Paired"
            kwargs["colormap"] = colours
        else:
            if num_to_plot > 0:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours
        # else:
        # if len(dataframe.T.columns) < 8:
        # try:
        # del kwargs['colormap']
        # except:
        # pass

    # multicoloured bar charts
    if "kind" in kwargs:
        if colours:
            if kwargs["kind"].startswith("bar"):
                if len(list(dataframe.columns)) == 1:
                    if not black_and_white:
                        import numpy as np

                        the_range = np.linspace(0, 1, num_to_plot)
                        cmap = plt.get_cmap(colours)
                        kwargs["colors"] = [cmap(n) for n in the_range]
                    # make a bar width ... ?
                    # kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if "kind" in kwargs:
        if kwargs["kind"] in ["bar", "barh", "area", "line", "pie"]:
            if was_series:
                legend = False
            if kwargs["kind"] == "pie":
                if pie_legend:
                    legend = True
                else:
                    legend = False
        if kwargs["kind"] in ["barh", "area"]:
            if reverse_legend == "guess":
                rev_leg = True
    if not "rev_leg" in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = "best"

    # cut dataframe if just_totals
    try:
        tst = dataframe["Combined total"]
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # rotate automatically
    if "rot" not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            # if 'kind' in kwargs:
            # if kwargs['kind'] in ['barh', 'area']:
            # xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs["rot"] = 45

    # no title for subplots because ugly,
    if sbplt:
        if "title" in kwargs:
            del kwargs["title"]
    else:
        kwargs["title"] = title

    # no interactive subplots yet:

    if sbplt and interactive:
        import warnings

        interactive = False
        warnings.warn("No interactive subplots yet, sorry.")
        return

    # not using pandas for labels or legend anymore.
    # kwargs['labels'] = None
    # kwargs['legend'] = False

    if legend:
        # kwarg options go in leg_options
        leg_options = {"framealpha": 0.8}
        if "shadow" in kwargs:
            leg_options["shadow"] = True
        if "ncol" in kwargs:
            leg_options["ncol"] = kwargs["ncol"]
            del kwargs["ncol"]
        else:
            if num_to_plot > 6:
                leg_options["ncol"] = num_to_plot / 7

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                "best": 0,
                "upper right": 1,
                "upper left": 2,
                "lower left": 3,
                "lower right": 4,
                "right": 5,
                "center left": 6,
                "center right": 7,
                "lower center": 8,
                "upper center": 9,
                "center": 10,
                "o r": 2,
                "outside right": 2,
                "outside upper right": 2,
                "outside center right": "center left",
                "outside lower right": "lower left",
            }

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        "legend_pos value must be one of:\n%s\n or an int between 0-10." % ", ".join(possible.keys())
                    )
            leg_options["loc"] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ["o r", "outside right", "outside upper right"]:
                leg_options["bbox_to_anchor"] = (1.02, 1)
            if legend_pos == "outside center right":
                leg_options["bbox_to_anchor"] = (1.02, 0.5)
            if legend_pos == "outside lower right":
                # was `leg_options["loc"] == "upper right"` — a no-op
                # comparison; the intended override must be an assignment
                leg_options["loc"] = "upper right"
                leg_options["bbox_to_anchor"] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith("o"):
                leg_options["borderaxespad"] = 1

    if not piemode:
        if show_totals.endswith("both") or show_totals.endswith("legend"):
            dataframe = rename_data_with_total(
                dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
            )
    else:
        if pie_legend:
            if show_totals.endswith("both") or show_totals.endswith("legend"):
                dataframe = rename_data_with_total(
                    dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
                )

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs["y"] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.index)

    areamode = False
    if "kind" in kwargs:
        if kwargs["kind"] == "area":
            areamode = True

    if legend is False:
        kwargs["legend"] = False

    # cumulative grab first col
    if cumulative:
        kwargs["y"] = list(dataframe.columns)[0]

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kwargs["kind"] == "line":
                kwargs["marker"] = ","
        if not piemode:
            kwargs["alpha"] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pd.PeriodIndex([d for d in list(dataframe.index)], freq="A")
                    dataframe = dataframe.set_index(n)

    MARKERSIZE = 4
    COLORMAP = {
        0: {"marker": None, "dash": (None, None)},
        1: {"marker": None, "dash": [5, 5]},
        2: {"marker": "o", "dash": (None, None)},
        3: {"marker": None, "dash": [1, 3]},
        4: {"marker": "s", "dash": [5, 2, 5, 2, 5, 10]},
        5: {"marker": None, "dash": [5, 3, 1, 2, 1, 10]},
        6: {"marker": "o", "dash": (None, None)},
        7: {"marker": None, "dash": [5, 3, 1, 3]},
        8: {"marker": "1", "dash": [1, 3]},
        9: {"marker": "*", "dash": [5, 5]},
        10: {"marker": "2", "dash": [5, 2, 5, 2, 5, 10]},
        11: {"marker": "s", "dash": (None, None)},
    }

    HATCHES = {
        0: {"color": "#dfdfdf", "hatch": "/"},
        1: {"color": "#6f6f6f", "hatch": "\\"},
        2: {"color": "b", "hatch": "|"},
        3: {"color": "#dfdfdf", "hatch": "-"},
        4: {"color": "#6f6f6f", "hatch": "+"},
        5: {"color": "b", "hatch": "x"},
    }

    if black_and_white:
        if kwargs["kind"] == "line":
            kwargs["linewidth"] = 1

        cmap = plt.get_cmap("Greys")
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kwargs["kind"] == "bar":
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs["colormap"] = new_cmap

    # use styles and plot

    with plt.style.context((style)):

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs["stacked"] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
        else:
            if not piemode and not sbplt:
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize=figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(
                    handles,
                    labels,
                    loc=leg_options["loc"],
                    bbox_to_anchor=(0, -0.1, 1, 1),
                    bbox_transform=plt.gcf().transFigure,
                )
                if not tk:
                    plt.show()
                    return
        if "rot" in kwargs:
            if kwargs["rot"] != 0 and kwargs["rot"] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs["rot"], ha="right")

        if transparent:
            plt.gcf().patch.set_facecolor("white")
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            # plt.grid()
            plt.gca().set_axis_bgcolor("w")
            if kwargs["kind"] == "line":
                # white background

                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color("black")
                    # line.set_width(1)
                    line.set_dashes(COLORMAP[c]["dash"])
                    line.set_marker(COLORMAP[c]["marker"])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(COLORMAP.keys()):
                        c = 0

        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    if not rev_leg:
                        lgd = plt.legend(**leg_options)
                    else:
                        handles, labels = plt.gca().get_legend_handles_labels()
                        lgd = plt.legend(handles[::-1], labels[::-1], **leg_options)

            # if black_and_white:
            # lgd.set_facecolor('w')

        # if interactive:
        # if legend:
        # lgd.set_title("")
        # if not sbplt:
        # if 'layout' not in kwargs:
        # plt.tight_layout()

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ["%s (%s: %d)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ["%s (%s: %.2f%%)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                # if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                # else:
                if kwargs["kind"] == "line":
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

            # works:
            # plugins.connect(plt.gcf(), plugins.LineLabelTooltip(l, labels[i]))

        # labels = ["Point {0}".format(i) for i in range(num_to_plot)]
        # tooltip = plugins.LineLabelTooltip(lines)
        # mpld3.plugins.connect(plt.gcf(), mpld3.plugins.PointLabelTooltip(lines))

    if piemode:
        if not sbplt:
            plt.axis("equal")
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = "Year"

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0]  # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = "Year"
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = "Year"
                else:
                    x_label = "Group"
            except:
                x_label = "Group"

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # no offsets for numerical x and y values
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            # check if x axis can be an int
            check_x_axis = list(dataframe.index)[0]
            can_it_be_int = int(check_x_axis)
            # if so, set these things
            from matplotlib.ticker import ScalarFormatter

            plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except:
            pass

    # same for y axis
    try:
        # check if x axis can be an int
        check_y_axis = list(dataframe.columns)[0]
        can_it_be_int = int(check_y_axis)
        # if so, set these things
        from matplotlib.ticker import ScalarFormatter

        plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = "Percentage"
    else:
        y_l = "Absolute frequency"

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        # plt.suptitle(title, fontsize = 16)
        # get all axes
        if "layout" not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles

        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis("equal")

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if "kind" in kwargs:
            if kwargs["kind"].startswith("bar"):
                width = ax.containers[0][0].get_width()

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")

    # if not running_python_tex:
    # plt.gcf().show()

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)
    # if 'layout' not in kwargs:
    # plt.tight_layout()

    if save:
        import os

        if running_python_tex:
            imagefolder = "../images"
        else:
            imagefolder = "images"

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith("o"):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches="tight", format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print "\n" + time + ": " + savename + " created."
        else:
            raise ValueError("Error making %s." % savename)

    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.show()
        return
    if running_spider or tk or sbplt:
        return plt

    if interactive:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Example #20
0
# Regression plot of survival vs. age, split by sex, with ages grouped
# into bins (`generations` is defined in an earlier notebook cell --
# TODO confirm its contents).
sns.lmplot("Age", "Survived", hue="Sex", data=titanic_df, palette="winter", x_bins=generations)


# In[83]:

# Restrict the Survived column to the passengers present in `deck`
# (presumably the rows that have cabin information -- verify against
# the earlier cells that build `deck`).
survived = titanic_df.Survived[deck.index]


# In[86]:

# Attach the numeric survival flag to the per-cabin frame and preview it.
cabin_df["Survived"] = survived
cabin_df.head()


# In[93]:

# Same join, but with the categorical Survivor column (yes/no labels).
survivor = titanic_df.Survivor[deck.index]
cabin_df["Survivor"] = survivor
cabin_df.head()


# In[96]:

# Count of survivors vs. non-survivors for each cabin deck A-G.
sns.factorplot(
    "Survivor",
    hue="Cabin",
    x_order=["yes", "no"],
    hue_order=["A", "B", "C", "D", "E", "F", "G"],
    data=cabin_df,
    kind="count",
)
Example #21
0
def plotter(
    title,
    df,
    kind="line",
    x_label=None,
    y_label=None,
    style="ggplot",
    figsize=(8, 4),
    save=False,
    legend_pos="best",
    reverse_legend="guess",
    num_to_plot=7,
    tex="try",
    colours="Accent",
    cumulative=False,
    pie_legend=True,
    partial_pie=False,
    show_totals=False,
    transparent=False,
    output_format="png",
    interactive=False,
    black_and_white=False,
    show_p_val=False,
    indices=False,
    **kwargs
):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings

        warnings.simplefilter("ignore", ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt

    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np

        # sample only the [minval, maxval] slice of the original map
        sampled = cmap(np.linspace(minval, maxval, n))
        label = "trunc({n},{a:.2f},{b:.2f})".format(n=cmap.name, a=minval, b=maxval)
        return colors.LinearSegmentedColormap.from_list(label, sampled)

    def get_savename(imagefolder, save=False, title=False, ext="png"):
        """Come up with the savename for the image.

        :param imagefolder: directory the image will be written into
        :param save: if str, slugified and used as the filename
        :param title: fallback name source when *save* is not a string
        :param ext: output extension, with or without leading dot
        :returns: full path for the image file
        :raises ValueError: if neither a string *save* nor a *title* is given
        """
        import os

        def urlify(s):
            "Turn title into filename"
            import re

            s = s.lower()
            s = re.sub(r"[^\w\s-]", "", s)
            s = re.sub(r"\s+", "-", s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", "-", s)
            return s

        # normalise the extension so it always carries a dot
        if not ext.startswith("."):
            ext = "." + ext
        if isinstance(save, str):
            savename = os.path.join(imagefolder, urlify(save) + ext)
        elif title:
            savename = os.path.join(imagefolder, urlify(title) + ext)
        else:
            # previously this path fell through to an UnboundLocalError
            raise ValueError("get_savename needs either a string 'save' or a 'title'.")

        # remove duplicated ext
        if savename.endswith(ext + ext):
            savename = savename.replace(ext + ext, ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True):
        """adds totals (abs, rel, keyness) to entry name strings

        :param dataframe: data whose column/index labels get totals appended
        :param was_series: labels live on the index rather than the columns
        :param using_tex: escape the percent sign for TeX output
        :param absolutes: append absolute counts (n=...) instead of percentages
        :returns: the relabelled DataFrame (mutated in place unless was_series)
        """
        where_the_words_are = dataframe.index if was_series else dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # percentages are only shown for series-shaped data
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w][0]
                if using_tex:
                    the_labs.append("%s (%.2f\%%)" % (w, perc))
                else:
                    the_labs.append("%s (%.2f %%)" % (w, perc))
            else:
                score = dataframe.T[w].sum() if was_series else dataframe[w].sum()
                # tex and non-tex labels were identical here; one branch suffices
                the_labs.append("%s (n=%d)" % (w, score))
        if not was_series:
            dataframe.columns = the_labs
            return dataframe
        # rebuild a one-column frame indexed by the new labels
        vals = list(dataframe[list(dataframe.columns)[0]].values)
        dataframe = pandas.DataFrame(vals, index=the_labs)
        dataframe.columns = ["Total"]
        return dataframe

    def auto_explode(dataframe, input, was_series=False, num_to_plot=7):
        """give me a list of strings and i'll output explode option"""
        # labels come from the index for series data, columns otherwise
        labels = list(dataframe.index) if was_series else list(dataframe.columns)
        exploded = [0] * num_to_plot
        # accept a single label/position as well as a list of them
        targets = [input] if type(input) in (str, int) else input
        if type(targets) == list:
            for target in targets:
                pos = labels.index(target) if type(target) == str else target
                exploded[pos] = 0.1
        return exploded

    # check if we're doing subplots
    sbplt = False
    if "subplots" in kwargs:
        if kwargs["subplots"] is True:
            sbplt = True
    kwargs["subplots"] = sbplt

    if colours is True:
        colours = "Paired"

    # todo: get this dynamically instead.
    styles = ["dark_background", "bmh", "grayscale", "ggplot", "fivethirtyeight", "matplotlib", False, "mpl-white"]
    # if style not in styles:
    # raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == "mpl-white":
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = "matplotlib"

    if style is not False and style.startswith("seaborn"):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get("draggable", False)
    kwargs.pop("draggable", None)

    if kwargs.get("savepath"):
        mpl.rcParams["savefig.directory"] = kwargs.get("savepath")
        kwargs.pop("savepath", None)

    mpl.rcParams["savefig.bbox"] = "tight"
    mpl.rcParams.update({"figure.autolayout": True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams["font.family"] = "sans-serif"
    mpl.rcParams["text.latex.unicode"] = True

    if tex == "try" or tex is True:
        try:
            rc("text", usetex=True)
            rc("font", **{"family": "serif", "serif": ["Computer Modern"]})
            using_tex = True
        except:
            matplotlib.rc("font", family="sans-serif")
            matplotlib.rc("font", serif="Helvetica Neue")
            matplotlib.rc("text", usetex="false")
            rc("text", usetex=False)
    else:
        rc("text", usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = "none"

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs["kind"] = kind.lower()

    if interactive:
        if kwargs["kind"].startswith("bar"):
            interactive_types = [3]
        elif kwargs["kind"] == "area":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "line":
            interactive_types = [2, 3]
        elif kwargs["kind"] == "pie":
            interactive_types = None
            warnings.warn("Interactive plotting not yet available for pie plots.")
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == "pie":
        piemode = True
        # always the best spot for pie
        # if legend_pos == 'best':
        # legend_pos = 'lower left'
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            kwargs["pctdistance"] = 0.6
            if using_tex:
                kwargs["autopct"] = r"%1.1f\%%"
            else:
                kwargs["autopct"] = "%1.1f%%"

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == "total" for x in list(dataframe.columns)):
        for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=ax, errors="ignore")
            except:
                pass
    else:
        dataframe = dataframe.drop("tkintertable-order", errors="ignore")
        dataframe = dataframe.drop("tkintertable-order", axis=1, errors="ignore")

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == "guess":

            def isint(x):
                """Return True when *x* parses as a float with an integer value.

                Bug fix: the original ``except ValueError or OverflowError:``
                evaluates to ``except ValueError:`` only, so OverflowError
                (e.g. int(float('inf'))) escaped.  Both are now caught.
                """
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = "all"
            dataframe = dataframe.T
            if y_label is None:
                y_label = "Percentage of all matches"
            if x_label is None:
                x_label = ""

    # set backend?
    output_formats = ["svgz", "ps", "emf", "rgba", "raw", "pdf", "svg", "eps", "png", "pgf"]
    if output_format not in output_formats:
        raise ValueError("%s output format not recognised. Must be: %s" % (output_format, ", ".join(output_formats)))

    # don't know if these are necessary
    if "pdf" in output_format:
        plt.switch_backend(output_format)
    if "pgf" in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == "all":
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get("explode"):
        kwargs["explode"] = auto_explode(dataframe, kwargs["explode"], was_series=was_series, num_to_plot=num_to_plot)
    else:
        kwargs.pop("explode", None)

    legend = kwargs.get("legend", False)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != "Total":
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == "Total":
            plotting_a_totals_column = True
            if not "legend" in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ["slope", "intercept", "r", "p", "stderr"]
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors="ignore")
    except:
        pass
    try:
        dataframe.ix["p"]
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]["p"]

                def p_string_formatter(val):
                    """Render a p value for display in the legend."""
                    if val < 0.001:
                        # below the conventional threshold, show an inequality
                        return r"p $<$ 0.001" if using_tex else "p < 0.001"
                    return "p = %s" % format(val, ".3f")

                pstr = p_string_formatter(pval)
                newname = "%s (%s)" % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors="ignore")
        else:
            warnings.warn(
                "No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values."
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors="ignore")

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ["pie", "line", "area"]:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == "Default":
                            colours = "Paired"
                        kwargs["colormap"] = colours
            # else:

            if colours:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == "Default":
                colours = "Paired"
            kwargs["colormap"] = colours
        else:
            if num_to_plot > 0:
                if colours == "Default":
                    colours = "Paired"
                kwargs["colormap"] = colours

    # multicoloured bar charts
    if colours:
        if kind.startswith("bar"):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np

                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs["colors"] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                # kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ["bar", "barh", "area", "line", "pie"]:
        if was_series:
            legend = False
        if kind == "pie":
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ["barh", "area"]:
        if reverse_legend == "guess":
            rev_leg = True
    if not "rev_leg" in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = "best"

    # cut dataframe if just_totals
    try:
        tst = dataframe["Combined total"]
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # rotate automatically
    if "rot" not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            # if 'kind' in kwargs:
            # if kwargs['kind'] in ['barh', 'area']:
            # xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs["rot"] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs["title"] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings

        interactive = False
        warnings.warn("No interactive subplots yet, sorry.")
        return

    # not using pandas for labels or legend anymore.
    # kwargs['labels'] = None
    # kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get("ncol"):
                kwargs["ncol"] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {"framealpha": 0.8, "shadow": kwargs.get("shadow", False), "ncol": kwargs.pop("ncol", 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                "best": 0,
                "upper right": 1,
                "upper left": 2,
                "lower left": 3,
                "lower right": 4,
                "right": 5,
                "center left": 6,
                "center right": 7,
                "lower center": 8,
                "upper center": 9,
                "center": 10,
                "o r": 2,
                "outside right": 2,
                "outside upper right": 2,
                "outside center right": "center left",
                "outside lower right": "lower left",
            }

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        "legend_pos value must be one of:\n%s\n or an int between 0-10."
                        % ", ".join(list(possible.keys()))
                    )
            leg_options["loc"] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ["o r", "outside right", "outside upper right"]:
                leg_options["bbox_to_anchor"] = (1.02, 1)
            if legend_pos == "outside center right":
                leg_options["bbox_to_anchor"] = (1.02, 0.5)
            if legend_pos == "outside lower right":
                leg_options["loc"] == "upper right"
                leg_options["bbox_to_anchor"] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith("o"):
                leg_options["borderaxespad"] = 1

    if not piemode:
        if show_totals.endswith("both") or show_totals.endswith("legend"):
            dataframe = rename_data_with_total(
                dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
            )
    else:
        if pie_legend:
            if show_totals.endswith("both") or show_totals.endswith("legend"):
                dataframe = rename_data_with_total(
                    dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes
                )

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs["y"] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs["legend"] = False
                if was_series:
                    leg_options["labels"] = list(dataframe.index)
                else:
                    leg_options["labels"] = list(dataframe.index)

    def filler(df):
        """Scale every row of *df* so it sums to 100 (percentages)."""
        row_totals = df.sum(axis=1)
        return df.mul(100.0).div(row_totals, axis=0)

    areamode = False
    if kind == "area":
        areamode = True

    if legend is False:
        kwargs["legend"] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == "line":
                kwargs["marker"] = ","
        if not piemode:
            kwargs["alpha"] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq="A")
                    dataframe = dataframe.set_index(n)

        if kwargs.get("filled"):
            if areamode or kind.startswith("bar"):
                dataframe = filler(dataframe)
            kwargs.pop("filled", None)

    MARKERSIZE = 4
    COLORMAP = {
        0: {"marker": None, "dash": (None, None)},
        1: {"marker": None, "dash": [5, 5]},
        2: {"marker": "o", "dash": (None, None)},
        3: {"marker": None, "dash": [1, 3]},
        4: {"marker": "s", "dash": [5, 2, 5, 2, 5, 10]},
        5: {"marker": None, "dash": [5, 3, 1, 2, 1, 10]},
        6: {"marker": "o", "dash": (None, None)},
        7: {"marker": None, "dash": [5, 3, 1, 3]},
        8: {"marker": "1", "dash": [1, 3]},
        9: {"marker": "*", "dash": [5, 5]},
        10: {"marker": "2", "dash": [5, 2, 5, 2, 5, 10]},
        11: {"marker": "s", "dash": (None, None)},
    }

    HATCHES = {
        0: {"color": "#dfdfdf", "hatch": "/"},
        1: {"color": "#6f6f6f", "hatch": "\\"},
        2: {"color": "b", "hatch": "|"},
        3: {"color": "#dfdfdf", "hatch": "-"},
        4: {"color": "#6f6f6f", "hatch": "+"},
        5: {"color": "b", "hatch": "x"},
    }

    if black_and_white:
        if kind == "line":
            kwargs["linewidth"] = 1

        cmap = plt.get_cmap("Greys")
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == "bar":
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs["colormap"] = new_cmap

    class dummy_context_mgr:
        """No-op context manager used when no mpl style is applied.

        Probably obsolete now that newer matplotlib ships a 'classic' style.
        """

        def __enter__(self):
            # nothing to set up
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            # returning False lets any exception propagate
            return False

    with plt.style.context((style)) if style != "matplotlib" else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs["legend"] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs["stacked"] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize=figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(
                    handles,
                    labels,
                    loc=leg_options["loc"],
                    bbox_to_anchor=(0, -0.1, 1, 1),
                    bbox_transform=plt.gcf().transFigure,
                )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)

        if "rot" in kwargs:
            if kwargs["rot"] != 0 and kwargs["rot"] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs["rot"], ha="right")

        if transparent:
            plt.gcf().patch.set_facecolor("white")
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == "line":
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color("black")
                    # line.set_width(1)
                    line.set_dashes(COLORMAP[c]["dash"])
                    line.set_marker(COLORMAP[c]["marker"])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) / 2 :]
                        labels = labels[-len(labels) / 2 :]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ["%s (%s: %d)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ["%s (%s: %.2f%%)" % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                # if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                # else:
                if kind == "line":
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis("equal")
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = "Year"

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0]  # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = "Year"
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = "Year"
                else:
                    x_label = "Group"
            except:
                x_label = "Group"

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """check if *s* can be made into a float/int or complex number

        Robustness fix: non-string, non-numeric input (None, lists, ...)
        now returns False instead of raising an uncaught TypeError.
        """
        try:
            float(s)  # for int, long and float
        except (ValueError, TypeError):
            try:
                complex(s)  # for complex
            except (ValueError, TypeError):
                return False
        return True

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter

    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = "Percentage"
    else:
        y_l = "Absolute frequency"

    def suplabel(axis, label, label_prop=None, labelpad=5, ha="center", va="center"):
        """ Add super ylabel or xlabel to the figure
        Similar to matplotlib.suptitle
        axis       - string: "x" or "y"
        label      - string
        label_prop - keyword dictionary for Text
        labelpad   - padding from the axis (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")
        """
        fig = plt.gcf()
        # find the lower-left corner of the whole axes grid
        positions = [axes.get_position() for axes in fig.axes]
        xmin = min([pos.xmin for pos in positions])
        ymin = min([pos.ymin for pos in positions])
        # convert the pad from points to figure fraction via the figure dpi
        pad_frac = float(labelpad) / fig.dpi
        if axis.lower() == "y":
            rotation, x, y = 90.0, xmin - pad_frac, 0.5
        elif axis.lower() == "x":
            rotation, x, y = 0.0, 0.5, ymin - pad_frac
        else:
            raise Exception("Unexpected axis: x or y")
        if label_prop is None:
            label_prop = dict()
        plt.gcf().text(x, y, label, rotation=rotation, transform=fig.transFigure, ha=ha, va=va, **label_prop)

    # Apply the y label: per-axes for a single plot, figure-level for subplots.
    if y_label is not False:
        if not sbplt:
            if not piemode:
                # string y_label wins; otherwise fall back to the default y_l
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            # suplabel('y', the_y, labelpad = 1.5)
            # For subplots, place one rotated label on the whole figure instead.
            plt.gcf().text(0.04, 0.5, the_y, va="center", rotation="vertical")
            # plt.subplots_adjust(left=0.5)

        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        # plt.gca().suptitle(title, fontsize = 16)
        # plt.subplots_adjust(top=0.9)
        # Flatten `ax` into a single list of Axes: without an explicit layout
        # it is already flat; with one it is a grid of rows/columns.
        if "layout" not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                # NOTE(review): if this fails on the very first iteration,
                # `titletext` is unbound and the next line raises NameError.
                pass
            a.set_title(titletext)
            # Each subplot carries its own legend; drop it since the title
            # now conveys the same information.
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis("equal")

            # show grid (NOTE(review): `b=` was renamed `visible=` in newer
            # matplotlib — confirm against the pinned version)
            a.grid(b=kwargs.get("grid", False))
            kwargs.pop("grid", None)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith("bar"):
            # width of one bar, used below when positioning annotations
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get("grid", False))
        kwargs.pop("grid", None)

    # Annotate column/row totals above the plotted points when requested.
    # `was_series` distinguishes a single-series plot (iterate the index)
    # from a multi-column frame (iterate the columns).
    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                # NOTE(review): .ix was removed in modern pandas (use .loc/.iloc);
                # kept here for the pandas version this file targets.
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        # Summing percentages is meaningless, so skip the label.
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith("plot") or show_totals.endswith("both"):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        # import warnings
                        # warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate("%.2f" % score, (i, score), ha="center", va="bottom")
                else:
                    plt.annotate(score, (i, score), ha="center", va="bottom")

    # Leave room for axis labels around the plot area.
    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    # tight_layout conflicts with an explicit `layout` kwarg and with subplots.
    if "layout" not in kwargs:
        if not sbplt:
            plt.tight_layout()

    # Optionally write the figure to disk under an images folder.
    if save:
        import os

        # When rendering inside a TeX build, images live one level up.
        if running_python_tex:
            imagefolder = "../images"
        else:
            imagefolder = "images"

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        # A legend placed outside the axes ("o...") must be passed as an extra
        # artist so bbox_inches="tight" does not clip it.
        if legend_pos.startswith("o"):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches="tight", format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        # Confirm the file actually landed on disk; savefig can fail silently.
        if os.path.isfile(savename):
            print("\n" + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    # Let the user reposition the legend with the mouse.
    # NOTE(review): Legend.draggable() was deprecated in favour of
    # set_draggable() in newer matplotlib — confirm against the pinned version.
    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)

    # Display logic: pop a window for plain interactive use, hand the plt
    # module back to Spyder/Tk front-ends, or render via mpld3 in notebooks.
    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=0.8)
        plt.subplots_adjust(left=0.1)
        # mpld3 renders its own legend; drop the matplotlib one if present.
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
submission.head(2)


### Removing some of the non correlated variables

# In[128]:

# NOTE(review): scipy.stats.stats is a deprecated import path; the public
# location is scipy.stats — confirm against the pinned scipy version.
from scipy.stats.stats import pearsonr


# In[167]:

# Pearson correlation (and its p-value) of each of the first 40 feature
# columns against the target. NOTE(review): pearsonr is computed twice per
# column here; one call returns both values.
corrr = np.array([])
p_value = np.array([])
for i in np.arange(0, 40, 1):
    corrr = np.append(corrr, pearsonr(train[i], train_y[0])[0])
    p_value = np.append(p_value, pearsonr(train[i], train_y[0])[1])


# In[168]:

# Python 2 print statement — this snippet predates Python 3.
print len(corrr), len(p_value)


# In[169]:

# One row per feature: its correlation with the target and the p-value.
corr = DataFrame({"corelation": corrr, "p_value": p_value})


# In[170]:

corr.head(2)
def places(variables="NAME"):
    """Placeholder generator for census 'places' records.

    Yields nothing until replaced with a real implementation; iterating an
    empty tuple keeps this a generator function.
    """
    for record in ():
        yield record


# <codecell>

# use this code to run your code
# I recommend replacing the None in islice to a small number to make sure you're on
# the right track

# Materialise every place record (islice with None takes the whole stream).
r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
# P0010001 is total population — census API returns it as a string.
places_df.P0010001 = places_df.P0010001.astype("int")

# FIPS code for a place is the state code concatenated with the place code.
places_df["FIPS"] = places_df.apply(lambda s: s["state"] + s["place"], axis=1)

# Python 2 print statements — this snippet predates Python 3.
print "number of places", len(places_df)
print "total pop", places_df.P0010001.sum()
places_df.head()

# <codecell>

# if you've done this correctly, the following asserts should stop complaining

# Expected totals for the 2010 Census places dataset.
assert places_df.P0010001.sum() == 228457238
# number of places in 2010 Census
assert len(places_df) == 29261

def clean_column(df, col):
    """Return df[col] normalised: hyphens removed, whitespace stripped, lowercased."""
    series = df[col]
    return series.str.replace("-", "").str.strip().str.lower()


# Add a normalised copy of the product family for fuzzy matching.
products["family_cleaned"] = clean_column(products, "family")

# Group listings by the product manufacturer they were matched to.
listingsByPManuf = matchListingManufsToProductManufs(pManufsMapping, pManufKeywords)

print(listingsByPManuf.head(5))

print(products.head(5))


def createJson(prod_val, df):
    json = '{"listings": ['
    row_val = ""
    for indx, row in df.iterrows():
        row_val = (
            row_val
            + ' { "currency": '
            + row.iloc[3]
            + ' , "price": '
            + row.iloc[4]
            + ' , "manufacturer": '
            + row.iloc[1]
            + ' , "title": '
Example #25
0
import xlrd  # needed when reading .xls files
import numpy as np
import sqlite3

# build a DataFrame
smp = {
    "state": ["Ohio", "Ohio", "Ohio", "Nebada", "Nebada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.6, 1.7, 3.5, 4.3],
}
frame = DataFrame(smp)

# accessing DataFrame elements (R equivalents shown inline)
frame.year  # frame$year
frame["year"]  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(smp, index=["one", "two", "three", "four", "five"])  # add a custom index
# NOTE(review): .ix was removed in modern pandas — use .loc for label lookup.
frame2.ix["one"]
frame2.describe()  # summary
print(frame2.describe())

# load data from files
data = pd.read_csv("stock_px.csv")
print(data)
xlsx_file = pd.ExcelFile("stock_px.xlsx")  # requires openpyxl; .xls also works
xlsx_file.sheet_names
data = xlsx_file.parse("stock_px")
print(data)

# reading data from the web -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
Example #26
0
Y.head(1)


### Univariate analysis

# In[14]:

# Histogram of every column of X at once.
X.hist()


##### These histograms depict the distribution of the 4 independent variables

##### We can do this analysis using just 1 variable also

# In[15]:

X[0].hist()


##### we can also get the stats of that variable

# In[16]:

X[0].describe()


# In[22]:

X.boxplot(column=[0, 1, 2, 3])


##### This is what is called a boxplot or a tail and a whisker diagram which gives the kind of distribution of each variable under study

##### The line in the boxplot gives the median of that variable (not the mean). The length of the box gives the spread of the variable. The minimum and maximum values are represented by the whisker ends. The box endings are the lower and upper quartiles.

# In[38]:

# Add a constant fifth column (e.g. as an intercept/placeholder feature).
X[4] = 0
X.head(1)
    # NOTE(review): fragment — the enclosing function's def lies above this chunk.
    # Shannon-style diversity of the racial composition per row: entropy5 uses
    # all five groups, entropy4 drops "Other".
    df1['entropy5'] = df1[['Asian','Black','Hispanic','White','Other']].apply(entropy,axis=1)
    df1['entropy4'] = df1[['Asian','Black','Hispanic','White']].apply(entropy,axis=1)
    return df1

# <codecell>


# Materialise all metropolitan/micropolitan statistical areas.
msa_list = list(islice(msas(P005_vars_with_name), None))
len(msa_list)

# <codecell>

msa_list
dr = DataFrame(msa_list)
# Census counts arrive as strings; convert the P005 columns to ints.
dr = convert_P005_to_int(dr)
dr.head()

# Sum the counts of all tracts belonging to the same MSA.
grouped = dr.groupby('metropolitan statistical area/micropolitan statistical area').sum()
grouped.head()

# <codecell>


df_diversity = diversity(grouped)

# <codecell>


# Population share per racial group:
#'p_Asian', 'p_Black', 'p_Hispanic', 'p_Other','p_White'
df_diversity['p_Asian'] = df_diversity['Asian']/df_diversity['Total']
df_diversity['p_Black'] = df_diversity['Black']/df_diversity['Total']
Example #28
0
# Series from a dict: keys become the index.
myDict={'USA':75,'Canada':20}
dictSeries=Series(myDict)
dictSeries

#Creating a data frame from dictionary
empDict={'id':[1,2,3,4],'name':   ['Mark','Ian','Sam','Rich'],'isManager':[False,True,False,True]}

## Data Structure : Data Frame from a dictionary
# NOTE(review): this overwrites the dict above — empDf ends up with only an
# 'id' column, so the .name / .isManager accesses below fail at runtime.
empDict={'id':[1,2,3,4]}
empDf=DataFrame(empDict)

#Access rows and columns
empDf.name
empDf.name[2]
empDf[empDf.isManager == False]
empDf.head()
empDf.tail()
empDf.iloc[2,]

# Append a new row via append (original comment said "column")
# Append a new employee row. The original was a SyntaxError: `ignore_index`
# was passed to Series() instead of append(), and the append( call was never
# closed. Fixed here. Note DataFrame.append returns a NEW frame; the result
# is intentionally discarded, as in the original demo.
empDf.append(Series([5, False, 'Derek', 2],
                    index=['id', 'isManager', 'name', 'deptId']),
             ignore_index=True)
empDf

# Deleting a column: create a throwaway column, then del it.
empDf['dummy'] = 1
empDf
del empDf['dummy']
empDf
Example #29
0
class CubeProcess(object):
    """Run one OpenMining cube: fetch rows from its relational source,
    frame them with pandas, and persist the result into the warehouse.

    Progress and state are tracked in the `cube` MongoDB collection.
    """

    def __init__(self, _cube):
        # `_cube` is the cube's mongo document (slug, sql, connection, ...).
        log_it("START: {}".format(_cube["slug"]), "bin-mining")

        self.mongo = MongoPlugin(uri=conf("mongodb")["uri"], db=conf("mongodb")["db"], json_mongo=True).get_mongo()

        # Drop mongo's internal id so the document can be re-saved cleanly.
        del _cube["_id"]
        self.cube = _cube
        self.slug = self.cube["slug"]

    def load(self):
        """Mark the cube as running and fetch every row of its query.

        Populates self.sql, self.connection, self.data and self.keys.
        """
        self.cube["run"] = "run"
        self.mongo["cube"].update({"slug": self.slug}, self.cube)

        self.cube["start_process"] = datetime.now()

        # Strip trailing whitespace and any terminal semicolon so the query
        # can be embedded as a subselect. (The original indexed _sql[-1]
        # directly and raised IndexError on an empty SQL string.)
        _sql = self.cube["sql"].rstrip()
        if _sql.endswith(";"):
            _sql = _sql[:-1]
        self.sql = u"""SELECT * FROM ({}) AS CUBE;""".format(_sql)

        self.connection = self.mongo["connection"].find_one({"slug": self.cube["connection"]})["connection"]

        log_it("CONNECT IN RELATION DATA BASE: {}".format(self.slug), "bin-mining")
        e = create_engine(self.connection, **conf("openmining")["sql_conn_params"])
        Session = sessionmaker(bind=e)
        session = Session()

        resoverall = session.execute(text(self.sql))
        self.data = resoverall.fetchall()
        self.keys = resoverall.keys()

    def environment(self, t):
        """Override the SQL with a literal statement unless t is 'relational'."""
        if t not in ["relational"]:
            self.sql = t

    def _data(self, data):
        """Inject pre-fetched rows (used when load() is bypassed)."""
        self.data = data

    def _keys(self, keys):
        """Set result column names, normalising any iterable to a list.

        (In the original the second assignment always ran, so the list
        check was dead code; fixed so lists are kept as-is.)
        """
        if isinstance(keys, list):
            self.keys = keys
        else:
            self.keys = list(keys)

    def frame(self):
        """Build the DataFrame and the JSON-ready record list."""
        log_it("LOAD DATA ON DATAWAREHOUSE: {}".format(self.slug), "bin-mining")
        self.df = DataFrame(self.data)
        if self.df.empty:
            log_it("[warning]Empty cube: {}!!".format(self.cube), "bin-mining")
            return
        self.df.columns = self.keys
        self.df.head()

        # NOTE(review): `outtype` is the legacy pandas spelling of `orient`;
        # kept for compatibility with the pandas version this project pins.
        self.pdict = map(fix_render, self.df.to_dict(outtype="records"))

    def save(self):
        """Write data + columns to the warehouse and flag the cube as done."""
        log_it("SAVE DATA (JSON) ON DATA WAREHOUSE: {}".format(self.slug), "bin-mining")
        data = {"data": self.pdict, "columns": self.keys}
        DW = DataWarehouse()
        DW.save(self.slug, data)

        self.cube["status"] = True
        self.cube["lastupdate"] = datetime.now()
        self.cube["run"] = True
        self.mongo["cube"].update({"slug": self.cube["slug"]}, self.cube)

        log_it("CLEAN MEMORY: {}".format(self.slug), "bin-mining")
        gc.collect()
Example #30
0
# Series basics: values, boolean filtering, null check.
obj = Series([4, 7, -5, 3])
myprint(obj.values)
myprint(obj[obj > 0])
myprint(obj.isnull())

data = {
    "state": ["Ohino", "Ohino", "Ohino", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}

frame = DataFrame(data)
myprint(frame)
myprint(frame["state"])
myprint(frame.dtypes)
myprint(frame.head(1))

myprint(frame.index)

# transpose (swap rows and columns)
myprint(frame.T)

# sort by the "pop" column
myprint(frame.sort_values(by="pop"))


# select a single column
myprint(frame["year"])

# row slicing
myprint(frame[1:3])