Example #1
def crime3(path):
    """crime3

  Data loads lazily. Type data(crime3) into the console.

  A data.frame with 106 rows and 12 variables:

  -  district. district number

  -  year. 72 or 78

  -  crime. crimes per 1000 people

  -  clrprc1. clear-up perc, prior year

  -  clrprc2. clear-up perc, two-years prior

  -  d78. =1 if year = 78

  -  avgclr. (clrprc1 + clrprc2)/2

  -  lcrime. log(crime)

  -  clcrime. change in lcrime

  -  cavgclr. change in avgclr

  -  cclrprc1. change in clrprc1

  -  cclrprc2. change in clrprc2

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `crime3.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 106 rows and 12 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'crime3.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/wooldridge/crime3.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='crime3.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
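
A minimal usage sketch for loaders of this form (all examples below follow the same pattern). The `~/data` path is an arbitrary writable directory, and the function is assumed to be importable together with the module-level `os` import and `maybe_download_and_extract` helper it relies on:

import pandas as pd

x_train, metadata = crime3('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
print(df.shape)                # expected: (106, 12)
print(df['crime'].describe())  # crimes per 1000 people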
Example #2
def channing(path):
    """Channing House Data

  The `channing` data frame has 462 rows and 5 columns.

  Channing House is a retirement centre in Palo Alto, California. These
  data were collected between the opening of the house in 1964 until July
  1, 1975. In that time 97 men and 365 women passed through the centre.
  For each of these, their age on entry and also on leaving or death was
  recorded. A large number of the observations were censored mainly due to
  the resident being alive on July 1, 1975 when the data was collected.
  Over the time of the study 130 women and 46 men died at Channing House.
  Differences between the survival of the sexes, taking age into account,
  was one of the primary concerns of this study.

  This data frame contains the following columns:

  `sex`
      A factor for the sex of each resident (`"Male"` or `"Female"`).

  `entry`
      The residents age (in months) on entry to the centre

  `exit`
      The age (in months) of the resident on death, leaving the centre or
      July 1, 1975 whichever event occurred first.

  `time`
      The length of time (in months) that the resident spent at Channing
      House. (`time=exit-entry`)

  `cens`
      The indicator of right censoring. 1 indicates that the resident died
      at Channing House, 0 indicates that they left the house prior to
      July 1, 1975 or that they were still alive and living in the centre
      at that date.

  The data were obtained from

  Hyde, J. (1980) Testing survival with incomplete observations.
  *Biostatistics Casebook*. R.G. Miller, B. Efron, B.W. Brown and L.E.
  Moses (editors), 31–46. John Wiley.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `channing.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 462 rows and 5 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'channing.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/boot/channing.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='channing.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
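
Sex differences in survival were the study's primary concern; a rough pandas-only sketch of the raw comparison (the loader returns an object array because `sex` is a string column, so numeric columns are cast explicitly):

import pandas as pd

x_train, metadata = channing('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['cens'] = df['cens'].astype(int)
df['time'] = df['time'].astype(float)
# proportion of residents who died at Channing House, by sex
print(df.groupby('sex')['cens'].mean())
# median observed stay in months (censored and uncensored pooled)
print(df.groupby('sex')['time'].median())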
Example #4
def gpa2(path):
    """gpa2

  Data loads lazily. Type data(gpa2) into the console.

  A data.frame with 4137 rows and 12 variables:

  -  sat. combined SAT score

  -  tothrs. total hours through fall semester

  -  colgpa. GPA after fall semester

  -  athlete. =1 if athlete

  -  verbmath. verbal/math SAT score

  -  hsize. size grad. class, 100s

  -  hsrank. rank in grad. class

  -  hsperc. high school percentile, from top

  -  female. =1 if female

  -  white. =1 if white

  -  black. =1 if black

  -  hsizesq. hsize^2

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `gpa2.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 4137 rows and 12 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'gpa2.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/wooldridge/gpa2.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='gpa2.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
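
A quick look at two relationships the variables suggest, as a sketch:

import pandas as pd

x_train, metadata = gpa2('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
# mean fall-semester GPA for athletes vs. non-athletes
print(df.groupby('athlete')['colgpa'].mean())
# correlation of combined SAT score with college GPA
print(df['sat'].corr(df['colgpa']))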
Example #5
def wagner_growth(path):
    """Wagner's Hannover Employment Growth Data

  Wagner (1994) investigates the rate of employment growth (`y`) as a
  function of the percentage of people engaged in production activities
  (`PA`) and in higher services (`HS`), and of the growth of these
  percentages (`GPA`, `GHS`), during three time periods in 21
  geographical regions of the greater Hannover area.

  A data frame with 21 * 3 = 63 observations (one per
  `Region x Period`) on the following 7 variables.

  `Region`
      a `factor` with 21 levels, denoting the corresponding region in
      Hannover (conceptually a “block factor”).

  `PA`
      numeric: percent of people involved in production activities.

  `GPA`
      growth of `PA`.

  `HS`
      a numeric vector

  `GHS`
      a numeric vector

  `y`
      a numeric vector

  `Period`
      a `factor` with levels `1:3`, denoting the time period, 1 =
      1979-1982, 2 = 1983-1988, 3 = 1989-1992.

  Hubert, M. and Rousseeuw, P. J. (1997). Robust regression with both
  continuous and binary regressors, *Journal of Statistical Planning and
  Inference* **57**, 153–163.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `wagner_growth.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 63 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'wagner_growth.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/robustbase/wagnerGrowth.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='wagner_growth.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
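
A sketch of the block structure: average employment growth per time period, pooled over the 21 regions:

import pandas as pd

x_train, metadata = wagner_growth('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['y'] = df['y'].astype(float)
# mean employment growth in each of the three periods
print(df.groupby('Period')['y'].mean())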
Example #6
def msq(path):
    """75 mood items from the Motivational State Questionnaire for 3896 participa
  nts

  Emotions may be described either as discrete emotions or in dimensional
  terms. The Motivational State Questionnaire (MSQ) was developed to study
  emotions in laboratory and field settings. The data can be well
  described by a two-dimensional solution of energy vs. tiredness and
  tension vs. calmness. Additional items include what time of day the
  data were collected and a few personality questionnaire scores.

  A data frame with 3896 observations on the following 92 variables.

  `active`
      a numeric vector

  `afraid`
      a numeric vector

  `alert`
      a numeric vector

  `angry`
      a numeric vector

  `anxious`
      a numeric vector

  `aroused`
      a numeric vector

  `ashamed`
      a numeric vector

  `astonished`
      a numeric vector

  `at.ease`
      a numeric vector

  `at.rest`
      a numeric vector

  `attentive`
      a numeric vector

  `blue`
      a numeric vector

  `bored`
      a numeric vector

  `calm`
      a numeric vector

  `cheerful`
      a numeric vector

  `clutched.up`
      a numeric vector

  `confident`
      a numeric vector

  `content`
      a numeric vector

  `delighted`
      a numeric vector

  `depressed`
      a numeric vector

  `determined`
      a numeric vector

  `distressed`
      a numeric vector

  `drowsy`
      a numeric vector

  `dull`
      a numeric vector

  `elated`
      a numeric vector

  `energetic`
      a numeric vector

  `enthusiastic`
      a numeric vector

  `excited`
      a numeric vector

  `fearful`
      a numeric vector

  `frustrated`
      a numeric vector

  `full.of.pep`
      a numeric vector

  `gloomy`
      a numeric vector

  `grouchy`
      a numeric vector

  `guilty`
      a numeric vector

  `happy`
      a numeric vector

  `hostile`
      a numeric vector

  `idle`
      a numeric vector

  `inactive`
      a numeric vector

  `inspired`
      a numeric vector

  `intense`
      a numeric vector

  `interested`
      a numeric vector

  `irritable`
      a numeric vector

  `jittery`
      a numeric vector

  `lively`
      a numeric vector

  `lonely`
      a numeric vector

  `nervous`
      a numeric vector

  `placid`
      a numeric vector

  `pleased`
      a numeric vector

  `proud`
      a numeric vector

  `quiescent`
      a numeric vector

  `quiet`
      a numeric vector

  `relaxed`
      a numeric vector

  `sad`
      a numeric vector

  `satisfied`
      a numeric vector

  `scared`
      a numeric vector

  `serene`
      a numeric vector

  `sleepy`
      a numeric vector

  `sluggish`
      a numeric vector

  `sociable`
      a numeric vector

  `sorry`
      a numeric vector

  `still`
      a numeric vector

  `strong`
      a numeric vector

  `surprised`
      a numeric vector

  `tense`
      a numeric vector

  `tired`
      a numeric vector

  `tranquil`
      a numeric vector

  `unhappy`
      a numeric vector

  `upset`
      a numeric vector

  `vigorous`
      a numeric vector

  `wakeful`
      a numeric vector

  `warmhearted`
      a numeric vector

  `wide.awake`
      a numeric vector

  `alone`
      a numeric vector

  `kindly`
      a numeric vector

  `scornful`
      a numeric vector

  `EA`
      Thayer's Energetic Arousal Scale

  `TA`
      Thayer's Tense Arousal Scale

  `PA`
      Positive Affect scale

  `NegAff`
      Negative Affect scale

  `Extraversion`
      Extraversion from the Eysenck Personality Inventory

  `Neuroticism`
      Neuroticism from the Eysenck Personality Inventory

  `Lie`
      Lie from the EPI

  `Sociability`
      The sociability subset of the Extraversion Scale

  `Impulsivity`
      The impulsivity subset of the Extraversion Scale

  `MSQ_Time`
      Time of day the data were collected

  `MSQ_Round`
      Rounded time of day

  `TOD`
      a numeric vector

  `TOD24`
      a numeric vector

  `ID`
      subject ID

  `condition`
      The experimental condition administered after the MSQ was given

  `scale`
      a factor with levels `msq` and `r`: original or revised MSQ

  `exper`
      The study in which the data were collected: a factor with levels `AGES`
      `BING` `BORN` `CART` `CITY` `COPE` `EMIT` `FAST`
      `Fern` `FILM` `FLAT` `Gray` `imps` `item` `knob`
      `MAPS` `mite` `pat-1` `pat-2` `PATS` `post` `RAFT`
      `Rim.1` `Rim.2` `rob-1` `rob-2` `ROG1` `ROG2` `SALT`
      `sam-1` `sam-2` `SAVE/PATS` `sett` `swam` `swam-2`
      `TIME` `VALE-1` `VALE-2` `VIEW`

  Data collected at the Personality, Motivation, and Cognition Laboratory,
  Northwestern University.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `msq.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 3896 rows and 92 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'msq.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/psych/msq.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='msq.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
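
The two-dimensional structure mentioned above can be glanced at through the four summary scales; a sketch (the items contain many missing values, which `DataFrame.corr` drops pairwise):

import pandas as pd

x_train, metadata = msq('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
scales = df[['EA', 'TA', 'PA', 'NegAff']].astype(float)
# pairwise correlations of the energetic/tense arousal and affect scales
print(scales.corr())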
Example #7
def snow_gr(path):
    """Snowfall data for Grand Rapids, MI

  Official snowfall data by month and season for Grand Rapids, MI, going
  back to 1893.

  A data frame with 119 observations of the following variables.

  -  `SeasonStart` Year in which season started (July is start of
     season)

  -  `SeasonEnd` Year in which season ended (June is end of season)

  -  `Jul` Inches of snow in July

  -  `Aug` Inches of snow in August

  -  `Sep` Inches of snow in September

  -  `Oct` Inches of snow in October

  -  `Nov` Inches of snow in November

  -  `Dec` Inches of snow in December

  -  `Jan` Inches of snow in January

  -  `Feb` Inches of snow in February

  -  `Mar` Inches of snow in March

  -  `Apr` Inches of snow in April

  -  `May` Inches of snow in May

  -  `Jun` Inches of snow in June

  -  `Total` Inches of snow for entire season (July-June)

  These data were compiled by Laura Kapitula from data available at
  http://www.crh.noaa.gov/grr/climate/data/grr/snowfall/.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `snow_gr.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 119 rows and 15 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'snow_gr.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/mosaicData/SnowGR.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='snow_gr.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
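
A consistency check one might run: the seasonal `Total` should equal the sum of the twelve monthly columns (non-numeric entries, if any, are coerced to NaN):

import pandas as pd

x_train, metadata = snow_gr('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
months = ['Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
          'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
num = df[months + ['Total']].apply(pd.to_numeric, errors='coerce')
# largest absolute gap between the stated Total and the monthly sum
print((num[months].sum(axis=1) - num['Total']).abs().max())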
Example #8
def cps_85(path):
    """Data from the 1985 Current Population Survey (CPS85)

  The Current Population Survey (CPS) is used to supplement census
  information between census years. These data consist of a random sample
  of persons from the CPS85, with information on wages and other
  characteristics of the workers, including sex, number of years of
  education, years of work experience, occupational status, region of
  residence and union membership.

  A data frame with 534 observations on the following variables.

  -  `wage` wage (US dollars per hour)

  -  `educ` number of years of education

  -  `race` a factor with levels `NW` (nonwhite) or `W` (white)

  -  `sex` a factor with levels `F` `M`

  -  `hispanic` a factor with levels `Hisp` `NH`

  -  `south` a factor with levels `NS` `S`

  -  `married` a factor with levels `Married` `Single`

  -  `exper` number of years of work experience (inferred from `age`
     and `educ`)

  -  `union` a factor with levels `Not` `Union`

  -  `age` age in years

  -  `sector` a factor with levels `clerical` `const` `manag`
     `manuf` `other` `prof` `sales` `service`

  Data are from http://lib.stat.cmu.edu/DASL.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `cps_85.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 534 rows and 11 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'cps_85.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/mosaicData/CPS85.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='cps_85.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
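
A sketch of a typical wage comparison on these variables:

import pandas as pd

x_train, metadata = cps_85('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['wage'] = df['wage'].astype(float)
# mean hourly wage by union membership and sex
print(df.groupby(['union', 'sex'])['wage'].mean().unstack())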
Example #9
def mlb1(path):
    """mlb1

  Data loads lazily. Type data(mlb1) into the console.

  A data.frame with 353 rows and 47 variables:

  -  salary. 1993 season salary

  -  teamsal. team payroll

  -  nl. =1 if national league

  -  years. years in major leagues

  -  games. career games played

  -  atbats. career at bats

  -  runs. career runs scored

  -  hits. career hits

  -  doubles. career doubles

  -  triples. career triples

  -  hruns. career home runs

  -  rbis. career runs batted in

  -  bavg. career batting average

  -  bb. career walks

  -  so. career strike outs

  -  sbases. career stolen bases

  -  fldperc. career fielding perc

  -  frstbase. = 1 if first base

  -  scndbase. =1 if second base

  -  shrtstop. =1 if shortstop

  -  thrdbase. =1 if third base

  -  outfield. =1 if outfield

  -  catcher. =1 if catcher

  -  yrsallst. years as all-star

  -  hispan. =1 if hispanic

  -  black. =1 if black

  -  whitepop. white pop. in city

  -  blackpop. black pop. in city

  -  hisppop. hispanic pop. in city

  -  pcinc. city per capita income

  -  gamesyr. games per year in league

  -  hrunsyr. home runs per year

  -  atbatsyr. at bats per year

  -  allstar. perc. of years an all-star

  -  slugavg. career slugging average

  -  rbisyr. rbis per year

  -  sbasesyr. stolen bases per year

  -  runsyr. runs scored per year

  -  percwhte. percent white in city

  -  percblck. percent black in city

  -  perchisp. percent hispanic in city

  -  blckpb. black*percblck

  -  hispph. hispan*perchisp

  -  whtepw. white*percwhte

  -  blckph. black*perchisp

  -  hisppb. hispan*percblck

  -  lsalary. log(salary)

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `mlb1.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 353 rows and 47 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'mlb1.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/wooldridge/mlb1.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='mlb1.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
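
A sketch relating log salary to career variables (the selected columns are coerced to numeric since the returned array is object-typed):

import pandas as pd

x_train, metadata = mlb1('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
num = df[['lsalary', 'years', 'gamesyr', 'bavg', 'hrunsyr']].apply(
    pd.to_numeric, errors='coerce')
# correlations of log(salary) with career length and performance
print(num.corr()['lsalary'].drop('lsalary'))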
Example #11
def hmda(path):
  """The Boston HMDA Data Set

  a cross-section from 1997-1998

  *number of observations* : 2381

  *observation* : individuals

  *country* : United States

  In package version 0.2-9 and earlier this dataset was called Hdma.

  A dataframe containing :

  dir
      debt payments to total income ratio

  hir
      housing expenses to income ratio

  lvr
      ratio of size of loan to assessed value of property

  ccs
      consumer credit score from 1 to 6 (a low value being a good score)

  mcs
      mortgage credit score from 1 to 4 (a low value being a good score)

  pbcr
      public bad credit record?

  dmi
      denied mortgage insurance?

  self
      self-employed?

  single
      is the applicant single?

  uria
      1989 Massachusetts unemployment rate in the applicant's industry

  condominium
      is the unit a condominium? (was called `comdominiom` in version
      0.2-9 and earlier versions of the package)

  black
      is the applicant black?

  deny
      mortgage application denied?

  Federal Reserve Bank of Boston.

  Munnell, Alicia H., Geoffrey M.B. Tootell, Lynne E. Browne and James
  McEneaney (1996) “Mortgage lending in Boston: Interpreting HMDA data”,
  *American Economic Review*, 25-53.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `hmda.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 2381 rows and 13 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'hmda.csv'
  if not os.path.exists(os.path.join(path, filename)):
    url = 'http://dustintran.com/data/r/Ecdat/Hmda.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='hmda.csv',
                               resume=False)

  data = pd.read_csv(os.path.join(path, filename), index_col=0,
                     parse_dates=True)
  x_train = data.values
  metadata = {'columns': data.columns}
  return x_train, metadata
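
The Munnell et al. analysis centres on denial rates by race; a raw (unadjusted) cross-tabulation as a sketch:

import pandas as pd

x_train, metadata = hmda('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
# row-normalized denial rates; deny and black are yes/no factors
print(pd.crosstab(df['black'], df['deny'], normalize='index'))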
Example #13
def arbuthnot(path):
    """Arbuthnot's data on male and female birth ratios in London from 1629-1710.

  John Arbuthnot (1710) used these time series data on the ratios of male
  to female births in London from 1629-1710 to carry out the first known
  significance test, comparing observed data to a null hypothesis. The
  data for these 82 years showed that in every year there were more male
  than female christenings.

  On the assumption that male and female births were equally likely, he
  showed that the probability of observing 82 years with more males than
  females was vanishingly small (~4.14 x 10^-25). He used this to
  argue that a nearly constant birth ratio > 1 could be interpreted to
  show the guiding hand of a divine being. The data set adds variables of
  deaths from the plague and total mortality obtained by Campbell and from
  Creighton (1965).

  A data frame with 82 observations on the following 7 variables.

  `Year`
      a numeric vector, 1629-1710

  `Males`
      a numeric vector, number of male christenings

  `Females`
      a numeric vector, number of female christenings

  `Plague`
      a numeric vector, number of deaths from plague

  `Mortality`
      a numeric vector, total mortality

  `Ratio`
      a numeric vector, ratio of Males/Females

  `Total`
      a numeric vector, total christenings in London (000s)

  Arbuthnot, John (1710). "An Argument for Divine Providence, taken from
  the constant Regularity observ'd in the Births of both Sexes,"
  *Philosophical Transactions*, 27, 186-190. Published in 1711.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `arbuthnot.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 82 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'arbuthnot.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/HistData/Arbuthnot.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='arbuthnot.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
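
Arbuthnot's sign test is easy to reproduce from the loaded frame; a sketch (the exact published probability depends on the convention used):

import pandas as pd

x_train, metadata = arbuthnot('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
male_excess = df['Males'].astype(float) > df['Females'].astype(float)
n, k = len(df), int(male_excess.sum())
print(n, k)      # every one of the 82 years shows more male christenings
# probability of an all-male-excess run under a fair 50:50 null
print(0.5 ** n)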
Example #14
def spam7(path):
    """Spam E-mail Data

  The data consist of 4601 email items, of which 1813 items were
  identified as spam.

  This data frame contains the following columns:

  crl.tot
      total length of words in capitals

  dollar
      number of occurrences of the $ symbol

  bang
      number of occurrences of the ! symbol

  money
      number of occurrences of the word ‘money’

  n000
      number of occurrences of the string ‘000’

  make
      number of occurrences of the word ‘make’

  yesno
      outcome variable, a factor with levels `n` not spam, `y` spam

  George Forman, Hewlett-Packard Laboratories

  These data are available from the University of California at Irvine
  Repository of Machine Learning Databases and Domain Theories.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `spam7.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 4601 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'spam7.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/DAAG/spam7.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='spam7.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
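
A classification sketch on the seven-variable version, assuming scikit-learn is installed (the model choice and settings are illustrative, not from the source):

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

x_train, metadata = spam7('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
X = df.drop(columns='yesno').astype(float)
y = (df['yesno'] == 'y').astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(clf.score(X_te, y_te))  # held-out accuracy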
Example #15
def tucker(path):
  """9 Cognitive variables discussed by Tucker and Lewis (1973)

  Tucker and Lewis (1973) introduced a reliability coefficient for ML
  factor analysis. Their example data set was previously reported by
  Tucker (1958) and taken from Thurstone and Thurstone (1941). The
  correlation matrix is 9 x 9, computed from 710 subjects, and has two
  correlated factors of ability: Word Fluency and Verbal.

  A data frame with 9 observations on the following 9 variables.

  `t42`
      Prefixes

  `t54`
      Suffixes

  `t45`
      Chicago Reading Test: Vocabulary

  `t46`
      Chicago Reading Test: Sentences

  `t23`
      First and last letters

  `t24`
      First letters

  `t27`
      Four letter words

  `t10`
      Completion

  `t51`
      Same or Opposite

  Tucker, Ledyard (1958) An inter-battery method of factor analysis,
  Psychometrika, 23, 111-136.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `tucker.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 9 rows and 9 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'tucker.csv'
  if not os.path.exists(os.path.join(path, filename)):
    url = 'http://dustintran.com/data/r/psych/Tucker.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='tucker.csv',
                               resume=False)

  data = pd.read_csv(os.path.join(path, filename), index_col=0,
                     parse_dates=True)
  x_train = data.values
  metadata = {'columns': data.columns}
  return x_train, metadata
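
Since the frame is itself a 9 x 9 correlation matrix, a quick eigenvalue check hints at the two-factor structure; a sketch:

import numpy as np

x_train, metadata = tucker('~/data')
R = np.asarray(x_train, dtype=float)  # 9 x 9 correlation matrix
# eigenvalues in descending order; two dominant roots are consistent
# with the two correlated ability factors
print(np.linalg.eigvalsh(R)[::-1])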
Example #16
def lowbrth(path):
    """lowbrth

  Data loads lazily. Type data(lowbrth) into the console.

  A data.frame with 100 rows and 36 variables:

  -  year. 1987 or 1990

  -  lowbrth. perc births low weight

  -  infmort. infant mortality rate

  -  afdcprt. # participants in AFDC, 1000s

  -  popul. population, 1000s

  -  pcinc. per capita income

  -  physic. # physicians, 1000s

  -  afdcprc. percent of pop in AFDC

  -  d90. =1 if year == 1990

  -  lpcinc. log of pcinc

  -  cafdcprc. change in afdcprc

  -  clpcinc. change in lpcinc

  -  lphysic. log of physic

  -  clphysic. change in lphysic

  -  clowbrth. change in lowbrth

  -  cinfmort. change in infmort

  -  afdcpay. avg monthly AFDC payment

  -  afdcinc. afdcpay as percent pcinc

  -  lafdcpay. log of afdcpay

  -  clafdcpy. change in lafdcpay

  -  cafdcinc. change in afdcinc

  -  stateabb. state postal code

  -  state. name of state

  -  beds. # hospital beds, 1000s

  -  bedspc. beds per capita

  -  lbedspc. log(bedspc)

  -  clbedspc. change in lbedspc

  -  povrate. percent people below poverty line

  -  cpovrate. change in povrate

  -  afdcpsq. afdcprc^2

  -  cafdcpsq. change in afdcpsq

  -  physicpc. physicians per capita

  -  lphypc. log(physicpc)

  -  clphypc. change in lphypc

  -  lpopul. log(popul)

  -  clpopul. change in lpopul

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `lowbrth.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 100 rows and 36 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'lowbrth.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/wooldridge/lowbrth.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='lowbrth.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
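
A sketch of the two-period panel structure, comparing state means across the two years:

import pandas as pd

x_train, metadata = lowbrth('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
num = df[['year', 'lowbrth', 'infmort']].apply(pd.to_numeric, errors='coerce')
# mean low-birth-weight percentage and infant mortality, 1987 vs 1990
print(num.groupby('year')[['lowbrth', 'infmort']].mean())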
Example #17
def polio_trials(path):
    """Polio Field Trials Data

  The data frame `PolioTrials` gives the results of the 1954 field
  trials to test the Salk polio vaccine (named for the developer, Jonas
  Salk), conducted by the National Foundation for Infantile Paralysis
  (NFIP). It is adapted from data in the article by Francis et al. (1955).
  There were actually two clinical trials, corresponding to two
  statistical designs (`Experiment`), discussed by Brownlee (1955). The
  comparison of designs and results represented a milestone in the
  development of randomized clinical trials.

  A data frame with 8 observations on the following 6 variables.

  `Experiment`
      a factor with levels `ObservedControl` `RandomizedControl`

  `Group`
      a factor with levels `Controls` `Grade2NotInoculated`
      `IncompleteVaccinations` `NotInoculated` `Placebo`
      `Vaccinated`

  `Population`
      the size of the population in each group in each experiment

  `Paralytic`
      the number of cases of paralytic polio observed in that group

  `NonParalytic`
      the number of cases of non-paralytic polio observed in that group

  `FalseReports`
      the number of cases initially reported as polio, but later
      determined not to be polio in that group

  Kyle Siegrist, "Virtual Laboratories in Probability and Statistics",
  http://www.math.uah.edu/stat/data/Polio.html

  Thomas Francis, Robert Korn, et al. (1955). "An Evaluation of the 1954
  Poliomyelitis Vaccine Trials", *American Journal of Public Health*, 45,
  (50 page supplement with a 63 page appendix).

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `polio_trials.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 8 rows and 6 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'polio_trials.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/HistData/PolioTrials.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='polio_trials.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
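
A sketch of the headline comparison, paralytic cases per 100,000 in each group:

import pandas as pd

x_train, metadata = polio_trials('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['rate'] = df['Paralytic'].astype(float) / df['Population'].astype(float) * 1e5
# paralytic polio incidence per 100,000 by experiment and group
print(df[['Experiment', 'Group', 'rate']])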
Example #18
def us_crime(path):
    """The Effect of Punishment Regimes on Crime Rates

  Criminologists are interested in the effect of punishment regimes on
  crime rates. This has been studied using aggregate data on 47 states of
  the USA for 1960 given in this data frame. The variables seem to have
  been re-scaled to convenient numbers.

  This data frame contains the following columns:

  `M`
      percentage of males aged 14–24.

  `So`
      indicator variable for a Southern state.

  `Ed`
      mean years of schooling.

  `Po1`
      police expenditure in 1960.

  `Po2`
      police expenditure in 1959.

  `LF`
      labour force participation rate.

  `M.F`
      number of males per 1000 females.

  `Pop`
      state population.

  `NW`
      number of non-whites per 1000 people.

  `U1`
      unemployment rate of urban males 14–24.

  `U2`
      unemployment rate of urban males 35–39.

  `GDP`
      gross domestic product per head.

  `Ineq`
      income inequality.

  `Prob`
      probability of imprisonment.

  `Time`
      average time served in state prisons.

  `y`
      rate of crimes in a particular category per head of population.

  Ehrlich, I. (1973) Participation in illegitimate activities: a
  theoretical and empirical investigation. *Journal of Political Economy*,
  **81**, 521–565.

  Vandaele, W. (1978) Participation in illegitimate activities: Ehrlich
  revisited. In *Deterrence and Incapacitation*, eds A. Blumstein, J.
  Cohen and D. Nagin, pp. 270–335. US National Academy of Sciences.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `us_crime.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 47 rows and 16 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'us_crime.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/MASS/UScrime.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='us_crime.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
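
A sketch screening the regressors against the crime rate:

import pandas as pd

x_train, metadata = us_crime('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
# correlation of each variable with the crime rate y, strongest first
print(df.corr()['y'].drop('y').sort_values(ascending=False))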
Example #19
def bfox(path):
    """Canadian Women's Labour-Force Participation

  The `Bfox` data frame has 30 rows and 6 columns. Time-series data on
  Canadian women's labour-force participation, 1946–1975.

  This data frame contains the following columns:

  partic
      Percent of adult women in the workforce.

  tfr
      Total fertility rate: expected births to a cohort of 1000 women at
      current age-specific fertility rates.

  menwage
      Men's average weekly wages, in constant 1935 dollars and adjusted
      for current tax rates.

  womwage
      Women's average weekly wages.

  debt
      Per-capita consumer debt, in constant dollars.

  parttime
      Percent of the active workforce working 34 hours per week or less.

  Warning: the value of `tfr` for 1973 is misrecorded as 2931; it should
  be 1931.

  Fox, B. (1980) *Women's Domestic Labour and their Involvement in Wage
  Work.* Unpublished doctoral dissertation, p. 449.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `bfox.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 30 rows and 6 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'bfox.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/car/Bfox.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='bfox.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
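
The warning above suggests an obvious post-load fix; a sketch:

import pandas as pd

x_train, metadata = bfox('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
# correct the documented misrecording: tfr for 1973 is 1931, not 2931
df.loc[df['tfr'] == 2931, 'tfr'] = 1931
print(df['tfr'].max())  # should now be well below 2931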
Example #20
def florida(path):
    """Florida County Voting

  The `Florida` data frame has 67 rows and 11 columns. Vote by county in
  Florida for President in the 2000 election.

  This data frame contains the following columns:

  GORE
      Number of votes for Gore

  BUSH
      Number of votes for Bush.

  BUCHANAN
      Number of votes for Buchanan.

  NADER
      Number of votes for Nader.

  BROWNE
      Number of votes for Browne (whoever that is).

  HAGELIN
      Number of votes for Hagelin (whoever that is).

  HARRIS
      Number of votes for Harris (whoever that is).

  MCREYNOLDS
      Number of votes for McReynolds (whoever that is).

  MOOREHEAD
      Number of votes for Moorehead (whoever that is).

  PHILLIPS
      Number of votes for Phillips (whoever that is).

  Total
      Total number of votes.

  Adams, G. D. and Fastnow, C. F. (2000) A note on the voting
  irregularities in Palm Beach, FL. Formerly at

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `florida.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 67 rows and 11 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'florida.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/car/Florida.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='florida.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
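
The Palm Beach irregularity discussed in the reference shows up directly in Buchanan's vote share; a sketch (county names live in the CSV index, which this loader drops, so rows are positional):

import pandas as pd

x_train, metadata = florida('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
share = df['BUCHANAN'] / df['Total']
# the top row should stand well apart from the rest
print(share.sort_values(ascending=False).head())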
Example #21
def flower(path):
  """Flower Characteristics

  8 characteristics for 18 popular flowers.

  A data frame with 18 observations on 8 variables:

  +-------------+-----------+--------------+
  | [ , "V1"]   | factor    | winters      |
  +-------------+-----------+--------------+
  | [ , "V2"]   | factor    | shadow       |
  +-------------+-----------+--------------+
  | [ , "V3"]   | factor    | tubers       |
  +-------------+-----------+--------------+
  | [ , "V4"]   | factor    | color        |
  +-------------+-----------+--------------+
  | [ , "V5"]   | ordered   | soil         |
  +-------------+-----------+--------------+
  | [ , "V6"]   | ordered   | preference   |
  +-------------+-----------+--------------+
  | [ , "V7"]   | numeric   | height       |
  +-------------+-----------+--------------+
  | [ , "V8"]   | numeric   | distance     |
  +-------------+-----------+--------------+

  V1
      winters, is binary and indicates whether the plant may be left in
      the garden when it freezes.

  V2
      shadow, is binary and shows whether the plant needs to stand in the
      shadow.

  V3
      tubers, is asymmetric binary and distinguishes between plants with
      tubers and plants that grow in any other way.

  V4
      color, is nominal and specifies the flower's color (1 = white, 2 =
      yellow, 3 = pink, 4 = red, 5 = blue).

  V5
      soil, is ordinal and indicates whether the plant grows in dry (1),
      normal (2), or wet (3) soil.

  V6
      preference, is ordinal and gives someone's preference ranking going
      from 1 to 18.

  V7
      height, is interval scaled, the plant's height in centimeters.

  V8
      distance, is interval scaled, the distance in centimeters that
      should be left between the plants.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `flower.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 18 rows and 8 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'flower.csv'
  if not os.path.exists(os.path.join(path, filename)):
    url = 'http://dustintran.com/data/r/cluster/flower.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='flower.csv',
                               resume=False)

  data = pd.read_csv(os.path.join(path, filename), index_col=0,
                     parse_dates=True)
  x_train = data.values
  metadata = {'columns': data.columns}
  return x_train, metadata
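
A sketch mixing the factor and numeric codings (only V7 and V8 are cast; the rest are categorical codes):

import pandas as pd

x_train, metadata = flower('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df[['V7', 'V8']] = df[['V7', 'V8']].astype(float)
# mean height and planting distance by winter-hardiness (V1)
print(df.groupby('V1')[['V7', 'V8']].mean())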
Example #22
def retinopathy(path):
    """Diabetic Retinopathy

  A trial of laser coagulation as a treatment to delay diabetic
  retinopathy.

  A data frame with 394 observations on the following 9 variables.

  `id`
      numeric subject id

  `laser`
      type of laser used: `xenon` `argon`

  `eye`
      which eye was treated: `right` `left`

  `age`
      age at diagnosis of diabetes

  `type`
      type of diabetes: `juvenile` (diagnosis before age 20) or `adult`

  `trt`
      0 = control eye, 1 = treated eye

  `futime`
      time to loss of vision or last follow-up

  `status`
      0 = censored, 1 = loss of vision in this eye

  `risk`
      a risk score for the eye. This high risk subset is defined as a
      score of 6 or greater in at least one eye.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `retinopathy.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 394 rows and 9 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'retinopathy.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/survival/retinopathy.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='retinopathy.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
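
A crude treated-vs-control comparison ignoring censoring, as a sketch:

import pandas as pd

x_train, metadata = retinopathy('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['futime'] = df['futime'].astype(float)
df['status'] = df['status'].astype(int)
# mean follow-up time and raw vision-loss rate, treated (1) vs control (0)
print(df.groupby('trt')[['futime', 'status']].mean())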
Example #23
def minwage(path):
    """minwage

  Data loads lazily. Type data(minwage) into the console.

  A data.frame with 612 rows and 58 variables:

  -  emp232. employment, sector 232, 1000s

  -  wage232. hourly wage, sector 232, $

  -  emp236.

  -  wage236.

  -  emp234.

  -  wage234.

  -  emp314.

  -  wage314.

  -  emp228.

  -  wage228.

  -  emp233.

  -  wage233.

  -  emp394.

  -  wage394.

  -  emp231.

  -  wage231.

  -  emp226.

  -  wage226.

  -  emp387.

  -  wage387.

  -  emp056.

  -  wage056.

  -  unem. civilian unemployment rate, percent

  -  cpi. Consumer Price Index (urban), 1982-1984 = 100

  -  minwage. Federal minimum wage, $/hour

  -  lemp232. log(emp232)

  -  lwage232. log(wage232)

  -  gemp232. lemp232 - lemp232[_n-1]

  -  gwage232. lwage232 - lwage232[_n-1]

  -  lminwage. log(minwage)

  -  gmwage. lminwage - lminwage[_n-1]

  -  gmwage_1. gmwage[_n-1]

  -  gmwage_2.

  -  gmwage_3.

  -  gmwage_4.

  -  gmwage_5.

  -  gmwage_6.

  -  gmwage_7.

  -  gmwage_8.

  -  gmwage_9.

  -  gmwage_10.

  -  gmwage_11.

  -  gmwage_12.

  -  lemp236.

  -  gcpi. lcpi - lcpi[_n-1]

  -  lcpi. log(cpi)

  -  lwage236.

  -  gemp236.

  -  gwage236.

  -  lemp234.

  -  lwage234.

  -  gemp234.

  -  gwage234.

  -  lemp314.

  -  lwage314.

  -  gemp314.

  -  gwage314.

  -  t. linear time trend, 1 to 612

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `minwage.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 612 rows and 58 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'minwage.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/wooldridge/minwage.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='minwage.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
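
A sketch of the time-series angle: how sector-232 growth co-moves with minimum-wage and price growth:

import pandas as pd

x_train, metadata = minwage('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
num = df[['gwage232', 'gemp232', 'gmwage', 'gcpi']].apply(
    pd.to_numeric, errors='coerce')
# contemporaneous correlations with minimum-wage growth
print(num.corr()['gmwage'].drop('gmwage'))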
Example #24
def liver(path):
    """Liver related laboratory data

  Liver related laboratory data from a randomized, blind, parallel group
  clinical trial with 4 doses of a drug.

  A data frame with 606 observations on the following 9 variables.

  ALP.B
      Alkaline phosphatase at baseline. A numeric vector.

  ALT.B
      Alanine aminotransferase at baseline. A numeric vector.

  AST.B
      Aspartate aminotransferase at baseline. A numeric vector.

  TBL.B
      Total bilirubin at baseline. A numeric vector.

  ALP.M
      Alkaline phosphatase after treatment. A numeric vector.

  ALT.M
      Alanine aminotransferase after treatment. A numeric vector.

  AST.M
      Aspartate aminotransferase after treatment. A numeric vector.

  TBL.M
      Total bilirubin after treatment. A numeric vector.

  dose
      The treatment group (i.e. dose group). A factor with levels `A`
      `B` `C` `D`


  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `liver.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 606 rows and 9 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'liver.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/texmex/liver.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='liver.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
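
The baseline/after pairing invites a within-subject ratio; a sketch for ALT by dose group:

import pandas as pd

x_train, metadata = liver('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
labs = df.columns.drop('dose')
df[labs] = df[labs].astype(float)
# median post-treatment / baseline ratio of ALT, by dose group A-D
print((df['ALT.M'] / df['ALT.B']).groupby(df['dose']).median())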
Example #25
def jtrain(path):
  """jtrain

  Data loads lazily. Type data(jtrain) into the console.

  A data.frame with 471 rows and 30 variables:

  -  year. 1987, 1988, or 1989

  -  fcode. firm code number

  -  employ. # employees at plant

  -  sales. annual sales, $

  -  avgsal. average employee salary

  -  scrap. scrap rate (per 100 items)

  -  rework. rework rate (per 100 items)

  -  tothrs. total hours training

  -  union. =1 if unionized

  -  grant. = 1 if received grant

  -  d89. = 1 if year = 1989

  -  d88. = 1 if year = 1988

  -  totrain. total employees trained

  -  hrsemp. tothrs/totrain

  -  lscrap. log(scrap)

  -  lemploy. log(employ)

  -  lsales. log(sales)

  -  lrework. log(rework)

  -  lhrsemp. log(1 + hrsemp)

  -  lscrap_1. lagged lscrap; missing 1987

  -  grant_1. lagged grant; assumed 0 in 1987

  -  clscrap. lscrap - lscrap_1; year > 1987

  -  cgrant. grant - grant_1

  -  clemploy. lemploy - lemploy[_n-1]

  -  clsales. lsales - lsales[_n-1]

  -  lavgsal. log(avgsal)

  -  clavgsal. lavgsal - lavgsal[_n-1]

  -  cgrant_1. cgrant[_n-1]

  -  chrsemp. hrsemp - hrsemp[_n-1]

  -  clhrsemp. lhrsemp - lhrsemp[_n-1]

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_isbn_issn=9781111531041

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `jtrain.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 471 rows and 30 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'jtrain.csv'
  if not os.path.exists(os.path.join(path, filename)):
    url = 'http://dustintran.com/data/r/wooldridge/jtrain.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='jtrain.csv',
                               resume=False)

  data = pd.read_csv(os.path.join(path, filename), index_col=0,
                     parse_dates=True)
  x_train = data.values
  metadata = {'columns': data.columns}
  return x_train, metadata
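
A sketch of the grant effect the panel is built around (scrap is missing for many firms, hence the coercion):

import pandas as pd

x_train, metadata = jtrain('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
num = df[['year', 'grant', 'scrap']].apply(pd.to_numeric, errors='coerce')
# mean scrap rate by year and grant receipt
print(num.groupby(['year', 'grant'])['scrap'].mean())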
Example #26
def weimar(path):
    """1932 Weimar election data

  This data set contains election results for 10 kreise (equivalent to
  precincts) from the 1932 Weimar (German) election.

  A table containing 11 variables and 10 observations. The variables are

  Nazi
      Number of votes for the Nazi party

  Government
      Number of votes for the Government

  Communists
      Number of votes for the Communist party

  FarRight
      Number of votes for far right parties

  Other
      Number of votes for other parties, and non-voters

  shareunemployed
      Proportion unemployed

  shareblue
      Proportion working class

  sharewhite
      Proportion white-collar workers

  sharedomestic
      Proportion domestic servants

  shareprotestants
      Proportion Protestant


  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `weimar.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 10 rows and 11 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'weimar.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/Zelig/Weimar.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='weimar.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
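
An ecological-correlation sketch on the ten Kreise (only the columns used are cast, since any non-numeric column would break a whole-frame cast):

import pandas as pd

x_train, metadata = weimar('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
votes = df[['Nazi', 'Government', 'Communists', 'FarRight', 'Other']].astype(float)
nazi_share = votes['Nazi'] / votes.sum(axis=1)
# association between Nazi vote share and unemployment across precincts
print(nazi_share.corr(df['shareunemployed'].astype(float)))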
Example #27
def frogs(path):
    """Frogs Data

  The `frogs` data frame has 212 rows and 11 columns. The data are on
  the distribution of the Southern Corroboree frog, which occurs in the
  Snowy Mountains area of New South Wales, Australia.

  This data frame contains the following columns:

  pres.abs
      0 = frogs were absent, 1 = frogs were present

  northing
      reference point

  easting
      reference point

  altitude
      altitude, in meters

  distance
      distance in meters to nearest extant population

  NoOfPools
      number of potential breeding pools

  NoOfSites
      number of potential breeding sites within a 2 km radius

  avrain
      mean rainfall for Spring period

  meanmin
      mean minimum Spring temperature

  meanmax
      mean maximum Spring temperature

  Hunter, D. (2000) The conservation and demography of the southern
  corroboree frog (Pseudophryne corroboree). M.Sc. thesis, University of
  Canberra, Canberra.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `frogs.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 212 rows and 10 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'frogs.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/DAAG/frogs.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='frogs.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
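
A sketch contrasting occupied and unoccupied sites on a few habitat variables:

import pandas as pd

x_train, metadata = frogs('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
# means of habitat variables, split by frog presence (1) / absence (0)
print(df.groupby('pres.abs')[['altitude', 'distance', 'NoOfPools']].mean())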
Example #28
def gravity(path):
    """Acceleration Due to Gravity

  The `gravity` data frame has 81 rows and 2 columns.

  The `grav` data set has 26 rows and 2 columns.

  Between May 1934 and July 1935, the National Bureau of Standards in
  Washington D.C. conducted a series of experiments to estimate the
  acceleration due to gravity, *g*, at Washington. Each experiment
  produced a number of replicate estimates of *g* using the same
  methodology. Although the basic method remained the same for all
  experiments, that of the reversible pendulum, there were changes in
  configuration.

  The `gravity` data frame contains the data from all eight experiments.
  The `grav` data frame contains the data from the experiments 7 and 8.
  The data are expressed as deviations from 980.000 in centimetres per
  second squared.

  This data frame contains the following columns:

  `g`
      The deviation of the estimate from 980.000 centimetres per second
      squared.

  `series`
      A factor describing from which experiment the estimate was derived.

  The data were obtained from

  Cressie, N. (1982) Playing safe with misweighted means. *Journal of the
  American Statistical Association*, **77**, 754–759.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `gravity.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 81 rows and 2 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'gravity.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/boot/gravity.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='gravity.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
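
A sketch of the between-experiment heterogeneity this data set is used to illustrate:

import pandas as pd

x_train, metadata = gravity('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['g'] = df['g'].astype(float)
# per-experiment sample size, mean, and spread of the deviations
print(df.groupby('series')['g'].agg(['count', 'mean', 'std']))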
Example #29
def electricity(path):
    """Cost Function for Electricity Producers

  a cross-section for 1970

  *number of observations* : 158

  *observation* : production units

  *country* : United States

  A dataframe containing :

  cost
      total cost

  q
      total output

  pl
      wage rate

  sl
      cost share for labor

  pk
      capital price index

  sk
      cost share for capital

  pf
      fuel price

  sf
      cost share for fuel

  Christensen, L. and W. H. Greene (1976) “Economies of scale in U.S.
  electric power generation”, *Journal of Political Economy*, **84**,
  655-676.

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `electricity.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 158 rows and 8 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'electricity.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/Ecdat/Electricity.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='electricity.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
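
A sanity check on the cost-share columns, which should sum to roughly one per producer; a sketch:

import pandas as pd

x_train, metadata = electricity('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns']).astype(float)
shares = df[['sl', 'sk', 'sf']].sum(axis=1)
print(shares.describe())  # mean should be close to 1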
Example #30
def rabbit(path):
    """Blood Pressure in Rabbits

  Five rabbits were studied on two occasions, after treatment with saline
  (control) and after treatment with the 5-HT_3 antagonist MDL 72222.
  After each treatment, ascending doses of phenylbiguanide were injected
  intravenously at 10 minute intervals and the responses of mean blood
  pressure were measured. The goal was to test whether the cardiogenic
  chemoreflex elicited by phenylbiguanide depends on the activation of
  5-HT_3 receptors.

  This data frame contains 60 rows and the following variables:

  `BPchange`
      change in blood pressure relative to the start of the experiment.

  `Dose`
      dose of Phenylbiguanide in micrograms.

  `Run`
      label of run (`"C1"` to `"C5"`, then `"M1"` to `"M5"`).

  `Treatment`
      placebo or the 5-HT_3 antagonist MDL 72222.

  `Animal`
      label of animal used (`"R1"` to `"R5"`).

  J. Ludbrook (1994) Repeated measurements and multiple comparisons in
  cardiovascular research. *Cardiovascular Research* **28**, 303–311. [The
  numerical data are not in the paper but were supplied by Professor
  Ludbrook.]

  Args:

    path: str.
      Path to directory which either stores file or otherwise file will
      be downloaded and extracted there.
      Filename is `rabbit.csv`.

  Returns:

    Tuple of np.ndarray `x_train` with 60 rows and 5 columns and
    dictionary `metadata` of column headers (feature names).
  """
    import pandas as pd
    path = os.path.expanduser(path)
    filename = 'rabbit.csv'
    if not os.path.exists(os.path.join(path, filename)):
        url = 'http://dustintran.com/data/r/MASS/Rabbit.csv'
        maybe_download_and_extract(path,
                                   url,
                                   save_file_name='rabbit.csv',
                                   resume=False)

    data = pd.read_csv(os.path.join(path, filename),
                       index_col=0,
                       parse_dates=True)
    x_train = data.values
    metadata = {'columns': data.columns}
    return x_train, metadata
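
A dose-response summary contrasting the two treatments, as a sketch:

import pandas as pd

x_train, metadata = rabbit('~/data')
df = pd.DataFrame(x_train, columns=metadata['columns'])
df['BPchange'] = df['BPchange'].astype(float)
df['Dose'] = df['Dose'].astype(float)
# mean change in blood pressure at each dose, placebo vs MDL 72222
print(df.pivot_table(values='BPchange', index='Dose', columns='Treatment'))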