def crime3(path):
  """crime3

  Wooldridge's crime3 data set: a data.frame with 106 rows and 12
  variables on crime and clear-up percentages by district and year
  (district, year, crime, clrprc1, clrprc2, d78, avgclr, lcrime,
  clcrime, cavgclr, cclrprc1, cclrprc2).

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `crime3.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 106 rows and 12 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'crime3.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/wooldridge/crime3.csv',
        save_file_name='crime3.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def channing(path):
  """Channing House Data

  The `channing` data frame has 462 rows and 5 columns. Channing House is a
  retirement centre in Palo Alto, California; these data were collected
  between the house's opening in 1964 and July 1, 1975, covering 97 men and
  365 women. Columns: `sex` (factor, "Male"/"Female"), `entry` (age in
  months on entry), `exit` (age in months at death, departure, or
  July 1, 1975 — whichever came first), `time` (months spent at Channing
  House, `exit - entry`), and `cens` (right-censoring indicator: 1 if the
  resident died at Channing House, 0 otherwise). Many observations are
  censored because residents were still alive when the data were collected.

  Source: Hyde, J. (1980) Testing survival with incomplete observations.
  *Biostatistics Casebook*. R.G. Miller, B. Efron, B.W. Brown and
  L.E. Moses (editors), 31-46. John Wiley.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `channing.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 462 rows and 6 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'channing.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/boot/channing.csv',
        save_file_name='channing.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def gpa2(path):
  """gpa2

  Wooldridge's gpa2 data set: a data.frame with 4137 rows and 12 variables
  on college GPA and admissions covariates (sat, tothrs, colgpa, athlete,
  verbmath, hsize, hsrank, hsperc, female, white, black, hsizesq).

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `gpa2.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 4137 rows and 12 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'gpa2.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/wooldridge/gpa2.csv',
        save_file_name='gpa2.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def wagner_growth(path):
  """Wagner's Hannover Employment Growth Data

  Wagner (1994) investigates the rate of employment growth (`y`) as a
  function of the percentage of people engaged in production activities
  (`PA`) and higher services (`HS`), and of the growth of these percentages
  (`GPA`, `GHS`), during three time periods in 21 geographical regions of
  the greater Hannover area. A data frame with 21 * 3 = 63 observations
  (one per Region x Period) on 7 variables: `Region` (factor, 21 levels),
  `PA`, `GPA`, `HS`, `GHS`, `y`, and `Period` (factor with levels 1:3,
  where 1 = 1979-1982, 2 = 1983-1988, 3 = 1989-1992).

  Source: Hubert, M. and Rousseeuw, P. J. (1997). Robust regression with
  both continuous and binary regressors, *Journal of Statistical Planning
  and Inference* **57**, 153-163.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is
      `wagner_growth.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 63 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'wagner_growth.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/robustbase/wagnerGrowth.csv',
        save_file_name='wagner_growth.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def msq(path):
  """75 mood items from the Motivational State Questionnaire for 3896
  participants

  The Motivational State Questionnaire (MSQ) was developed to study
  emotions in laboratory and field settings; the data are well described
  by a two-dimensional solution of energy vs. tiredness and tension vs.
  calmness. A data frame with 3896 observations on 92 variables: 75 mood
  adjectives rated numerically (active, afraid, alert, ..., kindly,
  scornful), derived scales (`EA` and `TA`, Thayer's Energetic and Tense
  Arousal; `PA` and `NegAff`, Positive and Negative Affect), Eysenck
  Personality Inventory scores (`Extraversion`, `Neuroticism`, `Lie`,
  `Sociability`, `Impulsivity`), collection-time fields (`MSQ_Time`,
  `MSQ_Round`, `TOD`, `TOD24`), plus `ID`, `condition`, `scale` (factor:
  msq/r, original or revised msq), and `exper` (which of ~39 studies the
  data came from).

  Data collected at the Personality, Motivation, and Cognition Laboratory,
  Northwestern University.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `msq.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 3896 rows and 92 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'msq.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/psych/msq.csv',
        save_file_name='msq.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def snow_gr(path):
  """Snowfall data for Grand Rapids, MI

  Official snowfall data by month and season for Grand Rapids, MI, going
  back to 1893. A data frame with 119 observations of: `SeasonStart` and
  `SeasonEnd` (season runs July through June), monthly inches of snow for
  `Jul` through `Jun`, and `Total` (inches of snow for the entire season).

  These data were compiled by Laura Kapitula from data available at
  http://www.crh.noaa.gov/grr/climate/data/grr/snowfall/.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `snow_gr.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 119 rows and 15 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'snow_gr.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/mosaicData/SnowGR.csv',
        save_file_name='snow_gr.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def cps_85(path):
  """Data from the 1985 Current Population Survey (CPS85)

  The Current Population Survey (CPS) supplements census information
  between census years. These data are a random sample of persons from the
  CPS85, with information on wages and other worker characteristics. A
  data frame with 534 observations on: `wage` (US dollars per hour),
  `educ` (years of education), `race` (NW/W), `sex` (F/M), `hispanic`
  (Hisp/NH), `south` (NS/S), `married` (Married/Single), `exper` (years of
  work experience, inferred from `age` and `educ`), `union` (Not/Union),
  `age` (years), and `sector` (clerical/const/manag/manuf/other/prof/
  sales/service).

  Data are from http://lib.stat.cmu.edu/DASL.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `cps_85.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 534 rows and 11 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'cps_85.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/mosaicData/CPS85.csv',
        save_file_name='cps_85.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def mlb1(path):
  """mlb1

  Wooldridge's mlb1 data set: a data.frame with 353 rows and 47 variables
  on 1993 Major League Baseball salaries and career statistics (salary,
  teamsal, nl, years, games, atbats, runs, hits, doubles, triples, hruns,
  rbis, bavg, bb, so, sbases, fldperc, position indicators frstbase
  through catcher, yrsallst, hispan, black, city-demographics variables
  whitepop through pcinc, per-year rates gamesyr through runsyr,
  percwhte/percblck/perchisp, interaction terms blckpb through hisppb, and
  lsalary = log(salary)).

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `mlb1.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 353 rows and 47 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'mlb1.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/wooldridge/mlb1.csv',
        save_file_name='mlb1.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def hmda(path):
  """The Boston HMDA Data Set

  A cross-section from 1997-1998 of 2381 individual mortgage applications
  in the United States (in package version 0.2-9 and earlier this dataset
  was called Hdma). Variables: `dir` (debt payments to total income
  ratio), `hir` (housing expenses to income ratio), `lvr` (loan size to
  assessed property value), `ccs` (consumer credit score, 1-6, low is
  good), `mcs` (mortgage credit score, 1-4, low is good), `pbcr` (public
  bad credit record?), `dmi` (denied mortgage insurance?), `self`
  (self employed?), `single`, `uria` (1989 Massachusetts unemployment rate
  in applicant's industry), `condominium`, `black`, and `deny` (mortgage
  application denied?).

  Source: Federal Reserve Bank of Boston. Munnell, Alicia H., Geoffrey
  M.B. Tootell, Lynne E. Browne and James McEneaney (1996) "Mortgage
  lending in Boston: Interpreting HMDA data", *American Economic Review*,
  25-53.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `hmda.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 2381 rows and 13 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'hmda.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/Ecdat/Hmda.csv',
        save_file_name='hmda.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def arbuthnot(path):
  """Arbuthnot's data on male and female birth ratios in London from
  1629-1710.

  John Arbuthnot (1710) used these time-series data on the ratios of male
  to female christenings in London to carry out the first known
  significance test. Every year showed more male than female christenings;
  under the null of equally likely male/female births he computed a
  vanishingly small probability (~ 4.14 x 10^{-25}) and argued the nearly
  constant birth ratio > 1 showed the guiding hand of a divine being. The
  data set adds plague deaths and total mortality from Campbell and from
  Creighton (1965). A data frame with 82 observations on 7 variables:
  `Year` (1629-1710), `Males`, `Females`, `Plague`, `Mortality`, `Ratio`
  (Males/Females), and `Total` (total christenings in London, 000s).

  Source: Arbuthnot, John (1710). "An argument for Devine Providence,
  taken from the constant Regularity observ'd in the Births of both
  Sexes," *Philosophical transactions*, 27, 186-190. Published in 1711.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `arbuthnot.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 82 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'arbuthnot.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/HistData/Arbuthnot.csv',
        save_file_name='arbuthnot.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def spam7(path):
  """Spam E-mail Data

  The data consist of 4601 email items, of which 1813 were identified as
  spam. Columns: `crl.tot` (total length of words in capitals), `dollar`
  (occurrences of the $ symbol), `bang` (occurrences of the ! symbol),
  `money` (occurrences of the word 'money'), `n000` (occurrences of the
  string '000'), `make` (occurrences of the word 'make'), and `yesno`
  (outcome factor: `n` not spam, `y` spam).

  Source: George Forman, Hewlett-Packard Laboratories. Available from the
  University of California at Irvine Repository of Machine Learning
  Databases and Domain Theories, http://www.ics.uci.edu/~Here

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `spam7.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 4601 rows and 7 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'spam7.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/DAAG/spam7.csv',
        save_file_name='spam7.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def tucker(path):
  """9 Cognitive variables discussed by Tucker and Lewis (1973)

  Tucker and Lewis (1973) introduced a reliability coefficient for ML
  factor analysis; their example data set was previously reported by
  Tucker (1958) and taken from Thurstone and Thurstone (1941). The
  correlation matrix is 9 x 9 for 710 subjects and has two correlated
  factors of ability: Word Fluency and Verbal. Variables: `t42` Prefixes,
  `t54` Suffixes, `t45` Chicago Reading Test: Vocabulary, `t46` Chicago
  Reading Test: Sentences, `t23` First and last letters, `t24` First
  letters, `t27` Four letter words, `t10` Completion, `t51` Same or
  Opposite.

  Source: Tucker, Ledyard (1958) An inter-battery method of factor
  analysis, Psychometrika, 23, 111-136.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `tucker.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 9 rows and 9 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'tucker.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/psych/Tucker.csv',
        save_file_name='tucker.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def lowbrth(path):
  """lowbrth

  Wooldridge's lowbrth data set: a data.frame with 100 rows and 36
  variables on state-level low birth weight and welfare measures for 1987
  and 1990 (lowbrth, infmort, AFDC participation and payment variables,
  population, per-capita income, physicians, hospital beds, poverty rate,
  plus logs, squares, and year-over-year changes of these; `stateabb` and
  `state` identify the state and `d90` flags year == 1990).

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `lowbrth.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 100 rows and 36 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'lowbrth.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/wooldridge/lowbrth.csv',
        save_file_name='lowbrth.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def polio_trials(path):
  """Polio Field Trials Data

  Results of the 1954 field trials to test the Salk polio vaccine,
  conducted by the National Foundation for Infantile Paralysis (NFIP),
  adapted from Francis et al. (1955). There were two clinical trials
  corresponding to two statistical designs (`Experiment`), discussed by
  Brownlee (1955); the comparison represented a milestone in the
  development of randomized clinical trials. A data frame with 8
  observations on 6 variables: `Experiment` (ObservedControl /
  RandomizedControl), `Group` (Controls, Grade2NotInoculated,
  IncompleteVaccinations, NotInoculated, Placebo, Vaccinated),
  `Population` (group size), `Paralytic` and `NonParalytic` (cases of
  polio observed in that group), and `FalseReports` (cases initially
  reported as polio but later determined not to be).

  Sources: Kyle Siegrist, "Virtual Laboratories in Probability and
  Statistics", http://www.math.uah.edu/stat/data/Polio.html; Thomas
  Francis, Robert Korn, et al. (1955). "An Evaluation of the 1954
  Poliomyelitis Vaccine Trials", *American Journal of Public Health*, 45.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is
      `polio_trials.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 8 rows and 6 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'polio_trials.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/HistData/PolioTrials.csv',
        save_file_name='polio_trials.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def us_crime(path):
  """The Effect of Punishment Regimes on Crime Rates

  Aggregate data on 47 states of the USA for 1960, used to study the
  effect of punishment regimes on crime rates; the variables seem to have
  been re-scaled to convenient numbers. Columns: `M` (percent males aged
  14-24), `So` (Southern-state indicator), `Ed` (mean years of schooling),
  `Po1`/`Po2` (police expenditure in 1960/1959), `LF` (labour force
  participation), `M.F` (males per 1000 females), `Pop` (state
  population), `NW` (non-whites per 1000 people), `U1`/`U2` (unemployment
  rates of urban males 14-24 / 35-39), `GDP` (GDP per head), `Ineq`
  (income inequality), `Prob` (probability of imprisonment), `Time`
  (average time served in state prisons), and `y` (crime rate per head).

  Sources: Ehrlich, I. (1973) Participation in illegitimate activities,
  *Journal of Political Economy*, **81**, 521-565; Vandaele, W. (1978) in
  *Deterrence and Incapacitation*, pp. 270-335, US National Academy of
  Sciences.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `us_crime.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 47 rows and 16 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'us_crime.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/MASS/UScrime.csv',
        save_file_name='us_crime.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def bfox(path):
  """Canadian Women's Labour-Force Participation

  The `Bfox` data frame has 30 rows: time-series data on Canadian women's
  labor-force participation, 1946-1975. Columns: `partic` (percent of
  adult women in the workforce), `tfr` (total fertility rate: expected
  births to a cohort of 1000 women at current age-specific fertility
  rates), `menwage` (men's average weekly wages, constant 1935 dollars,
  tax-adjusted), `womwage` (women's average weekly wages), `debt`
  (per-capita consumer debt, constant dollars), and `parttime` (percent of
  the active workforce working 34 hours per week or less).

  Warning: the value of `tfr` for 1973 is misrecorded as 2931; it should
  be 1931.

  Source: Fox, B. (1980) *Women's Domestic Labour and their Involvement in
  Wage Work.* Unpublished doctoral dissertation, p. 449.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `bfox.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 30 rows and 6 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'bfox.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/car/Bfox.csv',
        save_file_name='bfox.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def florida(path):
  """Florida County Voting

  The `Florida` data frame has 67 rows and 11 columns: vote by county in
  Florida for President in the 2000 election. Columns give the number of
  votes for each candidate — `GORE`, `BUSH`, `BUCHANAN`, `NADER`,
  `BROWNE`, `HAGELIN`, `HARRIS`, `MCREYNOLDS`, `MOOREHEAD`, `PHILLIPS` —
  plus `Total`, the total number of votes.

  Source: Adams, G. D. and Fastnow, C. F. (2000) A note on the voting
  irregularities in Palm Beach, FL.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `florida.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 67 rows and 11 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  filename = 'florida.csv'
  root = os.path.expanduser(path)
  csv_path = os.path.join(root, filename)
  # Fetch the file only on first use; subsequent calls read the cached copy.
  if not os.path.exists(csv_path):
    maybe_download_and_extract(
        root,
        'http://dustintran.com/data/r/car/Florida.csv',
        save_file_name='florida.csv',
        resume=False)
  frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
  return frame.values, {'columns': frame.columns}
def flower(path):
  """Flower Characteristics

  8 characteristics for 18 popular flowers.

  A data frame with 18 observations on 8 variables:

  - V1 winters: binary; indicates whether the plant may be left in the
    garden when it freezes.
  - V2 shadow: binary; shows whether the plant needs to stand in the shadow.
  - V3 tubers: asymmetric binary; distinguishes between plants with tubers
    and plants that grow in any other way.
  - V4 color: nominal; the flower's color (1 = white, 2 = yellow,
    3 = pink, 4 = red, 5 = blue).
  - V5 soil: ordinal; whether the plant grows in dry (1), normal (2), or
    wet (3) soil.
  - V6 preference: ordinal; someone's preference ranking from 1 to 18.
  - V7 height: interval scaled; the plant's height in centimeters.
  - V8 distance: interval scaled; the distance in centimeters that should
    be left between the plants.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `flower.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 18 rows and 8 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'flower.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/cluster/flower.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='flower.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def retinopathy(path):
  """Diabetic Retinopathy

  A trial of laser coagulation as a treatment to delay diabetic
  retinopathy.

  A data frame with 394 observations on the following 9 variables:

  - id: numeric subject id.
  - laser: type of laser used: `xenon` `argon`.
  - eye: which eye was treated: `right` `left`.
  - age: age at diagnosis of diabetes.
  - type: type of diabetes: `juvenile` `adult` (diagnosis before age 20).
  - trt: 0 = control eye, 1 = treated eye.
  - futime: time to loss of vision or last follow-up.
  - status: 0 = censored, 1 = loss of vision in this eye.
  - risk: a risk score for the eye. The high risk subset is defined as a
    score of 6 or greater in at least one eye.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `retinopathy.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 394 rows and 9 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'retinopathy.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/survival/retinopathy.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='retinopathy.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def minwage(path):
  """minwage

  Data loads lazily. Type data(minwage) into the console.

  A data.frame with 612 rows and 58 variables:

  - emp232: employment, sector 232, 1000s. wage232: hourly wage, sector
    232, $. Likewise empNNN/wageNNN for sectors 236, 234, 314, 228, 233,
    394, 231, 226, 387, 056.
  - unem: civilian unemployment rate, percent.
  - cpi: Consumer Price Index (urban), 1982-1984 = 100.
  - minwage: Federal minimum wage, $/hour.
  - lemp232: log(emp232). lwage232: log(wage232).
  - gemp232: lemp232 - lemp232[\\_n-1]. gwage232: lwage232 -
    lwage232[\\_n-1].
  - lminwage: log(minwage). gmwage: lminwage - lminwage[\\_n-1].
  - gmwage\\_1 through gmwage\\_12: lags of gmwage.
  - lcpi: log(cpi). gcpi: lcpi - lcpi[\\_n-1].
  - lempNNN, lwageNNN, gempNNN, gwageNNN for sectors 236, 234, 314.
  - t: linear time trend, 1 to 612.

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `minwage.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 612 rows and 58 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'minwage.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/wooldridge/minwage.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='minwage.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def liver(path):
  """Liver related laboratory data

  Liver related laboratory data from a randomized, blind, parallel group
  clinical trial with 4 doses of a drug.

  A data frame with 606 observations on the following 9 variables:

  - ALP.B: Alkaline phosphatase at baseline. A numeric vector.
  - ALT.B: Alanine aminotransferase at baseline. A numeric vector.
  - AST.B: Aspartate aminotransferase at baseline. A numeric vector.
  - TBL.B: Total bilirubin at baseline. A numeric vector.
  - ALP.M: Alkaline phosphatase after treatment. A numeric vector.
  - ALT.M: Alanine aminotransferase after treatment. A numeric vector.
  - AST.M: Aspartate aminotransferase after treatment. A numeric vector.
  - TBL.M: Total bilirubin after treatment. A numeric vector.
  - dose: The treatment group (i.e. dose group). A factor with levels
    `A` `B` `C` `D`.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `liver.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 606 rows and 9 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'liver.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/texmex/liver.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='liver.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def jtrain(path):
  """jtrain

  Data loads lazily. Type data(jtrain) into the console.

  A data.frame with 471 rows and 30 variables:

  - year: 1987, 1988, or 1989.
  - fcode: firm code number.
  - employ: # employees at plant.
  - sales: annual sales, $.
  - avgsal: average employee salary.
  - scrap: scrap rate (per 100 items).
  - rework: rework rate (per 100 items).
  - tothrs: total hours training.
  - union: =1 if unionized.
  - grant: =1 if received grant.
  - d89: =1 if year = 1989.
  - d88: =1 if year = 1988.
  - totrain: total employees trained.
  - hrsemp: tothrs/totrain.
  - lscrap: log(scrap).
  - lemploy: log(employ).
  - lsales: log(sales).
  - lrework: log(rework).
  - lhrsemp: log(1 + hrsemp).
  - lscrap\\_1: lagged lscrap; missing 1987.
  - grant\\_1: lagged grant; assumed 0 in 1987.
  - clscrap: lscrap - lscrap\\_1; year > 1987.
  - cgrant: grant - grant\\_1.
  - clemploy: lemploy - lemploy[\\_n-1].
  - clsales: lavgsal - lavgsal[\\_n-1].
  - lavgsal: log(avgsal).
  - clavgsal: lavgsal - lavgsal[\\_n-1].
  - cgrant\\_1: cgrant[\\_n-1].
  - chrsemp: hrsemp - hrsemp[\\_n-1].
  - clhrsemp: lhrsemp - lhrsemp[\\_n-1].

  https://www.cengage.com/cgi-wadsworth/course_products_wp.pl?fid=M20b&product_
  isbn_issn=9781111531041

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `jtrain.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 471 rows and 30 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'jtrain.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/wooldridge/jtrain.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='jtrain.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def weimar(path):
  """1932 Weimar election data

  This data set contains election results for 10 kreise (equivalent to
  precincts) from the 1932 Weimar (German) election.

  A table containing 11 variables and 10 observations. The variables are:

  - Nazi: Number of votes for the Nazi party.
  - Government: Number of votes for the Government.
  - Communists: Number of votes for the Communist party.
  - FarRight: Number of votes for far right parties.
  - Other: Number of votes for other parties, and non-voters.
  - shareunemployed: Proportion unemployed.
  - shareblue: Proportion working class.
  - sharewhite: Proportion white-collar workers.
  - sharedomestic: Proportion domestic servants.
  - shareprotestants: Proportion Protestant.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `weimar.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 10 rows and 11 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'weimar.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/Zelig/Weimar.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='weimar.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def frogs(path):
  """Frogs Data

  The `frogs` data frame has 212 rows and 11 columns. The data are on the
  distribution of the Southern Corroboree frog, which occurs in the Snowy
  Mountains area of New South Wales, Australia.

  Columns:

  - pres.abs: 0 = frogs were absent, 1 = frogs were present.
  - northing: reference point.
  - easting: reference point.
  - altitude: altitude, in meters.
  - distance: distance in meters to nearest extant population.
  - NoOfPools: number of potential breeding pools.
  - NoOfSites: number of potential breeding sites within a 2 km radius.
  - avrain: mean rainfall for Spring period.
  - meanmin: mean minimum Spring temperature.
  - meanmax: mean maximum Spring temperature.

  Hunter, D. (2000) The conservation and demography of the southern
  corroboree frog (Pseudophryne corroboree). M.Sc. thesis, University of
  Canberra, Canberra.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `frogs.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 212 rows and 10 columns and
    dictionary `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'frogs.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/DAAG/frogs.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='frogs.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def gravity(path):
  """Acceleration Due to Gravity

  The `gravity` data frame has 81 rows and 2 columns. The `grav` data set
  has 26 rows and 2 columns.

  Between May 1934 and July 1935, the National Bureau of Standards in
  Washington D.C. conducted a series of experiments to estimate the
  acceleration due to gravity, *g*, at Washington. Each experiment
  produced a number of replicate estimates of *g* using the same
  methodology. Although the basic method remained the same for all
  experiments, that of the reversible pendulum, there were changes in
  configuration.

  The `gravity` data frame contains the data from all eight experiments.
  The `grav` data frame contains the data from the experiments 7 and 8.
  The data are expressed as deviations from 980.000 in centimetres per
  second squared.

  Columns:

  - g: The deviation of the estimate from 980.000 centimetres per second
    squared.
  - series: A factor describing from which experiment the estimate was
    derived.

  The data were obtained from Cressie, N. (1982) Playing safe with
  misweighted means. *Journal of the American Statistical Association*,
  **77**, 754-759.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `gravity.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 81 rows and 2 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'gravity.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/boot/gravity.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='gravity.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def electricity(path):
  """Cost Function for Electricity Producers

  a cross-section from 1970 to 1970

  *number of observations*: 158

  *observation*: production units

  *country*: United States

  A dataframe containing:

  - cost: total cost.
  - q: total output.
  - pl: wage rate.
  - sl: cost share for labor.
  - pk: capital price index.
  - sk: cost share for capital.
  - pf: fuel price.
  - sf: cost share for fuel.

  Christensen, L. and W. H. Greene (1976) "Economies of scale in U.S.
  electric power generation", *Journal of Political Economy*, **84**,
  655-676.

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `electricity.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 158 rows and 8 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'electricity.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/Ecdat/Electricity.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='electricity.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}
def rabbit(path):
  """Blood Pressure in Rabbits

  Five rabbits were studied on two occasions, after treatment with saline
  (control) and after treatment with the *5-HT\\_3* antagonist MDL 72222.
  After each treatment ascending doses of phenylbiguanide were injected
  intravenously at 10 minute intervals and the responses of mean blood
  pressure measured. The goal was to test whether the cardiogenic
  chemoreflex elicited by phenylbiguanide depends on the activation of
  *5-HT\\_3* receptors.

  This data frame contains 60 rows and the following variables:

  - BPchange: change in blood pressure relative to the start of the
    experiment.
  - Dose: dose of Phenylbiguanide in micrograms.
  - Run: label of run (`"C1"` to `"C5"`, then `"M1"` to `"M5"`).
  - Treatment: placebo or the *5-HT\\_3* antagonist MDL 72222.
  - Animal: label of animal used (`"R1"` to `"R5"`).

  J. Ludbrook (1994) Repeated measurements and multiple comparisons in
  cardiovascular research. *Cardiovascular Research* **28**, 303-311.
  [The numerical data are not in the paper but were supplied by Professor
  Ludbrook]

  Args:
    path: str. Path to directory which either stores file or otherwise file
      will be downloaded and extracted there. Filename is `rabbit.csv`.

  Returns:
    Tuple of np.ndarray `x_train` with 60 rows and 5 columns and dictionary
    `metadata` of column headers (feature names).
  """
  import pandas as pd
  path = os.path.expanduser(path)
  filename = 'rabbit.csv'
  file_path = os.path.join(path, filename)
  if not os.path.exists(file_path):
    url = 'http://dustintran.com/data/r/MASS/Rabbit.csv'
    maybe_download_and_extract(path, url,
                               save_file_name='rabbit.csv',
                               resume=False)
  df = pd.read_csv(file_path, index_col=0, parse_dates=True)
  return df.values, {'columns': df.columns}