Example #1
def test_spss_labelled_str(datapath):
    # test file from the Haven project (https://haven.tidyverse.org/)
    fname = datapath("io", "data", "spss", "labelled-str.sav")

    df = pd.read_spss(fname, convert_categoricals=True)
    expected = pd.DataFrame({"gender": ["Male", "Female"]})
    expected["gender"] = pd.Categorical(expected["gender"])
    tm.assert_frame_equal(df, expected)

    df = pd.read_spss(fname, convert_categoricals=False)
    expected = pd.DataFrame({"gender": ["M", "F"]})
    tm.assert_frame_equal(df, expected)
Example #2
def test_spss_labelled_num(datapath):
    # test file from the Haven project (https://haven.tidyverse.org/)
    fname = datapath("io", "data", "spss", "labelled-num.sav")

    df = pd.read_spss(fname, convert_categoricals=True)
    expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
    tm.assert_frame_equal(df, expected)

    df = pd.read_spss(fname, convert_categoricals=False)
    expected = pd.DataFrame({"VAR00002": 1.0}, index=[0])
    tm.assert_frame_equal(df, expected)
Example #3
def test_spss_labelled_num_na(datapath):
    # test file from the Haven project (https://haven.tidyverse.org/)
    fname = datapath("io", "data", "spss", "labelled-num-na.sav")

    df = pd.read_spss(fname, convert_categoricals=True)
    expected = pd.DataFrame({"VAR00002": ["This is one", None]})
    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
    tm.assert_frame_equal(df, expected)

    df = pd.read_spss(fname, convert_categoricals=False)
    expected = pd.DataFrame({"VAR00002": [1.0, np.nan]})
    tm.assert_frame_equal(df, expected)
Example #4
def test_spss_umlauts(datapath):
    # test file from the Haven project (https://haven.tidyverse.org/)
    fname = datapath("io", "data", "spss", "umlauts.sav")

    df = pd.read_spss(fname, convert_categoricals=True)
    expected = pd.DataFrame(
        {"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", "the ö umlaut"]}
    )
    expected["var1"] = pd.Categorical(expected["var1"])
    tm.assert_frame_equal(df, expected)

    df = pd.read_spss(fname, convert_categoricals=False)
    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]})
    tm.assert_frame_equal(df, expected)
Example #5
def _load_spss(spss_files, usecols=None):
    usecols = usecols or DEFAULT_COLS

    pbar = tqdm(sorted(spss_files))
    for fp in pbar:
        pbar.set_description(os.path.split(fp)[-1])
        yield pd.read_spss(fp, usecols=usecols)
Example #6
def load_data(path=None, conn=None, sql=None, na_values=None):
    """
    Load a file into a pandas dataframe
    :param path: filepath pointing to a data file; file must be csv, xlsx/xls, json, or sav/zsav
    :param conn: SQLAlchemy connection to a database
    :param sql: sql query string to return a table to be converted into a dataframe
    :param na_values: additional strings to recognize as NA/NaN
    :return: pandas dataframe containing the data
    """
    # load data into dataframe, depending on file path
    if na_values is None:
        na_values = []

    if path is not None:
        file_type = path.split(".")[-1].lower()
        if file_type == "csv":
            df = pd.read_csv(path, na_values=na_values)
        elif file_type in ["xlsx", "xls"]:
            # Read the first sheet (sheet_name=0 returns a DataFrame;
            # a list such as [0] would return a dict of DataFrames)
            df = pd.read_excel(path, sheet_name=0, na_values=na_values)
        elif file_type == "json":
            # pd.read_json has no na_values parameter
            df = pd.read_json(path)
        elif file_type in ["sav", "zsav"]:
            # pd.read_spss has no na_values parameter
            df = pd.read_spss(path)
        else:
            raise ValueError(f"Unsupported file type: {file_type}")
        df = df.dropna(how="all")
        df = df.convert_dtypes()

    elif conn is not None and sql is not None:
        df = pd.read_sql(sql, conn)

    return df
Example #7
def load_data(year=None, usecols=DEFAULT_COLS + PRACTICE_COLS):
    """Loads the NAMCS Private Practice Health Care Survey Data

    Arguments
        year: None, int, or list

    Returns
        Pandas DataFrame
    """

    if year is None:
        # Just load the latest year
        sel_files = _remote_sel()[-1:]
    elif isinstance(year, (int, list, np.ndarray)):
        sel_files = _remote_sel(year=year)
    else:
        raise TypeError("year must be None, an int, or a list")

    pbar = tqdm(_cache_spss(sel_files))
    dfs = []
    for f in pbar:
        f = f.split('.zip')[0] + '.sav'
        fp = os.path.join(DEFAULT_CACHE_DIR, 'namcs', f)
        pbar.set_description(os.path.split(fp)[-1])
        dfs.append(pd.read_spss(fp, usecols=usecols))

    return pd.concat(dfs, sort=False)
Example #8
def scdb_sav_to_dataframe(scdb_sav_path):
    try:
        dataset = pd.read_spss(scdb_sav_path)
    except (PyreadstatError, ReadstatError):
        dataset, _ = pyreadstat.read_sav(scdb_sav_path,
                                         apply_value_formats=True,
                                         encoding='iso-8859-1')
    return dataset
Example #9
def read_spss(data_fp, df_lib):
    # https://github.com/dask/dask/issues/9055
    if is_dask_lib(df_lib):
        logger.warning(
            "Falling back to pd.read_spss() since dask backend does not support it"
        )
        return dd.from_pandas(pd.read_spss(data_fp), npartitions=1)
    return df_lib.read_spss(data_fp)
Example #10
    def _read_file(self, filenm: str, filetype: str,
                   **kwargs) -> pd.DataFrame:
        """
        An internal function that dispatches the appropriate pandas io method to read the file into memory and return a pandas DataFrame object.
        :param filenm: The file to read into memory
        :param filetype: The type of file that will be read.  Note: This parameter is used to determine which pandas method to dispatch in order to parse the file correctly.
        :param kwargs: Optional arguments that can be passed to the individual file parsers.  Enabled now to allow future development of a configuration file where parameters can be specified distinctly for each file.
        :return: Returns a pandas DataFrame object.
        """
        if filetype == 'stata':
            args = self._valid_args(kwargs, 'read_stata')
            return pd.read_stata(filenm, **args).convert_dtypes()
        elif filetype == 'csv':
            args = self._valid_args(kwargs, 'read_csv')
            return pd.read_csv(filenm, **args).convert_dtypes()
        elif filetype == 'excel':
            args = self._valid_args(kwargs, 'read_excel')
            return pd.read_excel(filenm, **args).convert_dtypes()
        elif filetype == 'spss':
            args = self._valid_args(kwargs, 'read_spss')
            return pd.read_spss(filenm, **args).convert_dtypes()
        elif filetype == 'sas':
            args = self._valid_args(kwargs, 'read_sas')
            return pd.read_sas(filenm, **args).convert_dtypes()
        elif filetype == 'html':
            # pd.read_html returns a list of DataFrames; take the first table
            args = self._valid_args(kwargs, 'read_html')
            return pd.read_html(filenm, **args)[0].convert_dtypes()
        elif filetype == 'fwf':
            args = self._valid_args(kwargs, 'read_fwf')
            return pd.read_fwf(filenm, **args).convert_dtypes()
        elif filetype == 'pickle':
            args = self._valid_args(kwargs, 'read_pickle')
            return pd.read_pickle(filenm, **args).convert_dtypes()
        elif filetype == 'tab':
            args = self._valid_args(kwargs, 'read_table')
            return pd.read_table(filenm, **args).convert_dtypes()
        elif filetype == 'json':
            if self.raw_json:
                args = self._valid_args(kwargs, 'read_json')
                return pd.read_json(filenm, **args).convert_dtypes()
            else:
                args = self._valid_args(kwargs, 'json_normalize')
                with open(filenm, 'r') as f:
                    # json.load takes a file object, not a string
                    return pd.json_normalize(json.load(f),
                                             **args).convert_dtypes()
        elif filetype == 'feather':
            args = self._valid_args(kwargs, 'read_feather')
            return pd.read_feather(filenm, **args).convert_dtypes()
        elif filetype == 'parquet':
            args = self._valid_args(kwargs, 'read_parquet')
            return pd.read_parquet(filenm, **args).convert_dtypes()
        elif filetype == 'hdf':
            args = self._valid_args(kwargs, 'read_hdf')
            return pd.read_hdf(filenm, **args).convert_dtypes()
Example #11
def read_dataset(path: Path) -> pd.DataFrame:
    """
    This method is responsible for reading the dataset.
    Implement it so that it returns a pandas dataframe from the given path.
    Notice that the path is of type Path, a helper type from Python that handles
    the path styles of different operating systems.
    """
    # Converting the path object into string for flexible parsing
    path_to_string = str(path)

    # Determining the file type and reading it through appropriate Pandas method
    if '.csv' in path_to_string or '.data' in path_to_string:
        dataset_data = pd.read_csv(path)

    elif '.html' in path_to_string:
        dataset_data = pd.read_html(path)

    elif '.json' in path_to_string:
        dataset_data = pd.read_json(path)

    elif '.xlsx' in path_to_string or '.xls' in path_to_string:
        dataset_data = pd.read_excel(path)

    elif '.sql' in path_to_string:
        # pd.read_sql takes a query plus a live connection, not a bare
        # file path, so .sql files cannot be loaded from a path alone
        dataset_data = None

    elif '.dta' in path_to_string:
        dataset_data = pd.read_stata(path)

    elif '.hdf' in path_to_string:
        dataset_data = pd.read_hdf(path)

    elif '.pickle' in path_to_string or '.pkl' in path_to_string:
        dataset_data = pd.read_pickle(path)

    # SPSS data files carry the .sav extension
    elif '.sav' in path_to_string:
        dataset_data = pd.read_spss(path)

    elif '.sas' in path_to_string:
        dataset_data = pd.read_sas(path)

    elif '.gbq' in path_to_string:
        # pd.read_gbq takes a BigQuery SQL query, not a file path,
        # so this format cannot be loaded from a path alone
        dataset_data = None

    elif '.fwf' in path_to_string:
        dataset_data = pd.read_fwf(path)

    else:
        dataset_data = None

    # Returning the result
    return dataset_data
Example #12
def main():
    if len(sys.argv) != 3:
        print('Usage: ' + sys.argv[0] + ' spssfile statafile')
        sys.exit(1)

    try:
        df = pd.read_spss(sys.argv[1])
        df.to_stata(sys.argv[2])
        return

    except Exception:
        print('Error:', sys.exc_info())
        traceback.print_exc()
        sys.exit(1)
Example #13
def main():

    el_paso_data = pd.read_spss(str(DATA_PATH))

    el_paso_data = el_paso_data.rename(
        columns={
            'FST2YRS': "lived first 2 years within 1 mile of ASARCO",
            'Lead_72': "1972 Blood Lead Level (ug / 100mL)",
            'Lead_73': "1973 Blood Lead Level (ug / 100mL)",
        })

    # create boolean mask
    first_2_years = el_paso_data[
        'lived first 2 years within 1 mile of ASARCO'] == 'Yes'

    # 1972
    bll_1972 = el_paso_data["1972 Blood Lead Level (ug / 100mL)"]
    print(
        scipy.stats.f_oneway(bll_1972[first_2_years].dropna(),
                             bll_1972[~first_2_years].dropna()))

    # 1973
    bll_1973 = el_paso_data["1973 Blood Lead Level (ug / 100mL)"]
    print(
        scipy.stats.f_oneway(bll_1973[first_2_years].dropna(),
                             bll_1973[~first_2_years].dropna()))

    mean_near_72 = bll_1972[first_2_years].mean()
    mean_far_72 = bll_1972[~first_2_years].mean()
    mean_near_73 = bll_1973[first_2_years].mean()
    mean_far_73 = bll_1973[~first_2_years].mean()

    plot_df = pd.DataFrame(
        {
            '1972': {
                'within 1 mile': mean_near_72,
                'outside 1 mile': mean_far_72
            },
            '1973': {
                'within 1 mile': mean_near_73,
                'outside 1 mile': mean_far_73
            },
        }, ).unstack().rename('average blood lead levels ug/dL').sort_index(
            level=1)

    plot_df.index = [' '.join(col).strip() for col in plot_df.index.values]

    plot_df.plot(style='D-', rot=8)
    plt.show()
Example #14
    def loadData(self):
        '''
        Load data from file using pandas functions

        :rtype: None
        '''

        if self.fileName.find("csv") != -1:
            self.data = _pandas.read_csv(self.fileName)
        elif self.fileName.find("xlsx") != -1:
            self.data = _pandas.read_excel(self.fileName,
                                           self.excelSheet,
                                           engine='openpyxl')
        elif self.fileName.find("sav") != -1:
            self.data = _pandas.read_spss(self.fileName)
Example #15
def fakehome(request):
    if request.method == 'POST':
        frm = NewsForm(request.POST)
        if frm.is_valid():
            text = frm.cleaned_data.get('text')
            frm.save()
            # 'final_model.sav' is a pickled scikit-learn model, not an SPSS
            # data file, so it is loaded with pickle rather than pd.read_spss()
            with open('final_model.sav', 'rb') as f:
                model = pickle.load(f)
            prediction = model.predict([text])
            prob = model.predict_proba([text])

            return HttpResponse(
                f"The given statement is {prediction[0]}. "
                f"The truth probability score is {prob[0][1]}.")
    else:
        frm = NewsForm()
    return render(request, 'fake/index.html', {'formfake': frm})
Example #16
def extract_clinical_sheet() -> pd.DataFrame:
    """
    Load the clinical information from SPSS file.
    """
    clinical_sheet = pd.read_spss("clinical_20200420.sav")

    # And set Patient ID as index (rename the copied column so the
    # resulting index is labelled "Patient ID").
    clinical_sheet.set_index(
        clinical_sheet["studynumber"].astype(int).rename("Patient ID"),
        inplace=True)
    # Convert stage from float to integer.
    columns_to_int = ["stage", "therapyline"]
    clinical_sheet[columns_to_int] = clinical_sheet[columns_to_int].astype(int)

    return clinical_sheet[clinical_features + outcome_labels]
Example #17
    def file_path_init(self, file_path):
        """
        Initialize the SPSS file path
        """
        self.file_path = file_path
        if len(self.file_path) != 0:
            self.lineEdit_filePath.setText(self.file_path)
            self.current_dataset_name = os.path.split(
                self.lineEdit_filePath.text())[1]
            self.lineEdit_datasetName.setText(self.current_dataset_name)
            logging.info("Loaded file_path: {}, datasetName: {}".format(
                self.file_path, self.current_dataset_name))

            if len(self.file_path) > 0:
                if self.checkBox_ifColumns.isChecked():
                    header = 0
                else:
                    header = None
                # Preview only the first 100 rows ("全部" means "all")
                if self.lineEdit_limitRow.text() == "全部":
                    nrows_preview = 100
                elif int(self.lineEdit_limitRow.text()) <= 100:
                    nrows_preview = int(self.lineEdit_limitRow.text())
                else:
                    nrows_preview = 100

                if self.lineEdit_limitRow.text() == "全部":
                    nrows = 100000000
                else:
                    nrows = int(self.lineEdit_limitRow.text())

                encoding = self.comboBox_encode.currentText()
                skiprows = int(self.lineEdit_passHead.text())

                logging.info(
                    "file_path:{},header:{},skiprows:{},nrows:{}".format(
                        self.file_path, header, skiprows, nrows_preview))

                # Note: pd.read_spss supports none of the header/skiprows/
                # nrows/encoding options, so the values above are logged only
                self.current_dataset = pd.read_spss(self.file_path)

                if len(self.current_dataset) == 0:
                    self.tableWidget_previewData.clear()
                    logging.info("Current dataset is empty")
                else:
                    self.import_dateset_preview()
                    logging.info("Data imported successfully")
            else:
                print("Please select a dataset")
Example #18
def load_data(path):
    path = Path(path)

    if path.suffix == ".feather":
        df = pd.read_feather(path)
    elif path.suffix == ".dta":
        df = pd.read_stata(path)
    elif path.suffix == ".csv" or path.suffix == "":
        df = pd.read_csv(path)
    elif path.suffix in [".pkl", ".pickle"]:
        df = pd.read_pickle(path)
    elif path.suffix == ".sav":
        df = pd.read_spss(path)
    else:
        raise NotImplementedError

    return df
Example #19
def read_and_reduce_sav(sav_file: str) -> pd.DataFrame:
    """Reads the DHS household-recode SAV file and extracts only the DHS Cluster ID and wealth-index columns
    (HV001, HV270, HV271). Then reduces the data by grouping on DHS Cluster ID and aggregating the wealth indexes (mode for HV270, mean for HV271).

    Args:
        sav_file (str): filename with the path for DHS SPSS SAV file.

    Returns:
        pd.DataFrame: returns the reduced dataframe
    """
    df = pd.read_spss(sav_file, usecols=['HV001', 'HV270', 'HV271'])
    df2 = df.groupby('HV001').agg(
        {'HV270': statistics.mode, 'HV271': 'mean'}).reset_index()

    # Rename column HV001 to DHSCLUST
    df2 = df2.rename(columns={'HV001': 'DHSCLUST'})

    return df2
Example #20
    def post(self, request):

        try:
            myfile = request.FILES['myfile']

        except KeyError:
            return render(request, 'not_valid.html')

        new_file = EventLog()
        new_file.spss_file.save('spss_file.sav',
                                File(BytesIO(myfile.read())),
                                save=True)
        new_file.save()

        print(new_file.spss_file.path)
        df = pd.read_spss(new_file.spss_file.path)

        new_file.delete()

        # Excel (Way too slow)

        #output = BytesIO()
        #print('excel writer')
        #writer = pd.ExcelWriter(output, engine='xlsxwriter')
        #print('to excel')
        #df.to_excel(writer, sheet_name='Data', encoding='utf-8', index=False)
        #print('save')
        #writer.save()

        #response = HttpResponse(output.getvalue(), content_type='application/vnd.ms-excel')
        #response['Content-Disposition'] = 'attachment; filename="spss_to_excel.xlsx"'

        # CSV

        output = StringIO()
        df.to_csv(output, encoding='utf-8', index=False)
        response = HttpResponse(output.getvalue(), content_type='text/csv')
        response[
            'Content-Disposition'] = 'attachment; filename="spss_to_csv.csv"'

        return response
Example #21
    def import_dateset_reload(self):
        """
        Reload the imported data
        """
        header = 0
        nrows_preview = 100
        sep = ','
        skiprows = 0

        if len(self.file_path) > 0:
            if self.checkBox_ifColumns.isChecked():
                header = 0
            else:
                header = None
            # Preview only the first 100 rows ("全部" means "all")
            if self.lineEdit_limitRow.text() == "全部":
                nrows_preview = 100
            elif int(self.lineEdit_limitRow.text()) <= 100:
                nrows_preview = int(self.lineEdit_limitRow.text())
            else:
                nrows_preview = 100

            if self.lineEdit_limitRow.text() == "全部":
                nrows = 100000000
            else:
                nrows = int(self.lineEdit_limitRow.text())

            encoding = self.comboBox_encode.currentText()
            skiprows = int(self.lineEdit_passHead.text())

            logging.info("file_path:{},header:{},skiprows:{},nrows:{}".format(
                self.file_path, header, skiprows, nrows_preview))

            self.current_dataset = pd.read_spss(self.file_path)

            if len(self.current_dataset) == 0:
                self.tableWidget_previewData.clear()
                logging.info("Current dataset is empty")
            else:
                self.import_dateset_preview()
                logging.info("Data imported successfully")
Example #22
    def _set_data_frame(data):
        """
        If the loaded data is not an instance of pandas dataframe, create one from the file type using pandas
        """
        if not isinstance(data, pd.DataFrame):
            file_type = data.split(".")[-1]

            if file_type == "csv":
                data = pd.read_csv(data)
            elif file_type == "xlsx":
                data = pd.read_excel(data)
            elif file_type == "dta":
                data = pd.read_stata(data)
            elif file_type == "sav":
                data = pd.read_spss(data)
            else:
                sys.exit(
                    "Error: File type not supported\nCurrently supported types are pandas.DataFrame, csv, xlsx,"
                    " dta and sav"
                )
        return data
Example #23
def main():

    el_paso_df = pd.read_spss(str(DATA_PATH))

    el_paso_df = el_paso_df.rename(columns={
        'Lead_72': '1972',
        'Lead_73': '1973'
    })
    el_paso_df.boxplot(column=['1972', '1973'])

    plt.ylabel(r'$\mathrm{Blood\ Lead\ Level}\ (\frac{\mu g}{dL})$')
    plt.suptitle('Blood Lead Level by Year', size='x-large')

    plt.savefig('../p2.3_paired_samples_ttest/boxplot.png')

    diff = el_paso_df['1973'] - el_paso_df['1972']

    sample_mean = diff.mean()
    sample_std = diff.std()
    n = diff.count()

    # +|- (two sided)
    critical_value = scipy.stats.t.ppf(.975, n - 1)
    # print(critical_value)

    sample_std_error = sample_std / n**0.5
    t = sample_mean / sample_std_error
    # print(t)

    tstat, pval = scipy.stats.ttest_1samp(diff.dropna(), 0)

    print(f"{'paired samples t-test 1973-1972':^45}")
    print("-" * 45)
    print(f"\tcritical value = \u00B1{critical_value:>8.4f}")
    print(f"\tt-statistic    = {tstat:>9.4f}")
    print(f"\tp-value        = {pval:>9.4f}")
    print(
        f'\tThe difference between groups is {sample_mean:3.4f} [{sample_mean - critical_value * sample_std_error:3.4f} to {sample_mean + critical_value * sample_std_error:3.4f}] (mean [95% CI])'
    )
Example #24
def main():

    el_paso_df = pd.read_spss(str(DATA_PATH))
    # print(el_paso_df.columns)

    # x = el_paso_df['Lead_72']
    # iq_full = el_paso_df['IQ_Full']
    # iq_verbal = el_paso_df['IQ_Verbal']
    # iq_performance = el_paso_df['IQ_Performance']

    el_paso_df = el_paso_df.rename(
        columns={
            'Lead_72': 'Blood Lead Levels 1972 (ug / 100mL)',
            'Lead_73': 'Blood Lead Levels 1973 (ug / 100mL)'
        })

    fig, ax1 = plt.subplots()
    simple_regression(
        el_paso_df[['Blood Lead Levels 1972 (ug / 100mL)', 'IQ_Full']], ax=ax1)
    plt.savefig("lead_lvl_72_v_iq_full.png")

    fig, ax2 = plt.subplots()
    simple_regression(
        el_paso_df[['Blood Lead Levels 1972 (ug / 100mL)', 'IQ_Verbal']],
        ax=ax2)
    plt.savefig("lead_lvl_72_v_iq_verbal.png")

    fig, ax3 = plt.subplots()
    simple_regression(
        el_paso_df[['Blood Lead Levels 1972 (ug / 100mL)', 'IQ_Performance']],
        ax=ax3)
    plt.savefig("lead_lvl_72_v_iq_performance.png")

    fig, ax3 = plt.subplots()
    simple_regression(
        el_paso_df[['Blood Lead Levels 1973 (ug / 100mL)', 'IQ_Performance']],
        ax=ax3)
    plt.savefig("lead_lvl_73_v_iq_performance.png")
Example #25
### Loading SAS, Stata, and SPSS Files
For SAS files with extensions `.sas7bdat` or `.xport`, or Stata files with extension `.dta`, the `pd.read_sas()` and `pd.read_stata()` functions work just like other data parsing functions in `pandas`. We can pass the file path, the file name alone (if we've set the working directory), or a URL to these functions and they load the data directly into a data frame.

On GitHub, I've saved a SAS file containing the monthly inflation rate in the United States since 1983, and a Stata file containing a CBS public opinion poll. To load the SAS file, I type:

url = "https://github.com/jkropko/DS-6001/raw/master/localdata/inflation.sas7bdat"
inflation = pd.read_sas(url)
inflation

And to load the Stata file:

url = "https://github.com/jkropko/DS-6001/raw/master/localdata/cbspoll.dta"
cbspoll = pd.read_stata(url)
cbspoll

SPSS files have the file extension `.sav`, and can be loaded with `pd.read_spss()` in the same way. One issue (at the time this notebook was written) is that the `pd.read_spss()` function only accepts local files, and not files from URLs. I saved data from a public opinion survey on [GitHub](https://github.com/jkropko/DS-6001/raw/master/localdata/survey.sav). If you want to try loading it into Python, download the file and move it to the folder where you've set your working directory (or type out the whole file path in the following code), and type:
```python
survey = pd.read_spss("survey.sav")
survey
```
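In the meantime, a workaround is to fetch the file with Python first and then point `pd.read_spss()` at the local copy. Here's a minimal sketch, assuming the GitHub URL above and write access to the working directory:
```python
import urllib.request

import pandas as pd

# pd.read_spss() only reads local files, so download the .sav first
url = "https://github.com/jkropko/DS-6001/raw/master/localdata/survey.sav"
local_path, _ = urllib.request.urlretrieve(url, "survey.sav")

survey = pd.read_spss(local_path)
survey
```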
Eventually I expect this `pd.read_spss()` function to be able to accept URLs as well. At that point, the following code should also work:
```python
url = "https://github.com/jkropko/DS-6001/raw/master/localdata/survey.sav"
survey = pd.read_spss(url)
survey
```

## Saving CSV and ASCII Files to Disk
Suppose we've done all the steps needed to clean and manage the data. We might want to save a clean version of the data in a CSV or other ASCII file on our local disk space. We can do so by applying the `.to_csv()` method to the `anes` data frame. The first argument is the filename with whatever extension we want for the saved file. As with `pd.read_csv()`, we can also specify the `sep` parameter to choose a delimiter for the text-based data file we are creating. Let's save the ANES dataframe as "anes_cleaned.csv" in our working directory:

```python
anes.to_csv("anes_cleaned.csv", sep=",")
```
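The `sep` argument is what determines the delimiter of the saved file, so swapping it out writes a different flavor of delimited text. A quick sketch of a tab-separated variant (with `index=False` added, an optional choice that keeps the row index out of the file):
```python
anes.to_csv("anes_cleaned.tsv", sep="\t", index=False)
```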
Example #26
def upper(x):
    '''Will convert all text to uppercase.'''
    return x.upper()


def replace(x):
    '''Cuts off the last part of county in the Names column.'''
    answer = x.replace('COUNTY', '')
    return answer


#def main():

oklahoma = gpd.read_file(
    '/Users/kellenbullock/Documents/Gradschool/Geographic Analysis/Exercise 1/Mappin/COUNTY_BOUNDARY.shp'
)
data = pd.read_spss(
    '/Users/kellenbullock/Desktop/Geographic Analysis II/Ex6/Part_B_Data.sav')

data['Name'] = data['Name'].apply(upper)
data['Name'] = data['Name'].apply(replace)
data = data.rename(columns={'Name': 'COUNTY_NAM'})

# Combining the datasets on the shared county-name column.
oklahoma = oklahoma.merge(data, on='COUNTY_NAM')

fig, ax = plt.subplots(1, 1)
ax.axis('off')
ax.set_title('Hierarchical Cluster Analysis',
             fontdict={
                 "fontsize": 14,
                 "fontweight": 3
             })
Example #27
def test_spss_usecols(datapath):
    # usecols must be list-like
    fname = datapath("io", "data", "spss", "labelled-num.sav")

    with pytest.raises(TypeError, match="usecols must be list-like."):
        pd.read_spss(fname, usecols="VAR00002")
Example #28
                   within_buggy,
                   c='red',
                   label='Bug Average Error')

        plt.legend()

        plt.show()

    return within_corrected, between_corrected


#%% Weisberg data reanalysis

# Load in dataset part 1
df_1 = pd.read_spss(
    os.path.join(baseDir, 'Weisberg_Dissertation',
                 'Dissertation_Part_1_Master_Spring_1.sav'))
df_1 = df_1[df_1['Within_Pointing'].notna()]
df_1 = df_1[df_1['Between_Pointing'].notna()]
# Load in dataset part 2
df_2 = pd.read_spss(
    os.path.join(baseDir, 'Weisberg_Dissertation',
                 'Dissertation_Part_2_Master_1.sav'))
df_2 = df_2[df_2['Within_Pointing'].notna()]
df_2 = df_2[df_2['Between_Pointing'].notna()]

# Replicate a few analyses from JEPLMC 2016 paper

# Correlation between OSpan and SymSpan
print(
    np.ma.corrcoef(np.ma.masked_invalid(df_1.OSpan_Total),
                   # second column name assumed from the comment above
                   np.ma.masked_invalid(df_1.SymSpan_Total)))
Example #29
def read_spss(data_fp):
    return pd.read_spss(data_fp)
Example #30
#!/usr/bin/env python
# coding: utf-8

# In[2]:

import pandas as pd

# In[6]:

df = pd.read_spss("colours.sav")

# In[7]:

df

# In[8]:

# To find if there is any correlation between personality and colour preference

df = df.drop("case", axis=1)

# In[9]:

df.apply(lambda x: len(x.unique()))

# In[10]:

df.describe()

# In[11]: