Example #1
def test_writerows_pd_np_issue63():
    """
    issue #63 "ufunc 'isnan' not supported for the input types"
    Caused by strings that contained NaN values
    """
    if skip:
        raise SkipTest
    buff = StringIO(u"""n1,n2,s1,s2
    1,1,a,a
    2,2,b,bb
    3,,c,""")
    desired = [[1.0, 1.0, b'a', b'a'], [2.0, 2.0, b'b', b'bb'],
               [3.0, None, b'c', b'']]

    df = pd.read_csv(buff, chunksize=10**6, sep=',').get_chunk()
    arr = df.values
    savFileName = join(gettempdir(), "check.sav")
    kwargs = dict(varNames=list(df),
                  varTypes=dict(n1=0, n2=0, s1=1, s2=2),
                  savFileName=savFileName,
                  ioUtf8=True)

    # numpy
    with srw.SavWriter(**kwargs) as writer:
        writer.writerows(arr)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual

    # pandas
    with srw.SavWriter(**kwargs) as writer:
        writer.writerows(df)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
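
A note on the varTypes argument used above: in savReaderWriter, 0 marks a numeric variable and a positive integer marks a string variable of that many bytes, which is why s2 (holding b'bb') is declared with width 2:

# varTypes convention (0 = numeric, n > 0 = string of n bytes)
varTypes = dict(n1=0,  # numeric
                n2=0,  # numeric
                s1=1,  # 1-byte string
                s2=2)  # 2-byte string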
Example #2
def get_data_sections(file_name, filterColumnNumber=16):  # or 27
    """
    loads chunks of data surrounding sms interventions.
    assumes data is sorted by pid, day, and time.
    :param file_name: name of save file to read
    :param filterColumnNumber: column which must be present for data to be included
    :return: array of arrays of consecutive data like [[d1, d2], [d6, d7, d8]]
    """
    data_sections = [[]]
    with savReaderWriter.SavReader(file_name,
                                   ioLocale='en_US.UTF-8') as reader:
        row_n = 0
        currentPID = 9
        for line in reader:
            # keep the row if the filter column has data and it belongs to the current participant
            if (line[filterColumnNumber] is not None
                    and line[0] == currentPID):
                data_sections[-1].append(get_data_dict(line, row_n))
            else:  # move on to the next data section
                # only start a new section if the current one is non-empty
                if len(data_sections[-1]) > 0:
                    data_sections.append([])
                currentPID = line[0]
            row_n += 1
            if row_n >= FILE_END:  # stop at the known end of the file
                break
    return data_sections
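
A minimal usage sketch for the function above; the file name is hypothetical, and FILE_END and get_data_dict are assumed to be defined elsewhere in the module:

# hypothetical input file; rows come back grouped into consecutive sections
sections = get_data_sections("sms_study.sav")
for section in sections:
    if section:  # a section may be empty if its rows were filtered out
        print(len(section), "consecutive rows")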
Example #3
def write_ms_access_file(savFilename, mdbFilename=None, overwrite=True):
    """Write the actual MS Access file"""
    if not sys.platform.startswith("win"):
        raise EnvironmentError("Sorry, Windows only")
    if not mdbFilename:
        mdbFilename = os.path.splitext(savFilename)[0] + ".mdb"
        mdbFilename = mdbFilename.replace(" ", "_")
    if os.path.exists(mdbFilename) and overwrite:
        os.remove(mdbFilename)
    pypyodbc.lowercase = False
    create_table = sql_create_table(savFilename)
    insert_table = sql_insert_template(savFilename)
    pypyodbc.win_create_mdb(mdbFilename)

    conn_string = (r"DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=" +
                   os.path.join(os.getcwd(), mdbFilename))
    cnx = pyodbc.connect(conn_string, autocommit=True)

    try:
        cursor = cnx.cursor()
        cursor.execute(create_table)
        with savReaderWriter.SavReader(savFilename) as reader:
            for record in reader:
                cursor.execute(insert_table, tuple(record))
        cursor.commit()
    finally:
        cnx.close()
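
The helpers sql_create_table and sql_insert_template are not shown. A sketch of how sql_insert_template might build a parameterized INSERT from the .sav header, assuming the table is named after the file; this is an illustration, not the author's implementation:

def sql_insert_template(savFilename):
    """Build 'INSERT INTO [<table>] VALUES (?, ..., ?)' with one placeholder per variable."""
    with savReaderWriter.SavHeaderReader(savFilename) as header:
        varNames = header.all().varNames
    table = os.path.splitext(os.path.basename(savFilename))[0]
    placeholders = ", ".join("?" * len(varNames))
    return "INSERT INTO [%s] VALUES (%s)" % (table, placeholders)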
Example #4
def writer_moredata(self, filepath, filename, valuetypes, start, end,
                    tablename):
    res = writer_data_table()
    with savReaderWriter.SavReader(os.path.join(filepath, filename),
                                   ioUtf8=True) as read:
        # without ioUtf8, Chinese characters come back hex-escaped, which is messier
        try:
            for i in read:
                i = i[start:end]
                for j in range(len(valuetypes)):
                    # the database does not accept unicode, so convert
                    # dates to strings; VARCHAR values pass through unchanged
                    if valuetypes[j] == "DATETIME":
                        i[j] = read.spss2strDate(i[j], '%Y-%m-%d %H:%M:%S',
                                                 None)
                    elif valuetypes[j] == "DATE":
                        i[j] = read.spss2strDate(i[j], '%Y-%m-%d', None)
                res.insert_sql(tablename, i)
        except Exception as e:
            my_log.error(e)
        finally:
            my_log.info("database write finished")
    res.close()
Example #5
def test_data_same(self):
    with sav.SavReader(out_savFileName, rawMode=True,
                       **b_settings) as data:
        out_records = data.all(False)
        out_encoding = data.fileEncoding
    self.assertEqual("utf_8", out_encoding)
    self.assertEqual(in_records, out_records)
Example #6
def extract_sav_data(sav_file,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     engine='savReaderWriter'):
    """ see parse_sav_file doc """
    if engine == 'savReaderWriter':
        with sr.SavReader(sav_file,
                          returnHeader=True,
                          ioLocale=ioLocale,
                          ioUtf8=ioUtf8) as reader:
            thedata = [x for x in reader]
            header = thedata[0]
            dataframe = pd.DataFrame.from_records(thedata[1:],
                                                  coerce_float=False)
            dataframe.columns = header
            for column in header:
                if dataframe[column].dtype == object:
                    # Replace None with NaN because SRW returns None if casting dates fails
                    values = dataframe[column].dropna().values
                    if len(values) > 0 and isinstance(values[0], str):
                        # savReaderWriter casts dates to str;
                        # creating DATETIME objects should happen here
                        dataframe[column] = dataframe[column].dropna().map(
                            str.strip)
            return dataframe
    elif engine == 'readstat':
        df, meta = pyreadstat.read_sav(sav_file)
        return df
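
A usage sketch for the two engines; the file name is hypothetical:

df = extract_sav_data("survey.sav")                     # savReaderWriter backend
df = extract_sav_data("survey.sav", engine="readstat")  # pyreadstat backend (metadata discarded)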
Example #7
def main():

    MAX_ROWS = 5000
    MAX_DECIMALS = 6

    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print('Usage: ' + sys.argv[0] + ' inputfile [outputpath]')
        sys.exit(1)

    if len(sys.argv) == 2:
        sys.argv.append('')

    try:
        with savReaderWriter.SavHeaderReader(sys.argv[1],
                                             ioUtf8=False) as header:
            metadata = header.all()

        res = {
            'alignments': metadata.alignments,
            'columnWidths': metadata.columnWidths,
            'measureLevels': metadata.measureLevels,
            'varFormats': metadata.formats,
            'varTypes': metadata.varTypes,
            'varNames': metadata.varNames,
            'valueLabels': metadata.valueLabels,
            'varLabels': metadata.varLabels,
            # # Otros valores que vienen en el header:
            # 'caseWeightVar': metadata.caseWeightVar,
            # 'fileAttributes': metadata.fileAttributes,
            # 'fileLabel': metadata.fileLabel,
            # 'missingValues': metadata.missingValues,
            # 'multRespDefs': metadata.multRespDefs,
            # 'varAttributes': metadata.varAttributes,
            # 'varRoles': metadata.varRoles,
            # 'varSets': metadata.varSets,
        }

        with open(os.path.join(sys.argv[2], 'header.json'), 'w') as h:
            h.write(json.dumps(convert_recursive(res), indent=4))

        with savReaderWriter.SavReader(sys.argv[1], ioUtf8=False) as reader:
            for i, lines in enumerate(chunks(reader, MAX_ROWS), 1):
                with open(
                        os.path.join(sys.argv[2],
                                     'data_' + str(i).zfill(5) + '.json'),
                        'w') as f:
                    encoded = convert_recursive(lines)
                    jsonText = json.dumps(encoded)
                    # jsonText = truncate_decimals(jsonText, MAX_DECIMALS)
                    f.write(jsonText)

        print('Files successfully created.')

        return

    except Exception:
        print("Error:", sys.exc_info())
        traceback.print_exc()
        sys.exit(1)
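
The chunks and convert_recursive helpers the script relies on are not shown. A plausible sketch of chunks built on itertools.islice; convert_recursive is assumed to walk the rows and decode bytes for JSON serialization:

import itertools

def chunks(iterable, size):
    """Yield successive lists of at most `size` rows from the reader."""
    iterator = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterator, size))
        if not chunk:
            break
        yield chunk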
Example #8
def test_writerows_tuple():
    records = tuple([tuple(record) for record in desired])
    savFileName = "output_tuple.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(records)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
Example #9
def read_sav(self, filepath):
    with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
        # getSavFileInfo returns:
        # (numVars, nCases, varNames, varTypes, formats, varLabels, valueLabels)
        read.getSavFileInfo()
        return read.formats, read.varNames, read.varLabels, read.valueLabels
Example #10
def leerSPSSPrueba():
    data = []
    with savReaderWriter.SavReader(archivo,
                                   returnHeader=True,
                                   ioLocale='Spanish_Spain.1252') as reader:
        for linea in reader:
            data.append(linea)
        return data
Example #11
def func(self, savFileName):

    self.outfile = tempfile.mktemp(suffix="_out.sav")
    with rw.SavWriter(self.outfile, [b'v1'], {b'v1': 0}) as writer:
        for i in range(10):
            writer.writerow([i])
    with rw.SavReader(self.outfile) as reader:
        self.assertEqual(reader.all(), [[float(i)] for i in range(10)])
    self.assertTrue(os.path.exists(self.outfile))
Example #12
def test_writerows_namedtuple():
    Record = namedtuple("Record", args[0])
    records = [Record(*record) for record in desired]
    savFileName = "output_namedtuple.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(records)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
Example #13
def test_raises_SPSSIOError(self):
    module = rw if sys.version_info[0] > 2 else rw.error
    SPSSIOError = module.SPSSIOError
    retcodes = module.retcodes
    with self.assertRaises(SPSSIOError) as cm:
        with rw.SavReader(self.badSavFile) as reader:
            for line in reader:
                pass
    error = cm.exception
    self.assertEqual(retcodes.get(error.retcode), "SPSS_INVALID_FILE")
Example #14
def test_writerows_pandas():
    if skip:
        raise SkipTest
    df = pd.DataFrame({"a": range(0, 20, 2), "b": range(1, 20, 2)})
    df.loc[0, "a"] = np.nan
    savFileName = "output_pd.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(df)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
Example #15
def read_sav(path):
    """Read .sav files, return pandas DataFrame"""
    raw_data = list(spss.SavReader(path, returnHeader=True))
    df = pd.DataFrame(raw_data)
    columns = list(df.loc[0])
    columns = [s.decode('utf-8') for s in columns]
    df.columns = columns
    df = df.iloc[1:].reset_index(
        drop=True)  # drop the header row now that it supplies the column names

    return df
Example #16
def test_writerows_numpy():
    if skip:
        raise SkipTest
    data = [range(10), range(10, 20)]
    array = np.array(data, dtype=np.float64).reshape(10, 2)
    array[0, 0] = np.nan
    savFileName = "output_np.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(array)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
Example #17
def readSav(fname):
    with spss.SavReader(fname) as reader:
        header = reader.header
        records = reader.all()
    df = pd.DataFrame(records, columns=header)
    # Decode the encoded data
    df.rename(columns=lambda x: x.decode("utf-8"), inplace=True)
    df.rename(columns=lambda x: x.translate(str.maketrans(' /()?', '_____')),
              inplace=True)
    for c in df.columns:
        if is_string_dtype(df[c]):
            df[c] = df[c].str.decode("utf-8")
    return df
Example #18
    def reload(self):
        import savReaderWriter
        self.rdr = savReaderWriter.SavReader(self.source.resolve())
        with self.rdr as reader:
            self.columns = []
            for i, vname in enumerate(reader.varNames):
                vtype = float if reader.varTypes[vname] == 0 else str
                self.addColumn(ColumnItem(vname.decode('utf-8'), i, type=vtype))

            self.rows = []
            for r in Progress(reader, total=reader.shape.nrows):
                self.rows.append(r)
Example #19
def spss_to_csv(input_filepath,
                output_filepath,
                upsample_size=100000,
                w_col='Weight'):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    with srw.SavHeaderReader(input_filepath, ioUtf8=True) as header:
        metadata = header.all()
        value_replacements = metadata.valueLabels

        nan_replacements = dict()
        for k, v in metadata.missingValues.items():
            if v and 'values' in v:
                nan_values = dict()
                if isinstance(v['values'], list):
                    for nan_val in v['values']:
                        nan_values[nan_val] = np.nan
                else:
                    nan_values[v['values']] = np.nan

                nan_replacements[k] = nan_values

        questions = metadata.varLabels

    with srw.SavReader(input_filepath, ioUtf8=True) as reader:
        header = reader.header
        records = reader.all()

    df = pd.DataFrame(records, columns=header)
    df.replace(value_replacements, inplace=True)
    df.replace(nan_replacements, inplace=True)

    df.to_csv(output_filepath)

    questions_file = Path(output_filepath).with_suffix('.json')
    with open(questions_file, 'w') as qf:
        json.dump(questions, qf)

    # upsample the survey for a "representative" sample for analysis
    # if specified
    if upsample_size and upsample_size > 0 and w_col in df.columns:
        rng = np.random.RandomState(12345)

        smpl = rng.choice(df.index,
                          upsample_size,
                          p=df[w_col] / df[w_col].sum())
        df_resampled = df.loc[smpl, :]

        output_path = Path(output_filepath)
        df_resampled.to_csv(
            output_path.with_name(output_path.stem + '_upsampled.csv'))
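
A usage sketch with hypothetical paths:

spss_to_csv("data/raw/survey.sav", "data/processed/survey.csv",
            upsample_size=100000, w_col="Weight")

Note that np.random.choice requires the probabilities to sum to 1, which is why the weights are divided by their sum before sampling.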
Example #20
    def loadSAV(self):

        raw_data = savReaderWriter.SavReader(str(self.filename),
                                             returnHeader=True)  # this is fast
        raw_data_list = list(raw_data)  # this is slow
        df = pd.DataFrame(raw_data_list)  # this is slow
        df = df.rename(columns=df.loc[0]).iloc[1:]
        print(df.head())

        l = list(df.columns)
        head = self.tableWidget_3.horizontalHeader()
        head.setStretchLastSection(True)
        nrow = min(len(df.index), 100)

        #self.datatable = QtGui.QTableWidget(parent=self)
        self.tableWidget_3.setColumnCount(len(df.columns))
        self.tableWidget_3.setRowCount(nrow)
        for i in range(nrow):
            for j in range(len(df.columns)):
                self.tableWidget_3.setItem(
                    i, j, QtGui.QTableWidgetItem(str(df.iat[i, j])))
        self.tableWidget_3.setHorizontalHeaderLabels(l)

        self.headerName = l
        self.nonSelectedVariables = self.headerName
        self.data = df
        st = str(nrow) + " of " + str(len(df.index)) + " rows shown"
        self.label.setText(st)
        self.label.setVisible(True)
        self.initDict()
        self.initComboBox()
Example #21
def main_function():
    rawData = list(spss.SavReader('RevisedDataFile.sav', returnHeader=True))
    pdData = pd.DataFrame(data=rawData[1:], columns=rawData[0]).dropna()
    sq10 = list(pdData.filter(regex='sq10.*'))
    xData_pre = pdData.drop(labels=['K6b', 'respid', 'sq08x1_1', 'sq08x1_2'],
                            axis=1)
    yData = pdData.filter(regex='K6b')
    xData = preprocess_data(xData_pre)
    xData.to_csv('xData.csv')
    yData.to_csv('yData.csv')
    assert len(xData) == len(yData)  # must be true for any ML model to work
    f_obj = open('cornelius_data.txt', 'w')
    for MLtype in ['LSVC', 'MNB', 'LR', 'MLPC']:
        learnData(xData, yData, f_obj, MLtype)
    f_obj.close()
Example #22
def loadSav(filename):

    with srw.SavReader(filename, returnHeader=True) as reader:
        df = pd.DataFrame(reader.all())
        # set the header row as the column names, then drop it
        df = df.rename(columns=df.iloc[0]).drop(df.index[0])

    # strip the b'' prefix from column names caused by the
    # bytes-encoded header
    df.columns = [i.decode("utf-8") for i in df.columns]

    return df
Example #23
def opensavfile(file):
    '''
     Read and/or process SPSS files
    :param file: sav data or path to file to open
    :return:
    '''
    text = []
    with spss.SavReader(file, ioUtf8=True) as reader:
        for line in reader:
            text.append(' '.join(str(element) for element in line))
    t: str = re.sub(r"[\d,.\S]+", "", ''.join(text)).strip()
    s: set = findNER(''.join(text))
    if len(s) > 0:
        print(file)
Example #24
def extract_sav_data(sav_file, ioLocale='en_US.UTF-8', ioUtf8=True):
    """ see parse_sav_file doc """
    with sr.SavReader(sav_file, returnHeader=True, ioLocale=ioLocale, ioUtf8=ioUtf8) as reader:
        header = next(reader)
        dataframe = pd.DataFrame.from_records(reader, coerce_float=False)
        dataframe.columns = header
        for column in header:
            if dataframe[column].dtype == object:
                # Replace None with NaN because SRW returns None if casting dates fails (dates are of type np.object))
                values = dataframe[column].dropna().values
                if len(values) > 0:
                    if isinstance(values[0], unicode):
                        dataframe[column] = dataframe[column].dropna().map(unicode.strip)
                    elif isinstance(values[0], str):
                        # savReaderWriter casts dates to str
                        dataframe[column] = dataframe[column].dropna().map(str.strip)
                        # creating DATETIME objects should happen here
        return dataframe
Example #25
def writer_data(filepath, filename, valuetypes):
    res = writer_data_table()
    with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
        # without ioUtf8, Chinese characters come back hex-escaped, which is messier
        my_time = my_datetime()
        for i in read:
            for j in range(len(valuetypes)):
                # the database does not accept unicode, so convert
                # dates to strings and store VARCHAR values as JSON
                if valuetypes[j] == "DATETIME":
                    become_time = my_time.become_str(i[j])
                    i[j] = become_time
                elif valuetypes[j] == "DATE":
                    become_time = my_time.become_str(i[j])
                    i[j] = become_time
                elif valuetypes[j] == "VARCHAR":
                    i[j] = json.dumps(i[j])
            res.insert_sql(filename, i)
    res.close()
Example #26
def sav_to_dataframe(file):
    """
    Converts a sav file to a pandas dataframe
    """
    if not os.path.isfile(file):
        print('File does not exist!')
        raise FileNotFoundError()

    if not file.endswith('.sav'):
        print('Not a .sav file!')
        raise ValueError()

    records = []
    with spss.SavReader(file) as reader:
        print("Reading file:", file, "...")
        for line in reader:
            records.append(line)

    df = pd.DataFrame(records)
    return df
Example #27
def leerSPSS():
    with savReaderWriter.SavReader(archivo,
                                   ioLocale='Spanish_Spain.1252') as reader:
        for line in reader:
            codigo = str(line[2]).strip() + str(line[4]).strip() + str(
                line[3]).strip()
            for firma in os.listdir(
                    os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                 'firmapruebas')):
                formulario = firma.replace('.', '_').split('_')
                formulariobus = formulario[0] + formulario[1] + formulario[3]
                if codigo == formulariobus:
                    rutanueva = "_".join([str(line[2]).strip(),
                                          str(line[4]).strip(),
                                          str(int(line[5])),
                                          str(line[3]).strip()]) + ".png"
                    os.rename(
                        os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                     'firmapruebas16', firma),
                        os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                     'firmapruebas16', rutanueva))
Example #28
def mocassin_fail_amiy(j, username, diffuse, directoryname, outfoldername,
                       starname):
    print("RUN FAILED! Writing output.")
    print("Failed on line number" + (j + 1) + "of AMIY_input.txt")

    with srw.SavReader('/Users/' + username +
                       '/mocassin-rw_changes/AMIY_number.sav') as reader:
        AMIY_number = reader.next()[0]  # single-row file holding the run counter
    id = ssi(AMIY_number)
    AMIY_number += 1
    # write the incremented counter back; the variable name is assumed here
    with srw.SavWriter('/Users/' + username +
                       '/mocassin-rw_changes/AMIY_number.sav',
                       [b'AMIY_number'], {b'AMIY_number': 0}) as writer:
        writer.writerow([AMIY_number])

    if diffuse[j]:
        type = 'SN'
    else:
        type = 'RSG'

    directoryname = ("/Users/" + username + "/mocassin-rw_changes/output/" +
                     type + "/" + id + '_' + starname + '_FAILED')
    os.system("mkdir " + directoryname)
    outfoldername = type + "/" + id + '_' + starname + '_FAILED'

    os.chdir('/Users/' + username + '/mocassin-rw_changes/output')

    os.system('cp dustGrid.out ' + directoryname + '/dustGrid_' + id +
              '.out.txt')
    os.system('cp runinfo.txt ' + directoryname + '/runinfo_' + id + '.txt')
    os.system('cp SED.out ' + directoryname + '/SED_' + id + '.out.txt')
    if (diffuse[j]):
        os.system('cp equivalentTau.out ' + directoryname + '/equivalentTau_' +
                  id + '.out.txt')
    else:
        os.system('cp tauNu.out ' + directoryname + '/tauNu_' + id +
                  '.out.txt')
    os.system('cp /Users/' + username +
              '/mocassin-rw_changes/input/input.in ' + directoryname +
              '/input_' + id + '.in.txt')
    os.system('cp /Users/' + username +
              '/mocassin-rw_changes/input/ndust/nDUST ' + directoryname +
              '/nDUST_' + id + '.in.txt')
Example #29
def read_sav(filename, columns, nrows=0):

    reader = s.SavReader(filename, rawMode=True)
    header = reader.getHeader(None)

    indices = [header.index(col) for col in columns]

    data = []
    for i, line in enumerate(reader):
        if line[0] < 1996.0:
            continue

        values = [line[index] for index in indices]
        data.append(values)
        if i == nrows - 1:
            break
    reader.close()  # no context manager is used, so close explicitly

    df = pd.DataFrame(data=data, columns=columns)

    na = -1.7976931348623157e+308
    df.replace(na, np.nan, inplace=True)
    df.index = df.SUBJID
    return df
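
The sentinel replaced above is the most negative IEEE 754 double, which savReaderWriter uses for SPSS system-missing values when rawMode=True. A less error-prone spelling of the same constant:

import sys

SYSMIS = -sys.float_info.max  # equals -1.7976931348623157e+308
df.replace(SYSMIS, np.nan, inplace=True)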
Example #30
def write_ms_access_file(savFilename, mdbFilename=None, overwrite=True):
    """Write the actual MS Access file"""
    if not sys.platform.startswith("win"):
        raise EnvironmentError("Sorry, Windows only")
    if not mdbFilename:
        mdbFilename = os.path.splitext(savFilename)[0] + ".mdb"
        mdbFilename = mdbFilename.replace(" ", "_")
    if os.path.exists(mdbFilename) and overwrite:
        os.remove(mdbFilename)

    create_table = sql_create_table(savFilename)
    insert_table = sql_insert_template(savFilename)
    pypyodbc.win_create_mdb(mdbFilename)
    conn_string = 'Driver={Microsoft Access Driver (*.mdb)};DBQ=%s'
    connection = pypyodbc.connect(conn_string % mdbFilename)
    try:
        cursor = connection.cursor()
        cursor.execute(create_table)
        with savReaderWriter.SavReader(savFilename) as reader:
            for record in reader:
                cursor.execute(insert_table, tuple(record))
        cursor.commit()
    finally:
        connection.close()