def test_writerows_pd_np_issue63():
    """Regression test for issue #63 ("ufunc 'isnan' not supported for the
    input types"), caused by string columns that contained NaN values.
    """
    if skip:
        raise SkipTest
    buff = StringIO(u"""n1,n2,s1,s2
1,1,a,a
2,2,b,bb
3,,c,""")
    desired = [[1.0, 1.0, b'a', b'a'],
               [2.0, 2.0, b'b', b'bb'],
               [3.0, None, b'c', b'']]
    df = pd.read_csv(buff, chunksize=10**6, sep=',').get_chunk()
    arr = df.values
    savFileName = join(gettempdir(), "check.sav")
    kwargs = {"varNames": list(df),
              "varTypes": dict(n1=0, n2=0, s1=1, s2=2),
              "savFileName": savFileName,
              "ioUtf8": True}
    # write once from the numpy array, once from the DataFrame itself
    for rows in (arr, df):
        with srw.SavWriter(**kwargs) as writer:
            writer.writerows(rows)
        with srw.SavReader(savFileName) as reader:
            actual = reader.all(False)
        assert actual == desired, actual
def get_data_sections(file_name, filterColumnNumber=16):  # or 27
    """Load chunks of data surrounding sms interventions.

    Assumes data is sorted by pid, day, and time.

    :param file_name: name of save file to read
    :param filterColumnNumber: column which must be present for data to be included
    :return: array of arrays of consecutive data like [[d1, d2], [d6, d7, d8]]
    """
    sections = [[]]
    with savReaderWriter.SavReader(file_name, ioLocale='en_US.UTF-8') as reader:
        current_pid = 9
        for row_n, line in enumerate(reader):
            # keep the row only if the filter column holds data and the pid
            # matches the section currently being built
            if line[filterColumnNumber] is not None and line[0] == current_pid:
                sections[-1].append(get_data_dict(line, row_n))
            else:
                # start a new section, unless the last one is still empty
                if sections[-1]:
                    sections.append([])
                current_pid = line[0]
            if row_n + 1 >= FILE_END:  # yeah... that happens...
                break
    return sections
def write_ms_access_file(savFilename, mdbFilename=None, overwrite=True):
    """Write the actual MS Access file.

    :param savFilename: source SPSS .sav file
    :param mdbFilename: target .mdb; derived from savFilename when omitted
    :param overwrite: remove an existing target file first
    :raises EnvironmentError: on non-Windows platforms
    """
    if not sys.platform.startswith("win"):
        raise EnvironmentError("Sorry, Windows only")
    if not mdbFilename:
        mdbFilename = os.path.splitext(savFilename)[0] + ".mdb"
        mdbFilename = mdbFilename.replace(" ", "_")
    if os.path.exists(mdbFilename) and overwrite:
        os.remove(mdbFilename)
    pypyodbc.lowercase = False
    create_table = sql_create_table(savFilename)
    insert_table = sql_insert_template(savFilename)
    pypyodbc.win_create_mdb(mdbFilename)
    # BUG FIX: the DBQ path was assembled as os.getcwd() + "\\" + name,
    # which broke whenever mdbFilename was absolute or in another directory;
    # os.path.abspath handles both cases.
    conn_string = (r"DRIVER={Microsoft Access Driver (*.mdb, *.accdb)};DBQ="
                   + os.path.abspath(mdbFilename))
    # NOTE(review): the .mdb is created via pypyodbc but the connection is
    # opened via pyodbc -- confirm both drivers are intended here.
    cnx = pyodbc.connect(conn_string, autocommit=True)
    try:
        cursor = cnx.cursor()
        cursor.execute(create_table)
        with savReaderWriter.SavReader(savFilename) as reader:
            for record in reader:
                cursor.execute(insert_table, tuple(record))
        cursor.commit()
    finally:
        cnx.close()
def writer_moredata(self, filepath, filename, valuetypes, start, end,
                    tablename):
    """Read columns [start:end] of every row in a .sav file and insert them
    into database table *tablename*.

    valuetypes[j] is one of "DATETIME", "DATE" or "VARCHAR" and selects how
    column j is converted before insertion.
    """
    res = writer_data_table()
    # ioUtf8=True: without it, Chinese characters come back as escaped hex
    # and are much harder to handle.
    with savReaderWriter.SavReader(os.path.join(filepath, filename),
                                   ioUtf8=True) as read:
        try:
            for row in read:
                row = row[start:end]
                # the database does not accept unicode directly, so convert
                # the values first
                for j, vtype in enumerate(valuetypes):
                    if vtype == "DATETIME":
                        row[j] = read.spss2strDate(row[j],
                                                   '%Y-%m-%d %H:%M:%S', None)
                    elif vtype == "DATE":
                        row[j] = read.spss2strDate(row[j], '%Y-%m-%d', None)
                    elif vtype == "VARCHAR":
                        row[j] = row[j]
                res.insert_sql(tablename, row)
        except Exception as e:
            my_log.error(e)
        else:
            # BUG FIX: success used to be logged in ``finally`` and was
            # therefore also emitted after a failure; log only on success.
            my_log.info("data write database success !!!")
        finally:
            res.close()
def test_data_same(self):
    """Round-trip check: raw records and file encoding survive rewriting."""
    with sav.SavReader(out_savFileName, rawMode=True, **b_settings) as data:
        out_records, out_encoding = data.all(False), data.fileEncoding
    self.assertEqual("utf_8", out_encoding)
    self.assertEqual(in_records, out_records)
def extract_sav_data(sav_file, ioLocale='en_US.UTF-8', ioUtf8=True,
                     engine='savReaderWriter'):
    """see parse_sav_file doc"""
    if engine == 'savReaderWriter':
        with sr.SavReader(sav_file, returnHeader=True, ioLocale=ioLocale,
                          ioUtf8=ioUtf8) as reader:
            thedata = list(reader)
        header = thedata[0]
        dataframe = pd.DataFrame.from_records(thedata[1:], coerce_float=False)
        dataframe.columns = header
        for column in header:
            # BUG FIX: ``isinstance(dataframe[column].dtype, np.object)`` was
            # always True (np.object aliased builtin object and was removed
            # in numpy >= 1.24); compare the dtype itself instead.
            if dataframe[column].dtype == object:
                # Replace None with NaN because SRW returns None if casting
                # dates fails (dates are of object dtype)
                values = dataframe[column].dropna().values
                # BUG FIX: the original had two identical
                # ``isinstance(values[0], str)`` branches (leftover from the
                # py2 unicode/str split); merged into one.
                if len(values) > 0 and isinstance(values[0], str):
                    dataframe[column] = dataframe[column].dropna().map(str.strip)
            # creating DATETIME objects should happen here
        return dataframe
    elif engine == 'readstat':
        df, meta = pyreadstat.read_sav(sav_file)
        return df
def main():
    """Dump an SPSS .sav file to JSON: a header.json with the metadata plus
    chunked data_NNNNN.json files of at most MAX_ROWS rows each.

    Usage: prog inputfile [outputpath]; exits with status 1 on error.
    """
    MAX_ROWS = 5000
    MAX_DECIMALS = 6
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print('Usage: ' + sys.argv[0] + ' inputfile [outputpath]')
        sys.exit(1)
    if len(sys.argv) == 2:
        sys.argv.append('')  # default output path: current directory
    try:
        with savReaderWriter.SavHeaderReader(sys.argv[1], ioUtf8=False) as header:
            metadata = header.all()
            res = {
                'alignments': metadata.alignments,
                'columnWidths': metadata.columnWidths,
                'measureLevels': metadata.measureLevels,
                'varFormats': metadata.formats,
                'varTypes': metadata.varTypes,
                'varNames': metadata.varNames,
                'valueLabels': metadata.valueLabels,
                'varLabels': metadata.varLabels,
                # Other values available in the header:
                # caseWeightVar, fileAttributes, fileLabel, missingValues,
                # multRespDefs, varAttributes, varRoles, varSets
            }
            with open(os.path.join(sys.argv[2], 'header.json'), 'w') as h:
                h.write(json.dumps(convert_recursive(res), indent=4))
        with savReaderWriter.SavReader(sys.argv[1], ioUtf8=False) as reader:
            for i, lines in enumerate(chunks(reader, MAX_ROWS), 1):
                fname = os.path.join(sys.argv[2],
                                     'data_' + str(i).zfill(5) + '.json')
                with open(fname, 'w') as f:
                    encoded = convert_recursive(lines)
                    jsonText = json.dumps(encoded)
                    # jsonText = truncate_decimals(jsonText, MAX_DECIMALS)
                    f.write(jsonText)
        print('Files successfully created.')
        return
    except Exception:
        # BUG FIX: a bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception at this top-level boundary only.
        print("Error: ", sys.exc_info())
        traceback.print_exc()
        sys.exit(1)
def test_writerows_tuple():
    """writerows must accept a tuple of tuples."""
    records = tuple(tuple(row) for row in desired)
    savFileName = "output_tuple.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(records)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
def read_sav(self, filepath):
    """Open *filepath* and return its metadata as a tuple
    (formats, varNames, varLabels, valueLabels).
    """
    with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
        # getSavFileInfo() returns (numVars, nCases, varNames, varTypes,
        # formats, varLabels, valueLabels).  BUG FIX: the original bound the
        # result to an unused local ``ret``; the call itself is kept in case
        # it primes the reader's attributes -- TODO confirm it can be dropped.
        read.getSavFileInfo()
        return read.formats, read.varNames, read.varLabels, read.valueLabels
def leerSPSSPrueba():
    """Read every record (header included) from *archivo* into a list."""
    with savReaderWriter.SavReader(archivo, returnHeader=True,
                                   ioLocale='Spanish_Spain.1252') as reader:
        rows = list(reader)
    return rows
def func(self, savFileName):
    """Write ten single-value records to a temp .sav file and verify they
    read back as floats."""
    # BUG FIX: tempfile.mktemp() is deprecated and race-prone; mkstemp()
    # creates the file atomically.  SavWriter simply overwrites the empty
    # file that mkstemp leaves behind.
    fd, self.outfile = tempfile.mkstemp(suffix="_out.sav")
    os.close(fd)
    with rw.SavWriter(self.outfile, [b'v1'], {b'v1': 0}) as writer:
        for i in range(10):
            writer.writerow([i])
    with rw.SavReader(self.outfile) as reader:
        self.assertEqual(reader.all(), [[float(i)] for i in range(10)])
    self.assertTrue(os.path.exists(self.outfile))
def test_writerows_namedtuple():
    """writerows must accept a sequence of namedtuples."""
    Record = namedtuple("Record", args[0])
    records = [Record._make(row) for row in desired]
    savFileName = "output_namedtuple.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(records)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
def test_raises_SPSSIOError(self):
    """Reading a corrupt file must raise SPSSIOError with retcode
    SPSS_INVALID_FILE."""
    module = rw if sys.version_info[0] > 2 else rw.error
    SPSSIOError = module.SPSSIOError
    retcodes = module.retcodes
    # BUG FIX: the original read sys.exc_info() *after* the with-block; on
    # Python 3 the handled exception is cleared at that point, so ``error``
    # was None and ``error.retcode`` raised AttributeError.  assertRaises'
    # context manager keeps a reference to the exception instead.
    with self.assertRaises(SPSSIOError) as ctx:
        with rw.SavReader(self.badSavFile) as reader:
            for line in reader:
                pass
    self.assertEqual(retcodes.get(ctx.exception.retcode), "SPSS_INVALID_FILE")
def test_writerows_pandas():
    """writerows must accept a pandas DataFrame (first 'a' value is NaN)."""
    if skip:
        raise SkipTest
    frame = pd.DataFrame({"a": range(0, 20, 2), "b": range(1, 20, 2)})
    frame.loc[0, "a"] = np.nan
    outfile = "output_pd.sav"
    with srw.SavWriter(outfile, *args) as writer:
        writer.writerows(frame)
    with srw.SavReader(outfile) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
def read_sav(path):
    """Read a .sav file and return a pandas DataFrame.

    The first record is the header; its byte strings are decoded and become
    the column names.
    """
    raw = list(spss.SavReader(path, returnHeader=True))
    frame = pd.DataFrame(raw)
    frame.columns = [name.decode('utf-8') for name in frame.loc[0]]
    # drop the header row and renumber the remaining rows from 0
    return frame.iloc[1:].reset_index(drop=True)
def test_writerows_numpy():
    """writerows must accept a numpy float array (cell [0, 0] is NaN)."""
    if skip:
        raise SkipTest
    # 0..19 laid out row-major into 10 rows of 2 -- same values as the
    # original [range(10), range(10, 20)] construction after reshape
    values = np.arange(20, dtype=np.float64).reshape(10, 2)
    values[0, 0] = np.nan
    savFileName = "output_np.sav"
    with srw.SavWriter(savFileName, *args) as writer:
        writer.writerows(values)
    with srw.SavReader(savFileName) as reader:
        actual = reader.all(False)
    assert actual == desired, actual
def readSav(fname):
    """Load *fname* into a DataFrame, decoding byte columns and names to
    str and replacing the characters ' /()?' in names with underscores."""
    with spss.SavReader(fname) as reader:
        header, records = reader.header, reader.all()
    df = pd.DataFrame(records, columns=header)
    # decode the byte-string column names, then sanitize them
    sanitize = str.maketrans(' /()?', '_____')
    df.columns = [name.decode("utf-8").translate(sanitize)
                  for name in df.columns]
    for col in df.columns:
        if is_string_dtype(df[col]):
            df[col] = df[col].str.decode("utf-8")
    return df
def reload(self):
    """(Re)open the source .sav file, rebuild the column list from the
    reader's variable metadata, and load every row."""
    import savReaderWriter
    self.rdr = savReaderWriter.SavReader(self.source.resolve())
    with self.rdr as reader:
        self.columns = []
        for idx, raw_name in enumerate(reader.varNames):
            # varTypes == 0 marks numeric variables; anything else is string
            col_type = float if reader.varTypes[raw_name] == 0 else str
            self.addColumn(ColumnItem(raw_name.decode('utf-8'), idx,
                                      type=col_type))
        self.rows = []
        append = self.rows.append
        for row in Progress(reader, total=reader.shape.nrows):
            append(row)
def spss_to_csv(input_filepath, output_filepath, upsample_size=100000, w_col='Weight'): """ Runs data processing scripts to turn raw data from (../raw) into cleaned data ready to be analyzed (saved in ../processed). """ with srw.SavHeaderReader(input_filepath, ioUtf8=True) as header: metadata = header.all() value_replacements = metadata.valueLabels nan_replacements = dict() for k, v in metadata.missingValues.items(): if v and 'values' in v: nan_values = dict() if isinstance(v['values'], list): for nan_val in v['values']: nan_values[nan_val] = np.nan else: nan_values[v['values']] = np.nan nan_replacements[k] = nan_values questions = metadata.varLabels with srw.SavReader(input_filepath, ioUtf8=True) as reader: header = reader.header records = reader.all() df = pd.DataFrame(records, columns=header) df.replace(value_replacements, inplace=True) df.replace(nan_replacements, inplace=True) df.to_csv(output_filepath) questions_file = Path(output_filepath).with_suffix('.json') with open(questions_file, 'w') as qf: json.dump(questions, qf) # upsample the survey for a "representative" sample for analysis # if specified if upsample_size and upsample_size > 0 and w_col in df.columns: rng = np.random.RandomState(12345) smpl = rng.choice(df.index, upsample_size, p=df[w_col] / df[w_col].sum()) df_resampled = df.loc[smpl, :] output_path = Path(output_filepath) df_resampled.to_csv( output_path.with_name(output_path.stem + '_upsampled.csv'))
def loadSAV(self): raw_data = savReaderWriter.SavReader(str(self.filename), returnHeader=True) # This is fast raw_data = savReaderWriter.SavReader(str(self.filename), returnHeader=True) # This is fast raw_data_list = list(raw_data) # this is slow df = pd.DataFrame(raw_data_list) # this is slow df = df.rename(columns=df.loc[0]).iloc[1:] print df.head() l = list(df.columns) print df.head() head = self.tableWidget_3.horizontalHeader() head.setStretchLastSection(True) nrow = len(df.index) if nrow > 100: nrow = 100 else: nrow = nrow #self.datatable = QtGui.QTableWidget(parent=self) self.tableWidget_3.setColumnCount(len(df.columns)) self.tableWidget_3.setRowCount(nrow) for i in range(nrow): for j in range(len(df.columns)): self.tableWidget_3.setItem( i, j, QtGui.QTableWidgetItem(str(df.iget_value(i, j)))) self.tableWidget_3.setHorizontalHeaderLabels(l) self.headerName = l self.nonSelectedVariables = self.headerName self.data = df st = str(nrow) + " of " + str(len(df.index)) + " rows has been shown" self.label.setText(st) self.label.setVisible(True) self.initDict() self.initComboBox()
def main_function():
    """Load RevisedDataFile.sav, preprocess the features, and run each ML
    model in turn, appending results to cornelius_data.txt."""
    rawData = list(spss.SavReader('RevisedDataFile.sav', returnHeader=True))
    pdData = pd.DataFrame(data=rawData[1:], columns=rawData[0]).dropna()
    # BUG FIX: the ``sq10`` column list was computed and never used; removed.
    xData_pre = pdData.drop(labels=['K6b', 'respid', 'sq08x1_1', 'sq08x1_2'],
                            axis=1)
    yData = pdData.filter(regex='K6b')
    xData = preprocess_data(xData_pre)
    xData.to_csv('xData.csv')
    yData.to_csv('yData.csv')
    assert len(xData) == len(yData)  # must be true for any ML model to work
    # BUG FIX: the output file handle leaked if learnData raised; a
    # with-block guarantees it is closed.
    with open('cornelius_data.txt', 'w') as f_obj:
        for MLtype in ['LSVC', 'MNB', 'LR', 'MLPC']:
            learnData(xData, yData, f_obj, MLtype)
def loadSav(filename):
    """Load a .sav file into a DataFrame with decoded column names.

    NOTE(review): ``reader.next()`` consumes one record before
    ``reader.all()`` is called; whether ``all()`` restarts from the first
    case or continues after the consumed record determines if ``df.iloc[0]``
    below really is the header row or the first data row -- confirm against
    the savReaderWriter version in use (``header`` itself is never used).
    """
    with srw.SavReader(filename, returnHeader = True) as reader:
        # returnHeader=True: the first record holds the variable names
        header = reader.next()
        df = pd.DataFrame(reader.all())
        ## set header row as column names
        df = df.rename(columns = df.iloc[0]).drop(df.index[0])
        ## remove 'b character in front of column names due to
        ## some weird utf-8 encoding issue
        temp = [i.decode("utf-8") for i in df.columns.get_values()]
        df.columns = temp
        return(df)
def opensavfile(file):
    '''
    Read and/or process SPSS files

    :param file: sav data or path to file to open
    :return:
    '''
    text = []
    with spss.SavReader(file, ioUtf8=True) as reader:
        for line in reader:
            text.append(' '.join(str(element) for element in line))
    joined = ''.join(text)
    # BUG FIX: ``.strip`` was missing its call parentheses, so ``t`` was a
    # bound method instead of the stripped string.
    t: str = re.sub("[\\d,.\S]+", "", joined).strip()
    # BUG FIX: ``s`` was initialised to an empty set and findNER() was
    # guarded by ``len(s) > 0`` -- never true, so named-entity recognition
    # never ran.  Run it unconditionally instead.
    s: set = findNER(joined)
    if len(s) != 0:
        print(file)
def extract_sav_data(sav_file, ioLocale='en_US.UTF-8', ioUtf8=True):
    """ see parse_sav_file doc """
    # Python 2 variant of this helper (it uses the py2-only ``unicode``
    # type); the engine-aware version elsewhere in this file supersedes it.
    with sr.SavReader(sav_file, returnHeader=True, ioLocale=ioLocale, ioUtf8=ioUtf8) as reader:
        # first record is the header row
        header = next(reader)
        dataframe = pd.DataFrame.from_records(reader, coerce_float=False)
        dataframe.columns = header
        for column in header:
            # NOTE(review): ``isinstance(dataframe[column].dtype, np.object)``
            # is always True (np.object aliased the builtin ``object``), so
            # every column is inspected here -- confirm whether
            # ``dtype == object`` was the intended test.
            if isinstance(dataframe[column].dtype, np.object):
                # Replace None with NaN because SRW returns None if casting dates fails (dates are of type np.object))
                values = dataframe[column].dropna().values
                if len(values) > 0:
                    if isinstance(values[0], unicode):
                        dataframe[column] = dataframe[column].dropna().map(unicode.strip)
                    elif isinstance(values[0], str):
                        # savReaderWriter casts dates to str
                        dataframe[column] = dataframe[column].dropna().map(str.strip)
                # creating DATETIME objects should happen here
        return dataframe
def writer_data(filepath, filename, valuetypes):
    """Read every row of the .sav file *filepath* and insert it into the
    database table *filename*.

    valuetypes[j] selects the conversion for column j: "DATETIME"/"DATE"
    become formatted date strings, "VARCHAR" is stored as JSON.
    """
    res = writer_data_table()
    try:
        # ioUtf8=True: without it, Chinese characters come back as escaped
        # hex and are much harder to handle.
        with savReaderWriter.SavReader(filepath, ioUtf8=True) as read:
            my_time = my_datetime()
            for row in read:
                # the database does not accept unicode directly, so convert
                for j, vtype in enumerate(valuetypes):
                    if vtype in ("DATETIME", "DATE"):
                        # both date-like types had identical handling; merged
                        row[j] = my_time.become_str(row[j])
                    elif vtype == "VARCHAR":
                        # store varchar values as JSON
                        row[j] = json.dumps(row[j])
                res.insert_sql(filename, row)
    finally:
        # BUG FIX: res.close() used to be skipped when reading or inserting
        # raised, leaking the table handle.
        res.close()
def sav_to_dataframe(file):
    """ Converts a sav file to a pandas dataframe """
    if not os.path.isfile(file):
        print('File not exists!')
        # BUG FIX: the exceptions were raised without any message; include
        # the offending path so callers can see what failed.
        raise FileNotFoundError(file)
    if not file.endswith('.sav'):
        print('It is not a sav file!')
        # kept as FileNotFoundError for backward compatibility with callers,
        # although ValueError would describe a wrong extension better
        raise FileNotFoundError(file)
    with spss.SavReader(file) as reader:
        print("Reading file:", file, "...")
        records = [line for line in reader]
    return pd.DataFrame(records)
def leerSPSS():
    """For each SPSS record, find the matching signature image in the
    'firmapruebas' folder and rename its counterpart in 'firmapruebas16' to
    "<col2>_<col4>_<int(col5)>_<col3>.png".

    NOTE(review): the match key is built from record columns 2, 4 and 3
    versus parts 0, 1 and 3 of the underscore-split file name -- confirm
    this column/part mapping against the questionnaire layout.
    """
    with savReaderWriter.SavReader(archivo, ioLocale='Spanish_Spain.1252') as reader:
        for line in reader:
            # lookup key: form code assembled from record columns 2, 4, 3
            codigo = str(line[2]).strip() + str(line[4]).strip() + str(
                line[3]).strip()
            for firma in os.listdir(
                    os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                 'firmapruebas')):
                # file names look like "a_b_c_d.ext"; dots are folded into
                # underscores before splitting, then parts 0+1+3 are compared
                formulario = firma.replace('.', '_').split('_')
                formulariobus = formulario[0] + formulario[1] + formulario[3]
                if codigo == formulariobus:
                    # new canonical name for the signature image
                    rutanueva = str(line[2]).strip() + "_" + str(
                        line[4]).strip() + "_" + str(int(
                            line[5])).strip() + "_" + str(
                                line[3]).strip() + ".png"
                    os.rename(
                        os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                     'firmapruebas16', firma),
                        os.path.join('c:\\', 'dropbox', 'sag', 'dvd',
                                     'firmapruebas16', rutanueva))
def mocassin_fail_amiy(j, username, diffuse, directoryname, outfoldername,
                       starname):
    """Record a failed mocassin run: bump the AMIY run counter and copy the
    run's output files into a '*_FAIlED' directory.

    :param j: index of the failed line in AMIY_input.txt
    :param diffuse: per-run flags; truthy selects 'SN', falsy 'RSG'
    """
    print("RUN FAILED! Writing output.")
    # BUG FIX: ``"..." + (j + 1) + "..."`` raised TypeError (str + int);
    # the line number is converted to str now.
    print("Failed on line number " + str(j + 1) + " of AMIY_input.txt")
    base = '/Users/' + username + '/mocassin-rw_changes'
    with srw.SavReader(base + '/AMIY_number.sav') as reader:
        AMIY_number = reader.next()
    run_id = ssi(AMIY_number)  # was ``id``, which shadowed the builtin
    AMIY_number += 1
    # NOTE(review): SavWriter normally takes varNames/varTypes and is used
    # as a context manager -- confirm this call actually persists the counter.
    srw.SavWriter(base + '/AMIY_number.sav', AMIY_number)
    star_type = 'SN' if diffuse[j] else 'RSG'  # was ``type`` (builtin shadow)
    directoryname = (base + '/output/' + star_type + '/' + run_id + '_' +
                     starname + '_FAIlED')
    os.system("mkdir " + directoryname)
    outfoldername = star_type + "/" + run_id + '_' + starname + '_FAIlED'
    os.chdir(base + '/output')
    os.system('cp dustGrid.out ' + directoryname + '/dustGrid_' + run_id +
              '.out.txt')
    os.system('cp runinfo.txt ' + directoryname + '/runinfo_' + run_id +
              '.txt')
    os.system('cp SED.out ' + directoryname + '/SED_' + run_id + '.out.txt')
    if diffuse[j]:
        os.system('cp equivalentTau.out ' + directoryname +
                  '/equivalentTau_' + run_id + '.out.txt')
    else:
        os.system('cp tauNu.out ' + directoryname + '/tauNu_' + run_id +
                  '.out.txt')
    os.system('cp ' + base + '/input/input.in ' + directoryname +
              '/input_' + run_id + '.in.txt')
    os.system('cp ' + base + '/input/ndust/nDUST ' + directoryname +
              '/nDUST_' + run_id + '.in.txt')
def read_sav(filename, columns, nrows=0):
    """Read *columns* from a .sav file into a DataFrame indexed by SUBJID.

    Rows whose first field is < 1996.0 are skipped.  SPSS system-missing
    values become NaN.

    :param nrows: stop after raw row index nrows-1; 0 means read everything
    """
    # BUG FIX: the reader was never closed; use it as a context manager.
    with s.SavReader(filename, rawMode=True) as reader:
        header = reader.getHeader(None)
        indices = [header.index(col) for col in columns]
        data = []
        for i, line in enumerate(reader):
            if line[0] < 1996.0:
                continue
            data.append([line[index] for index in indices])
            if i == nrows - 1:
                break
    df = pd.DataFrame(data=data, columns=columns)
    # SPSS system-missing sentinel (the lowest IEEE double)
    na = -1.7976931348623157e+308
    df.replace(na, np.nan, inplace=True)
    # assumes 'SUBJID' is among *columns* -- TODO confirm with callers
    df.index = df.SUBJID
    return df
def write_ms_access_file(savFilename, mdbFilename=None, overwrite=True):
    """Write the actual MS Access file.

    :param savFilename: source SPSS .sav file
    :param mdbFilename: target .mdb; derived from savFilename when omitted
    :param overwrite: remove an existing target file first
    :raises EnvironmentError: on non-Windows platforms
    """
    if not sys.platform.startswith("win"):
        raise EnvironmentError("Sorry, Windows only")
    if not mdbFilename:
        mdbFilename = os.path.splitext(savFilename)[0] + ".mdb"
        mdbFilename = mdbFilename.replace(" ", "_")
    if os.path.exists(mdbFilename) and overwrite:
        os.remove(mdbFilename)
    create_table = sql_create_table(savFilename)
    insert_table = sql_insert_template(savFilename)
    pypyodbc.win_create_mdb(mdbFilename)
    conn_string = 'Driver={Microsoft Access Driver (*.mdb)};DBQ=%s'
    # BUG FIX: connect() used to run inside the try-block, so a failed
    # connection left ``connection`` unbound and the finally-clause raised
    # NameError, masking the real error.  Connect first, then guard the work.
    connection = pypyodbc.connect(conn_string % mdbFilename)
    try:
        cursor = connection.cursor()
        cursor.execute(create_table)
        with savReaderWriter.SavReader(savFilename) as reader:
            for record in reader:
                cursor.execute(insert_table, tuple(record))
        cursor.commit()
    finally:
        connection.close()