Beispiel #1
0
    def read(self):
        """
        Read the SAS dataset and return as a Pandas Data Frame or an iterator to read the file in chunks.
        """
        self.reader = None
        retry = False

        # If encoding is not specified, we try some common codecs 
        if self.encoding is None:
            # Try encoding with each of the default codecs
            for cp in self.default_encoding:
                try:
                    self.reader = pd.read_sas(self.filepath, encoding=cp, **self.read_sas_kwargs)
                    self.encoding = cp
                    break
                except UnicodeDecodeError:
                    continue
                except (OverflowError, ValueError) as e:
                    # Pandas has some known limitations when reading SAS files 
                    # https://github.com/pandas-dev/pandas/issues/20927
                    # https://github.com/pandas-dev/pandas/issues/16615
                    self._print_exception("Exception when reading the file with pandas. A second attempt will be made using the SAS7BDAT module", e)
                    retry = True
                    break

            # If pandas failed to read the file we retry with the SAS7BDAT module
            if retry:
                for cp in self.default_encoding:
                    try:
                        handle = SAS7BDAT(self.filepath, skip_header=False, encoding=cp, encoding_errors="strict")
                        self.reader = handle.to_data_frame()
                        handle.close()
                        self.encoding = cp
                        break
                    except UnicodeDecodeError:
                        handle.close()
                        continue
                
        # Instantiate the reader if we haven't already done so
        if self.reader is None:
            try:
                self.reader = pd.read_sas(self.filepath, **self.read_sas_kwargs)
            except (OverflowError, ValueError) as e:
                self._print_exception("Exception when reading the file with pandas. A second attempt will be made using the SAS7BDAT module", e)
                
                # Check if encoding has been specified
                cp = self.encoding
                if cp is None:
                    cp = 'utf_8'                    

                # If pandas failed to read the file we retry with the SAS7BDAT module
                handle = SAS7BDAT(self.filepath, skip_header=False, encoding=cp, encoding_errors="ignore")
                self.reader = handle.to_data_frame()
                handle.close()

        # Send metadata on the result to Qlik
        self._send_table_description()

        # Read the SAS dataset
        return self.reader
Beispiel #2
0
def sas2sqlite(sasfile, sqlitedb):
    import sqlite3
    from sas7bdat import SAS7BDAT
    # Read data from SAS
    f = SAS7BDAT(sasfile)
    x = f.header.cols
    y = [''] * len(x)

    for i, n in enumerate(x):
        if n[1][2] == "numeric":
            y[i] = n[0] + ' real'
        else:
            y[i] = n[0] + ' varchar({})'.format(n[1][1])

    _table = f.header.dataset.title()
    cmd1 = "CREATE TABLE {} ({})".format(_table, ', '.join(y))
    cmd2 = 'INSERT INTO {} VALUES ( {} )'.format(_table,
                                                 ','.join(['?'] * len(x)))
    conn = sqlite3.connect(sqlitedb)
    c = conn.cursor()

    for i, line in enumerate(f.readData()):
        if i == 0:
            c.execute('DROP TABLE IF EXISTS {}'.format(_table))
            c.execute(cmd1)
        else:
            c.execute(cmd2, line)
    conn.commit()
    c.close()
Beispiel #3
0
def read_sas7bdata_pd(fname):
    data = []
    with SAS7BDAT(fname) as f:
        for row in f:
            data.append(row)

    return pd.DataFrame(data[1:], columns=data[0])
Beispiel #4
0
def _read_sas(filename):
    filename = os.path.join(_WS[0], filename)
    with SAS7BDAT(filename) as f:
        reader = f.readlines()
        header = [c.strip() for c in next(reader)]
        for line in reader:
            yield {k: v for k, v in zip(header, line)}
Beispiel #5
0
def main():

    usage = "\n%prog  [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--sasFile",
                      action="store",
                      dest="sasFile",
                      help="Input sas file")
    parser.add_option("-o",
                      "--csvFile",
                      action="store",
                      dest="csvFile",
                      help="Output csv file")

    (options, args) = parser.parse_args()
    for file in ([options.sasFile, options.csvFile]):
        if not (file):
            parser.print_help()
            sys.exit(0)

    sasFile = options.sasFile
    outFile = options.csvFile
    FW = csv.writer(open('outFile', 'wb'))
    with SAS7BDAT(sasFile) as f:
        for row in f:
            FW.writerow(row)
def ex_11():
    """
    Importing SAS files
    In this exercise, you'll figure out how to import a SAS file as a DataFrame using SAS7BDAT and pandas. The file
    'sales.sas7bdat' is already in your working directory and both pandas and matplotlib.pyplot have already been
    imported as follows:

    import pandas as pd
    import matplotlib.pyplot as plt

    The data are adapted from the website of the undergraduate text book Principles of Economics by Hill, Griffiths
    and Lim.
    :return:
    """
    # from sas7bdat import SAS7BDAT -> at the top with the imports

    # Save file to a DataFrame: df_sas
    with SAS7BDAT('sales.sas7bdat') as file:
        df_sas = file.to_data_frame()

    # Print head of DataFrame
    print(df_sas.head())

    # Plot histogram of DataFrame features (pandas and pyplot already imported)
    pd.DataFrame.hist(df_sas[['P']])
    plt.ylabel('count')
    plt.show()
def region_iteratior(base_path, file_name):
    """Iterate over all region folders.

    The data we received is split over 26 folders for the individual regions.
    This iterator helps to load a specific data file for each region and to
    extract the region ID from the path.

    base_path -- folder containing the folders for individual regions
    file_name -- which of the 12 files per folder should be loaded
    returns -- iterator of (region_id, DataFrame) tuples
    """
    input_path = os.path.join(base_path, '**', file_name)
    for f_name in glob.glob(input_path):
        region_id = re.search(r'/Reg(\d+)/', f_name).group(1)
        _LOGGER.info('>>> importing data for region %s', region_id)
        _LOGGER.info('reading from SAS file')
        if file_name.endswith('.sas7bdat'):
            data_frame = SAS7BDAT(f_name).to_data_frame()
        elif file_name.endswith('.csv'):
            data_frame = pd.DataFrame.from_csv(f_name)
        else:
            raise ValueError(
                'Expected sas7bdat or csv files only, got {}'.format(
                    file_name))
        yield (region_id, data_frame)
Beispiel #8
0
    def creatNewPay(self, filename):
        print(" start " + str(datetime.datetime.now()))

        result_file = open(filename, 'wt')
        for source in glob.glob(self.settings['pay_old']['location'] +
                                self.settings['pay_old']['pattern']):
            print("processing " + source)
            i = source.index('payee_pmt_line_')
            year = int(source[i + 15:i + 19])
            c = 0
            count = 0
            with SAS7BDAT(source, skip_header=True) as reader:
                for row in reader:
                    row.append(year)
                    result_file.write(", ".join(map(str, row)) + '\n')
                    c += 1
                    if c >= int(self.settings['pay_old']['blocksize']):
                        print("     " + str(count) + "   " +
                              str(datetime.datetime.now()))
                        count += 1
                        blocks = int(self.settings['pay_old']['blocks'])
                        if blocks > 0 and count > blocks:
                            break
                        c = 0
            print("     " + str(count) + "   " + str(datetime.datetime.now()))

        print("   end " + str(datetime.datetime.now()))
        result_file.close()
Beispiel #9
0
def main():

    usage = "\n%prog  [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i",
                      "--sasFile",
                      action="store",
                      dest="sasFile",
                      help="Input sas file")

    (options, args) = parser.parse_args()
    for file in ([options.sasFile]):
        if not (file):
            parser.print_help()
            sys.exit(0)

    sasFile = options.sasFile
    with SAS7BDAT(sasFile) as f:
        for row in f:
            line = str(row[0])
            for i in range(1, len(row)):
                row[i] = str(row[i])
                row[i] = row[i].replace(",", ";")
                line += ", %s" % (str(row[i]))
            print(line)
Beispiel #10
0
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        if dai_file_path is None:
            return []
        # check the datatype of user-defined columns
        if not isinstance(dai_file_path, str):
            raise ValueError("Variable: 'dai_file_path' should be <str>")
        if not isinstance(file_names, list):
            raise ValueError("Column: 'file_names' should be <list>")

        # TODO: add checks that files exist
        data_sets = {}

        for f in file_names:
            full_path_to_file = os.path.join(dai_file_path, f)

            if not os.path.exists(full_path_to_file):
                raise ValueError("File <<" + full_path_to_file +
                                 ">> does not exists!")

            with SAS7BDAT(full_path_to_file, skip_header=False) as reader:
                X = reader.to_data_frame()
                print(X.head())
                data_sets.update({f: X})

            return data_sets
Beispiel #11
0
def conditionsymptoms(sclst, condlst, filepath):

    with SAS7BDAT(filepath) as f:
        intrv = f.to_data_frame()

    symcond = intrv[sclst]

    symcond['C3Q03'] = symcond['C3Q03'].apply(lambda x: (-1)
                                              if x in [6, 7] else x)
    symcond['C3Q03'] = symcond['C3Q03'].apply(lambda x: 3
                                              if np.isnan(x) else x)

    symcond.fillna(0, inplace=True)

    conditions = symcond[condlst]

    conditions.columns = [
        'IDNUMXR', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
        18
    ]

    condition = pd.melt(conditions, id_vars='IDNUMXR')

    condition = condition[condition.value != 0]

    condlst.remove('IDNUMXR')
    symcond.drop(condlst, axis=1, inplace=True)

    return symcond, condition
Beispiel #12
0
 def read_data(self):
     """Use the package sas7bdata to read the meta information from every
     sas file under the specified directory
     """
     data = OrderedDict()
     for x in self.sas_files:
         current_sas_name = x.replace('.sas7bdat', '')
         current_file = os.path.join(self.directory, x)
         current = SAS7BDAT(current_file)
         statinfo = os.stat(current_file)
         meta = [['Position', 'Name', 'Type', 'Length', 'Format', 'Label']]
         for i, col in enumerate(current.header.parent.columns, 1):
             _metaline = [i] + map(self.to_unicode, [col.name, col.type, \
                 col.length, col.format, col.label])
             meta.append(_metaline)
         _detail = current.header.properties.__dict__
         blacklist = ['col_count_p1', 'col_count_p2', 'lcp', 'lcs', \
             'filename', 'endianess', 'u64']
         detail = OrderedDict()
         for k in sorted(_detail):
             if k in blacklist:
                 continue
             v = _detail[k]
             if isinstance(v, str):
                 v = v.lower()
             detail[k.upper()] = v
         detail['FILE_SIZE'] = self.convert_size(statinfo.st_size)
         data[current_sas_name] = {'meta': meta, 'detail': detail}
         current.close()
     return data
Beispiel #13
0
    def loadSAS(self):
        try:
            f = SAS7BDAT(str(self.filename))
            df = f.to_data_frame()
        except:
            print "Unexpected error occured"
            return

        l = list(df.columns)
        print l
        head = self.tableWidget_3.horizontalHeader()
        head.setStretchLastSection(True)
        nrow = len(df.index)
        if nrow > 100:
            nrow = 100
        else:
            nrow = nrow

        #self.datatable = QtGui.QTableWidget(parent=self)
        self.tableWidget_3.setColumnCount(len(df.columns))
        self.tableWidget_3.setRowCount(nrow)
        for i in range(nrow):
            for j in range(len(df.columns)):
                self.tableWidget_3.setItem(
                    i, j, QtGui.QTableWidgetItem(str(df.iget_value(i, j))))
        self.tableWidget_3.setHorizontalHeaderLabels(l)

        self.headerName = l
        self.nonSelectedVariables = self.headerName
        self.data = df
        st = str(nrow) + " of " + str(len(df.index)) + " rows has been shown"
        self.label.setText(st)
        self.label.setVisible(True)
        self.initDict()
        self.initComboBox()
Beispiel #14
0
def main(options, args):
    logLevel = logging.DEBUG if options.debug else logging.INFO
    input_paths = [args[0]]
    if len(args) == 1:
        output_paths = ['%s.csv' % os.path.splitext(args[0])[0]]
    elif len(args) == 2 and (args[1] == '-'
                             or args[1].lower().endswith('.csv')):
        output_paths = [args[1]]
    else:
        assert all(x.lower().endswith('.sas7bdat') for x in args)
        input_paths = args
        output_paths = ['%s.csv' % os.path.splitext(x)[0] for x in input_paths]
    assert len(input_paths) == len(output_paths)
    for input_path, output_path in zip(input_paths, output_paths):
        parser = SAS7BDAT(input_path, logLevel=logLevel)
        if options.header:
            parser.logger.info(str(parser.header))
            continue
        rows_iter = parser.readData()
        columns = rows_iter.next()
        data_frame = pd.DataFrame(
            (row for row in rows_iter if row),
            columns=columns,
        )
        print data_frame
Beispiel #15
0
def SAS2DataFrame(inputFullPath, year=[]):
    with SAS7BDAT(inputFullPath, skip_header=False) as reader:
        df = reader.to_data_frame()
        df.columns = [col.lower() for col in df.columns]
        if (not not year) & any(df.columns == 'wave'):
            df = df[df['wave'] == year]
    return df
Beispiel #16
0
def main():

    with SAS7BDAT(IN_FILE) as f:
        raw = [row for row in f][1:]

    # delete if the returns in the beginning years are empty
    country = ""
    start = False
    for obs in raw:
        if obs[0] != country:
            start = False
            country = obs[0]
        if obs[-1] != None:
            start = True
        if start:
            data.append(obs)

    # if ret[i] is empty, ret[i] = avg(ret[:i-1])
    for i in range(len(data)):
        obs = data[i]
        if obs[-1] == None:
            pre = [x[-1] for x in data[:i] if x[0] == obs[0]]
            obs[-1] = sum(pre) / len(pre)

    # output file
    with open(OUT_FILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['country', 'year', 'ret', 'ret2'])
        for obs in data:
            writer.writerow(obs)
Beispiel #17
0
def get_list2():
    with open("tickers2.txt", 'w', encoding='utf8') as wf:
        with SAS7BDAT("data_fin_merged.sas7bdat") as rf:
            l = [row[1] for row in rf]
            print(l[:5])
            tickers = set(l[1:])
            for t in tickers:
                wf.write(t + '\n')
Beispiel #18
0
def read_from_sas(filename):
    """
    Return a dataframe from given SAS table.
    https://pypi.python.org/pypi/sas7bdat
    """
    with SAS7BDAT(filename + ".sas7bdat") as f:
        df = f.to_data_frame()
    return df
def merge_qc(rtdf, cog_file, qcVars):
    """ Merge Reaction Time data with metadata from core dataset. This includes 
    rater Z score, computer, complete and time administered."""
    with SAS7BDAT(cog_file) as f:
        cogdf = f.to_data_frame()
    rtdf.index.names = ['vetsaid', 'TrialType']
    rt_qc = rtdf.join(cogdf[qcVars].set_index('vetsaid'), how='left')
    return rt_qc
Beispiel #20
0
 def read_next(self):
     with SAS7BDAT(self.fp) as fh:
         for i, line in enumerate(fh):
             columns = {x: i for i, x in enumerate(line)}
             yield TextItem(
                 meta_list=[line[columns[m]] for m in self.meta],
                 text=[line[columns[t]] for t in self.text]
             )
Beispiel #21
0
    def _send_table_description(self, func=None):
        """
        Send the table description to Qlik as meta data.
        Only used when the SSE is called from the Qlik load script.
        """
        
        # Set up the table description to send as metadata to Qlik
        self.table = SSE.TableDescription()
        
        if func is None:
            self.table.name = "SAS_Dataset"

            if isinstance(self.reader, pd.DataFrame):
                self.sample_data = self.reader.head(5)
            else:
                # Read the SAS file to get sample data
                sample_response = pd.read_sas(self.filepath, format=self.format, encoding=self.encoding, chunksize=5)
            
                # Get the first chunk of data as a Pandas DataFrame
                self.sample_data = sample_response.__next__()

                # Close the file reader
                sample_response.close()
            
            # Fetch field labels from SAS variable attributes if required
            # This may fail for wide tables due to meta data limits. For such cases use the get_labels function.
            if self.labels:
                # Use the sas7bdat library to read the file
                handle = SAS7BDAT(self.filepath, skip_header=False)

                # Get labels for the variables
                labels = [col.label.decode(self.encoding) for col in handle.columns]

                # Close the file reader
                handle.close()
            else:
                # Get the variable names from the sample data
                labels = self.sample_data.columns
            
            # Set field names 
            for col in labels:
                # Set up fields for the table
                self.table.fields.add(name=col)
            
            if self.debug:
                self._print_log(2)
        
        elif func == "get_labels":
            self.table.name = "SAS_Labels"
            self.table.fields.add(name="variable")
            self.table.fields.add(name="label")
        
            if self.debug:
                self._print_log(4)

        # Send table description
        table_header = (('qlik-tabledescription-bin', self.table.SerializeToString()),)
        self.context.send_initial_metadata(table_header)
Beispiel #22
0
def load_sas(fn, schema):
    '''Load a SAS file according to the given schema.

    Parameters:
        fn : str
            Path to the SAS file.
        schema : DataFrame
            Data frame containing the schema of the study.

    Returns:
        df_sub : DataFrame
            The data available in the SAS file, subset to the columns listed in
            the schema.
    '''
    with SAS7BDAT(fn) as f:
        df = f.to_data_frame()
        if 'id' in df.columns:
            df['id'] = df['id'].astype(int)
            df['centre'] = np.asarray(df['id']) // 1000
            df = df.set_index(['centre', 'id'])
        elif 'centre' in df.columns:
            print('WARNING: no id column found in df {}'.format(fn))
            df['id'] = -9
            df['centre'] = df['centre'].astype(int)
            df = df.set_index(['centre', 'id'])
        else:
            print('WARNING: no id column found in df {}'.format(fn))
            df['id'] = -9
            df['centre'] = -9
            df = df.set_index(['centre', 'id'])
    schema_cols = list(schema.index)
    cols = [c for c in df.columns if c in schema_cols]
    dtypes = schema.loc[cols].groupby('Name').first()['Type']
    df_sub = df[cols]
    for col in dtypes.index:
        if dtypes[col] == 'int':
            try:
                df_sub[col] = df_sub[col].astype(pd.Int64Dtype())
            except:
                msg = 'error converting int column {}, leaving as float'
                print(msg.format(col))
                try:
                    df_sub[col] = df_sub[col].astype(float)
                except:
                    print('not a valid float either! dropping {}'.format(col))
                    df_sub = df_sub.drop(col, axis=1)
        elif dtypes[col] == 'choice' or dtypes[col] == 'check':
            try:
                df_sub[col] = pd.Categorical(df_sub[col].astype(
                    pd.Int64Dtype()))
            except:
                msg = 'error converting categorical column {}, dropping it'
                print(msg.format(col))
                df_sub = df_sub.drop(col, axis=1)
        elif dtypes[col] == 'string' or dtypes[col] == 'date':
            df_sub = df_sub.drop(col, axis=1)

    return df_sub
Beispiel #23
0
def sas_to_csv(dirt,dataset):
    print("\n\nReading data from",dirt+"sasdat/" + dataset+".sas7bdat")
    with SAS7BDAT(dirt+"sasdat/" + dataset+".sas7bdat") as f:
        df = f.to_data_frame()
    print("\n\nData description:\n\n",df.describe())
    cols = df.columns
    df.to_csv(dirt+dataset+'d.csv',encoding = 'utf-8',index = False,header =True)
    print("\n\nCheck column\n\n",cols)
    return df
Beispiel #24
0
    def sas_to_csv(self):
        print("\n\nReading data from",
              self.path + self.taskname + '/' + self.dataname)
        with SAS7BDAT(self.path + self.taskname + '/' + self.dataname +
                      '.sas7bdat') as f:
            df = f.to_data_frame()
        print("\n\nData description:\n\n", df.describe())

        return df
def importing_sas_files():
    with SAS7BDAT('data/sales.sas7bdat') as file:
        df_sas = file.to_data_frame()

    print(df_sas.head())
    print(df_sas.info())

    pd.DataFrame.hist(df_sas[['P']])
    plt.ylabel('count')
    plt.show()
def merge_qc(axcptdf, cog_file, qcVars):
    """ Merge AX-CPT data with metadata from core dataset. This includes 
    rater Z score, computer, version, complete and time administered."""
    with SAS7BDAT(cog_file) as f:
        cogdf = f.to_data_frame()
    axcpt_qc = pd.merge(axcptdf,
                        cogdf[qcVars],
                        left_index=True,
                        right_on='vetsaid',
                        how='left')
    return axcpt_qc
Beispiel #27
0
def sas2pd(sasfile):
    import pandas as pd
    from sas7bdat import SAS7BDAT
    a = []
    for i, x in enumerate(SAS7BDAT(sasfile).readData()):
        if i == 0:
            cols = x
        else:
            a.append(x)
    df = pd.DataFrame(a)
    df.columns = cols
    return df
Beispiel #28
0
def sasConvert(bucket, key):
    print('--Loading function')
    try:
        print('--Acquiring Current Time')
        t = datetime.now()
        formatted_time = t.strftime('%d-%m-%y%H%M%S')
        print('--Formatted Time: ' + formatted_time)

        if key[-9:] == '.sas7bdat':
            print('--Generating File Names')
            keySubExt = key[:-9]
            keyName = keySubExt.rsplit('/', 1)
            tmpFile = 'sastocsv/tmp/' + keyName[
                1] + '-' + formatted_time + '.sas7bdat'
            tmpOutFile = 'sastocsv/output/' + keyName[
                1] + '-' + formatted_time + '.csv'
            s3OutName = keyName[0] + '/' + 'output/' + keyName[
                1] + '-' + 'convertedSAS' + '-' + formatted_time + '.csv'

            # Print filenames
            print('*keySubExt: ' + keySubExt)
            print('*keyName: ' + keyName[0] + '   2: ' + keyName[1])
            print('*tmpFile: ' + tmpFile)
            print('*tmpOutFile: ' + tmpOutFile)
            print('*s3OutName: ' + s3OutName)

            print('--Downloading SAS7BDAT File')
            s3.meta.client.download_file(bucket, key, tmpFile)

            print('--Reading SAS7BDAT File to DataFrame')
            with SAS7BDAT(tmpFile) as f:
                df = f.to_data_frame()

            print('--Converting to CSV File')
            df.to_csv(tmpOutFile, index=False)

            print('--Uploading to S3 Bucket')
            s3.meta.client.upload_file(tmpOutFile, bucket, s3OutName)

            os.remove(tmpFile)
            os.remove(tmpOutFile)

            return (s3OutName)
        else:
            print('Invalid File Type')
            return ('Invalid File Type')

    except Exception as e:
        print(e)
        print('Error converting file to CSV')
        raise e
        return (e)
Beispiel #29
0
    def sas2meta(self, encode):
        '''encode like latin
        localpath is a path to the folder where the dataset is stored 
        '''
        if self.isS3 == True:
            self.load_sas_from_s3()  ## bring SAS to local under data folder
            sas = SAS7BDAT(os.path.join('data', self.dataset), encoding=encode)
        elif self.isS3 == False:
            sas = SAS7BDAT(os.path.join(self.localpath, self.dataset),
                           encoding=encode)

        varlist = list(map(lambda i: i.name.decode("utf-8"), sas.columns))
        variablelabels = list(
            map(lambda i: i.label.decode("utf-8"), sas.columns))
        df = pd.DataFrame({
            'ID':
            list(map(lambda i: i.name.decode("utf-8").upper(), sas.columns)),
            'text':
            list(map(lambda i: i.label.decode("utf-8"), sas.columns))
        })
        df['text'] = df["ID"] + ' ' + df["text"]
        df['text'] = df['text'].str.replace('_', ' ')
        return df
Beispiel #30
0
 def read(self):
     logging.info('Getting Terms and Negation.')
     res = []
     with SAS7BDAT(self.fp) as fh:
         columns = fh.column_names_strings
         for row in fh:
             res.append([
                 row[columns['id']],
                 row[columns['text']],
                 row[columns['cui']],
                 row[columns['valence']],
                 row[columns['regexvariation']],
                 row[columns['wordorder']],
             ])
     return res