def _parse_value_with_corresponding_parser_(self, value, col):
    col_parser_name = 'parse_' + str(col)
    man_log.debug('parsing %s from %s using %s' % (col, value, col_parser_name))
    col_parser = getattr(self, col_parser_name, self.default_parser)
    return col_parser(value, col)
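# --- Hedged usage sketch (not part of the manager code) ---
# The dispatch above resolves a method named 'parse_' + <column name> via getattr
# and falls back to default_parser. A minimal, self-contained sketch of that
# pattern; the names DemoManager and parse_subjectid are made up for illustration.
class DemoManager:
    def default_parser(self, value, col):
        # fallback: keep the raw value as a string
        return str(value)

    def parse_subjectid(self, value, col):
        # hypothetical column-specific parser: strip whitespace, zero-pad the id
        return value.strip().zfill(6)

    def _parse_value_with_corresponding_parser_(self, value, col):
        parser = getattr(self, 'parse_' + str(col), self.default_parser)
        return parser(value, col)


if __name__ == '__main__':
    demo = DemoManager()
    print(demo._parse_value_with_corresponding_parser_(' 42 ', 'subjectid'))  # '000042'
    print(demo._parse_value_with_corresponding_parser_('yes', 'smoker'))      # 'yes'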
def interview_date_write_formatter(self, dateobj, coldef):
    if isinstance(dateobj, self.NoDataError):
        return coldef.missing_vals
    if isinstance(dateobj, str):
        man_log.debug("date formatter caught an already-formatted date string")
        return dateobj
    return dateobj.strftime('%m/%d/%Y')
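# --- Hedged usage sketch (illustrative only) ---
# interview_date_write_formatter writes dates as MM/DD/YYYY and passes already-
# formatted strings through; in the real code the missing-value placeholder comes
# from coldef.missing_vals. A standalone sketch with only the standard library:
from datetime import date

def format_interview_date(dateobj, missing_vals='.'):
    # assumption: missing data is represented here by None and a '.' sentinel
    if dateobj is None:
        return missing_vals
    if isinstance(dateobj, str):
        return dateobj
    return dateobj.strftime('%m/%d/%Y')

print(format_interview_date(date(2015, 3, 9)))  # '03/09/2015'
print(format_interview_date('03/09/2015'))      # passes through unchanged
print(format_interview_date(None))              # '.'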
def ensure_row(self, datarow):
    man_log.debug("ensuring data row %s" % datarow)
    for coldef, elem in datarow.items():
        if coldef.required:
            man_log.debug('row[%s](%s) is required' % (coldef, elem))
            if isinstance(elem, self.NoDataError):
                man_log.critical("raising DropRowException: required column %s has no data" % coldef)
                raise DropRowException('%s' % elem)
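# --- Hedged usage sketch (illustrative only) ---
# ensure_row raises DropRowException when a required column holds a NoDataError,
# so a caller can skip that row. In this sketch DropRowException is redefined as
# a plain Exception and the rows/required_cols data is made up for illustration.
class DropRowException(Exception):
    pass

def keep_complete_rows(rows, required_cols):
    kept = []
    for row in rows:
        try:
            for col in required_cols:
                if row.get(col) in (None, ''):
                    raise DropRowException('missing required column %s' % col)
            kept.append(row)
        except DropRowException as e:
            print('dropping row: %s' % e)
    return kept

print(keep_complete_rows([{'id': '1', 'dob': ''},
                          {'id': '2', 'dob': '01/01/2000'}],
                         required_cols=['id', 'dob']))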
def get_filepath(self,
                 save=False,  # whether the file is for output (save) or input (open)
                 title='open file',
                 filetype='file',
                 quit=True,
                 allownew=True,
                 **kwargs) -> str:
    """Generic helper that can be extended: prompt for a filepath and assert
    it is not empty. If it is empty the program quits, unless quit is False,
    in which case an error is raised instead.

    filetype is a string used for error messages and attribute names.

    askopenfilename accepts other kwargs as well; all provided kwargs are
    passed on, for example:
      - defaultextension - str expression for the default extension
      - initialdir - str path to the directory to open in
      - initialfile - str default filename
      - others: see the utils.askopenfilename docs

    TODO: figure out how to disallow/allow creating new files
    """
    fpath = None
    # File types are hard coded to csv and tsv.
    if save:
        fpath = utils.asksaveasfilename(title=title,
                                        filetypes=(("csv files", "*.csv"),
                                                   ("all files", "*.*")),
                                        **kwargs)
    else:
        fpath = utils.askopenfilename(title=title,
                                      filetypes=(("all files", "*.*"),
                                                 ("tsv files", "*.tsv"),
                                                 ("csv files", "*.csv")),
                                      **kwargs)
    # Check path validity; the dialog returns an empty value on cancel.
    if not fpath:
        print('no %s file selected. quitting' % filetype)
        utils.exit()
    setattr(self, filetype, fpath)
    man_log.debug('selected %s to be %s.' % (filetype, fpath))
    return fpath
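# --- Hedged usage sketch (illustrative only) ---
# utils.askopenfilename / utils.asksaveasfilename are assumed here to wrap
# tkinter.filedialog, whose dialogs return an empty string on cancel. A minimal
# standalone equivalent of the "open" branch:
import sys
import tkinter
from tkinter import filedialog

def pick_csv_or_tsv(title='open file'):
    root = tkinter.Tk()
    root.withdraw()  # hide the empty root window behind the dialog
    path = filedialog.askopenfilename(title=title,
                                      filetypes=(("all files", "*.*"),
                                                 ("tsv files", "*.tsv"),
                                                 ("csv files", "*.csv")))
    root.destroy()
    if not path:  # user pressed cancel
        print('no file selected. quitting')
        sys.exit(1)
    return path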
def load_data(self, clear_src=True):
    '''Loads the source data file and stores it in self.data so it can be
    iterated through easily.

    NOTE: if the datafile lives on a server behind a VPN this runs really
    slowly. We should pull the file, save its lines to memory with readlines,
    and close the file, rather than querying it every time.
    '''
    man_log.info('Loading source data into %s' % type(self).__name__)
    # clear the existing source data if requested
    if clear_src:
        self.initialize_data()
    # get the source data in memory and validate it
    data = self._read_data_from_source_()
    if data is not None:
        self.data = data
    else:
        man_log.debug('Data source corrupted.')
        raise Exception("Data source corrupted. Please check the data source")
def _value_is_in_missing_list_(self, value, col_def):
    """Check whether the value is in col_def's missing-value list, if
    col_def has one.

    :param value: data value
    :param col_def: column definition
    :return: True if the value is listed as missing, False otherwise
    """
    if hasattr(col_def, 'missing_vals'):
        man_log.debug('checking if %s in missing vals: %s'
                      % (value, col_def.missing_vals))
        missing = [missing_val.strip()
                   for missing_val in col_def.missing_vals.split(",")]
        return value in missing
    man_log.debug("column: %s doesn't have missing vals" % col_def)
    return False
def write_outfile(self):
    '''Writes self.data to the outfile, which the user provides.'''
    outpath, outfile = self.read_output_file()
    self.write_header(outfile)
    outwriter = utils.DictWriter(outfile, fieldnames=self.col_defs,
                                 delimiter=self.delimiter)
    outwriter.writeheader()
    for rowid, row in enumerate(self.data):
        for coldef, elem in row.items():
            if isinstance(elem, ssManager.NoDataError):
                # print the default value
                elem = ''
            formatter = getattr(self, coldef + '_write_formatter',
                                self.default_write_formatter)
            man_log.debug('trying formatter %s' % (coldef + '_write_formatter'))
            man_log.debug('formatting row[%s][%s](%s) with %s'
                          % (rowid, coldef, row[coldef], formatter.__name__))
            row[coldef] = formatter(row[coldef], coldef)
            man_log.debug('writing row[%s][%s] is %s' % (rowid, coldef, row[coldef]))
        outwriter.writerow(row)
    return outpath
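# --- Hedged usage sketch (illustrative only) ---
# utils.DictWriter is assumed to be csv.DictWriter; the loop above writes one
# output row per parsed data row. A standalone sketch of the same write pattern
# with the standard library (file name and rows are made up):
import csv
from collections import OrderedDict

rows = [OrderedDict([('subjectid', '000042'), ('interview_date', '03/09/2015')]),
        OrderedDict([('subjectid', '000043'), ('interview_date', '')])]

with open('demo_out.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=['subjectid', 'interview_date'],
                            delimiter=',')
    writer.writeheader()
    for row in rows:
        writer.writerow(row)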
def _read_data_from_source_(self):
    '''The implementation of reading data from a local file. It uses a
    dialog to get the data path.

    Overridable: to get data from a different source, override this function.

    :return: a list of ordered dictionaries containing the data
    '''
    data = []
    # open the source file
    srcfile = open(self.get_src_datpath(), errors='ignore')
    srcreader = utils.DictReader(srcfile, delimiter=self.delimiter)
    # assert the file has all the expected fields
    man_log.debug('expected fieldnames: %s' % self.col_defs)
    error_flag = False
    for index, col_name in enumerate(self.col_defs):
        if col_name not in srcreader.fieldnames:
            user_error_log.log_mapping_error(
                col_name, column_id=index + 1,
                message="this field is missing in the data file")
            error_flag = True
    if error_flag:
        raise self.TemplateError(
            ('expected columns not found in source datafile, '
             'with fields: %s') % (list(srcreader.fieldnames)))
    # load each row with each column's parser
    for rowid, datarow in enumerate(srcreader):
        man_log.info('loading row %s' % rowid)
        man_log.debug('parsing row %s : %s' % (rowid, datarow))
        row = utils.OrderedDict()
        for col in self.col_defs:
            try:
                # the parser name is defined as "parse_" + col.col_name,
                # e.g. for source col "subjectID" the parser is "parse_subjectID"
                row[col] = self._parse_value_with_corresponding_parser_(
                    datarow[col], col)
            except Exception as e:
                man_log.debug('Exception while parsing %s: %s' % (col, e))
                row[col] = self.NoDataError('%s' % e)
        data.append(row)
    return data
def default_parser(self, value, coldef):
    '''Simple fallback parser used when no column-specific parser is defined
    (also used when parsing the template).'''
    man_log.debug('parsing [%s] from (%s)' % (coldef, value))
    # the empty string is treated as missing as well
    if value == "" or self._value_is_in_missing_list_(value, coldef):
        # if the value is listed as missing, return a NoDataError placeholder
        man_log.debug('replacing row[%s](%s) with NoData' % (coldef, value))
        return self.NoDataError(('value %s identified as a missing '
                                 'value for col %s') % (value, coldef))
    # 999 problem?
    man_log.debug('parse result is (%s)' % value)
    return str(value)
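# --- Hedged usage sketch (illustrative only) ---
# default_parser turns empty strings and template-listed missing values (a
# comma-separated missing_vals string on the column definition) into a
# placeholder. A standalone sketch of that decision; the missing_vals default
# '999, NA, .' is made up for illustration:
def is_missing(value, missing_vals='999, NA, .'):
    if value == '':
        return True
    missing = [m.strip() for m in missing_vals.split(',')]
    return value in missing

for v in ('999', '42', '', 'NA'):
    print(repr(v), '->', 'missing' if is_missing(v) else 'kept')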
def write_sink_data_with_outwriter(self, data,
                                   outwriter: utils.DictWriter) -> None:
    for rowid, row in enumerate(data):
        for coldef, elem in row.items():
            # get the formatter (mainly for date output); fall back to the
            # default formatter when no column-specific one is provided
            formatter = getattr(self, coldef + '_write_formatter',
                                self.default_write_formatter)
            man_log.debug('trying formatter %s' % (coldef + '_write_formatter'))
            man_log.debug('formatting row[%s][%s](%s) with %s'
                          % (rowid, coldef, row[coldef], formatter.__name__))
            row[coldef] = formatter(row[coldef], coldef)
            man_log.debug('writing row[%s][%s] is %s' % (rowid, coldef, row[coldef]))
        outwriter.writerow(row)
    return None
def _read_data_from_source_(self): """ The implementation of reading data from a local file. It uses a dialog to get data path Overridable: if anyone wants to get data from different source, override this function :return: An array of ordered dictionary that contains the data. """ data = [] # open file srcfile = open(self.get_src_datpath(), errors='ignore') srcreader = utils.DictReader(srcfile, delimiter=self.delimiter) # assert the file has all the expected fields man_log.debug('expected fieldnames: %s' % self.col_defs) # this will throw TemplateException self._check_whether_all_src_cols_in_src_fields_(src_cols=self.col_defs, fieldnames=list(srcreader.fieldnames)) # load each row with each col's parser for rowid, datarow in enumerate(srcreader): man_log.info('loading row %s' % rowid) man_log.debug('parsing row %s : %s' % (rowid, datarow)) row = utils.OrderedDict() for col in self.col_defs: try: # the parser name is defined as "parse_" + col.col_name # e.g, for source_col "subjectID", the parser will be "parse_subjectID" row[col] = self._parse_value_with_corresponding_parser_(datarow[col], col) except Exception as e: man_log.debug('Exception while parsing %s: %s' % (col, e)) row[col] = self.NoDataError('%s' % e) data.append(row) return data
def _read_data_from_source_(self):
    '''Follow the API for reading data.

    :return: a list of ordered dictionaries containing the data
    '''
    data = []
    con = pyodbc.connect("DSN=wtp_data")
    # Test the primary key. The primary key can be familyid and twin, or just familyid.
    table_type = self.check_table_type(self.data_table[0], con)
    join_cmd = self.get_join_stmt(self.data_table, table_type)
    cursor = con.cursor()
    cursor.execute(join_cmd)
    desc = cursor.description
    fieldnames = self._get_fieldnames_(desc)
    # assert the data source has all the source fields defined in the template
    # so that no col_defs will map to nothing in the data source
    man_log.debug('expected fieldnames: %s' % self.col_defs)
    for col_name in self.col_defs:
        if col_name not in fieldnames:
            raise self.TemplateError(
                ('expected column %s not found in source datafile, '
                 'with fields: %s') % (col_name, list(fieldnames)))
    sql_data = cursor.fetchall()
    # load each row
    for rowid, datarow in enumerate(sql_data):
        man_log.info('loading row %s' % rowid)
        man_log.debug('parsing row %s : %s' % (rowid, datarow))
        row = utils.OrderedDict()
        for col in self.col_defs:
            try:
                # datarow can only be indexed by position, so find the
                # position of this column in the result set
                col_name = col.col_name
                index = fieldnames.index(col_name)
                # prepare the parser
                col_parser_name = 'parse_' + str(col)
                man_log.debug('parsing %s from %s using %s'
                              % (col, datarow[index], col_parser_name))
                col_parser = getattr(self, col_parser_name, self.default_parser)
                # empty items in the db come back as None in python,
                # so normalize None to ""
                if str(datarow[index]) == "None":
                    datarow[index] = ""
                # parse everything as a string
                row[col] = col_parser(str(datarow[index]), col)
            except Exception as e:
                man_log.debug('Exception while parsing %s: %s' % (col, e))
                row[col] = self.NoDataError('%s' % e)
        data.append(row)
    con.close()
    return data
def _read_data_from_source_(self) -> List[utils.OrderedDict]:
    """Follow the API for reading data.

    This overridden method connects to tables in wtp_data based on the data
    table specified in the rocket template, then converts the data records
    into a list of OrderedDicts used as the data source in the source manager.

    :return: a list of ordered dictionaries containing the data
    """
    data = []
    con = pyodbc.connect("DSN=wtp_data")
    # Test the primary key. The primary key can be familyid and twin, or just familyid.
    table_type = self.check_table_type(self.data_table[0], con)  # type: TableType
    # table_type only matters for the join statement
    join_cmd = self.get_join_stmt(self.data_table, table_type)
    cursor = con.cursor()
    cursor.execute(join_cmd)
    desc = cursor.description
    fieldnames = self._get_fieldnames_(desc)
    # assert the data source has all the source fields defined in the template
    # so that no col_defs will map to nothing in the data source;
    # this will raise a TemplateError if any expected column is missing
    self._check_whether_all_src_cols_in_src_fields_(self.col_defs, fieldnames)
    sql_data = cursor.fetchall()
    # load each row
    for rowid, datarow in enumerate(sql_data):
        man_log.info('loading row %s' % rowid)
        man_log.debug('parsing row %s : %s' % (rowid, datarow))
        row = utils.OrderedDict()
        for col in self.col_defs:
            try:
                # datarow can only be indexed by position, so find the
                # position of this column in the result set
                col_name = col.col_name
                index = fieldnames.index(col_name)
                # prepare the parser
                col_parser_name = 'parse_' + str(col)
                man_log.debug('parsing %s from %s using %s'
                              % (col, datarow[index], col_parser_name))
                col_parser = getattr(self, col_parser_name, self.default_parser)
                # empty items in the db come back as None in python,
                # so normalize None to ""
                if str(datarow[index]) == "None":
                    datarow[index] = ""
                # parse everything as a string
                row[col] = col_parser(str(datarow[index]), col)
            except Exception as e:
                man_log.debug('Exception while parsing %s: %s' % (col, e))
                row[col] = self.NoDataError('%s' % e)
        data.append(row)
    con.close()
    return data
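# --- Hedged usage sketch (illustrative only) ---
# _get_fieldnames_ is not shown here; per the DB-API, pyodbc exposes the result
# columns in cursor.description, where each entry's first element is the column
# name. A minimal sketch of pulling fieldnames and rows, assuming the
# "wtp_data" DSN is configured and a hypothetical demo_table exists:
import pyodbc

con = pyodbc.connect("DSN=wtp_data")
cursor = con.cursor()
cursor.execute("SELECT * FROM demo_table")            # hypothetical table
fieldnames = [col[0] for col in cursor.description]   # column names, in order
for record in cursor.fetchall():
    # normalize None to '' the same way the reader above does
    row = dict(zip(fieldnames, (str(v) if v is not None else '' for v in record)))
    print(row)
con.close()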