def check_number_of_rows(self, rows):
    """Abort or warn when ``rows`` rows would not fit on an Excel sheet.

    Args:
        rows: Number of rows the caller intends to write.

    Raises:
        SystemExit: With status 1 when ``rows`` exceeds the sheet limit.
    """
    excel_max_rows = 1048576
    if rows > excel_max_rows:
        self.log.critical(pc.get_error_msg('error', 'too_big'))
        # ``exit()`` is an interactive helper injected by ``site`` and, when
        # called without an argument, reports success (status 0). Raise
        # SystemExit directly with a failing status instead.
        raise SystemExit(1)
    # Warn once the sheet would be more than 80% full.
    if rows > 0.8 * excel_max_rows:
        self.log.warning(pc.get_error_msg('warning', 'near_limit'))
def do_chi2_test(self, peczely_table, front_table):
    """Run a chi-square test on the Péczely and the front contingency tables.

    For every table the expected count of a cell is
    ``row_total / days * column_total``, where ``days`` is the number of days
    between ``self.mindate`` and ``self.maxdate``. The two test results are
    stored, Péczely first and Front second, in ``self.p_values`` (any previous
    content is discarded).

    A warning is logged for a table when more than 30% of its cells hold a
    count below 5.

    Args:
        peczely_table: Observed counts as a DataFrame (13 x 2 in the original
            code; any shape is accepted now).
        front_table: Observed counts as a DataFrame (9 x 2 in the original
            code; any shape is accepted now).
    """
    self.p_values = []
    days = (self.maxdate - self.mindate).days
    for label, table in (('Péczely', peczely_table), ('Front', front_table)):
        n_rows, n_cols = table.shape
        # Same rule as the former hard-coded limits (13*2*0.3 = 7.8 -> "> 7",
        # 9*2*0.3 = 5.4 -> "> 5"), but derived from the actual table size.
        if (table < 5).sum().sum() > 0.3 * n_rows * n_cols:
            self.log.warning(
                pc.get_error_msg('warning', 'under_limit').format(label))
        # Totals are computed once instead of once per row, and the expected
        # counts are built as floats from the start (no int -> float upcast).
        row_totals = table.sum(axis=1).values
        col_totals = table.sum(axis=0).values
        expected = (row_totals / days)[:, None] * col_totals
        # scipy uses dof = k - 1 - ddof with k = number of cells; pick ddof so
        # that dof == (rows - 1) * (cols - 1). This yields 13 for a 13 x 2
        # table and 9 for a 9 x 2 table, the values used previously.
        ddof = n_rows * n_cols - 1 - (n_rows - 1) * (n_cols - 1)
        self.p_values.append(
            scs.chisquare(table, expected, axis=None, ddof=ddof))
def __init__(self, filename, excel=True, sheet_name=0):
    """Load the input and initialise the bookkeeping attributes.

    Args:
        filename: Path of the input file.
        excel: When true, ``filename`` is parsed as an Excel workbook into
            ``self.df``. Otherwise it is opened as UTF-8 text
            (``self.ifile``) together with the temporary output file
            ``peczely_hozzarendelt.tmp`` (``self.ofile``); both handles stay
            open after construction.
        sheet_name: Sheet (name or index) to parse when ``excel`` is true.
    """
    self.log = logging.getLogger(__name__)
    self.log.debug("DataManipulator created")
    self.df = pd.DataFrame()
    self.excel = excel
    if excel:
        try:
            # The context manager closes the workbook even if parse() raises.
            with pd.ExcelFile(filename) as xlsx_file:
                self.df = xlsx_file.parse(sheet_name, header=0)
        except ImportError:
            self.log.critical(
                pc.get_error_msg('error', 'xlrd not imported'))
        # Every column except the first one.
        self.column_names = self.df.columns.values.tolist()[1:]
    else:
        self.ifile = open(filename, "r", encoding='utf_8')
        try:
            self.ofile = open("peczely_hozzarendelt.tmp", "w",
                              encoding='utf_8')
        except OSError:
            # Do not leak the input handle when the output cannot be opened.
            self.ifile.close()
            raise
        self.column_names = []
    # Start with inverted extremes so the first date seen replaces both.
    self.maxdate = pd.Timestamp.min  # = 1677-09-21
    self.mindate = pd.Timestamp.max  # = 2262-04-11
    self.pivot_generator = pd.DataFrame()
    self.number_of_clusters = 0
    self.size_of_dimensions = []
    self.dims_can_be_checked = False
    self.dates = pd.DataFrame()
    self.p_values = []
def check_errors(self):
    """Run the first round of sanity checks on the user-supplied settings.

    Checks, in order: the output file extension (appending ``.xlsx`` or
    aborting, depending on ``self.exit_on_warning``), whether a sheet name
    was given, whether the output location is writable (by writing a small
    test workbook to the output path), and the log / chi file extensions.

    Raises:
        SystemExit: With status -1 on a bad output extension when
            ``self.exit_on_warning`` is set, and with status -2 when the
            output file cannot be written.
    """
    if not self.output_file.endswith('.xlsx'):
        if self.exit_on_warning:
            self.log.critical(
                pc.get_error_msg('error', 'extension_error')
                + self.output_file)
            # Same exit status as before, without relying on the ``exit``
            # helper that ``site`` installs for interactive sessions.
            raise SystemExit(-1)
        self.output_file += '.xlsx'
        self.log.error(
            pc.get_error_msg('warning', 'file_renamed') + self.output_file)
    if self.input_sheet_name == 0:
        self.log.warning(pc.get_error_msg('warning', 'no_sheet_name'))
    try:
        # Leaving the ``with`` block saves and closes the workbook.
        # ``ExcelWriter.save()`` no longer exists in pandas 2.x, and the
        # context manager also releases the file if writing fails.
        with datamanipulator.pd.ExcelWriter(
                self.output_directory + '/' + self.output_file) as writer:
            datamanipulator.pd.DataFrame(
                data='Testing of write permission is completed, access granted',
                index=range(2),
                columns=range(2)).to_excel(
                    writer, index=False, sheet_name='testing IO error')
    except PermissionError as pe:
        self.log.critical(pe)
        self.log.critical(pc.get_error_msg('error', 'permission_error'))
        raise SystemExit(-2)
    if not self.logfile_name.endswith('.txt'):
        self.log.warning(
            pc.get_error_msg('warning', 'bad_logfile_extension'))
    if not self.chi_filename.endswith('.txt'):
        self.log.warning(
            pc.get_error_msg('warning', 'bad_chi_file_extension'))
    self.log.debug(
        'First round error checks are completed (UI.check_errors)')
def check_dims(self):
    """Tell whether one dimension size is recorded for every cluster.

    Returns:
        True when ``len(self.size_of_dimensions)`` equals
        ``self.number_of_clusters``. Otherwise the column labels are logged
        at debug level, the mismatch is logged as critical, and False is
        returned.
    """
    dimension_count = len(self.size_of_dimensions)
    if dimension_count == self.number_of_clusters:
        return True

    try:
        self.log.debug(self.df.columns.values)
    except UnicodeEncodeError:
        self.log.debug('failed to print self.df.columns.values')
    mismatch_message = pc.get_error_msg('error', 'dimension_mismatch').format(
        self.number_of_clusters, dimension_count)
    self.log.critical(mismatch_message)
    return False
def create_generator_table_old(self):
    """Build the combined Péczely + front generator table.

    The projected row count is validated with ``check_number_of_rows``
    before the two partial tables are generated and concatenated into
    ``self.pivot_generator``.
    """
    # Number of combinations over the first ``number_of_clusters`` dimensions.
    combinations = 1
    for i in range(self.number_of_clusters):
        combinations *= self.size_of_dimensions[i]
    # 13 Péczely rows + 9 front rows (see the calls below); the factor 5
    # presumably stands for the five day offsets (-2 .. 2) -- TODO confirm.
    rows = (13 + 9) * 5 * combinations
    # check_number_of_rows() already logs the 'near_limit' warning (above 80%
    # of the sheet limit), so it is not repeated here.
    self.check_number_of_rows(rows)

    peczely_table = self.create_generator_table('Peczely', 13, 2,
                                                count_peczely)
    front_table = self.create_generator_table('Front', 9, 2, count_front)
    self.pivot_generator = pd.concat([peczely_table, front_table],
                                     copy=False)
def assign_numbers(self):
    """Attach the Péczely / front codes to every row of ``self.df``.

    The first column of ``self.df`` is read as the date of each row. For
    every date the values returned by ``PFAssign.return_date`` are stored in
    the columns ``P-2 .. P2`` and ``F-2 .. F2``, which are then joined onto
    ``self.df``. ``self.mindate`` and ``self.maxdate`` are updated along the
    way and ``check_ascending`` is called once all rows are processed.

    Raises:
        SystemExit: With status 1 when the dimension check fails or a date
            cannot be converted.
    """
    self.log.debug("assigning meteorological numbers to dates started")
    self.number_of_clusters = self.df.shape[1] - 1
    # The dimension check is skipped on the first call and enabled afterwards.
    if self.dims_can_be_checked:
        if not self.check_dims():
            # Fail with a non-zero status; bare ``exit()`` reported success.
            raise SystemExit(1)
    else:
        self.dims_can_be_checked = True

    pFAssign = PFAssign()
    meteo_df = pd.DataFrame(columns=[
        'P-2', 'P-1', 'P0', 'P1', 'P2', 'F-2', 'F-1', 'F0', 'F1', 'F2'
    ])
    for row_number, (_, row) in enumerate(self.df.iterrows()):
        # Positional access, consistent with check_ascending(); ``row[0]``
        # would be a label lookup whenever a column happens to be named 0.
        date = row.iloc[0]
        try:
            meteo_df.loc[row_number] = pFAssign.return_date(
                round(to_excel_date(date)))
        except TypeError:
            self.log.critical(pc.get_error_msg('error', 'format'))
            raise SystemExit(1)
        if date > self.maxdate:
            self.maxdate = date
        if date < self.mindate:
            self.mindate = date

    self.check_ascending()
    self.df = self.df.join(meteo_df)
    self.log.info("assigning meteorological numbers to dates ended")
    self.log.debug('min date: ' + str(self.mindate))
    self.log.debug('max date: ' + str(self.maxdate))
def check_ascending(self):
    """Abort unless the first column of ``self.df`` starts at the minimum
    date and ends at the maximum date.

    Only the first and the last row are compared, against ``self.mindate``
    and ``self.maxdate`` respectively. An empty frame has nothing to compare
    and passes.

    Raises:
        SystemExit: With status 1 when either comparison fails.
    """
    if self.df.empty:
        # ``iloc[0, 0]`` would raise IndexError on an empty frame.
        return
    first_date = self.df.iloc[0, 0]
    last_date = self.df.iloc[-1, 0]
    if self.mindate != first_date or self.maxdate != last_date:
        self.log.critical(pc.get_error_msg('error', 'not_sorted'))
        # Fail with a non-zero status; bare ``exit()`` reported success.
        raise SystemExit(1)