def make_grid_for_file(results_dir, list_file, grid_file):
    POOL_DIRECTIONS = ['H', 'V']
    POOL_RANGE = range(1, 13)
    logger.info('%s => %s', list_file, grid_file)
    df = DataFrame(filename=os.path.join(results_dir, list_file))
    (pair, id) = df.get_columns('pair', 'id')
    logger.info('%d hits', len(pair))
    col_names = ['V' + str(i) for i in POOL_RANGE]
    row_names = ['H' + str(i) for i in POOL_RANGE]
    data_dict = dict()  # will hold a list of the hits for each (row, column) pair
    for r in row_names:
        for c in col_names:
            data_dict[(r, c)] = []
    for (mypair, myid) in zip(pair, id):
        (horiz, vert) = mypair.split(' x ')
        data_dict[(horiz, vert)] = data_dict[(horiz, vert)] + [myid]
    # now build a new data frame as a list of (column name, column list) tuples
    data_by_column = []
    # first column is the row names
    data_by_column += [(grid_file, row_names)]
    # subsequent columns are by vertical pool
    for c in col_names:
        col_data = []
        for r in row_names:
            col_data.append(' '.join(sorted(data_dict[(r, c)])))
        data_by_column += [(c, col_data)]
    grid_dataframe = DataFrame(data=data_by_column)
    grid_dataframe.write(os.path.join(results_dir, grid_file))
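# Quick check of the pair format assumed above: the 'pair' column holds strings
# like 'H3 x V7' (a horizontal pool crossed with a vertical pool), and splitting
# on ' x ' recovers the grid cell key used in data_dict.
print('H3 x V7'.split(' x '))  # ['H3', 'V7']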
def create_map_file(data_dir, map_filename):
    file_list = sorted(os.listdir(data_dir))
    pool_list = []
    base_list = []
    for file_name in file_list:
        (base, ext) = os.path.splitext(file_name)
        if (ext == '.gpr') or (ext == '.GPR'):
            logger.info('dir %s file %s base %s ext %s', data_dir, file_name, base, ext)
            toks = re.split('_|-', base)  # split on underscore or dash
            # start looking from the end for a token that is a valid pool name
            toks.reverse()
            pool_str = ''
            for tok in toks:
                if is_valid_pool_name(tok):
                    pool_str = tok
                    break
            if not is_valid_pool_name(pool_str):
                logger.warning('%s has no valid pool name, skipping', file_name)
                continue
            if pool_str in pool_list:
                logger.error('pool %s is repeated, ignoring %s', pool_str, file_name)
                continue
            pool_list.append(pool_str)
            base_list.append(base)
    df = DataFrame(data=[('pool', pool_list), ('file', base_list)])
    df.write(map_filename)
    return None
def fit(self, dataframe, dependent_variable):
    self.dependent_variable = dependent_variable
    copy_dataframe = DataFrame(dict(dataframe.data_dict), list(dataframe.columns))
    columns = list(copy_dataframe.columns)
    columns.pop(columns.index(dependent_variable))
    independent_variable = str(columns[0])
    self.independent_variable = independent_variable
    # expand the independent variable into one column per power, then drop the original
    for degree in range(1, self.degree + 1):
        copy_dataframe = copy_dataframe.create_exponential(independent_variable, degree)
    copy_dataframe.data_dict.pop(independent_variable)
    copy_dataframe.columns.remove(independent_variable)
    dependent_matrix = [[value] for value in copy_dataframe.data_dict[dependent_variable]]
    data_dict = copy_dataframe.data_dict
    length = len(data_dict[dependent_variable])
    # design matrix: a column of ones for the constant term, then one column per power
    tall_matrix = [[1] for _ in range(length)]
    for row in range(length):
        for key in data_dict:
            if key != dependent_variable:
                tall_matrix[row].append(data_dict[key][row])
    # solve the normal equations: coefficients = (X^T X)^(-1) X^T y
    tall_matrix = Matrix(tall_matrix)
    matrix = tall_matrix.transpose().matrix_multiply(tall_matrix)
    matrix = matrix.inverse() @ tall_matrix.transpose()
    matrix = matrix @ Matrix(dependent_matrix)
    self.coefficients = {'constant': round(matrix.elements[0][0], 4)}
    columns = copy_dataframe.columns
    columns.remove(dependent_variable)
    for index in range(len(columns)):
        self.coefficients[columns[index]] = round(matrix.elements[index + 1][0], 4)
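# A minimal sketch of the normal-equations step above, written with plain lists
# so it runs without the Matrix/DataFrame classes (their APIs are assumed, not
# shown). For a degree-1 fit of y on x, the coefficients solve
# (X^T X) b = X^T y, where X has a leading column of ones for the constant term.
def normal_equations_2x2(xs, ys):
    # build X^T X and X^T y explicitly for the two-parameter case
    n = len(xs)
    sx = sum(xs)
    sxx = sum(x * x for x in xs)
    sy = sum(ys)
    sxy = sum(x * y for x, y in zip(xs, ys))
    det = n * sxx - sx * sx  # determinant of X^T X
    constant = (sxx * sy - sx * sxy) / det
    slope = (n * sxy - sx * sy) / det
    return constant, slope

# on the sample points also used in the polynomial demo further down,
# this gives y ~ -3.2381 + 5.8286 x
print(normal_equations_2x2([0, 1, 2, 3, 4, 5], [1, 2, 5, 10, 20, 30]))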
def split(self, max_depth, depth):
    if self.impurity != 0 and (max_depth > depth or max_depth == False):
        if self.unsplit:
            if self.best_split[0] == 'x':
                axis = 0
            else:
                axis = 1
            low_points = []
            high_points = []
            for point in self.df.to_array():
                if point[axis] < self.best_split[1]:
                    low_points.append(point)
                elif point[axis] >= self.best_split[1]:
                    high_points.append(point)
            self.low = Node(DataFrame.from_array(low_points, self.df.columns),
                            self.split_metric)
            self.high = Node(DataFrame.from_array(high_points, self.df.columns),
                             self.split_metric)
            self.unsplit = False
        elif max_depth > depth + 1 or max_depth == False:
            if self.low.impurity != 0:
                self.low.split(max_depth, depth + 1)
            if self.high.impurity != 0:
                self.high.split(max_depth, depth + 1)
    else:
        self.unsplit = False
# Polish identifiers kept as-is: testWczytania = "load test", dodajByTuple =
# "add by tuple", zapisz = "save", wczytaj = "load", IDStalej / WartoscStalej =
# "constant ID" / "constant value".
from filecmp import cmp
from tempfile import TemporaryDirectory

def testWczytania():
    fieldnames = ['IDStalej', 'WartoscStalej']
    fieldtypes = ['L', 'f']
    df = DataFrame('bazadanych', fieldnames, fieldtypes)
    rec = df.Record(1, 3.14)
    df.dodajByTuple(rec)
    df.dodajByTuple((2, 2.79))
    strtemp = TemporaryDirectory(dir='/tmp/temp')
    strPlik = strtemp.name + "/tmp.csv"
    df.zapisz(strPlik)
    df2 = DataFrame.wczytaj(strPlik)
    df2.zapisz(strtemp.name + "/tmp2.csv")
    assert cmp(strPlik, strtemp.name + "/tmp2.csv", shallow=False)
    strtemp.cleanup()
def __init__(self, data_dir):
    data_cols = ['data', 'is_nan', 'page_id', 'project', 'access', 'agent',
                 'test_data', 'test_is_nan']
    data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols]
    self.test_df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95)
def load_data_frames(from_json=False):
    if from_json:
        return [DataFrame.from_json_file('Data/au_chunked.json')]
    data_frames = []
    for data_file, data_name, c_ref, hc_file_name in SOURCE_DATA:
        data_dict = {'dh': f'Data/{data_file}.txt'}
        if hc_file_name != '':
            data_dict['cp'] = f'Data/{hc_file_name}.txt'
        data_frame = DataFrame.from_sources_dict(data_dict, name=data_name)
        data_frame.set_initial_conditions(reference_heat_capacity_value=c_ref)
        data_frames.append(data_frame)
    return data_frames
def __init__(self, data_class, prediction_column, max_value, delta, constant=True):
    super().__init__(data_class, prediction_column)
    self.prediction = prediction_column
    self.current_input = None
    self.max_val = max_value
    self.original_data = DataFrame.from_array(data_class.to_array(), data_class.columns)
    print("#0" + str(self.original_data.to_array()))  # debug
    self.original_data = self.original_data.append_columns(
        {'constant': [1 for _ in range(len(data_class.to_array()))]},
        ['constant'] + data_class.columns)
    self.data = data_class.apply(
        self.prediction_column,
        lambda x: self.set_bound_replacements(delta, x))
    if constant:
        self.data = self.data.append_columns(
            {'constant': [1 for _ in range(len(self.data.to_array()))]},
            ['constant'] + self.data.columns)
    self.multipliers = self.solve_coefficients()
    print("#1" + str(self.multipliers))  # debug
    print("#2" + str(self.original_data.to_array()))  # debug
def fit(self, dataframe, dependent_variable):
    self.first_variable = dataframe.columns[0]
    self.dependent_variable = dependent_variable
    if self.degree == 0:
        new_columns = [self.dependent_variable]
    elif self.degree == 1:
        new_columns = [self.first_variable, self.dependent_variable]
    else:
        new_columns = [self.first_variable]
        for i in range(2, self.degree + 1):
            new_term = self.first_variable + '^' + str(i)
            new_columns.append(new_term)
        new_columns.append(self.dependent_variable)
    new_dataset = []
    for pair in dataframe.to_array():
        new_values = []
        for i in range(1, self.degree + 1):
            value = pair[0] ** i
            new_values.append(value)
        new_values.append(pair[1])
        new_dataset.append(new_values)
    self.df = DataFrame.from_array(new_dataset, new_columns)
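# Worked example of the feature expansion fit() performs: with degree 3, each
# (x, y) pair becomes [x, x^2, x^3, y] under columns ['x', 'x^2', 'x^3', 'y'].
pairs = [(2, 5), (3, 10)]
degree = 3
print([[x ** i for i in range(1, degree + 1)] + [y] for (x, y) in pairs])
# [[2, 4, 8, 5], [3, 9, 27, 10]]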
def from_table(self, name):
    sessionId = TajoIdProtos_pb2.SessionIdProto(id=self.sessionId)
    request = ClientProtos_pb2.GetTableDescRequest(sessionId=sessionId, tableName=name)
    with client.early_adopter_create_TajoMasterClientProtocolService_stub(
            self.host, self.port) as stub:
        res = stub.getTableDesc(request, self.TIMEOUT_SECONDS)
        return DataFrame.convert_from_tabledesc(res.tableDesc)
def nearest_neighbors(self, observation):
    close_list = self.compute_distances(observation).to_array()
    sorted_list = []
    for n in range(len(close_list)):
        sorted_list.append(close_list.pop(self.sort_closest_cookie(close_list)))
    return DataFrame.from_array(sorted_list[::-1],
                                columns=['distance', 'Cookie Type'])
def fit(self, dataframe, dependent_variable):
    self.data_frame = dataframe
    dict_data = self.data_frame.data_dict
    self.depend_var = dependent_variable
    independ_var = [var for var in dict_data if var != self.depend_var][0]
    if self.degree == 0:
        self.data_frame = DataFrame(
            {self.depend_var: dict_data[self.depend_var]},
            [self.depend_var])
    for degree in range(1, self.degree):
        col = independ_var + '^' + str(degree + 1)
        col_val = [dict_data[independ_var][index] ** (degree + 1)
                   for index in range(len(dict_data[independ_var]))]
        self.data_frame = self.data_frame.add_data(col, col_val)
    self.coefficients = self.calculate_coefficient()
def write_pool_hit(pool_to_file, pool_hit):
    pool_list = []
    file_list = []
    id_list = []
    zscore_list = []
    ratio_list = []
    for (p, f) in zip(pool_to_file.data['pool'], pool_to_file.data['file']):
        # some pools have no hits
        if p not in pool_hit:
            continue
        for h in pool_hit[p]:
            pool_list.append(p)
            file_list.append(f)
            id_list.append(h)
            zscore_list.append(pool_hit[p][h]['zscore'])
            ratio_list.append(pool_hit[p][h]['ratio'])
    df = DataFrame(data=[('pool', pool_list),
                         ('file', file_list),
                         ('id', id_list),
                         ('zscore', zscore_list),
                         ('ratio', ratio_list)])
    df.write('pool_hit.txt')
def calc_goodness(self, split, axis):
    goodness = self.impurity
    low = []
    high = []
    for point in self.df.to_array():
        if point[axis] < split:
            low.append(point)
        elif point[axis] >= split:
            high.append(point)
    low_node = Node(DataFrame.from_array(low, self.df.columns), self.split_metric)
    high_node = Node(DataFrame.from_array(high, self.df.columns), self.split_metric)
    for split_node in [low_node, high_node]:
        goodness -= (len(split_node.row_indices) /
                     len(self.row_indices)) * split_node.impurity
    return round(goodness, 3)
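# Hedged sketch of the quantity calc_goodness computes, assuming the split
# metric is Gini impurity (the Node class computes self.impurity elsewhere):
# parent impurity minus the size-weighted impurities of the two children,
# written over plain lists of class labels.
def gini(labels):
    n = len(labels)
    if n == 0:
        return 0.0
    return 1.0 - sum((labels.count(c) / n) ** 2 for c in set(labels))

def goodness(parent, low, high):
    n = len(parent)
    return gini(parent) - (len(low) / n) * gini(low) - (len(high) / n) * gini(high)

parent = ['A', 'A', 'B', 'B']
print(goodness(parent, ['A', 'A'], ['B', 'B']))  # 0.5: a perfect split removes all impurity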
def compute_distances(self, observation):
    data_arr = self.dataframe.to_array()
    data_dict = self.dataframe.data_dict
    distances = []
    for i in range(len(data_arr)):
        distances.append([
            sum([(observation[entry] - data_dict[entry][i]) ** 2
                 for entry in observation]) ** 0.5,
            data_arr[i][0]
        ])
    return DataFrame.from_array(distances, ['Distance', 'Cookie Type'])
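# Hedged sketch of the distance computation above: plain Euclidean distance
# between an observation dict and each stored row, independent of the
# DataFrame class (whose API is assumed elsewhere in these snippets).
def euclidean(observation, row):
    return sum((observation[k] - row[k]) ** 2 for k in observation) ** 0.5

rows = [{'sugar': 1, 'flour': 2}, {'sugar': 4, 'flour': 6}]
print([euclidean({'sugar': 1, 'flour': 2}, r) for r in rows])  # [0.0, 5.0]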
def calc_coefficients(self):
    df_transform = {key: self.df.data_dict[key] for key in self.df.data_dict}
    df_transform[self.dv] = [math.log((self.up_bound / i) - 1)
                             for i in df_transform[self.dv]]
    df_transform = DataFrame(df_transform, self.df.columns)
    linear_reg = LinearRegressor(df_transform, self.dv)
    return linear_reg.coefficients
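# A small sketch of the transform calc_coefficients relies on: with upper bound
# M (self.up_bound), the model y = M / (1 + e^(a + b*x)) becomes linear after
# y' = ln(M/y - 1) = a + b*x, so any linear regressor can fit a and b, and
# prediction inverts the transform. The a, b values below are made up.
import math

def logistic_predict(a, b, x, upper_bound=1.0):
    return upper_bound / (1 + math.exp(a + b * x))

print(logistic_predict(2.0, -1.0, 2.0))  # 0.5, since a + b*x = 0 there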
def split(self, if_once=False, depth_needed=None):
    if depth_needed is None or self.depth < depth_needed:
        if self.low is None and self.high is None:
            if self.final_split is False:
                self.possible_splits = self.get_possible_splits()
                self.get_best_split()
                if self.best_split is None:
                    return
                if str(self.depth) in self.tree.splits:
                    self.tree.splits[str(self.depth)].append(self.best_split)
                else:
                    self.tree.splits[str(self.depth)] = [self.best_split]
                low = []
                high = []
                for entry in self.df.to_array():
                    if entry[self.best_split_index] < self.best_split[1]:
                        low.append(entry)
                    elif entry[self.best_split_index] >= self.best_split[1]:
                        high.append(entry)
                self.low = Node(DataFrame.from_array(low, self.df.columns),
                                self.split_metric, (self.depth + 1), tree=self.tree)
                self.high = Node(DataFrame.from_array(high, self.df.columns),
                                 self.split_metric, (self.depth + 1), tree=self.tree)
                if not if_once:
                    self.low.split(depth_needed=depth_needed)
                    self.high.split(depth_needed=depth_needed)
                else:
                    return
        else:
            if self.low is not None:
                self.low.split(if_once, depth_needed=depth_needed)
            if self.high is not None:
                self.high.split(if_once, depth_needed=depth_needed)
            return
    else:
        return
def transform(self, df):
    dict_data = df.data_dict.copy()
    # copy each column so the transform does not mutate the caller's dataframe
    transformed_df = {key: list(value) for (key, value) in dict_data.items()}
    for index in range(len(dict_data[self.depend_var])):
        element = dict_data[self.depend_var][index]
        # nudge 0/1 values off the boundary so the logit below is defined
        if element == 0:
            element = self.change
        if element == 1:
            element = 1 - self.change
        transformed_df[self.depend_var][index] = math.log(
            (self.up_bound / element) - 1)
    return DataFrame(transformed_df, df.columns.copy())
def transform(self, df):
    dict_data = df.data_dict
    # copy each column so the transform does not mutate the caller's dataframe
    transformed_df = {key: list(value) for (key, value) in dict_data.items()}
    for index in range(len(dict_data[self.depend_var])):
        element = dict_data[self.depend_var][index]
        # nudge 0/1 values off the boundary so the logit below is defined
        if element == 0:
            element = 0.1
        if element == 1:
            element = 0.9
        transformed_df[self.depend_var][index] = math.log(
            (self.up_bound / element) - 1)
    return DataFrame(transformed_df, df.columns)
def run_tests(training_set, testing_set, decision_tree, forest=False):
    correct = 0
    training_df = DataFrame.from_array(training_set, ['bmi', 'weight', 'class'])
    decision_tree.fit(training_df)
    for test in testing_set:
        test_dict = {'bmi': test[0], 'weight': test[1]}
        if forest:
            prediction = decision_tree.predict(test_dict)
        else:
            prediction = decision_tree.classify(test_dict)
        if prediction == test[2]:
            correct += 1
    return correct, len(testing_set)
def compute_distances(self, observation):
    data = self.df.data_dict.copy()
    distances = []
    for i in range(len(data[self.dependent_variable])):
        distance = 0
        for var in observation:
            distance += (observation[var] - data[var][i]) ** 2
        distance = distance ** (1 / 2)
        distances.append(distance)
    data['Distance'] = distances
    columns = ['Distance'] + self.df.columns
    return DataFrame(data, columns).select(['Distance', self.dependent_variable])
def compute_distances(self, observation):
    prediction_index = self.df.columns.index(self.prediction_column)
    rows = self.df.to_array()
    distances = []
    for data in rows:
        distances.append(self.compute_distance(
            observation,
            [data[n] for n in range(len(data)) if n != prediction_index]))
    result = [[n] for n in distances]
    for n in range(len(distances)):
        result[n].append(rows[n][prediction_index])
    return DataFrame.from_array(result, columns=['distance', 'Cookie Type'])
def gather_all_inputs(self, input_dict):
    col_order = [key for key in input_dict.keys()]
    # wrap each scalar in a one-element list so it forms a single-row DataFrame
    for key, data in input_dict.items():
        input_dict.update({key: [data]})
    data = DataFrame(input_dict, col_order)
    data.append_pairwise_interactions()
    data.append_columns({'constant': [1]})
    self.current_input = data
    return data.data_dict
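# Sketch of what append_pairwise_interactions is presumed to add: one product
# column per unordered pair of existing columns. This is an assumption about
# the project's DataFrame class, including the 'a * b' naming shown here.
from itertools import combinations

row = {'a': 2.0, 'b': 3.0, 'c': 4.0}
print({f'{k1} * {k2}': row[k1] * row[k2] for k1, k2 in combinations(row, 2)})
# {'a * b': 6.0, 'a * c': 8.0, 'b * c': 12.0}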
def calc_goodness(self, split, axis_index):
    goodness = self.impurity
    low = []
    high = []
    for point in self.df.to_array():
        if point[axis_index] < split:
            low.append(point)
        elif point[axis_index] >= split:
            high.append(point)
    low_node = Node(DataFrame.from_array(low, self.df.columns),
                    self.split_metric, depth=int(self.depth) + 1,
                    check_splits=False, tree=self.tree)
    high_node = Node(DataFrame.from_array(high, self.df.columns),
                     self.split_metric, depth=(self.depth + 1),
                     check_splits=False, tree=self.tree)
    for split_node in [low_node, high_node]:
        goodness -= (len(split_node.row_indices) /
                     len(self.row_indices)) * split_node.impurity
    return goodness
class PolynomialRegressor(LinearRegressor):
    def __init__(self, degree):
        self.degree = degree
        self.data_frame = None
        self.depend_var = None
        self.coefficients = None

    def fit(self, dataframe, dependent_variable):
        self.data_frame = dataframe
        dict_data = self.data_frame.data_dict
        self.depend_var = dependent_variable
        independ_var = [var for var in dict_data if var != self.depend_var][0]
        if self.degree == 0:
            self.data_frame = DataFrame(
                {self.depend_var: dict_data[self.depend_var]},
                [self.depend_var])
        for degree in range(1, self.degree):
            col = independ_var + '^' + str(degree + 1)
            col_val = [dict_data[independ_var][index] ** (degree + 1)
                       for index in range(len(dict_data[independ_var]))]
            self.data_frame = self.data_frame.add_data(col, col_val)
        self.coefficients = self.calculate_coefficient()

    def predict(self, predictor):
        predict = predictor.copy()
        for key in self.data_frame.columns:
            if '^' in key:
                key_n_pwr = key.split('^')
                predict[key] = predict[key_n_pwr[0]] ** int(key_n_pwr[1])
        predict_keys = [key for key in predict]
        predict_keys.insert(0, 'constant')
        val = [x for x in predict.values()]
        val.insert(0, 1)
        coef_val = [self.coefficients[key] if key in self.coefficients else 0
                    for key in predict_keys]
        y = 0
        for index in range(len(val)):
            y += coef_val[index] * val[index]
        return y
def calc_possible_splits(self):
    points = [[], 'x', [], 'y']
    for x in self.df.ordered_dict['x']:
        if x not in points[0]:
            points[0].append(x)
    for y in self.df.ordered_dict['y']:
        if y not in points[2]:
            points[2].append(y)
    splits = []
    for n in range(2):
        for i in range(len(points[2 * n]) - 1):
            midpoint = (points[2 * n][i] + points[2 * n][i + 1]) / 2
            splits.append([points[2 * n + 1], midpoint,
                           self.calc_goodness(midpoint, n)])
    return DataFrame.from_array(splits, ['feature', 'value', 'goodness of split'])
import csv

def loadCSV(path):
    csvFile = open(path)
    players = dict()
    colNames = []
    firstRow = True
    for line in csv.reader(csvFile.readlines()):
        if empty(line):
            continue
        if firstRow:
            for name in line:
                name = name.strip()
                colNames.append(name)
                players[name] = []
            firstRow = False
            continue
        for i, value in enumerate(line):
            players[colNames[i]].append(value.strip())
    return DataFrame(players, colNames)
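# A minimal definition that loadCSV's `empty` helper is presumed to resemble
# (the real one lives elsewhere in this project, so this is an assumption):
# a row parsed by csv.reader is empty when every cell is blank after stripping.
def empty(line):
    return all(cell.strip() == '' for cell in line)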
def __init__(self, dataframe, upperbound, dependent_variable):
    self.upperbound = upperbound
    self.dependent_variable = dependent_variable
    # nudge zeros off the boundary so the logit below is defined
    dataframe.data_dict[dependent_variable] = [
        0.1 if value == 0 else value
        for value in dataframe.data_dict[dependent_variable]]
    dependent_variable_column = dataframe.columns.index(dependent_variable)
    dependent_list = [math.log(self.upperbound / value - 1)
                      for value in dataframe.data_dict[dependent_variable]]
    dependent_transformed = dependent_variable + "_transformed"
    # copy the columns and data dict so the switch does not mutate the caller's dataframe
    new_columns = list(dataframe.columns)
    new_columns[dependent_variable_column] = dependent_transformed
    transformed_data_dict = dict(dataframe.data_dict)
    # switch out the old dependent variable list for the transformed one
    transformed_data_dict[dependent_transformed] = dependent_list
    del transformed_data_dict[dependent_variable]
    # create a DataFrame from the new data dict and fit a linear regressor to it
    transformed_dataframe = DataFrame(transformed_data_dict, new_columns)
    linear_regressor = LinearRegressor(transformed_dataframe, dependent_transformed)
    self.coefficients = linear_regressor.coefficients
def get_control_from_file(filename, simple=True):
    """
    read the file as a data frame
    for each id, check how many times it occurs as control or experimental
    build a dict keyed by (id, name) for pairs that occur at least as often
    as controls as experimentals, or whose name is ND
    """
    logger.info('reading controls from %s', filename)
    control = DataFrame(filename=filename)
    control_dict = dict()
    if simple:
        (ids, names) = control.get_columns('id', 'name')
        for (id, name) in zip(ids, names):
            control_dict[(id, name)] = True
    else:
        (id, name, control, exptl) = control.get_columns('id', 'name', 'control', 'exptl')
        id_to_name = dict()
        for (i, n, c, e) in zip(id, name, control, exptl):
            isND = n in ['ND', 'nd', 'N.D.']
            isControl = (i == 'CONTROL')
            isIgg = (n == 'IgG')
            if (c >= e) or isND or isControl or isIgg:
                control_dict[(i, n)] = True
        # insert some special cases
        control_dict[('CONTROL', 'IgG')] = True
        for (i, n) in zip(id, name):
            if i not in id_to_name:
                id_to_name[i] = dict()
            id_to_name[i][n] = True
        id_to_names = dict()
        for i in id_to_name:
            names = sorted(id_to_name[i].keys())
            cnt = len(names)
            name_str = ','.join(names)
            id_to_names[i] = dict()
            id_to_names[i]['cnt'] = cnt
            id_to_names[i]['names'] = name_str
        ids = sorted(id_to_names.keys())
        cnts = [id_to_names[x]['cnt'] for x in ids]
        names = [id_to_names[x]['names'] for x in ids]
        df = DataFrame(data=[('id', ids), ('cnt', cnts), ('names', names)])
        df.write('id_to_names.txt')
    return control_dict
def get_possible_splits(self):
    axes = [axis for axis in self.df.columns
            if axis != 'class' and axis != 'indices']
    all_splits = []
    for i in range(len(self.distinct_values)):
        for j in range(len(self.distinct_values[i]) - 1):
            split_value = (self.distinct_values[i][j] +
                           self.distinct_values[i][j + 1]) / 2
            all_splits.append([axes[i], split_value,
                               self.calc_goodness(split_value, i)])
    if self.split_metric == 'random':
        axis_choices = list(set([split[0] for split in all_splits]))
        if len(axis_choices) == 0:
            return []
        random_choice = random.choice(axis_choices)
        all_splits = [split for split in all_splits if split[0] == random_choice]
    return DataFrame.from_array(all_splits,
                                ['axis', 'split_value', 'goodness of split'])
import sys
sys.path.append('src')
from dataframe import DataFrame
from polynomial_regressor import PolynomialRegressor

df = DataFrame.from_array(
    [(0, 1), (1, 2), (2, 5), (3, 10), (4, 20), (5, 30)],
    columns=['x', 'y']
)

constant_regressor = PolynomialRegressor(degree=0)
constant_regressor.fit(df, dependent_variable='y')
print(constant_regressor.coefficients)
# {'constant': 11.3333}
print(constant_regressor.predict({'x': 2}))
# 11.3333

linear_regressor = PolynomialRegressor(degree=1)
linear_regressor.fit(df, dependent_variable='y')
print(linear_regressor.coefficients)
# {'constant': -3.2381, 'x': 5.8286}
print(linear_regressor.predict({'x': 2}))
# 8.4190

quadratic_regressor = PolynomialRegressor(degree=2)
quadratic_regressor.fit(df, dependent_variable='y')
print(quadratic_regressor.coefficients)
# {'constant': 1.1071, 'x': -0.6893, 'x^2': 1.3036}
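# Worked check of the quadratic prediction by hand, using the coefficients
# printed above: constant + (-0.6893)*x + 1.3036*x^2 at x = 2.
print(1.1071 - 0.6893 * 2 + 1.3036 * 2 ** 2)  # 4.9429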
import struct
import sys
import time

def make_length_header(frame_length):
    # 4-byte big-endian length prefix for the frame that follows
    return struct.pack('>I', frame_length)

def write_frame(df, stream=sys.stdout.buffer):
    assert df.IsInitialized()
    buf = df.SerializeToString()
    print("Outputting", len(buf), "byte frame:", file=sys.stderr)
    print(str(df), "\n", file=sys.stderr)
    stream.write(make_length_header(len(buf)))
    stream.write(buf)

if __name__ == '__main__':
    # Manually building a dataframe using the protobuf python output
    df = DataFrame()
    df.source.add(field='testkey', value='textvalue')
    df.source.add(field='testing_type', value='number')
    df.timestamp = int(time.time() * 1000000000)
    df.payload = DataFrame.NUMBER
    df.value_numeric = 4242
    write_frame(df)

    # Almost the same, but using a custom helper to set the source from a dict
    df = DataFrame()
    df.sourcedict = dict(textkey='testvalue', testing_type='number')
    df.timestamp = int(time.time() * 1000000000)
    df.payload = DataFrame.TEXT
    df.value_textual = 'test string'
    write_frame(df)
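# Hedged counterpart to write_frame, reading one length-prefixed frame back.
# A sketch only: it assumes the same 4-byte big-endian header produced by
# make_length_header and the standard protobuf ParseFromString API.
def read_frame(stream=sys.stdin.buffer):
    header = stream.read(4)
    if len(header) < 4:
        return None  # end of stream
    (frame_length,) = struct.unpack('>I', header)
    df = DataFrame()
    df.ParseFromString(stream.read(frame_length))
    return df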
points = [[x, y, z, 'A']
          for z in range(-5, 6)
          for y in range(-5, 6)
          for x in range(-5, 6)
          if x * y * z != 0]
# the 'B' block is extended twice, duplicating those points
points.extend([[x, y, z, 'B']
               for z in range(1, 6)
               for y in range(1, 6)
               for x in range(1, 6)
               if x * y * z != 0])
points.extend([[x, y, z, 'B']
               for z in range(1, 6)
               for y in range(1, 6)
               for x in range(1, 6)
               if x * y * z != 0])
df = DataFrame.from_array(points, columns=['x', 'y', 'z', 'class'])
r = RandomForest(100, depth=None)
r.fit(df)

correct = 0
for i in range(len(points)):
    correct_class = points[i][3]
    observation = into_new_observation(points[i])
    prediction = r.predict(observation)
    if prediction == correct_class:
        correct += 1
assert correct / len(points) * 100 == 90, 'WRONG ACCURACY BRUH'
print('passed')
import matplotlib.pyplot as plt

list_data = [[1, 0], [2, 0], [3, 0], [2, 1], [3, 1], [4, 1]]
delta_table = [0.1, 0.01, 0.001, 0.0001]
all_coords = []
for delta_low in delta_table:
    df = DataFrame.from_array(list_data, columns=['x', 'y'])
    regressor = LogisticRegressor(df, prediction_column='y',
                                  max_value=1, delta=delta_low)
    coords = [[], []]
    for x in range(20):
        coords[0].append(x / 100)
        coords[1].append(regressor.predict({'constant': 1, 'x': x}))
    all_coords.append(coords)
print(all_coords)

plt.style.use('bmh')
for coords in all_coords:
    plt.plot(coords[0], coords[1], linewidth=2.5)
df = DataFrame.from_array([[1, 0.2], [2, 0.25], [3, 0.5]],
                          columns=['hours worked', 'progress'])
regressor = LinearRegressor(df, dependent_variable='progress')
print('Does all the linear_regressor stuff work')
assert regressor.coefficients == [0.01667, 0.15], 'No, coefficients does not work'
assert regressor.predict({'hours worked': 4}) == 0.61667, 'No, predict does not work'
print('Yes they do', "\n")

'''
df = DataFrame.from_array(
    [[0, 0, 0.1], [1, 0, 0.2], [0, 2, 0.5], [4, 5, 0.6]],
    columns=['scoops of chocolate', 'scoops of vanilla', 'taste rating'])
regressor = LinearRegressor(df, dependent_variable='taste rating')
print('Does all the linear_regressor stuff work')
reg_coeff = regressor.coefficients.copy()
for (key, value) in reg_coeff.items():
    reg_coeff[key] = round(value, 8)
assert reg_coeff == {
    'constant': 0.19252336,
    'scoops of chocolate': -0.05981308,
    'scoops of vanilla': 0.13271028
}, 'No, coefficients does not work'
"PassengerId": int, "Survived": int, "Pclass": int, "Name": str, "Sex": str, "Age": float, "SibSp": int, "Parch": int, "Ticket": str, "Fare": float, "Cabin": str, "Embarked": str } df = DataFrame.from_csv("kaggle/titanic/dataset_of_knowns.csv", data_types=data_types, parser=parse_line) df2 = df.generate_new_column("Name", "Surname", lambda x: x.split(",")[0][1:]) df3 = df2.generate_new_column( "Cabin", "CabinType", lambda x: None if x is None or len(x) == 0 else x.split(" ")[0][0]) df4 = df3.generate_new_column( "Cabin", "CabinNumber", lambda x: None if x is None or len(y := x.split( " ")) == 0 or len(y[0]) == 1 else int(y[0][1:])) df5 = df4.generate_new_column( "Ticket", "TicketType", lambda x: None if x is None or len(y := x.split(" ")) == 1 else y[0]) df6 = df5.generate_new_column( "Ticket", "TicketNumber", lambda x: None if len(y := x.split(" ")) == 0 or not y[-1].isnumeric() else int(y[-1])) df6.filter_columns([
def process_gpr_file(input_file, output_file, summary_file,
                     signal_fg, signal_bg, norm_fg, norm_bg,
                     do_norm, do_log,
                     control_dict=None):
    """
    open input_file as a gpr
    extract columns corresponding to F635 Median and B635 Median (fore- and back-ground)
    add a new column for the fg/bg ratio
    extract Flags column as a mask
    if control_dict is None:
        mask out values with Flags <= -100
    else:
        mask out values based on control_dict
    calculate mean and standard deviation of the ratio
    calculate z-score for each row
    calculate stouffer's z-score (or mean z-score?) for probes with the same ID
    print probes with (mean) z-score >= 2.5
    """
    FLAG_BAD = -100
    logger.info('%s => %s', input_file, output_file)
    gpr = GPR(input_file)
    # print debug information for a gpr file
    # gpr.print_summary()

    # keep track of which columns we've added
    columns_added = []

    # start by extracting the flags and adding an index for the original row number
    (flags, ids, names, fg, bg) = gpr.get_columns(
        ['Flags', 'ID', 'Name', signal_fg, signal_bg])
    if do_norm:
        (n_fg, n_bg) = gpr.get_columns([norm_fg, norm_bg])
    n_row_orig = len(flags)
    logger.info('n_row_orig %d', n_row_orig)
    row_number_orig = np.array(range(1, n_row_orig + 1))
    gpr.add_columns(('row_number_orig', row_number_orig))
    columns_added += ['row_number_orig']

    # identify rows with bad flags and delete them
    # follow the semantics of a numpy masked array: delete where mask is True

    # controls from a dictionary
    mask_control = [False for x in ids]
    if control_dict is not None:
        control_ids = dict()
        # for controls, just worry about ID, not name
        for (i, n) in control_dict.keys():
            control_ids[i] = True
        mask_control = [i in control_ids for i in ids]
    # user interface permits manual flagging of bad data, usually -100
    mask_flag = flags <= FLAG_BAD
    # some text values are clearly controls
    mask_text = [id == 'CONTROL' for id in ids]
    # bad signal
    mask_signal = [(x[0] <= 0) or (x[1] <= 0) for x in zip(fg, bg)]
    mask_norm = [False for x in fg]
    if do_norm:
        mask_norm = [(x[0] <= 0) or (x[1] <= 0) for x in zip(n_fg, n_bg)]
    mask = [x[0] or x[1] or x[2] or x[3] or x[4]
            for x in zip(mask_control, mask_flag, mask_text, mask_signal, mask_norm)]
    logger.info('deleting %d control rows', sum(mask))
    gpr.delete_rows(mask)

    # re-extract just the good columns
    columns_extracted = ['Name', 'ID', signal_fg, signal_bg]
    (name, id, fg, bg) = gpr.get_columns(columns_extracted)
    n_fg = None
    n_bg = None
    if do_norm:
        columns_norm = [norm_fg, norm_bg]
        columns_extracted = columns_extracted + columns_norm
        (n_fg, n_bg) = gpr.get_columns(columns_norm)
    n_row = len(name)
    assert sum(bg == 0) == 0, 'bg has %d zero values' % sum(bg == 0)

    # create a new index, idname, combining id with name
    # this avoids having one id map to multiple names, which could reflect
    # a difference in probes, etc.
    idname = ['_'.join([i, n]) for (i, n) in zip(id, name)]
    idname_to_id = dict()
    idname_to_name = dict()
    for (idn, i, n) in zip(idname, id, name):
        idname_to_id[idn] = i
        idname_to_name[idn] = n
    gpr.add_columns(('idname', idname))
    columns_added += ['idname']

    (ratio, zscore) = get_ratio_zscore(fg, bg, n_fg, n_bg, do_norm, do_log)
    (id_to_mean_zscore, row_to_mean_zscore, id_to_zscores) = apply_by_group(
        np.mean, idname, zscore)
    (id_to_mean_ratio, row_to_mean_ratio, id_to_ratios) = apply_by_group(
        np.mean, idname, ratio)
    gpr.add_columns(('ratio', ratio), ('zscore', zscore),
                    ('zscore_mean', row_to_mean_zscore))
    columns_added += ['ratio', 'zscore', 'zscore_mean']

    # collect rows where the flag is good and the zscore is above a threshold
    (id_subset, row_subset) = get_good_ids_rows(idname, zscore)
    columns_display = columns_extracted + columns_added
    gpr.write(output_file, rows=row_subset, columns=columns_display)

    # gather data for each good id: id, name, zscore_mean, zscores
    id_list = [idname_to_id[i] for i in id_subset]
    name_list = [idname_to_name[i] for i in id_subset]
    zscore_list = [id_to_mean_zscore[i] for i in id_subset]
    ratio_list = [id_to_mean_ratio[i] for i in id_subset]
    zscores_list = [';'.join([str(x) for x in id_to_zscores[i]]) for i in id_subset]
    ratios_list = [';'.join([str(x) for x in id_to_ratios[i]]) for i in id_subset]
    id_data = DataFrame(data=[('IDName', id_subset),
                              ('ID', id_list),
                              ('Name', name_list),
                              ('zscore', zscore_list),
                              ('ratio', ratio_list),
                              ('zscores', zscores_list),
                              ('ratios', ratios_list)])
    id_data.write(summary_file)
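# Hedged sketch of what get_ratio_zscore and the per-ID combination above are
# understood to do (both are defined elsewhere): ratio = fg/bg, optionally
# log-transformed, then z = (r - mean) / std over the whole array; Stouffer's
# combined z for k replicate probes of one ID is sum(z_i) / sqrt(k).
import math
import numpy as np

def ratio_zscore_sketch(fg, bg, do_log=True):
    ratio = np.asarray(fg, dtype=float) / np.asarray(bg, dtype=float)
    if do_log:
        ratio = np.log2(ratio)
    zscore = (ratio - ratio.mean()) / ratio.std()
    return ratio, zscore

def stouffer(zscores):
    return sum(zscores) / math.sqrt(len(zscores))

ratio, zscore = ratio_zscore_sketch([200, 400, 3200], [100, 100, 100])
print(ratio)                 # [1. 2. 5.]  (log2 of 2, 4, 32)
print(stouffer(zscore[:2]))  # combine the first two replicates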
def print_control_dict(control_dict, control_dict_filename):
    keys = sorted(control_dict.keys())
    ids = [x[0] for x in keys]
    names = [x[1] for x in keys]
    df = DataFrame(data=[('id', ids), ('name', names)])
    df.write(filename=control_dict_filename)