def gradient_check(thetas, dataset, partial_D,
                   epsilon=config.epsilon, tolerance=config.tolerance):
    print("\nPerforming gradient check ...\n")
    is_close = False
    apx_PD = deepcopy(partial_D)
    for l in range(1, len(thetas)):
        for i in range(1, len(thetas[l])):
            for j in range(1, len(thetas[l][i])):
                temp_theta = [deepcopy(thetas), deepcopy(thetas)]
                temp_theta[0][l][i][j] -= epsilon
                temp_theta[1][l][i][j] += epsilon
                apx_PD[l][i][j] = (J(temp_theta[1], dataset)
                                   - J(temp_theta[0], dataset)) / (2 * epsilon)
    ans_matrix = helper.flatten(
        helper.list_recur(apx_PD, partial_D,
                          lambda x, y: abs(x - y) < tolerance))
    # pass the check if fewer than 20% of the entries differ beyond tolerance
    is_close = ((ans_matrix.count(False) / len(ans_matrix)) * 100) < 20
    return is_close
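# A minimal, self-contained illustration of the central-difference rule used
# above: dJ/dtheta ~= (J(theta + eps) - J(theta - eps)) / (2 * eps). This demo
# is hypothetical and independent of the gradient_check code.
def _demo_central_difference():
    f = lambda x: x ** 2  # analytic derivative: 2 * x
    eps, x = 1e-4, 3.0
    approx = (f(x + eps) - f(x - eps)) / (2 * eps)
    assert abs(approx - 2 * x) < 1e-6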
def main():
    data = [[y for y in x] for x in get_input(11).split('\n') if x]
    neighbour_lists = []
    for row_index, row in enumerate(data):
        for col_index, col in enumerate(row):
            if col == 'L':
                neighbours = get_neighbours((col_index, row_index), data)
                neighbour_lists.append(((col_index, row_index), neighbours))
    change = True
    current_layout = [[y for y in x] for x in data]
    while change:
        change = False
        new_layout = [[y for y in x] for x in current_layout]
        for (col, row), neighbour_set in neighbour_lists:
            if current_layout[row][col] == 'L':
                if all([current_layout[y][x] == 'L'
                        for (x, y) in neighbour_set]):
                    new_layout[row][col] = '#'
                    change = True
            if current_layout[row][col] == '#':
                if len([(x, y) for (x, y) in neighbour_set
                        if current_layout[y][x] == '#']) >= 4:
                    new_layout[row][col] = 'L'
                    change = True
        current_layout = new_layout
    print(len([x for x in flatten(current_layout) if x == '#']))
def auditors(self) -> set:
    audit_reports = self.audit_reports
    auditors = [
        list(audit_report.auditors) for audit_report in audit_reports
        if audit_reports and audit_report.auditors
    ]
    auditors = {auditor for auditor in flatten(auditors) if auditor}
    return auditors
def get_top_verbs_in_path(path, top_size=10):
    trees = [t for t in get_trees(path) if t]
    flattened_list = helper.transform_to_list(trees)
    functions = [f for f in flattened_list
                 if not helper.is_special_function(f)]
    helper.log_to_file('functions extracted')
    verbs = helper.flatten([get_verbs_from_function_name(function_name)
                            for function_name in functions])
    return collections.Counter(verbs).most_common(top_size)
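# The flatten helper used throughout these snippets is not shown. A minimal
# sketch consistent with how it is called (arbitrarily nested lists in, flat
# list out) might look like this; it is an assumption, not the actual helper.
def flatten_sketch(nested):
    flat = []
    for item in nested:
        if isinstance(item, (list, tuple)):
            flat.extend(flatten_sketch(item))
        else:
            flat.append(item)
    return flat

# flatten_sketch([[1, [2]], [3]]) == [1, 2, 3]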
def co_col_idx(self) -> set:
    col_idx = lambda idx: idx[1]
    year_col_idx = {col_idx(idx) for idx in self.year_idx}
    currency_col_idx = {col_idx(idx) for idx in self.currency_idx}
    amount_col_idx = {col_idx(idx) for idx in self.amount_idx}
    if year_col_idx:
        idxs = [year_col_idx, amount_col_idx, currency_col_idx]
        col_col_idxs = flatten([list(i.intersection(j))
                                for i, j in combinations(idxs, 2)
                                if i.intersection(j)])
        return set(col_col_idxs)
    return currency_col_idx.intersection(amount_col_idx)
def add_kams_and_kam_tags_to_db(self, session):
    kam_items = flatten([kam.items for kam in self.kams])
    for kam_item in kam_items:
        kam_item_record = DB.KeyAuditMatter(news_id=self.news_id,
                                            item=kam_item)
        session.add(kam_item_record)
        session.commit()
        self.logger.info(
            f'>> kam_item: {kam_item_record} inserted to '
            f'{kam_item_record.__tablename__}')
        self.add_kam_tags_to_db(session=session,
                                kam_item_record=kam_item_record)
def __init__(self, tile_data):
    tmp = [x for x in tile_data.split('\n') if x]
    self.nr = int(tmp[0].replace('Tile ', '').replace(':', ''))
    self.content = [[v for v in t] for t in tmp[1:]]
    self.borders = {
        'top': self.content[0],
        'right': [x[-1] for x in self.content],
        'bottom': self.content[-1],
        'left': [x[0] for x in self.content],
    }
    self.possible_borders = flatten([[x, x[::-1]]
                                     for x in self.borders.values()])
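# Hypothetical usage of the Tile class above, on a 3x3 tile for brevity (the
# actual puzzle tiles are 10x10); assumes a flatten helper is in scope.
# possible_borders holds each border both as-is and reversed, so flipped
# tiles can still be matched.
demo = Tile('Tile 7:\n#..\n.#.\n..#')
assert demo.nr == 7
assert demo.borders['top'] == ['#', '.', '.']
assert demo.borders['right'] == ['.', '.', '#']  # last character of each row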
def _get_page_by_outline(toc, title_pattern, to_page=True) -> list:
    '''return the page ranges of outlines whose titles match the pattern'''
    pages = flatten([list(range(page_range[0], page_range[1] + 1))
                     for outline, page_range in toc.items()
                     if re.search(title_pattern, outline,
                                  flags=re.IGNORECASE)])
    consecutive_pages = [tuple(li)
                         for li in consecutive_int_list(unique(pages))]
    return consecutive_pages
def wrapper(*args, post_actions=0, **kwargs):
    actions = list(flatten(func(*args, **kwargs)))
    if self.condition is not None:
        if post_actions > 0:
            actions.append(skip(post_actions))
        actions.insert(0, skip_if(self.condition, len(actions)))
    pre_actions = []
    if self.if_func is not None:
        actions_to_skip = len(actions) + post_actions
        pre_actions = self.if_func(*args, post_actions=actions_to_skip,
                                   **kwargs)
    return list(pre_actions) + actions
def toc(self):
    with _by_pypdf(self.pdf_obj) as pdf:
        outlines = flatten(pdf.getOutlines())
        outlines, next_outlines = \
            TableOfContent.current_next_outline_pairs(outlines)
        toc = {}
        for outline, next_outline in zip(outlines, next_outlines):
            title = TableOfContent.utf8_str(outline.title).title()
            from_page, to_page = TableOfContent.outline_page_range(
                pdf, outline, next_outline)
            try:
                toc[title] = sorted([from_page, to_page])
            except TypeError as e:
                logging.warning(
                    f'{e}, from_page: {from_page}, to_page: {to_page}, '
                    f'both are {type(from_page)} which cannot be sorted')
                continue
        return toc
def search_outline_in_toc(self, pattern) -> list:
    '''return the page ranges of outlines whose titles match the pattern'''
    print('search by toc!')
    pages = []
    for outline, _page_range in self.toc.items():
        if re.search(pattern, outline, flags=re.IGNORECASE):
            from_page, to_page = _page_range
            page_range = list(range(from_page, to_page + 1))
            pages.append(page_range)
    pages = flatten(pages)
    consecutive_pages = [tuple(li)
                         for li in consecutive_int_list(unique(pages))]
    return consecutive_pages
def tables(self) -> dict:
    # Map each table to its column names; '*' marks a primary key column
    # and '+' marks a foreign key column.
    inspector = self.inspector
    tables = {
        tablename: [
            column['name'] + '*' if column['primary_key']
            else column['name'] + '+' if column['name'] in flatten([
                fk_col['constrained_columns']
                for fk_col in inspector.get_foreign_keys(tablename)])
            else column['name']
            for column in inspector.get_columns(tablename)
        ]
        for tablename in inspector.get_table_names()
    }
    return tables
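# A hypothetical driver for the property above, showing the SQLAlchemy
# inspector calls it relies on against an in-memory SQLite database. The
# table layout here is invented for illustration.
from sqlalchemy import create_engine, inspect, text

engine = create_engine('sqlite://')
with engine.begin() as conn:
    conn.execute(text('CREATE TABLE a (id INTEGER PRIMARY KEY)'))
    conn.execute(text('CREATE TABLE b (id INTEGER PRIMARY KEY, '
                      'a_id INTEGER REFERENCES a(id))'))
inspector = inspect(engine)
# inspector.get_table_names()  -> ['a', 'b']
# inspector.get_columns('b')   -> [{'name': 'id', 'primary_key': 1, ...}, ...]
# inspector.get_foreign_keys('b') -> [{'constrained_columns': ['a_id'], ...}]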
def outlines(self):
    pypdf_reader = self.pypdf_reader
    outlines = flatten(pypdf_reader.getOutlines())

    def get_page_num(outline):
        try:
            return pypdf_reader.getDestinationPageNumber(outline)
        except AttributeError:
            return None

    # compare against None explicitly: page number 0 is falsy but valid
    outlines = [outline for outline in outlines
                if get_page_num(outline) is not None]
    titles = [outline.title for outline in outlines]
    starting_pages = [get_page_num(outline) for outline in outlines]
    ending_pages = [page_num - 1 for page_num in starting_pages[1:]]
    page_ranges = zip_longest(starting_pages, ending_pages,
                              fillvalue=max(starting_pages, default=None))
    return [Outline(title, page_range, self.pb_pdf)
            for title, page_range in zip(titles, page_ranges)]
def applyAction(self, state, action):
    # It is very important that you generate a new variable with deepcopy
    # for the new state. This code is problem specific: an action is applied
    # by adding a number to the next box (which is slightly complicated to
    # determine given the state representation).
    newState = deepcopy(state)
    # first flatten the list to determine the row and column in which to
    # place the number
    flat = flatten(state)
    idx = len(flat)
    row = idx // self.size
    col = idx % self.size
    # if it is the first element of a new row, the number is added as a list
    if col == 0:
        newState.append([action])
    else:
        newState[row].append(action)
    return newState
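# Hypothetical walk-through of applyAction above for a 2x2 grid
# (self.size == 2): the flattened length gives the index of the next empty
# box, so actions fill the grid left-to-right, top-to-bottom.
#
#   state = [[1, 2]]                   # flat length 2 -> row 1, col 0
#   applyAction(state, 3)              # -> [[1, 2], [3]]   (starts new row)
#   applyAction([[1, 2], [3]], 4)      # -> [[1, 2], [3, 4]]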
def applyAction(self, state, action):
    # Creates a fresh copy of the state on every call
    newState = deepcopy(state)
    # First flatten the state to determine the row and column in which to
    # place the number
    flat = flatten(state)
    # Store the location of the first empty box (0) in the flattened state
    zero = flat.index(0)
    # Translate the flat location into row-column coordinates
    row = zero // self.size
    col = zero % self.size
    # Create separate lists for the row and column the number would land in;
    # stepping by the grid size walks down a column of the flattened state
    entire_row = newState[row]
    column = flat[col::self.size]
    # Pruning flags; if either becomes True the action is discarded
    prune_rc = False
    prune_box = False
    # PRUNING BEGINS HERE
    if action in entire_row or action in column:
        prune_rc = True
    # Create boxes with the custom box-making function, marking the target
    # cell with -1 so it can be found again
    temp = newState[row][col]
    newState[row][col] = -1
    box = makeBoxes(newState, self.r, self.c)
    for rows in box:
        if -1 in rows and action in rows:
            prune_box = True
    newState[row][col] = temp
    # Does not add the action to the tree if any pruning criterion holds
    if prune_rc or prune_box:
        return []
    # Creates a new state by applying a legal action; numbers that have
    # already been given are never changed
    newState[row][col] = action
    return newState
def __init__(self):
    super(AFW, self).__init__('AFW', 'faces/AFW/testimages/')
    self.lstImages = []
    self.keyPointsDict = []
    self.yawPitchRoll = []
    self.bbox = []
    point_names = ['right_eye_center', 'left_eye_center', 'nose_tip',
                   'mouth_right_corner', 'mouth_center', 'mouth_left_corner']
    f = h5py.File(os.path.join(self.absolute_base_directory, 'anno.mat'), 'r')
    self.lstImages = ["".join(map(lambda x: chr(x), f[i].value))
                      for i in f['anno'].value[0]]
    # magic; see show_keypoints.py
    # NOTE: might not take into account multiple pointsets in the same image
    # corresponding to different subjects.
    points = [flatten(zipped) for zipped in
              [zip(coords[0][0], coords[0][1]) for coords in
               [map(lambda a: f[a].value, coord_ref) for coord_ref in
                [flatten(f[coord_col]) for coord_col in f['anno'].value[3]]]]]
    points = [[point for point in keypoints if not math.isnan(point)]
              for keypoints in points]
    bbox = [flatten(zipped) for zipped in
            [zip(coords[0][0], coords[0][1]) for coords in
             [map(lambda a: f[a].value, coord_ref) for coord_ref in
              [flatten(f[coord_col]) for coord_col in f['anno'].value[1]]]]]
    yaw_pitch_roll = [flatten(zipped) for zipped in
                      [zip(coords[0][0], coords[0][1], coords[0][2])
                       for coords in
                       [map(lambda a: f[a].value, coord_ref) for coord_ref in
                        [flatten(f[coord_col])
                         for coord_col in f['anno'].value[2]]]]]
    f.close()
    for img_ypr in yaw_pitch_roll:
        self.yawPitchRoll.append(img_ypr)
    for img_bounds in bbox:
        self.bbox.append(img_bounds)
    for img_points in points:
        self.keyPointsDict.append({})
        prev_point = None
        for i, point in enumerate(img_points):
            if i % 2 == 0:
                prev_point = float(point)
            else:
                self.keyPointsDict[-1][point_names[i // 2]] = (prev_point,
                                                               float(point))
                prev_point = None
def games_advance_stats_extract(seasons, filepath):
    for season in seasons:
        url = (f'https://api.collegefootballdata.com/stats/game/advanced'
               f'?year={season}&seasonType=both')
        data = requests.get(url).json()
        dfs = []
        for game in data:
            d = flatten(game)
            df = pd.DataFrame(d, index=[0])
            dfs.append(df)
        try:
            df = pd.concat(dfs, axis=0, ignore_index=True)
            df['season'] = season
            df.to_csv(filepath / f'seasons/{season}/games_advanced_stats.csv',
                      index=False)
        except ValueError:
            pass
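# Note: unlike the list flatten elsewhere in this collection, flatten here is
# applied to a decoded JSON dict, so it is presumably a dict flattener in the
# style of the flatten_json package. A minimal sketch of that behaviour,
# assuming '_' as the key separator (an assumption, not the actual helper):
def flatten_dict_sketch(d, parent_key='', sep='_'):
    flat = {}
    for key, value in d.items():
        new_key = parent_key + sep + key if parent_key else key
        if isinstance(value, dict):
            flat.update(flatten_dict_sketch(value, new_key, sep))
        else:
            flat[new_key] = value
    return flat

# flatten_dict_sketch({'offense': {'ppa': 0.3}}) == {'offense_ppa': 0.3}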
def __init__(self, filename, K):
    self.filename = filename
    self.K = K
    # getting the data into the desired form
    raw_data = helper.sanitize(open(filename).read().split(config.r_sep))
    mapped_set = [[float(_) for _ in x.strip(config.f_sep).split(config.f_sep)]
                  for x in raw_data]
    self.X, self.y = (np.matrix([np.array(_[:-K]) for _ in mapped_set]),
                      np.matrix([_[-K:] for _ in mapped_set]))
    self.X_trans = self.X.T
    self.m, self.n = self.X.shape
    # check if multi-class classification
    import resource, sys
    resource.setrlimit(resource.RLIMIT_STACK, (2**29, -1))
    sys.setrecursionlimit(config.recur_limit)
    uniq = list(set(helper.flatten(np.ndarray.tolist(self.y))))
    self.is_binary = len(uniq) == 2
    self.multi_class = (len(uniq) > 2 and len(uniq) <= config.max_K
                        and (len(uniq) / self.m) <= config.relative_unique)
def _get_toc(pdf: object) -> dict:
    '''under development: get the TOC with page numbers from a PyPDF2 object'''
    outlines = flatten(pdf.getOutlines())
    if not outlines:
        logging.warning('Outline is unavailable.')
    outlines, next_outlines = itertools.tee(outlines, 2)
    next_outlines = itertools.chain(
        itertools.islice(next_outlines, 1, None), [None])
    toc = {}
    for outline, next_outline in zip(outlines, next_outlines):
        title = clean_title(outline.title)
        from_page, to_page = _get_outline_page_range(pdf, outline,
                                                     next_outline)
        logging.info(f'{title.capitalize()}: {from_page} - {to_page}')
        toc[title.capitalize()] = sorted([from_page, to_page])
    return toc
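# The tee/islice/chain dance above is the standard "pairwise with sentinel"
# idiom: each outline is paired with its successor, and the final pair is
# padded with None. A self-contained demonstration on a plain list:
import itertools

items = ['a', 'b', 'c']
current, nxt = itertools.tee(items, 2)
nxt = itertools.chain(itertools.islice(nxt, 1, None), [None])
assert list(zip(current, nxt)) == [('a', 'b'), ('b', 'c'), ('c', None)]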
def main():
    input_data = get_input(17)
    data = flatten([[(i, j, 0, 0) for j, y in enumerate(list(x)) if y == '#']
                    for i, x in enumerate(input_data.split('\n')) if x])
    for _ in range(6):
        new_data = []
        row_pos = [x[0] for x in data]
        col_pos = [x[1] for x in data]
        height_pos = [x[2] for x in data]
        magic_pos = [x[3] for x in data]
        for r in range(min(row_pos) - 1, max(row_pos) + 2):
            for c in range(min(col_pos) - 1, max(col_pos) + 2):
                for h in range(min(height_pos) - 1, max(height_pos) + 2):
                    for m in range(min(magic_pos) - 1, max(magic_pos) + 2):
                        pos = (r, c, h, m)
                        neighbours = get_neighbours(pos, data)
                        if pos in data and len(neighbours) in (2, 3):
                            new_data.append(pos)
                        if pos not in data and len(neighbours) == 3:
                            new_data.append(pos)
        data = new_data
    print(len(data))
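# get_neighbours is not shown; a sketch consistent with its use above
# (collect the active cells adjacent to pos in all four dimensions) could be
# written with itertools.product. This is an assumption about the helper,
# not its actual implementation.
from itertools import product

def get_neighbours_sketch(pos, data):
    active = set(data)
    return [tuple(p + d for p, d in zip(pos, delta))
            for delta in product((-1, 0, 1), repeat=4)
            if any(delta)  # skip the all-zero offset, i.e. pos itself
            and tuple(p + d for p, d in zip(pos, delta)) in active]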
def load_data(self, data_path):
    self.vocab = helper.Vocab()
    tag2id, id2tag = helper.load_tag(data_path + 'class.txt')
    self.id2tag = id2tag
    val_data = helper.load_data(filePath=data_path + file_names['val_data'])
    test_data = helper.load_data(filePath=data_path + file_names['test_data'])
    train_data = helper.load_data(filePath=data_path
                                  + file_names['train_data'])
    self.val_data_y, val_data = helper.mkDataSet(val_data, tag2id)
    self.test_data_y, test_data = helper.mkDataSet(test_data, tag2id)
    self.train_data_y, train_data = helper.mkDataSet(train_data, tag2id)
    if os.path.exists(data_path + 'vocab.txt'):
        self.vocab.load_vocab_from_file(data_path + 'vocab.txt')
    else:
        words = helper.flatten([val_data, test_data, train_data])
        self.vocab.construct(words)
        self.vocab.limit_vocab_length(self.config.vocab_size)
        # save under the same name we attempt to load above
        self.vocab.save_vocab(data_path + 'vocab.txt')
    self.val_data_len, self.val_data_x = helper.encodeNpad(
        val_data, self.vocab, self.config.num_steps)
    self.test_data_len, self.test_data_x = helper.encodeNpad(
        test_data, self.vocab, self.config.num_steps)
    self.train_data_len, self.train_data_x = helper.encodeNpad(
        train_data, self.vocab, self.config.num_steps)
    if self.config.pre_trained:
        embed = helper.readEmbedding(data_path + 'embed/H'
                                     + str(self.config.embed_size) + '.utf8')
        self.embed_matrix = helper.mkEmbedMatrix(embed,
                                                 self.vocab.word_to_index)
def main():
    data = [int(x) for x in get_input(9).split('\n') if x]
    preamble_length = 25
    preamble = data[:preamble_length]
    # sums holds one list of pairwise sums per number in the current window
    sums = []
    for first_nr in preamble:
        day_sum = []
        for second_nr in preamble:
            if first_nr != second_nr:
                day_sum.append(first_nr + second_nr)
        sums.append(day_sum)
    for index, nr in enumerate(data):
        if index < preamble_length:
            continue
        if nr not in flatten(sums):
            print(nr)
            break
        # slide the window: drop the oldest sums, add sums for the new number
        sums = sums[1:]
        new_day = []
        for second_nr in data[index - preamble_length:index]:
            if nr != second_nr:
                new_day.append(nr + second_nr)
        sums.append(new_day)
def __init__(self):
    # Index 0
    # Light: mopeds/motorcycles/quads etc.
    # Heavy: tractor/bus/van/lorry
    # Others: others/public works vehicles/all-terrain vehicles
    self._vclasses = {
        'Light': ['L1', 'L1e', 'L2', 'L2e', 'L3', 'L3e', 'L4', 'L4e',
                  'L5', 'L5e', 'L6e', 'L7e', 'KNP'],
        'Car': ['M1', 'M1G'],
        'Trailer': ['O1', 'O2', 'O3', 'O4'],
        'Heavy': ['C1', 'C2', 'T', 'T1', 'T2', 'T3', 'T4', 'T5', 'LTR',
                  'M2', 'M2G', 'M3', 'N1', 'N1G', 'N2', 'N2G', 'N3', 'N3G'],
        'Others': ['MUU', 'MTK', 'MA']
    }
    # Index 1
    self._init_regis = ['ir 1990 - 1999', 'ir 1958 - 1979', 'ir 1980 - 1989',
                        'ir 2010 - 2016', 'ir 2000 - 2009']
    # Index 3
    self._usages = {
        'Private': ['01'],
        'Subject to permit': ['02'],
        'School vehicle': ['03'],
        'Rental': ['04'],
        'Sales storage': ['05']
    }
    # Index 6
    self._commencement = ['cy 1900 - 1989', 'cy 1990 - 1999',
                          'cy 2000 - 2004', 'cy 2005 - 2009',
                          'cy 2010 - 2017']
    # Index 7
    self._colors = {
        'Black': ['0'], 'Brown': ['1'], 'Red': ['2'], 'Green': ['5'],
        'Blue': ['6', 'Z'], 'Grey': ['8'], 'White': ['9'], 'Silver': ['Y'],
        'Other colors': ['3', '4', '7', 'X']
    }
    # Index 8
    self._doors = {
        'Less than 4 doors': {'max': 4},
        '4 doors': {'eq': 4, 'min': 1e30, 'max': -1e30},
        '5 doors': {'eq': 5, 'min': 1e30, 'max': -1e30},
        'More than 5 doors': {'min': 6}
    }
    # Index 11
    self._seats = {
        '1 seat': {'eq': 1, 'min': 1e30, 'max': -1e30},
        '2 seats': {'eq': 2, 'min': 1e30, 'max': -1e30},
        '3 seats': {'eq': 3, 'min': 1e30, 'max': -1e30},
        '4 seats': {'eq': 4, 'min': 1e30, 'max': -1e30},
        '5 seats': {'eq': 5, 'min': 1e30, 'max': -1e30},
        'More than 5 seats': {'min': 6}
    }
    # Index 12
    # min < val <= max
    self._mass = {
        '0 - 1000 kg': {'max': 1000},
        '1000 - 1500 kg': {'min': 1000, 'max': 1500},
        '1500 - 2000 kg': {'min': 1500, 'max': 2000},
        'Greater than 2000 kg': {'min': 2000}
    }
    # Index 15
    # min < val <= max
    self._length = {
        '0 - 4300 mm': {'max': 4300},
        '4300 - 4700 mm': {'min': 4300, 'max': 4700},
        'Longer than 4700 mm': {'min': 4700}
    }
    # Index 16
    # min < val <= max
    self._width = {
        '0 - 1700 mm': {'max': 1700},
        '1700 - 1800 mm': {'min': 1700, 'max': 1800},
        '1800 - 1900 mm': {'min': 1800, 'max': 1900},
        '1900 - 2000 mm': {'min': 1900, 'max': 2000},
        'Wider than 2000 mm': {'min': 2000}
    }
    # Index 17
    # min < val <= max
    self._height = {
        '0 - 1450 mm': {'max': 1450},
        '1450 - 1500 mm': {'min': 1450, 'max': 1500},
        '1500 - 1550 mm': {'min': 1500, 'max': 1550},
        'Greater than 1550 mm': {'min': 1550}
    }
    # Index 18
    self._fuels = {
        'Gasoline': ['01'],
        'Diesel': ['02'],
        'Other fuels': ['33', '05', '53', '11', '13', '63', '48', '39',
                        '61', '04', 'Y', '60', '37', '43', '59', '06',
                        '40', '42', '58', '34', '44', '32', '38', '03',
                        '31', '67', '47']
    }
    # Index 19
    # min < val <= max
    self._displacement = {
        'Small engine': {'max': 1000},
        'Large engine': {'min': 2999},
        '1000 - 1999 cc': {'min': 999, 'max': 2000},
        '2000 - 2999 cc': {'min': 1999, 'max': 3000}
    }
    # Index 20
    # min < val <= max
    self._power = {
        '0 - 50 kW': {'max': 50},
        '50 - 75 kW': {'min': 50, 'max': 75},
        '75 - 100 kW': {'min': 75, 'max': 100},
        '100 - 150 kW': {'min': 100, 'max': 150},
        '150 - 200 kW': {'min': 150, 'max': 200},
        'Greater than 200 kW': {'min': 200}
    }
    # Index 21
    self._cylinders = {
        'Less than 4 cylinders': {'max': 4},
        '4 cylinders': {'eq': 4, 'min': 1e30, 'max': -1e30},
        'More than 4 cylinders': {'min': 5}
    }
    # Index 22
    self._supercharger = {'Supercharger': 'true', 'No supercharger': 'false'}
    # Index 23
    self._hybrid = {'Hybrid': 'true', 'Not hybrid': 'false'}
    # Index 33
    # min < val <= max
    self._co2 = {
        '0 - 150 g': {'max': 150},
        '150 - 200 g': {'min': 150, 'max': 200},
        'More than 200 g': {'min': 200}
    }
    # Index 34
    # min < val <= max
    # (bounds realigned with their labels; they were crossed)
    self._km = {
        '0 - 100000 km': {'max': 100000},
        '100000 - 200000 km': {'min': 100000, 'max': 200000},
        'More than 200000 km': {'min': 200000}
    }
    self._classes = flatten([
        self._vclasses.keys(), self._init_regis, self._usages.keys(),
        self._commencement, self._colors.keys(), self._doors.keys(),
        self._seats.keys(), self._mass.keys(), self._length.keys(),
        self._width.keys(), self._height.keys(), self._fuels.keys(),
        self._displacement.keys(), self._power.keys(),
        self._cylinders.keys(), self._supercharger.keys(),
        self._hybrid.keys(), self._co2.keys(), self._km.keys()
    ])
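# Note on the flatten call above: under Python 3, dict.keys() returns a view
# rather than a list, so the flatten helper used here must accept arbitrary
# iterables, not just lists. A quick hypothetical check:
#
#   flatten([{'a': 1}.keys(), ['b']]) == ['a', 'b']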
def sections(self):
    return flatten(
        [page.get_section(AuditFee.section_regex) for page in self.pages])
def add_condition(*conditions):
    flat_conditions = list(flatten(conditions))
    self.conditions += validate(flat_conditions)
def add_action(*actions):
    flat_actions = list(flatten(actions))
    self.actions += validate(flat_actions)
if FLAGS.asyncr:
    q_size = (2 * num_byzwrks) + 3
# Krum
krum_op = aggregators.instantiate("krum", q_size, num_byzwrks, None)
# Average
avg_op = aggregators.instantiate("average", num_workers, num_byzwrks, None)
# Median
med_op = aggregators.instantiate("median", quorum_ps, num_byzps, None)
# Bulyan
bul_op = aggregators.instantiate("bulyan", q_size, num_byzwrks, None)

# The 'Experiment' interface related definitions
loss_tn = experiment.losses(
    '/cpu:0', ['/' + device + ':' + str(FLAGS.task_index % 2)], trace=False)
grad_vars = optimizer.compute_gradients(loss_tn[0])
gradient_tn, flatmap = helper.flatten(grad_vars)

# 2) Read the gradients for the new iteration and
# 3) apply the logic of aggregation
grads = [tf.placeholder(dtype=tf.float32, shape=(grad_length))
         for _ in range(quorum)]
if FLAGS.smart:
    last_grad = krum_op.aggregate(grads)  # this list is defined above
elif FLAGS.asyncr:
    last_grad = krum_op.aggregate(grads)  # apply the strongest GAR
elif FLAGS.vanilla:
    last_grad = avg_op.aggregate(grads)  # this list is defined above

if FLAGS.asyncr:
    # then more steps are required before applying the aggregated gradients
    grads_ps = [
        tf.placeholder(dtype=tf.float32, shape=(grad_length))
def main():
    tiles = [Tile(tile) for tile in get_input(20).split('\n\n') if tile]
    side_length = int(math.sqrt(len(tiles)))
    corners = []
    neighbours = {}
    for i, tile in enumerate(tiles):
        possible_neighbours = []
        for tile2 in tiles:
            if tile == tile2:
                continue
            possible_sides = tile.nr_of_sides_that_can_be_neighbouring(tile2)
            if possible_sides:
                possible_neighbours.append(tile2)
        neighbours[tile] = possible_neighbours
        # corner tiles have exactly two possible neighbours
        if len(possible_neighbours) == 2:
            corners.append(tile)
    corner = corners[0]
    tile_map = [[None for x in range(side_length)]
                for y in range(side_length)]
    rotate_first_corner(corner, neighbours[corner])
    for row in range(side_length):
        for col in range(side_length):
            if row == 0 and col == 0:
                tile_map[row][col] = corner
            elif col == 0:
                prev = tile_map[row - 1][col]
                tile_map[row][col] = get_bottom(prev, neighbours[prev])
            else:
                prev = tile_map[row][col - 1]
                tile_map[row][col] = get_right(prev, neighbours[prev])
    image = []
    for row in tile_map:
        content = [tile.content for tile in row]
        for row_index in range(len(content[0])):
            new_row = [x for x in ''.join(
                [''.join(tile.content[row_index]) for tile in row])]
            image.append(new_row)
    has_matches = False
    i = 0
    transform = [
        rotate, rotate, rotate, rotate, mirror_x,
        rotate, rotate, rotate, rotate, mirror_y,
        rotate, rotate, rotate, rotate, mirror_x,
        rotate, rotate, rotate, rotate,
    ]
    match_count = 0
    while not has_matches:
        # the three-line sea monster pattern
        pattern = (r'.*.{18}#.{1}.*\n'
                   r'.*#.{4}##.{4}##.{4}###.*\n'
                   r'.*.#.{2}#.{2}#.{2}#.{2}#.{2}#.{3}.*')
        matches = re.findall(pattern,
                             '\n'.join([''.join(row) for row in image]),
                             re.MULTILINE)
        if not matches:
            image = transform[i](image)
            i += 1
        else:
            for x in range(len(image) - 3):
                f = ''.join(image[x])
                s = ''.join(image[x + 1])
                t = ''.join(image[x + 2])
                f_m = []
                s_m = []
                t_m = []
                # j, not i, to avoid clobbering the transform index above
                for j in range(len(t) - 20):
                    f_m.append(re.findall('.{18}#.{1}', f[j:]))
                    s_m.append(re.findall('#.{4}##.{4}##.{4}###', s[j:]))
                    t_m.append(re.findall(
                        '.#.{2}#.{2}#.{2}#.{2}#.{2}#.{3}', t[j:]))
                f_m = flatten(f_m)
                s_m = flatten(s_m)
                t_m = flatten(t_m)
                if len(f_m) > 0 and len(s_m) > 0 and len(t_m) > 0:
                    match_count += len(
                        set([f.index(x) for x in f_m]).intersection(
                            set([s.index(x) for x in s_m]).intersection(
                                set([t.index(x) for x in t_m]))))
            break
    print(len(re.findall('#', '\n'.join([''.join(row) for row in image]),
                         re.MULTILINE)) - (match_count * 15))
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    root = param.taxonomy.data

    print ">>>" + str(param.taxonomy.data) + "<<<"
    print "initial root weight:", root.edge_weight
    print "tasks", train_data.keys()
    print "tax keys", root.get_data_keys()

    numpy.random.seed(1)

    # prepare data splits for inner validation
    #
    # this has to be done here, because the training set CANNOT contain
    # any examples that will be used to evaluate further down the tree
    #
    # also by doing it this way, we have equally many examples from each
    # task in each split
    inner_train_data = {}
    inner_eval_data = {}

    for task_id in root.get_data_keys():

        idx_pos = [idx for idx in range(len(train_data[task_id]))
                   if train_data[task_id][idx].label == 1]
        idx_neg = [idx for idx in range(len(train_data[task_id]))
                   if train_data[task_id][idx].label == -1]

        numpy.random.shuffle(idx_pos)
        numpy.random.shuffle(idx_neg)

        # distribute pos/negs evenly across splits
        splits_pos = helper.split_list(idx_pos, FOLD)
        splits_neg = helper.split_list(idx_neg, FOLD)

        eval_split_id = 0

        train_idx_pos = list(helper.flatten(
            [splits_pos[j] for j in xrange(FOLD) if j != eval_split_id]))
        train_idx_neg = list(helper.flatten(
            [splits_neg[j] for j in xrange(FOLD) if j != eval_split_id]))

        train_idx = train_idx_pos
        train_idx.extend(train_idx_neg)
        numpy.random.shuffle(train_idx)

        eval_idx = splits_pos[eval_split_id]
        eval_idx.extend(splits_neg[eval_split_id])
        numpy.random.shuffle(eval_idx)

        # make sure idx lists are disjoint
        assert len(set(train_idx).intersection(set(eval_idx))) == 0

        print "len train data", len(train_data[task_id]), task_id

        # select data sets
        inner_train_data[task_id] = [train_data[task_id][idx]
                                     for idx in train_idx]
        inner_eval_data[task_id] = [train_data[task_id][idx]
                                    for idx in eval_idx]

    ###########################################################
    # Learn Taxonomy Parameters
    ###########################################################

    grey_nodes = [root]

    # initialize inner cost
    inner_cost = param.cost

    while len(grey_nodes) > 0:

        # fetch next node to process
        node = grey_nodes.pop(0)

        # enqueue children
        if not node.is_leaf():
            grey_nodes.extend(node.children)

        ###################################
        # train current node
        ###################################

        # concatenate instances from all tasks for nodes below
        instance_set_train = list(helper.flatten(
            [inner_train_data[key] for key in node.get_data_keys()]))
        instance_set_eval = list(helper.flatten(
            [inner_eval_data[key] for key in node.get_data_keys()]))

        # shuffle to avoid having instances from one task in consecutive order
        numpy.random.shuffle(instance_set_train)
        numpy.random.shuffle(instance_set_eval)

        # extract examples and labels
        train_examples = [inst.example for inst in instance_set_train]
        train_labels = [inst.label for inst in instance_set_train]
        eval_examples = [inst.example for inst in instance_set_eval]
        eval_labels = [inst.label for inst in instance_set_eval]

        # only perform local xval for leaves
        # TODO: also perform inner validation on non-leaves
        if node.is_leaf():

            print "performing inner xval at node", node.name

            # perform local model selection; use a dict for returning args
            # to avoid order glitches
            result_dict = self._perform_inner_xval(
                node, train_examples, train_labels,
                eval_examples, eval_labels, param)

            inner_edge_weight = result_dict["best_edge_weight"]
            inner_cost = result_dict["best_inner_cost"]
            predictor = result_dict["best_predictor"]

        else:
            # for non-leaves train without model selection
            inner_edge_weight = param.transform
            inner_cost = param.cost
            predictor = self._train_inner_classifier(
                node, train_examples, train_labels, param,
                inner_edge_weight, inner_cost)

        node.predictor = predictor
        node.edge_weight = inner_edge_weight
        node.cost = inner_cost

    ###########################################################
    # Retrain on whole training set with optimal parameters
    ###########################################################

    grey_nodes = [root]

    while len(grey_nodes) > 0:

        node = grey_nodes.pop(0)

        # enqueue children
        if not node.is_leaf():
            grey_nodes.extend(node.children)

        # fetch all data that belongs to leaves underneath the current node
        instance_set_retrain = list(helper.flatten(
            [train_data[key] for key in node.get_data_keys()]))

        # shuffle instances
        numpy.random.shuffle(instance_set_retrain)

        # extract examples and labels
        examples = [inst.example for inst in instance_set_retrain]
        labels = [inst.label for inst in instance_set_retrain]

        print "FINAL TRAIN on " + node.name + " C=" + str(node.cost) + \
              " B=" + str(node.edge_weight)

        predictor = self._train_inner_classifier(
            node, examples, labels, param, node.edge_weight, node.cost)

        # attach predictor to node
        node.predictor = predictor

    #####################################################
    # Wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for leaf in root.get_leaves():
        assert leaf.predictor != None
        predictors[leaf.name] = leaf.predictor

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_data.keys()).symmetric_difference(
        set(predictors.keys()))
    assert len(sym_diff_keys) == 0, \
        "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    # save graph plot
    mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
    filename = mypath + "graph_" + str(param.id)
    filename_perf = mypath + "performances_" + str(param.id)
    helper.save(filename_perf, result_dict["performances"])
    print "saving results to:", filename_perf

    root.plot(filename, plot_cost=True, plot_B=True)

    return predictors
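# A sketch of the stratified hold-out performed in _train above, assuming
# helper.split_list chunks a list into FOLD roughly equal parts (an
# assumption; the helper itself is not shown):
#
#   idx_pos, FOLD = [0, 1, 2, 3], 2
#   splits_pos = [[0, 1], [2, 3]]           # split_list(idx_pos, FOLD)
#   eval set   = splits_pos[0]              # held out for inner validation
#   train set  = flatten(splits_pos[1:])    # rejoined for inner training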
from collections import defaultdict

from extractor import read_conllu, read_bio
from dynet_model import DynetModel
from helper import time, flatten

#path_root = "../../../data/ner/"
#train_inputs, train_labels = read_bio(path_root + "/wikiann-sk_training.bio")
#val_inputs, val_labels = read_bio(path_root + "/wikiann-sk_validation.bio")

path_root = "../../../data/pos/"
train_inputs, train_labels = read_conllu(path_root + "da/training.conllu")
val_inputs, val_labels = read_conllu(path_root + "da/validation.conllu")

#embedding, word_count = read_fasttext("embeddings/cc.da.300.vec")

tags = sorted(set(flatten(train_labels)))
vocab = sorted(set(flatten(train_inputs)))

print(tags, len(vocab))
print(len(flatten(train_labels)))

int2word = ["<UNK>"] + vocab
word2int = {w: i for i, w in enumerate(int2word)}
int2tag = tags
tag2int = {w: i for i, w in enumerate(int2tag)}


def to_input(word, unknown=0):
    """