def segment_matrix(matrix: List[Path], bin_width, cells_per_file,
                   pangenome_length) -> PangenomeSchematic:
    """Cut the Paths into Components and attach their LinkColumns.

    Steps, in order:
      1. create an empty PangenomeSchematic carrying the path names,
      2. create one Component per span between consecutive dividers
         returned by ``dividers_with_max_size``,
      3. let ``populate_component_matrix`` fill the components,
      4. add one arrival LinkColumn per entry of ``incoming[first_bin]``
         and one departure LinkColumn per entry of ``outgoing[last_bin]``,
      5. add an adjacent-connector column between neighbouring components.

    :param matrix: Paths to segment; ``p.name`` of each is handed to the
        schematic.
    :param bin_width: forwarded unchanged to PangenomeSchematic.
    :param cells_per_file: forwarded to ``dividers_with_max_size``.
    :param pangenome_length: forwarded unchanged to PangenomeSchematic.
    :return: the populated schematic.
    """
    from matrixcomponent import JSON_VERSION

    print(f"Starting Segmentation process on {len(matrix)} Paths.")
    names = [path.name for path in matrix]
    schematic = PangenomeSchematic(JSON_VERSION, bin_width, 1, 1, [], names,
                                   1, pangenome_length)
    incoming, outgoing, dividers = dividers_with_max_size(
        matrix, cells_per_file)

    # Every non-zero divider closes the component opened at the previous one.
    previous = 0
    for boundary in dividers:
        if boundary != 0:
            schematic.components.append(Component(previous, boundary - 1))
        previous = boundary
    print(f"Created {len(schematic.components)} components")

    # Fill in Component occupancy per Path.
    populate_component_matrix(matrix, schematic)

    def presence(members):
        # One flag per path, following the row order of schematic.path_names
        # (that order is what the display relies on).
        return [name in members for name in schematic.path_names]

    # TODO: order columns based on traversal patterns,
    # TODO: insert additional columns for higher copy number
    link_total = 0
    for component in schematic.components:
        for origin, members in incoming[component.first_bin].items():
            arrival = LinkColumn(origin,
                                 component.first_bin,
                                 participants=presence(members))
            component.arrivals.append(arrival)
            link_total += 1
        for target, members in outgoing[component.last_bin].items():
            departure = LinkColumn(component.last_bin,
                                   target,
                                   participants=presence(members))
            component.departures.append(departure)
            link_total += 1

    # Connector columns are appended afterwards and are not part of the
    # count printed below.
    pieces = schematic.components
    for left, right in zip(pieces, pieces[1:]):
        add_adjacent_connector_column(left, right, schematic)

    print(f"Created {link_total} LinkColumns")
    return schematic
def add_adjacent_connector_column(component, next_component, schematic):
    """Append the connector LinkColumn from ``component`` to its neighbour.

    The connector's participants are the row ids that occupy both
    components and do not appear in any of the departure columns already
    present on ``component`` (the simple "occupancy minus departures"
    case; multi-copy cases are not handled here).

    A column is always appended, even when it has no participants, so it
    can serve as a placeholder.

    :param component: component receiving the new departure column.
    :param next_component: the following component; when it (or
        ``component``) is falsy no rows are considered shared.
    :param schematic: supplies ``path_names``, whose length fixes the range
        of valid row ids.
    """
    # NOTE(review): occupants are combined with `&` and then iterated as row
    # indices, so they are presumably sets of ints - confirm against Component.
    row_ids = np.arange(len(schematic.path_names))
    if component and next_component:
        shared = component.occupants & next_component.occupants
    else:
        shared = []
    candidates = np.asarray([row_ids[j] for j in shared])

    if len(candidates) == 0 or len(component.departures) == 0:
        # Nothing to subtract: every shared row keeps its adjacent link.
        stayers = candidates
    else:
        departed = np.concatenate(
            [column.participants for column in component.departures])
        still_here = np.isin(candidates, departed, invert=True)
        stayers = candidates[still_here]

    connector = LinkColumn(
        component.last_bin,
        component.last_bin + 1,
        participants=np.asarray(stayers).astype(dtype='int32'))
    component.departures.append(connector)
def segment_matrix(matrix: List[Path], bin_width, cells_per_file,
                   pangenome_length) -> PangenomeSchematic:
    """Cut the Paths into Components and attach LinkColumns from a table.

    ``dividers_with_max_size`` supplies the component boundaries and a
    ``connections`` table. Rows of that table are grouped by
    ``utils.find_groups`` on their first two columns; each group becomes a
    single LinkColumn whose participants are a boolean list with one entry
    per path. The same LinkColumn object is appended to the departures of
    the component ending at its source bin and to the arrivals of the
    component starting at its destination bin, when such components exist.

    :param matrix: Paths to segment; ``p.name`` of each is handed to the
        schematic.
    :param bin_width: forwarded unchanged to PangenomeSchematic.
    :param cells_per_file: forwarded to ``dividers_with_max_size``.
    :param pangenome_length: forwarded unchanged to PangenomeSchematic.
    :return: the populated schematic.
    """
    from matrixcomponent import JSON_VERSION

    print(f"Starting Segmentation process on {len(matrix)} Paths.")
    names = [path.name for path in matrix]
    schematic = PangenomeSchematic(JSON_VERSION, bin_width, 1, 1, [], names,
                                   1, pangenome_length)
    connections, dividers = dividers_with_max_size(matrix, cells_per_file)

    # Lookup tables: component keyed by its first bin / by its last bin.
    starts_at = {}
    ends_at = {}
    previous = 0
    for boundary in dividers:
        if boundary != 0:
            piece = Component(previous, boundary - 1)
            schematic.components.append(piece)
            starts_at[previous] = piece
            ends_at[boundary - 1] = piece
        previous = boundary
    print(f"Created {len(schematic.components)} components")

    # Fill in Component occupancy per Path.
    populate_component_matrix(matrix, schematic)

    table = connections.to_numpy()
    groups = utils.find_groups(table[:, :2])
    path_indices = connections.path_index.to_numpy()
    # Scratch buffer reused for every group; tolist() snapshots it.
    mask = np.zeros(len(schematic.path_names), dtype=bool)

    link_total = 0
    for start, end in groups:
        # The first row of a group carries the (source, destination) pair.
        src = int(table[start, 0])
        dst = int(table[start, 1])

        mask.fill(False)
        mask[path_indices[start:end]] = True
        column = LinkColumn(src, dst, participants=mask.tolist())

        departing_from = ends_at.get(src)
        arriving_at = starts_at.get(dst)
        if departing_from:
            departing_from.departures.append(column)
            link_total += 1
        if arriving_at:
            arriving_at.arrivals.append(column)
            link_total += 1

    # Connector columns are appended afterwards and are not part of the
    # count printed below.
    pieces = schematic.components
    for left, right in zip(pieces, pieces[1:]):
        add_adjacent_connector_column(left, right, schematic)

    print(f"Created {link_total} LinkColumns")
    return schematic
def add_adjacent_connector_column(component, next_component, schematic):
    """Append the connector LinkColumn from ``component`` to its neighbour.

    The new column holds one boolean per path row. A row is True when it
    occupies both ``component`` and ``next_component`` and is not flagged
    in any departure column already present on ``component`` (the simple
    "occupancy minus departures" case; multi-copy cases are not handled).

    :param component: component receiving the new departure column.
    :param next_component: the component directly after it.
    :param schematic: supplies ``path_names``, whose length is the number
        of rows.
    """

    def stays_adjacent(row):
        # Rows missing from either component never get a connector.
        if not (component.occupants[row] and next_component.occupants[row]):
            return False
        # Present on both sides: connected only if the row did not depart.
        departed = sum(column.participants[row]
                       for column in component.departures)
        return not departed

    flags = [stays_adjacent(row) for row in range(len(schematic.path_names))]
    connector = LinkColumn(component.last_bin,
                           component.last_bin + 1,
                           participants=flags)
    component.departures.append(connector)
def segment_matrix(matrix: List[Path], bin_width, cells_per_file,
                   pangenome_length, no_adjacent_links,
                   parallel) -> PangenomeSchematic:
    """Cut the Paths into Components, attach LinkColumns and prerender.

    ``dividers_with_max_size`` supplies the component boundaries and a
    ``connections`` mapping with the keys ``'from'``, ``'to'`` and
    ``'path_index'``. ``utils.find_groups`` returns group boundaries over
    those arrays; every consecutive pair of boundaries delimits one group,
    which becomes a single LinkColumn whose participants are the matching
    slice of ``path_index``. The same LinkColumn object is appended to the
    departures of the component ending at its source bin and to the
    arrivals of the component starting at its destination bin, when such
    components exist.

    :param matrix: Paths to segment; ``p.name`` of each is handed to the
        schematic.
    :param bin_width: forwarded unchanged to PangenomeSchematic.
    :param cells_per_file: forwarded to ``dividers_with_max_size``.
    :param pangenome_length: forwarded unchanged to PangenomeSchematic.
    :param no_adjacent_links: when true, no adjacent-connector columns are
        added; its negation is also passed to PangenomeSchematic.
    :param parallel: accepted for interface compatibility; not used in this
        function.
    :return: the populated schematic, after ``schematic.prerender()``.
    """
    from matrixcomponent import JSON_VERSION

    LOGGER.info(f"Starting Segmentation process on {len(matrix)} Paths.")
    schematic = PangenomeSchematic(JSON_VERSION, bin_width, 1, 1,
                                   not no_adjacent_links, [],
                                   [p.name for p in matrix], 1,
                                   pangenome_length)
    connections, dividers = dividers_with_max_size(matrix, cells_per_file)
    LOGGER.info("Created dividers")

    # Lookup tables: component keyed by its first bin / by its last bin.
    component_by_first_bin = {}
    component_by_last_bin = {}
    start_pos = 0
    for valid_start in dividers:
        # A divider at 0 cannot close a component; it only marks the start.
        if valid_start != 0:
            current = Component(start_pos, valid_start - 1)
            schematic.components.append(current)
            component_by_first_bin[start_pos] = current
            component_by_last_bin[valid_start - 1] = current
        start_pos = valid_start
    LOGGER.info(f"Created {len(schematic.components)} components")

    # populate Component occupancy per Path
    populate_component_matrix(matrix, schematic)
    LOGGER.info("populated matrix")

    path_indices = connections['path_index']
    connections_from = connections['from']
    connections_to = connections['to']
    groups = utils.find_groups(connections_from, connections_to)

    # Consecutive boundaries [start, end) delimit one (from, to) group.
    for start, end in zip(groups[:-1], groups[1:]):
        # important to cast to int()
        src = int(connections_from[start])
        dst = int(connections_to[start])

        link_column = LinkColumn(src, dst,
                                 participants=path_indices[start:end])
        src_component = component_by_last_bin.get(src)
        dst_component = component_by_first_bin.get(dst)

        if src_component:
            src_component.departures.append(link_column)
        if dst_component:
            dst_component.arrivals.append(link_column)

    if not no_adjacent_links:
        components = schematic.components
        for component, next_component in zip(components, components[1:]):
            add_adjacent_connector_column(component, next_component,
                                          schematic)

        # Special-case connector for the last component in the file. Guarded
        # so that a schematic without any component no longer fails with an
        # IndexError on components[-1].
        if components:
            add_adjacent_connector_column(components[-1], None, schematic)

    num_link_columns = sum(
        len(comp.departures) + len(comp.arrivals)
        for comp in schematic.components)
    LOGGER.info(f"Created {num_link_columns} LinkColumns")

    schematic.prerender()
    return schematic