def build(self):
    layer_sizes = list(sliding_window(2, self.dimensions))
    if self.single_module == -1 or self.single_module == 0:
        layers = []
        for i, size in enumerate(layer_sizes):
            layers.append(("fc" + str(i), nn.Linear(size[0], size[1])))
            if i < len(self.dimensions) - 2:
                layers.append(("act" + str(i), nn.ELU()))
                layers.append(
                    ("drop" + str(i + 1), nn.Dropout(self.keep_prob)))
        self.encoder = nn.Sequential(OrderedDict(layers))
    else:
        self.encoder = nn.Sequential()
    if self.single_module == 0 or self.single_module == 1:
        layers = []
        for i, size in enumerate(layer_sizes[-1::-1]):
            layers.append(("fc" + str(i), nn.Linear(size[1], size[0])))
            if i < len(self.dimensions) - 2:
                layers.append(("act" + str(i), nn.ELU()))
                layers.append(
                    ("drop" + str(i + 1), nn.Dropout(self.keep_prob)))
        self.decoder = nn.Sequential(OrderedDict(layers))
    else:
        self.decoder = nn.Sequential()
def _construct_relationship(self, path, updated_factors):
    start_node = path[0]
    end_node = path[-1]
    computed_matrix = (self.fuser.factor(start_node)
                       if start_node.name not in updated_factors
                       else updated_factors[start_node.name])
    print(
        type(start_node),
        start_node,
        start_node.name in updated_factors,
        computed_matrix.shape,
    )
    for src, dst in sliding_window(2, path):
        relation = list(self.fuser.fusion_graph.get_relations(src, dst))[0]
        print(relation)
        computed_matrix = np.dot(computed_matrix, self.fuser.backbone(relation))
    end_factor = (self.fuser.factor(end_node)
                  if end_node.name not in updated_factors
                  else updated_factors[end_node.name])
    computed_matrix = np.dot(computed_matrix, end_factor.T)
    return computed_matrix
def _persist_header_chain(
        cls, db: BaseDB, headers: Iterable[BlockHeader]
) -> Tuple[Tuple[BlockHeader, ...], Tuple[BlockHeader, ...]]:
    try:
        first_header = first(headers)
    except StopIteration:
        return tuple(), tuple()
    else:
        for parent, child in sliding_window(2, headers):
            if parent.hash != child.parent_hash:
                raise ValidationError(
                    "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                        encode_hex(child.hash),
                        encode_hex(parent.hash),
                        encode_hex(child.parent_hash),
                    ))

        is_genesis = first_header.parent_hash == GENESIS_PARENT_HASH
        if not is_genesis and not cls._header_exists(db, first_header.parent_hash):
            raise ParentNotFound(
                "Cannot persist block header ({}) with unknown parent ({})".format(
                    encode_hex(first_header.hash),
                    encode_hex(first_header.parent_hash)))

        score = 0 if is_genesis else cls._get_score(db, first_header.parent_hash)

        for header in headers:
            db.set(
                header.hash,
                rlp.encode(header),
            )

            score += header.difficulty

            db.set(
                SchemaV1.make_block_hash_to_score_lookup_key(header.hash),
                rlp.encode(score, sedes=rlp.sedes.big_endian_int),
            )

        try:
            previous_canonical_head = cls._get_canonical_head(db).hash
            head_score = cls._get_score(db, previous_canonical_head)
        except CanonicalHeadNotFound:
            (new_canonical_headers,
             old_canonical_headers) = cls._set_as_canonical_chain_head(db, header.hash)
        else:
            if score > head_score:
                (new_canonical_headers,
                 old_canonical_headers) = cls._set_as_canonical_chain_head(db, header.hash)
            else:
                new_canonical_headers = tuple()
                old_canonical_headers = tuple()

        return new_canonical_headers, old_canonical_headers
def prepare(self):
    if self.colormap is None:
        self.colormap = colors.LinearSegmentedColormap.from_list(
            "", [self.source_color, self.target_color])

    if self.linewidth is None:
        self.set_linewidth()

    self.prepared_data = {}

    for n_points, pairs in self.curves_per_length.items():
        segments = np.concatenate(
            [list(sliding_window(2, p[0])) for p in pairs])
        weights = np.concatenate(
            [list(repeat(p[1], n_points)) for p in pairs])
        color_values = np.concatenate(
            list(repeat(np.linspace(0, 1, num=n_points - 1), len(pairs))))

        if self.min_linewidth is not None:
            linewidth = np.squeeze(
                self.linewidth_transform(
                    weights.reshape(-1, 1),
                    feature_range=(self.min_linewidth, self.linewidth),
                ))
        else:
            linewidth = self.linewidth

        self.prepared_data[n_points] = {
            "segments": segments,
            "weights": weights,
            "color_values": color_values,
            "linewidth": linewidth,
        }

    self.prepared = True
def build(self):
    layer_sizes = list(sliding_window(2, self.dimensions))
    if self.single_module == -1 or self.single_module == 0:
        layers = []
        for i, size in enumerate(layer_sizes):
            if i == len(layer_sizes) - 1:
                self.cluster_layer = RBF_Layer(in_features=size[0],
                                               out_features=size[1],
                                               basis_func=self.basis_func)
            else:
                layers.append(("fc" + str(i), nn.Linear(size[0], size[1])))
                if i < len(self.dimensions) - 2:
                    layers.append(("act" + str(i), self.act()))
                    layers.append(
                        ("drop" + str(i + 1), nn.Dropout(self.keep_prob)))
        self.encoder = nn.Sequential(OrderedDict(layers))
    else:
        self.encoder = nn.Sequential()
    if self.single_module == 0 or self.single_module == 1:
        layers = []
        layer_sizes[-1] = (layer_sizes[-1][0],
                           layer_sizes[-1][1] + self.extra_feature_len)
        for i, size in enumerate(layer_sizes[-1::-1]):
            layers.append(("fc" + str(i), nn.Linear(size[1], size[0])))
            if i < len(self.dimensions) - 2:
                layers.append(("act" + str(i), self.act()))
                layers.append(
                    ("drop" + str(i + 1), nn.Dropout(self.keep_prob)))
        self.decoder = nn.Sequential(OrderedDict(layers))
    else:
        self.decoder = nn.Sequential()
def compute_divided_edge_length(self, edge_idx):
    length = 0.0
    for p0, p1 in sliding_window(2, self.subdivision_points[edge_idx]):
        length += point_distance(p0, p1)
    return length
def _decode_header_to_dict(
        cls, encoded_header: bytes) -> Iterator[Tuple[str, Any]]:
    if len(encoded_header) != cls.smc_encoded_size:
        raise ValidationError(
            "Expected encoded header to be of size: {0}. Got size {1} instead.\n- {2}".format(
                cls.smc_encoded_size,
                len(encoded_header),
                encode_hex(encoded_header),
            ))

    start_indices = accumulate(lambda i, field: i + field[2],
                               cls.fields_with_sizes, 0)
    field_bounds = sliding_window(2, start_indices)
    for byte_range, field in zip(field_bounds, cls._meta.fields):
        start_index, end_index = byte_range
        field_name, field_type = field

        field_bytes = encoded_header[start_index:end_index]
        if field_type == rlp.sedes.big_endian_int:
            # remove the leading zeros, to avoid `not minimal length` error in deserialization
            formatted_field_bytes = field_bytes.lstrip(b'\x00')
        elif field_type == address:
            formatted_field_bytes = field_bytes[-20:]
        else:
            formatted_field_bytes = field_bytes
        yield field_name, field_type.deserialize(formatted_field_bytes)
def _persist_block_chain(
        cls, db: BaseDB, blocks: Iterable[BaseBeaconBlock],
        block_class: Type[BaseBeaconBlock]
) -> Tuple[Tuple[BaseBeaconBlock, ...], Tuple[BaseBeaconBlock, ...]]:
    try:
        first_block = first(blocks)
    except StopIteration:
        return tuple(), tuple()
    else:
        for parent, child in sliding_window(2, blocks):
            if parent.root != child.parent_root:
                raise ValidationError(
                    "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                        encode_hex(child.root),
                        encode_hex(parent.root),
                        encode_hex(child.parent_root),
                    ))

        is_genesis = first_block.parent_root == GENESIS_PARENT_HASH
        if not is_genesis and not cls._block_exists(db, first_block.parent_root):
            raise ParentNotFound(
                "Cannot persist block ({}) with unknown parent ({})".format(
                    encode_hex(first_block.root),
                    encode_hex(first_block.parent_root)))

        if is_genesis:
            score = 0
        else:
            score = cls._get_score(db, first_block.parent_root)

        for block in blocks:
            db.set(
                block.root,
                rlp.encode(block),
            )

            # TODO: It's a stub before we implement fork choice rule
            score = block.slot

            db.set(
                SchemaV1.make_block_root_to_score_lookup_key(block.root),
                rlp.encode(score, sedes=rlp.sedes.big_endian_int),
            )

        try:
            previous_canonical_head = cls._get_canonical_head(db, block_class).root
            head_score = cls._get_score(db, previous_canonical_head)
        except CanonicalHeadNotFound:
            return cls._set_as_canonical_chain_head(db, block.root, block_class)

        if score > head_score:
            return cls._set_as_canonical_chain_head(db, block.root, block_class)
        else:
            return tuple(), tuple()
def _persist_header_chain(
        cls, db: BaseDB, headers: Iterable[BlockHeader]
) -> Tuple[Tuple[BlockHeader, ...], Tuple[BlockHeader, ...]]:
    headers_iterator = iter(headers)

    try:
        first_header = first(headers_iterator)
    except StopIteration:
        return tuple(), tuple()

    is_genesis = first_header.parent_hash == GENESIS_PARENT_HASH
    if not is_genesis and not cls._header_exists(db, first_header.parent_hash):
        raise ParentNotFound(
            "Cannot persist block header ({}) with unknown parent ({})".format(
                encode_hex(first_header.hash),
                encode_hex(first_header.parent_hash)))

    if is_genesis:
        score = 0
    else:
        score = cls._get_score(db, first_header.parent_hash)

    curr_chain_head = first_header
    db.set(
        curr_chain_head.hash,
        rlp.encode(curr_chain_head),
    )
    score = cls._set_hash_scores_to_db(db, curr_chain_head, score)

    orig_headers_seq = concat([(first_header, ), headers_iterator])
    for parent, child in sliding_window(2, orig_headers_seq):
        if parent.hash != child.parent_hash:
            raise ValidationError(
                "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                    encode_hex(child.hash),
                    encode_hex(parent.hash),
                    encode_hex(child.parent_hash),
                ))

        curr_chain_head = child
        db.set(
            curr_chain_head.hash,
            rlp.encode(curr_chain_head),
        )
        score = cls._set_hash_scores_to_db(db, curr_chain_head, score)

    try:
        previous_canonical_head = cls._get_canonical_head(db).hash
        head_score = cls._get_score(db, previous_canonical_head)
    except CanonicalHeadNotFound:
        return cls._set_as_canonical_chain_head(db, curr_chain_head.hash)

    if score > head_score:
        return cls._set_as_canonical_chain_head(db, curr_chain_head.hash)

    return tuple(), tuple()
def itinerary_dists(positions, itinerary):
    """Return list of pairs, ['dest', 'dist']."""
    distances = [
        distance(positions[b], positions[a])
        for (a, b) in sliding_window(2, itinerary)
    ]
    labeled = list(zip(itinerary[1:], distances))
    return labeled
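# A hypothetical usage sketch for itinerary_dists; the `distance` helper and the
# sample coordinates below are illustrative assumptions, not part of the original code.
import math

def distance(p, q):
    # plain Euclidean distance between two (x, y) points
    return math.hypot(p[0] - q[0], p[1] - q[1])

positions = {"A": (0.0, 0.0), "B": (3.0, 4.0), "C": (3.0, 0.0)}
# itinerary_dists(positions, ["A", "B", "C"]) -> [("B", 5.0), ("C", 4.0)]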
def prepare_single_babel_language(corpus_dir: Pathlike,
                                  output_dir: Optional[Pathlike] = None):
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(
            Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            language = BABELCODE2LANG[lang_code]
            if split == 'training':
                split = 'train'
            manifests[split]['recordings'].to_json(f'recordings_{language}_{split}.json')
            manifests[split]['supervisions'].to_json(f'supervisions_{language}_{split}.json')
    return manifests
def get_score(self, current_candidate):
    total_score = 0.0
    for pair in ct.sliding_window(2, current_candidate):
        current_dict = self.association_dict[pair]
        current_score = max(current_dict["RL"], current_dict["LR"])
        total_score += current_score
    return total_score
def build_node_memberships(self):
    self.membership_per_level = defaultdict(dict)
    self.membership_per_level[0] = dict(
        zip(map(int, self.network.vertices()), self.block_levels[0]))

    for i, (l0, l1) in enumerate(sliding_window(2, self.block_levels), start=1):
        update_level = dict(zip(np.unique(l0), l1))
        self.membership_per_level[i] = valmap(
            lambda x: update_level[x], self.membership_per_level[i - 1])
def _search_grammer_path(self, pos_via_point):
    pos_via_and_end = (self.START_NODE,) + pos_via_point + (self.END_NODE,)

    paths = []
    cost = 0
    for network_start_end in sliding_window(2, pos_via_and_end):
        path = networkx.bidirectional_dijkstra(self._grammer_graph,
                                               network_start_end[0],
                                               network_start_end[1])
        cost += path[0]
        node_path = path[1][1:]
        paths += node_path
    paths.pop()
    return cost, paths
def fhs(self, n_scenarios=250, start_date=None, end_date=None):
    x = sliding_window(n_scenarios + 1, range(len(self.ts.index)))
    scenarios = np.zeros((len(self.ts.index), n_scenarios + 1))
    for i, el in enumerate(x):
        l = list(el)
        cur_idx, hist_idx = l[-1], l[:-1]
        neutral = self.ts.Value.values[cur_idx]
        ret = self.ts.DevolLogReturns.values[hist_idx]
        vol = self.ts.Vola.values[cur_idx]
        scenarios[cur_idx, 1:] = self.scenario_values(ret, neutral, vol)
        scenarios[cur_idx, 0] = neutral
    return scenarios
def prepare_segments(self, level=None):
    self.segments_per_pair = defaultdict(list)

    if level is None:
        level = self.community_level

    for edge_data in self.edges:
        segments = list(sliding_window(2, edge_data['spline']))
        values = np.linspace(0, 1, num=self.n_points - 1)
        pair = (self.membership_per_level[level][edge_data['source']],
                self.membership_per_level[level][edge_data['target']])
        #print(pair)
        #break
        self.segments_per_pair[pair].append(
            (segments, values, edge_data['weight']))
def get_pairwise_lists(self, candidate):
    lr_list = []  # Initiate list of LR association values
    rl_list = []  # Initiate list of RL association values

    # Populate the pairwise value lists
    for current_pair in ct.sliding_window(2, candidate):
        lr_list.append(self.association_dict[current_pair]["LR"])
        rl_list.append(self.association_dict[current_pair]["RL"])

    # Send lists to class-external jitted function for processing
    return_list = calculate_measures(np.array(lr_list), np.array(rl_list))

    # Check for end-point
    try:
        endpoint_lr = self.association_dict[(candidate[0], candidate[-1])]["LR"]
        endpoint_rl = self.association_dict[(candidate[0], candidate[-1])]["RL"]
    except Exception as e:
        endpoint_lr = 0.0
        endpoint_rl = 0.0

    # Add endpoint to return_list
    return_list.append(endpoint_lr)
    return_list.append(endpoint_rl)

    # return_list contains the following items:
    # --- candidate (representation, index) tuples
    # --- mean_lr
    # --- mean_rl
    # --- min_lr
    # --- min_rl
    # --- directional_scalar
    # --- directional_categorical
    # --- reduced_beginning_lr
    # --- reduced_beginning_rl
    # --- reduced_end_lr
    # --- reduced_end_rl
    # --- endpoint_lr
    # --- endpoint_rl
    return return_list

#----------------------------------------------------------------------------------------------#
def make_ngrams(s, n, joiner=None):
    """
    Make n-grams

    For character ngrams, s should be a string
    For token/word ngrams, s should be a sequence of tokens

    joiner='' is recommended for characters, and joiner='_' for words.
    """
    try:
        ngrams = tz.sliding_window(n, s)
    except StopIteration:
        # bug in toolz/cytoolz?
        yield from ()
    if joiner is not None:
        ngrams = (joiner.join(grams) for grams in ngrams)
    yield from ngrams
def _compute_gas_price(probabilities, desired_probability):
    """
    Given a sorted range of ``Probability`` named-tuples returns a gas price
    computed based on where the ``desired_probability`` would fall within the
    range.

    :param probabilities: An iterable of `Probability` named-tuples sorted in reverse order.
    :param desired_probability: A floating point representation of the desired
        probability. (e.g. ``85% -> 0.85``)
    """
    first = probabilities[0]
    last = probabilities[-1]

    if desired_probability >= first.prob:
        return first.gas_price
    elif desired_probability <= last.prob:
        return last.gas_price

    for left, right in sliding_window(2, probabilities):
        if desired_probability < right.prob:
            continue
        elif desired_probability > left.prob:
            # This code block should never be reachable as it would indicate
            # that we already passed by the probability window in which our
            # `desired_probability` is located.
            raise Exception('Invariant')

        adj_prob = desired_probability - right.prob
        window_size = left.prob - right.prob
        position = adj_prob / window_size

        gas_window_size = left.gas_price - right.gas_price

        gas_price = int(math.ceil(right.gas_price + gas_window_size * position))
        return gas_price
    else:
        # The initial `if/else` clause in this function handles the case where
        # the `desired_probability` is either above or below the min/max
        # probability found in the `probabilities`.
        #
        # With these two cases handled, the only way this code block should be
        # reachable would be if the `probabilities` were not sorted correctly.
        # Otherwise, the `desired_probability` **must** fall between two of the
        # values in the `probabilities`.
        raise Exception('Invariant')
def _create_grammer_network(self, posid_list):
    # count nodes and edges
    node = set()
    for posid in chain.from_iterable(posid_list):
        node.add(posid)

    edge_weight_dict = {}
    for sentence in posid_list:
        for one_edge in sliding_window(2, sentence):
            edge_weight_dict[one_edge] = edge_weight_dict.get(one_edge, 1) + 1
    max_weight = max(edge_weight_dict.values())

    # In NetworkX, edge weight is treated as a cost, so assign lower costs to more frequent edges
    # create directed network graph
    graph = networkx.DiGraph()
    graph.add_nodes_from(node)
    for edge, weight in edge_weight_dict.items():
        # use 1 as the minimum cost
        cost = max_weight - weight + 1
        graph.add_edge(edge[0], edge[1], weight=cost)
    return graph
def compute_gas_price(probabilities, desired_probability):
    first = probabilities[0]
    last = probabilities[-1]

    if desired_probability >= first.prob:
        return first.gas_price
    elif desired_probability <= last.prob:
        return last.gas_price

    for left, right in sliding_window(2, probabilities):
        if desired_probability < right.prob:
            continue
        elif desired_probability > left.prob:
            raise Exception('Invariant')

        adj_prob = desired_probability - right.prob
        window_size = left.prob - right.prob
        position = adj_prob / window_size

        gas_window_size = left.gas_price - right.gas_price

        gas_price = int(math.ceil(right.gas_price + gas_window_size * position))
        return gas_price
    else:
        raise Exception('Invariant')
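# A hedged usage sketch for compute_gas_price. `Probability` is assumed here to be a
# namedtuple with `prob` and `gas_price` fields (inferred from the attribute access
# above); the sample values are made up.
from collections import namedtuple

Probability = namedtuple("Probability", ["prob", "gas_price"])

probabilities = [
    Probability(prob=1.0, gas_price=100),
    Probability(prob=0.5, gas_price=50),
    Probability(prob=0.0, gas_price=10),
]
# compute_gas_price(probabilities, 0.75) -> 75 (linear interpolation between 50 and 100)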
def _persist_block_chain(
    cls,
    db: DatabaseAPI,
    blocks: Iterable[BaseBeaconBlock],
    block_class: Type[BaseBeaconBlock],
    fork_choice_scorings: Iterable[ForkChoiceScoringFn],
) -> Tuple[Tuple[BaseBeaconBlock, ...], Tuple[BaseBeaconBlock, ...]]:
    blocks_iterator = iter(blocks)
    scorings_iterator = iter(fork_choice_scorings)

    try:
        first_block = first(blocks_iterator)
        first_scoring = first(scorings_iterator)
    except StopIteration:
        return tuple(), tuple()

    try:
        previous_canonical_head = cls._get_canonical_head(db, block_class).signing_root
        head_score = cls._get_score(db, previous_canonical_head)
    except CanonicalHeadNotFound:
        no_canonical_head = True
    else:
        no_canonical_head = False

    is_genesis = first_block.is_genesis
    if not is_genesis and not cls._block_exists(db, first_block.parent_root):
        raise ParentNotFound(
            "Cannot persist block ({}) with unknown parent ({})".format(
                encode_hex(first_block.signing_root),
                encode_hex(first_block.parent_root),
            ))

    score = first_scoring(first_block)

    curr_block_head = first_block
    db.set(curr_block_head.signing_root, ssz.encode(curr_block_head))
    cls._add_block_root_to_slot_lookup(db, curr_block_head)
    cls._set_block_score_to_db(db, curr_block_head, score)
    cls._add_attestations_root_to_block_lookup(db, curr_block_head)

    orig_blocks_seq = concat([(first_block, ), blocks_iterator])
    for parent, child in sliding_window(2, orig_blocks_seq):
        if parent.signing_root != child.parent_root:
            raise ValidationError(
                "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                    encode_hex(child.signing_root),
                    encode_hex(parent.signing_root),
                    encode_hex(child.parent_root),
                ))

        curr_block_head = child
        db.set(curr_block_head.signing_root, ssz.encode(curr_block_head))
        cls._add_block_root_to_slot_lookup(db, curr_block_head)
        cls._add_attestations_root_to_block_lookup(db, curr_block_head)

        # NOTE: len(scorings_iterator) should equal len(blocks_iterator)
        try:
            next_scoring = next(scorings_iterator)
        except StopIteration:
            raise MissingForkChoiceScoringFns

        score = next_scoring(curr_block_head)
        cls._set_block_score_to_db(db, curr_block_head, score)

    if no_canonical_head:
        return cls._set_as_canonical_chain_head(db, curr_block_head.signing_root, block_class)

    if score > head_score:
        return cls._set_as_canonical_chain_head(db, curr_block_head.signing_root, block_class)
    else:
        return tuple(), tuple()
def sliding_window(self, n):
    '''assuming should always be a generator - otherwise - going to get huge'''
    return fgenerator(self.__class__(sw) for sw in cytoolz.sliding_window(n, self))
def header_pairs(VM, headers, valid):
    for pair in sliding_window(2, headers):
        yield VM, pair[1], pair[0], valid
def slw2(n, seq):
    for i in toolz.sliding_window(n, ([None] * (n - 1)) + seq):
        yield tuple(filter(None, i))
def _count_of_exact(seq):
    subs = ["".join(entry) for entry in sliding_window(len(sub), seq)]
    return subs.count(sub)
def slw(n, seq):
    yield from toolz.sliding_window(n, ([None] * (n - 1)) + seq)
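# A quick sanity check of slw/slw2, assuming toolz is installed (expected output in comments).
import toolz

list(toolz.sliding_window(3, [None, None, 1, 2, 3]))
# -> [(None, None, 1), (None, 1, 2), (1, 2, 3)]
# so slw(3, [1, 2, 3]) yields the same padded windows, while
# slw2(3, [1, 2, 3]) drops the padding: (1,), (1, 2), (1, 2, 3)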
def persist_header_chain(
        self, headers: Iterable[BlockHeader]
) -> Tuple[Tuple[BlockHeader, ...], Tuple[BlockHeader, ...]]:
    """
    Return two iterables of headers, the first containing the new canonical headers,
    the second containing the old canonical headers
    """
    try:
        first_header = first(headers)
    except StopIteration:
        return tuple(), tuple()
    else:
        for parent, child in sliding_window(2, headers):
            if parent.hash != child.parent_hash:
                raise ValidationError(
                    "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                        encode_hex(child.hash),
                        encode_hex(parent.hash),
                        encode_hex(child.parent_hash),
                    ))

        is_genesis = first_header.parent_hash == GENESIS_PARENT_HASH
        if not is_genesis and not self.header_exists(first_header.parent_hash):
            raise ParentNotFound(
                "Cannot persist block header ({}) with unknown parent ({})".format(
                    encode_hex(first_header.hash),
                    encode_hex(first_header.parent_hash)))

        score = 0 if is_genesis else self.get_score(first_header.parent_hash)

        for header in headers:
            self.db.set(
                header.hash,
                rlp.encode(header),
            )

            score += header.difficulty

            self.db.set(
                SchemaV1.make_block_hash_to_score_lookup_key(header.hash),
                rlp.encode(score, sedes=rlp.sedes.big_endian_int),
            )

        try:
            head_score = self.get_score(self.get_canonical_head().hash)
        except CanonicalHeadNotFound:
            (new_canonical_headers,
             old_canonical_headers) = self._set_as_canonical_chain_head(header.hash)
        else:
            if score > head_score:
                (new_canonical_headers,
                 old_canonical_headers) = self._set_as_canonical_chain_head(header.hash)
            else:
                new_canonical_headers = tuple()
                old_canonical_headers = tuple()

        return new_canonical_headers, old_canonical_headers
def make_supervisions(
        sgml_path: Pathlike,
        recording: Recording) -> Dict[str, List[SupervisionSegment]]:
    """Create supervisions for sections and segments for a given HUB4 recording."""
    doc = try_parse(sgml_path)
    episode = doc.find("episode")
    section_supervisions = []
    text_supervisions = []
    text_idx = 0
    for sec_idx, section in enumerate(doc.find("episode").find_all("section")):
        # Create a "section" supervision segment that informs what's the program and
        # type/topic of a given section.
        # It spans multiple regular segments with spoken content.
        sec_start = float(section.attrs["starttime"])
        section_supervisions.append(
            SupervisionSegment(
                id=f"{recording.id}_section{sec_idx:03d}",
                recording_id=recording.id,
                start=sec_start,
                duration=round(float(section.attrs["endtime"]) - sec_start, ndigits=3),
                channel=0,
                language=episode.attrs["language"],
                custom={
                    "section": section.attrs["type"],
                    "program": episode.attrs["program"],
                },
            ))
        for turn in section.find_all("turn"):
            # An example of the format in each turn:
            #
            # <turn speaker=Peter_Jennings spkrtype=male startTime=336.704 endTime=338.229>
            # <overlap startTime=336.704 endTime=337.575>
            # <time sec=336.704>
            #  time served up until
            # </overlap>
            # <time sec=337.575>
            #  this point?
            # </turn>
            for child in turn.children:
                # Here, we switch to custom parsing code as explained at the top of this script.
                lines = [
                    l for l in str(child).split("\n")
                    if len(l) and not any(l.startswith(b) for b in EXCLUDE_BEGINNINGS)
                ]
                if not lines:
                    continue
                times = []
                texts = []
                for time_marker, text in group_lines_in_time_marker(lines):
                    match = re.search(r'sec="?(\d+\.?\d*)"?', time_marker)
                    times.append(float(match.group(1)))
                    texts.append(text)
                times.append(float(turn.attrs["endtime"]))
                # Having parsed the current section into start/end times and text
                # for individual speech segments, create a SupervisionSegment for each one.
                for (start, end), text in zip(sliding_window(2, times), texts):
                    text_supervisions.append(
                        SupervisionSegment(
                            id=f"{recording.id}_segment{text_idx:04d}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            language=episode.attrs["language"],
                            text=text.strip(),
                            speaker=turn.attrs["speaker"],
                            gender=turn.attrs["spkrtype"],
                        ))
                    text_idx += 1
    return {"sections": section_supervisions, "segments": text_supervisions}
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
          if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
          (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored as JSON files.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return:
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold.")

        supervisions = deduplicate_supervisions(supervisions)
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir / f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
def pos_grams(self, n):
    grams = cytoolz.sliding_window(n, self.words)
    for bg in cytoolz.remove(
            lambda x: any(t.like_num or t.is_stop for t in x), grams):
        yield " ".join(g.pos_ for g in bg)
def _persist_block_chain(
        cls, db: BaseDB, blocks: Iterable[BaseBeaconBlock],
        block_class: Type[BaseBeaconBlock]
) -> Tuple[Tuple[BaseBeaconBlock, ...], Tuple[BaseBeaconBlock, ...]]:
    blocks_iterator = iter(blocks)

    try:
        first_block = first(blocks_iterator)
    except StopIteration:
        return tuple(), tuple()

    try:
        previous_canonical_head = cls._get_canonical_head(db, block_class).signed_root
        head_score = cls._get_score(db, previous_canonical_head)
    except CanonicalHeadNotFound:
        no_canonical_head = True
    else:
        no_canonical_head = False

    is_genesis = first_block.previous_block_root == GENESIS_PARENT_HASH
    if not is_genesis and not cls._block_exists(db, first_block.previous_block_root):
        raise ParentNotFound(
            "Cannot persist block ({}) with unknown parent ({})".format(
                encode_hex(first_block.signed_root),
                encode_hex(first_block.previous_block_root),
            ))

    if is_genesis:
        score = 0
        # TODO: this should probably be done as part of the fork choice rule processing
        db.set(
            SchemaV1.make_finalized_head_root_lookup_key(),
            first_block.signed_root,
        )
    else:
        score = first_block.slot

    curr_block_head = first_block
    db.set(
        curr_block_head.signed_root,
        ssz.encode(curr_block_head),
    )
    cls._add_block_root_to_slot_lookup(db, curr_block_head)
    cls._set_block_scores_to_db(db, curr_block_head)

    orig_blocks_seq = concat([(first_block, ), blocks_iterator])
    for parent, child in sliding_window(2, orig_blocks_seq):
        if parent.signed_root != child.previous_block_root:
            raise ValidationError(
                "Non-contiguous chain. Expected {} to have {} as parent but was {}".format(
                    encode_hex(child.signed_root),
                    encode_hex(parent.signed_root),
                    encode_hex(child.previous_block_root),
                ))

        curr_block_head = child
        db.set(
            curr_block_head.signed_root,
            ssz.encode(curr_block_head),
        )
        cls._add_block_root_to_slot_lookup(db, curr_block_head)
        score = cls._set_block_scores_to_db(db, curr_block_head)

    if no_canonical_head:
        return cls._set_as_canonical_chain_head(db, curr_block_head.signed_root, block_class)

    if score > head_score:
        return cls._set_as_canonical_chain_head(db, curr_block_head.signed_root, block_class)
    else:
        return tuple(), tuple()
def polygon_edges(polygon):
    return sliding_window(2, polygon)
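# A minimal check of polygon_edges, assuming toolz's sliding_window is imported as above.
square = [(0, 0), (1, 0), (1, 1), (0, 1)]
edges = list(polygon_edges(square))
# -> [((0, 0), (1, 0)), ((1, 0), (1, 1)), ((1, 1), (0, 1))]
# Note that the closing edge back to the first vertex is not produced; append the
# first point to the sequence if a closed ring is needed.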