def extract_data_from_feature_bundle_lists(self):
    """
    Classify the rule and cache the data derived from its feature bundle lists.

    Sets ``transformation_type`` from which of the target/change lists are
    non-empty and ``context_type`` from which of the left/right context lists
    are non-empty. For each non-empty target/change list, stores its first
    item and the segment symbols matching it; when at least one of the two is
    non-empty, also stores ``target_change_tuples_list``.
    """
    has_target = bool(self.target_feature_bundle_list)
    has_change = bool(self.change_feature_bundle_list)

    # keyed by (target list non-empty, change list non-empty)
    transformation_by_presence = {
        (False, False): DEGENERATE,
        (False, True): INSERTION,
        (True, False): DELETION,
        (True, True): ASSIMILATION,
    }
    self.transformation_type = transformation_by_presence[(has_target, has_change)]

    has_left = bool(self.left_context_feature_bundle_list)
    has_right = bool(self.right_context_feature_bundle_list)

    # keyed by (left context non-empty, right context non-empty)
    context_by_presence = {
        (False, False): NO_CONTEXT,
        (False, True): RIGHT_CONTEXT_ONLY,
        (True, False): LEFT_CONTEXT_ONLY,
        (True, True): BOTH_CONTEXTS,
    }
    self.context_type = context_by_presence[(has_left, has_right)]

    if has_target:
        self.target_features = self.target_feature_bundle_list.get_first_item()
        self.target_segments = SegmentTable().get_segments_symbols_by_features(
            self.target_features)

    if has_change:
        self.change_features = self.change_feature_bundle_list.get_first_item()
        self.change_segments = SegmentTable().get_segments_symbols_by_features(
            self.change_features)

    if has_target or has_change:
        self.target_change_tuples_list = self._get_target_change_tuples_list()
def devoicer(self, words):
    """
    Devoice the last segment of every word, modifying ``words`` in place.

    For each word, the features of its final symbol are copied, the ``voice``
    feature is set to ``'-'``, and the final symbol is replaced by the symbol
    the segment table returns for the resulting features. A word is left
    unchanged when the table returns no symbol for those features.

    Empty words are skipped: they have no final symbol to devoice.

    :param words: mutable list of word strings
    :return: None
    """
    for i, word in enumerate(words):
        if not word:
            # word[-1] would raise IndexError on an empty word
            continue
        segment = SegmentTable().get_segment_by_symbol(word[-1])
        new_features = deepcopy(segment.features)
        new_features[Feature('voice', ('+', '-'))] = '-'
        new_c = SegmentTable().get_segment_symbol_by_features(new_features)
        if new_c:
            words[i] = word[:-1] + new_c
def initialise_simulation(self, simulation):
    """
    Prepare the test environment for the given simulation.

    Stores the simulation on the instance, flushes the cache, loads the
    simulation's configuration (then replaces the configuration dict with a
    deep copy of itself) and loads the simulation's segment table file.

    :param simulation: object exposing ``segment_table_file_name``
    """
    self.simulation = simulation
    Cache.get_cache().flush()

    self.configurations.load_configuration_for_simulation(simulation)
    copied_dict = deepcopy(self.configurations.configurations_dict)
    self.configurations.configurations_dict = copied_dict

    SegmentTable.load(
        join(segment_table_dir_path, simulation.segment_table_file_name))
def get_prologue_inverse_transducer():
    """
    Build a single-state transducer that copies segments and drops brackets.

    State 0 is final and carries two kinds of self-loops: every segment symbol
    of the segment table maps to itself, and every symbol in ``BRACKETS`` maps
    to ``EPSILON``.

    :return: the constructed transducer
    """
    symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    for segment in set(SegmentTable().get_segments_symbols()):
        transducer.add_arc(0, 0, segment, segment)

    for bracket in BRACKETS:
        transducer.add_arc(0, 0, bracket, EPSILON)

    transducer[0].final = True
    return transducer
def devoice(self, words):
    """
    Devoice the last segment of every word, modifying ``words`` in place.

    For each word, the features of its final symbol are copied, the ``voice``
    feature is set to ``'-'``, and the final symbol is replaced by the symbol
    the segment table returns for the resulting features. A word is left
    unchanged when the table returns no symbol for those features.

    Empty words are skipped: they have no final symbol to devoice.

    :param words: mutable list of word strings
    :return: None
    """
    for i, word in enumerate(words):
        if not word:
            # word[-1] would raise IndexError on an empty word
            continue
        segment = SegmentTable().get_segment_by_symbol(word[-1])
        new_features = deepcopy(segment.features)
        new_features[Feature('voice', ('+', '-'))] = '-'
        new_c = SegmentTable().get_segment_symbol_by_features(new_features)
        if new_c:
            words[i] = word[:-1] + new_c
def _add_feature(self):
    """
    Try to add one random feature, with a random value, to this bundle.

    :return: True when a feature was added; False when the bundle is a word or
             morpheme boundary (such a feature must be standalone) or when the
             segment table offers no available feature.
    """
    if self.morpheme_boundary or self.word_boundary:
        return False  # WB or MB feature must be standalone

    candidate = SegmentTable().get_random_available_feature(
        self.feature_dict.keys())
    if not candidate:
        return False

    self.feature_dict[candidate] = candidate.get_random_value()
    return True
def get_all_outputs(self):
    """
    Collect the output string of every path of this object's transducer.

    Each path's output labels are translated through the segment table's
    transducer symbol table; epsilon, morpheme-boundary and word-boundary
    symbols are left out of the resulting string.

    :return: list with one output string per path
    """
    transducer = self.get_transducer()
    symbol_table = SegmentTable().transducer_symbol_table

    def path_output(path):
        # concatenate the path's output symbols, skipping epsilon and boundaries
        kept = []
        for arc in path:
            symbol = symbol_table.find(arc.olabel)
            if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                kept.append(symbol)
        return "".join(kept)

    return [path_output(path) for path in transducer.paths()]
def apply_noise(self, words):
    """
    Devoice word-final voiced obstruents in a leading portion of ``words``.

    The first ``int(len(words) * self.noise_rate / 100)`` words are considered.
    For each of them whose final segment satisfies
    ``self._is_voiced_obstruent``, the ``voice`` feature is set to ``'-'`` on a
    copy of the segment's features and the final symbol is replaced by the
    symbol the segment table returns for the result (if any). ``words`` is
    modified in place.

    Empty words are skipped: they have no final symbol to inspect.

    :param words: mutable list of word strings
    :return: None
    """
    total_to_noise = int(len(words) * self.noise_rate / 100)
    segment_table = SegmentTable()
    for i, word in enumerate(words[:total_to_noise]):
        if not word:
            # word[-1] would raise IndexError on an empty word
            continue
        segment = segment_table.get_segment_by_symbol(word[-1])
        if not self._is_voiced_obstruent(segment):
            continue
        new_features = deepcopy(segment.features)
        new_features[Feature('voice', ('+', '-'))] = '-'
        new_c = segment_table.get_segment_symbol_by_features(new_features)
        if new_c:
            words[i] = word[:-1] + new_c
def get_all_outputs(self, with_noise=True):
    """
    Collect the output string of every path of this object's transducer.

    The transducer is obtained with the given ``with_noise`` flag and is passed
    through ``self.minimize_transducer`` when
    ``configurations["MINIMIZE_TRANSDUCER"]`` is truthy. Output labels are
    translated through the segment table's transducer symbol table; epsilon,
    morpheme-boundary and word-boundary symbols are left out of the strings.

    :param with_noise: forwarded to ``self.get_transducer``
    :return: list with one output string per path
    """
    transducer = self.get_transducer(with_noise=with_noise)
    if configurations["MINIMIZE_TRANSDUCER"]:
        transducer = self.minimize_transducer(transducer)
    symbol_table = SegmentTable().transducer_symbol_table

    def path_output(path):
        # concatenate the path's output symbols, skipping epsilon and boundaries
        kept = []
        for arc in path:
            symbol = symbol_table.find(arc.olabel)
            if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                kept.append(symbol)
        return "".join(kept)

    return [path_output(path) for path in transducer.paths()]
def get_transducer_outputs(transducer, limit=float("inf")):
    """
    Collect the output strings of the transducer's paths, at most ``limit``.

    Output labels are translated through the segment table's transducer symbol
    table and epsilon symbols are dropped.

    The previous implementation stopped only once the number of collected
    outputs exceeded ``limit``, so it returned ``limit + 1`` strings; the
    bound is now enforced before each append.

    :param transducer: transducer exposing ``paths()``
    :param limit: maximum number of outputs to return (unbounded by default)
    :return: list of output strings, one per visited path
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    outputs = []
    for path in transducer.paths():
        if len(outputs) >= limit:
            break
        symbols = (transducer_symbol_table.find(arc.olabel) for arc in path)
        outputs.append("".join(symbol for symbol in symbols if symbol != u"\u03b5"))
    return outputs
def get_random_feature_bundle(cls, role, boundary_position=False):
    """
    Create a feature bundle holding a single randomly chosen feature.

    The candidates are the segment table's features. The word boundary and
    morpheme boundary feature names are added as candidates when their
    configuration flag is set, the role is a context role and
    ``boundary_position`` is true. A chosen boundary name gets a random
    boolean value; a chosen ``Feature`` gets one of its own random values and
    is stored under its name.

    :param role: 'target', 'change', 'left_context', 'right_context'
    :param boundary_position: whether feature bundle is first or last in left
                              or right context, respectively
    :return: new instance built from the one-entry feature dict and ``role``
    """
    candidates = list(SegmentTable().features.keys())

    optional_boundaries = (
        ('WORD_BOUNDARY_FLAG', WORD_BOUNDARY_FEATURE_NAME),
        ('MORPHEME_BOUNDARY_FLAG', MORPHEME_BOUNDARY_FEATURE_NAME),
    )
    for flag_key, boundary_name in optional_boundaries:
        if configurations[flag_key] and FeatureBundle._is_context_role(role) and boundary_position:
            candidates.append(boundary_name)

    feature_dict = {}
    picked = choice(candidates)
    if isinstance(picked, Feature):
        value = picked.get_random_value()
        feature_dict[picked.name] = value
    elif picked == WORD_BOUNDARY_FEATURE_NAME:
        feature_dict[WORD_BOUNDARY_FEATURE_NAME] = choice([True, False])
    elif picked == MORPHEME_BOUNDARY_FEATURE_NAME:
        feature_dict[MORPHEME_BOUNDARY_FEATURE_NAME] = choice([True, False])

    return cls(feature_dict, role)
def get_replace_transducer(self):
    """
    Build the replace transducer of this rule.

    An inner two-state transducer maps every (target, change) segment pair
    from state 0 to the final state 1 and passes the two center brackets
    through unchanged on both states. It is wrapped between
    ``left_bracket_transducer`` and ``right_bracket_transducer`` and handed to
    ``add_opt``. The wrapped part is then concatenated after an identity
    transducer for sigma-star (built so that it ignores the two identity
    brackets), and the closure of that concatenation is returned.

    :return: the closure transducer
    """
    symbol_table = SegmentTable().transducer_symbol_table
    inner = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    for target_segment, change_segment in self.target_change_tuples_list:
        inner.add_arc(0, 1, target_segment, change_segment)
    inner[1].final = True

    # center brackets are copied through on both states of the inner transducer
    for bracket in (LEFT_CENTER_BRACKET, RIGHT_CENTER_BRACKET):
        inner.add_arc(0, 0, bracket, bracket)
        inner.add_arc(1, 1, bracket, bracket)

    opt_part = left_bracket_transducer + inner + right_bracket_transducer
    add_opt(opt_part)

    sigma_star_regex = "({})*".format("+".join(self.alphabet))
    sigma_star_dfa = get_dfa_from_regex(sigma_star_regex, sigma=self.alphabet)

    identity_brackets = {LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET}
    ignoring_identity_dfa = get_ignore_dfa(
        self.alphabet | identity_brackets, sigma_star_dfa, identity_brackets)
    id_sigma_star = pyfst_from_dfa(ignoring_identity_dfa)

    return (id_sigma_star + opt_part).closure()
def get_random_feature_bundle_list(cls, is_one_item_list, role=None):
    """
    Create a feature bundle list with random content.

    With probability ``1 / <number of features in the segment table>`` the
    list is empty. Otherwise it holds between 1 and the maximum number of
    random bundles; the maximum is 1 for a one-item list and
    ``configurations["MAX_FEATURE_BUNDLE_IN_CONTEXT"]`` otherwise. The first
    bundle of a left context and the last bundle of a right context are
    generated as boundary positions.

    :param is_one_item_list: whether the list may hold at most one bundle
    :param role: 'target', 'change', 'left_context', 'right_context'
    :return: new instance of ``cls``
    """
    # NOTE(review): under Python 2 without ``from __future__ import division``
    # this quotient would be integer division - confirm the interpreter version.
    empty_probability = 1 / len(SegmentTable().features)
    if random() < empty_probability:
        return cls([], is_one_item_list, role)

    upper_bound = 1 if is_one_item_list else configurations["MAX_FEATURE_BUNDLE_IN_CONTEXT"]
    bundle_count = randrange(1, upper_bound + 1)
    last_index = bundle_count - 1

    bundles = [
        FeatureBundle.get_random_feature_bundle(
            role=role,
            boundary_position=(
                (role == 'left_context' and index == 0)
                or (role == 'right_context' and index == last_index)
            ),
        )
        for index in range(bundle_count)
    ]
    return cls(bundles, is_one_item_list, role)
def test_(self):
    """
    Compute the energy of three simulation cases over the same data.

    The cases are: a hand-written stem + suffix HMM (its energy is stored in
    ``self.target_energy``), an HMM whose single state emits every segment
    symbol, and an HMM whose single state emits a copy of ``self.data``.
    """
    self.target_energy = None

    stems = ['abberation', 'abbreviate', 'abolitionist', 'abortion', 'absence',
             'abstractionist', 'abutment', 'accent', 'acclaim', 'accolade',
             'accommodate', 'accommodation', 'accomodation', 'achiev', 'add',
             'administer', 'advertis', 'afford', 'aggravate', 'alert', 'amount',
             'announc', 'appeal', 'applaud', 'apprentice', 'arcade', 'arrest',
             'assault', 'assum', 'astound', 'attack', 'attempt', 'back', 'bak',
             'balance', 'barbecue', 'bath', 'beckon', 'benefit', 'blast',
             'blend', 'bless', 'blister', 'bloom', 'blow', 'boast', 'bogey',
             'boil', 'bolster', 'bomb', 'borrow', 'bother', 'brac', 'breakfast',
             'broadcast', 'broaden', 'bruise', 'buffet', 'burden', 'catalogue',
             'cater', 'challeng', 'chang', 'charg', 'charm', 'compris',
             'conced', 'conclud', 'condition', 'consum', 'costume', 'deal',
             'decid', 'demand', 'describ', 'down', 'draw', 'drink', 'dwell',
             'enforc', 'farm', 'feed', 'feel', 'flow', 'gaz', 'glaz', 'invad',
             'liv', 'pac']
    suffixes = ['ing', 'e', 'd', 'ed', 's', 'es', 'er']

    target_hmm = {
        'q0': ['q1'],
        'q1': (['q2', 'qf'], stems),
        'q2': (['qf'], suffixes),
    }
    target_case = SimulationCase("target", target_hmm, [])
    self.target_energy = self.get_energy(target_case)

    initial_hmm = {
        'q0': ['q1'],
        'q1': (['q1', 'qf'], SegmentTable().get_segments_symbols()),
    }
    initial_case = SimulationCase("initial", initial_hmm, [])
    self.get_energy(initial_case)

    rote_hmm = {
        'q0': ['q1'],
        'q1': (['q1', 'qf'], self.data[:]),
    }
    rote_case = SimulationCase("rote_learning", rote_hmm, [])
    self.get_energy(rote_case)
def is_valid(self):
    """
    Check that every feature bundle of the list matches some segment.

    :return: False as soon as the segment table returns no symbols for a
             bundle (the bundle represents no symbol), True otherwise
    """
    return all(
        SegmentTable().get_segments_symbols_by_features(bundle)
        for bundle in self.feature_bundle_list
    )
def setUp(self):
    """
    Load the plural-English segment table and derive the rule symbol length.

    The length is the ceiling of log2 of the number of features plus 6.
    """
    self.initialise_segment_table("plural_english_segment_table.txt")
    feature_count = len(SegmentTable().features)
    # + 6: five for the 3 delimiters (feature, bundle, rule part), the plus
    # sign and the minus sign, and 1 for kleene
    encoding_symbol_count = feature_count + 6
    self.rule_symbol_length = ceil(log(encoding_symbol_count, 2))
def get_transducer_acceptor(string_):
    """
    Build a linear transducer that maps ``string_`` to itself.

    One arc per character leads from state ``i`` to state ``i + 1`` with the
    character as both input and output label; the state after the last
    character is marked final.

    An empty ``string_`` used to fail because the loop variable was never
    bound; it now marks state 0 as final instead.
    NOTE(review): this assumes a freshly created transducer already has a
    state 0 - confirm against the fst library.

    :param string_: iterable of symbols known to the transducer symbol table
    :return: the constructed transducer
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=transducer_symbol_table,
                                osyms=transducer_symbol_table)
    final_state = 0
    for state, char in enumerate(string_):
        transducer.add_arc(state, state + 1, char, char)
        final_state = state + 1
    transducer[final_state].final = True
    return transducer
def get_context_string_options(context_features):
    """
    Look up the segment symbols matching each feature bundle of a context.

    :param context_features: List of feature bundles
    :return: List of lists of segment symbols matching each feature bundle:
             [[s1, s2, s3], ...]
    """
    return [
        SegmentTable().get_segments_symbols_by_features(bundle)
        for bundle in context_features
    ]
def test_change_segment_in_emission(self):
    """
    Exercise ``HMM.change_segment_in_emission`` on a small two-state HMM.

    The HMM is written to a dot file before the change and all of its
    emissions are printed afterwards.
    """
    self.initialise_segment_table("plural_english_segment_table.txt")

    transitions_and_emissions = {
        INITIAL_STATE: ['q1'],
        'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
        'q2': ([FINAL_STATE], ['z']),
    }
    hmm = HMM(transitions_and_emissions)
    self.write_to_dot_to_file(hmm, "hmm")

    segment_symbols = SegmentTable().get_segments_symbols()
    hmm.change_segment_in_emission(segment_symbols)
    print(hmm.get_all_emissions())
def get_from_pyfst_transducer(cls, transducer):
    """
    Convert a transducer into a ``ParsingNFA`` over its output symbols.

    State names are the numbers parsed from each state's string form. For
    every arc the output symbol (epsilon is replaced by ``NULL_SEGMENT``) is
    recorded in three structures on the returned NFA:

    * ``arcs_dict``: state -> output symbol -> list of next states
    * ``probabilities``: state -> list of (output symbol, next state)
    * ``transition_matrix``: [state, next state] -> output label, or
      ``NULL_SEGMENT_IDX`` for the null segment; cells without a transition
      keep ``NO_TRANSITION_IDX``

    :param transducer: source transducer
    :return: the populated ``ParsingNFA``
    """
    symbol_table = SegmentTable().transducer_symbol_table
    nfa = ParsingNFA()
    nfa.final_states = []

    arcs_by_state = {}
    emissions_by_state = {}
    state_count = len(list(transducer.states))
    transitions = np.ones((state_count, state_count)) * NO_TRANSITION_IDX

    for state in transducer:
        # get state number from the string: "<StdState #x with y arcs>"
        source = re.match(r".*#(\w*).*", str(state)).group(1)
        if state.initial:
            nfa.initial_state = source
        if state.final:
            nfa.final_states.append(source)

        for arc in state:
            target = str(arc.nextstate)
            symbol = symbol_table.find(arc.olabel)
            if symbol == u"\u03b5":
                symbol = NULL_SEGMENT

            arcs_by_state.setdefault(source, {}).setdefault(symbol, []).append(target)
            emissions_by_state.setdefault(source, []).append((symbol, target))

            if symbol == NULL_SEGMENT:
                cell_value = NULL_SEGMENT_IDX
            else:
                cell_value = arc.olabel
            transitions[int(source), int(target)] = cell_value

    nfa.arcs_dict = arcs_by_state
    nfa.probabilities = emissions_by_state
    nfa.transition_matrix = transitions
    return nfa
def get_changed_segment(self, segment_symbol, change_feature_bundle):
    """
    Applies a change of features to a given segment.

    Results are stored in the 'change_segment' cache under the concatenated
    ``repr`` of both arguments; a cached value of None is treated as a miss.

    :param segment_symbol: Target segment
    :param change_feature_bundle: Change feature bundle
    :return: String of output segment
    """
    cache_key = repr(segment_symbol) + repr(change_feature_bundle)
    result = cache.get(cache_key, 'change_segment')
    if result is None:
        original_segment = SegmentTable().get_segment_by_symbol(segment_symbol)
        features = deepcopy(original_segment.features)
        # the change bundle overrides the segment's own feature values
        features.update(change_feature_bundle.feature_dict)
        result = SegmentTable().get_segment_symbol_by_features(features)
        cache.set(cache_key, result, 'change_segment')
    return result
def get_intro_transducer(sigma, introduced_set):
    """
    Build a transducer that may insert the introduced symbols anywhere.

    The sigma transducer for ``sigma`` is united with a single-state
    transducer that outputs each introduced symbol on an epsilon input, and
    the closure of that union is returned.

    :param sigma: alphabet passed to ``get_sigma_transducer_for_intro``
    :param introduced_set: symbols that may be inserted
    :return: the closure transducer
    """
    sigma_transducer = get_sigma_transducer_for_intro(sigma)

    symbol_table = SegmentTable().transducer_symbol_table
    insertions = fst.Transducer(isyms=symbol_table, osyms=symbol_table)
    for symbol in introduced_set:
        insertions.add_arc(0, 0, EPSILON, symbol)
    insertions[0].final = True

    return (sigma_transducer | insertions).closure()
def __init__(self, hmm, rule_set=None):
    """
    :param hmm: an ``HMM`` instance, or a value from which one is constructed
    :param rule_set: rule set to use; a falsy value is replaced by a new,
                     empty ``RuleSet``
    """
    self.hmm = hmm if isinstance(hmm, HMM) else HMM(hmm)

    table_size = len(SegmentTable())
    # + 1 for the delimiter
    self.segment_symbol_length = ceil(log(table_size + 1, 2))

    self.rule_set = rule_set if rule_set else RuleSet()
def setUp(self):
    """
    Load the plural-English segment table and build its one-rule rule set.

    Stores the table name, the table's segment symbols and a rule set holding
    a single assimilation rule.
    """
    self.table = "plural_english_segment_table"
    self.initialise_segment_table("{}.txt".format(self.table))
    self.plural_english_segments = SegmentTable().get_segments_symbols()

    target = [{"cons": "+"}]
    change = [{"voice": "-"}]
    left_context = [{"voice": "-"}]
    right_context = []
    assimilation_rule = Rule(target, change, left_context, right_context, True)
    self.plural_english_rule_set = RuleSet([assimilation_rule])
def pyfst_to_dfa(transducer, alphabet):
    """
    Convert a transducer into a DFA over its output symbols.

    An NFA is built first: each transducer state becomes an NFA state (named
    by the number parsed from the state's string form, addressed by its
    position in the state list), and each arc becomes a transition on the
    arc's output symbol, with epsilon mapped to ``FAdo.common.Epsilon``. The
    NFA is then determinised with ``toDFA()``.

    :param transducer: source transducer
    :param alphabet: value assigned to the NFA's ``Sigma``
    :return: the resulting DFA
    """
    symbol_table = SegmentTable().transducer_symbol_table

    def state_name(state):
        # get state number from the string: "<StdState #x with y arcs>"
        return re.match(r".*#(\w*).*", str(state)).group(1)

    nfa = NFA()
    nfa.Sigma = alphabet
    nfa.Initial = set()
    States = []
    delta = {}

    # first pass: register every state so arcs can refer to any of them
    for state in transducer:
        name = state_name(state)
        States.append(name)
        index = States.index(name)
        if state.initial:
            nfa.Initial.add(index)
        if state.final:
            nfa.Final.add(index)

    # second pass: translate the arcs into NFA transitions
    for state in transducer:
        source = States.index(state_name(state))
        for arc in state:
            target = States.index(str(arc.nextstate))
            symbol = symbol_table.find(arc.olabel)
            if symbol == u"\u03b5":
                symbol = FAdo.common.Epsilon
            delta.setdefault(source, {}).setdefault(symbol, set()).add(target)

    nfa.delta = delta
    nfa.States = States
    return nfa.toDFA()
def __init__(self, hmm, rule_set=None):
    """
    :param hmm: an ``HMM`` instance, or a value from which one is constructed
    :param rule_set: rule set to use; a falsy value is replaced by a new
                     ``RuleSet(noise=False)``

    The noise rule set is loaded from the "NOISE_RULE_SET" configuration
    entry (an empty list when absent). The three transducer caches start out
    as None.
    """
    self.hmm = hmm if isinstance(hmm, HMM) else HMM(hmm)

    table_size = len(SegmentTable())
    # + 1 for the delimiter
    self.segment_symbol_length = uniform_encoding.log2(table_size + 1)

    self.rule_set = rule_set if rule_set else RuleSet(noise=False)

    noise_rules = configurations.get("NOISE_RULE_SET", [])
    self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noise_rules)

    self._cached_hmm_transducer = None
    self._cached_rule_set_transducer = None
    self._cached_noise_rule_set_transducer = None
def pyfst_from_dfa(dfa):
    """
    Convert a DFA into an identity transducer.

    Every DFA transition becomes an arc with the transition symbol as both
    input and output label. DFA states keep their indices as transducer
    states; final states and the initial state are marked accordingly.

    :param dfa: DFA exposing ``States``, ``delta``, ``Final`` and ``Initial``
    :return: the constructed transducer
    """
    symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    # DFA state index -> transducer state (same numbering)
    state_map = {index: index for index in range(len(dfa.States))}

    for source, transitions in dfa.delta.items():
        for segment, target in transitions.items():
            transducer.add_arc(state_map[source], state_map[target],
                               segment, segment)

    for final_state in dfa.Final:
        transducer[state_map[final_state]].final = True

    transducer[state_map[dfa.Initial]].initial = True
    return transducer
def __init__(self, rules=None, noise=False):
    """
    :param rules: list of rules; a falsy value is replaced by a new empty list
    :param noise: whether this is a noise rule set; every rule's ``noise``
                  attribute must equal it
    :raises ValueError: when a rule's ``noise`` attribute differs from
                        ``noise``

    Also computes ``rule_symbol_length`` from the number of symbols needed to
    encode a rule.
    """
    self.rules = rules if rules else []

    for rule in self.rules:
        if rule.noise != noise:
            raise ValueError(
                "Non-noise-rule in a noise-rule-set" if noise
                else "Noise-rule in a non-noise-rule-set")

    # +5 for 3 delimiters (feature, bundle, rule part), plus sign, and minus sign
    encoding_symbol_count = len(SegmentTable().features) + 5

    # each enabled option adds one more symbol to the encoding alphabet
    for flag_key in ('WORD_BOUNDARY_FLAG', 'MORPHEME_BOUNDARY_FLAG', 'CHANGE_KLEENE_VALUE'):
        if configurations[flag_key]:
            encoding_symbol_count += 1

    self.rule_symbol_length = uniform_encoding.log2(encoding_symbol_count)
def __init__(self, feature_string_dict, role=None):
    """
    Build a feature bundle from a dictionary of feature names to values.

    A dictionary containing the word boundary name (checked first) or the
    morpheme boundary name yields a boundary bundle with an empty
    ``feature_dict``; the boundary flag is set only when the stored value is
    truthy and this is a context bundle. Otherwise every entry is either the
    kleene marker (honoured only for context bundles when
    ``configurations['CHANGE_KLEENE_VALUE']`` is set) or a feature that must
    be valid in the segment table.

    The kleene marker is matched with ``==``; the former identity test
    (``is``) only matched when the key happened to be the very same string
    object as the constant.

    :param feature_string_dict: dictionary of form {"cons": "+", "WB": True}
    :param role: "target", "change", "left_context", or "right_context"
    :raises ValueError: when a feature name is not valid in the segment table
    """
    feature_dict = dict()
    self.role = role
    self.kleene = False
    self.word_boundary = False
    self.morpheme_boundary = False

    if WORD_BOUNDARY_FEATURE_NAME in feature_string_dict:
        if feature_string_dict[WORD_BOUNDARY_FEATURE_NAME] and self._is_context_bundle():
            self.word_boundary = True
    elif MORPHEME_BOUNDARY_FEATURE_NAME in feature_string_dict:
        if feature_string_dict[MORPHEME_BOUNDARY_FEATURE_NAME] and self._is_context_bundle():
            self.morpheme_boundary = True
    else:
        for feature_name in feature_string_dict:
            if feature_name == KLEENE_FEATURE_NAME:
                if self._is_context_bundle() and configurations['CHANGE_KLEENE_VALUE']:
                    self.kleene = feature_string_dict[feature_name]
            else:
                feature = Feature(feature_name)
                if not SegmentTable().is_valid_feature(feature):
                    raise ValueError(
                        u"{} not in segment_table".format(feature_name))
                feature_dict[feature] = feature_string_dict[feature_name]

    self.feature_dict = feature_dict
def __init__(self, rule):
    """
    Copy all instance attributes of ``rule`` and add the segment alphabet.

    :param rule: object whose ``__dict__`` is copied onto this instance
    """
    rule_state = rule.__dict__
    self.__dict__.update(rule_state)

    segment_symbols = SegmentTable().get_segments_symbols()
    self.alphabet = set(segment_symbols)