def _compute_fst_results_for_comparison(comparison_key, cohort_a, cohort_b,
                                        otu_defs, geno_samples, data_dir,
                                        file_owner):
    """Write the FST input files for one comparison and run the FST on them.

    The classification and feature-metadata files are dumped into
    ``data_dir`` first; an FST configuration is then built (the debugging
    variant when ``DEBUGGING_CONFIG`` is truthy), the scratch directory is
    created, and ``FST.refuse()`` is invoked.  Nothing is returned.
    """
    fst_input_etl.dump_comparison_classification(
        comparison_key, cohort_a, cohort_b, geno_samples, otu_defs, data_dir,
        file_owner=file_owner)
    fst_input_etl.dump_feature_metadata(
        otu_defs, data_dir, file_owner=file_owner)

    # Pick the configuration factory once, then call it with the shared args.
    build_config = (fst_input_etl._make_debugging_config
                    if DEBUGGING_CONFIG
                    else fst_input_etl._make_fst_config)
    config = build_config(comparison_key, data_dir, otu_defs)

    scratch_dir = dirname_of_fst_scratch(data_dir)
    file_utils.ensure_directory(scratch_dir, file_owner=file_owner)
    # joblib reads its temp location from this environment variable.
    os.environ['JOBLIB_TEMP_FOLDER'] = scratch_dir

    runner = FST(config, True)  # second argument is the "noprompt" flag
    print('begin FST.refuse(): ' + comparison_key)
    runner.refuse()
    print('end FST.refuse(): ' + comparison_key)
def fstDistance(self, data):
    """Return the pairwise FST distance matrix between the columns of data.

    ``data`` is indexed as ``data[:, k]``, so every column is treated as one
    sample profile.  Entry ``[i, j]`` of the returned square array is
    ``FST().computeDistance(column_i, column_j)``; every ordered pair is
    evaluated, including the diagonal.
    """
    transducer = FST()
    n_samples = data.shape[1]
    distances = np.zeros((n_samples, n_samples))
    # np.ndindex walks (i, j) in row-major order, like two nested loops.
    for first, second in np.ndindex(n_samples, n_samples):
        distances[first, second] = transducer.computeDistance(
            data[:, first], data[:, second])
    return distances
def french_count():
    """Return a one-state FST that maps each decimal digit to its French word.

    The single state ``'start'`` is both initial and final; for every digit
    0-9 a self-loop reads ``[str(digit)]`` and emits
    ``[kFRENCH_TRANS[digit]]``.
    """
    f = FST('french')
    f.add_state('start')
    f.initial_state = 'start'
    # range() instead of xrange(): identical iteration, and it also exists
    # on Python 3.
    for ii in range(10):
        f.add_arc('start', 'start', [str(ii)], [kFRENCH_TRANS[ii]])
    f.set_final('start')
    return f
def computeCTreeError(cMatrix, realTree):
    """Return the averaged ancestry-swap error of the tree inferred from cMatrix.

    A pairwise FST distance matrix is computed between the columns of
    ``cMatrix``, a minimum spanning tree is inferred over
    ``realTree.vertices``, and the two ancestry-swap error counts reported by
    ``computeAncestrySwapError`` are summed and divided by the number of
    sample pairs.
    """
    nSamples = cMatrix.shape[1]

    # Pairwise distances; each whole column is handed to the FST at once.
    pairwise = np.empty([nSamples, nSamples], dtype=float)
    for a, b in np.ndindex(nSamples, nSamples):
        pairwise[a, b] = FST().computeDistance(cMatrix[:, a], cMatrix[:, b])

    # Infer the tree as the MST of the fully connected distance graph.
    completeGraph = generateInitialTree(pairwise, realTree.vertices)
    inferred = computeMST(completeGraph, realTree.vertices)

    absentInInferred, presentInInferred, pairCount = computeAncestrySwapError(
        realTree, inferred)
    return (absentInInferred + presentInInferred) / float(pairCount)
def run_experiment():
    """Train the binary model on FSTs of growing size and log the curves.

    For k = 10, 20, ..., 200 an FST is loaded with
    ``load_fst_by_nodes_to_add(k)``, a dataset and activator are built, the
    activator is trained, and six rows (train/dev loss, accuracy, AUC) are
    appended to ``score.csv``.  The file is opened with a context manager so
    it is flushed and closed even if training raises.
    """
    with open("score.csv", "wt") as score:
        for k in range(1, 21):
            k *= 10
            if k == 0:
                # Not reachable with the current range; kept so the simplified
                # JSON baseline can be re-enabled by starting the range at 0.
                alphabet, states, init_state, accept_states, transitions = \
                    SIMPLIFIED_JSON_ALPHABET, SIMPLIFIED_JSON_STATES, SIMPLIFIED_JSON_INIT_STATE, \
                    SIMPLIFIED_JSON_ACCEPT_STATES, SIMPLIFIED_JSON_TRANSITIONS
            else:
                alphabet, states, init_state, accept_states, transitions = load_fst_by_nodes_to_add(k)
                # The loader returns the initial state wrapped in a sequence.
                init_state = init_state[0]

            fst = FST(alphabet, states, init_state, accept_states, transitions)
            fst_dataset = FstDataset(BinaryFSTParams(), fst=fst)

            activator_params = BinaryActivatorParams()
            activator_params.EPOCHS = 100
            activator = binaryActivator(
                BinaryModule(BinaryModuleParams(alphabet_size=len(fst_dataset.chr_embed))),
                activator_params, fst_dataset, split_fst_dataset)
            activator.train(validate_rate=10)

            # One CSV row per metric, prefixed with the experiment size.
            rows = (
                ("train_loss,", activator.loss_train_vec),
                ("train_acc,", activator.accuracy_train_vec),
                ("train_auc,", activator.auc_train_vec),
                ("dev_loss,", activator.loss_dev_vec),
                ("dev_acc,", activator.accuracy_dev_vec),
                ("dev_auc,", activator.auc_dev_vec),
            )
            for label, values in rows:
                score.write(str(k) + label + ",".join([str(v) for v in values]) + "\n")
def fst_test2(test_string):
    """Transduce test_string with an FST that removes every 'b' and print it.

    The machine has a single state ``q0`` (initial and final): 'a' is copied
    to the output and 'b' maps to the empty string.  The line printed is
    ``"<input> : <output>"``.
    """
    fst_states = ["q0"]
    fst_in_alph = ["a", "b"]
    fst_out_alph = ["a", "b"]
    fst_start = "q0"
    fst_final = ["q0"]
    # state -> input symbol -> [next state, emitted string]
    fst_trans = {
        "q0": {
            "a": ["q0", "a"],
            "b": ["q0", ""]
        },
    }
    test_fst = FST(fst_states, fst_in_alph, fst_out_alph, fst_start,
                   fst_final, fst_trans)
    # Parenthesised single expression: same output on Python 2, and valid
    # syntax on Python 3.
    print(test_string + " : " + test_fst.transduce_string(test_string))
def french_count():
    """Return a one-state FST that maps each decimal digit to its French word.

    The single state ``'start'`` is both initial and final; for every digit
    0-9 a self-loop reads ``str(digit)`` and emits
    ``[kFRENCH_TRANS[digit]]``.
    """
    f = FST('french')
    f.add_state('start')
    f.initial_state = 'start'
    # range() instead of xrange(): identical iteration, and it also exists
    # on Python 3.
    for ii in range(10):
        f.add_arc('start', 'start', str(ii), [kFRENCH_TRANS[ii]])
    f.set_final('start')
    return f
def french_count():
    """Return a one-state FST that maps each decimal digit to its French word.

    The single state ``"start"`` is both initial and final; for every digit
    0-9 a self-loop reads ``str(digit)`` and emits
    ``[kFRENCH_TRANS[digit]]``.
    """
    f = FST("french")
    f.add_state("start")
    f.initial_state = "start"
    # range() instead of xrange(): identical iteration, and it also exists
    # on Python 3.
    for ii in range(10):
        f.add_arc("start", "start", str(ii), [kFRENCH_TRANS[ii]])
    f.set_final("start")
    return f
def rand_fst(self, size_states, size_alphabet, num_accept_states):
    """Build a random FST with the requested number of states and symbols.

    Symbols are named ``sym0..`` and states ``q0..``; ``q0`` is the start
    state.  ``num_accept_states`` states are sampled without replacement and
    paired with the value 1, and every (state, symbol) pair gets exactly one
    transition to a uniformly chosen state.
    """
    symbols = ["sym" + str(idx) for idx in range(size_alphabet)]
    state_names = ["q" + str(idx) for idx in range(size_states)]

    # Accepting states are drawn first, then one random target per
    # (state, symbol) pair in state-major order.
    accepting = [(name, 1) for name in sample(state_names, num_accept_states)]
    arcs = [(src, sym, choice(state_names))
            for src in state_names
            for sym in symbols]

    return FST(symbols, set(state_names), "q0", accepting, arcs)
def fstAlleleDistance(self, data, samples):
    """Return the pairwise allele-distance matrix and the per-pair messages.

    ``data`` is indexed as ``data[:, k]`` (one column per sample) and
    ``samples[k]`` is the sample object passed alongside column ``k``.  For
    every ordered pair ``(i, j)`` the first element returned by
    ``FST.computeAlleleDistance`` is stored in ``messages[(i, j)]`` and the
    second in ``dm[i, j]``.

    Returns ``[messages, dm]`` and prints the distance matrix.
    """
    # this could have been more efficient when we pass objects
    fst = FST()
    sampleCount = data.shape[1]
    dm = np.zeros((sampleCount, sampleCount))
    messages = dict()
    for i in range(0, sampleCount):
        for j in range(0, sampleCount):
            # extract the whole copy number profile, so the whole column
            sample1Profile = data[:, i]
            sample2Profile = data[:, j]
            returnValue = fst.computeAlleleDistance(
                sample1Profile, sample2Profile, samples[i], samples[j])
            messages[(i, j)] = returnValue[0]
            dm[i, j] = returnValue[1]
    # Parenthesised single expressions: same output on Python 2, and valid
    # syntax on Python 3.
    print("distances: ")
    print(dm)
    return [messages, dm]
def letters_to_numbers():
    """
    Return the skeleton of the soundex letters-to-numbers FST.

    Only the two states are created here ('start' is initial, 'next' is
    final); no arcs are added.
    """
    soundex = FST('soundex-generate')

    for state_name in ('start', 'next'):
        soundex.add_state(state_name)
    soundex.initial_state = 'start'

    # The only accepting state
    soundex.set_final('next')
    return soundex
def fst_test(test_string):
    """Transduce test_string, replacing its first 'a' with 'ba', and print it.

    ``q0`` copies 'b' and moves to ``q1`` on the first 'a' while emitting
    'ba'; ``q1`` copies everything.  Both states are final.  The line printed
    is ``"<input> : <output>"``.
    """
    fst_states = ["q0", "q1"]
    fst_in_alph = ["a", "b"]
    fst_out_alph = ["a", "b"]
    fst_start = "q0"
    fst_final = ["q0", "q1"]
    # state -> input symbol -> [next state, emitted string]
    fst_trans = {
        "q0": {
            "a": ["q1", "ba"],
            "b": ["q0", "b"]
        },
        "q1": {
            "a": ["q1", "a"],
            "b": ["q1", "b"]
        },
    }
    test_fst = FST(fst_states, fst_in_alph, fst_out_alph, fst_start,
                   fst_final, fst_trans)
    # Parenthesised single expression: same output on Python 2, and valid
    # syntax on Python 3.
    print(test_string + " : " + test_fst.transduce_string(test_string))
def add_zero_padding():
    """Return the skeleton of the soundex zero-padding FST.

    Two states are created ('1' is initial, '2' is final); no arcs are added.
    """
    padder = FST('soundex-padzero')

    for state_name in ('1', '2'):
        padder.add_state(state_name)

    padder.initial_state = '1'
    padder.set_final('2')
    return padder
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    As written this machine has a single state '1' (initial and final) with
    an identity self-loop for every ASCII letter and every digit, so it
    copies its input unchanged.
    """
    f2 = FST('soundex-truncate')

    # Indicate initial and final states
    f2.add_state('1')
    f2.initial_state = '1'
    f2.set_final('1')

    # Add the arcs.  string.ascii_letters is used rather than string.letters:
    # the latter is locale-dependent on Python 2 and does not exist on
    # Python 3.
    for letter in string.ascii_letters:
        f2.add_arc('1', '1', (letter), (letter))
    for n in range(10):
        f2.add_arc('1', '1', str(n), str(n))

    return f2
def fst_from_prohibited_string(input_alphabet, output_alphabet, banned_string,
                               violation_name):
    """Build an FST that charges ``violation_name`` once per ``banned_string``.

    States are the proper prefixes of ``banned_string`` (starting from the
    empty string).  Advancing along the banned string uses alternation arcs;
    completing it adds an arc carrying ``Counter({violation_name: 1})`` to the
    longest suffix that is a state.  Every state also gets an elision arc per
    input character and fallback arcs for output characters that do not
    already leave it.
    """
    machine = FST({""}, "", set(), set(), "")

    # Chain of prefix states: "" -> b[0:1] -> b[0:2] -> ...
    for end in range(1, len(banned_string)):
        prefix = banned_string[:end]
        machine.states.add(prefix)
        add_alternation_arcs(machine, banned_string[:end - 1], prefix, '_',
                             banned_string[end - 1])

    # Send a penalty arc to the longest valid suffix
    penalty_source = banned_string[:-1]
    penalty_target = longest_suffix(banned_string, machine.states)
    penalty_label = Label('_', banned_string[-1],
                          Counter({violation_name: 1}))
    machine.arcs.add(Arc(penalty_source, penalty_target, penalty_label))

    # Add loopback arcs and return
    for state in machine.states:
        for symbol in input_alphabet:
            add_elision_arc(machine, state, symbol)
        for symbol in machine.chars_not_leaving(state, output_alphabet):
            fallback = longest_suffix(state + symbol, machine.states)
            add_alternation_arcs(machine, state, fallback, '_', symbol)
    return machine
def generate(self, analysis):
    """Generate the morphologically correct word

    e.g.
    p = Parser()
    analysis = ['p','a','n','i','c','+past form']
    p.generate(analysis) ---> 'panicked'

    Placeholder: an FST is instantiated but not used, and the result is the
    fixed string from the example regardless of ``analysis``.
    """
    # Let's define our first FST
    generator = FST('morphology-generate')

    pieces = ('p', 'a', 'n', 'i', 'c', 'k', 'e', 'd')
    return ''.join(pieces)
def parse(self, word):
    """Parse a word morphologically

    e.g.
    p = Parser()
    word = ['p','a','n','i','c','k','i','n','g']
    p.parse(word) ---> 'panic+present participle form'

    Placeholder: an FST is instantiated but not used, and the result is the
    fixed string from the example regardless of ``word``.
    """
    # Ok so now let's do the second FST
    parser_fst = FST('morphology-parse')

    pieces = ('p', 'a', 'n', 'i', 'c', '+present participle form')
    return ''.join(pieces)
def computeATreeError(aMatrix, lafMatrix, afMatrix, realTree):
    """Score the tree inferred from allele calls against realTree.

    Each entry of ``aMatrix`` is a string whose 'A' and 'B' characters are
    counted to build an ``Alleles`` object.  Pairwise allele distances between
    the columns are computed with the FST (each column accompanied by a dummy
    ``Sample`` carrying LAF and AF measurements), an MST is built over
    ``realTree.vertices``, and the value returned is
    ``SimulationErrorHandler().computeTreeError([mst], realTree)``.
    """
    nSamples = aMatrix.shape[1]

    # Convert the a matrix to an actual allele matrix
    alleleObjects = np.empty(aMatrix.shape, dtype=object)
    for r in range(aMatrix.shape[0]):
        for c in range(aMatrix.shape[1]):
            text = aMatrix[r][c]
            countA = len(re.findall('A', text))
            countB = len(re.findall('B', text))
            alleleObjects[r][c] = Alleles(countA, countB)

    distances = np.empty([nSamples, nSamples], dtype=float)
    chromosomes, positions, segmentation, _chromosomeArms = parseReferenceFile()

    def makeDummySample(column):
        # make a dummy sample object for the FST function
        dummy = Sample(None, None)
        dummy.measurements = LAF(lafMatrix[:, column], chromosomes, positions,
                                 positions)
        dummy.measurements.segmentation = segmentation
        dummy.afMeasurements = afMatrix[:, column]
        return dummy

    # Compute the distance pairwise between samples
    for first in range(nSamples):
        for second in range(nSamples):
            firstSample = makeDummySample(first)
            secondSample = makeDummySample(second)
            # The distance can be computed for the entire column at once
            _messages, distance = FST().computeAlleleDistance(
                alleleObjects[:, first], alleleObjects[:, second],
                firstSample, secondSample)
            distances[first, second] = distance

    # Compute the MST
    completeGraph = generateInitialTree(distances, realTree.vertices)
    spanningTree = computeMST(completeGraph, realTree.vertices)
    return SimulationErrorHandler().computeTreeError([spanningTree], realTree)
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    Skeleton only: a single state '1' that is both initial and final, with
    no arcs.
    """
    truncator = FST('soundex-truncate')

    only_state = '1'
    truncator.add_state(only_state)
    truncator.initial_state = only_state
    truncator.set_final(only_state)

    return truncator
def add_zero_padding():
    """Build the soundex zero-padding FST (the third FST of the pipeline).

    Three states are created here: the initial state and one branch each for
    inputs that begin with a number or with a letter.  The rest of each branch
    is delegated to ``build_letter_first`` / ``build_number_first``, which
    receive the shared list of epsilon state names.  No final state is set in
    this function - presumably the helpers do that; confirm against them.
    """
    initial = 'start'
    digits_only = 'just_numbers'
    after_letter = 'letter_first'
    epsilon_names = ['e0', 'e1', 'e2', 'e3', 'e4', 'e5']

    padder = FST('soundex-padzero')
    for state_name in (initial, digits_only, after_letter):
        padder.add_state(state_name)
    padder.initial_state = initial

    # Leaving the initial state: digit arcs first, then one identity arc per
    # ASCII letter.
    add_numbers(padder, initial, digits_only)
    for ch in string.ascii_letters:
        padder.add_arc(initial, after_letter, ch, ch)

    build_letter_first(padder, epsilon_names, after_letter)
    build_number_first(padder, epsilon_names, digits_only)
    return padder
def fst_intersect(m, n):
    """Build the product FST of m and n, restricted to reachable states.

    For every pair of transitions ``x -> y`` in ``m`` and ``z -> w`` in ``n``
    (as enumerated by ``states_mega_product``), compatible label pairs from
    ``similar_labels_between`` are merged into a single arc from
    ``product_state(x, z)`` to ``product_state(y, w)`` whose violation is
    ``otimes(k.violation, l.violation)``.  States not reachable from the
    product of the two start states are discarded.

    NOTE(review): the nesting of the if/elif chain below was reconstructed
    from a flattened source line - confirm against the original file.
    """
    arcs = set()
    # Outgoing arcs per product state, used later for the reachability walk.
    state_lookup = dict(
        (product_state(p, q), set()) for p, q in states_set_product(m, n))
    start = product_state(m.start, n.start)

    # Compute arcs for each state pair
    for ((x, y), (z, w)) in states_mega_product(m, n):
        labels_lists = similar_labels_between(m, x, y, n, z, w)
        # Inputs already given an empty-output arc for this state pair.
        elision_arcs = set()
        for labels_list in labels_lists:
            # Inputs already given an arc within this labels_list.
            arcs_by_input = set()
            for (k, l) in labels_list:
                add_arc = False
                seg = ''
                if k.output == '':
                    # Faithfulness constraint; cares about input
                    if k.input == l.input:
                        if k.input not in elision_arcs:
                            add_arc = True
                            seg = k.input
                            elision_arcs.add(seg)
                elif ((k.input == '_') or
                      (k.input == l.input)) and (l.input not in arcs_by_input):
                    # Markedness constraint
                    add_arc = True
                    seg = l.input
                    arcs_by_input.add(seg)
                elif (l.input == '_') and (k.input not in arcs_by_input):
                    # Markedness constraint
                    add_arc = True
                    seg = k.input
                    arcs_by_input.add(seg)
                if add_arc:
                    intersection_arc = Arc(
                        product_state(x, z), product_state(y, w),
                        Label(seg, k.output, otimes(k.violation, l.violation)))
                    arcs.add(intersection_arc)
                    state_lookup[intersection_arc.start].add(intersection_arc)

    # Figure out the states reachable from the start
    fst_states = traverse_states(state_lookup, start)
    # NOTE(review): on Python 3 filter() returns a lazy iterator, not a list -
    # confirm the FST constructor accepts that.
    fst = FST(fst_states, start, fst_states,
              filter((lambda arc: arc.start in fst_states), arcs), 1)
    return fst
def computeCTreeError(cMatrix, realTree):
    """Score the tree inferred from cMatrix against realTree.

    A pairwise FST distance matrix is computed between the columns of
    ``cMatrix``, an MST is built over ``realTree.vertices``, and the value
    returned is ``SimulationErrorHandler().computeTreeError([mst], realTree)``.
    """
    nSamples = cMatrix.shape[1]

    # Pairwise distances; each whole column is handed to the FST at once.
    pairwise = np.empty([nSamples, nSamples], dtype=float)
    for a, b in np.ndindex(nSamples, nSamples):
        pairwise[a, b] = FST().computeDistance(cMatrix[:, a], cMatrix[:, b])

    # Compute the MST
    completeGraph = generateInitialTree(pairwise, realTree.vertices)
    spanningTree = computeMST(completeGraph, realTree.vertices)

    return SimulationErrorHandler().computeTreeError([spanningTree], realTree)
def generate(self, analysis):
    """Generate the morphologically correct word

    e.g.
    p = Parser()
    analysis = ['p','a','n','i','c','+past form']
    p.generate(analysis) ---> 'panicked'

    A fresh FST is built for every call by ``self._build_generator_fst``; the
    first transduction of ``analysis`` is joined into the returned string.
    """
    initial = 'start'
    generator = FST('generator')
    generator.add_state(initial)
    generator.initial_state = initial

    self._build_generator_fst(generator, analysis, initial)

    first_result = generator.transduce(analysis)[0]
    return ''.join(first_result)
def add_zero_padding():
    """Build the soundex zero-padding FST (the third FST of the pipeline).

    State '1' copies letters; each non-zero digit then advances
    '1' -> '2' -> '3' -> '4'.  If the input ends after one or two digits, an
    epsilon arc pads with '00' or '0' respectively to reach the final
    state '4'.

    NOTE(review): the two epsilon arcs are placed after the digit loop; the
    flattened source does not show their nesting - confirm.
    """
    f3 = FST('soundex-padzero')

    states = ['1', '2', '3', '4']
    for state in states:
        f3.add_state(state)

    f3.initial_state = '1'
    f3.set_final('4')

    # string.ascii_letters rather than string.letters: the latter is
    # locale-dependent on Python 2 and does not exist on Python 3.
    for letter in string.ascii_letters:
        f3.add_arc('1', '1', letter, letter)

    for number in range(1, 10):
        f3.add_arc('1', '2', str(number), str(number))
        f3.add_arc('2', '3', str(number), str(number))
        f3.add_arc('3', '4', str(number), str(number))

    # Epsilon-input arcs that supply the missing zeros.
    f3.add_arc('2', '4', (), '00')
    f3.add_arc('3', '4', (), '0')
    return f3
def french_count():
    """Return an FST that spells a three-digit number in French.

    States '0'-'9' are reached after the hundreds digit, '10'-'19' after the
    tens digit and '20'-'29' (all final) after the units digit, so state
    ``str(10 + t)`` means "tens digit t was read".  The extra final state 'z'
    is reached from 'start' on the symbol 'z' and emits
    ``kFRENCH_TRANS[0]``.

    Integer comparisons use ``==``; the previous ``is`` comparisons only
    worked because CPython caches small integers.
    """
    f = FST('french')
    f.add_state('start')
    f.add_state('z')
    for i in range(30):
        f.add_state(str(i))
    f.initial_state = 'start'
    for i in range(20, 30):
        f.set_final(str(i))
    f.set_final('z')
    f.add_arc('start', 'z', ['z'], [kFRENCH_TRANS[0]])

    # Hundreds digit, then the hundreds word emitted while reading the tens
    # digit: nothing for 0, "cent" for 1, "<digit> cent" otherwise.
    for i in range(10):
        f.add_arc('start', str(i), [str(i)], [])
        for j in range(10, 20):
            if i == 0:
                out = []
            elif i == 1:
                out = [kFRENCH_TRANS[100]]
            else:
                out = [kFRENCH_TRANS[i], kFRENCH_TRANS[100]]
            f.add_arc(str(i), str(j), [str(j - 10)], out)

    # Tens state i (10..19) and units state j (20..29).
    for i in range(10, 20):
        for j in range(20, 30):
            if i == 10:
                # Tens digit 0: just the unit word, nothing for a zero unit.
                out = [kFRENCH_TRANS[j - 20]] if j != 20 else []
            elif i == 11:
                # 10-16 have their own word; 17-19 are "dix" + unit.
                if j in range(20, 27):
                    out = [kFRENCH_TRANS[j - 10]]
                else:
                    out = [kFRENCH_TRANS[10], kFRENCH_TRANS[j - 20]]
            elif i in range(12, 17):
                tens_word = kFRENCH_TRANS[int(i % 10) * 10]
                if j == 20:
                    out = [tens_word]
                elif j == 21:
                    out = [tens_word, kFRENCH_AND, kFRENCH_TRANS[1]]
                else:
                    out = [tens_word, kFRENCH_TRANS[j - 20]]
            elif i == 17:
                # 70-79 are built on 60 plus 10-19.
                if j == 20:
                    out = [kFRENCH_TRANS[60], kFRENCH_TRANS[10]]
                elif j == 21:
                    out = [kFRENCH_TRANS[60], kFRENCH_AND, kFRENCH_TRANS[11]]
                elif j in range(22, 27):
                    out = [kFRENCH_TRANS[60], kFRENCH_TRANS[j - 10]]
                else:
                    out = [kFRENCH_TRANS[60], kFRENCH_TRANS[10],
                           kFRENCH_TRANS[j - 20]]
            elif i == 18:
                # 80-89 are "quatre vingt" plus the unit.
                if j == 20:
                    out = [kFRENCH_TRANS[4], kFRENCH_TRANS[20]]
                else:
                    out = [kFRENCH_TRANS[4], kFRENCH_TRANS[20],
                           kFRENCH_TRANS[j - 20]]
            else:
                # i == 19: 90-99 are "quatre vingt" plus 10-19.
                if j in range(20, 27):
                    out = [kFRENCH_TRANS[4], kFRENCH_TRANS[20],
                           kFRENCH_TRANS[j - 10]]
                else:
                    out = [kFRENCH_TRANS[4], kFRENCH_TRANS[20],
                           kFRENCH_TRANS[10], kFRENCH_TRANS[j - 20]]
            f.add_arc(str(i), str(j), [str(j - 20)], out)
    return f
def union_fst(self, list_fst):
    """Return a new FST that is the union of the machines in list_fst.

    The five components (alphabet, states, start state, accept states,
    transitions) come from ``self._merge_fst`` called with ``op=UNION``.
    """
    merged = self._merge_fst(list_fst, op=UNION)
    alphabet, states, start_state, accept_states, transitions = merged
    return FST(alphabet, states, start_state, accept_states, transitions)
_states_L1 = {"s0", "s1", "s2", "s3", "s4"} _init_state_L1 = "s0" _accept_states_L1 = [("s2", 1), ("s4", 1)] _transitions_L1 = [ ("s0", "a", "s1"), ("s0", "b", "s3"), ("s1", "a", "s1"), ("s1", "b", "s2"), ("s2", "a", "s1"), ("s2", "b", "s2"), ("s3", "a", "s4"), ("s3", "b", "s3"), ("s4", "a", "s4"), ("s4", "b", "s3") ] _fst_L1 = FST(_alphabet_L1, _states_L1, _init_state_L1, _accept_states_L1, _transitions_L1) print("check FST - L1") print(_fst_L1) assert _fst_L1.go("aaabbababaabbab")[1] assert not _fst_L1.go("aaabbbbba")[1] rand = "".join(_fst_L1.go()) print("sample:" + rand) assert _fst_L1.go(rand) # L2 _alphabet_L2 = ["a", "b"] _states_L2 = {"q0", "q1", "q2", "q3"} _init_state_L2 = "q0" _accept_states_L2 = [("q2", 3)] _transitions_L2 = [ ("q0", "a", "q1"),
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    'ste' is the initial state.  An optional leading letter moves to 'L1';
    the first three digits are copied while advancing 'N1' -> 'N2' -> 'N3',
    and any further digit loops on 'N3' with empty output.  Epsilon arcs from
    'L1', 'N1', 'N2' and 'N3' lead to the only final state 'next1'.
    """
    f2 = FST('soundex-truncate')

    # Indicate initial and final states
    f2.add_state('ste')
    f2.add_state('L1')
    f2.add_state('N1')
    f2.add_state('N2')
    f2.add_state('N3')
    f2.add_state('next1')
    f2.initial_state = 'ste'
    f2.set_final('next1')

    # string.ascii_letters rather than string.letters: the latter is
    # locale-dependent on Python 2 and does not exist on Python 3.
    for letter in string.ascii_letters:
        f2.add_arc('ste', 'L1', (letter), (letter))

    for n in range(10):
        f2.add_arc('ste', 'N1', (str(n)), (str(n)))
        f2.add_arc('L1', 'N1', (str(n)), (str(n)))
        f2.add_arc('N1', 'N2', (str(n)), (str(n)))
        f2.add_arc('N2', 'N3', (str(n)), (str(n)))
        # Fourth and later digits are consumed without output.
        f2.add_arc('N3', 'N3', (str(n)), ())

    # The input may stop after the letter or after any number of digits.
    f2.add_arc('L1', 'next1', (), ())
    f2.add_arc('N1', 'next1', (), ())
    f2.add_arc('N2', 'next1', (), ())
    f2.add_arc('N3', 'next1', (), ())
    return f2
fst_trans[char_list[i][0]] = {} fst_trans["start"][char_list[i][0]] = [ char_list[i][:1], arp_list[i][0] + " " ] for i in range(0, len(char_list)): for j in range(1, len(char_list[i])): # adding new states to fst_states fst_states.append(char_list[i][:j + 1]) # adding state transitions, where state names are string of chars in the color # so far so the 3rd state on the way to blue is named 'blu' # example in dictionary: 'ru': {'b': ['rub', 'B ']} fst_trans[char_list[i][:j + 1]] = {} fst_trans[char_list[i][:j]][char_list[i][j]] = [char_list[i][:j + 1]] # if/else accounting for silent letters at end of word if j < len(arp_list[i]): fst_trans[char_list[i][:j]][char_list[i][j]].append( arp_list[i][j] + " ") else: fst_trans[char_list[i][:j]][char_list[i][j]].append("") # Our final states are all of our input words fst_final = char_list # Declaring our fst test_fst = FST(fst_states, fst_in_alph, fst_out_alph, fst_start, fst_final, fst_trans) # Calling transduce on each color in our input for word in char_list: print test_fst.transduce_string(word)
def french_count():
    """Return an FST that spells a three-digit number in French.

    State names encode what has been read so far: 'n' is a non-zero digit,
    '0' a zero and '*' a digit still to come.  The 'Vega7/8/9' states handle
    the vigesimal tens (70s, 80s, 90s).

    NOTE(review): the nesting of the statements inside the main digit loop
    was reconstructed from a flattened source - confirm against the original
    file before relying on it.
    """
    f = FST('french')
    f.add_state('start')
    # one number and two trailing unknowns
    f.add_state('n**')
    # exception from state n**
    f.add_state('n**+')
    # two numbers and one trailing unknown
    f.add_state('nn*')
    # zero and two unknown digits trailing and so on
    f.add_state('0**')
    f.add_state('00*')
    f.add_state('00n')
    f.add_state('0n*')
    f.add_state('0n*+')
    f.add_state('0nn')
    f.add_state('n00')
    f.add_state('nnn')
    f.add_state('nnn*')
    f.add_state('*et*')
    # vigesimal counting for 7 in ((0/n)n*)
    f.add_state('0n*Vega7+')
    f.add_state('0n*Vega7')
    f.add_state('0nnVega7')
    # vigesimal counting for 8 in ((0/n)n*)
    f.add_state('0n*Vega8')
    f.add_state('0n*Vega8+')
    f.add_state('0nnVega8')
    # vigesimal counting for 9 in ((0/n)n*)
    f.add_state('0n*Vega9')
    f.add_state('0n*Vega9+')
    f.add_state('0n*Vega9++')
    f.add_state('0nnVega9')

    # set final states
    f.set_final('00n')
    f.set_final('0nn')
    f.set_final('nnn')
    f.set_final('n00')
    f.set_final('0nnVega7')
    f.set_final('0nnVega8')
    f.set_final('0nnVega9')

    # initial state
    f.initial_state = 'start'

    # remove initial zeroes
    f.add_arc('start', '0**', '0', ())
    f.add_arc('0**', '00*', '0', ())

    # range() instead of xrange(): identical iteration, and it also exists
    # on Python 3.
    for ii in range(10):
        # from '0n*Vega8+' to '0nnVega8'
        if ii != 0:
            f.add_arc('0n*Vega8+', '0nnVega8', str(ii), [kFRENCH_TRANS[ii]])
        elif ii == 0:
            f.add_arc('0n*Vega8+', '0nnVega8', str(ii), ())

        # from '0n*Vega7' to '0nnVega7' 7-9
        if ii == 0 or ii == 7 or ii == 8 or ii == 9:
            f.add_arc('0n*Vega7', '0n*Vega7+', (), [kFRENCH_TRANS[10]])
            f.add_arc('0n*Vega7+', '0n*Vega7+', str(ii), [kFRENCH_TRANS[ii]])
            #
            f.add_arc('0n*Vega9+', '0n*Vega9++', (), [kFRENCH_TRANS[10]])
            f.add_arc('0n*Vega9++', '0nnVega9', str(ii), [kFRENCH_TRANS[ii]])
            if ii == 0:
                f.add_arc('0n*Vega7+', '0nnVega7', '0', ())
                f.add_arc('0n*Vega9++', '0nnVega9', '0', ())
            elif ii == 7 or ii == 8 or ii == 9:
                f.add_arc('0n*Vega7+', '0nnVega7', str(ii),
                          [kFRENCH_TRANS[ii]])

        # from '0n*Vega' to '0nnVega' 2-6
        if ii == 2 or ii == 3 or ii == 4 or ii == 5 or ii == 6:
            f.add_arc('0n*Vega7', '0nnVega7', str(ii),
                      [kFRENCH_TRANS[ii+10]])
            f.add_arc('0n*Vega9+', '0nnVega9', str(ii),
                      [kFRENCH_TRANS[ii+10]])

        if ii == 1:
            f.add_arc('0**', '0n*', str(ii), [kFRENCH_TRANS[10]])
            f.add_arc('n**', '0n*', str(ii), [kFRENCH_TRANS[10]])
            f.add_arc('0n*Vega7', '0n*Vega7+', str(ii), [kFRENCH_AND])
            f.add_arc('0n*Vega7+', '0nnVega7', str(ii),
                      [kFRENCH_TRANS[ii+10]])
            f.add_arc('0n*Vega9+', '0nnVega9', str(ii),
                      [kFRENCH_TRANS[ii+10]])

        # from '00*' to '00n'
        f.add_arc('00*', '00n', str(ii), [kFRENCH_TRANS[ii]])

        # from '*n*' to '*nn' 2-9
        if ii != 0 and ii != 9:
            f.add_arc('0n*', '0nn', str(ii+1), [kFRENCH_TRANS[ii+1]])
            f.add_arc('0n*+', '0nn', str(ii), [kFRENCH_TRANS[ii]])

        # from 'start' to 'nnn' 200,300,...,900
        if ii != 0 and ii != 1:
            f.add_arc('start', 'n**+', str(ii), [kFRENCH_TRANS[ii]])
            f.add_arc('n**+', 'n**', (), [kFRENCH_TRANS[100]])

        # from 'n**' to 'n0*' 0
        if ii == 0:
            f.add_arc('n**', 'n00', '00', ())
        if ii == 1:
            f.add_arc('start', 'n**', '1', [kFRENCH_TRANS[100]])

    # from '*n*' to '*et*' 1
    f.add_arc('0n*', '*et*', '1', [kFRENCH_AND])
    # from '*et*' to '*nn' 1
    f.add_arc('*et*', '0nn', (), [kFRENCH_TRANS[1]])

    # from '0**' to '*nn' 10-16
    for ii in range(10, 17):
        f.add_arc('0**', '0nn', str(ii), [kFRENCH_TRANS[ii]])
        f.add_arc('n**', '0nn', str(ii), [kFRENCH_TRANS[ii]])

    # from '0**' to '*nn' 20-60
    for ii in range(2, 7):
        f.add_arc('0**', '0nn', str(ii*10), [kFRENCH_TRANS[ii*10]])
        f.add_arc('n**', '0nn', str(ii*10), [kFRENCH_TRANS[ii*10]])
        # from '0**' to '0n*'
        f.add_arc('0**', '0n*', str(ii), [kFRENCH_TRANS[ii*10]])
        # from 'n**' to '0n*+'
        f.add_arc('n**', '0n*+', str(ii), [kFRENCH_TRANS[ii*10]])

    # Tens digits 7-9 enter the dedicated vigesimal states.
    for ii in range(7, 10):
        if ii == 7:
            f.add_arc('0**', '0n*Vega7', str(ii), [kFRENCH_TRANS[60]])
            f.add_arc('n**', '0n*Vega7', str(ii), [kFRENCH_TRANS[60]])
        elif ii == 8:
            f.add_arc('0**', '0n*Vega8', str(ii), [kFRENCH_TRANS[4]])
            f.add_arc('n**', '0n*Vega8', str(ii), [kFRENCH_TRANS[4]])
            f.add_arc('0n*Vega8', '0n*Vega8+', (), [kFRENCH_TRANS[20]])
        elif ii == 9:
            f.add_arc('0**', '0n*Vega9', str(ii), [kFRENCH_TRANS[4]])
            f.add_arc('n**', '0n*Vega9', str(ii), [kFRENCH_TRANS[4]])
            f.add_arc('0n*Vega9', '0n*Vega9+', (), [kFRENCH_TRANS[20]])

    f.add_arc('n**', '0n*+', '0', ())
    return f
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    State '1' copies letters.  The first three digits are copied while
    advancing '1' -> '2' -> '3' -> '4'; any further digit loops on '4' with
    empty output.  All four states are final.
    """
    f2 = FST('soundex-truncate')

    # Indicate initial and final states
    f2.add_state('1')
    f2.add_state('2')
    f2.add_state('3')
    f2.add_state('4')
    f2.initial_state = '1'
    f2.set_final('1')
    f2.set_final('2')
    f2.set_final('3')
    f2.set_final('4')

    # Adds letters from input string of 'A###0000'.  string.ascii_letters is
    # used rather than string.letters: the latter is locale-dependent on
    # Python 2 and does not exist on Python 3.
    for letter in string.ascii_letters:
        f2.add_arc('1', '1', (letter), (letter))

    # Adds numbers from first FST of range 0-9
    for n in range(10):
        f2.add_arc('1', '2', str(n), (str(n)))
        f2.add_arc('2', '3', str(n), (str(n)))
        f2.add_arc('3', '4', str(n), (str(n)))
        # Fourth and later digits are consumed without output.
        f2.add_arc('4', '4', str(n), ())
    return f2
def generate_control(self):
    """Build the control FST that lets a verb's arguments be filled in any order.

    With n = len(self.matchers), the state labelled "0" is the initial one,
    states "1" .. str(2**n - 1) are intermediate and str(2**n) is the only
    final state.  A "VERB" transition leads from "0" to "1"; after that,
    filling the argument at position ``p`` of the argument list adds
    ``2**p`` to the current state number, so every permutation of the
    arguments ends at ``2**n``.

    The argument names are materialised as a list because ``.index()`` is
    not available on Python 3 key views, and ``range`` replaces ``xrange``.
    """
    arguments = list(self.matchers.keys())
    state_count = 2 ** len(arguments)

    # this will be a hypercube
    control = FST()

    # zero state is for verb
    control.add_state("0", is_init=True, is_final=False)

    # inside states for the cube, except the last, accepting state
    for i in range(1, state_count):
        control.add_state(str(i), is_init=False, is_final=False)

    # last node of the hypercube
    control.add_state(str(state_count), is_init=False, is_final=True)

    # first transition
    control.add_transition(
        KRPosMatcher("VERB"),
        [ExpandOperator(self.lexicon, self.working_area)],
        "0", "1")

    # count every transition as an increase in number of state
    for path in permutations(arguments):
        actual_state = 1
        for arg in path:
            increase = 2 ** arguments.index(arg)
            new_state = actual_state + increase
            control.add_transition(
                self.matchers[arg],
                [FillArgumentOperator(arg, self.working_area)],
                str(actual_state), str(new_state))
            actual_state = new_state
    return control
from fst import FST import argparse parser = argparse.ArgumentParser() parser.add_argument('--fst', action='store_true') parser.add_argument('--isyms', action='store_true') parser.add_argument('--osyms', action='store_true') parser.add_argument('--name', type=str, required=True) parser.add_argument('--file', type=str, required=True) if __name__ == '__main__': args = parser.parse_args() fst = FST(args.name) fst.initial_state = fst.new_state() fst.final_states.append(fst.new_state()) with open(args.file, 'r') as f: for line in f: if fst.name == 'L': word, phones = line.strip().split('\t') tokens = phones.split() elif fst.name == 'S': word = line.strip().split('\t')[0] tokens = list(word) if len(tokens) == 1: fst.add_arc(fst.initial_state, fst.final_states[0], word, tokens[0]) else: state = fst.new_state() fst.add_arc(fst.initial_state, state, word, tokens[0]) for phone in tokens[1:-1]:
from fst import FST


# This function returns fn o ... o f3 o f2 o f1 (input)
# where ALL transducers use characters as input symbols
def compose(input, *fsts):
    """Feed input through each transducer in turn and return all results.

    Every output of one stage is joined into a single string and transduced
    by the next stage; the outputs of the last stage are returned as a list.
    With no transducers the result is ``[input]``.
    """
    current = [input]
    for transducer in fsts:
        # Fan out: each candidate may yield several outputs at this stage.
        current = [result
                   for candidate in current
                   for result in transducer.transduce(''.join(candidate))]
    return current


if __name__ == '__main__':
    f1 = FST('test-generate')

    # Indicate that 'start' is the initial state
    f1.add_state('start')
    f1.add_state('next')
    f1.initial_state = 'start'

    # Set all the final states
    f1.set_final('next')

    # Add the rest of the arcs
    for letter in ['A', 'B', 'C', 'D']:
        f1.add_arc('start', 'next', letter, '1')
        f1.add_arc('next', 'next', letter, '0')

    f2 = FST('test-generate')
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    The first letter is copied unchanged and moves to a "first letter" state
    ('s11'..'s66' for the six consonant classes, 'sv' for vowel-like
    letters).  Afterwards a consonant emits its class digit unless the
    machine is already in a state of that same class, and vowel-like letters
    emit nothing.  Every non-initial state has an epsilon arc to the single
    final state 'next'.
    """
    vowel_like = [
        'a', 'A', 'e', 'E', 'h', 'H', 'i', 'I', 'o', 'O', 'u', 'U', 'w', 'W',
        'y', 'Y'
    ]

    # Source states for the silent arcs into 'sv', in insertion order.
    vowel_sources = ('s11', 's33', 's22', 's44', 's55', 's66', 'sv', 's1',
                     's2', 's3', 's4', 's6', 's5')

    # (letters, first-letter state, class state, digit, source states in
    # insertion order).  Arcs leaving the class's own two states are silent.
    consonant_classes = (
        ("Ll", 's44', 's4', '4',
         ('s44', 's11', 's22', 's33', 's55', 's66', 's4', 's1', 's2', 's3',
          's5', 's6', 'sv')),
        ('Rr', 's66', 's6', '6',
         ('s66', 's22', 's33', 's44', 's55', 's11', 's6', 's1', 's2', 's3',
          's5', 's4', 'sv')),
        ("bfpvBFPV", 's11', 's1', '1',
         ('s11', 's22', 's33', 's44', 's55', 's66', 's1', 's5', 's2', 's3',
          's4', 'sv', 's6')),
        ("cgjkqsxzCGJKQSXZ", 's22', 's2', '2',
         ('s22', 's11', 's33', 's44', 's55', 's66', 's2', 's5', 's1', 's3',
          's4', 'sv', 's6')),
        ("mnMN", 's55', 's5', '5',
         ('s55', 's11', 's44', 's33', 's22', 's66', 's5', 's2', 's1', 's3',
          's4', 'sv', 's6')),
        ("dtDT", 's33', 's3', '3',
         ('s33', 's11', 's44', 's55', 's22', 's66', 's3', 's2', 's1', 's5',
          's4', 'sv', 's6')),
    )

    soundex = FST('soundex-generate')
    for state_name in ('start', 's11', 's22', 's33', 's44', 's55', 's66',
                       's1', 's2', 's3', 'sv', 's4', 's5', 's6', 'next'):
        soundex.add_state(state_name)
    soundex.initial_state = 'start'

    # Set all the final states
    soundex.set_final('next')

    # Add the rest of the arcs
    for letter in string.ascii_letters:
        if letter in vowel_like:
            soundex.add_arc('start', 'sv', letter, letter)
            for source in vowel_sources:
                soundex.add_arc(source, 'sv', letter, ())
            continue

        for members, first_state, class_state, digit, sources in consonant_classes:
            if letter not in members:
                continue
            soundex.add_arc('start', first_state, letter, letter)
            for source in sources:
                same_class = source == first_state or source == class_state
                soundex.add_arc(source, class_state, letter,
                                () if same_class else digit)
            break

    # Any non-initial state may end the word.
    for source in ('s11', 's22', 's33', 's44', 's55', 's66', 's1', 's2',
                   's3', 'sv', 's4', 's5', 's6'):
        soundex.add_arc(source, 'next', (), ())

    return soundex
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    The first letter is copied to the output unchanged.  Each later letter
    either moves to the state of its consonant group and emits that group's
    digit, or is read with empty output (letters outside every group, and
    letters whose group equals the current state, so a run of same-group
    letters yields a single digit).
    """
    # (state name, emitted digit, member letters) for each consonant group.
    groups = (
        ('one', '1', 'bfpv'),
        ('two', '2', 'cgjkqsxz'),
        ('three', '3', 'dt'),
        ('four', '4', 'l'),
        ('five', '5', 'mn'),
        ('six', '6', 'r'),
    )
    group_states = [state for state, _digit, _members in groups]

    # Lower-case letter -> (state, digit).  Letters missing from this map
    # (a, e, h, i, o, u, w, y) never emit a digit.
    coding = {}
    for state, digit, members in groups:
        for member in members:
            coding[member] = (state, digit)

    f1 = FST('soundex-generate')

    f1.add_state('start')
    f1.add_state('next')
    for state in group_states:
        f1.add_state(state)
    f1.initial_state = 'start'

    # Every state except 'start' is accepting.
    f1.set_final('next')
    for state in group_states:
        f1.set_final(state)

    # string.ascii_letters instead of string.letters: the latter is
    # locale-dependent in Python 2 and does not exist in Python 3.
    for letter in string.ascii_letters:
        f1.add_arc('start', 'next', letter, letter)

    for source in ['next'] + group_states:
        for letter in string.ascii_letters:
            # Classify case-insensitively; the group tables are lower-case
            # only, so upper-case consonants used to be dropped here.
            target, digit = coding.get(letter.lower(), (source, None))
            if target == source:
                # Uncoded letter, or same group as the previous digit.
                f1.add_arc(source, source, letter, ())
            else:
                f1.add_arc(source, target, letter, digit)

    return f1
def add_zero_padding():
    """
    Returns the 'soundex-padzero' FST, which pads a code with '0' symbols.

    Letters and digits are copied through.  When fewer than three digits
    have been read, arcs with an empty input tuple (presumably epsilon
    arcs -- confirm against FST.add_arc) emit one '0' per missing digit.
    Accepting states are 'N3' (three real digits) and 'P3' (padded).
    """
    # Now, the third fst - the zero-padding fst
    f3 = FST('soundex-padzero')

    for state in ('st', 'L1', 'N1', 'N2', 'N3', 'P1', 'P2', 'P3'):
        f3.add_state(state)
    f3.initial_state = 'st'

    f3.set_final('N3')
    f3.set_final('P3')

    # string.ascii_letters / range instead of string.letters / xrange:
    # the originals are Python-2-only (and string.letters follows the locale).
    for letter in string.ascii_letters:
        f3.add_arc('st', 'L1', letter, letter)

    for number in range(10):
        digit = str(number)
        f3.add_arc('st', 'N1', digit, digit)
        f3.add_arc('L1', 'N1', digit, digit)
        f3.add_arc('N1', 'N2', digit, digit)
        f3.add_arc('N2', 'N3', digit, digit)

    # Padding chain: Pk means "k digit slots filled, at least one by padding".
    f3.add_arc('L1', 'P1', (), ('0'))
    f3.add_arc('N1', 'P2', (), ('0'))
    f3.add_arc('N2', 'P3', (), ('0'))
    f3.add_arc('P1', 'P2', (), ('0'))
    f3.add_arc('P2', 'P3', (), ('0'))
    return f3
def french_count():
    """
    Build and return the 'french' FST, which reads digit characters and
    emits entries of kFRENCH_TRANS (plus kFRENCH_AND).

    presumably kFRENCH_TRANS maps integers to French number words and
    kFRENCH_AND is the connecting word -- both are defined outside this
    function; verify there.

    Path overview: the first digit leads from 'start' to '1stzero' (a
    non-zero first digit emits the hundreds output on the way), the second
    digit selects a tens-range state from '1stzero', and the remaining
    digit is handled by that state's outgoing or self-loop arcs.

    NOTE(review): 'zero' is declared final but no arc leads into it.
    """
    f = FST('french')

    f.add_state('start')
    f.initial_state = 'start'
    f.add_state('1stzero')
    f.add_state('tens')
    f.add_state('seventeen')
    f.add_state('final_seventeen')
    f.add_state('eighteen')
    f.add_state('final_eighteen')
    f.add_state('nineteen')
    f.add_state('final_nineteen')
    f.add_state('zero')
    f.add_state('ones')
    f.add_state('20-69')
    f.add_state('70-ten')
    f.add_state('80s')
    f.add_state('90s')
    f.add_state('100s')
    f.add_state('et')
    f.add_state('10-et')
    f.add_state('et-un')
    f.add_state('et-onze')

    f.set_final('zero')
    f.set_final('ones')
    f.set_final('tens')
    f.set_final('final_seventeen')
    f.set_final('final_eighteen')
    f.set_final('final_nineteen')
    f.set_final('20-69')
    f.set_final('70-ten')
    f.set_final('80s')
    f.set_final('90s')
    f.set_final('et-un')
    f.set_final('et-onze')

    # 100 - 999
    # A leading '1' emits only the entry for 100; 2-9 emit the digit's entry
    # first and then the entry for 100 via the '100s' state.
    f.add_arc('start', '1stzero', '1', [kFRENCH_TRANS[100]])
    for i in range(2, 10):
        f.add_arc('start', '100s', str(i), [kFRENCH_TRANS[i]])
    # Empty input tuple: presumably an epsilon arc (no digit read) -- confirm
    # against FST.add_arc.
    f.add_arc('100s', '1stzero', (), [kFRENCH_TRANS[100]])

    # 0 - 9
    f.add_arc('start', '1stzero', '0', [])
    f.add_arc('1stzero', 'ones', '0', [])
    # 'ones' loops on itself, so it accepts any number of further digits.
    for ii in range(1, 10):
        f.add_arc('ones', 'ones', str(ii), [kFRENCH_TRANS[ii]])
    f.add_arc('ones', 'ones', '0', [])

    # for i in range(10):
    #     f.add_arc('ten-6', 'ten-6', str(i), kFRENCH_TRANS[(i+10])

    # 10 - 16
    f.add_arc('1stzero', 'tens', '1', [])
    f.add_arc('tens', 'tens', '0', [kFRENCH_TRANS[10]])
    f.add_arc('tens', 'tens', '1', [kFRENCH_TRANS[11]])
    f.add_arc('tens', 'tens', '2', [kFRENCH_TRANS[12]])
    f.add_arc('tens', 'tens', '3', [kFRENCH_TRANS[13]])
    f.add_arc('tens', 'tens', '4', [kFRENCH_TRANS[14]])
    f.add_arc('tens', 'tens', '5', [kFRENCH_TRANS[15]])
    f.add_arc('tens', 'tens', '6', [kFRENCH_TRANS[16]])
    # 17 - 19 are emitted in two steps: the entry for 10 on the digit arc,
    # then the unit's entry on a follow-up arc with empty input.
    f.add_arc('tens', 'seventeen', '7', [kFRENCH_TRANS[10]])
    f.add_arc('seventeen', 'final_seventeen', (), [kFRENCH_TRANS[7]])
    f.add_arc('tens', 'eighteen', '8', [kFRENCH_TRANS[10]])
    f.add_arc('eighteen', 'final_eighteen', (), [kFRENCH_TRANS[8]])
    f.add_arc('tens', 'nineteen', '9', [kFRENCH_TRANS[10]])
    f.add_arc('nineteen', 'final_nineteen', (), [kFRENCH_TRANS[9]])

    # 20 - 69
    f.add_arc('1stzero', '20-69', '2', [kFRENCH_TRANS[20]])
    f.add_arc('1stzero', '20-69', '3', [kFRENCH_TRANS[30]])
    f.add_arc('1stzero', '20-69', '4', [kFRENCH_TRANS[40]])
    f.add_arc('1stzero', '20-69', '5', [kFRENCH_TRANS[50]])
    f.add_arc('1stzero', '20-69', '6', [kFRENCH_TRANS[60]])
    # special cases:
    for i in range(2, 10):
        f.add_arc('20-69', '20-69', str(i), [kFRENCH_TRANS[i]])
    # handles 20, 30 ... 60
    # NOTE(review): the loop variable is unused, so the same arc is added
    # once per iteration (four times).
    for i in range(20, 60, 10):
        f.add_arc('20-69', '20-69', '0', [])
    # handles 21, 31, ... 61
    f.add_arc('20-69', 'et', '1', [kFRENCH_AND])
    f.add_arc('et', 'et-un', (),[kFRENCH_TRANS[1]])

    # 70 - 79
    # Emits the entry for 60 first, then a 10-19 style output for the unit.
    f.add_arc('1stzero', '70-ten', '7', [kFRENCH_TRANS[60]])
    f.add_arc('70-ten', '70-ten', '0', [kFRENCH_TRANS[10]])
    # handle 71 here
    f.add_arc('70-ten', '10-et', '1', [kFRENCH_AND])
    f.add_arc('10-et', 'et-onze', (),[kFRENCH_TRANS[11]])
    f.add_arc('70-ten', '70-ten', '2', [kFRENCH_TRANS[12]])
    f.add_arc('70-ten', '70-ten', '3', [kFRENCH_TRANS[13]])
    f.add_arc('70-ten', '70-ten', '4', [kFRENCH_TRANS[14]])
    f.add_arc('70-ten', '70-ten', '5', [kFRENCH_TRANS[15]])
    f.add_arc('70-ten', '70-ten', '6', [kFRENCH_TRANS[16]])
    # NOTE(review): the three empty-input arcs below repeat arcs already
    # added in the 10 - 16 section, so each exists twice.
    f.add_arc('70-ten', 'seventeen', '7', [kFRENCH_TRANS[10]])
    f.add_arc('seventeen', 'final_seventeen', (), [kFRENCH_TRANS[7]])
    f.add_arc('70-ten', 'eighteen', '8', [kFRENCH_TRANS[10]])
    f.add_arc('eighteen', 'final_eighteen', (), [kFRENCH_TRANS[8]])
    f.add_arc('70-ten', 'nineteen', '9', [kFRENCH_TRANS[10]])
    f.add_arc('nineteen', 'final_nineteen', (), [kFRENCH_TRANS[9]])

    # 80 - 89
    # Emits the entries for 4 and 20, then reuses the 'ones' arcs.
    f.add_arc('1stzero', '80s', '8', [kFRENCH_TRANS[4]])
    f.add_arc('80s', 'ones', (), [kFRENCH_TRANS[20]])
    f.add_arc('80s', '80s', '0', [kFRENCH_TRANS[20]])

    # 90 - 99
    # Emits the entries for 4 and 20, then reuses the 'tens' (10-19) arcs.
    f.add_arc('1stzero', '90s', '9', [kFRENCH_TRANS[4]])
    f.add_arc('90s', 'tens', (), [kFRENCH_TRANS[20]])

    return f
def intersection_fst(self, list_fst):
    """
    Merge the FSTs in list_fst with the INTERSECT operation and wrap the
    five merged components in a new FST.
    """
    merged = self._merge_fst(list_fst, op=INTERSECT)
    # Unpack explicitly so a result of the wrong length still fails here.
    alphabet, states, start_state, accept_states, transitions = merged
    return FST(alphabet, states, start_state, accept_states, transitions)
def letters_to_numbers():
    """
    Returns the 'soundex-generate' FST.

    As written this is a minimal two-state transducer: the first lower-case
    letter is copied to the output, and every following lower-case letter
    is rewritten to '0'.
    """
    fst = FST('soundex-generate')

    for state_name in ('start', 'next'):
        fst.add_state(state_name)
    fst.initial_state = 'start'

    # 'next' is the only accepting state.
    fst.set_final('next')

    for ch in string.ascii_lowercase:
        fst.add_arc('start', 'next', ch, ch)
        fst.add_arc('next', 'next', ch, '0')

    return fst
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    'initial' copies the first letter through and moves to '0'.  States
    '1'..'6' record the digit emitted most recently; '0' means no digit is
    pending (right after the first letter, or after one of a, e, h, i, o,
    u, w, y).  A consonant emits its digit unless the machine is already
    in that digit's state.
    """
    f1 = FST('soundex-generate')

    silent = 'aehiouwy'
    # (digit state, member letters) for each consonant group.
    groups = (
        ('1', 'bfpv'),
        ('2', 'cgjkqsxz'),
        ('3', 'dt'),
        ('4', 'l'),
        ('5', 'mn'),
        ('6', 'r'),
    )
    digit_states = [digit for digit, _members in groups]
    tail_states = ['0'] + digit_states

    f1.add_state('initial')
    for state in tail_states:
        f1.add_state(state)
    f1.initial_state = 'initial'

    # Every state except 'initial' is accepting.
    for state in tail_states:
        f1.set_final(state)

    for letter in string.ascii_letters:
        f1.add_arc('initial', '0', letter, letter)

        # Classify case-insensitively: the tables are lower-case only, so
        # upper-case letters after the first previously had no arcs at all.
        folded = letter.lower()

        if folded in silent:
            # Uncoded letter: no output, and forget the pending digit.
            for state in tail_states:
                f1.add_arc(state, '0', letter, ())
            continue

        for digit, members in groups:
            if folded not in members:
                continue
            for state in tail_states:
                if state != digit:
                    f1.add_arc(state, digit, letter, digit)
            # Same group as the last emitted digit: stay put with no output.
            # (This arc used to go back to '0', which let a third adjacent
            # same-group letter emit the digit a second time.)
            f1.add_arc(digit, digit, letter, ())

    return f1
def truncate_to_three_digits():
    """
    Create an FST that will truncate a soundex string to three digits

    Letters read in the initial state are copied through.  The first three
    digits are copied while the machine advances '1' -> 'd1' -> 'd2' ->
    'd3'; any further digit is read in 'd3' with empty output.
    """
    # Ok so now let's do the second FST, the one that will truncate
    # the number of digits to 3
    f2 = FST('soundex-truncate')

    # Indicate initial and final states
    states = ['1', 'd1', 'd2', 'd3']
    for state in states:
        f2.add_state(state)
    f2.initial_state = '1'

    # The initial state is accepting as well: a string that contains no
    # digits at all (e.g. a lone letter) must pass through unchanged
    # instead of being rejected.
    for state in states:
        f2.set_final(state)

    # Add the arcs
    # string.ascii_letters instead of string.letters, which is
    # locale-dependent in Python 2 and missing from Python 3.
    for letter in string.ascii_letters:
        f2.add_arc('1', '1', letter, letter)

    digits = [str(n) for n in range(10)]

    # Copy one digit per step along the chain of states.
    for source, target in zip(states, states[1:]):
        for digit in digits:
            f2.add_arc(source, target, digit, digit)

    # Fourth and later digits are read with empty output.
    for digit in digits:
        f2.add_arc('d3', 'd3', digit, ())

    return f2
def french_count():
    """
    Build and return the 'french' FST over the numbered states '0'..'25'.

    The machine reads digit characters and emits entries of kFRENCH_TRANS
    together with kFRENCH_AND (both defined outside this function).  State
    '0' is initial; the accepting states are 1, 3, 6, 7, 8, 9, 11, 13, 14,
    18 and 20.  Arcs are added in a fixed order, grouped by input digit.
    """
    f = FST('french')

    for number in range(26):
        f.add_state(str(number))
    f.initial_state = '0'

    for accepting in ('1', '3', '6', '7', '8', '9',
                      '11', '13', '14', '18', '20'):
        f.set_final(accepting)

    word = kFRENCH_TRANS

    def link(source, target, digit, *output):
        # An arc without output passes an empty tuple; otherwise a list.
        f.add_arc(source, target, str(digit), list(output) if output else ())

    # Input digit 0 (includes the self-loop on the initial state).
    link('0', '0', 0)
    link('4', '6', 0)
    link('5', '8', 0)
    link('0', '9', 0, word[0])
    link('10', '11', 0, word[10])
    link('12', '13', 0, word[20])
    link('16', '18', 0, word[20], word[10])
    link('17', '19', 0)
    link('19', '9', 0)

    # Input digit 1.
    link('0', '2', 1)
    link('17', '2', 1)
    link('0', '17', 1, word[100])
    link('0', '5', 1, word[10])
    link('17', '5', 1, word[10])
    link('4', '7', 1, kFRENCH_AND, word[1])
    link('10', '11', 1, kFRENCH_AND, word[11])
    link('12', '14', 1, word[20], kFRENCH_AND, word[1])
    link('16', '20', 1, word[20], kFRENCH_AND, word[11])

    # Digits 1-6 read in state '2'.
    for d in range(1, 7):
        link('2', '3', d, word[d + 10])

    # Digits 2-6.
    for d in range(2, 7):
        link('0', '4', d, word[d * 10])
        link('17', '4', d, word[d * 10])
        link('10', '11', d, word[d + 10])
        link('16', '20', d, word[20], word[d + 10])

    # Digits 2-9.
    for d in range(2, 10):
        link('4', '7', d, word[d])
        link('0', '17', d, word[d], word[100])
        link('12', '14', d, word[20], word[d])

    # Digits 1-9.
    for d in range(1, 10):
        link('0', '1', d, word[d])
        link('19', '1', d, word[d])

    # Digits 7-9.
    for d in range(7, 10):
        link('5', '8', d, word[d])
        link('10', '11', d, word[10], word[d])
        link('16', '20', d, word[20], word[10], word[d])

    # Digits 7, 8 and 9 read in states '0' and '17'.
    for source in ('0', '17'):
        link(source, '10', 7, word[60])
    for source in ('0', '17'):
        link(source, '12', 8, word[4])
    for source in ('0', '17'):
        link(source, '16', 9, word[4])

    return f
def letters_to_numbers():
    """
    Returns an FST that converts letters to numbers as specified by
    the soundex algorithm

    Only lower-case ASCII letters have arcs.  'q1' copies the first letter
    and moves to 'q2'; 'n1'..'n6' record the digit emitted last.  Letters
    in 'aehiouwy' loop on the current state with empty output, and a
    consonant emits its digit unless the machine is already in that
    digit's state.
    """
    f1 = FST('soundex-generate')

    digit_states = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6']
    tail_states = ['q2'] + digit_states

    f1.add_state('q1')
    for name in tail_states:
        f1.add_state(name)
    f1.initial_state = 'q1'

    # Everything but the initial state is accepting.
    for name in tail_states:
        f1.set_final(name)

    # letter -> (state for its group, digit emitted on entering it)
    coding = {}
    for members, target, digit in (
            ('bfpv', 'n1', '1'),
            ('cgjkqsxz', 'n2', '2'),
            ('dt', 'n3', '3'),
            ('l', 'n4', '4'),
            ('mn', 'n5', '5'),
            ('r', 'n6', '6')):
        for member in members:
            coding[member] = (target, digit)

    for letter in string.ascii_lowercase:
        f1.add_arc('q1', 'q2', letter, letter)

        if letter in 'aehiouwy':
            for name in tail_states:
                f1.add_arc(name, name, letter, ())
            continue

        entry = coding.get(letter)
        if entry is None:
            continue
        target, digit = entry

        # Coming from any other state: emit the digit.
        for name in tail_states:
            if name != target:
                f1.add_arc(name, target, letter, digit)
        # Already in this group's state: read the letter with empty output.
        f1.add_arc(target, target, letter, ())

    return f1
def french_count():
    """
    Build and return the 'french' FST for three-digit inputs.

    Every path runs 'start' -> ('0xx' | 'nxx') -> one of the '..x' states
    -> 'last', and 'last' is the only accepting state, so exactly three
    digit characters are read.  Outputs are entries of kFRENCH_TRANS plus
    kFRENCH_AND, both defined outside this function.
    """
    f = FST('french')

    for name in ('start', '0xx', 'nxx', '00x', 'n0x', 'n1x',
                 'nnx', 'n7x', 'n8x', 'n9x', 'last'):
        f.add_state(name)
    f.initial_state = 'start'
    f.set_final('last')

    def hundreds_words(d):
        # Output for the first digit.
        if d == 0:
            return ()
        if d == 1:
            return [kFRENCH_TRANS[100]]
        return [kFRENCH_TRANS[d], kFRENCH_TRANS[100]]

    def tens_words(d):
        # Output for the second digit.
        if d < 2:
            return ()
        if d < 7:
            return [kFRENCH_TRANS[d * 10]]
        if d == 7:
            return [kFRENCH_TRANS[60]]
        return [kFRENCH_TRANS[4], kFRENCH_TRANS[20]]

    def unit_words(d):
        # Last digit after a plain ten ('n0x', 'n8x', and 'nnx' unless d is 1).
        if d == 0:
            return ()
        return [kFRENCH_TRANS[d]]

    def teen_words(d):
        # Last digit read as 10 + d ('n1x', 'n9x', and 'n7x' unless d is 1).
        if d <= 6:
            return [kFRENCH_TRANS[d + 10]]
        return [kFRENCH_TRANS[10], kFRENCH_TRANS[d]]

    for ii in range(10):
        key = str(ii)

        f.add_arc('00x', 'last', [key], [kFRENCH_TRANS[ii]])

        # First digit.
        first_target = '0xx' if ii == 0 else 'nxx'
        f.add_arc('start', first_target, [key], hundreds_words(ii))

        # Second digit: choose the tens-range state.
        if ii == 0:
            after_zero, after_hundreds = '00x', 'n0x'
        elif ii == 1:
            after_zero = after_hundreds = 'n1x'
        elif ii <= 6:
            after_zero = after_hundreds = 'nnx'
        else:
            after_zero = after_hundreds = 'n%dx' % ii
        f.add_arc('0xx', after_zero, [key], tens_words(ii))
        f.add_arc('nxx', after_hundreds, [key], tens_words(ii))

        # Third digit.
        f.add_arc('n0x', 'last', [key], unit_words(ii))
        f.add_arc('n1x', 'last', [key], teen_words(ii))
        if ii == 1:
            f.add_arc('nnx', 'last', [key],
                      [kFRENCH_AND, kFRENCH_TRANS[1]])
            f.add_arc('n7x', 'last', [key],
                      [kFRENCH_AND, kFRENCH_TRANS[11]])
        else:
            f.add_arc('nnx', 'last', [key], unit_words(ii))
            f.add_arc('n7x', 'last', [key], teen_words(ii))
        f.add_arc('n8x', 'last', [key], unit_words(ii))
        f.add_arc('n9x', 'last', [key], teen_words(ii))

    return f
from fst import FST import string, sys from fsmutils import composechars, trace f1 = FST('soundex-generate') f1.add_state('start') f1.add_state('next') f1.initial_state = 'start' f1.set_final('next') list_one = ['b', 'f', 'p', 'v'] list_two = ['c', 'g', 'j', 'k', 'q', 's', 'x', 'z'] list_three = ['d', 't'] list_four = ['l'] list_five = ['m', 'n'] list_six = ['r'] vowels = ['a', 'e', 'h', 'i', 'o', 'u', 'w', 'y']