def __init__(self):
    """Construct a transliterator object.

    Loads the packaged HFST rules and the word lexicon, then collects
    character/context statistics over the lexicon words.

    Sets the following attributes:

    * ``transducer`` -- result of ``hfst.AttReader(...).read()`` on the
      ``data/hfst.att`` resource.
    * ``logger`` -- the ``franco_arabic_transliterator`` logger.
    * ``wordlist`` -- dict mapping the first tab-separated field of each
      lexicon line to the integer value of its second field.
    * ``counts`` -- ``Counter`` of ``(char, context)`` pairs produced by
      ``find_pairs`` for every key of ``wordlist``.
    * ``sigma_counts`` -- sum of all values in ``counts``.

    Raises:
        OSError: if a data file cannot be opened.
        IndexError, ValueError: if a non-blank lexicon line has no second
            tab-separated field or that field is not an integer.
    """
    rules_file_location = pkg_resources.resource_filename(
        "data", "hfst.att")
    # Packaged data files are read as UTF-8 explicitly so that loading does
    # not depend on the host's locale-default encoding.
    # NOTE(review): assumes both data files are UTF-8 encoded -- confirm.
    with open(rules_file_location, "r", encoding="utf-8") as f:
        self.transducer = hfst.AttReader(f).read()

    self.logger = logging.getLogger("franco_arabic_transliterator")
    # Kept for backward compatibility: this configures the root logger as a
    # side effect of constructing the object.
    logging.basicConfig(level=logging.DEBUG)

    lexicon_location = pkg_resources.resource_filename("data", "lexicon")
    wordlist = {}
    with open(lexicon_location, "r", encoding="utf-8") as f:
        # Stream the file and split each line only once.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            fields = line.split("\t")
            # int() ignores the trailing newline left on the last field.
            wordlist[fields[0]] = int(fields[1])
    self.wordlist = wordlist

    def find_pairs(word, grams=10, max_len=20):
        """Return ``(char, context)`` pairs for *word*.

        *word* is right-padded with ``$`` up to *max_len* characters
        (longer words are left as they are). For each character, the
        context is the string of the *grams* preceding characters, with
        ``_`` standing in for positions before the start of the word.
        """
        pairs = []
        chars = ["_" for _ in range(grams)]
        word = "{}{}".format(word, "$" * (max_len - len(word)))
        for c in word:
            pairs.append((c, "".join(chars)))
            # Slide the context window one character forward.
            chars = chars[1:] + [c]
        return pairs

    # Every distinct lexicon word contributes once; the stored word counts
    # are not used as weights here.
    self.counts = Counter(
        p for w in self.wordlist for p in find_pairs(w))
    self.sigma_counts = sum(self.counts.values())
def __init__(self):
    """Construct a transliterator object.

    Loads the packaged HFST rules and the word lexicon, then collects
    character/context statistics over the lexicon words.

    Sets the following attributes:

    * ``transducer`` -- result of ``hfst.AttReader(...).read()`` on the
      ``data/hfst.att`` resource.
    * ``logger`` -- the ``franco_arabic_transliterator`` logger.
    * ``wordlist`` -- dict mapping the first tab-separated field of each
      lexicon line to the integer value of its second field.
    * ``counts`` -- ``Counter`` of ``(char, context)`` pairs produced by
      ``find_pairs`` for every key of ``wordlist``.
    * ``sigma_counts`` -- sum of all values in ``counts``.

    Raises:
        OSError: if a data file cannot be opened.
        IndexError, ValueError: if a non-blank lexicon line has no second
            tab-separated field or that field is not an integer.
    """
    rules_file_location = pkg_resources.resource_filename(
        'data', 'hfst.att')
    # Packaged data files are read as UTF-8 explicitly so that loading does
    # not depend on the host's locale-default encoding.
    # NOTE(review): assumes both data files are UTF-8 encoded -- confirm.
    with open(rules_file_location, 'r', encoding='utf-8') as f:
        self.transducer = hfst.AttReader(f).read()

    self.logger = logging.getLogger('franco_arabic_transliterator')
    # Kept for backward compatibility: this configures the root logger as a
    # side effect of constructing the object.
    logging.basicConfig(level=logging.DEBUG)

    lexicon_location = pkg_resources.resource_filename('data', 'lexicon')
    wordlist = {}
    with open(lexicon_location, 'r', encoding='utf-8') as f:
        # Stream the file and split each line only once.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            fields = line.split('\t')
            # int() ignores the trailing newline left on the last field.
            wordlist[fields[0]] = int(fields[1])
    self.wordlist = wordlist

    def find_pairs(word, grams=3):
        """Return ``(char, context)`` pairs for *word*.

        For each character, the context is the string of the *grams*
        preceding characters, with ``_`` standing in for positions before
        the start of the word.
        """
        pairs = []
        chars = ['_' for _ in range(grams)]
        for c in word:
            pairs.append((c, ''.join(chars)))
            # Slide the context window one character forward.
            chars = chars[1:] + [c]
        return pairs

    # Every distinct lexicon word contributes once; the stored word counts
    # are not used as weights here.
    self.counts = Counter(
        p for w in self.wordlist for p in find_pairs(w))
    self.sigma_counts = sum(self.counts.values())
def get_att_transducer(f):
    """Return the first transducer found in the AT&T file at path *f*.

    The file is opened as UTF-8 text and handed to ``hfst.AttReader``.
    ``None`` is returned when the reader yields nothing, and also when HFST
    raises ``NotValidAttFormatException``; in that case the exception's
    message is printed to standard error.
    """
    with open(f, 'r', encoding='utf-8') as att_file:
        try:
            # Only the first item of the reader is wanted.
            return next(iter(hfst.AttReader(att_file)), None)
        except hfst.exceptions.NotValidAttFormatException as err:
            print(err.what(), file=sys.stderr)
    return None
import hfst

# Case 1: a well-formed file is expected to yield four transducers, and the
# file must be closed once the ``with`` block has been left.
with open('testfile.att', 'r') as f:
    transducers = list(hfst.AttReader(f, "<eps>"))
assert f.closed
assert len(transducers) == 4

# Case 2: a malformed file. The transducers read before the bad line must be
# kept, so they are appended one at a time and the format error is caught
# after the fact.
transducers = []
with open('testfile_fail.att', 'r') as f:
    try:
        for transducer in hfst.AttReader(f, "<eps>"):
            transducers.append(transducer)
    except hfst.exceptions.NotValidAttFormatException as err:
        message = err.what()
        # The error text must quote the offending line and its line number.
        assert "1 baz baz 0.3" in message
        assert "line: 11" in message
assert f.closed
assert len(transducers) == 4
impl = hfst.ImplementationType.TROPICAL_OPENFST_TYPE elif val == 'foma': impl = hfst.ImplementationType.FOMA_TYPE else: raise RuntimeError('type not recognized: ' + val) elif arg == '-e': skip_next = True epsilonstr = argv[i + 1] elif arg == '-i': skip_next = True inputfilename = argv[i + 1] else: raise RuntimeError('argument not recognized: ' + arg) istr = None if inputfilename != None: istr = open(inputfilename, 'r') else: istr = stdin ostr = hfst.HfstOutputStream() att = hfst.AttReader(istr, epsilonstr) for tr in att: ostr.write(tr) ostr.flush() if inputfilename != None: istr.close() ostr.close()