def build(self, text=None):
    """
    Take text (for example as returned by corpus() ) and build a trie.

    For every character position in the text, the longest n-gram that
    starts there is collected and added to a fresh trie.  An n-gram stops
    growing as soon as it holds self.max_chars characters or contains
    self.max_words completed words, whichever happens first, or when the
    end of the text is reached.  A word counts as completed when a space
    follows a non-space character.

    text: the corpus to index.  If None, self.read_corpus() is called to
          obtain it.

    On completion self.prefixtree, self.corpus and self.timestamp are
    overwritten.  Returns self.
    """
    # Identity test rather than `== None`: equality can be overridden by
    # the object being compared, identity cannot.
    if text is None:
        text = self.read_corpus()

    t = TreeStructure.trie()
    istop = len(text)
    pr = progress(istop, 'building language model')

    for i in range(istop):
        substr = ''
        j = i
        nwords = 0
        lastchar = ' '  # pretend a space precedes the n-gram so leading spaces end no word
        while j < istop:
            char = text[j]
            substr += char
            endword = (char == ' ' and lastchar != ' ')
            lastchar = char
            j += 1
            if endword:
                nwords += 1
            if len(substr) >= self.max_chars or nwords >= self.max_words:
                break
        # One insertion per start position, after the n-gram has reached
        # its full permitted length.
        t.add(substr)
        if i % 5000 == 0:
            pr.update(i, '(%d nodes)' % len(t.nodes))

    self.prefixtree = t
    self.corpus = text
    self.timestamp = time.localtime()[:6]  # (year, month, day, hour, minute, second)
    return self
def __init__(self, filename=None, alphabet=None, translation_file=None, space_delimited=False, max_chars=16, max_words=3, trim=True, verbose=False):
    """
    Set up the model's configuration, optionally loading a stored trie.

    filename:         if not None, passed to self.loadtrie() after the
                      attributes below have been initialised.
    alphabet:         stored as self.alphabet; default_alphabet is used
                      when None.
    translation_file: resolved through FindFile() and then handed to
                      self.read_translations().
    space_delimited,
    trim, verbose:    stored unchanged as attributes of the same name.
    max_chars,
    max_words:        stored unchanged; build() uses them as the limits
                      on n-gram length.

    self.corpus and self.timestamp start out as None and self.prefixtree
    as an empty TreeStructure.trie().
    """
    # Identity tests rather than `== None` / `!= None`: an array-like
    # alphabet would otherwise be compared element by element.
    if alphabet is None:
        alphabet = default_alphabet

    self.alphabet = alphabet
    self.space_delimited = space_delimited
    self.translation_file = FindFile(translation_file)
    self.max_chars = max_chars
    self.max_words = max_words
    self.trim = trim
    self.verbose = verbose
    self.corpus = None
    self.timestamp = None
    self.prefixtree = TreeStructure.trie()

    # Loading happens only after the defaults above are in place, so
    # anything loadtrie() assigns takes precedence over them.
    if filename is not None:
        self.loadtrie(filename)

    self.translations = self.read_translations(self.translation_file)
    # NOTE(review): presumably the attribute names that identify a model
    # configuration when hashing -- confirm against the code reading hashattr.
    self.hashattr = ['space_delimited', 'max_words', 'max_chars', 'trim', 'translations']
def build(self, text=None):
    """
    Take text (for example as returned by corpus() ) and build a trie.
    The maximum n-gram length is self.max_chars characters or
    self.max_words words, whichever is shorter.

    text: corpus to scan; when None it is fetched with self.read_corpus().

    One n-gram is added to the trie per starting position in the text.
    Afterwards the new trie is stored in self.prefixtree, the text in
    self.corpus and the first six fields of time.localtime() in
    self.timestamp.  The return value is self.
    """
    # `is None` instead of `== None`: the comparison must not depend on
    # how the text object happens to implement equality.
    if text is None:
        text = self.read_corpus()

    trie = TreeStructure.trie()
    total = len(text)
    pr = progress(total, 'building language model')

    for start in range(total):
        ngram = ''
        words_seen = 0
        previous = ' '  # initial value stops a leading space from closing a word
        pos = start
        while pos < total:
            current = text[pos]
            pos += 1
            ngram += current
            # A word ends where a space follows a non-space character.
            if current == ' ' and previous != ' ':
                words_seen += 1
            previous = current
            if len(ngram) >= self.max_chars or words_seen >= self.max_words:
                break
        trie.add(ngram)
        # Report progress only every 5000 positions.
        if start % 5000 == 0:
            pr.update(start, '(%d nodes)' % len(trie.nodes))

    self.prefixtree = trie
    self.corpus = text
    self.timestamp = time.localtime()[:6]
    return self
def convert(fileName):
    """
      Parses the given XML file with TS.parse and calls printGetPot() on the
      resulting tree.
      NOTE(review): earlier documentation described this as producing a ".i"
      file and returning None; the code returns the printGetPot() result, so
      confirm which is intended.
      @ In, fileName, the name for the XML to convert
      @ Out, result of printGetPot() on the parsed tree
    """
    return TS.parse(fileName).printGetPot()
def __init__(self, filename=None, alphabet=None, translation_file=None, space_delimited=False, max_chars=16, max_words=3, trim=True, verbose=False):
    """
    Initialise the instance and, if a filename is supplied, load a trie.

    filename:          trie file handed to self.loadtrie(); skipped if None.
    alphabet:          value for self.alphabet (default_alphabet if None).
    translation_file:  looked up with FindFile(); the result is kept in
                       self.translation_file and read with
                       self.read_translations().
    space_delimited:   stored as self.space_delimited.
    max_chars:         stored as self.max_chars (n-gram limit in build()).
    max_words:         stored as self.max_words (n-gram limit in build()).
    trim:              stored as self.trim.
    verbose:           stored as self.verbose.
    """
    # Plain configuration values, copied straight from the arguments.
    self.space_delimited = space_delimited
    self.max_chars = max_chars
    self.max_words = max_words
    self.trim = trim
    self.verbose = verbose

    # `is None` rather than `== None`, so that an alphabet whose type
    # overrides equality (e.g. an array) is handled correctly.
    self.alphabet = default_alphabet if alphabet is None else alphabet
    self.translation_file = FindFile(translation_file)

    # No corpus has been read and no model built yet.
    self.corpus = None
    self.timestamp = None
    self.prefixtree = TreeStructure.trie()

    # Must follow the assignments above: loadtrie() replaces some of them.
    if filename is not None:
        self.loadtrie(filename)

    self.translations = self.read_translations(self.translation_file)
    self.hashattr = [
        'space_delimited',
        'max_words',
        'max_chars',
        'trim',
        'translations',
    ]
def loadtrie(self, filename):
    """
    Replace self.prefixtree with a trie constructed from a file, and set
    self.alphabet to the sorted keys of that trie's dist('') result.

    filename: resolved through FindFile() before being passed to
              TreeStructure.trie().
    """
    # TODO: @@@ more graceful behaviour if not found?
    located = FindFile(filename)
    loaded = TreeStructure.trie(located)
    self.prefixtree = loaded
    self.alphabet = sorted(loaded.dist('').keys())
def loadtrie(self, filename):
    """
    Load a trie from file into self.prefixtree and rebuild self.alphabet.

    filename: name of the trie file; it is first passed through FindFile().

    The alphabet becomes the sorted list of keys returned by the new
    trie's dist('') call.
    """
    # TODO: @@@ more graceful behaviour if not found?
    self.prefixtree = TreeStructure.trie(FindFile(filename))
    root_distribution = self.prefixtree.dist('')
    self.alphabet = sorted(root_distribution.keys())