def expand_file(self, fname):
    """Read stops from a file and store inflected forms of their new variants.

    Lines starting with '#' are comments and are ignored. Every other line
    is handed to self.parse_line(), which yields a stop and its variant
    names. Variants not yet stored under that stop in self.stops are
    analyzed and inflected, word by word, into every case listed in
    self.cases_list; the resulting phrases are post-processed and merged
    into self.stops[stop] through remove_dups_stable().

    A dot is written to stderr after every 1000 processed (non-comment)
    lines, and a newline once the whole file has been read.

    NOTE(review): self.stops[stop] is read before it is ever assigned here,
    so self.stops presumably supplies a default (empty list) for unknown
    stops -- confirm against the constructor.
    """
    with codecs.open(fname, 'r', 'UTF-8') as stops_file:
        # drop comment lines up front, so that only real entries are counted
        entries = (row for row in stops_file if not row.startswith('#'))
        for done, row in enumerate(entries, 1):
            stop, variants = self.parse_line(row)
            # only variants not stored for this stop yet need inflecting
            pending = [name for name in variants if name not in self.stops[stop]]
            for name in pending:
                words = self.__analyzer.analyze(name)
                for case in self.cases_list:
                    # inflect word by word; each call also receives element 2
                    # of the preceding word ('' for the first word)
                    prev_tag = ''
                    forms = []
                    for word in words:
                        forms.append(self.__inflect_word(word, case, prev_tag))
                        prev_tag = word[2]
                    # a word may have several forms: build every combination
                    phrases = [' '.join(combo) for combo in itertools.product(*forms)]
                    inflected = map(self.__postprocess_func, remove_dups_stable(phrases))
                    merged = self.stops[stop] + inflected
                    self.stops[stop] = list(remove_dups_stable(merged))
            # progress indicator
            if done % 1000 == 0:
                print >> sys.stderr, '.',
    print >> sys.stderr
def expand_file(self, fname):
    """Read stops from a file and store inflected forms of their new variants.

    Lines starting with '#' are comments and are ignored. Every other line
    is handed to self.parse_line(), which yields a stop and its variant
    names. Variants not yet stored under that stop in self.stops are
    analyzed and passed to self.__generator.inflect() once for every case
    in self.cases_list (together with self.personal_names); the resulting
    phrases are post-processed and merged into self.stops[stop] through
    remove_dups_stable().

    A dot is written to stderr after every 1000 processed (non-comment)
    lines, and a newline once the whole file has been read.

    NOTE(review): self.stops[stop] is read before it is ever assigned here,
    so self.stops presumably supplies a default (empty list) for unknown
    stops -- confirm against the constructor.
    """
    with codecs.open(fname, 'r', 'UTF-8') as stops_file:
        # drop comment lines up front, so that only real entries are counted
        entries = (row for row in stops_file if not row.startswith('#'))
        for done, row in enumerate(entries, 1):
            stop, variants = self.parse_line(row)
            # only variants not stored for this stop yet need inflecting
            pending = [name for name in variants if name not in self.stops[stop]]
            for name in pending:
                words = self.__analyzer.analyze(name)
                for case in self.cases_list:
                    forms = self.__generator.inflect(words, case, self.personal_names)
                    # a word may have several forms: build every combination
                    phrases = [' '.join(combo) for combo in itertools.product(*forms)]
                    inflected = map(self.__postprocess_func, remove_dups_stable(phrases))
                    merged = self.stops[stop] + inflected
                    self.stops[stop] = list(remove_dups_stable(merged))
            # progress indicator
            if done % 1000 == 0:
                print >> sys.stderr, '.',
    print >> sys.stderr
def expand_abbrevs(stop_name):
    """Apply all abbreviation expansions to the given stop name.

    The rules in ABBREV_RULES are applied in order, each one to every
    variant produced so far. A rule whose replacement is a list multiplies
    the variants (one result per replacement, duplicates removed through
    remove_dups_stable()); a rule with a single replacement rewrites the
    variants in place.

    Returns a tuple (stop_name, variants): the 'main' (canonical) name and
    the list of all resulting variant names, starting with the variant the
    canonical name was taken from. The canonical name is picked before
    numbers are expanded, so it is not passed through expand_numbers().

    If a substitution fails, the pattern of the offending rule is printed
    to stderr and the original exception is re-raised.
    """
    # add spaces to have simpler regexes
    variants = [' ' + stop_name + ' ']
    # process all regexes
    for regex, repls in ABBREV_RULES:
        try:
            # replacement variants: every replacement applied to every variant
            if isinstance(repls, list):
                variants = list(remove_dups_stable(
                    [regex.sub(repl, var) for repl in repls for var in variants]))
            # just a single replacement
            else:
                variants = [regex.sub(repls, var) for var in variants]
        except Exception:
            print >> sys.stderr, unicode(regex.pattern).encode('utf-8')
            # bare raise keeps the traceback of the original failure
            # (`raise e` would restart it from this line under Python 2)
            raise
    # keep the first variant as "canonical", to be used in SLU, DM, and NLG
    stop_name = variants[0]
    # process numbers in variants (only if the canonical name contains some)
    if NUM_FINDER.search(stop_name):
        variants = [expand_numbers(var) for var in variants]
    # remove the added spaces
    stop_name = stop_name.strip()
    variants = [var.strip() for var in variants]
    # return the result
    return stop_name, variants
def expand_abbrevs(stop_name):
    """Apply all abbreviation expansions to the given stop name.

    The rules in ABBREV_RULES are applied in order, each one to every
    variant produced so far. A rule whose replacement is a list multiplies
    the variants (one result per replacement, duplicates removed through
    remove_dups_stable()); a rule with a single replacement rewrites the
    variants in place.

    Returns a tuple (stop_name, variants): the 'main' (canonical) name and
    the list of all resulting variant names, starting with the variant the
    canonical name was taken from. The canonical name is picked before
    numbers are expanded, so it is not passed through expand_numbers().

    If a substitution fails, the pattern of the offending rule is printed
    to stderr and the original exception is re-raised.
    """
    # add spaces to have simpler regexes
    variants = [" " + stop_name + " "]
    # process all regexes
    for regex, repls in ABBREV_RULES:
        try:
            # replacement variants: every replacement applied to every variant
            if isinstance(repls, list):
                variants = list(remove_dups_stable([regex.sub(repl, var) for repl in repls for var in variants]))
            # just a single replacement
            else:
                variants = [regex.sub(repls, var) for var in variants]
        except Exception:
            print >>sys.stderr, unicode(regex.pattern).encode("utf-8")
            # bare raise keeps the traceback of the original failure
            # (`raise e` would restart it from this line under Python 2)
            raise
    # keep the first variant as "canonical", to be used in SLU, DM, and NLG
    stop_name = variants[0]
    # process numbers in variants (only if the canonical name contains some)
    if NUM_FINDER.search(stop_name):
        variants = [expand_numbers(var) for var in variants]
    # remove the added spaces
    stop_name = stop_name.strip()
    variants = [var.strip() for var in variants]
    # return the result
    return stop_name, variants
def load_file(self, fname):
    """Read stops and their variants from a file into self.stops as they are.

    Lines starting with '#' are comments and are ignored. Every other line
    is handed to self.parse_line(), which yields a stop and its variant
    names; these are placed in front of whatever is already stored under
    that stop and the result is passed through remove_dups_stable().

    NOTE(review): self.stops[stop] is read before it is ever assigned here,
    so self.stops presumably supplies a default (empty list) for unknown
    stops -- confirm against the constructor.
    """
    with codecs.open(fname, 'r', 'UTF-8') as stops_file:
        for row in stops_file:
            if not row.startswith('#'):  # '#' marks a comment line
                stop, variants = self.parse_line(row)
                # names from the file go first, already stored ones after them
                combined = variants + self.stops[stop]
                self.stops[stop] = list(remove_dups_stable(combined))