Esempio n. 1
0
 def expand_file(self, fname):
     """Read a list of stops from a file and add inflected forms of their variants.

     Lines starting with '#' are skipped. For every other line, the variants
     not yet stored under the stop are analyzed and each analyzed word is
     inflected in every case from `self.cases_list`. All combinations of the
     resulting word forms are joined by spaces, post-processed and merged into
     `self.stops[stop]` with duplicates removed.

     A dot is printed to stderr after each 1000 processed lines and a newline
     at the end.
     """
     with codecs.open(fname, 'r', 'UTF-8') as f_in:
         # comment lines are filtered out up front, so they are never counted
         data_lines = (raw for raw in f_in if not raw.startswith('#'))
         for line_no, line in enumerate(data_lines, 1):
             stop, variants = self.parse_line(line)
             # variants already stored for this stop need no inflection
             pending = [name for name in variants if name not in self.stops[stop]]
             for variant in pending:
                 analysis = self.__analyzer.analyze(variant)
                 for case in self.cases_list:
                     # inflect word by word, handing each word the third
                     # field (word[2]) of its predecessor as `prev_tag`
                     forms = []
                     prev_tag = ''
                     for word in analysis:
                         forms.append(self.__inflect_word(word, case, prev_tag))
                         prev_tag = word[2]
                     # a word may have several forms: build every combination
                     phrases = [' '.join(combo) for combo in itertools.product(*forms)]
                     inflected = map(self.__postprocess_func, remove_dups_stable(phrases))
                     merged = self.stops[stop] + inflected
                     self.stops[stop] = list(remove_dups_stable(merged))
             # progress indicator
             if line_no % 1000 == 0:
                 print >> sys.stderr, '.',
     print >> sys.stderr
Esempio n. 2
0
 def expand_file(self, fname):
     """Read a list of stops from a file and add inflected forms of their variants.

     Lines starting with '#' are skipped. For every other line, the variants
     not yet stored under the stop are analyzed and passed, together with
     `self.personal_names`, to the generator's `inflect` for every case in
     `self.cases_list`. All combinations of the returned forms are joined by
     spaces, post-processed and merged into `self.stops[stop]` with
     duplicates removed.

     A dot is printed to stderr after each 1000 processed lines and a newline
     at the end.
     """
     with codecs.open(fname, 'r', 'UTF-8') as f_in:
         processed = 0
         for line in f_in:
             # comment lines carry no data and are not counted
             if line.startswith('#'):
                 continue
             stop, variants = self.parse_line(line)
             # variants already stored for this stop need no inflection
             pending = [name for name in variants if name not in self.stops[stop]]
             for variant in pending:
                 analysis = self.__analyzer.analyze(variant)
                 for case in self.cases_list:
                     forms = self.__generator.inflect(analysis, case, self.personal_names)
                     # there may be several forms per position: build every combination
                     combos = itertools.product(*forms)
                     phrases = remove_dups_stable([' '.join(parts) for parts in combos])
                     inflected = map(self.__postprocess_func, phrases)
                     merged = self.stops[stop] + inflected
                     self.stops[stop] = list(remove_dups_stable(merged))
             processed += 1
             # progress indicator
             if processed % 1000 == 0:
                 print >> sys.stderr, '.',
     print >> sys.stderr
Esempio n. 3
0
def expand_abbrevs(stop_name):
    """Apply all abbreviation expansions to the given stop name.

    Every (regex, replacement) pair in ABBREV_RULES is applied, in order, to
    all variants produced so far. If the replacement is a list, each of its
    items is substituted into each variant and duplicates are removed via
    remove_dups_stable; otherwise the single replacement is substituted into
    each variant. If the canonical name matches NUM_FINDER, expand_numbers is
    then applied to every variant.

    :param stop_name: the stop name to expand
    :return: a tuple (canonical name, list of variant names), both stripped \
        of surrounding whitespace; the canonical name is the first variant \
        as it was before number expansion
    :raise Exception: any error raised while applying a rule is re-raised \
        after the pattern of the failing rule has been printed to stderr
    """
    # add spaces to have simpler regexes
    variants = [' ' + stop_name + ' ']
    # process all regexes
    for regex, repls in ABBREV_RULES:
        try:
            # replacement variants
            if isinstance(repls, list):
                variants = list(
                    remove_dups_stable([
                        regex.sub(repl, var) for repl in repls
                        for var in variants
                    ]))
            # just a single replacement
            else:
                variants = [regex.sub(repls, var) for var in variants]
        except Exception:
            # report which rule failed, then re-raise; a bare `raise` keeps
            # the original traceback, which `raise e` would discard
            print >> sys.stderr, unicode(regex.pattern).encode('utf-8')
            raise
    # keep the first variant as "canonical", to be used in SLU, DM, and NLG
    stop_name = variants[0]
    # process numbers in variants
    if NUM_FINDER.search(stop_name):
        variants = [expand_numbers(var) for var in variants]
    # remove the added spaces
    stop_name = stop_name.strip()
    variants = [var.strip() for var in variants]
    # return the result
    return stop_name, variants
Esempio n. 4
0
def expand_abbrevs(stop_name):
    """Apply all abbreviation expansions to the given stop name.

    Every (regex, replacement) pair in ABBREV_RULES is applied, in order, to
    all variants produced so far. If the replacement is a list, each of its
    items is substituted into each variant and duplicates are removed via
    remove_dups_stable; otherwise the single replacement is substituted into
    each variant. If the canonical name matches NUM_FINDER, expand_numbers is
    then applied to every variant.

    :param stop_name: the stop name to expand
    :return: a tuple (canonical name, list of variant names), both stripped \
        of surrounding whitespace; the canonical name is the first variant \
        as it was before number expansion
    :raise Exception: any error raised while applying a rule is re-raised \
        after the pattern of the failing rule has been printed to stderr
    """
    # add spaces to have simpler regexes
    variants = [" " + stop_name + " "]
    # process all regexes
    for regex, repls in ABBREV_RULES:
        try:
            # replacement variants
            if isinstance(repls, list):
                variants = list(remove_dups_stable([regex.sub(repl, var) for repl in repls for var in variants]))
            # just a single replacement
            else:
                variants = [regex.sub(repls, var) for var in variants]
        except Exception:
            # report which rule failed, then re-raise; a bare `raise` keeps
            # the original traceback, which `raise e` would discard
            print >>sys.stderr, unicode(regex.pattern).encode("utf-8")
            raise
    # keep the first variant as "canonical", to be used in SLU, DM, and NLG
    stop_name = variants[0]
    # process numbers in variants
    if NUM_FINDER.search(stop_name):
        variants = [expand_numbers(var) for var in variants]
    # remove the added spaces
    stop_name = stop_name.strip()
    variants = [var.strip() for var in variants]
    # return the result
    return stop_name, variants
Esempio n. 5
0
 def load_file(self, fname):
     """Read stops and their variants from a file into `self.stops`.

     Lines starting with '#' are skipped. The variants parsed from a line
     are placed in front of those already stored for the same stop and the
     result is de-duplicated via remove_dups_stable.
     """
     with codecs.open(fname, 'r', 'UTF-8') as stop_file:
         # drop comment lines before parsing
         data_lines = (raw for raw in stop_file if not raw.startswith('#'))
         for line in data_lines:
             name, names = self.parse_line(line)
             combined = names + self.stops[name]
             self.stops[name] = list(remove_dups_stable(combined))
Esempio n. 6
0
 def load_file(self, fname):
     """Read stops and their variants from a file into `self.stops`.

     Lines starting with '#' are skipped. The variants parsed from a line
     are placed in front of those already stored for the same stop and the
     result is de-duplicated via remove_dups_stable.
     """
     with codecs.open(fname, 'r', 'UTF-8') as source:
         for entry in source:
             # only non-comment lines describe a stop
             if not entry.startswith('#'):
                 key, parsed = self.parse_line(entry)
                 merged = parsed + self.stops[key]
                 self.stops[key] = list(remove_dups_stable(merged))