Example #1
    def search(self, input):
        """Returns tuple(index:str, resp:list, priority:float)
        
        Note that this method doesn't conform the ABC, you should only use
        it for debugging purpose or you are ONLY using this engine"""
        data = {}
        # Group responses by the set of words in each index entry.
        for index, resp in self._search_db(input):
            key = frozenset(index.split())
            resp = resp.split('\f')
            if key in data:
                data[key].extend(resp)
            else:
                data[key] = resp
        # Punctuation characters count as junk when computing similarity ratios.
        diff = SequenceMatcher(partial(contains, '?,./<>`~!@#$%&*()_+-={}[];:\'"|\\'), input + ' '.join(self.state))
        cleaned = strip_clean(input.lower())
        cleaned_words = cleaned.split()
        words = self.state.union(cleaned_words)

        def matches(entry):
            # Every index word must be a prefix of some word seen so far.
            for key in entry[0]:
                if not any(imap(methodcaller('startswith', key), words)):
                    return False
            return True

        def getdiff(text):
            diff.set_seq2(text)
            return diff.ratio()

        # Keep entries whose index words all match, then score each survivor
        # by how closely its words resemble the input (plus prior state).
        data = filter(matches, data.iteritems())
        data = [(index, resp, getdiff(' '.join(sorted(index, key=cleaned.find))))
                for index, resp in data]
        data.sort(key=itemgetter(2), reverse=True)
        self.state = keywords(input)
        return data
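A minimal, standalone sketch (Python 3, standard library only) of the scoring idea used above: difflib.SequenceMatcher accepts a junk predicate, and partial(contains, PUNCT) marks punctuation as junk so it does not influence the similarity ratio. The names PUNCT and rank are illustrative and not part of the original engine.

from difflib import SequenceMatcher
from functools import partial
from operator import contains

PUNCT = '?,./<>`~!@#$%&*()_+-={}[];:\'"|\\'

def rank(input_text, candidates):
    # contains(PUNCT, ch) is True for punctuation, so SequenceMatcher treats
    # those characters as junk, mirroring the engine above.
    diff = SequenceMatcher(partial(contains, PUNCT), input_text)
    scored = []
    for text in candidates:
        diff.set_seq2(text)
        scored.append((text, diff.ratio()))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

print(rank("what is your name?", ["your name", "the weather", "what name"]))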
Example #2
    def acronyms(self):
        # Remove known acronyms from the text before measuring how much of it
        # is written in upper case.
        for word in self.words[:]:
            if word in acronyms:
                self.text = re.sub(re.escape(word), '', self.text, flags=re.I)
        caps_count = len([None for i in self.text if i in ascii_uppercase])
        letter_count = len([None for i in self.text if i in ascii_letters])
        if letter_count != 0 and caps_count / letter_count < .5:
            # Doesn't look like all-caps spam.
            self.text = recapword.sub('', self.text)
        self.lower = self.text.lower()
        self.words = strip_clean(self.lower, proper_letters).split()
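For reference, a standalone sketch (Python 3; caps_ratio is an illustrative name) of the upper-case ratio computed above from ascii_uppercase and ascii_letters, which the method compares against 0.5:

from string import ascii_letters, ascii_uppercase

def caps_ratio(text):
    # Fraction of alphabetic characters that are upper case; 0.0 if no letters.
    letters = [c for c in text if c in ascii_letters]
    if not letters:
        return 0.0
    return sum(c in ascii_uppercase for c in letters) / len(letters)

print(caps_ratio("HELLO there"))  # 5 of 10 letters are upper case -> 0.5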
Example #3
    def search(self, input):
        input = rewhite.sub(' ', strip_clean(input))
        out = []
        regexes = []
        diff_ = SequenceMatcher(partial(contains, '?,./<>`~!@#$%&*()_+-={}[];:\'"|\\'), input)

        def diff(text):
            diff_.set_seq2(text)
            return diff_.ratio()

        for regex, resp in self._search_db(input):
            regexes.append((self.regex[regex], resp.split('\f')))

        for regex, resp in regexes:
            match = regex.search(input)
            if match is not None:
                # Strip all backreference groups off
                if match.lastindex is not None:
                    base = StringIO()
                    last = match.start()
                    for i in xrange(1, match.lastindex+1):
                        base.write(input[last:match.start(i)])
                        last = match.end(i)
                    base.write(input[last:match.end()])
                    priority = diff(base.getvalue())
                else:
                    priority = diff(match.group(0))
                
                g0 = match.group(0)

                def expand(resp):
                    # Substitute the whole match for \0 / \g<0> by hand, expand
                    # the remaining backreferences, then normalise whitespace.
                    return respaces.sub(' ', match.expand(resp.replace(r'\0', g0).replace(r'\g<0>', g0)))

                resp = map(expand, resp)  # expand \1, \g<1>, \g<name>
                out.append((match, resp, priority))
        
        out.sort(key=itemgetter(2), reverse=True)
        return out
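The group-stripping step above can be expressed as a small standalone helper. This is a sketch under the assumption that the goal is to keep only the text of the match that lies outside its capturing groups; strip_groups is an illustrative name and only the standard library is used.

import re
from io import StringIO

def strip_groups(match, text):
    # Rebuild the matched span with every capturing group's text cut out,
    # mirroring the "strip all backreference groups off" step above.
    if match.lastindex is None:
        return match.group(0)
    base = StringIO()
    last = match.start()
    for i in range(1, match.lastindex + 1):
        base.write(text[last:match.start(i)])
        last = match.end(i)
    base.write(text[last:match.end()])
    return base.getvalue()

m = re.search(r'hello (\w+), meet (\w+)', 'hello alice, meet bob')
print(strip_groups(m, 'hello alice, meet bob'))  # -> 'hello , meet '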
Example #4
def tokens(text):
    """Returns text as a list of words(tokens) in lowercase"""
    return strip_clean(text.lower(), alpha).split()
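strip_clean and alpha are project-specific helpers, so here is a rough stand-in (Python 3, illustrative only), assuming strip_clean keeps only characters from the given alphabet and turns everything else into whitespace:

from string import ascii_lowercase

def tokens(text):
    # Stand-in for strip_clean(text.lower(), alpha).split(): keep lowercase
    # letters, turn everything else into spaces, then split on whitespace.
    kept = ''.join(c if c in ascii_lowercase else ' ' for c in text.lower())
    return kept.split()

print(tokens("Hello, World!"))  # -> ['hello', 'world']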
Example #5
    def __init__(self, text):
        self.text = text
        self.lower = text.lower()
        self.words = strip_clean(self.lower, proper_letters).split()
        self.error = 0
        self.reasons = set()