def __init__(self, region, lines, rule, commands, source):
    Geometry.__init__(self, region, lines)
    # Subscribe.__init__(self, 'topic')
    Regex.__init__(self, rule)
    Database.__init__(self)
    self.commands = commands
    self.source_id = source
def __init__(self, numH, strings):
    self.hSpace_ = list()
    self.strings_ = strings
    self.baseH_ = Regex(strings)
    self.baseHProb_ = self.likelihood(self.baseH_)
    self.numH_ = numH
    self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])
def main():
    ###### testing code snippets (leftover from development) ######
    re = Regex.compile('(.)\\1')
    re.display()
    assert re.match('AA')
    assert not re.match('AB')
    print "===================================="
    re = Regex.compile('AA')
    re.display()
    assert not re.match('A')
    assert re.match('AA')
    assert not re.match('AAAA')
    print "===================================="
    re = Regex.compile('(O|RHH|MM)*')
    re.display()
    assert re.match('')
    assert re.match('OOOO')
    assert re.match('MMORHHO')
    assert not re.match('MMORHHH')
    assert re.match('ORHH')
    print "===================================="
    re = Regex.compile('((A)\\2)\\1')
    re.display()
    assert re.match('AAAA')
    return 0
class Detokenizer(object):
    """\
    A simple de-tokenizer class.
    """

    def __init__(self):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # compile regexes
        self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE)
        self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE)
        self._contract = Regex(r" (\p{Alpha}+) ' ?(ll|ve|re|[dsmt])(?= )",
                               flags=UNICODE | IGNORECASE)
        self._fixes = Regex(r" (do|go[nt]|wan) (n't|ta|na)(?= )",
                            flags=UNICODE | IGNORECASE)
        self._replace_table = {' i ': ' I ', ' im ': ' I\'m ', ' dont ': ' don\'t '}

    def detokenize(self, text):
        """\
        Detokenize the given text.
        """
        text = ' ' + text + ' '
        text = self._currency_or_init_punct.sub(r' \1', text)
        text = self._noprespace_punct.sub(r'\1 ', text)
        text = self._contract.sub(r" \1'\2", text)
        text = self._fixes.sub(r' \1\2', text)
        for tok, repl in self._replace_table.iteritems():
            text = text.replace(tok, repl)
        text = text.strip()
        # capitalize
        if not text:
            return ''
        text = text[0].upper() + text[1:]
        return text
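# Minimal usage sketch for the Detokenizer above (assumption: `Regex`, `UNICODE`
# and `IGNORECASE` come from the third-party `regex` package, i.e.
# `from regex import Regex, UNICODE, IGNORECASE`; the sample sentence is made up).
# Note that `dict.iteritems()` in `detokenize` ties this snippet to Python 2.
if __name__ == '__main__':
    detok = Detokenizer()
    # tokenized MT-style output -> punctuation and contractions re-attached;
    # this should come out roughly as: I don't know, sorry.
    print(detok.detokenize("i do n't know , sorry ."))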
def main():
    x_regexes = [
        '.*H.*H.*',
        '(DI|NS|TH|OM)*',
        'F.*[AO].*[AO].*',
        '(O|RHH|MM)*',
        '.*',
        'C*MC(CCC|MM)*',
        '[^C]*[^R]*III.*',
        '(...?)\\1*',
        '([^X]|XCC)*',
        '(RR|HHH)*.?',
        'N.*X.X.X.*E',
        'R*D*M*',
        '.(C|HH)*',
    ]
    y_regexes = [
        '(ND|ET|IN)[^X]*',
        '[CHMNOR]*I[CHMNOR]*',
        'P+(..)\\1.*',
        '(E|CR|MN)*',
        '([^MC]|MM|CC)*',
        '[AM]*CM(RC)*R?',
        '.*',
        '.*PRR.*DDC.*',
        '(HHX|[^HX])*',
        '([^EMC]|EM)*',
        '.*OXR.*',
        '.*LR.*RL.*',
        '.*SE.*UE.*',
    ]
    # start with x = 0, y = max
    z_regexes = [
        '.*G.*V.*H.*',
        '[CR]*',
        '.*XEXM*',
        '.*DD.*CCM.*',
        '.*XHCR.*X.*',
        '.*(.)(.)(.)(.)\\4\\3\\2\\1.*',
        '.*(IN|SE|HI)',
        '[^C]*MMM[^C]*',
        '.*(.)C\\1X\\1.*',
        '[CEIMU]*OH[AEMOR]*',
        '(RX|[^R])*',
        '[^M]*M[^M]*',
        '(S|MM|HHH)*',
    ]
    n = 7
    x_regexes = [Regex.compile(i) for i in x_regexes]
    y_regexes = [Regex.compile(i) for i in y_regexes]
    z_regexes = [Regex.compile(i) for i in z_regexes]
    arr = RegexCrossword.solve(n, x_regexes, y_regexes, z_regexes)
    display_hexagon(arr)
    return 0
def __init__(self, value):
    regex = value.replace("<HOST>", "(?:::f{4,6}:)?(?P<host>\S+)")
    Regex.__init__(self, regex)
    if "host" not in self._regexObj.groupindex:
        raise RegexException("No 'host' group in '%s'" % self._regex)
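# Hedged sketch of what the "<HOST>" substitution above expands to; this uses the
# standard `re` module directly instead of the project's Regex wrapper, and the
# template and sample log line are invented.
import re

template = r"Failed login from <HOST>$"
pattern = re.compile(template.replace("<HOST>", r"(?:::f{4,6}:)?(?P<host>\S+)"))
m = pattern.search("Failed login from ::ffff:192.0.2.7")
if m:
    print(m.group("host"))  # 192.0.2.7 (the IPv4-mapped IPv6 prefix is stripped)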
def isNumber(token):
    e = Regex.e()
    pi = Regex.pi()
    num = Regex.number()
    if e.match(token) == None and pi.match(token) == None and num.match(token) == None:
        return False
    else:
        return True
def __init__(self, id: int, m: Regex) -> None:
    super().__init__()
    self._attrs = ["mstart", "mend", "id"]
    self.key = "R{}".format(id)
    self.id = id
    self.match = m
    self.mstart = m.span(self.key)[0]
    self.mend = m.span(self.key)[1]
    self._text = m.group(self.key)
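# Minimal sketch of the match-object calls used above (m.span(key) / m.group(key)),
# using the third-party `regex` module directly; the "R0" pattern and sample text
# are illustrative only.
import regex

m = regex.search(r"(?P<R0>\d{4}-\d{2}-\d{2})", "released on 2017-10-27, updated later")
span = m.span("R0")   # character offsets of the named group, here (12, 22)
text = m.group("R0")  # '2017-10-27'
print(span, text)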
def __init__(self, is_training=False):
    self.classifier = None
    self.feature_model = None
    self.regex_rule = Regex()
    if not is_training:
        self.classifier = utils.load(os.path.join('vnspliter/model', 'model.pkl'))
        if self.classifier is None:
            print "Unable to load model!"
            exit(-1)
def regex_to_fa(self):
    regex_str = self.regex_input.text()
    try:
        self.fa = Regex(regex_str).dfa
    except SyntaxError as e:
        self.show_error(e)
        return
    self.fa.regex_str = regex_str
    self.add_fa_to_list()
def determine(token):
    rg_e = Regex.e()
    rg_pi = Regex.pi()
    rg_num = Regex.number()
    if rg_e.match(token) != None:
        return Number(e)
    elif rg_pi.match(token) != None:
        return Number(pi)
    else:
        return Number(token)
def determine(name, param=None, base=None, exponential=None):
    sin = Regex.sin()
    cos = Regex.cos()
    tan = Regex.tan()
    csc = Regex.csc()
    sec = Regex.sec()
    cot = Regex.cot()
    exp = Regex.exp()
    pow = Regex.pow()
    log = Regex.log()
    if sin.match(name) != None:
        return Sin(param)
    elif cos.match(name) != None:
        return Cos(param)
    elif tan.match(name) != None:
        return Tan(param)
    elif csc.match(name) != None:
        return Csc(param)
    elif sec.match(name) != None:
        return Sec(param)
    elif cot.match(name) != None:
        return Cot(param)
    elif exp.match(name) != None:
        return Exp(param)
    elif pow.match(name) != None:
        return Pow(base, exponential)
    elif log.match(name) != None:
        return Log(base, exponential)
    else:
        return None
def to_regex(self):
    """
    Returns a regex approximation

    Args:
        None

    Returns:
        str: A regex approximation
    """
    from regex import Regex
    converter = Regex(self)
    return converter.get_regex()
def add_starred_from_converters(self, _from1, _to1, functional_object, converters):
    other_things = [(f, t, functional_object2)
                    for f, t, functional_object2 in converters]
    for _from2, _to2, functional_object2 in flatten_optional_list_triple(other_things):
        if "*" in _to2:
            other_things_regex = Regex("^" + _from2.replace("*", r"(\w+)") + "$")
            m = other_things_regex.match(_to1)
            if m:
                new_to = _to2.replace("*", m.group(1))
                new_from = _to2.replace("*", m.group(1))
                self.add_edge(_to1, new_from, functional_object2)
class Replacer(object):
    def __init__(self):
        self.__author__ = "Revo"
        self.__date__ = "2017-10-27"
        # email address:
        self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
        # url address:
        self.__url_addr = Regex(
            r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        )
        # Numbers
        self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
        # Replace with add one
        self.__addone = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
        # double space to single
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.line = 0

    def process(self, text, ori_line):
        #print text
        self.line += 1
        list_tags = self.__addone.findall(text)
        if list_tags:
            #print list_tags
            print "LINE:", self.line
            print "IN,", text
            print "ORI,", ori_line
            email_list = self.__email_addr.findall(ori_line)
            num_list = self.__numbers.findall(ori_line)
            url_list = self.__url_addr.findall(ori_line)
            print "EMAIL,", email_list
            print "NUM,", num_list
            print "URL,", url_list
            for match in list_tags:
                try:
                    if match[1] == "URL":
                        text = text.replace(match[0], url_list[int(match[2]) - 1][0])
                    elif match[1] == "EMAIL":
                        text = text.replace(match[0], email_list[int(match[2]) - 1])
                    elif match[1] == "NUM":
                        # eight->problem
                        text = text.replace(match[0], num_list[int(match[2]) - 1])
                except BaseException:
                    print "F****D"
                    pass
            print "REPLACED:", text
            print "-----"
def __init__(self, db, app):
    self.wnioski = Wnioski(db)
    self.db = db
    self.app = app
    self.regex = Regex()
    if self.db.session.query(TassDB).all() == []:
        # messages below are in Polish: "database empty, loading data" /
        # "data loaded" / "extracting locations" / "database ready"
        print('baza pusta, wczytuje dane')
        self.inicjuj_baze()
        print('dane wczytane')
        print('wyciągam lokalizacje')
        self._czysc_lokalizacje()
        self.regexuj_lokalizacje()
        print('baza danych gotowa')
    else:
        # Polish: "the database has already been created; to reload it,
        # delete the database file serwer/TASS.db"
        print('baza została już wcześniej utworzona')
        print('aby ją wczytać ponownie usun plik bazy serwer/TASS.db')
def afd_minimo(archivo_regex, archivo_automata):
    # build the minimal DFA ("AFD minimo") from a regex file and write it out
    regex = Regex.crear_desde_archivo(archivo_regex)
    automata = regex.automata()
    automata.determinizar()
    automata.minimizar()
    automata.escribir_archivo(archivo_automata)
def add_starred(self, _from1, _to1, functional_object, converters):
    if _from1 == None:
        _from1 = OUT_OF_THE_BOX
    if "*" in _from1:
        other_things = [(f, t) for f, t, o in converters]
        new_things_regex = Regex("^" + _from1.replace("*", r"(\w+)") + "$")
        for _from2, _to2 in flatten_optional_list_pair(other_things):
            m = new_things_regex.match(_to2)
            if m:
                new_from = _to1.replace("*", m.group(1))
                self.add_edge(_to2, new_from, functional_object)
                self.add_starred_from_converters(_to2, new_from, functional_object, converters)
def _greedy_split(self, input: str, re: regex.Regex) -> List[str]:
    """
    Splits an input string greedily from a list of prefixes. Stops when no
    more matches are found.

    Args:
        input (str): input string
        re (regex.Regex): Prefix match object

    Returns:
        (list) of prefixes

    Raises:
        (KrakenEncodeException) if no prefix match is found for some part
        of the string.
    """
    r = []  # type: List[str]
    idx = 0
    while True:
        mo = re.match(input, idx)
        if mo is None or idx == len(input):
            if len(input) > idx:
                raise KrakenEncodeException('No prefix matches for input after {}'.format(idx))
            return r
        r.append(mo.group())
        idx = mo.end()
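# Standalone sketch of the same greedy prefix split, runnable without the
# surrounding codebase (assumptions: prefixes are plain string alternatives,
# and ValueError stands in for KrakenEncodeException).
import regex


def greedy_split(text, prefixes):
    # longest alternatives first, so the longest prefix wins at each position
    pattern = regex.compile('|'.join(regex.escape(p)
                                     for p in sorted(prefixes, key=len, reverse=True)))
    parts, idx = [], 0
    while idx < len(text):
        mo = pattern.match(text, idx)
        if mo is None:
            raise ValueError('No prefix matches for input after {}'.format(idx))
        parts.append(mo.group())
        idx = mo.end()
    return parts


print(greedy_split('foobarbaz', ['foo', 'bar', 'baz']))  # ['foo', 'bar', 'baz']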
def refang_text2(txt: str, re: regex.Regex = re_fang, fangs: dict = FANGS):
    '''
    Remove address de-fanging in text blobs, e.g. example[.]com to example.com

    Notes:
        Matching against keys in FANGS is case-insensitive, but replacement
        will always be with the lowercase version of the re-fanged value.
        For example, ``HXXP://FOO.COM`` will be returned as ``http://FOO.COM``

    Args:
        txt (str): The text to re-fang.

    Returns:
        tuple(str, dict): A tuple containing the new text, and a dictionary
        containing offset information where the new text was altered with
        respect to the original text.
    '''
    # The _consumed key is an offset used to track how many chars have been
    # consumed while the cb is called. This is because the match group
    # span values are based on their original string locations, and will not
    # produce values which can be cleanly mapped backwards.
    offsets = {'_consumed': 0}
    cb = functools.partial(_refang2_func, offsets=offsets, fangs=fangs)

    # Start applying FANGs and modifying the info to match the output
    ret = re.sub(cb, txt)

    # Remove the _consumed key since it is no longer useful for later use.
    offsets.pop('_consumed')
    return ret, offsets
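# Simplified, standalone sketch of the re-fanging idea (the real re_fang,
# _refang2_func and the offset bookkeeping are not shown here); the FANGS table
# below is an illustrative subset, not the library's actual mapping.
import regex

FANGS = {'hxxp': 'http', 'hxxps': 'https', '[.]': '.', '(.)': '.', '[at]': '@'}
fang_re = regex.compile('|'.join(regex.escape(k)
                                 for k in sorted(FANGS, key=len, reverse=True)),
                        flags=regex.IGNORECASE)


def refang(txt):
    # replace each de-fanged token with its plain equivalent
    return fang_re.sub(lambda m: FANGS[m.group(0).lower()], txt)


print(refang('hxxp://example[.]com'))  # http://example.com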
def __init__(self, codes, separator='@@', vocab=None, glossaries=None):
    # check version information
    #codes = codecs.open(codes,"r", encoding='utf-8')
    firstline = codes.readline()
    if firstline.startswith('#version:'):
        self.version = tuple(
            [int(x) for x in re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split(".")])
    else:
        self.version = (0, 1)
        codes.seek(0)
    self.bpe_codes = [tuple(item.split()) for item in codes]
    # some hacking to deal with duplicates (only consider first instance)
    self.bpe_codes = dict([(code, i)
                           for (i, code) in reversed(list(enumerate(self.bpe_codes)))])
    self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair)
                                   for pair, i in self.bpe_codes.items()])
    self.separator = separator
    self.vocab = vocab
    #self.glossaries = glossaries if glossaries else []
    self.glossaries = []
    # for i in xrange(30):
    #     self.glossaries.append("__URL"+str(i)+"__")
    #     #self.glossaries.append("__NUM"+str(i)+"__")
    #     self.glossaries.append("__EMAIL"+str(i)+"__")
    #
    self.cache = {}
    # added by revo
    self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
    # url address:
    self.__url_addr = Regex(
        r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
    )
def save_regex(self):
    regex = Regex(self.regex_input.text())
    path, _ = QFileDialog.getSaveFileName(self)
    if path:
        file = open(path, 'w')
        file.write(regex.regex_str)
        file.close()
    else:
        return
class Detokenizer(object):
    """\
    A simple de-tokenizer class.
    """

    def __init__(self):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # compile regexes
        self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE)
        self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE)
        self._contract = Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )",
                               flags=UNICODE | IGNORECASE)
        self._dash_fixes = Regex(r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ",
                                 flags=UNICODE | IGNORECASE)
        self._dash_fixes2 = Regex(r" (non) - ([\p{Alpha}-]+) ",
                                  flags=UNICODE | IGNORECASE)

    def detokenize(self, text):
        """\
        Detokenize the given text.
        """
        replace_with_blank = ["somewhat rather", "sort of", "somewhat", "rather"]
        text = ' ' + text + ' '
        text = self._dash_fixes.sub(r' \1-\2 ', text)
        text = self._dash_fixes2.sub(r' \1-\2 ', text)
        text = self._currency_or_init_punct.sub(r' \1', text)
        text = self._noprespace_punct.sub(r'\1 ', text)
        text = self._contract.sub(r" \1'\2", text)
        text = text.strip()
        for word in replace_with_blank:
            text = text.replace(word, "")
        # capitalize
        if not text:
            return ''
        text = text[0].upper() + text[1:]
        return text
def do_regex(spam_text, data_resp):
    results = []
    for data in data_resp:
        text = data['text']
        result = Regex.match_string(text, spam_text)
        result['profile_img'] = data['profile_img']
        result['name'] = data['name']
        result['screen_name'] = data['screen_name']
        results.append(result)
    return results
def __init__(self):
    self.__author__ = "Revo"
    self.__date__ = "2017-12-28"
    #self.__date__ = "2017-10-24"
    # email address:
    self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
    # url address:
    self.__url_addr = Regex(
        r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
    )
    #self.__date_list = ["a.m","p.m","A.M","P.M"]
    # Numbers
    self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
    # Replace with add one
    self.__addone = Regex(r'(__(NUM|EMAIL|URL)__)')
    self.__addone_search = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
    # double space to single
    self.__spaces = Regex(r'\s+', flags=UNICODE)
    # self.__counter = dict({"URL": 0, "EMAIL": 0})
    # self.line = 0
def open_regex(self):
    path, _ = QFileDialog.getOpenFileName(self)
    if not path:
        # nothing selected: avoid touching an unopened file handle
        return
    file = open(path, 'r')
    string = file.read()
    file.close()
    try:
        regex = Regex(string)
        self.regex_input.setText(regex.regex_str)
    except SyntaxError as e:
        self.show_error(e)
def __init__(self, alpha, path):
    self.alpha = alpha
    self.rules = []
    with open(path, 'r') as f:
        for l in f.readlines():
            l = l.strip()
            if len(l) == 0:
                continue
            l = l.split('=>')
            rx = Regex(l[0].strip(), self.alpha)
            tag = l[1].strip()
            self.rules.append([rx, tag])
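# Hedged, standalone sketch of the "pattern => tag" rule format the loader above
# expects; here Python's `re` stands in for the project's alphabet-aware Regex
# class, and the rules and tokens are invented.
import re

RULES_TEXT = """
[0-9]+ => NUMBER
[A-Za-z]+ => WORD
"""

rules = []
for line in RULES_TEXT.strip().splitlines():
    pattern, tag = (part.strip() for part in line.split('=>'))
    rules.append((re.compile(pattern), tag))

for token in ('42', 'hello'):
    for rx, tag in rules:
        if rx.match(token):
            print(token, '->', tag)  # 42 -> NUMBER, hello -> WORD
            break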
def __init__(self, options={}):
    """\
    Constructor (pre-compile all needed regexes).
    """
    # process options
    self.moses_deescape = True if options.get('moses_deescape') else False
    self.language = options.get('language', 'en')
    self.capitalize_sents = True if options.get('capitalize_sents') else False
    # compile regexes
    self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
    self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
    self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F' +
                             r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F' +
                             r'\uFF65-\uFFDC]')
    self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
    # language-specific regexes
    self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
    self.__contract = None
    if self.language in self.CONTRACTIONS:
        self.__contract = Regex(self.CONTRACTIONS[self.language], IGNORECASE)
import requests
import unittest
import sys
import xlrd

sys.path.append("../utils")
sys.path.append("../data")
from regex import Regex

print(Regex.re(1, 1))


class LatentAPI(unittest.TestCase):
    strBaseurl = 'http://www.baidu.com'
    url1 = 'https://api.opifices.com/v1/specifications.json'
    url2 = 'https://api.opifices.com/oauth/token'
    data = {
        'username': '******',
        'password': '******',
        'client_id': '1000a0200d80800dc40d322db6747f09b36825c418141a8f02449fdf1003fb55',
        'client_secret': 'a567554533c4c6ce3c6c4f1fe9fb02a1fa4d4ab582b6626fc1d5d6b3cc24ec2c',
        'grant_type': 'password'
    }

    def SengMSG(self):
        a = requests.get(self.url1).text
        b = requests.post(self.url2, self.data, verify=False).text
        print(Regex.re(a, a))
        print(Regex.re(a, b))
def __init__(self, regex):
    Regex.__init__(self, regex)
    if "host" not in self._regexObj.groupindex:
        raise RegexException("No 'host' group in '%s'" % self._regex)
def isDoubleParamFunction(name):
    for now_regex in Regex.doubleParamFunctions():
        if now_regex.match(name) != None:
            return True
    return False
class Tokenizer(object):
    """\
    A simple tokenizer class, capable of tokenizing given strings.
    """

    # Moses special characters escaping
    ESCAPES = [('&', '&amp;'),  # must go first to prevent double escaping!
               ('|', '&bar;'),
               ('<', '&lt;'),
               ('>', '&gt;'),
               ('[', '&bra;'),
               (']', '&ket;')]

    def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.lowercase = True if options.get('lowercase') else False
        self.moses_escape = True if options.get('moses_escape') else False
        # compile regexes
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.__ascii_junk = Regex(r'[\000-\037]')
        self.__special_chars = \
            Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
        # single quotes: all unicode quotes + prime
        self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
        # double quotes: all unicode chars incl. Chinese + double prime + ditto
        self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
        self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
        self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
        self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
        # hyphen: separate every time but for unary minus
        self.__minus = Regex(r'([-−])')
        self.__pre_notnum = Regex(r'(-)([^\p{N}])')
        self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')

    def tokenize_factors(self, pretoks, factor_no=0):
        """\
        Further tokenize a list of factored tokens (separated by `|'),
        separating the given factor and copying the other factor to all
        its parts.
        """
        out = []
        for pretok in pretoks:
            factors = pretok.split('|')
            tokens = ['|'.join(factors[:factor_no] + [token] + factors[factor_no + 1:])
                      for token in self.tokenize(factors[factor_no]).split(' ')]
            out.extend(tokens)
        return out

    def tokenize_factored_text(self, factored_text, factor_no=0):
        """\
        Further tokenize pre-tokenized text composed of several factors
        (separated by `|'). Tokenize further the given factor and copy all
        other factors.
        """
        pretoks = self.__spaces.split(factored_text)
        return ' '.join(self.tokenize_factors(pretoks, factor_no))

    def tokenize(self, text):
        """\
        Tokenize the given text using current settings.
        """
        # pad with spaces so that regexes match everywhere
        text = ' ' + text + ' '
        # spaces to single space
        text = self.__spaces.sub(' ', text)
        # remove ASCII junk
        text = self.__ascii_junk.sub('', text)
        # separate punctuation (consecutive items of same type stay together)
        text = self.__special_chars.sub(r' \1 ', text)
        # separate dots and commas everywhere except in numbers
        text = self.__no_numbers.sub(r'\1 \2 \3', text)
        text = self.__pre_numbers.sub(r'\1 \2 \3', text)
        text = self.__post_numbers.sub(r'\1 \2 \3', text)
        # normalize quotes
        text = self.__to_single_quotes.sub('\'', text)
        text = self.__to_double_quotes.sub('"', text)
        # separate hyphen, minus
        text = self.__pre_notnum.sub(r'\1 \2', text)
        text = self.__post_num_or_nospace.sub(r'\1\2 ', text)
        text = self.__minus.sub(r' \1', text)
        # spaces to single space
        text = self.__spaces.sub(' ', text)
        text = text.strip()
        # escape chars that are special to Moses
        if self.moses_escape:
            for char, repl in self.ESCAPES:
                text = text.replace(char, repl)
        # lowercase
        if self.lowercase:
            text = text.lower()
        return text
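# Minimal usage sketch for the Tokenizer above (assumption: `Regex` and `UNICODE`
# come from the `regex` package, as in the other snippets here; the sample
# sentence is made up and no exact output is claimed).
if __name__ == '__main__':
    tok = Tokenizer({'lowercase': True, 'moses_escape': True})
    print(tok.tokenize(u'Hello, world! Prices rose by 1.5% (see [1]).'))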
def isVariable(token):
    special = Regex.special()
    if special.match(token) == None:
        return True
    else:
        return False
class SentenceSplitter(object):
    """\
    A simple sentence splitter class.
    """

    # TODO look at quote characters, CZ quotes possibly have wrong
    # Unicode classes!

    # sentence starters (possibly some starting punctuation) + upper-case char.
    SENT_STARTER = r'([\'\"\(\[\¿\¡\p{Pi}]* *[\p{Upper}\p{N}])'
    # sentence starters with compulsory punctuation
    SENT_STARTER_PUNCT = r'([\'\"\(\[\¿\¡\p{Pi}]+ *[\p{Upper}p{N}])'
    # final punctuation
    FINAL_PUNCT = r'[\'\"\)\]\p{Pf}\%]+'
    # non-breaking prefix directory
    NOBREAK_DIR = 'nonbreaking_prefixes'
    # non-breaking prefix file
    NOBREAK_FILE = 'nonbreaking_prefix.'

    def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # load no-break prefixes for the given language
        self.__load_nobreaks(options.get('language'),
                             options.get('nobreak_file'))
        # compile regexes
        self.__spaces = Regex(r'\s+')
        self.__space_at_end = Regex(r'(^|\n) ')
        self.__space_at_begin = Regex(r' ($|\n)')
        self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER)
        self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' +
                                self.SENT_STARTER)
        self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
        self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT + r')? *$')
        self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
        self.__numbers = Regex(r'^\p{N}')
        self.__sent_starter = Regex(self.SENT_STARTER)

    def split_sentences(self, text):
        """\
        Split sentences in the given text using current settings.
        """
        # clean
        text = self.__spaces.sub(r' ', text)
        text = self.__space_at_begin.sub(r'\1', text)
        text = self.__space_at_end.sub(r'\1', text)
        # break on special cases
        text = self.__non_period.sub(r'\1\n\2', text)
        text = self.__in_punct.sub(r'\1\n\2', text)
        text = self.__punct_follows.sub(r'\1\n\2', text)
        # break on periods
        words = text.split('. ')
        text = ''
        for word, next_word in zip(words[:-1], words[1:]):
            text += word + '.'
            match = self.__period.search(word)
            # check periods
            if match:
                prefix, end_punct = match.groups()
                # never break on no-break prefixes, upper case acronyms
                # and numeric no-breaks before numbers
                if (prefix in self.__nobreaks and not end_punct) or \
                        self.__ucase_acronym.search(prefix) or \
                        (prefix in self.__numeric_nobreaks and not end_punct and
                         self.__numbers.match(next_word)):
                    text += ' '
                # break before sentence starters
                elif self.__sent_starter.match(next_word):
                    text += "\n"
                # don't break otherwise
                else:
                    text += ' '
            # don't break when there's no period
            else:
                text += ' '
        # append last token (we stopped iterating just before it)
        text += words[-1]
        # return the result
        return text.split("\n")

    def __load_nobreaks(self, language=None, filename=None):
        """\
        Load non-breaking prefixes for the given language from a default
        location or from the given file.
        """
        # initialize sets of non-breaking prefixes
        self.__nobreaks = set()
        self.__numeric_nobreaks = set()
        # obtain file name from language specification
        if filename is None and language is not None:
            filename = os.path.dirname(__file__) + os.sep + \
                self.NOBREAK_DIR + os.sep + self.NOBREAK_FILE + language
        # try to load prefixes from file
        if filename and os.path.isfile(filename):
            fh = codecs.open(filename, 'r', 'UTF-8')
            for item in fh:
                item = item.strip()
                if item and not item.startswith('#'):
                    match = regex.match(r'^(.*)\s+#NUMERIC_ONLY#', item)
                    if match:
                        self.__numeric_nobreaks.add(match.group(1))
                    else:
                        self.__nobreaks.add(item)
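# Minimal usage sketch for the SentenceSplitter above (assumptions: `Regex` comes
# from the `regex` package, and `regex`, `os`, `codecs` are imported at module
# level; the sample text is made up). If no nonbreaking-prefix file is present
# for the language, __load_nobreaks simply leaves the prefix sets empty.
if __name__ == '__main__':
    splitter = SentenceSplitter({'language': 'en'})
    for sent in splitter.split_sentences('It was late. "Why?" she asked. Nobody answered.'):
        print(sent)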
class Inference:
    def __init__(self, numH, strings):
        self.hSpace_ = list()
        self.strings_ = strings
        self.baseH_ = Regex(strings)
        self.baseHProb_ = self.likelihood(self.baseH_)
        self.numH_ = numH
        self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])

    def addRegexes(self, reSet):
        # add set
        for re, prob in reSet:
            load = True
            for h, _ in self.hSpace_:
                if re.equalTo(h):
                    load = False
                    continue
            if load:
                self.hSpace_.append((re, prob))
        # remove extra hypotheses
        self.sortHypotheses()
        # self.cullHypotheses()

    def generateAll(self):
        print "Generating hypothesis for", len(self.baseH_.states_), "states."
        allRegexes = totalSet(list((s.ID_) for s in self.baseH_.states_.values()))
        print "Total number of regexes", len(allRegexes)
        for regexStates in allRegexes:
            newRegex = self.baseH_.copy()
            for a in regexStates:
                if len(a) == 1:
                    continue
                for b in a[1:]:
                    newRegex.mergeRandom(a[0], b)
            self.addRegexes([(newRegex, self.likelihood(newRegex))])

    def cullHypotheses(self):
        for a in range(len(self.hSpace_) - self.numH_):
            del self.hSpace_[-1]

    def sortHypotheses(self):
        # sort by descending probability
        self.hSpace_ = sorted(self.hSpace_, key=lambda array: -array[1])

    def likelihood(self, re):
        result = re.logPrior()
        for string in self.strings_:
            accept, LL = re.string(string)
            if not accept:
                print "Error, regex does not accept string", string
                re.printText()
                re.printGraph("output/inference/error.png")
                assert False
            result += LL
        return result

    def duplicateHypotheses(self, permute=False):
        newH = list()
        for i in range(self.numH_):
            newRe = self.baseH_.copy()
            newRe.permuteRegex()
            newH.append((newRe, self.likelihood(newRe)))
        while len(self.hSpace_) < 2 * self.numH_:
            for re, prob in self.hSpace_[:]:
                re2 = re.copy()
                if permute:
                    re2.permuteRegex()
                newH.append((re2, self.likelihood(re2)))
            self.addRegexes(newH)

    def testString(self, testString):
        totalProb = 0
        acceptProb = 0
        for h, prob in self.hSpace_:
            totalProb += exp(prob)
            accept, _ = h.string(testString)
            if accept:
                acceptProb += exp(prob)
        return acceptProb / totalProb

    def beamStep(self, re):
        newRegexes = list()
        # generate merged steps
        for stateID1 in list((s.ID_) for s in re.states_.values()):
            for stateID2 in list((s.ID_) for s in re.states_.values()):
                if stateID1 == stateID2:
                    continue
                newRe = re.copy()
                newRe.mergeRandom(stateID1, stateID2)
                newRegexes.append((newRe, self.likelihood(newRe)))
        # generate wildcard steps
        for stateID1 in list((s.ID_) for s in re.states_.values()):
            for wildcard in ['S', 'N', 'A']:
                for k, s in re.states_[stateID1].next_:
                    if keysOverlap(k, wildcard) and keyMinus(wildcard, k) != '':
                        newRe = re.copy()
                        newRe.wildcardize(stateID1, wildcard)
                        newRegexes.append((newRe, self.likelihood(newRe)))
                        # only replace one of the transitions for a wildcard
                        break
        return newRegexes

    def beamSearch(self):
        beam = [(self.baseH_, self.baseHProb_)]
        newBeam = list()
        i = 0
        while len(beam) > 0:
            print "beam iteration:", i, "hypotheses:", len(beam)
            i += 1
            # take step forward
            while len(beam) > 0:
                h, prob = beam.pop(0)
                newBeam.extend(self.beamStep(h))
            # exit if there is no more step
            if len(newBeam) == 0:
                return
            # copy best hypotheses to old beam
            newBeam = sorted(newBeam, key=lambda array: -array[1])
            while len(beam) < BEAM_SIZE and len(newBeam) > 0:
                re1, prob1 = newBeam.pop(0)
                # add = True
                # for re2, prob2 in beam:
                #     if re1.equalTo(re2):
                #         add = False
                #         break
                # if add:
                if True:
                    beam.append((re1, prob1))
            beam[0][0].printGraph("output/beam-iter-%d-1.png" % i)
            beam[1][0].printGraph("output/beam-iter-%d-2.png" % i)
            beam[2][0].printGraph("output/beam-iter-%d-3.png" % i)
            beam[3][0].printGraph("output/beam-iter-%d-4.png" % i)
            beam[4][0].printGraph("output/beam-iter-%d-5.png" % i)
            # add hypotheses in beam to hset, clear newbeam
            self.addRegexes(beam)
            self.addRegexes(newBeam)
            newBeam = list()
class Detokenizer(object):
    """Based on Ondrej Dusek's code"""

    # Moses special characters de-escaping
    ESCAPES = [('&bar;', '|'),
               ('&lt;', '<'),
               ('&gt;', '>'),
               ('&bra;', '['),
               ('&ket;', ']'),
               ('&amp;', '&')]  # should go last to prevent double de-escaping

    # Contractions for different languages
    CONTRACTIONS = {'en': r'^\p{Alpha}+(\'(ll|ve|re|[dsm])|n\'t)$',
                    'fr': r'^([cjtmnsdl]|qu)\'\p{Alpha}+$',
                    'es': r'^[dl]\'\p{Alpha}+$',
                    'it': r'^\p{Alpha}*(l\'\p{Alpha}+|[cv]\'è)$',
                    'cs': r'^\p{Alpha}+[-–](mail|li)$',
                    }

    def __init__(self, **options):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.moses_deescape = True if options.get('moses_deescape') else False
        self.language = options.get('language', 'en')
        self.capitalize_sents = True if options.get('capitalize_sents') else False
        # compile regexes
        self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
        self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
        self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F' +
                                 r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F' +
                                 r'\uFF65-\uFFDC]')
        self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
        # language-specific regexes
        self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
        self.__contract = None
        if self.language in self.CONTRACTIONS:
            self.__contract = Regex(self.CONTRACTIONS[self.language], IGNORECASE)

    def detokenize(self, text):
        """\
        Detokenize the given text using current settings.
        """
        # paste text back, omitting spaces where needed
        words = text.split(' ')
        text = ''
        pre_spc = ' '
        quote_count = {'\'': 0, '"': 0, '`': 0}
        for pos, word in enumerate(words):
            # remove spaces in between CJK chars
            if self.__cjk_chars.match(text[-1:]) and \
                    self.__cjk_chars.match(word[:1]):
                text += word
                pre_spc = ' '
            # no space after currency and initial punctuation
            elif self.__currency_or_init_punct.match(word):
                text += pre_spc + word
                pre_spc = ''
            # no space before commas etc. (exclude some punctuation for French)
            elif self.__noprespace_punct.match(word) and \
                    (self.language != 'fr' or not
                     self.__fr_prespace_punct.match(word)):
                text += word
                pre_spc = ' '
            # contractions with comma or hyphen
            elif word in "'-–" and pos > 0 and pos < len(words) - 1 \
                    and self.__contract is not None \
                    and self.__contract.match(''.join(words[pos - 1:pos + 2])):
                text += word
                pre_spc = ''
            # handle quoting
            elif word in '\'"„“”‚‘’`':
                # detect opening and closing quotes by counting
                # the appropriate quote types
                quote_type = word
                if quote_type in '„“”':
                    quote_type = '"'
                elif quote_type in '‚‘’':
                    quote_type = '\''
                # exceptions for true Unicode quotes in Czech & German
                if self.language in ['cs', 'de'] and word in '„‚':
                    quote_count[quote_type] = 0
                elif self.language in ['cs', 'de'] and word in '“‘':
                    quote_count[quote_type] = 1
                # special case: possessives in English ("Jones'" etc.)
                if self.language == 'en' and text.endswith('s'):
                    text += word
                    pre_spc = ' '
                # really a quotation mark
                else:
                    # opening quote
                    if quote_count[quote_type] % 2 == 0:
                        text += pre_spc + word
                        pre_spc = ''
                    # closing quote
                    else:
                        text += word
                        pre_spc = ' '
                    quote_count[quote_type] += 1
            # keep spaces around normal words
            else:
                text += pre_spc + word
                pre_spc = ' '
        # de-escape chars that are special to Moses
        if self.moses_deescape:
            for char, repl in self.ESCAPES:
                text = text.replace(char, repl)
        # strip leading/trailing space
        text = text.strip()
        # capitalize, if the sentence ends with a final punctuation
        if self.capitalize_sents and self.__final_punct.search(text):
            text = text[0].upper() + text[1:]
        return text
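# Minimal usage sketch for the Moses-style Detokenizer above (assumption: `Regex`
# and `IGNORECASE` come from the `regex` package; the sample tokens are made up).
if __name__ == '__main__':
    detok = Detokenizer(language='en', capitalize_sents=True, moses_deescape=True)
    # should come out roughly as: It costs $5, maybe more.
    print(detok.detokenize("it costs $ 5 , maybe more ."))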
def compile(self, grammar_type="regex"):
    """
    Compile according to the grammar type and produce a DFA.
    'regex' means a regular expression, 'regular' means a regular (type-3) grammar.
    :param grammar_type: grammar type
    :return:
    """
    if grammar_type == 'regex':
        nfas = []
        for le in self.lexs:
            # print le
            nfas.append(Regex.compile_nfa(le[1], extend=True, type=le[0]))
        nfa = NFA.combine(*nfas)
        self.lex_dfa = nfa.convert_dfa(copy_meta=["type"])
        return
    elif grammar_type == "regular":
        """
        Type-3 grammar parsing was not originally planned here; the parser
        module also does grammar parsing, so this should be merged with it.
        """
        nfas = []
        grammar = defaultdict(list)
        g_in, g_out = defaultdict(int), defaultdict(int)
        all_symbol = set()
        for l_hand, r_hand in self.lexs:
            l_hand = l_hand[1:-1]
            r_hands = [[x[1:-1] for x in r.strip().split()] for r in r_hand.split('|')]
            for hand in r_hands:
                for h in hand:
                    g_in[h] += 1
                    all_symbol.add(h)
            g_out[l_hand] += 1
            all_symbol.add(l_hand)
            grammar[l_hand].extend(r_hands)
        grammar['limit'] = [[' '], ['\t'], ['\n']]
        ter, not_ter = [], []
        for sym in all_symbol:
            if g_in[sym] == 0:
                not_ter.append(sym)
            if g_out[sym] == 0:
                ter.append(sym)
        # print ter, not_ter
        nfas = []
        for token_type in not_ter:
            nfa = NFA()
            nfa.start = NFANode(r_name=token_type)
            end_node = NFANode(type=token_type)
            end_node.end = True
            nfa.end = {end_node}
            vis = {token_type: nfa.start}

            def get_node(name):
                if name in vis:
                    return vis[name]
                vis[name] = NFANode(r_name=name)
                return vis[name]

            que = Queue()
            que.put(token_type)
            while not que.empty():
                t = que.get()
                node = get_node(t)
                if node.meta.get('vis', 0) > 0:
                    continue
                node.meta['vis'] = node.meta.get('vis', 0) + 1
                for r_hand in grammar[t]:
                    node.next.setdefault(r_hand[0], set())
                    if len(r_hand) == 2:
                        node.next[r_hand[0]].add(get_node(r_hand[1]))
                        que.put(r_hand[1])
                    else:
                        node.next[r_hand[0]].add(end_node)
            nfas.append(nfa)
        nfa = NFA.combine(*nfas)
        self.lex_dfa = nfa.convert_dfa(copy_meta=["type"])
        return