Example #1
 def __init__(self, region, lines, rule, commands, source):
     Geometry.__init__(self, region, lines)
     # Subscribe.__init__(self, 'topic')
     Regex.__init__(self, rule)
     Database.__init__(self)
     self.commands = commands
     self.source_id = source
Example #2
 def __init__(self, numH, strings):
     self.hSpace_ = list()
     self.strings_ = strings
     self.baseH_ = Regex(strings)
     self.baseHProb_ = self.likelihood(self.baseH_)
     self.numH_ = numH
     self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])
Example #3
def main():
    ###### testing code snippets (leftover from development) ######

    re = Regex.compile('(.)\\1')
    re.display()
    assert re.match('AA')
    assert not re.match('AB')
    print "===================================="

    re = Regex.compile('AA')
    re.display()
    assert not re.match('A')
    assert re.match('AA')
    assert not re.match('AAAA')
    print "===================================="

    re = Regex.compile('(O|RHH|MM)*')
    re.display()
    assert re.match('')
    assert re.match('OOOO')
    assert re.match('MMORHHO')
    assert not re.match('MMORHHH')
    assert re.match('ORHH')
    print "===================================="

    re = Regex.compile('((A)\\2)\\1')
    re.display()
    assert re.match('AAAA')

    return 0
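Judging by the assertions above, this project's Regex.match only accepts a string when the whole string matches (for instance, the pattern 'AA' does not match 'AAAA'). A hedged comparison using the standard library's re.fullmatch rather than the project's own engine:

import re

pattern = re.compile(r'(O|RHH|MM)*')
assert pattern.fullmatch('MMORHHO')
assert not pattern.fullmatch('MMORHHH')
assert pattern.fullmatch('')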
Example #4
class Detokenizer(object):
    """\
    A simple de-tokenizer class.
    """

    def __init__(self):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # compile regexes
        self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE)
        self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE)
        self._contract = Regex(r" (\p{Alpha}+) ' ?(ll|ve|re|[dsmt])(?= )", flags=UNICODE|IGNORECASE)
        self._fixes = Regex(r" (do|go[nt]|wan) (n't|ta|na)(?= )", flags=UNICODE|IGNORECASE)
        self._replace_table = {' i ':' I ',
                               ' im ': ' I\'m ',
                               ' dont ': ' don\'t '}

    def detokenize(self, text):
        """\
        Detokenize the given text.
        """
        text = ' ' + text + ' '
        text = self._currency_or_init_punct.sub(r' \1', text)
        text = self._noprespace_punct.sub(r'\1 ', text)
        text = self._contract.sub(r" \1'\2", text)
        text = self._fixes.sub(r' \1\2', text)
        for tok, repl in self._replace_table.iteritems():
            text = text.replace(tok, repl)
        text = text.strip()
        # capitalize
        if not text:
            return ''
        text = text[0].upper() + text[1:]
        return text
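A minimal usage sketch for the class above, assuming Regex, UNICODE and IGNORECASE come from the third-party regex package (e.g. from regex import Regex, UNICODE, IGNORECASE) and noting that dict.iteritems() makes this a Python 2 snippet; the sample sentence is made up:

detok = Detokenizer()
print(detok.detokenize("hi , i do n't know ."))
# roughly: Hi, I don't know.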
Example #5
def main():
    x_regexes = [
        '.*H.*H.*',
        '(DI|NS|TH|OM)*',
        'F.*[AO].*[AO].*',
        '(O|RHH|MM)*',
        '.*',
        'C*MC(CCC|MM)*',
        '[^C]*[^R]*III.*',
        '(...?)\\1*',
        '([^X]|XCC)*',
        '(RR|HHH)*.?',
        'N.*X.X.X.*E',
        'R*D*M*',
        '.(C|HH)*',
    ]

    y_regexes = [
        '(ND|ET|IN)[^X]*',
        '[CHMNOR]*I[CHMNOR]*',
        'P+(..)\\1.*',
        '(E|CR|MN)*',
        '([^MC]|MM|CC)*',
        '[AM]*CM(RC)*R?',
        '.*',
        '.*PRR.*DDC.*',
        '(HHX|[^HX])*',
        '([^EMC]|EM)*',
        '.*OXR.*',
        '.*LR.*RL.*',
        '.*SE.*UE.*',
    ]

    # start with x = 0, y = max
    z_regexes = [
        '.*G.*V.*H.*',
        '[CR]*',
        '.*XEXM*',
        '.*DD.*CCM.*',
        '.*XHCR.*X.*',
        '.*(.)(.)(.)(.)\\4\\3\\2\\1.*',
        '.*(IN|SE|HI)',
        '[^C]*MMM[^C]*',
        '.*(.)C\\1X\\1.*',
        '[CEIMU]*OH[AEMOR]*',
        '(RX|[^R])*',
        '[^M]*M[^M]*',
        '(S|MM|HHH)*',
    ]

    n = 7
    x_regexes = [Regex.compile(i) for i in x_regexes]
    y_regexes = [Regex.compile(i) for i in y_regexes]
    z_regexes = [Regex.compile(i) for i in z_regexes]

    arr = RegexCrossword.solve(n, x_regexes, y_regexes, z_regexes)
    display_hexagon(arr)

    return 0
Example #6
	def __init__(self, value):
		
		regex = value.replace("<HOST>", "(?:::f{4,6}:)?(?P<host>\S+)")
		
		Regex.__init__(self, regex)
		
		if "host" not in self._regexObj.groupindex:
			raise RegexException("No 'host' group in '%s'" % self._regex)
Example #7
	def isNumber(token):
		e  =Regex.e()
		pi =Regex.pi()
		num=Regex.number()
		if e.match(token)==None and pi.match(token)==None and num.match(token)==None:
			return False
		else:
			return True
Example #8
 def __init__(self, id: int, m: Regex) -> None:
     super().__init__()
     self._attrs = ["mstart", "mend", "id"]
     self.key = "R{}".format(id)
     self.id = id
     self.match = m
     self.mstart = m.span(self.key)[0]
     self.mend = m.span(self.key)[1]
     self._text = m.group(self.key)
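For context, a hedged sketch of the named-group span()/group() calls the constructor above relies on, using the third-party regex module directly (the group name R0 is illustrative only):

import regex

m = regex.match(r"(?P<R0>\w+) mentions", "alice mentions bob")
key = "R0"
print(m.span(key), m.group(key))  # (0, 5) alice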
Example #9
 def isNumber(token):
     e = Regex.e()
     pi = Regex.pi()
     num = Regex.number()
     if e.match(token) == None and pi.match(token) == None and num.match(
             token) == None:
         return False
     else:
         return True
Example #10
 def __init__(self, id: int, m: Regex) -> None:
     super().__init__()
     self._attrs = ['mstart', 'mend', 'id']
     self.key = 'R{}'.format(id)
     self.id = id
     self.match = m
     self.mstart = m.span(self.key)[0]
     self.mend = m.span(self.key)[1]
     self._text = m.group(self.key)
Example #11
 def __init__(self, is_training=False):
     self.classifier = None
     self.feature_model = None
     self.regex_rule = Regex()
     if not is_training:
         self.classifier = utils.load(
             os.path.join('vnspliter/model', 'model.pkl'))
         if self.classifier is None:
             print "Unable to load model!"
             exit(-1)
Example #12
    def regex_to_fa(self):
        regex_str = self.regex_input.text()
        try:
            self.fa = Regex(regex_str).dfa
        except SyntaxError as e:
            self.show_error(e)
            return

        self.fa.regex_str = regex_str
        self.add_fa_to_list()
Example #13
	def determine(token):
		rg_e  =Regex.e()
		rg_pi =Regex.pi()
		rg_num=Regex.number()
		if rg_e.match(token)!=None:
			return Number(e)
		elif rg_pi.match(token)!=None:
			return Number(pi)
		else:
			return Number(token)
Example #14
 def determine(token):
     rg_e = Regex.e()
     rg_pi = Regex.pi()
     rg_num = Regex.number()
     if rg_e.match(token) != None:
         return Number(e)
     elif rg_pi.match(token) != None:
         return Number(pi)
     else:
         return Number(token)
Example #15
	def determine(name, param=None, base=None, exponential=None):
		sin=Regex.sin()
		cos=Regex.cos()
		tan=Regex.tan()
		csc=Regex.csc()
		sec=Regex.sec()
		cot=Regex.cot()
		exp=Regex.exp()
		pow=Regex.pow()
		log=Regex.log()

		if sin.match(name)!=None:
			return Sin(param)
		elif cos.match(name)!=None:
			return Cos(param)
		elif tan.match(name)!=None:
			return Tan(param)
		elif csc.match(name)!=None:
			return Csc(param)
		elif sec.match(name)!=None:
			return Sec(param)
		elif cot.match(name)!=None:
			return Cot(param)
		elif exp.match(name)!=None:
			return Exp(param)
		elif pow.match(name)!=None:
			return Pow(base, exponential)
		elif log.match(name)!=None:
			return Log(base, exponential)
		else:
			return None
Example #16
 def to_regex(self):
     """
     Returns a regex approximation
     Args:
         None
     Returns:
         str: A regex approximation
     """
     from regex import Regex
     converter = Regex(self)
     return converter.get_regex()
Example #17
    def add_starred_from_converters(self, _from1, _to1, functional_object, converters):
        other_things = [(f, t, functional_object2) for f, t, functional_object2 in converters]
        for _from2, _to2, functional_object2 in flatten_optional_list_triple(other_things):
            if "*" in _to2:

                other_things_regex = Regex("^" + _from2.replace("*", r"(\w+)") + "$")
                m = other_things_regex.match(_to1)

                if m:
                    new_to = _to2.replace("*", m.group(1))

                    new_from = _to2.replace("*", m.group(1))
                    self.add_edge(_to1, new_from, functional_object2)
Example #18
class Replacer(object):
    def __init__(self):
        self.__author__ = "Revo"
        self.__date__ = "2017-10-27"
        # email address:
        self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
        # url address:
        self.__url_addr = Regex(
            r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        )
        # Numbers
        self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
        # Replace with add one
        self.__addone = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
        # double space to single
        self.__spaces = Regex(r'\s+', flags=UNICODE)

        self.line = 0

    def process(self, text, ori_line):
        #print text
        self.line += 1
        list_tags = self.__addone.findall(text)
        if list_tags:
            #print list_tags
            print "LINE:", self.line
            print "IN,", text
            print "ORI,", ori_line
            email_list = self.__email_addr.findall(ori_line)
            num_list = self.__numbers.findall(ori_line)
            url_list = self.__url_addr.findall(ori_line)
            print "EMAIL,", email_list
            print "NUM,", num_list
            print "URL,", url_list
            for match in list_tags:
                try:
                    if match[1] == "URL":
                        text = text.replace(match[0],
                                            url_list[int(match[2]) - 1][0])
                    elif match[1] == "EMAIL":
                        text = text.replace(match[0],
                                            email_list[int(match[2]) - 1])
                    elif match[1] == "NUM":
                        # eight->problem
                        text = text.replace(match[0],
                                            num_list[int(match[2]) - 1])
                except BaseException:
                    print "F****D"
                    pass
            print "REPLACED:", text
            print "-----"
Example #19
 def __init__(self, db, app):
     self.wnioski = Wnioski(db)
     self.db = db
     self.app = app
     self.regex = Regex()
     if self.db.session.query(TassDB).all() == []:
         print('database empty, loading data')
         self.inicjuj_baze()
         print('data loaded')
         print('extracting locations')
         self._czysc_lokalizacje()
         self.regexuj_lokalizacje()
         print('database ready')
     else:
         print('the database has already been created')
         print('to reload it, delete the database file serwer/TASS.db')
Example #20
def afd_minimo(archivo_regex, archivo_automata):
    regex = Regex.crear_desde_archivo(archivo_regex)

    automata = regex.automata()
    automata.determinizar()
    automata.minimizar()
    automata.escribir_archivo(archivo_automata)
Example #21
    def add_starred(self, _from1, _to1, functional_object, converters):

        if _from1 == None:
            _from1 = OUT_OF_THE_BOX

        if "*" in _from1:

            other_things = [(f, t) for f, t, o in converters]
            new_things_regex = Regex("^" + _from1.replace("*", r"(\w+)") + "$")

            for _from2, _to2 in flatten_optional_list_pair(other_things):
                m = new_things_regex.match(_to2)
                if m:
                    new_from = _to1.replace("*", m.group(1))
                    self.add_edge(_to2, new_from, functional_object)
                    self.add_starred_from_converters(_to2, new_from, functional_object, converters)
Example #22
    def _greedy_split(self, input: str, re: regex.Regex) -> List[str]:
        """
        Splits an input string greedily from a list of prefixes. Stops when no
        more matches are found.

        Args:
            input (str): input string
            re (regex.Regex): Prefix match object

        Returns:
            (list) of prefixes

        Raises:
            (KrakenEncodeException) if no prefix match is found for some part
            of the string.
        """
        r = []  # type: List[str]
        idx = 0
        while True:
            mo = re.match(input, idx)
            if mo is None or idx == len(input):
                if len(input) > idx:
                    raise KrakenEncodeException('No prefix matches for input after {}'.format(idx))
                return r
            r.append(mo.group())
            idx = mo.end()
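A self-contained sketch of the same greedy prefix splitting, assuming an ordinary compiled pattern from the third-party regex module (the prefix alternation here is made up); the stdlib re module would behave the same way:

import regex

prefixes = regex.compile(r'foo|bar|ba')

def greedy_split(s, re=prefixes):
    out, idx = [], 0
    while idx < len(s):
        mo = re.match(s, idx)  # match only at position idx
        if mo is None:
            raise ValueError('no prefix matches input after {}'.format(idx))
        out.append(mo.group())
        idx = mo.end()
    return out

print(greedy_split('foobarba'))  # ['foo', 'bar', 'ba']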
Example #23
def refang_text2(txt: str, re: regex.Regex = re_fang, fangs: dict = FANGS):
    '''
    Remove address de-fanging in text blobs, e.g. example[.]com to example.com

    Notes:
        Matches to keys in FANGS is case-insensitive, but replacement will
        always be with the lowercase version of the re-fanged value.
        For example, ``HXXP://FOO.COM`` will be returned as ``http://FOO.COM``

    Args:
        txt (str): The text to re-fang.

    Returns:
        tuple(str, dict): A tuple containing the new text, and a dictionary
        containing offset information where the new text was altered with
        respect to the original text.
    '''
    # The _consumed key is an offset used to track how many chars have been
    # consumed while the cb is called. This is because the match group
    # span values are based on their original string locations, and will not
    # produce values which can be cleanly mapped backwards.
    offsets = {'_consumed': 0}
    cb = functools.partial(_refang2_func, offsets=offsets, fangs=fangs)
    # Start applying FANGs and modifying the info to match the output
    ret = re.sub(cb, txt)

    # Remove the _consumed key since it is no longer needed.
    offsets.pop('_consumed')
    return ret, offsets
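A hedged sketch of the sub-with-callback pattern used above, with a made-up fang table standing in for the real FANGS mapping and _refang2_func from the source project:

import functools
import regex

fangs = {'hxxp': 'http', '[.]': '.'}
pat = regex.compile('|'.join(regex.escape(k) for k in fangs), regex.IGNORECASE)

def _repl(match, fangs):
    # look up the lower-cased matched fang and return its replacement
    return fangs[match.group(0).lower()]

cb = functools.partial(_repl, fangs=fangs)
print(pat.sub(cb, 'HXXP://example[.]com'))  # http://example.com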
Example #24
    def _greedy_split(self, input: str, re: regex.Regex) -> List[str]:
        """
        Splits an input string greedily from a list of prefixes. Stops when no
        more matches are found.

        Args:
            input (str): input string
            re (regex.Regex): Prefix match object

        Returns:
            (list) of prefixes

        Raises:
            (KrakenEncodeException) if no prefix match is found for some part
            of the string.
        """
        r = []  # type: List[str]
        idx = 0
        while True:
            mo = re.match(input, idx)
            if mo is None or idx == len(input):
                if len(input) > idx:
                    raise KrakenEncodeException(
                        'No prefix matches for input after {}'.format(idx))
                return r
            r.append(mo.group())
            idx = mo.end()
Example #25
	def __init__(self, numH, strings):
		self.hSpace_ = list()
		self.strings_ = strings
		self.baseH_ = Regex(strings)
		self.baseHProb_ = self.likelihood(self.baseH_)
		self.numH_ = numH
		self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])
Example #26
 def __init__(self):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # compile regexes
     self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ',
                                          flags=UNICODE)
     self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ',
                                    flags=UNICODE)
     self._contract = Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )",
                            flags=UNICODE | IGNORECASE)
     self._dash_fixes = Regex(
         r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ",
         flags=UNICODE | IGNORECASE)
     self._dash_fixes2 = Regex(r" (non) - ([\p{Alpha}-]+) ",
                               flags=UNICODE | IGNORECASE)
Example #27
    def __init__(self, codes, separator='@@', vocab=None, glossaries=None):

        # check version information
        #codes = codecs.open(codes,"r", encoding='utf-8')
        firstline = codes.readline()
        if firstline.startswith('#version:'):
            self.version = tuple([
                int(x) for x in re.sub(r'(\.0+)*$', '',
                                       firstline.split()[-1]).split(".")
            ])
        else:
            self.version = (0, 1)
            codes.seek(0)

        self.bpe_codes = [tuple(item.split()) for item in codes]

        # some hacking to deal with duplicates (only consider first instance)
        self.bpe_codes = dict([
            (code, i)
            for (i, code) in reversed(list(enumerate(self.bpe_codes)))
        ])

        self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair)
                                       for pair, i in self.bpe_codes.items()])

        self.separator = separator

        self.vocab = vocab

        #self.glossaries = glossaries if glossaries else []
        self.glossaries = []
        # for i in xrange(30):
        #     self.glossaries.append("__URL"+str(i)+"__")
        #     #self.glossaries.append("__NUM"+str(i)+"__")
        #     self.glossaries.append("__EMAIL"+str(i)+"__")
        #
        self.cache = {}
        # added by revo
        self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
        # url address:
        self.__url_addr = Regex(
            r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        )
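A small self-contained sketch of the '#version:' header parsing done at the top of the constructor above (the codes-file line is made up):

import re

firstline = '#version: 0.2'
version = tuple(int(x) for x in re.sub(r'(\.0+)*$', '', firstline.split()[-1]).split('.'))
print(version)  # (0, 2)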
Example #28
 def __init__(self):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # compile regexes
     self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ', flags=UNICODE)
     self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ', flags=UNICODE)
     self._contract = Regex(r" (\p{Alpha}+) ' ?(ll|ve|re|[dsmt])(?= )", flags=UNICODE|IGNORECASE)
     self._fixes = Regex(r" (do|go[nt]|wan) (n't|ta|na)(?= )", flags=UNICODE|IGNORECASE)
     self._replace_table = {' i ':' I ',
                            ' im ': ' I\'m ',
                            ' dont ': ' don\'t '}
Example #29
    def save_regex(self):
        regex = Regex(self.regex_input.text())

        path, _ = QFileDialog.getSaveFileName(self)
        if path:
            file = open(path, 'w')
            file.write(regex.regex_str)
            file.close()
        else:
            return
Example #30
class Detokenizer(object):
    """\
    A simple de-tokenizer class.
    """
    def __init__(self):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # compile regexes
        self._currency_or_init_punct = Regex(r' ([\p{Sc}\(\[\{\¿\¡]+) ',
                                             flags=UNICODE)
        self._noprespace_punct = Regex(r' ([\,\.\?\!\:\;\\\%\}\]\)]+) ',
                                       flags=UNICODE)
        self._contract = Regex(r" (\p{Alpha}+) ' (ll|ve|re|[dsmt])(?= )",
                               flags=UNICODE | IGNORECASE)
        self._dash_fixes = Regex(
            r" (\p{Alpha}+|£ [0-9]+) - (priced|star|friendly|(?:£ )?[0-9]+) ",
            flags=UNICODE | IGNORECASE)
        self._dash_fixes2 = Regex(r" (non) - ([\p{Alpha}-]+) ",
                                  flags=UNICODE | IGNORECASE)

    def detokenize(self, text):
        """\
        Detokenize the given text.
        """
        replace_with_blank = [
            "somewhat rather", "sort of", "somewhat", "rather"
        ]
        text = ' ' + text + ' '
        text = self._dash_fixes.sub(r' \1-\2 ', text)
        text = self._dash_fixes2.sub(r' \1-\2 ', text)
        text = self._currency_or_init_punct.sub(r' \1', text)
        text = self._noprespace_punct.sub(r'\1 ', text)
        text = self._contract.sub(r" \1'\2", text)
        text = text.strip()
        for word in replace_with_blank:
            text = text.replace(word, "")
        # capitalize
        if not text:
            return ''
        text = text[0].upper() + text[1:]
        return text
Example #31
 def __init__(self, options={}):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # load no-break prefixes for the given language
     self.__load_nobreaks(options.get('language'),
                          options.get('nobreak_file'))
     # compile regexes
     self.__spaces = Regex(r'\s+')
     self.__space_at_end = Regex(r'(^|\n) ')
     self.__space_at_begin = Regex(r' ($|\n)')
     self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER)
     self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' +
                             self.SENT_STARTER)
     self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
     self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT +
                           r')? *$')
     self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
     self.__numbers = Regex(r'^\p{N}')
     self.__sent_starter = Regex(self.SENT_STARTER)
Example #32
    def __init__(self):
        self.__author__ = "Revo"
        self.__date__ = "2017-10-27"
        # email address:
        self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
        # url address:
        self.__url_addr = Regex(
            r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
        )
        # Numbers
        self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
        # Replace with add one
        self.__addone = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
        # double space to single
        self.__spaces = Regex(r'\s+', flags=UNICODE)

        self.line = 0
Example #33
    def do_regex(spam_text, data_resp):
        results = []

        for data in data_resp:
            text = data['text']
            result = Regex.match_string(text, spam_text)
            result['profile_img'] = data['profile_img']
            result['name'] = data['name']
            result['screen_name'] = data['screen_name']
            results.append(result)

        return results
Example #34
 def __init__(self):
     self.__author__ = "Revo"
     self.__date__ = "2017-12-28"
     #self.__date__ = "2017-10-24"
     # email address:
     self.__email_addr = Regex(r'([\w\.-]+@[\w\.-]+)')
     # url address:
     self.__url_addr = Regex(
         r'(?P<url>https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)|[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
     )
     #self.__date_list = ["a.m","p.m","A.M","P.M"]
     # Numbers
     self.__numbers = Regex(r'([+\-]?\d*[\.,]?\d+[\d\.,+\-eE]*)')
     # Replace with add one
     self.__addone = Regex(r'(__(NUM|EMAIL|URL)__)')
     self.__addone_search = Regex(r'(__(NUM|EMAIL|URL)(\d+)__)')
     # double space to single
     self.__spaces = Regex(r'\s+', flags=UNICODE)
     #
     self.__counter = dict({"URL": 0, "EMAIL": 0})
     #
     self.line = 0
Example #35
 def open_regex(self):
     path, _ = QFileDialog.getOpenFileName(self)
     string = ""
     if path:
         file = open(path, 'r')
         string = file.read()
         try:
             regex = Regex(string)
             self.regex_input.setText(regex.regex_str)
         except SyntaxError as e:
             self.show_error(e)
             return
         file.close()
Example #36
    def __init__(self, alpha, path):
        self.alpha = alpha
        self.rules = []

        with open(path, 'r') as f:
            for l in f.readlines():
                l = l.strip()
                if len(l) == 0:
                    continue
                l = l.split('=>')
                rx = Regex(l[0].strip(), self.alpha)
                tag = l[1].strip()
                self.rules.append([rx, tag])
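For reference, the constructor above reads a rules file whose non-empty lines look like 'pattern => TAG'. A hedged sketch of that parse with the project's Regex(pattern, alpha) class swapped for the third-party regex module, purely for illustration:

import regex

lines = [r'\d+ => NUMBER', r'[A-Za-z]+ => WORD']  # hypothetical rule lines
rules = []
for l in lines:
    l = l.strip()
    if not l:
        continue
    pattern, tag = l.split('=>')
    rules.append([regex.compile(pattern.strip()), tag.strip()])
print([(rx.pattern, tag) for rx, tag in rules])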
Example #37
 def __init__(self, options={}):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # process options
     self.moses_deescape = True if options.get('moses_deescape') else False
     self.language = options.get('language', 'en')
     self.capitalize_sents = True if options.get(
         'capitalize_sents') else False
     # compile regexes
     self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
     self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
     self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F' +
                              r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F' +
                              r'\uFF65-\uFFDC]')
     self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
     # language-specific regexes
     self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
     self.__contract = None
     if self.language in self.CONTRACTIONS:
         self.__contract = Regex(self.CONTRACTIONS[self.language],
                                 IGNORECASE)
Example #38
 def __init__(self, **options):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # process options
     self.moses_deescape = True if options.get('moses_deescape') else False
     self.language = options.get('language', 'en')
     self.capitalize_sents = True if options.get('capitalize_sents') else False
     # compile regexes
     self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
     self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
     self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                              + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                              + r'\uFF65-\uFFDC]')
     self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
     # language-specific regexes
     self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
     self.__contract = None
     if self.language in self.CONTRACTIONS:
         self.__contract = Regex(self.CONTRACTIONS[self.language],
                                 IGNORECASE)
Example #39
import requests
import unittest
import sys

import xlrd
sys.path.append("../utils")
sys.path.append("../data")
from regex import Regex


print(Regex.re(1, 1))

class LatentAPI(unittest.TestCase):
    strBaseurl = 'http://www.baidu.com'
    url1 = 'https://api.opifices.com/v1/specifications.json'
    url2 = 'https://api.opifices.com/oauth/token'
    data = {
        'username': '******',
        'password': '******',
        'client_id': '1000a0200d80800dc40d322db6747f09b36825c418141a8f02449fdf1003fb55',
        'client_secret': 'a567554533c4c6ce3c6c4f1fe9fb02a1fa4d4ab582b6626fc1d5d6b3cc24ec2c',
        'grant_type': 'password'
    }

    def SengMSG(self):

        a = requests.get(self.url1).text
        b = requests.post(self.url2, self.data, verify=False).text
        print(Regex.re(a, a))
        print(Regex.re(a, b))
Example #40
    def SengMSG(self):

        a = requests.get(self.url1).text
        b = requests.post(self.url2, self.data, verify=False).text
        print(Regex.re(a, a))
        print(Regex.re(a, b))
Example #41
    def __init__(self, regex):

        Regex.__init__(self, regex)

        if "host" not in self._regexObj.groupindex:
            raise RegexException("No 'host' group in '%s'" % self._regex)
Example #42
 def __init__(self, options={}):
     """\
     Constructor (pre-compile all needed regexes).
     """
     # process options
     self.lowercase = True if options.get('lowercase') else False
     self.moses_escape = True if options.get('moses_escape') else False
     # compile regexes
     self.__spaces = Regex(r'\s+', flags=UNICODE)
     self.__ascii_junk = Regex(r'[\000-\037]')
     self.__special_chars = \
             Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
     # single quotes: all unicode quotes + prime
     self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
     # double quotes: all unicode chars incl. Chinese + double prime + ditto
     self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
     self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
     self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
     self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
     # hyphen: separate every time but for unary minus
     self.__minus = Regex(r'([-−])')
     self.__pre_notnum = Regex(r'(-)([^\p{N}])')
     self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')
Example #43
	def isDoubleParamFunction(name):
		for now_regex in Regex.doubleParamFunctions():
			if now_regex.match(name)!=None:
				return True
		return False
Example #44
class Tokenizer(object):
    """\
    A simple tokenizer class, capable of tokenizing given strings.
    """

    # Moses special characters escaping
    ESCAPES = [('&', '&amp;'), # must go first to prevent double escaping!
               ('|', '&bar;'),
               ('<', '&lt;'),
               ('>', '&gt;'),
               ('[', '&bra;'),
               (']', '&ket;')]

    def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.lowercase = True if options.get('lowercase') else False
        self.moses_escape = True if options.get('moses_escape') else False
        # compile regexes
        self.__spaces = Regex(r'\s+', flags=UNICODE)
        self.__ascii_junk = Regex(r'[\000-\037]')
        self.__special_chars = \
                Regex(r'(([^\p{IsAlnum}\s\.\,−\-])\2*)')
        # single quotes: all unicode quotes + prime
        self.__to_single_quotes = Regex(r'[`‛‚‘’‹›′]')
        # double quotes: all unicode chars incl. Chinese + double prime + ditto
        self.__to_double_quotes = Regex(r'(\'\'|``|[«»„‟“”″〃「」『』〝〞〟])')
        self.__no_numbers = Regex(r'([^\p{N}])([,.])([^\p{N}])')
        self.__pre_numbers = Regex(r'([^\p{N}])([,.])([\p{N}])')
        self.__post_numbers = Regex(r'([\p{N}])([,.])([^\p{N}])')
        # hyphen: separate every time but for unary minus
        self.__minus = Regex(r'([-−])')
        self.__pre_notnum = Regex(r'(-)([^\p{N}])')
        self.__post_num_or_nospace = Regex(r'(\p{N} *|[^ ])(-)')


    def tokenize_factors(self, pretoks, factor_no=0):
        """\
        Further tokenize a list of factored tokens (separated by `|'), 
        separating the given factor and copying the other factor to all its
        parts.    
        """
        out = []
        for pretok in pretoks:
            factors = pretok.split('|')
            tokens = ['|'.join(factors[:factor_no] + [token] +
                               factors[factor_no + 1:])
                      for token in
                      self.tokenize(factors[factor_no]).split(' ')]
            out.extend(tokens)
        return out

    def tokenize_factored_text(self, factored_text, factor_no=0):
        """\
        Further tokenize pre-tokenized text composed of several factors
        (separated by `|'). Tokenize further the given factor and copy all
        other factors.
        """
        pretoks = self.__spaces.split(factored_text)
        return ' '.join(self.tokenize_factors(pretoks, factor_no))

    def tokenize(self, text):
        """\
        Tokenize the given text using current settings.
        """
        # pad with spaces so that regexes match everywhere
        text = ' ' + text + ' '
        # spaces to single space
        text = self.__spaces.sub(' ', text)
        # remove ASCII junk
        text = self.__ascii_junk.sub('', text)
        # separate punctuation (consecutive items of same type stay together)
        text = self.__special_chars.sub(r' \1 ', text)
        # separate dots and commas everywhere except in numbers
        text = self.__no_numbers.sub(r'\1 \2 \3', text)
        text = self.__pre_numbers.sub(r'\1 \2 \3', text)
        text = self.__post_numbers.sub(r'\1 \2 \3', text)
        # normalize quotes
        text = self.__to_single_quotes.sub('\'', text)
        text = self.__to_double_quotes.sub('"', text)
        # separate hyphen, minus
        text = self.__pre_notnum.sub(r'\1 \2', text)
        text = self.__post_num_or_nospace.sub(r'\1\2 ', text)
        text = self.__minus.sub(r' \1', text)
        # spaces to single space
        text = self.__spaces.sub(' ', text)
        text = text.strip()
        # escape chars that are special to Moses
        if self.moses_escape:
            for char, repl in self.ESCAPES:
                text = text.replace(char, repl)
        # lowercase
        if self.lowercase:
            text = text.lower()
        return text
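A minimal usage sketch for the tokenizer above, assuming Regex and UNICODE come from the third-party regex package (e.g. from regex import Regex, UNICODE); the input sentence is made up:

tok = Tokenizer({'moses_escape': True})
print(tok.tokenize('They paid $5.50 (incl. tax) - really!'))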
Example #45
	def isVariable(token):
		special=Regex.special()
		if special.match(token)==None:
			return True
		else:
			return False
Example #46
class SentenceSplitter(object):
    """\
    A simple sentence splitter class.
    """

    # TODO look at quote characters, CZ quotes possibly have wrong
    # Unicode classes!

    # sentence starters (possibly some starting punctuation) + upper-case char.
    SENT_STARTER = r'([\'\"\(\[\¿\¡\p{Pi}]* *[\p{Upper}\p{N}])'
    # sentence starters with compulsory punctuation
    SENT_STARTER_PUNCT = r'([\'\"\(\[\¿\¡\p{Pi}]+ *[\p{Upper}\p{N}])'
    # final punctuation
    FINAL_PUNCT = r'[\'\"\)\]\p{Pf}\%]+'
    # non-breaking prefix directory
    NOBREAK_DIR = 'nonbreaking_prefixes'
    # non-breaking prefix file
    NOBREAK_FILE = 'nonbreaking_prefix.'

    def __init__(self, options={}):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # load no-break prefixes for the given language
        self.__load_nobreaks(options.get('language'),
                             options.get('nobreak_file'))
        # compile regexes
        self.__spaces = Regex(r'\s+')
        self.__space_at_end = Regex(r'(^|\n) ')
        self.__space_at_begin = Regex(r' ($|\n)')
        self.__non_period = Regex(r'([?!]|\.{2,}) +' + self.SENT_STARTER)
        self.__in_punct = Regex(r'([?!\.] *' + self.FINAL_PUNCT + r') +' +
                                self.SENT_STARTER)
        self.__punct_follows = Regex(r'([?!\.]) +' + self.SENT_STARTER_PUNCT)
        self.__period = Regex(r'([\p{Alnum}\.\-]+)(' + self.FINAL_PUNCT +
                              r')? *$')
        self.__ucase_acronym = Regex(r'\.[\p{Upper}\-]+$')
        self.__numbers = Regex(r'^\p{N}')
        self.__sent_starter = Regex(self.SENT_STARTER)

    def split_sentences(self, text):
        """\
        Split sentences in the given text using current settings.
        """
        # clean
        text = self.__spaces.sub(r' ', text)
        text = self.__space_at_begin.sub(r'\1', text)
        text = self.__space_at_end.sub(r'\1', text)
        # break on special cases
        text = self.__non_period.sub(r'\1\n\2', text)
        text = self.__in_punct.sub(r'\1\n\2', text)
        text = self.__punct_follows.sub(r'\1\n\2', text)
        # break on periods
        words = text.split('. ')
        text = ''
        for word, next_word in zip(words[:-1], words[1:]):
            text += word + '.'
            match = self.__period.search(word)
            # check periods
            if match:
                prefix, end_punct = match.groups()
                # never break on no-break prefixes, upper case acronyms
                # and numeric no-breaks before numbers
                if (prefix in self.__nobreaks and not end_punct) or \
                        self.__ucase_acronym.search(prefix) or \
                        (prefix in self.__numeric_nobreaks and
                         not end_punct and self.__numbers.match(next_word)):
                    text += ' '
                # break before sentence starters
                elif self.__sent_starter.match(next_word):
                    text += "\n"
                # don't break otherwise
                else:
                    text += ' '
            # don't break when there's no period
            else:
                text += ' '
        # append last token (we stopped iterating just before it)
        text += words[-1]
        # return the result
        return text.split("\n")


    def __load_nobreaks(self, language=None, filename=None):
        """\
        Load non-breaking prefixes for the given language from a default
        location or from the given file.
        """
        # initialize sets of non-breaking prefixes
        self.__nobreaks = set()
        self.__numeric_nobreaks = set()
        # obtain file name from language specification
        if filename is None and language is not None:
            filename = os.path.dirname(__file__) + os.sep + \
                    self.NOBREAK_DIR + os.sep + self.NOBREAK_FILE + language
        # try to load prefixes from file
        if filename and os.path.isfile(filename):
            fh = codecs.open(filename, 'r', 'UTF-8')
            for item in fh:
                item = item.strip()
                if item and not item.startswith('#'):
                    match = regex.match(r'^(.*)\s+#NUMERIC_ONLY#', item)
                    if match:
                        self.__numeric_nobreaks.add(match.group(1))
                    else:
                        self.__nobreaks.add(item)
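A minimal usage sketch for the sentence splitter above, assuming Regex comes from the third-party regex package and that regex, os and codecs are imported as the class expects; without a non-breaking-prefix file the splitter simply falls back to its default behaviour:

splitter = SentenceSplitter({'language': 'en'})
for sentence in splitter.split_sentences('It was late. We went home!'):
    print(sentence)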
Example #47
class Inference:
	def __init__(self, numH, strings):
		self.hSpace_ = list()
		self.strings_ = strings
		self.baseH_ = Regex(strings)
		self.baseHProb_ = self.likelihood(self.baseH_)
		self.numH_ = numH
		self.addRegexes([(self.baseH_.copy(), self.baseHProb_)])

	def addRegexes(self, reSet):
		# add set
		for re, prob in reSet:
			load = True
			for h, _ in self.hSpace_:
				if re.equalTo(h):
					load = False
					continue
			if load:
				self.hSpace_.append((re, prob))

		# remove extra hypotheses
		self.sortHypotheses()
		# self.cullHypotheses()

	def generateAll(self):
		print "Generating hypothesis for", len(self.baseH_.states_), "states."
		allRegexes = totalSet(list((s.ID_) for s in self.baseH_.states_.values()))
		print "Total number of regexes", len(allRegexes)
		for regexStates in allRegexes:
			newRegex = self.baseH_.copy()
			for a in regexStates:
				if len(a) == 1:
					continue
				for b in a[1:]:
					newRegex.mergeRandom(a[0], b)
			self.addRegexes([(newRegex, self.likelihood(newRegex))])

	def cullHypotheses(self):
		for a in range(len(self.hSpace_) - self.numH_):
			del self.hSpace_[-1]

	def sortHypotheses(self):
		# sort by descending probability
		self.hSpace_ = sorted(self.hSpace_, key=lambda array: -array[1])

	def likelihood(self, re):
		result = re.logPrior()
		for string in self.strings_:
			accept, LL = re.string(string)
			if not accept:
				print "Error, regex does not accept string", string
				re.printText()
				re.printGraph("output/inference/error.png")
				assert False
			result += LL
		return result

	def duplicateHypotheses(self, permute=False):
		newH = list()
		for i in range(self.numH_ ):
			newRe = self.baseH_.copy()
			newRe.permuteRegex()
			newH.append((newRe, self.likelihood(newRe)))
		while len(self.hSpace_) < 2 * self.numH_:
			for re, prob in self.hSpace_[:]:
				re2 = re.copy()
				if permute:
					re2.permuteRegex()
				newH.append((re2, self.likelihood(re2)))
			self.addRegexes(newH)

	def testString(self, testString):
		totalProb = 0
		acceptProb = 0
		for h, prob in self.hSpace_:
			totalProb += exp(prob)
			accept, _ = h.string(testString)
			if accept:
				acceptProb += exp(prob)

		return acceptProb / totalProb

	def beamStep(self, re):
		newRegexes = list()
		# generate merged steps
		for stateID1 in list((s.ID_) for s in re.states_.values()):
			for stateID2 in list((s.ID_) for s in re.states_.values()):
				if stateID1 == stateID2:
					continue
				newRe = re.copy()
				newRe.mergeRandom(stateID1, stateID2)
				newRegexes.append((newRe, self.likelihood(newRe)))

		# generate wildcard steps
		for stateID1 in list((s.ID_) for s in re.states_.values()):
			for wildcard in ['S', 'N', 'A']:
				for k, s in re.states_[stateID1].next_:
					if keysOverlap(k, wildcard) and keyMinus(wildcard, k) != '':
						newRe = re.copy()
						newRe.wildcardize(stateID1, wildcard)
						newRegexes.append((newRe, self.likelihood(newRe)))

						# only replace one of the transitions for a wildcard
						break

		return newRegexes

	def beamSearch(self):
		beam = [(self.baseH_, self.baseHProb_)]
		newBeam = list()
		i = 0
		while len(beam) > 0:
			print "beam iteration:", i, "hypotheses:", len(beam)
			i += 1

			# take step forward
			while len(beam) > 0:
				h, prob = beam.pop(0)
				newBeam.extend(self.beamStep(h))

			# exit if there is no more step
			if len(newBeam) == 0:
				return

			# copy best hypotheses to old beam
			newBeam = sorted(newBeam, key=lambda array: -array[1])
			while len(beam) < BEAM_SIZE and len(newBeam) > 0:
				re1, prob1 = newBeam.pop(0)
				# add = True
				# for re2, prob2 in beam:
				# 	if re1.equalTo(re2):
				# 		add = False
				# 		break
				# if add:
				if True:
					beam.append((re1, prob1))

			beam[0][0].printGraph("output/beam-iter-%d-1.png"%i)
			beam[1][0].printGraph("output/beam-iter-%d-2.png"%i)
			beam[2][0].printGraph("output/beam-iter-%d-3.png"%i)
			beam[3][0].printGraph("output/beam-iter-%d-4.png"%i)
			beam[4][0].printGraph("output/beam-iter-%d-5.png"%i)

			# add hypotheses in beam to hset, clear newbeam
			self.addRegexes(beam)
			self.addRegexes(newBeam)
			newBeam = list()
Example #48
class Detokenizer(object):
    """Based on Ondrej Dusek's code"""

    # Moses special characters de-escaping
    ESCAPES = [('&bar;', '|'),
               ('&lt;', '<'),
               ('&gt;', '>'),
               ('&bra;', '['),
               ('&ket;', ']'),
               ('&amp;', '&')]  # should go last to prevent double de-escaping

    # Contractions for different languages
    CONTRACTIONS = {'en': r'^\p{Alpha}+(\'(ll|ve|re|[dsm])|n\'t)$',
                    'fr': r'^([cjtmnsdl]|qu)\'\p{Alpha}+$',
                    'es': r'^[dl]\'\p{Alpha}+$',
                    'it': r'^\p{Alpha}*(l\'\p{Alpha}+|[cv]\'è)$',
                    'cs': r'^\p{Alpha}+[-–](mail|li)$', }

    def __init__(self, **options):
        """\
        Constructor (pre-compile all needed regexes).
        """
        # process options
        self.moses_deescape = True if options.get('moses_deescape') else False
        self.language = options.get('language', 'en')
        self.capitalize_sents = True if options.get('capitalize_sents') else False
        # compile regexes
        self.__currency_or_init_punct = Regex(r'^[\p{Sc}\(\[\{\¿\¡]+$')
        self.__noprespace_punct = Regex(r'^[\,\.\?\!\:\;\\\%\}\]\)]+$')
        self.__cjk_chars = Regex(r'[\u1100-\u11FF\u2E80-\uA4CF\uA840-\uA87F'
                                 + r'\uAC00-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F'
                                 + r'\uFF65-\uFFDC]')
        self.__final_punct = Regex(r'([\.!?])([\'\"\)\]\p{Pf}\%])*$')
        # language-specific regexes
        self.__fr_prespace_punct = Regex(r'^[\?\!\:\;\\\%]$')
        self.__contract = None
        if self.language in self.CONTRACTIONS:
            self.__contract = Regex(self.CONTRACTIONS[self.language],
                                    IGNORECASE)

    def detokenize(self, text):
        """\
        Detokenize the given text using current settings.
        """
        # paste text back, omitting spaces where needed 
        words = text.split(' ')
        text = ''
        pre_spc = ' '
        quote_count = {'\'': 0, '"': 0, '`': 0}
        for pos, word in enumerate(words):
            # remove spaces in between CJK chars
            if self.__cjk_chars.match(text[-1:]) and \
                    self.__cjk_chars.match(word[:1]):
                text += word
                pre_spc = ' '
            # no space after currency and initial punctuation
            elif self.__currency_or_init_punct.match(word):
                text += pre_spc + word
                pre_spc = ''
            # no space before commas etc. (exclude some punctuation for French)
            elif self.__noprespace_punct.match(word) and \
                    (self.language != 'fr' or not
                     self.__fr_prespace_punct.match(word)):
                text += word
                pre_spc = ' '
            # contractions with comma or hyphen 
            elif word in "'-–" and pos > 0 and pos < len(words) - 1 \
                    and self.__contract is not None \
                    and self.__contract.match(''.join(words[pos - 1:pos + 2])):
                text += word
                pre_spc = ''
            # handle quoting
            elif word in '\'"„“”‚‘’`':
                # detect opening and closing quotes by counting 
                # the appropriate quote types
                quote_type = word
                if quote_type in '„“”':
                    quote_type = '"'
                elif quote_type in '‚‘’':
                    quote_type = '\''
                # exceptions for true Unicode quotes in Czech & German
                if self.language in ['cs', 'de'] and word in '„‚':
                    quote_count[quote_type] = 0
                elif self.language in ['cs', 'de'] and word in '“‘':
                    quote_count[quote_type] = 1
                # special case: possessives in English ("Jones'" etc.)                    
                if self.language == 'en' and text.endswith('s'):
                    text += word
                    pre_spc = ' '
                # really a quotation mark
                else:
                    # opening quote
                    if quote_count[quote_type] % 2 == 0:
                        text += pre_spc + word
                        pre_spc = ''
                    # closing quote
                    else:
                        text += word
                        pre_spc = ' '
                    quote_count[quote_type] += 1
            # keep spaces around normal words
            else:
                text += pre_spc + word
                pre_spc = ' '
        # de-escape chars that are special to Moses
        if self.moses_deescape:
            for char, repl in self.ESCAPES:
                text = text.replace(char, repl)
        # strip leading/trailing space
        text = text.strip()
        # capitalize, if the sentence ends with a final punctuation
        if self.capitalize_sents and self.__final_punct.search(text):
            text = text[0].upper() + text[1:]
        return text
Example #49
    def compile(self, grammar_type="regex"):
        """
        Compile according to the given grammar type and produce a DFA:
        'regex' means a regular expression, 'regular' means a (type-3) regular grammar.
        :param grammar_type: the grammar type ('regex' or 'regular')
        :return:
        """
        if grammar_type == 'regex':
            nfas = []
            for le in self.lexs:
                # print le
                nfas.append(Regex.compile_nfa(le[1], extend=True, type=le[0]))
            nfa = NFA.combine(*nfas)
            self.lex_dfa = nfa.convert_dfa(copy_meta=["type"])
            return
        elif grammar_type == "regular":
            """
            Type-3 (regular) grammar parsing was not originally planned here; the parser
            module also does grammar parsing, so this should be merged with that code.
            """
            nfas = []
            grammar = defaultdict(list)
            g_in, g_out = defaultdict(int), defaultdict(int)
            all_symbol = set()
            for l_hand, r_hand in self.lexs:
                l_hand = l_hand[1:-1]
                r_hands = [[x[1:-1] for x in r.strip().split()] for r in r_hand.split('|')]
                for hand in r_hands:
                    for h in hand:
                        g_in[h] += 1
                        all_symbol.add(h)
                g_out[l_hand] += 1
                all_symbol.add(l_hand)
                grammar[l_hand].extend(r_hands)
            grammar['limit'] = [[' '], ['\t'], ['\n']]
            ter, not_ter = [], []
            for sym in all_symbol:
                if g_in[sym] == 0:
                    not_ter.append(sym)
                if g_out[sym] == 0:
                    ter.append(sym)
            # print ter, not_ter
            nfas = []
            for token_type in not_ter:
                nfa = NFA()
                nfa.start = NFANode(r_name=token_type)
                end_node = NFANode(type=token_type)
                end_node.end = True
                nfa.end = {end_node}
                vis = {token_type: nfa.start}

                def get_node(name):
                    if name in vis:
                        return vis[name]
                    vis[name] = NFANode(r_name=name)
                    return vis[name]

                que = Queue()
                que.put(token_type)
                while not que.empty():
                    t = que.get()
                    node = get_node(t)
                    if node.meta.get('vis', 0) > 0:
                        continue
                    node.meta['vis'] = node.meta.get('vis', 0) + 1
                    for r_hand in grammar[t]:
                        node.next.setdefault(r_hand[0], set())
                        if len(r_hand) == 2:
                            node.next[r_hand[0]].add(get_node(r_hand[1]))
                            que.put(r_hand[1])
                        else:
                            node.next[r_hand[0]].add(end_node)
                nfas.append(nfa)
            nfa = NFA.combine(*nfas)
            self.lex_dfa = nfa.convert_dfa(copy_meta=["type"])
            return