Ejemplo n.º 1
0
 def test_qualified_re_split(self):
     self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
     self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
     self.assertEqual(re.split("(:)", ":a:b::c", 2),
                      ['', ':', 'a', ':', 'b::c'])
     self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                      ['', ':', 'a', ':', 'b::c'])
Ejemplo n.º 2
0
def split_setences(text):
    """Break *text* into fragments at every ``.``, ``!`` or ``?``.

    The delimiter characters are dropped.  Fragments keep their surrounding
    whitespace, and a delimiter at the very start or end of *text* yields an
    empty string at that end of the returned list.
    """
    return list(re.split("\\.|!|\\?", text))
Ejemplo n.º 3
0
 def test_re_split(self):
     self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
     self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
     self.assertEqual(re.split("(:*)", ":a:b::c"),
                      ['', ':', 'a', ':', 'b', '::', 'c'])
     self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
     self.assertEqual(re.split("(:)*", ":a:b::c"),
                      ['', ':', 'a', ':', 'b', ':', 'c'])
     self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                      ['', ':', 'a', ':b::', 'c'])
     self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                      ['', None, ':', 'a', None, ':', '', 'b', None, '',
                       None, '::', 'c'])
     self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                      ['', 'a', '', '', 'c'])
Ejemplo n.º 4
0
def extract_entities(text,deduplication=False):
    """Find candidate entities in *text* and split the text around them.

    The text is first passed through ``remove_accents`` and ``remove_digits``
    (helpers defined elsewhere), then scanned with the regular expression
    returned by ``build_regexpression()``.  Each raw match is cleaned up and
    filtered, and finally the ORIGINAL *text* is split on the surviving
    tokens.

    Args:
        text: The input string.
        deduplication: When true, each token is reported only once (first
            occurrence wins); otherwise repeated tokens are kept.

    Returns:
        A dict with two keys: ``'text'``, the list produced by splitting
        *text* on the tokens (the matched tokens are kept as list items
        because the split pattern is a capturing group), and ``'tokens'``,
        the list of extracted tokens.  When no token survives, ``'text'`` is
        ``[text]``.
    """
    import re  # local import keeps the block self-contained

    text_preprocessed = remove_accents(text)
    text_preprocessed = remove_digits(text_preprocessed)

    regexp2bat = build_regexpression()
    pt_patterns = PortuguesePatterns()
    excluded_tags = pt_patterns.tags_exclusions

    # Phase 0: every raw match of the entity expression.
    phase_0_entities = [
        match.group() for match in re.finditer(regexp2bat, text_preprocessed)
    ]

    # Phase 1: drop a leading word (and then possibly the next one) whose
    # part-of-speech tag is in the exclusion list.
    phase_1_entities = []
    for token in phase_0_entities:
        doc = nlp(token)
        if len(doc) == 0:
            # Nothing to inspect; such a token would be discarded below anyway.
            continue
        first_pos = doc[0].pos_
        first_removed = first_pos in excluded_tags
        if first_removed:
            token = token.replace(doc[0].text, '', 1)
        # The second word is examined when the first one was removed or
        # carried an empty tag.  A flag is used instead of overwriting
        # ``doc[0].pos_`` so the parsed document is left untouched.
        if (first_removed or first_pos == '') and len(doc) > 1:
            if doc[1].pos_ in excluded_tags:
                token = token.replace(doc[1].text, '', 1)
        if token.strip() != '':
            phase_1_entities.append(token)

    # Phase 2: length / stop-word filtering and optional de-duplication.
    unique_tokens = []
    already_seen = set()
    for token in phase_1_entities:
        # The length test is applied BEFORE stripping, as in the original.
        if len(token) <= 2:
            continue
        token = token.strip()
        if token in pt_patterns.stopwords or token in pt_patterns.preprositions:
            continue
        if deduplication:
            if token in already_seen:
                continue
            already_seen.add(token)
        unique_tokens.append(token)

    if unique_tokens:
        # Tokens are literal text, so they are escaped before being joined
        # into an alternation.
        origin = "(" + "|".join(re.escape(token) for token in unique_tokens) + ")"
        text = re.split(origin, text)
    else:
        # An empty alternation "()" would match at every position and split
        # the text into single characters.
        text = [text]

    output = {'text': text, 'tokens': unique_tokens}
    return output
Ejemplo n.º 5
0
def execute(mode, code, input_str):
  """Run one instruction and chain into the instruction it produces, if any.

  ``mode`` is a one-character opcode, ``code`` is its payload and
  ``input_str`` is the current input.  Inside ``code`` the characters
  ``;``, ``&``, a backtick and ``!`` act as separators; the look-behind used
  in every split leaves a separator alone when it is preceded by a
  backslash escape.

  Modes handled here:
    l  lookup table (rows split on ``;``, cells on ``&``; the last cell of a
       row is the value for every earlier cell); falls back to key ``?``
    f  ``code % literal_eval(input_str)``
    F  like ``f``, and the input becomes the total length of the formatted
       value(s) as a string
    g  print every string generated from ``code`` by exrex, then stop
    h  like ``g`` with the (escaped) input formatted into ``code`` first
    p  repeat ``~...~`` (and, for a pair, ``%...%``) spans
    P  repeat the character in front of each ``~``
    e  translate the input character by character through a lookup table
    o  print the first backtick-separated piece, continue with the rest
    s  apply pattern/replacement pairs to the input
    d  delete every pattern from the input
    S  like ``s`` but prints the substituted text and leaves the input as is
    i  ``code + input_str``;  I  ``code + "\\n" + input_str``
  Any other mode yields ``code`` unchanged.

  If the result starts with a backtick it is treated as a further
  instruction (its second character is the next mode) and executed
  recursively; otherwise it is printed after unicode-escape decoding.

  Returns:
      None.  All output goes to standard output.
  """
  result = ""

  if mode == "l":
    rows = [pcre.split(r"(?<![^\\]\\)&", row) for row in pcre.split(r"(?<![^\\]\\);", code)]
    table = {}
    for row in rows:
      table.update(dict.fromkeys(row[:-1], row[-1]))
    if input_str in table:
      result = table[input_str]
    else:
      result = table["?"]
  elif mode == "f":
    result = code % ast.literal_eval(input_str)
  elif mode == "F":
    literal = ast.literal_eval(input_str)
    result = code % literal
    if isinstance(literal, tuple):
      input_str = str(sum(len(str(x)) for x in literal))
    else:
      input_str = str(len(str(literal)))
  elif mode == "g":
    for string in exrex.generate(code):
      print(string.encode("utf-8").decode("unicode-escape"))
    return # Generate is always terminal
  elif mode == "h":
    if isinstance(input_str, str):
      input_str = pcre.escape(input_str)
    for string in exrex.generate(code % input_str):
      print(string.encode("utf-8").decode("unicode-escape"))
    return
  elif mode == "p":
    literal = ast.literal_eval(input_str)
    if isinstance(literal, int):
      result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~",r"\1" * literal, code, flags=pcre.DOTALL)
    else:
      result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%",r"\1" * literal[1], pcre.sub(r"~(.+?)~",r"\1" * literal[0], code, flags=pcre.DOTALL), flags=pcre.DOTALL)
  elif mode == "P":
    result = pcre.sub(r"(.)(?<![^\\]\\)~",r"\1" * ast.literal_eval(input_str), code, flags=pcre.DOTALL)
  elif mode == "e":
    rows = [pcre.split(r"(?<![^\\]\\)&", row) for row in pcre.split(r"(?<![^\\]\\);", code)]
    table = {}
    for row in rows:
      table.update(dict.fromkeys(row[:-1], row[-1]))
    # Translate the input one character at a time.  The previous code
    # iterated over the name ``i`` (unbound inside this function) and
    # indexed the table with the whole value instead of the character.
    for char in input_str:
      result += table[char]
  elif mode == "o":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    print(pieces[0].encode("utf-8").decode("unicode-escape"))
    # Only chain when something follows the printed piece; a lone backtick
    # has no mode character and would fail on result[1] below.
    if len(pieces) > 1:
      result = "`" + "`".join(pieces[1:])
    else:
      result = ""
  elif mode == "s":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])

    # subs alternates pattern, replacement, pattern, replacement, ...
    for index in range(0, len(subs), 2):
      input_str = pcre.sub(subs[index], subs[index + 1], input_str)

    if len(pieces) > 1:
      result = "`" + "`".join(pieces[1:])
    else:
      result = input_str
  elif mode == "d":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])

    for sub in subs:
      input_str = pcre.sub(sub, "", input_str)

    if len(pieces) > 1:
      result = "`" + "`".join(pieces[1:])
    else:
      result = input_str
  elif mode == "S":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])
    output = input_str
    for index in range(0, len(subs), 2):
      output = pcre.sub(subs[index], subs[index + 1], output)
    if len(pieces) > 1:
      result = "`" + "`".join(pieces[1:])
    else:
      result = ""
    print(output.encode("utf-8").decode("unicode-escape"))
  elif mode == "i":
    result = code + input_str
  elif mode == "I":
    result = code + "\n" + input_str
  else:
    result = code

  if len(result) > 0 and result[0] == "`":
    # The result is itself an instruction: "`" + mode + payload, optionally
    # followed by "!" and inline input for that instruction.
    input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
    if len(input_pieces) >= 2:
      execute(result[1], input_pieces[0][2:], "!".join(input_pieces[1:]))
    else:
      execute(result[1], result[2:], get_input(input_str))
  else:
    print(result.encode("utf-8").decode("unicode-escape"))
Ejemplo n.º 6
0
    result = code

  if len(result) > 0 and result[0] == "`":
    input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
    if len(input_pieces) >= 2:
      execute(result[1], input_pieces[0][2:], "!".join(input_pieces[1:]))
    else:
      execute(result[1], result[2:], get_input(input_str))
  else:
    print(result.encode("utf-8").decode("unicode-escape"))

# Script entry point: read the program file named by the first command-line
# argument and run its first instruction through execute().
if __name__ == "__main__":
  # Options are parsed by docopt from the module docstring.
  args = docopt(__doc__)
  pcre.enable_re_template_mode()
  # The program is read as raw bytes.
  with open(sys.argv[1], 'rb') as file:
    string = file.read()
    # A file whose SHA-256 digest equals this constant is not interpreted:
    # one line of input is read into the module-level name `i` and the file
    # contents are passed to exec().
    # NOTE(review): exec() runs whatever the file contains; the digest
    # comparison is the only gate in front of it.
    if hashlib.sha256(string).hexdigest() == "bca4894ae7cf4919e3b3977583df930c8f4bf5b75c8bf5ada9de1d9607ef846b":
      i = input()
      exec(string)
    else:
      # The first byte of the file is the mode of the first instruction.
      mode = chr(string[0])
      # NOTE(review): `string` is bytes, so str(string) yields its "b'...'"
      # repr rather than decoded text -- confirm that is what the -u path
      # expects (presumably -u means "uncompressed"; verify against the
      # usage text).
      code = str(string) if args['-u'] else decompress(string)
      # Everything after the first "!" separator is inline input; the part
      # before it is the payload handed to execute().
      input_pieces = pcre.split(r"(?<![^\\]\\)!", code)

      # `i` is assigned at module level, so it is visible as a global to the
      # functions in this file.
      if len(input_pieces) >= 2:
        i = get_input("!".join(input_pieces[1:]))
      else:
        i = get_input("")

      execute(mode, input_pieces[0], i)
Ejemplo n.º 7
0
def execute(mode, code, input_str):
  """Run one instruction and chain into the instruction it produces, if any.

  ``mode`` is a one-character opcode, ``code`` is its payload and
  ``input_str`` is the current input.  Inside ``code`` the characters
  ``;``, ``&``, a backtick and ``!`` act as separators; the look-behind used
  in every split leaves a separator alone when it is preceded by a
  backslash escape.

  The helpers ``handle_table``, ``handle_subs``, ``handle_pieces`` and
  ``unescape`` are defined elsewhere in the file.  Presumably they build the
  lookup dict from split rows, apply pattern/replacement pairs, rebuild the
  continuation instruction (or return the given fallback), and decode escape
  sequences for printing -- verify against their definitions.

  Modes handled here:
    l  table lookup of the whole input, falling back to key ``?``
    f  ``code % literal_eval(input_str)``
    F  like ``f``, and the input becomes the total length of the formatted
       value(s) as a string
    g  print every string generated from ``code`` by exrex, then stop
    h  like ``g`` with the (escaped) input formatted into ``code`` first
    p  repeat ``~...~`` (and, for a pair, ``%...%``) spans
    P  repeat the character in front of each ``~``
    e  translate the input character by character through a lookup table
    o  print the first backtick-separated piece, continue with the rest
    s  apply substitutions to the input
    d  delete every pattern from the input
    S  print the substituted input, leave the input itself unchanged
    i  ``code + input_str``;  I  ``code + "\\n" + input_str``
  Any other mode yields ``code`` unchanged.

  If the result starts with a backtick it is treated as a further
  instruction (its second character is the next mode) and executed
  recursively; otherwise it is printed through ``unescape``.

  Returns:
      None.  All output goes to standard output.
  """
  result = ""

  if mode == "l":
    rows = (pcre.split(r"(?<![^\\]\\)&", row) for row in pcre.split(r"(?<![^\\]\\);", code))
    table = handle_table(rows)

    if input_str in table:
      result = table[input_str]
    else:
      result = table["?"]
  elif mode == "f":
    result = code % ast.literal_eval(input_str)
  elif mode == "F":
    literal = ast.literal_eval(input_str)
    result = code % literal
    if isinstance(literal, tuple):
      input_str = str(sum(len(str(x)) for x in literal))
    else:
      input_str = str(len(str(literal)))
  elif mode == "g":
    for string in exrex.generate(code):
      print(unescape(string))
    return # Generate is always terminal
  elif mode == "h":
    if isinstance(input_str, str):
      input_str = pcre.escape(input_str)
    for string in exrex.generate(code % input_str):
      print(unescape(string))
    return
  elif mode == "p":
    literal = ast.literal_eval(input_str)
    if isinstance(literal, int):
      result = pcre.sub(r"(?<![^\\]\\)~(.+?)(?<![^\\]\\)~",r"\1" * literal, code, flags=pcre.DOTALL)
    else:
      result = pcre.sub(r"(?<![^\\]\\)%(.+?)(?<![^\\]\\)%",r"\1" * literal[1], pcre.sub(r"~(.+?)~",r"\1" * literal[0], code, flags=pcre.DOTALL), flags=pcre.DOTALL)
  elif mode == "P":
    result = pcre.sub(r"(.)(?<![^\\]\\)~",r"\1" * ast.literal_eval(input_str), code, flags=pcre.DOTALL)
  elif mode == "e":
    rows = (pcre.split(r"(?<![^\\]\\)&", row) for row in pcre.split(r"(?<![^\\]\\);", code))
    table = handle_table(rows)

    # Translate the current input one character at a time.  The previous
    # code read the module-level name ``i`` instead of this call's input and
    # indexed the table with the whole value rather than each character.
    result = "".join(table[char] for char in input_str)
  elif mode == "o":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    print(unescape(pieces[0]))
    result = handle_pieces(pieces[1:], "")
  elif mode == "s":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])

    input_str = handle_subs(input_str, subs)
    result = handle_pieces(pieces[1:], input_str)
  elif mode == "d":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])

    for sub in subs:
      input_str = pcre.sub(sub, "", input_str)

    result = handle_pieces(pieces[1:], input_str)
  elif mode == "S":
    pieces = pcre.split(r"(?<![^\\]\\)`", code)
    subs = pcre.split(r"(?<![^\\]\\)&", pieces[0])

    output = unescape(handle_subs(input_str, subs))

    result = handle_pieces(pieces[1:], "")
    print(output)
  elif mode == "i":
    result = code + input_str
  elif mode == "I":
    result = code + "\n" + input_str
  else:
    result = code

  if len(result) > 0 and result[0] == "`":
    # The result is itself an instruction: "`" + mode + payload, optionally
    # followed by "!" and inline input for that instruction.
    input_pieces = pcre.split(r"(?<![^\\]\\)!", result)
    if len(input_pieces) >= 2:
      execute(result[1], input_pieces[0][2:], "!".join(input_pieces[1:]))
    else:
      execute(result[1], result[2:], get_input(input_str))
  else:
    print(unescape(result))