def test_bug_449000(self):
    # Test for sub() on escaped characters
    self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
                     'abc\ndef\n')
    self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
                     'abc\ndef\n')
    self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
                     'abc\ndef\n')
    self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                     'abc\ndef\n')
def get_dll_exports(self) -> str:
    file_type = self._get_filetype(self.file_data)
    if HAVE_PEFILE and file_type and ("PE32" in file_type or "MS-DOS executable" in file_type):
        try:
            pe = pefile.PE(self.file_path)
            if hasattr(pe, "DIRECTORY_ENTRY_EXPORT"):
                exports = []
                for exported_symbol in pe.DIRECTORY_ENTRY_EXPORT.symbols:
                    try:
                        if not exported_symbol.name:
                            continue
                        if isinstance(exported_symbol.name, bytes):
                            exports.append(re.sub(b"[^A-Za-z0-9_?@-]", b"", exported_symbol.name).decode())
                        else:
                            exports.append(re.sub("[^A-Za-z0-9_?@-]", "", exported_symbol.name))
                    except Exception as e:
                        log.error(e, exc_info=True)
                return ",".join(exports)
        except Exception as e:
            log.error("PE type not recognised")
            log.error(e, exc_info=True)
    return ""
def parsers(uscfiles, findreplace):
    parsedfiles = []
    for counter, section in enumerate(uscfiles):
        #print ""
        #print "File", counter
        parsedfile = subfile(section, findreplace)
        #print parsedfile

        # Replace Multiple Section references with links
        # Include [^<] to make sure no group is transformed twice
        pattern = r'@@@\s[Ss]ections?\s([^<]*?)@@@@@(.*?)@@'
        pattern_replace = [r'%s' % u'(\d+\w*(?:\(\w+\))*[-|–]?\d*)([, @])',
                           r'<a href="/laws/target/%s/\1" class="sec">\1</a>\2']
        parsedfile = parsesections(pattern, pattern_replace, parsedfile)

        #parsedfile = re.sub(r'@of-ref@', r'ref-Title-'+title, parsedfile)
        parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)
        #parsedfile = re.sub(r'ref-title-this', r'ref-title-'+title, parsedfile)

        # Encode Named Acts by removing lowercase and non-word characters,
        # and appending the length of the name w/o non-word characters
        #pattern = r'@@ref-namedact-(.*?)@@'
        pattern = r'/ref-namedact-(.*?)/'
        parsedfile = parsenamedacts(pattern, parsedfile)

        parsedfile = re.sub(r'@of-ref@', r'ref-title-this', parsedfile)
        parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)

        # Remove remaining @
        parsedfile = parsedfile.replace('@', '')  #.translate(None, '@')

        parsedfiles.append(parsedfile)
    return parsedfiles
def build_options(self):
    """Generate analysis options.
    @return: options dict.
    """
    options = {}
    options["id"] = self.task.id
    options["ip"] = self.machine.resultserver_ip
    options["port"] = self.machine.resultserver_port
    options["category"] = self.task.category
    options["target"] = self.task.target
    options["package"] = self.task.package
    options["options"] = self.task.options
    options["enforce_timeout"] = self.task.enforce_timeout
    options["clock"] = self.task.clock
    options["terminate_processes"] = self.cfg.cuckoo.terminate_processes

    if not self.task.timeout or self.task.timeout == 0:
        options["timeout"] = self.cfg.timeouts.default
    else:
        options["timeout"] = self.task.timeout

    if self.task.category == "file":
        options["file_name"] = File(self.task.target).get_name()
        options["file_type"] = File(self.task.target).get_type()
        # if it's a PE file, collect export information to use in more smartly determining the right
        # package to use
        options["exports"] = ""
        if HAVE_PEFILE and ("PE32" in options["file_type"] or "MS-DOS executable" in options["file_type"]):
            try:
                pe = pefile.PE(self.task.target)
                if hasattr(pe, "DIRECTORY_ENTRY_EXPORT"):
                    exports = []
                    for exported_symbol in pe.DIRECTORY_ENTRY_EXPORT.symbols:
                        try:
                            if not exported_symbol.name:
                                continue
                            if isinstance(exported_symbol.name, bytes):
                                exports.append(re.sub(b"[^A-Za-z0-9_?@-]", b"", exported_symbol.name).decode("utf-8"))
                            else:
                                exports.append(re.sub("[^A-Za-z0-9_?@-]", "", exported_symbol.name))
                        except Exception as e:
                            log.error(e, exc_info=True)
                    options["exports"] = ",".join(exports)
            except Exception as e:
                log.error("PE type not recognised")
                log.error(e, exc_info=True)

    # options from auxiliar.conf
    for plugin in self.aux_cfg.auxiliar_modules.keys():
        options[plugin] = self.aux_cfg.auxiliar_modules[plugin]

    return options
def handle_techniques(line, **opts):
    vb_vars = opts["vb_vars"]
    enc_func_name = opts["enc_func_name"]
    decrypt_func = opts["decrypt_func"]

    def var_substitute(m):
        var = m.group(1)

    line = line.replace('"', '"""')
    line = re.sub(r'"""([A-F0-9]{2,})"""', decode_hex, line)
    line = re.sub(r'"""([\w_+=/]{2,})"""', decode_base64, line)
    line = re.sub(r'(?i)Chr[A-Z$]\(Asc[A-Z$](.+?)\)\)', r"\1", line)
    line = re.sub(r'(?i)Asc[A-Z$]\("""(\w)\w*"""\)', lambda m: str(ord(m.group(1))), line)
    line = re.sub(r'(?i)((?:Chr[A-Z$]?\(\d+\)\s*&?\s*)+)', decode_chr, line)
    line = re.sub(r'(?i)\b%s\s*\(\w+\("""(.+?)"""\),\s*\w+\("""(.+?)"""' % enc_func_name, decrypt_func, line)
    line = re.sub(r'(?i)\b%s\((?:""")?(.+?)(?:""")?,\s*(?:""")?(.+?)(?:""")?\)' % enc_func_name, decrypt_func, line)
    line = re.sub(r'(?i)StrReverse\(.+?"""(.+?)"""\)', decode_reverse, line)
    line = re.sub(r'""".+?"""\s+&+\s+""".+?""".+', concatenate, line)

    while "Chr(Asc(" in line:
        lastline = line
        line = re.sub(r'(?i)Chr\(Asc\((.+?)\)\)', r"\1", line)
        if line == lastline:
            break

    # Remove quotes before regexing against them.
    line = line.replace('""" + """', '')
    line = line.replace('"""', '')

    # Remove a few concat patterns. Theres a bug with some obfuscation
    # techniques.
    line = line.replace(" + ", "")
    line = line.replace(" & ", "")

    return line
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    if eventData in self.results:
        self.sf.debug("Skipping " + eventData + " as already mapped.")
        return None
    else:
        self.results[eventData] = True

    address = eventData

    # Skip post office boxes
    if address.lower().startswith('po box'):
        self.sf.debug("Skipping PO BOX address")
        return None

    rx1 = re.compile(r'^(c/o|care of|attn:|attention:)\s+[0-9a-z\s\.]', flags=re.IGNORECASE)
    # Remove address prefixes for delivery instructions
    address = re.sub(rx1, r'', address)

    rx2 = re.compile(r'^(Level|Floor|Suite|Room)\s+[0-9a-z]+,', flags=re.IGNORECASE)
    # Remove address prefixes known to return no results (floor, level, suite, etc).
    address = re.sub(rx2, r'', address)

    # Search for address
    data = self.query(eventData)

    # Usage Policy mandates no more than 1 request per second
    time.sleep(1)

    if data is None:
        self.sf.debug("Found no results for " + eventData)
        return None

    self.sf.info("Found " + str(len(data)) + " matches for " + eventData)

    for location in data:
        try:
            lat = location.get('lat')
            lon = location.get('lon')
        except BaseException as e:
            self.sf.debug("Failed to get lat/lon: " + str(e))
            continue

        if not lat or not lon:
            continue

        coords = str(lat) + "," + str(lon)
        self.sf.debug("Found coordinates: " + coords)

        evt = SpiderFootEvent("PHYSICAL_COORDINATES", coords, self.__name__, event)
        self.notifyListeners(evt)
def handle_techniques(line, **opts):
    vb_vars = opts["vb_vars"]
    enc_func_name = opts["enc_func_name"]
    decrypt_func = opts["decrypt_func"]

    def var_substitute(m):
        var = m.group(1)

    line = line.replace('"', '"""')
    line = re.sub(r'"""([A-F0-9]{2,})"""', decode_hex, line)
    line = re.sub(r'"""([\w_+=/]{2,})"""', decode_base64, line)
    line = re.sub(r"(?i)Chr[A-Z$]\(Asc[A-Z$](.+?)\)\)", r"\1", line)
    # The replacement callable must return a string, so the ordinal is wrapped in str().
    line = re.sub(r'(?i)Asc[A-Z$]\("""(\w)\w*"""\)', lambda m: str(ord(m.group(1))), line)
    line = re.sub(r"(?i)((?:Chr[A-Z$]?\(\d+\)\s*&?\s*)+)", decode_chr, line)
    line = re.sub(r'(?i)\b%s\s*\(\w+\("""(.+?)"""\),\s*\w+\("""(.+?)"""' % enc_func_name, decrypt_func, line)
    line = re.sub(r'(?i)\b%s\((?:""")?(.+?)(?:""")?,\s*(?:""")?(.+?)(?:""")?\)' % enc_func_name, decrypt_func, line)
    line = re.sub(r'(?i)StrReverse\(.+?"""(.+?)"""\)', decode_reverse, line)
    line = re.sub(r'""".+?"""\s+&+\s+""".+?""".+', concatenate, line)

    while "Chr(Asc(" in line:
        lastline = line
        line = re.sub(r"(?i)Chr\(Asc\((.+?)\)\)", r"\1", line)
        if line == lastline:
            break

    # Remove quotes before regexing against them.
    line = line.replace('""" + """', "")
    line = line.replace('"""', "")

    # Remove a few concat patterns. Theres a bug with some obfuscation
    # techniques.
    line = line.replace(" + ", "")
    line = line.replace(" & ", "")

    return line
def clean(t):
    """
    normalize numbers, discard some punctuation that can be ambiguous
    """
    t = re.sub('[.,\d]*\d', '<NUM>', t)
    t = re.sub('[^a-zA-Z0-9,.;:<>\-\'\/?!$% ]', '', t)
    t = t.replace('--', ' ')  # sometimes starts a sentence... trouble
    return t
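# Usage sketch for clean() above; the sample sentence is invented, not from the
# source. Runs of digits (with embedded '.' or ',') collapse to the <NUM> token,
# characters outside the allowed set are dropped, and '--' becomes a space:
#   clean("Sales rose 12.5% in Q3 -- see Figure 2")
#   => "Sales rose <NUM>% in Q<NUM>   see Figure <NUM>"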
def __open_text_cleaner(word):
    """
    Direct copy from the original BSG setup.
    The tokens matching logic was moved to bsg_tokenizer.py
    """
    word = re.sub(r'[^\w\'\-]|[\'\-\_]{2,}', "", word)
    if len(word) == 1:
        word = re.sub(r'[^\daiu]', '', word)
    return word
def run(self, results):
    """Run Moloch to import pcap
    @return: nothing
    """
    self.key = "moloch"
    self.alerthash = {}
    self.fileshash = {}
    self.MOLOCH_CAPTURE_BIN = self.options.get("capture", None)
    self.MOLOCH_CAPTURE_CONF = self.options.get("captureconf", None)
    self.CUCKOO_INSTANCE_TAG = self.options.get("node", None)
    self.MOLOCH_USER = self.options.get("user", None)
    self.MOLOCH_PASSWORD = self.options.get("pass", None)
    self.MOLOCH_REALM = self.options.get("realm", None)
    self.MOLOCH_AUTH = self.options.get("auth", "digest")
    self.pcap_path = os.path.join(self.analysis_path, "dump.pcap")
    self.MOLOCH_URL = self.options.get("base", None)
    self.task_id = results["info"]["id"]
    self.custom = None

    if "machine" in results["info"] and results["info"]["machine"] and "name" in results["info"]["machine"]:
        self.machine_name = re.sub(r"[\W]", "_", str(results["info"]["machine"]["name"]))
    else:
        self.machine_name = "Unknown"

    if results["info"].has_key("options") and results["info"]["options"].has_key("setgw"):
        self.gateway = re.sub(r"[\W]", "_", str(results["info"]["options"]["setgw"]))
    else:
        self.gateway = "Default"

    if results["info"].has_key("options") and results["info"].has_key("custom"):
        self.custom = re.sub(r"[\W]", "_", str(results["info"]["custom"]))

    if not os.path.exists(self.MOLOCH_CAPTURE_BIN):
        log.warning("Unable to Run moloch-capture: BIN File %s Does Not Exist" % (self.MOLOCH_CAPTURE_BIN))
        return

    if not os.path.exists(self.MOLOCH_CAPTURE_CONF):
        log.warning("Unable to Run moloch-capture Conf File %s Does Not Exist" % (self.MOLOCH_CAPTURE_CONF))
        return

    try:
        cmd = "%s -c %s -r %s -n %s -t %s:%s -t cuckoo_jtype:%s -t cuckoo_machine:%s -t cuckoo_gw:%s" % (
            self.MOLOCH_CAPTURE_BIN, self.MOLOCH_CAPTURE_CONF, self.pcap_path, self.CUCKOO_INSTANCE_TAG,
            self.CUCKOO_INSTANCE_TAG, self.task_id, self.task["category"], self.machine_name, self.gateway)
        if self.custom:
            cmd = cmd + " -t custom:%s" % (self.custom)
    except Exception, e:
        log.warning("Unable to Build Basic Moloch CMD: %s" % e)
def termcenter():
    parser = argparse.ArgumentParser(description='Center stuff on terminals')
    parser.add_argument('string', nargs='*', type=str)
    args = parser.parse_args()

    for e in [sys.stdin] + args.string:
        lines = [e] if isinstance(e, str) else e.readlines()
        if lines:
            width = max(map(len, map(lambda s: re.sub(r'\x1B\[[0-9;]+m|\$.*\$', '', s), lines)))
            pad = int((os.get_terminal_size()[0] - width) / 2)
            for line in lines:
                print(' ' * pad + re.sub(r'\$.*\$|\n', '', line))
def process_tweet(tweet, stopwords):
    # convert to lowercase
    tweet = tweet.lower()
    # replace any links with "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # replace "@username" references with "AT_USER"
    tweet = re.sub(r'@[\S]+', 'AT_USER', tweet)
    # replace hashtags: #word -> word
    tweet = re.sub(r'#([\S]+)', r'\1', tweet)
    tweet = word_tokenize(tweet)
    return [word for word in tweet if word not in stopwords]
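# Usage sketch for process_tweet() above; the tweet and the stopword set are
# invented for illustration, and word_tokenize is assumed to be nltk's tokenizer:
#   process_tweet("Check https://example.com from @alice #python", {"from", "check"})
# The tweet is lowercased first, the link becomes "URL", the mention becomes
# "AT_USER", "#python" loses its "#", and any resulting token found in the
# stopword set is dropped.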
def preprocess(sentence):
    """ A simple function to handle preprocessing of a sentence"""
    # Lowercase sentence
    sentence = sentence.lower()
    # Remove @ mentions
    sentence = re.sub('@[a-z0-9_]+', '', sentence)
    # Remove URLs
    sentence = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        '', sentence)
    # Remove NewLines
    sentence = sentence.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    return sentence
def are_equals_urls(url1, url2):
    result = False
    url1 = urllib.parse.unquote(url1)
    url2 = urllib.parse.unquote(url2)
    url1 = re.sub("^(https?://(www.)?)", "", url1)
    url1 = re.sub(' ', '', url1)
    url2 = re.sub("^(https?://(www.)?)", "", url2)
    url2 = re.sub(' ', '', url2)
    if url1.endswith('/'):
        url1 = url1[0:-1]
    if url2.endswith('/'):
        url2 = url2[0:-1]
    if url1 == url2:
        result = True
    return result
def get_features(frag, model):
    """
    ... w1. (sb?) w2 ...
    Features, listed roughly in order of importance:

    (1) w1: word that includes a period
    (2) w2: the next word, if it exists
    (3) w1length: number of alphabetic characters in w1
    (4) w2cap: true if w2 is capitalized
    (5) both: w1 and w2
    (6) w1abbr: log count of w1 in training without a final period
    (7) w2lower: log count of w2 in training as lowercased
    (8) w1w2upper: w1 and w2 is capitalized
    """
    words1 = clean(frag.tokenized).split()
    if not words1:
        w1 = ''
    else:
        w1 = words1[-1]
    if frag.next:
        words2 = clean(frag.next.tokenized).split()
        if not words2:
            w2 = ''
        else:
            w2 = words2[0]
    else:
        words2 = []
        w2 = ''

    c1 = re.sub('(^.+?\-)', '', w1)
    c2 = re.sub('(\-.+?)$', '', w2)

    feats = {}
    feats['w1'] = c1
    feats['w2'] = c2
    feats['both'] = c1 + '_' + c2

    len1 = min(10, len(re.sub('\W', '', c1)))

    if c1.replace('.', '').isalpha():
        feats['w1length'] = str(len1)
        try:
            feats['w1abbr'] = str(int(math.log(1 + model.non_abbrs[c1[:-1]])))
        except:
            feats['w1abbr'] = str(int(math.log(1)))

    if c2.replace('.', '').isalpha():
        feats['w2cap'] = str(c2[0].isupper())
        try:
            feats['w2lower'] = str(int(math.log(1 + model.lower_words[c2.lower()])))
        except:
            feats['w2lower'] = str(int(math.log(1)))
        feats['w1w2upper'] = c1 + '_' + str(c2[0].isupper())

    return feats
def on_call(self, call, process):
    # Legacy, modern Dyre doesn't have hardcoded hashes in
    # CryptHashData anymore
    iocs = [
        "J7dnlDvybciDvu8d46D\\x00",
        "qwererthwebfsdvjaf+\\x00",
    ]
    pipe = [
        "\\??\\pipe\\3obdw5e5w4",
        "\\??\\pipe\\g2fabg5713",
    ]
    if call["api"] == "CryptHashData":
        buf = self.get_argument(call, "Buffer")
        if buf in iocs:
            self.cryptoapis = True
        tmp = re.sub(r"\\x[0-9A-Fa-f]{2}", "", buf)
        if self.compname in tmp:
            if re.match("^" + self.compname + "[0-9 ]+$", tmp):
                self.cryptoapis = True
    elif call["api"] == "HttpOpenRequestA":
        buf = self.get_argument(call, "Path")
        if len(buf) > 10:
            self.networkapis.add(buf)
    elif call["api"] == "NtCreateNamedPipeFile":
        buf = self.get_argument(call, "PipeName")
        for npipe in pipe:
            if buf == npipe:
                self.syncapis = True
                break

    return None
def rewrite(self, response):
    """
    Rewrites response according to matching rewrite rules.

    :return: rewritten response
    """
    headers_r = response.headers
    content_r = response.data
    for r in self.rules:
        if r.policy == 'rewrite-headers':
            # todo: support rewrite-headers, use, e.g.,
            # response.headers['X-Archive-Guessed-Content-Type']
            # is this bytes or string?
            # also, we're rewriting only "rewritable" mimetypes in wsgiapp.py
            self._log.warn(f'rulesengine policy rewrite-headers to be implemented')
            continue
        if r.policy == 'rewrite-all':
            try:
                content_r = re.sub(r.rewrite_from, r.rewrite_to, content_r, options=re2_options)
                self._log.info(f'rewriting response.data from... {r.rewrite_from[:80]}... to ...{r.rewrite_to[:80]}')
            except Exception as e:
                self._log.warn(f'exception rewriting response.data from {r.rewrite_from[:80]}: {e}')
                content_r = response.data
        elif r.policy == 'rewrite-js':
            self._log.warn(f'rulesengine policy rewrite-js to be implemented')
            # support this here?
            continue
        else:
            self._log.warn('unexpected policy; nothing rewritten!')
    return headers_r, content_r
def _files(files):
    if not files:
        files = []

    hash = md5(str(datetime.now())).hexdigest()
    dest = '%s/%s/%s/%s' % (env.user.login[0], env.user.login, hash[:2], hash[2:4])

    files_del = env.request.args('del-attach', [])
    if not isinstance(files_del, (list, tuple)):
        files_del = [files_del]
    for f in files_del:
        if f not in files:
            continue
        remove_attach(f)
        files.remove(f)

    files_in = env.request.args('attach', [])
    files_p = env.request.files('attach')
    if not isinstance(files_in, (list, tuple)):
        files_in = [files_in]
        files_p = [files_p]

    for i, file in enumerate(files_in[:10]):
        if isinstance(file, str):
            file = file.decode('utf-8')
        file = re.sub(r'[^\w\.]+', '-', unidecode(file))
        d = "%s/%s/" % (dest, randint(1000, 9999))
        make_attach(files_p[i], d, file, remove=True)
        files.append(os.path.join(d, file))

    return files
def run(self) -> List[str]:
    ret = []
    with open(self.filepath, "r") as f:
        source = f.read()

    # Get rid of superfluous comments.
    source = re.sub("/\\*.*?\\*/", "", source, flags=re.S)

    for script in re.findall(self.script_re, source, re.I | re.S):
        try:
            x = bs4.BeautifulSoup(script, "html.parser")
            language = x.script.attrs.get("language", "").lower()
        except Exception:
            language = None

        # We can't rely on bs4 or any other HTML/XML parser to provide us
        # with the raw content of the xml tag as they decode html entities
        # and all that, leaving us with a corrupted string.
        source = re.match("<.*>(.*)</.*>$", script, re.S).group(0)

        # Decode JScript.Encode encoding.
        if language in {"jscript.encode", "vbscript.encode"}:
            source = EncodedScriptFile(self.filepath).decode(source.encode())

        if len(source) > 65536:
            source = f"{source[:65536]}\r\n<truncated>"

        ret.append(source)
    return ret
def spaceReplace(inputString, MODFLAG):
    # OLD: $var= "EXAMPLE"
    # NEW: $var= "EXAMPLE"
    if MODFLAG == 0:
        MODFLAG = 0

    return re.sub(" +", " ", inputString), MODFLAG
def get_db_name_from_url(url):
    domain = get_domain(url)
    result = re.sub('\.', '_', domain)
    # url = url.encode('utf-8', 'replace')
    # result = hashlib.md5(url).hexdigest()
    return result + '.db'
def __reWildcard(regexp, string):
    """Wildcard-based regular expression system"""
    regexp = re.sub("\*+", "*", regexp)
    match = True
    if regexp.count("*") == 0:
        if regexp == string:
            return True
        else:
            return False
    blocks = regexp.split("*")
    start = ""
    end = ""
    if not regexp.startswith("*"):
        start = blocks[0]
    if not regexp.endswith("*"):
        end = blocks[-1]
    if start != "":
        if string.startswith(start):
            blocks = blocks[1:]
        else:
            return False
    if end != "":
        if string.endswith(end):
            blocks = blocks[:-1]
        else:
            return False
    blocks = [block for block in blocks if block != ""]
    if not blocks:
        return match
    for block in blocks:
        i = string.find(block)
        if i == -1:
            return False
        string = string[i + len(block):]
    return match
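# Rough illustration of the wildcard matcher above (inputs are made up):
#   __reWildcard("admin*panel", "admin-login-panel")  -> True   ("admin" prefix, "panel" suffix)
#   __reWildcard("admin*panel", "panel-admin")        -> False  (prefix check fails)
# Consecutive '*' are collapsed to one, then the prefix, suffix, and any middle
# blocks are checked in order against the string.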
def processText(text):
    '''
    strips some unwanted characters. Originally stripped the "references"
    section according to pubGeneric but it wasn't working. Splits full text
    strings by a simple sentence filter.
    '''
    text = re.sub(r'\x07|\r', '', text)
    #text = re.sub(r'\x07|\r|[(\s{0,3}\d{1,3}\s{0,3})(,\s{0,3}\d{1,3}\s{0,3}){0,7}\]', '', text)  # strip ^G, \r, and inline citations

    #sections = pubGeneric.sectionRanges(text)
    #if sections is not None:
    #    try:
    #        dropRange = sections['ack']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass
    #    try:
    #        dropRange = sections['refs']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass

    # split by period followed by capital letter within 3 proceeding characters
    previousThreshold = -2
    threshold = 0
    for threshold in re.finditer('\..?.?([A-Z])', text):
        threshold = threshold.start()
        yield text[previousThreshold + 2:threshold + 1]
        previousThreshold = threshold
    yield text[threshold:]
def process_entry(self, entry):
    out = {'type': 'feed'}

    # UGnich - mooduck
    if entry['published'].endswith('UT'):
        entry['published'] = '%sC' % entry['published']

    tz = timezone(settings.timezone)
    try:
        out['created'] = \
            dateutil.parser.parse(entry['published']).astimezone(tz)
    except ValueError:
        entry['created'] = \
            dateutil.parser.parse(entry['published'])

    out['link'] = entry['link']

    out['title'] = re.sub(r'&#(?P<c>\d+);',
                          lambda c: unichr(int(c.group('c'))),
                          unescape(entry['title'])) \
        if 'title' in entry else ''

    out['text'] = self.process_text(entry['summary'])

    out['tags'] = [t['label'] or t['term'] for t in entry['tags']] \
        if 'tags' in entry else []

    return out
def replace_placeholders(string, item, match):
    """Replaces placeholders in the string."""
    if isinstance(item, praw.objects.Comment):
        string = string.replace('{{body}}', item.body)
    else:
        string = string.replace('{{body}}', item.selftext)
    string = string.replace('{{domain}}', getattr(item, 'domain', ''))
    string = string.replace('{{permalink}}', get_permalink(item))
    string = string.replace('{{subreddit}}', item.subreddit.display_name)
    if isinstance(item, praw.objects.Comment):
        string = string.replace('{{title}}', item.link_title)
    else:
        string = string.replace('{{title}}', item.title)
    string = string.replace('{{url}}', getattr(item, 'url', ''))
    if item.author:
        string = string.replace('{{user}}', item.author.name)
    else:
        string = string.replace('{{user}}', '[deleted]')

    # replace any {{match_##}} with the corresponding match groups
    string = re.sub(r'\{\{match-(\d+)\}\}', r'\\\1', string)
    if match:
        string = match.expand(string)

    return string
def clean_url(url, remove_arguments=True, domain=None, scheme=None):
    result = urllib.parse.unquote(url)
    # if '#' in result:
    #     i = result.find('#')
    #     result = result[:i]
    if domain or remove_arguments:
        if '?' in result:
            i = result.find('?')
            if domain:
                result1 = result[:i]
                result2 = result[i + 1:]
                if domain in result1 and remove_arguments:
                    result = result1
                elif domain in result2:
                    res_split = result2.split('=')
                    for r in res_split:
                        if domain in r:
                            result = r
                            if '&' in result:
                                i = result.find('&')
                                result = result[:i]
                            break
            else:
                result = result[:i]
    if scheme:
        if not re.match('https?://', result):
            result = scheme + '://' + result
    result = re.sub(' ', '', result)
    if result.endswith('/'):
        result = result[:-1]
    return result
def add_html_links(mentions, text):
    linked_text = ""
    mentions.sort(key=lambda x: x.start)
    dummy_char = "$"
    char_list = list(text)
    rabbi_dict = {}
    for m in mentions:
        if m.id_matches is None:
            continue
        rabbi_dict[m.start] = (text[m.start:m.end], m.id_matches)
        char_list[m.start:m.end] = list(dummy_char * (m.end - m.start))
    dummy_text = "".join(char_list)
    # assert len(dummy_text) == len(text), f"DUMMY {dummy_text}\nREAL {text}"

    def repl(match):
        try:
            mention, slugs = rabbi_dict[match.start()]
        except KeyError:
            print("KEYERROR", match.group())
            return match.group()
        # TODO find better way to determine if slug is in topics collection
        slug = slugs[0]
        other_slugs = slugs[1:]
        link = f"""<a href="https://www.sefaria.org/topics/{slug}" class="{"missing" if ':' in slug else "found"}">{mention}</a>"""
        if len(other_slugs) > 0:
            link += f'''<sup>{", ".join([f"""<a href="https://www.sefaria.org/topics/{temp_slug}" class="{"missing" if ':' in temp_slug else "found"}">[{i+1}]</a>""" for i, temp_slug in enumerate(other_slugs)])}</sup>'''
        return link

    linked_text = re.sub(r"\$+", repl, dummy_text)
    return linked_text
def render_pony(name, text, balloonstyle, width=80, center=False, centertext=False):
    pony = load_pony(name)
    balloon = link_l = link_r = ''
    if text:
        [link_l, link_r] = balloonstyle[-2:]
    for i, line in enumerate(pony):
        match = re.search('\$balloon([0-9]*)\$', line)
        if match:
            minwidth = int(match.group(1) or '0')
            pony[i:i+1] = render_balloon(text, balloonstyle, minwidth=minwidth, maxwidth=int(width/2),
                                         pad=str.center if centertext else str.ljust)
            break
    try:
        first = pony.index('$$$')
        second = pony[first+1:].index('$$$')
        pony[first:] = pony[first+1+second+1:]
    except:
        pass
    pony = [line.replace('$\\$', link_l).replace('$/$', link_r) for line in pony]
    indent = ''
    if center:
        ponywidth = max([len(re.sub(r'\x1B\[[0-9;]+m|\$.*\$', '', line)) for line in pony])
        indent = ' ' * int((width - ponywidth) / 2)
    wre = re.compile('((\x1B\[[0-9;]+m)*.){0,%s}' % width)
    reset = '\x1B[39;49m\n'
    return indent + (reset + indent).join([wre.search(line).group() for line in pony]) + reset
def processText(text):
    '''
    strips some unwanted characters. Originally stripped the "references"
    section according to pubGeneric but it wasn't working. Splits full text
    strings by a simple sentence filter.
    '''
    text = re.sub(r'\x07|\r', '', text)
    #text = re.sub(r'\x07|\r|[(\s{0,3}\d{1,3}\s{0,3})(,\s{0,3}\d{1,3}\s{0,3}){0,7}\]', '', text)  # strip ^G, \r, and inline citations

    #sections = pubGeneric.sectionRanges(text)
    #if sections is not None:
    #    try:
    #        dropRange = sections['ack']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass
    #    try:
    #        dropRange = sections['refs']
    #        text = text[:dropRange[0]] + text[dropRange[1]:]
    #    except KeyError:
    #        pass

    # split by period followed by capital letter within 3 proceeding characters
    previousThreshold = -2
    threshold = 0
    for threshold in re.finditer('\..?.?([A-Z])', text):
        threshold = threshold.start()
        yield text[previousThreshold+2:threshold+1]
        previousThreshold = threshold
    yield text[threshold:]
def _find_and_replace(self, date_string, captures):
    """
    :warning: when multiple tz matches exist the last sorted capture will trump
    :param date_string:
    :return: date_string, tz_string
    """
    # add timezones to replace
    # import pdb; pdb.set_trace()
    cloned_replacements = copy.copy(self.REPLACEMENTS)  # don't mutate
    if captures.get('timezones') is not None:
        for tz_string in captures.get('timezones', []):
            cloned_replacements.update({tz_string: ' '})

    date_string = date_string.lower()
    for key, replacement in cloned_replacements.items():
        # we really want to match all permutations of the key surrounded by whitespace chars except one
        # for example: consider the key = 'to'
        # 1. match 'to '
        # 2. match ' to'
        # 3. match ' to '
        # but never match r'(\s|)to(\s|)' which would make 'october' > 'ocber'
        date_string = re.sub(r'(?i)(^|\s)' + key + '(\s|$)', replacement, date_string)

    poptzstring = ''
    if captures.get('timezones') is not None:
        poptzstring = self._pop_tz_string(sorted(captures.get('timezones', [])))

    return date_string, poptzstring
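# Sketch of the whitespace-anchored replacement used in the loop above, assuming
# a REPLACEMENTS entry that maps 'to' to a single space (the concrete mapping
# lives elsewhere in the class):
#   re.sub(r'(?i)(^|\s)' + 'to' + '(\s|$)', ' ', 'march to october')
#   => 'march october'      # the 'to' inside 'october' is left untouched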
def html2md(s):
    h2t = HTML2Text()
    h2t.body_width = 0
    #h2t.ignore_links = True
    #h2t.ignore_images = True
    s = h2t.handle(s)
    s = re.sub(r'\!?\[\]\((?P<url>.+?)\)', lambda m: " %s " % m.group('url'), s)
    return s
def get_rabbi_regex(cls, rabbi):
    reg = rabbi.replace(cls.b_token,
                        f"(?:{u'|'.join(re.escape(b) for b in cls.b_replacements)})")
    for starter in cls.starting_replacements:
        starter = re.escape(starter)
        reg = re.sub(f'^{starter}', f"(?:{starter.lower()}|{starter})", reg)
    return reg
def extract_domain_name_from_db(file_):
    domain_name = file_[file_.rfind('/') + 1:]
    domain_name = domain_name.replace('_', '.')
    domain_name = re.sub('^(www\d?\.)', '', domain_name)
    domain_name = domain_name.replace('.db', '')
    # domain_name = domain_name.replace('.it', '')
    # domain_name = domain_name.replace('.com', '')
    # domain_name = domain_name.replace('.org', '')
    return domain_name
def get_principal_domain(url):  # extracts the "principal" domain from a URL; e.g. www.xxxxx.xx.it
    if not isinstance(url, str):
        url = str(url)
    url = urllib.parse.unquote(url)
    result = urlparse(url).hostname
    if result:
        result = re.sub('^(www\d?.)', '', result)
    return result
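# Quick example (made-up URL): the hostname is taken via urlparse and a leading
# 'www.' / 'www2.'-style prefix is stripped from it.
#   get_principal_domain('https://www2.example.co.uk/page?q=1')  -> 'example.co.uk'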
def parsesections(pattern, pattern_replace, section):
    sectionsref = re.search(pattern, section)
    while sectionsref:
        i1 = sectionsref.start(1)
        i2 = sectionsref.end(2)
        #print "found multiple secs at", i1, "-", i2
        section = section[:i1] + re.sub(pattern_replace[0],
                                        pattern_replace[1] % sectionsref.group(2),
                                        section[i1:i2]) + section[1+i2:]
        sectionsref = re.search(pattern, section)
    return section
def replace(self, m):
    return "%s%s%s%s%s%s" % (
        m.group("scheme"),
        m.group("pass"),
        m.group("authority"),
        m.group("undef"),
        m.group("query"),
        re.sub(r":", "%3a", m.group("fragment")),
    )
def ulogin():
    if env.user.id:
        raise AlreadyAuthorized

    sess = Session()

    if env.request.method == "POST":
        url = "http://ulogin.ru/token.php?token=%s&host=%s" % (env.request.args("token"), settings.domain)
        try:
            resp = urllib2.urlopen(url)
            data = dict.fromkeys(ULOGIN_FIELDS)
            data.update(json.loads(resp.read()))
            resp.close()
        except urllib2.URLError:
            return render("/auth/login.html", fields=ULOGIN_FIELDS, errors=["ulogin-fail"])

        try:
            env.user.authenticate_ulogin(data["network"], data["uid"])
            if env.user.id:
                return Response(redirect=referer())
        except NotAuthorized:
            pass

        login = data["nickname"].strip(u" -+.")
        if login:
            login = re.sub(r"[\._\-\+]+", "-", login)

        info = {
            "login": login,
            "network": data["network"],
            "uid": data["uid"],
            "name": ("%s %s" % (data["first_name"], data["last_name"])).strip(),
            "email": data["email"],
            "avatar": data["photo_big"],
            "birthdate": data["bdate"],
            "gender": True if data["sex"] == "2" else False if data["sex"] == "1" else None,
            "location": "%s, %s" % (data["city"], data["country"])
                        if data["city"] and data["country"]
                        else data["city"] or data["country"],
            "_nickname": data["nickname"],
            "_name": ("%s %s" % (data["first_name"], data["last_name"])).strip(),
            "_profile": data["profile"],
        }

        sess["reg_info"] = info
        sess.save()

    else:
        info = sess["reg_info"]
        if not info or not "network" in info or not "uid" in info:
            return Response(redirect="%s://%s/register" % (env.request.protocol, settings.domain))

    info["birthdate"] = parse_date(info["birthdate"]) or datetime.now() - timedelta(days=365 * 16 + 4)

    return render("/auth/register_ulogin.html", info=info)
async def async_get_oauth2_token(session: aiohttp.ClientSession, username: str, password: str) -> Dict:
    """Hackily get an oauth2 token until I can be bothered to do this correctly"""

    params = {
        'client_id': OAUTH2_CLIENT_ID,
        'response_type': 'code',
        'access_type': 'offline',
        'redirect_uri': OAUTH2_REDIRECT_URI,
    }

    async with session.get(f'{LOGIN_URL}/oauth2/auth', params=params) as resp:
        if 400 <= resp.status < 500:
            raise GeAuthError(await resp.text())
        if resp.status >= 500:
            raise GeServerError(await resp.text())
        resp_text = await resp.text()

    email_regex = (
        r'^\s*(\w+(?:(?:-\w+)|(?:\.\w+)|(?:\+\w+))*\@'
        r'[A-Za-z0-9]+(?:(?:\.|-)[A-Za-z0-9]+)*\.[A-Za-z0-9][A-Za-z0-9]+)\s*$'
    )
    clean_username = re.sub(email_regex, r'\1', username)

    etr = etree.HTML(resp_text)
    post_data = {
        i.attrib['name']: i.attrib['value']
        for i in etr.xpath("//form[@id = 'frmsignin']//input")
        if 'value' in i.keys()
    }
    post_data['username'] = clean_username
    post_data['password'] = password

    async with session.post(f'{LOGIN_URL}/oauth2/g_authenticate', data=post_data, allow_redirects=False) as resp:
        if 400 <= resp.status < 500:
            raise GeAuthError(await resp.text())
        if resp.status >= 500:
            raise GeServerError(await resp.text())
        code = parse_qs(urlparse(resp.headers['Location']).query)['code'][0]

    post_data = {
        'code': code,
        'client_id': OAUTH2_CLIENT_ID,
        'client_secret': OAUTH2_CLIENT_SECRET,
        'redirect_uri': OAUTH2_REDIRECT_URI,
        'grant_type': 'authorization_code',
    }
    auth = aiohttp.BasicAuth(OAUTH2_CLIENT_ID, OAUTH2_CLIENT_SECRET)

    async with session.post(f'{LOGIN_URL}/oauth2/token', data=post_data, auth=auth) as resp:
        if 400 <= resp.status < 500:
            raise GeAuthError(await resp.text())
        if resp.status >= 500:
            raise GeServerError(await resp.text())
        oauth_token = await resp.json()

    try:
        return {'Authorization': 'Bearer ' + oauth_token['access_token']}
    except KeyError:
        raise GeAuthError(f'Failed to get a token: {oauth_token}')
def normalize_text(cls, lang, s):
    # text = re.sub('<[^>]+>', ' ', text)
    if lang == 'en':
        s = cls.myunidecode(s)
    s = re.sub(cls.normalizing_reg, cls.normalizing_rep, s)
    # text = unidecode(text)
    # text = re.sub('\([^)]+\)', ' ', text)
    # text = re.sub('\[[^\]]+\]', ' ', text)
    # text = ' '.join(text.split())
    return s
def run(self, results):
    """Run Moloch to import pcap
    @return: nothing
    """
    self.key = "moloch"
    self.alerthash = {}
    self.fileshash = {}
    self.MOLOCH_CAPTURE_BIN = self.options.get("capture", None)
    self.MOLOCH_CAPTURE_CONF = self.options.get("captureconf", None)
    self.CUCKOO_INSTANCE_TAG = self.options.get("node", None)
    self.MOLOCH_USER = self.options.get("user", None)
    self.MOLOCH_PASSWORD = self.options.get("pass", None)
    self.MOLOCH_REALM = self.options.get("realm", None)
    self.MOLOCH_AUTH = self.options.get("auth", "digest")
    self.pcap_path = os.path.join(self.analysis_path, "dump.pcap")
    self.MOLOCH_URL = self.options.get("base", None)
    self.task_id = results["info"]["id"]
    self.custom = None

    if results["info"].has_key("machine") and results["info"]["machine"].has_key("name"):
        self.machine_name = re.sub(r"[\W]", "_", str(results["info"]["machine"]["name"]))
    else:
        self.machine_name = "Unknown"

    if results["info"].has_key("options") and results["info"]["options"].has_key("setgw"):
        self.gateway = re.sub(r"[\W]", "_", str(results["info"]["options"]["setgw"]))
    else:
        self.gateway = "Default"

    if results["info"].has_key("options") and results["info"].has_key("custom"):
        self.custom = re.sub(r"[\W]", "_", str(results["info"]["custom"]))

    if not os.path.exists(self.MOLOCH_CAPTURE_BIN):
        log.warning("Unable to Run moloch-capture: BIN File %s Does Not Exist" % (self.MOLOCH_CAPTURE_BIN))
        return

    if not os.path.exists(self.MOLOCH_CAPTURE_CONF):
        log.warning("Unable to Run moloch-capture Conf File %s Does Not Exist" % (self.MOLOCH_CAPTURE_CONF))
        return

    try:
        cmd = "%s -c %s -r %s -n %s -t %s:%s -t cuckoo_jtype:%s -t cuckoo_machine:%s -t cuckoo_gw:%s" % (
            self.MOLOCH_CAPTURE_BIN, self.MOLOCH_CAPTURE_CONF, self.pcap_path, self.CUCKOO_INSTANCE_TAG,
            self.CUCKOO_INSTANCE_TAG, self.task_id, self.task["category"], self.machine_name, self.gateway)
        if self.custom:
            cmd = cmd + " -t custom:%s" % (self.custom)
    except Exception, e:
        log.warning("Unable to Build Basic Moloch CMD: %s" % e)
def preprocess(self, lower=False, remove_hash=False, remove_mentions=False,
               remove_url=False, remove_newline=False):
    if lower:
        self.text_pp = self.text_pp.lower()
    if remove_hash:
        self.text_pp = self.text_pp.replace('#', '')
    if remove_mentions:
        self.text_pp = re.sub('@[a-z0-9_]+', '', self.text_pp)
    if remove_url:
        self.text_pp = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            '', self.text_pp)
    if remove_newline:
        self.text_pp = self.text_pp.replace('\r\n', ' ').replace(
            '\n', ' ').replace('\r', ' ')
    return self
def __init__(self, word_processor_type='default'):
    self.__allowed_types = ['none', 'default', 'open_text']

    # sanity checks for input
    assert word_processor_type in self.__allowed_types

    # assigning processing function
    if word_processor_type == 'none':
        self.__call__ = lambda x: x
    if word_processor_type == 'default':
        self.__call__ = lambda word: re.sub(r'[^\w_,.?@!$#\':\/\-()]|[,\'?@$#]{2,}', "", word)
    if word_processor_type == "open_text":
        self.__call__ = self.__open_text_cleaner
def remove_evil_links(pdf_data):
    """
    Removes all it-ebook's links and metadata from the passed PDF data.
    """
    pdf_data = pdf_data.encode("hex")

    # Remove each annotation element inside the PDF file (This removes the
    # "clickable" it-ebooks.info links)
    print 'Removing evil links'
    new_data = re2.sub(pattern, "", pdf_data)

    # Remove the actual links (link elements which are assigned to the annotations)
    new_data = new_data.replace("www.it-ebooks.info".encode("hex"), "")
    print 'Done'

    return new_data.decode("hex")
def build_options(self):
    """Generate analysis options.
    @return: options dict.
    """
    options = {}
    options["id"] = self.task.id
    options["ip"] = self.machine.resultserver_ip
    options["port"] = self.machine.resultserver_port
    options["category"] = self.task.category
    options["target"] = self.task.target
    options["package"] = self.task.package
    if self.task.package == "service":
        if "service-dll-of-interest" not in self.task.options:
            if self.task.options == "":
                self.task.options = "service-dll-of-interest=c:\\windows\\system32\\nwsapagent.dll"
            else:
                self.task.options += ",service-dll-of-interest=c:\\windows\\system32\\nwsapagent.dll"
    options["options"] = self.task.options
    options["enforce_timeout"] = self.task.enforce_timeout
    options["clock"] = self.task.clock
    options["terminate_processes"] = self.cfg.cuckoo.terminate_processes

    if not self.task.timeout or self.task.timeout == 0:
        options["timeout"] = self.cfg.timeouts.default
    else:
        options["timeout"] = self.task.timeout

    if self.task.category == "file":
        options["file_name"] = File(self.task.target).get_name()
        options["file_type"] = File(self.task.target).get_type()
        # if it's a PE file, collect export information to use in more smartly determining the right
        # package to use
        options["exports"] = ""
        if HAVE_PEFILE and ("PE32" in options["file_type"] or "MS-DOS executable" in options["file_type"]):
            try:
                pe = pefile.PE(self.task.target)
                if hasattr(pe, "DIRECTORY_ENTRY_EXPORT"):
                    exports = []
                    for exported_symbol in pe.DIRECTORY_ENTRY_EXPORT.symbols:
                        exports.append(re.sub(r'[^A-Za-z0-9_?@-]', '', exported_symbol.name))
                    options["exports"] = ",".join(exports)
            except:
                pass

    return options
def _generate_login(self):
    name = unidecode(self.get_info('name')).lower()
    if not name:
        name = re.sub(r'^\w+:/+', '', self._url.lower())
    name = re.sub('^\W+|\W+$', '', name)
    words = re.split(r'\W+', name)

    name = ''
    br = False
    for w in words[:]:
        if not name:
            _name = w
        else:
            _name = "%s-%s" % (name, w)
        if len(_name) <= 16:
            name = _name
        else:
            name = _name[:16]
            br = True
            break

    if br:
        try:
            ri = name.rindex('-')
        except ValueError:
            ri = 16
        if ri > 6:
            name = name[:ri]

    i = 0
    while True:
        # Append a numeric suffix until the login is free (format string
        # reconstructed; the original was redacted in this copy).
        login = '%s%s' % (name, i or '')
        try:
            User('login', login)
        except UserNotFound:
            return login
        i += 1
def on_call(self, call, process):
    if not self.check:
        return None
    if call["api"].startswith("RegQueryValueEx"):
        keyname = self.get_argument(call, "FullName")
        uninstall = "\\microsoft\\windows\\currentversion\\uninstall"
        if (keyname and uninstall in keyname.lower() and
                keyname.lower().endswith("displayname")):
            app = self.get_argument(call, "Data")
            if app:
                # Ignore language/architecture name segments
                buf = re.sub(r"\([^\)]+\)", "", app).strip()
                self.programs.add(buf)
def has_standalone_keywords(classified_event):
    solo_lines_regex = rules.GOOD_SOLO_LINE.hack_double_regex()[classified_event.boundaries]
    text = classified_event.search_text
    good_matches = set()
    for line in text.split('\n'):
        alpha_line = re.sub(r'\W+', '', line)
        if not alpha_line:
            continue
        remaining_line = solo_lines_regex.sub('', line)
        deleted_length = len(line) - len(remaining_line)
        if 0.5 < 1.0 * deleted_length / len(alpha_line):
            good_matches.add(solo_lines_regex.findall(line)[0])  # at most one keyword per line
    if len(good_matches) >= 2:
        return True, 'found good keywords on lines by themselves: %s' % set(good_matches)
    return False, 'no good keywords on lines by themselves'
def parse_tags(tags):
    if tags:
        tags = tags.strip(" \r\n\t*")
        if isinstance(tags, str):
            tags = tags.decode("utf-8")
        # tags = re.findall(r'[^\s*]+', tags)
        tags = filter(
            None,
            [t.replace(u"\xa0", " ").strip()[:64]
             for t in uniqify(re.split(r"(?<!\\)[\*,]", tags)[:10])]
        )
        if not tags:
            tags = None
    else:
        tags = []
    return map(lambda t: re.sub(r"\\,", ",", t), tags)
def test_bug_1140(self):
    # re.sub(x, y, u'') should return u'', not '', and
    # re.sub(x, y, '') should return '', not u''.
    # Also:
    # re.sub(x, y, unicode(x)) should return unicode(y), and
    # re.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    for x in 'x', u'x':
        for y in 'y', u'y':
            z = re.sub(x, y, u'')
            self.assertEqual(z, u'')
            self.assertEqual(type(z), unicode)
            #
            z = re.sub(x, y, '')
            self.assertEqual(z, '')
            self.assertEqual(type(z), str)
            #
            z = re.sub(x, y, unicode(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), unicode)
            #
            z = re.sub(x, y, str(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), type(y))
def beautify(self, s, opts=None):
    if opts != None:
        self.opts = opts

    if self.opts.brace_style not in ['expand', 'collapse', 'end-expand']:
        raise(Exception('opts.brace_style must be "expand", "collapse" or "end-expand".'))

    self.blank_state()

    while s and s[0] in [' ', '\t']:
        self.preindent_string += s[0]
        s = s[1:]

    self.input = self.unpack(s, opts.eval_code)

    parser_pos = 0
    while True:
        token_text, token_type = self.get_next_token()
        #print (token_text, token_type, self.flags.mode)
        if token_type == 'TK_EOF':
            break

        handlers = {
            'TK_START_EXPR': self.handle_start_expr,
            'TK_END_EXPR': self.handle_end_expr,
            'TK_START_BLOCK': self.handle_start_block,
            'TK_END_BLOCK': self.handle_end_block,
            'TK_WORD': self.handle_word,
            'TK_SEMICOLON': self.handle_semicolon,
            'TK_STRING': self.handle_string,
            'TK_EQUALS': self.handle_equals,
            'TK_OPERATOR': self.handle_operator,
            'TK_BLOCK_COMMENT': self.handle_block_comment,
            'TK_INLINE_COMMENT': self.handle_inline_comment,
            'TK_COMMENT': self.handle_comment,
            'TK_UNKNOWN': self.handle_unknown,
        }

        handlers[token_type](token_text)

        self.last_last_text = self.last_text
        self.last_type = token_type
        self.last_text = token_text

    sweet_code = self.preindent_string + re.sub('[\n ]+$', '', ''.join(self.output))
    return sweet_code
def extract_iocs(s):
    for desc, pattern in PATTERNS:
        m = pattern.findall(s)
        if m:
            # Hacked-up buxfix for multilayer Chr(Asc(Chr(Asc( which can
            # sometimes mess up our quoted string extraction / parsing.
            while "Chr(Asc(" in s:
                lastline = s
                s = re.sub(r'(?i)Chr\(Asc\((.+?)\)\)', r"\1", s)
                if s == lastline:
                    break
            # Return the line matched and not m because I prefer to have
            # context and not simply the IOC. This helps with the executable
            # file IOC, sometimes it's a save location!
            return desc, convert_to_printable(s)
    return None
def search_posts(text, user=None, private=None, bookmarks=False, offset=0, limit=20):
    text = re.sub(r'[\(\)\[\]\{\}!?\\/]+', ' ', text).strip()

    es = Elasticsearch()

    body = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        'private': False
                    }
                },
                'query': {
                    'query_string': {
                        'fields': ['text', 'tags'],
                        'query': text,
                        #'analyze_wildcard': True
                    }
                }
            }
        },
        #'sort': [{'created': {'order': 'desc'}}],
        'highlight': {
            'fields': {
                'text': {
                    'pre_tags': ['**'],
                    'post_tags': ['**'],
                    'number_of_fragments': 2,
                    'fragment_size': 200,
                }
            }
        }
    }

    res = es.search(index='point-posts,point-comments', from_=offset, size=limit+1, body=body)

    results = _plist(res)
    #results = res['hits']['hits']
    has_next = len(results) > limit
    total = res['hits']['total']

    from pprint import pprint
    pprint(res)

    return results[:limit], has_next, total
def handle_string(self, token_text):
    if self.last_type in ['TK_START_BLOCK', 'TK_END_BLOCK', 'TK_SEMICOLON']:
        self.append_newline()
    elif self.last_type == 'TK_WORD':
        self.append(' ')

    # Try to replace readable \x-encoded characters with their equivalent,
    # if it is possible (e.g. '\x41\x42\x43\x01' becomes 'ABC\x01').
    def unescape(match):
        block, code = match.group(0, 1)
        char = chr(int(code, 16))

        if block.count('\\') == 1 and char in string.printable:
            return char

        return block

    token_text = re.sub(r'\\{1,2}x([a-fA-F0-9]{2})', unescape, token_text)
    self.append(token_text)
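# Behaviour sketch for the unescape() helper above (example string invented):
# feeding the token text r'\x41\x42\x43\x01' through the re.sub call yields
# 'ABC' followed by the literal '\x01' -- '\x41'..'\x43' decode to the printable
# characters 'A', 'B', 'C', while '\x01' is not printable and its escape
# sequence is kept unchanged.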
def filtercomments(source):
    """NOT USED: strips trailing comments and put them at the top."""
    trailing_comments = []
    comment = True

    while comment:
        if re.search(r'^\s*\/\*', source):
            # Slice up to the end of the block comment (the original tuple
            # index source[0, ...] was a bug and would raise TypeError).
            comment = source[0:source.index('*/') + 2]
        elif re.search(r'^\s*\/\/', source):
            comment = re.search(r'^\s*\/\/', source).group(0)
        else:
            comment = None

        if comment:
            source = re.sub(r'^\s+', '', source[len(comment):])
            trailing_comments.append(comment)

    return '\n'.join(trailing_comments) + source