def __init__(self):
    ''' Initialize QBCredentials, this has to pass '''
    self.datastruct = {
        "SNNs": [],
        "SPs": [],
        "Users": [],
        "Logins": [],
        "_SNNs": ["Count", "SSN"],
        "_SPs": ["Count", "PASS"],
        "_Users": ["Count", "USER"],
        "_Logins": ["Count", "UserPass"]
    }
    self.ssn = rcompile(r"(\d{3}-\d{2}-\d{4})", I)
    self.strongpasswords = rcompile(
        r"((?=.*[A-Z])(?=.*[a-z])(?=.*\d)(?=.*[ \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~])[A-Za-z\d \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~]{10,24})"
    )
    self.username = rcompile(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{6,24}")
    self.logins = rcompile(
        r"((user|pass|login|sign)(.*)[^A-Za-z\d \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~])"
    )
    self.words = []
    self.wordsstripped = ""
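
# A brief, hypothetical usage sketch for two of the credential patterns above;
# the SSN and username values below are made up, not real data.
from re import compile as rcompile, I

ssn = rcompile(r"(\d{3}-\d{2}-\d{4})", I)
username = rcompile(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{6,24}")
print(ssn.findall("employee record 078-05-1120 on file"))  # ['078-05-1120']
print(bool(username.search("operator42")))                 # True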
def __init__(self, new_packets, **kwargs):
    super().__init__()
    self.new_packets = new_packets
    self.essid_filters = kwargs.get("essid_filters", None)
    self.essid_regex = kwargs.get("essid_regex", None)
    self.ignore_case = kwargs.get("ignore_case", False)
    self.display_func = kwargs.get("display_func", lambda p: None)
    self.storage_func = kwargs.get("storage_func", lambda p: None)
    self.debug = kwargs.get("debug", False)
    self.stop_parser = Event()
    if self.debug:
        print("[!] ESSID filters: " + str(self.essid_filters))
        print("[!] ESSID regex: " + str(self.essid_regex))
        print("[!] Ignore case: " + str(self.ignore_case))
    if self.essid_regex is not None:
        if self.ignore_case:
            self.essid_regex = rcompile(self.essid_regex, IGNORECASE)
        else:
            self.essid_regex = rcompile(self.essid_regex)
    else:
        self.essid_regex = None
def rcompile_and_find(self, data, filename):
    ''' parse the detections and check them against wordsstripped '''
    with copen(filename, "r", encoding='utf8') as file:
        for _ in loads(file.read()):
            with ignore_excpetion(Exception):
                if "Type" in _ and "QREGEX" in _["Type"]:
                    _list = []
                    tempmatches = 0
                    for item in _["Detection"]:
                        if _["Options"]["Word"] == "Normal":
                            temp_value = rsearch(rcompile(r"{}".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        elif _["Options"]["Word"] != "Normal":
                            temp_value = rsearch(rcompile(r"\b{}\b".format(item), _["Options"]["Flag"]), self.wordsstripped)
                        if temp_value is not None:
                            _list.append(temp_value.group())
                            tempmatches += 1
                    if _list and tempmatches >= _["Options"]["Required"]:
                        data.append({
                            "Matched": tempmatches,
                            "Required": _["Options"]["Required"],
                            "Behavior": _["Name"],
                            "Detected": ', '.join(_list)
                        })
def complile_essid_regex(self):
    """ Returns the compiled version of the ESSID regex. """
    if self.essid_regex is not None:
        if self.ignore_case:
            return rcompile(self.essid_regex, IGNORECASE)
        return rcompile(self.essid_regex)
    return None
def startanalyzing(data):
    ''' start extracting ransom patterns '''
    for detectonroot in DETECTIONS:
        detect = 0
        temp_list = []
        for check in range(0, 15):
            randompick = choice(DETECTIONS[detectonroot])
            nextpick = DETECTIONS[detectonroot][(DETECTIONS[detectonroot].index(randompick) + 1) % len(DETECTIONS[detectonroot])]
            if search(rcompile(r"{}[ \x00\|]{}".format(randompick, nextpick), I), data["StringsRAW"]["wordsstripped"]):
                temp_list.append("({} {})".format(randompick, nextpick))
                detect += 1
        if detect >= 5:
            data["QBDETECT"]["Detection"].append({
                "Count": detect,
                "Offset": "Unavailable",
                "Rule": "Ransom",
                "Match": ", ".join(temp_list),
                "Parsed": None
            })
        else:
            detect = 0
            temp_list = []
            for check in range(0, 15):
                randompick1 = choice(DETECTIONS[detectonroot])
                randompick2 = choice(DETECTIONS[detectonroot])
                if search(rcompile(r"{}[ \x00\|]{}".format(randompick1, randompick2), I), data["StringsRAW"]["wordsstripped"]):
                    temp_list.append("({} {})".format(randompick1, randompick2))
                    detect += 1
            if detect >= 5:
                data["QBDETECT"]["Detection"].append({
                    "Count": detect,
                    "Offset": "Unavailable",
                    "Rule": "Ransom",
                    "Match": ", ".join(temp_list),
                    "Parsed": None
                })
def get_objects(self, data, buffer) -> (list, list):
    ''' get objects from rtf by regex '''
    temp_x = rcompile(rb'\\objdata\b', DOTALL | MULTILINE)
    temp_list = []
    temp_list_objects = []
    for _ in finditer(temp_x, buffer):
        start, position = _.span()
        position += 1
        startcurlybracket = 0
        endcurlybracket = 0
        for item in range(position, position + len(buffer[position:])):
            if chr(buffer[item]) == "{":
                startcurlybracket += 1
            if chr(buffer[item]) == "}":
                endcurlybracket += 1
            if startcurlybracket == 0 and endcurlybracket == 1 or endcurlybracket > startcurlybracket:
                whitespaces = sub(rb'\s+', b'', buffer[position:item])
                temp = unhexlify(whitespaces)
                tempdecoded = sub(br'[^\x20-\x7F]+', b'', temp)
                temp_list_objects.append(tempdecoded)
                temp_list.append({
                    "Len": len(buffer[position:item]),
                    "Parsed": tempdecoded.decode("utf-8", errors="ignore")
                })
                break
    return temp_list, temp_list_objects
def preInterpretFeedbackSyntax(text):
    result = ''
    for line in text.split('\n'):
        if line.startswith('#Aufgabe'):
            result += '<br><u><b>Aufgabe %s</b></u><br><br>' % line.split(' ')[1] + '\n'
        elif rcompile(r'#\[(\+|-)?\d+(\.\d+)?\]$').match(line):
            num = float(line[2:-1])
            numseq = '%s pkt' % str(num) if num == -1 or num == 1 else '%s pkte' % str(num)
            result += '<br>[%s]<br>' % numseq + '\n'
        elif line.startswith('#SUM['):
            num = float(line[5:-1])
            numseq = '%s pkt' % str(num) if num == -1 or num == 1 else '%s pkte' % str(num)
            result += '<br><b>SUM</b>:[%s]' % numseq + '\n'
        elif line.startswith('#>Code'):
            result += '<code>\n'
        elif line.startswith('#<'):
            result += '</code>\n'
        else:
            result += line + '\n'
    return result
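
# A small usage sketch for preInterpretFeedbackSyntax above, assuming it is in
# scope; the feedback text is a made-up example.
feedback = "#Aufgabe 1\nGood solution\n#[2.5]\n#SUM[2.5]"
print(preInterpretFeedbackSyntax(feedback))
# <br><u><b>Aufgabe 1</b></u><br><br>
# Good solution
# <br>[2.5 pkte]<br>
# <br><b>SUM</b>:[2.5 pkte]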
def __init__(self):
    ''' initialize class and datastruct, this has to pass '''
    self.datastruct = {
        "Snort": [],
        "_Snort": ["time", "sid", "revision", "class", "priority", "protocol", "src", "dest", "msg"]
    }
    self.snortpattern = rcompile(r'(\d{2}\/\d{2}\/\d{2}\-\d{2}\:\d{2}\:\d{2}\.\d{6})\s+\[\*\*\]\s+\[(\d+)\:([\d]+)\:(\d+)\]\s+(.+)\s+\[\*\*\]\s+\[(.+)\]\s+\[(.+)\]\s+\{(.+)\}\s+([\d.:]+)\s+\-\>\s+([\d.:]+)')
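
# A hypothetical usage sketch of the snortpattern regex above; the alert line
# below is made up but follows Snort's "fast" alert log layout.
from re import compile as rcompile, I

snortpattern = rcompile(r'(\d{2}\/\d{2}\/\d{2}\-\d{2}\:\d{2}\:\d{2}\.\d{6})\s+\[\*\*\]\s+\[(\d+)\:([\d]+)\:(\d+)\]\s+(.+)\s+\[\*\*\]\s+\[(.+)\]\s+\[(.+)\]\s+\{(.+)\}\s+([\d.:]+)\s+\-\>\s+([\d.:]+)')
line = ('01/02/21-12:34:56.789012 [**] [1:2100498:7] ATTACK_RESPONSE id check returned root '
        '[**] [Classification: Potentially Bad Traffic] [Priority: 2] {TCP} '
        '192.168.1.10:80 -> 192.168.1.20:54321')
match = snortpattern.search(line)
if match:
    # timestamp, sid, and message
    print(match.group(1), match.group(3), match.group(5))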
def intro(filename, link):
    ''' this function is needed for the home page intro '''
    intromarkdown = ""
    with ignore_excpetion(Exception):
        ret_request = get(link, verify=False, timeout=2)
        if ret_request.ok is True:
            intromarkdown = search(rcompile(r"\#\# Features.*", DOTALL), ret_request.text).group(0)
    if intromarkdown == "":
        with ignore_excpetion(Exception):
            readmefolder = path.abspath(path.join(path.dirname(__file__), filename))
            with open(readmefolder, "r", encoding="utf-8") as file:
                intromarkdown = search(rcompile(r"\#\# Features.*", DOTALL), file.read()).group(0)
    return intromarkdown
def readAttribute(gMDataPath, key):
    with open(getMetaDataPath(gMDataPath), 'r') as fd:
        for line in fd:
            if line.startswith(key):
                val = line.split('=')[1].strip()
                if rcompile(r'>(\d+|)*').match(val):
                    return modify(val, reverse=True)
                else:
                    return val
def findword(self, word, _print=False):
    ''' search for specific word in the files (case insensitive) '''
    temp_x = {}
    # 8 | 2 is re.MULTILINE | re.IGNORECASE
    pattern = rcompile(r'(^.*%s.*$)' % word, 8 | 2)
    temp_x['enterpriseattack'] = list(set(findall(pattern, self.enterprise)))
    temp_x['preattack'] = list(set(findall(pattern, self.preattack)))
    if _print:
        print(dumps(temp_x, indent=4, sort_keys=True))
    return temp_x
def __init__(self, tutLastname, sheetNr):
    self.__tutLastname = tutLastname
    self.__sheetNr = sheetNr
    self.__exts = ['zip', 'tar', 'tar.gz', 'rar']
    pat = self.__tutLastname.upper() \
        + '_Blatt' + str(sheetNr).zfill(2) \
        + r'_(.+-.+(\(.+-.+\))*)+' \
        + r'\.(zip|tar\.gz|tar|rar)'
    self.__pattern = rcompile(pat)
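
# A hypothetical sketch of the submission filename pattern above; the enclosing
# class is not shown, so the regex is rebuilt directly here for illustration,
# and the tutor name and filenames are invented examples.
from re import compile as rcompile

pat = ('MUELLER'
       + '_Blatt' + str(3).zfill(2)
       + r'_(.+-.+(\(.+-.+\))*)+'
       + r'\.(zip|tar\.gz|tar|rar)')
pattern = rcompile(pat)
print(bool(pattern.match('MUELLER_Blatt03_Max-Mustermann.zip')))     # True
print(bool(pattern.match('MUELLER_Blatt03_Max-Mustermann.tar.gz')))  # True
print(bool(pattern.match('mueller_blatt3_mustermann.zip')))          # False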
def __init__(self, filter=None, interface=None, logs=None):
    self.current_ip = ifaddresses(interface)[AF_INET][0]['addr'].encode('utf-8')
    self.current_mac = ifaddresses(interface)[AF_LINK][0]['addr'].encode('utf-8')
    self.filter = filter
    self.interface = interface
    self.method = "TCPUDP"
    self.ICMP_codes = [
        (0, 0, 'Echo/Ping reply'),
        (3, 0, 'Destination network unreachable'), (3, 1, 'Destination host unreachable'),
        (3, 2, 'Destination protocol unreachable'), (3, 3, 'Destination port unreachable'),
        (3, 4, 'Fragmentation required'), (3, 5, 'Source route failed'),
        (3, 6, 'Destination network unknown'), (3, 7, 'Destination host unknown'),
        (3, 8, 'Source host isolated'), (3, 9, 'Network administratively prohibited'),
        (3, 10, 'Host administratively prohibited'), (3, 11, 'Network unreachable for TOS'),
        (3, 12, 'Host unreachable for TOS'), (3, 13, 'Communication administratively prohibited'),
        (3, 14, 'Host Precedence Violation'), (3, 15, 'Precedence cutoff in effect'),
        (4, 0, 'Source quench'),
        (5, 0, 'Redirect Datagram for the Network'), (5, 1, 'Redirect Datagram for the Host'),
        (5, 2, 'Redirect Datagram for the TOS & network'), (5, 3, 'Redirect Datagram for the TOS & host'),
        (8, 0, 'Echo/Ping Request'),
        (9, 0, 'Router advertisement'),
        (10, 0, 'Router discovery/selection/solicitation'),
        (11, 0, 'TTL expired in transit'), (11, 1, 'Fragment reassembly time exceeded'),
        (12, 0, 'Pointer indicates the error'), (12, 1, 'Missing a required option'), (12, 2, 'Bad length'),
        (13, 0, 'Timestamp'), (14, 0, 'Timestamp Reply'),
        (15, 0, 'Information Request'), (16, 0, 'Information Reply'),
        (17, 0, 'Address Mask Request'), (18, 0, 'Address Mask Reply'),
        (30, 0, 'Information Request')
    ]
    self.allowed_ports = []
    self.allowed_ips = []
    self.common = rcompile('pass|user|login')
    self.setup_logger(logs)
def stripansi(stripstr: Union[str, list]) -> Union[str, list]:
    ansi_escape = rcompile(r"\x1B\[[0-?]*[ -/]*[@-~]")
    if isinstance(stripstr, list):
        newlist = []
        for line in stripstr:
            if isinstance(line, bytes):
                line = line.decode()
            newlist.append(ansi_escape.sub("", line).strip())
        return newlist
    else:
        return ansi_escape.sub("", stripstr).strip()
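
# A small usage sketch for stripansi above, assuming the function is in scope;
# the colored strings are made-up examples.
colored = "\x1b[31mERROR\x1b[0m: disk full"
print(stripansi(colored))                            # ERROR: disk full
print(stripansi([b"\x1b[1mbold\x1b[0m ", "plain"]))  # ['bold', 'plain']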
def compiled_essid_regex(self):
    """
    Returns the compiled version of the ESSID regex.

    The value is cached once computed.
    """
    # If there is a regex in the configuration and it hasn't been compiled yet.
    if self._compiled_essid_regex is None and self.essid_regex is not None:
        self.logger.debug("Compiling ESSID regex")
        if self.ignore_case:
            self.logger.debug("Ignoring case in ESSID regex")
            self._compiled_essid_regex = rcompile(self.essid_regex, IGNORECASE)
        else:
            self._compiled_essid_regex = rcompile(self.essid_regex)
    return self._compiled_essid_regex
def find_it_by_hash(self, md5, data):
    ''' look in the databases by hash '''
    items = []
    items = find_items("QBWindows", {"md5": rcompile(md5, I)})
    keys = [
        "Collection", "CompanyName", "FileDescription", "FileVersion",
        "InternalName", "LegalCopyright", "OriginalFilename", "ProductName",
        "ProductVersion", "md5", "entropy", "path"
    ]
    self.loop_wrapper(items, keys, data)
def test_compiled_essid_regex_with_a_case_sensitive_regex(self):
    """
    Tests 'compiled_essid_regex' with a case-sensitive regex.
    """
    config = Config()
    config.essid_regex = "Free Wi-Fi"
    with self.assertLogs(self.logger, level=logging.DEBUG):
        compiled_regex = config.compiled_essid_regex
    self.assertEqual(compiled_regex, rcompile(config.essid_regex))
def test_compile_essid_regex_with_a_case_sensitive_regex(self):
    """
    Tests 'complile_essid_regex' with a case-sensitive regex.
    """
    from re import compile as rcompile

    config = Config()
    config.essid_regex = "Free Wi-Fi"
    compiled_regex = config.complile_essid_regex()
    self.assertEqual(compiled_regex, rcompile(config.essid_regex))
def __init__(self):
    ''' Initialize QBCreditcards, this has to pass '''
    self.datastruct = {
        "AMERICANEXPRESS": [],
        "VISA": [],
        "MASTERCARD": [],
        "DISCOVER": [],
        "JCB": [],
        "DINERSCLUB": [],
        "_AMERICANEXPRESS": ["Count", "AmericanExpress"],
        "_VISA": ["Count", "Visa"],
        "_MASTERCARD": ["Count", "MasterCard"],
        "_DISCOVER": ["Count", "Discover"],
        "_JCB": ["Count", "JCB"],
        "_DINERSCLUB": ["Count", "DinersClub"]
    }
    self.detectionamericanexpress = rcompile(r'\b(?:3[47][0-9]{13})\b', I)
    self.detectionvisa = rcompile(r'\b(?:4[0-9]{12})(?:[0-9]{3})?\b', I)
    self.detectionmastercard = rcompile(r'\b(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}\b', I)
    self.detectiondiscover = rcompile(r'\b(?:6011\d{12})|(?:65\d{14})\b', I)
    self.detectionjcb = rcompile(r'\b(?:2131|1800|35[0-9]{3})[0-9]{11}?\b', I)
    self.detectiondinersclub = rcompile(r'\b3(?:0[0-5]|[68][0-9])[0-9]{11}\b', I)
    self.words = []
    self.wordsstripped = ""
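
# A small usage sketch for two of the card patterns above; the numbers are
# well-known public test numbers, not real cards.
from re import compile as rcompile, I

detectionvisa = rcompile(r'\b(?:4[0-9]{12})(?:[0-9]{3})?\b', I)
detectionmastercard = rcompile(r'\b(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}\b', I)
print(bool(detectionvisa.search("payment card 4111111111111111 on file")))        # True
print(bool(detectionmastercard.search("payment card 5555555555554444 on file")))  # True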
def __init__(self):
    ''' Initialize QBDGA, this has to pass '''
    self.datastruct = {
        "Repeated": [],
        "LowFreqLetters": [],
        "ConsonantsRow": [],
        "Consonants": [],
        "Encryption": [],
        "Symbols": [],
        "Numbers": [],
        "Long": [],
        "Entropy": [],
        "_Repeated": ["Length", "Repeated"],
        "_LowFreqLetters": ["Count", "Letters", "URL"],
        "_ConsonantsRow": ["Groups", "Row", "URL"],
        "_Consonants": ["Count", "Letters", "URL"],
        "_Encryption": ["Type", "Detected", "URL"],
        "_Symbols": ["Count", "Symbols", "URL"],
        "_Numbers": ["Count", "Numbers", "URL"],
        "_Long": ["Length", "URL"],
        "_Entropy": ["Entropy", "URL"]
    }
    self.detectionlowfreq = rcompile(r"[vkjxqz]")
    self.detectionconsonantslettersinrow = rcompile(r"[bcdfghjklmnpqrstvwxyz]{4,}")
    self.detectionconsonants = rcompile(r"[bcdfghjklmnpqrstvwxyz]")
    self.detectionhex = rcompile(r'([0-9a-fA-F]{4,})', I)
    self.detectionsymbols = rcompile(r'[_\-~]', I)
    self.detectionnumbers = rcompile(r'[\d]', I)
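
# A brief sketch of two DGA heuristics above applied to one made-up hostname.
from re import compile as rcompile, I

detectionconsonantslettersinrow = rcompile(r"[bcdfghjklmnpqrstvwxyz]{4,}")
detectionhex = rcompile(r'([0-9a-fA-F]{4,})', I)
host = "xkwtrbqz7f3a9c.example.com"
print(detectionconsonantslettersinrow.findall(host))  # ['xkwtrbqz']
print(detectionhex.findall(host))                     # ['7f3a9c']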
def __init__(self, new_packets, essid_filters=None, essid_regex=None,
             ignore_case=False, display_func=lambda p: None,
             storage_func=lambda p: None, debug=False):
    super().__init__()
    self.new_packets = new_packets
    self.essid_filters = essid_filters
    self.display_func = display_func
    self.storage_func = storage_func
    self.stop_parser = Event()
    if debug:
        print("[!] ESSID filters: " + str(self.essid_filters))
        print("[!] ESSID regex: " + str(essid_regex))
        print("[!] Ignore case: " + str(ignore_case))
    if essid_regex is not None:
        if ignore_case:
            self.essid_regex = rcompile(essid_regex, IGNORECASE)
        else:
            self.essid_regex = rcompile(essid_regex)
    else:
        self.essid_regex = None
def __init__(self):
    ''' initialize class and make detections path '''
    self.intell = path.abspath(path.join(path.dirname(__file__), 'detections'))
    if not self.intell.endswith(path.sep):
        self.intell = self.intell + path.sep
    if not path.isdir(self.intell):
        mkdir(self.intell)
    self.ipv4privateonelinebad = rcompile(r"^(10|127|169\.254|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)\..*", I)
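
# A quick sketch showing the private/loopback IPv4 matcher above in use; the
# addresses are illustrative samples.
from re import compile as rcompile, I

ipv4privateonelinebad = rcompile(r"^(10|127|169\.254|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)\..*", I)
for ip in ("10.0.0.5", "192.168.1.10", "172.20.3.4", "8.8.8.8"):
    print(ip, bool(ipv4privateonelinebad.match(ip)))
# 10.0.0.5 True / 192.168.1.10 True / 172.20.3.4 True / 8.8.8.8 False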
def test_compile_essid_regex_with_a_case_insensitive_regex(self):
    """
    Tests 'complile_essid_regex' with a case-insensitive regex.
    """
    from re import compile as rcompile, IGNORECASE

    config = Config()
    config.essid_regex = "Free Wi-Fi"
    config.ignore_case = True
    compiled_regex = config.complile_essid_regex()
    self.assertEqual(compiled_regex, rcompile(config.essid_regex, IGNORECASE))
def find_it_from_words(self, data):
    ''' look in the databases by words '''
    items = []
    keys = [
        "Collection", "FileDescription", "InternalName", "OriginalFilename",
        "ProductName", "md5", "entropy", "path"
    ]
    for word in self.words:
        # pass on "unterminated character set at position 1" some words are not escaped
        with ignore_excpetion(Exception):
            items = find_items("QBWindows", {
                "$or": [{"InternalName": rcompile(word, I)},
                        {"OriginalFilename": rcompile(word, I)},
                        {"md5": rcompile(word, I)}]
            })
            self.loop_wrapper(items, keys, data)
def listCoursesOf(self, pFacName='Informatik', pSFacName='Informatik'):
    subFacLink = self.getSubFacility(pFacName, pSFacName)['Link']
    r = self.__session.get(subFacLink)
    soup = BeautifulSoup(r.text, 'html.parser')
    elem = soup.find('div', attrs={'class': 'course_category_tree clearfix '})
    rows = elem.findAll(
        'a',
        href=rcompile(r'https://elearning2\.uni-heidelberg\.de/course/view\.php\?id=\d+'))
    return [x.text for x in rows]
def createAllCredits(path):
    fm = FolderManager()
    subs, imp = findAllSubmissionFolders(path)
    total = []
    extra = []
    curParent = path
    sheetNr = ''
    while True:
        tmp = pdirname(curParent)
        if tmp == curParent:
            raise ValueError("U can't use this script here!")
        curParent = tmp
        if rcompile(r'Blatt_\d\d').match(pbasename(curParent)):
            sheetNr = pbasename(curParent).split('_')[1]
            break
    for subpath in subs:
        for tup in createCreditsText(subpath):
            total.append((tup[0], [('%.2f' % x).zfill(5) for x in tup[1]]))
    for subpath in imp:
        for tup in createCreditsText(subpath):
            searchRes = fm.findStudentByName(tup[0], status='Imported')
            if len(searchRes) == 1:
                extra.append((searchRes[0], [('%.2f' % x).zfill(5) for x in tup[1]]))
            else:
                total.append((tup[0], [('%.2f' % x).zfill(5) for x in tup[1]]))
    print('Create %s ...' % ('AllCredits_%s.txt' % sheetNr), end='', flush=True)
    writeNameCreditLists(path, 'AllCredits_%s.txt' % sheetNr, total)
    print('[OK]')
    extTutDict = {}
    for elem in extra:
        eTutor = elem[0]['ExtTut']
        if eTutor in extTutDict:
            extTutDict[eTutor].append((elem[0]['Name'], elem[1]))
        else:
            extTutDict[eTutor] = [(elem[0]['Name'], elem[1])]
    for k, v in extTutDict.items():
        fname = 'AllCredits_%s_%s_%s.txt' % (sheetNr, fm.getTFirstName(), k.replace(' ', '-'))
        print('Create %s ...' % fname, end='', flush=True)
        writeNameCreditLists(path, fname, v)
        print('[OK]')
def analyze(self, data, _data, filename):
    ''' start analyzing logic '''
    listheaders = []
    listpayloads = []
    for _ in data:
        listheaders.append(str(_["fields"]))
        listpayloads.append(str(_["payload"]))
    headers = "".join(listheaders)
    content = "".join(listpayloads)
    with copen(self.intell + filename, "r", encoding='utf8') as file:
        for _ in loads(file.read()):
            with ignore_excpetion(Exception):
                if "Type" in _ and "WQREGEX" in _["Type"]:
                    if _["Options"]["Word"] == "Normal" and "Header_Detection" in _:
                        temp_var = search(rcompile(r"{}".format(_["Header_Detection"]), _["Options"]["Flag"]), headers)
                    elif _["Options"]["Word"] == "Normal" and "Content_Detection" in _:
                        temp_var = search(rcompile(r"{}".format(_["Content_Detection"]), _["Options"]["Flag"]), content)
                    if temp_var is not None:
                        _data.append({
                            "Matched": "1",
                            "Required": _["Options"]["Required"],
                            "WAF": _["Name"],
                            "Detected": temp_var.group()
                        })
    self.check_proxy_bypass(data, _data)
def __init__(self):
    ''' Initialize QBPhishing, this has to pass '''
    self.datastruct = {
        "Suspicious": [],
        "Spelling count": [],
        "Spelling": [],
        "Symbols": [],
        "_Spelling count": ["Total", "Misspelled"],
        "_Spelling": ["Count", "Word", "Misspelled"],
        "_Suspicious": ["Count", "Words"],
        "_Symbols": ["Count", "Symbol"]
    }
    self.suspiciouswords = rcompile(r"uniq|18\+|action|act|additional income|affordable|amazed|apply|avoid|babe|be amazed|beneficiary|billing|billion|bonus|boss|buy|call|cancel|cash|casino|certified|cheap|claim|clearance|click|collect|compare rates|confirm|congrat|congratulations|credit|cures|customer|deal|dear|debt|direct email|discount|don\'t delete|don\'t hesitate|double your income|earn|experience|expire|extra|fantastic|fgift|free|freedom|friend|get it|great|guarantee|hello|income|increase |instant|investment|iphone|junk|limited|lose|log|lowest price|lucky|luxury|make money|medicine|mobile|money|msg|name|no credit check|now|obligation|offer|only|open|order|password|please|presently|problem|promise|purchase|quote|rates|refinance|refund|remove|reply|request|risk-free|sales|satisfaction|save|score|serious|sex|sexy|sign|sms|spam|special|subscription|success|supplies|take action|terms|text|ticket|traffic|trial|txt|unlimited|update|urgent|weight|win|winner|won")
    self.wordsstripped = ""
def findCourse(self, pFacName='Informatik', pSFacName='Informatik', course='Datenstrukt'):
    subFacLink = self.getSubFacility(pFacName, pSFacName)['Link']
    r = self.__session.get(subFacLink)
    soup = BeautifulSoup(r.text, 'html.parser')
    rows = soup.findAll(
        'a',
        href=rcompile(r'https://elearning2\.uni-heidelberg\.de/course/view\.php\?id=\d+'))
    for row in rows:
        if course in row.text:
            return row['href']
def markHyperlinks(txt):
    res = ''
    pattern = rcompile('.*https?://.*')
    for line in txt.split('\n'):
        if res != '':
            res += '\n'
        for word in line.split(' '):
            if pattern.match(word):
                res += (' ' if res != '' else '') + toHyperLink(word)
            else:
                res += (' ' if res != '' else '') + word
    return res
from dateutil.parser import parse as dateParse
import re
from time import mktime
from CONFIG import *
from urllib import quote, unquote
import subprocess
from os import devnull as dev_NULL
from difflib import ndiff
dev_NULL = file(dev_NULL)
from re import compile as rcompile
from re import I as ignorecase

rarchiveparse = rcompile(r'http\:\/\/web\.archive\.org\/web\/\d{14}/((http[s]?|ftp):\/)?\/?(?P<domain>[^:\/\s]+)(:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<file>[\w\-\.]+[^#?\s]+)(?P<parameters>\?([^#]*))?(#(.*))?')
rurlparse = rcompile(r'((http[s]?|ftp):\/)?\/?(?P<domain>[^:\/\s]+)(:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<file>[\w\-\.]+[^#?\s]+)(?P<parameters>\?([^#]*))?(#(.*))?')
rlink = rcompile(r'(http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
rdate = rcompile(r'[^\n\r]{0,130}(retrieved|downloaded|accessed)( on)?[ :](?P<date>((Jan(uary)?[ \.\-,\\]+([012]?\d|3[01])|Feb(ruary)?[ \.\-,\\]+([01]?\d|2[89])|Mar(ch)?[ \.\-,\\]+([012]?\d|3[01])|Apr(il)?[ \.\-,\\]+([012]?\d|30)|May[ \.\-,\\]+([012]?\d|3[01])|June?[ \.\-,\\]+([012]?\d|3[0])|July?[ \.\-,\\]+([012]?\d|3[01])|Aug(ust)?[ \.\-,\\]+([012]?\d|3[01])|Sep(tember)?[ \.\-,\\]+([012]?\d|30)|Oct(ober)?[ \.\-,\\]+([012]?\d|3[01])|Nov(ember)?[ \.\-,\\]+([012]?\d|3[0])|Dec(ember)?[ \.\-,\\]+([012]?\d|3[01]))[ \.\-,\\]+(19|20)?\d\d)|((([012]?\d|3[01])[ \.\-,\\]+Jan(uary)?|([01]?\d|2[89])[ \.\-,\\]+Feb(ruary)?|([012]?\d|3[01])[ \.\-,\\]+Mar(ch)?|([012]?\d|30)[ \.\-,\\]+Apr(il)?|([012]?\d|3[01])[ \.\-,\\]+May|([012]?\d|3[0])[ \.\-,\\]+June?|([012]?\d|3[01])[ \.\-,\\]+July?|([012]?\d|3[01])[ \.\-,\\]+Aug(ust)?|([012]?\d|30)[ \.\-,\\]+Sep(tember)?|([012]?\d|3[01])[ \.\-,\\]+Oct(ober)?|([012]?\d|30)[ \.\-,\\]+Nov(ember)?|([012]?\d|3[10])[ \.\-,\\]+Dec(ember)?)[ \.\-,\\]+(19|20)?\d\d)|(((1[3456789]|2\d|3[01])[\.\-,\\](0?\d|1[012]))[ \.\-,\\](19|20)\d\d)|(((0?\d|1[012])[\.\-,\\](1[3456789]|2\d|3[01]))[ \.\-,\\](19|20)\d\d))', ignorecase)
rarchivenear = rcompile('(archive|wayback|webcitation.org|webcite)')
rarchive = rcompile(r'([^ ]* [^\n\r\]]*\][\s\.\,\w])')
rurlstatus = rcompile(r'HTTP/[\d\.]+ (\d+)')
rurlnorm = rcompile(r'(https?://)?(www\.)?(.*)')


def isUsedInTextObject(textobjects, start, end):
    '''A simple function to search an array of textobjects for a string starting at *start* and ending at *end*, returning the first result.

    @param textobjects: An array of TextObjects that will be searched.
    @param start: The starting index of the string to be searched for.
    @param end: The ending index of the string to be searched for.'''
    for t in textobjects:
        if t.isContained(start, end):
            return t
    return None
#!/usr/bin/env python
# encoding: utf-8
from time import time
from re import compile as rcompile

RAW_URL = 'http://en.wikipedia.org/w/index.php?title=%s&action=raw'
URL_REQUEST_TIMEOUT = '10'
BOT_NAME = 'DASHBot'
SHUTOFF = 'User:DASHBot/Dead_Links/Shutoff'
PLURAL = lambda x: 's' if x > 1 else ''
BLACKLIST = rcompile('(https?\://(web\.archive.*|www\.webcitation.org))')
SETTINGS_PAGE = 'User:DASHBot/Dead_Links/Settings.yaml'
COMMENT_TEMPLET = "Added archives to %d link%s. See [[User:DASHBot/Dead links]] for details, settings, shutoff."
CITEWEB_TEMPLATES = '([wW]eb reference 4|[cC]itenewsauthor|[cC]ite newspaper|[wW]eb-reference|[wW]eb reference|[wW]eb citation|[cC]ite website|[cC]ite webpage|[cC]ite article|[cC]ite news-q|[cC]ite news2|[Cc]ite study|[cC]ute news|[cC]ite-news|[cC]ite blog|[Cc]ite news|[wW]eb cite|[lL]ien web|[cC]itenews|[cC]ite-web|[cC]ite url|[cC]ite new|[cC]ite Web|[cC]ita web|[cC]it news|[Cc]ite web|[Cc]itation|[cC]iteweb|[cC]it web|[cC] news|[cC] web|[Cc]ite|[wW]eb)'
HISTORY_SEARCH_WINDOW = 500

###
s_global_editRate = 12        # seconds between edits
s_useWaybackMachine = True    # Use the internet archive's wayback machine
s_nightlyEditLimit = 10       # The bot will limit itself to this many articles per run.
s_noHistorySearch = True      # Turn off the function that will look through an article's history to find an appropriate access date.
s_editBareLinks = False       # Allow the bot to add archives to some links (not used in reference or citeweb)
s_editBareReferences = False  # allow bot to add archives to (non citeweb) references
s_deltaTime = 12960000        # The maximum difference between access date and archive date, in seconds
s_noCheck_URL_status = True
        # tail of an enclosing handler factory; the surrounding definitions are
        # not included in this snippet
        for data in app(env, start_response):
            if data:
                httpfile.sendall(data, 15. + (len(data) >> 10))
        httpfile.close(15.)
    return handler


def wsgihandler(app, env={}, **environ):
    handler = httphandler(wsgi(app), env, **environ)
    return handler


RESPONSES = dict((int(i.split(None, 1)[0]), i) for i in ('100 Continue,101 Switching '
    'Protocols,200 OK,201 Created,202 Accepted,203 Non-Authoritative Information,204 No C'
    'ontent,205 Reset Content,206 Partial Content,300 Multiple Choices,301 Moved Permanen'
    'tly,302 Found,303 See Other,304 Not Modified,305 Use Proxy,307 Temporary Redirect,40'
    '0 Bad Request,401 Unauthorized,402 Payment Required,403 Forbidden,404 Not Found,405 '
    'Method Not Allowed,406 Not Acceptable,407 Proxy Authentication Required,408 Request '
    'Timeout,409 Conflict,410 Gone,411 Length Required,412 Precondition Failed,413 Reques'
    't Entity Too Large,414 Request-URI Too Long,415 Unsupported Media Type,416 Requested'
    ' Range Not Satisfiable,417 Expectation Failed,500 Internal Server Error,501 Not Impl'
    'emented,502 Bad Gateway,503 Service Unavailable,504 Gateway Timeout,505 HTTP Version'
    ' Not Supported').split(','))

NOCACHEHEADERS = {'Pragma': 'no-cache',
                  'Cache-Control': 'no-cache, must-revalidate',
                  'Expires': 'Mon, 26 Jul 1997 05:00:00 GMT'}

quoted_slash_split = rcompile("(?i)%2F").split
first = rcompile(r'^(\w+)[\s\t]+([^\r\n]+)[\s\t]+(HTTP/[01]\.[0-9])\r?\n$', I).match
header = rcompile(r'^[\s\t]*([^\r\n:]+)[\s\t]*:[\s\t]*([^\r\n]+)[\s\t]*\r?\n$').match

WSGIServer = WsgiServer = wsgiserver
HTTPServer = HttpServer = httpserver
__all__ = ['mainloop', 'exit', 'timeout', 'httpserver', 'wsgiserver',
           'HttpServer', 'WsgiServer', 'HTTPServer', 'WSGIServer']
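
# A short sketch of the request-line and header regexes above on sample input;
# the patterns are rebuilt locally so the example is self-contained.
from re import compile as rcompile, I

first = rcompile(r'^(\w+)[\s\t]+([^\r\n]+)[\s\t]+(HTTP/[01]\.[0-9])\r?\n$', I).match
header = rcompile(r'^[\s\t]*([^\r\n:]+)[\s\t]*:[\s\t]*([^\r\n]+)[\s\t]*\r?\n$').match
print(first("GET /index.html HTTP/1.1\r\n").groups())  # ('GET', '/index.html', 'HTTP/1.1')
print(header("Host: example.org\r\n").groups())        # ('Host', 'example.org')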
from glob import glob
from re import compile as rcompile
from zipfile import is_zipfile, ZipFile
from tarfile import is_tarfile, open as taropen
from string import Template
from itertools import chain

DEVNULL = open(devnull, "w")
BASE = expanduser("~/www")
USERNAME = getpwuid(getuid())[0]
(PORT_MIN, PORT_MAX, PORT_STEP) = (10000, 10100, 3)  # 0=HTTP, 1=HTTPS, 2=DB
(START, STOP, ISRUNNING) = ('start', 'stop', 'isrunning')
(WWW, DB) = ('www', 'db')

SITE_FORMAT = rcompile(r'^site-\w+$')
VALID_SITE_ID = rcompile(r'^[a-zA-Z]\w{0,23}$')
APPLICATION_FORMAT = rcompile(r'^(.*)\.(tar\.gz|tar\.bz2|zip)$')


def is_valid_site_id(site_id):
    """Check if a site identifier is valid."""
    return VALID_SITE_ID.match(site_id)


def bash_prelude():
    """Returns the full path to the Bash prelude script."""
    return join(BASE, 'bin', 'bashprelude')


def get_bin_directory(executable, hints):
    """Try to find the directory of an executable even if it's not in PATH."""
    bin_paths = environ["PATH"].split(pathsep)
    full_paths = [glob(join(path, executable)) for path in bin_paths + hints]
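
# A tiny usage sketch for the site-id and application patterns above; the site
# names and archive filename are invented examples.
from re import compile as rcompile

VALID_SITE_ID = rcompile(r'^[a-zA-Z]\w{0,23}$')
APPLICATION_FORMAT = rcompile(r'^(.*)\.(tar\.gz|tar\.bz2|zip)$')
print(bool(VALID_SITE_ID.match('blog01')))  # True
print(bool(VALID_SITE_ID.match('1blog')))   # False (must start with a letter)
m = APPLICATION_FORMAT.match('myapp-1.2.tar.gz')
print(m.group(1), m.group(2))               # myapp-1.2 tar.gz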
#!/usr/bin/env python
# encoding: utf-8
"""
Parser.py

Created by Tim Sears on 2012-02-19.
Copyright (c) 2012. All rights reserved.
"""
from CONFIG import *
from Classes import *
from re import compile as rcompile
from re import I as ignorecase

citeweb_templates = CITEWEB_TEMPLATES
rtemplatename = rcompile(r'{{[\s\n\r]*%s' % citeweb_templates)
rbaretemplatename = rcompile(citeweb_templates)
rreference = rcompile(r'\<[\s\n\r]*ref[^\>]*\>.*?\<[\s\n\r]*/[\s\n\r]*ref[\s\n\r]*>', ignorecase)
rtname = rcompile(citeweb_templates)
rparameternamed = rcompile(r'([A-Za-z0-9]*)[\s\n\r]*=')  # if an "accessed on" is used on the same line as a url, assume a proper access date

chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', '_', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
spaces = ' \t\n\r'


class TextObject(object):
    """The base class for text objects"""

    def __init__(self, start, end, text):
        super(TextObject, self).__init__()
        self.text = text
        self.startIndex = start
        self.endIndex = end
        self.urlArray = None
def retrieve():
    database = connect('database.db')
    topics, feeds, documents, titles, descriptions = [[], [], [], [], []]
    links, datetimes, thumbnails, doc_topics = [[], [], [], []]
    ### GET DATABASE DATA
    for row in database.execute('SELECT * FROM topics;'):
        topics.append([row[0], str(row[1])])
    for row in database.execute('SELECT fds_topic, fds_link FROM feeds;'):
        feeds.append([row[0], str(row[1])])
    for row in database.execute('SELECT doc_id, doc_datetime, doc_link FROM documents'):
        documents.append([row[0], str(row[1]), str(row[2]), []])
        for row2 in database.execute('SELECT tpd_topic FROM tpc_doc WHERE tpd_document = '+str(row[0])+';'):
            documents[-1][3].append(row2[0])
    ### GET RSS INFO
    for topic, link in feeds:
        html = urlopen(link).read()
        soup = BeautifulSoup(html)
        items = [item for item in soup.find_all('item')]
        for item in items:
            doc_topics.append(topic)
            if item.title is not None:
                title = item.title.findAll(text=True)
                if len(title) == 1:
                    titles.append(title[0].encode('ascii', errors='ignore'))
                else:
                    titles.append('')
            if item.description is not None:
                desc = item.description.findAll(text=True)
                if len(desc) == 1:
                    descriptions.append(desc[0].encode('ascii', errors='ignore'))
                else:
                    descriptions.append('')
            if item.guid is not None:
                link = item.guid.findAll(text=True)
                if len(link) == 1:
                    links.append(link[0].encode('ascii', errors='ignore'))
                else:
                    links.append('')
            if item.pubdate is not None:
                date = item.pubdate.findAll(text=True)
                if len(date) == 1:
                    datetimes.append(date[0].encode('ascii', errors='ignore'))
                else:
                    datetimes.append('')
            thumb = item.findChildren('media:thumbnail', {'width': '144'})
            if len(thumb) == 1:
                thumbnails.append(thumb[0]['url'].encode('ascii', errors='ignore'))
            else:
                thumbnails.append('')
    ### GET DOCUMENTS
    new = 0
    updated = 0
    for index in range(len(titles)):
        print('('+str(index+1).ljust(4) + str(doc_topics[index]).ljust(2) + ')'),
        datetime = parser.parse(datetimes[index])
        try:
            pos = [doc[2] for doc in documents].index(links[index])
        except:
            refresh = 0
        else:
            if doc_topics[index] not in documents[pos][3]:
                database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
                    ' ('+str(doc_topics[index])+', '+str(documents[pos][0])+');')
                documents[pos][3].append(doc_topics[index])
                database.commit()
                print('*'),
            if str(datetime) == str(documents[pos][1]):
                print('Unchanged Article')
                continue
            refresh = 1
        not_article = ('VIDEO', 'AUDIO', 'In pictures', 'Your pictures')
        if titles[index].startswith(not_article):
            print('Not an Article')
            continue
        html = urlopen(links[index]).read()
        soup = BeautifulSoup(html)
        title = str(soup.title)[7:-8].decode('utf-8').encode('ascii', errors='ignore')
        temp = ['BBC News', 'BBC History', 'BBC Science', 'BBC Consumer', 'BBC Arts', 'BBC Nature']
        if any(i in title for i in temp):
            division = 'story-body'
        elif 'BBC Sport' in title:
            division = 'article'
        elif 'BBC - Capital' in title:
            division = 'description|story-body'
        else:
            print('Website not known')
            continue
        content = [div for div in soup.find_all('div', {'class': rcompile(division)})]
        soup = BeautifulSoup(' '.join(list(map(str, content))))
        paragraphs = [p for p in soup.findAll('p')]
        soup = BeautifulSoup(' '.join(list(map(str, paragraphs))))
        [p.extract() for p in soup.findAll('p') if str(p).startswith('<p><strong>')]
        [p.extract() for p in soup.findAll('p', {'class': rcompile('disclaimer|terms')})]
        text = soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
        text = text.encode('ascii', errors='ignore')
        if text == '':
            print('Empty Text')
            continue
        text = rsub(' +', ' ', text)
        text = text.strip()
        text = '\n'.join([sentence for sentence in sent_tokenize(text)])
        if refresh == 1:
            documents[pos][1] = str(datetime)
            database.execute('DELETE FROM entities WHERE ent_document = '+str(documents[pos][0])+';')
            database.execute('UPDATE documents SET doc_processed = 0,'+\
                ' doc_datetime = \''+str(datetime)+'\','+\
                ' doc_thumbnail = \''+thumbnails[index]+'\','+\
                ' doc_title = \''+titles[index].replace('\'','\'\'')+'\','+\
                ' doc_description = \''+descriptions[index].replace('\'','\'\'')+'\','+\
                ' doc_text = \''+text.replace('\'','\'\'')+'\''+\
                ' WHERE doc_link = \''+links[index]+'\';')
            print('Update - '+titles[index])
            updated += 1
        else:
            documents.append([len(documents)+1, datetime, links[index], [doc_topics[index]]])
            database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
                ' ('+str(doc_topics[index])+', '+str(documents[-1][0])+');')
            database.execute('INSERT INTO documents (doc_datetime, doc_link, doc_thumbnail,'+\
                ' doc_title, doc_description, doc_text) VALUES (\''+\
                str(datetime)+'\',\''+links[index]+'\',\''+thumbnails[index]+'\',\''+\
                titles[index].replace('\'','\'\'')+'\',\''+\
                descriptions[index].replace('\'','\'\'')+'\',\''+\
                text.replace('\'','\'\'')+'\');')
            print('Insert - '+titles[index])
            new += 1
        database.commit()
    print new, "new,", updated, "updated."
def __init__(self, regex, callback):
    self.regex = regex
    self.compiled = rcompile(self.regex)
    self.callback = callback
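
# A hypothetical usage sketch for the regex/callback pair above; "Route" is an
# assumed name for the enclosing class, which is not shown in the snippet.
from re import compile as rcompile

class Route:
    def __init__(self, regex, callback):
        self.regex = regex
        self.compiled = rcompile(self.regex)
        self.callback = callback

route = Route(r"^/users/(\d+)$", lambda m: "user id " + m.group(1))
match = route.compiled.match("/users/42")
if match:
    print(route.callback(match))  # user id 42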