Example #1
    def __init__(self):
        '''
        Initialize QBCredentials, this has to pass
        '''
        self.datastruct = {
            "SNNs": [],
            "SPs": [],
            "Users": [],
            "Logins": [],
            "_SNNs": ["Count", "SSN"],
            "_SPs": ["Count", "PASS"],
            "_Users": ["Count", "USER"],
            "_Logins": ["Count", "UserPass"]
        }

        self.ssn = rcompile(r"(\d{3}-\d{2}-\d{4})", I)
        self.strongpasswords = rcompile(
            r"((?=.*[A-Z])(?=.*[a-z])(?=.*\d)(?=.*[ \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~])[A-Za-z\d \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~]{10,24})"
        )
        self.username = rcompile(r"(?=.*[A-Za-z])(?=.*\d)[A-Za-z\d]{6,24}")
        self.logins = rcompile(
            r"((user|pass|login|sign)(.*)[^A-Za-z\d \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\]\^\_\`\{\|\}\~])"
        )
        self.words = []
        self.wordsstripped = ""
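
A quick sanity check of the SSN pattern above (here rcompile/I stand for re.compile/re.I; the number is the classic made-up example):

from re import compile as rcompile, I

ssn = rcompile(r"(\d{3}-\d{2}-\d{4})", I)
print(ssn.findall("employee record: 078-05-1120"))  # ['078-05-1120']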
Example #2
        def __init__(self, new_packets, **kwargs):
            super().__init__()

            self.new_packets = new_packets
            self.essid_filters = kwargs.get("essid_filters", None)
            self.essid_regex = kwargs.get("essid_regex", None)
            self.ignore_case = kwargs.get("ignore_case", False)
            self.display_func = kwargs.get("display_func", lambda p: None)
            self.storage_func = kwargs.get("storage_func", lambda p: None)
            self.debug = kwargs.get("debug", False)

            self.stop_parser = Event()

            if self.debug:
                print("[!] ESSID filters: " + str(self.essid_filters))
                print("[!] ESSID regex: " + str(self.essid_regex))
                print("[!] Ignore case: " + str(self.ignore_case))

            if self.essid_regex is not None:
                if self.ignore_case:
                    self.essid_regex = rcompile(self.essid_regex, IGNORECASE)
                else:
                    self.essid_regex = rcompile(self.essid_regex)
            else:
                self.essid_regex = None
Example #3
 def rcompile_and_find(self, data, filename):
     '''
     parse the detections and check them against wordsstripped
     '''
     with copen(filename, "r", encoding='utf8') as file:
         for _ in loads(file.read()):
             with ignore_excpetion(Exception):
                 if "Type" in _ and "QREGEX" in _["Type"]:
                     _list = []
                     tempmatches = 0
                     for item in _["Detection"]:
                         if _["Options"]["Word"] == "Normal":
                             temp_value = rsearch(
                                 rcompile(r"{}".format(item),
                                          _["Options"]["Flag"]),
                                 self.wordsstripped)
                         elif _["Options"]["Word"] != "Normal":
                             temp_value = rsearch(
                                 rcompile(r"\b{}\b".format(item),
                                          _["Options"]["Flag"]),
                                 self.wordsstripped)
                         if temp_value is not None:
                             _list.append(temp_value.group())
                             tempmatches += 1
                     if _list and tempmatches >= _["Options"]["Required"]:
                         data.append({
                             "Matched": tempmatches,
                             "Required": _["Options"]["Required"],
                             "Behavior": _["Name"],
                             "Detected": ', '.join(_list)
                         })
Example #4
    def complile_essid_regex(self):
        """
        Returns the compiled version of the ESSID regex.
        """

        if self.essid_regex is not None:
            if self.ignore_case:
                return rcompile(self.essid_regex, IGNORECASE)

            return rcompile(self.essid_regex)

        return None
Example #5
def startanalyzing(data):
    '''
    start extracting ransom patterns
    '''
    for detectonroot in DETECTIONS:
        detect = 0
        temp_list = []
        for check in range(0, 15):
            randompick = choice(DETECTIONS[detectonroot])
            nextpick = DETECTIONS[detectonroot][
                (DETECTIONS[detectonroot].index(randompick) + 1) %
                len(DETECTIONS[detectonroot])]
            if search(
                    rcompile(r"{}[ \x00\|]{}".format(randompick, nextpick), I),
                    data["StringsRAW"]["wordsstripped"]):
                temp_list.append("({} {})".format(randompick, nextpick))
                detect += 1
        if detect >= 5:
            data["QBDETECT"]["Detection"].append({
                "Count": detect,
                "Offset": "Unavailable",
                "Rule": "Ransom",
                "Match": ", ".join(temp_list),
                "Parsed": None
            })
        else:
            detect = 0
            temp_list = []
            for check in range(0, 15):
                randompick1 = choice(DETECTIONS[detectonroot])
                randompick2 = choice(DETECTIONS[detectonroot])
                if search(
                        rcompile(
                            r"{}[ \x00\|]{}".format(randompick1, randompick2),
                            I), data["StringsRAW"]["wordsstripped"]):
                    temp_list.append("({} {})".format(randompick1,
                                                      randompick2))
                    detect += 1
            if detect >= 5:
                data["QBDETECT"]["Detection"].append({
                    "Count": detect,
                    "Offset": "Unavailable",
                    "Rule": "Ransom",
                    "Match": ", ".join(temp_list),
                    "Parsed": None
                })
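
The core test above joins two picked terms with a one-character separator class; a minimal sketch of that check with made-up terms:

from re import compile as rcompile, search, I

pair = rcompile(r"{}[ \x00\|]{}".format("files", "encrypted"), I)
print(bool(search(pair, "ALL YOUR FILES ENCRYPTED")))  # True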
Example #6
 def get_objects(self, data, buffer) -> (list, list):
     '''
     get objects from rtf by regex
     '''
     temp_x = rcompile(rb'\\objdata\b', DOTALL | MULTILINE)
     temp_list = []
     temp_list_objects = []
     for _ in finditer(temp_x, buffer):
         start, position = _.span()
         position += 1
         startcurlybracket = 0
         endcurlybracket = 0
         for item in range(position, position + len(buffer[position:])):
             if chr(buffer[item]) == "{":
                 startcurlybracket += 1
             if chr(buffer[item]) == "}":
                 endcurlybracket += 1
             if (startcurlybracket == 0 and endcurlybracket == 1) or \
                     endcurlybracket > startcurlybracket:
                 whitespaces = sub(rb'\s+', b'', buffer[position:item])
                 temp = unhexlify(whitespaces)
                 tempdecoded = sub(br'[^\x20-\x7F]+', b'', temp)
                 temp_list_objects.append(tempdecoded)
                 temp_list.append({
                     "Len": len(buffer[position:item]),
                     "Parsed": tempdecoded.decode("utf-8", errors="ignore")
                 })
                 break
     return temp_list, temp_list_objects
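
The essential steps above, isolated on a tiny made-up buffer: find the \objdata control word, take the hex payload up to the closing brace, and unhexlify it:

from binascii import unhexlify
from re import compile as rcompile

buf = rb'{\object\objdata 48656c6c6f}'
m = rcompile(rb'\\objdata\b').search(buf)
payload = buf[m.end():buf.index(b'}', m.end())]
print(unhexlify(payload.replace(b' ', b'')))  # b'Hello'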
Example #7
def preInterpretFeedbackSyntax(text):
    result = ''
    for line in text.split('\n'):
        if line.startswith('#Aufgabe'):
            result += '<br><u><b>Aufgabe %s</b></u><br><br>' % line.split(
                ' ')[1] + '\n'

        elif rcompile(r'#\[(\+|-)?\d+(\.\d+)?\]$').match(line):
            num = float(line[2:-1])
            numseq = '%s pkt' % str(
                num) if num == -1 or num == 1 else '%s pkte' % str(num)
            result += '<br>[%s]<br>' % numseq + '\n'

        elif line.startswith('#SUM['):
            num = float(line[5:-1])
            numseq = '%s pkt' % str(
                num) if num == -1 or num == 1 else '%s pkte' % str(num)
            result += '<br><b>SUM</b>:[%s]' % numseq + '\n'

        elif line.startswith('#>Code'):
            result += '<code>\n'

        elif line.startswith('#<'):
            result += '</code>\n'

        else:
            result += line + '\n'

    return result
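
Calling the function above on two lines of the grading shorthand ("Aufgabe" is German for "task", "pkt(e)" for "point(s)"):

print(preInterpretFeedbackSyntax('#Aufgabe 1\n#[2]'))
# <br><u><b>Aufgabe 1</b></u><br><br>
# <br>[2.0 pkte]<br>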
Example #8
    def __init__(self):
        '''
        initialize class and datastruct, this has to pass
        '''
        self.datastruct = {"Snort":[],
                           "_Snort":["time", "sid", "revision", "class", "priority", "protocol", "src", "dest", "msg"]}

        self.snortpattern = rcompile(r'(\d{2}\/\d{2}\/\d{2}\-\d{2}\:\d{2}\:\d{2}\.\d{6})\s+\[\*\*\]\s+\[(\d+)\:([\d]+)\:(\d+)\]\s+(.+)\s+\[\*\*\]\s+\[(.+)\]\s+\[(.+)\]\s+\{(.+)\}\s+([\d.:]+)\s+\-\>\s+([\d.:]+)')
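
A hedged check of the pattern against a typical Snort fast-alert line (all values made up):

from re import compile as rcompile

snortpattern = rcompile(r'(\d{2}\/\d{2}\/\d{2}\-\d{2}\:\d{2}\:\d{2}\.\d{6})\s+\[\*\*\]\s+\[(\d+)\:([\d]+)\:(\d+)\]\s+(.+)\s+\[\*\*\]\s+\[(.+)\]\s+\[(.+)\]\s+\{(.+)\}\s+([\d.:]+)\s+\-\>\s+([\d.:]+)')
line = '08/28/19-12:01:01.123456 [**] [1:2019401:2] ET SCAN Nmap [**] [Classification: Attempted Recon] [Priority: 2] {TCP} 192.168.1.5:61000 -> 10.0.0.2:80'
print(snortpattern.search(line).groups()[:4])  # ('08/28/19-12:01:01.123456', '1', '2019401', '2')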
Example #9
def intro(filename, link):
    '''
    this function is needed for the home page intro
    '''
    intromarkdown = ""
    with ignore_excpetion(Exception):
        ret_request = get(link, verify=False, timeout=2)
        if ret_request.ok:
            intromarkdown = search(rcompile(r"\#\# Features.*", DOTALL),
                                   ret_request.text).group(0)
    if intromarkdown == "":
        with ignore_excpetion(Exception):
            readmefolder = path.abspath(
                path.join(path.dirname(__file__), filename))
            with open(readmefolder, "r", encoding="utf-8") as file:
                intromarkdown = search(rcompile(r"\#\# Features.*", DOTALL),
                                       file.read()).group(0)
    return intromarkdown
Example #10
def readAttribute(gMDataPath, key):
    with open(getMetaDataPath(gMDataPath), 'r') as fd:
        for line in fd:
            if line.startswith(key):
                val = line.split('=')[1].strip()
                if rcompile(r'>(\d+|)*').match(val):
                    return modify(val, reverse = True)
                else:
                    return val
Example #11
 def findword(self, word, _print=False):
     '''
     search for specific word in the files (case insensitive)
     '''
     temp_x = {}
      pattern = rcompile(r'(^.*%s.*$)' % word, MULTILINE | IGNORECASE)
     temp_x['enterpriseattack'] = list(set(findall(pattern, self.enterprise)))
     temp_x['preattack'] = list(set(findall(pattern, self.preattack)))
     if _print:
         print(dumps(temp_x, indent=4, sort_keys=True))
     return temp_x
Example #12
    def __init__(self, tutLastname, sheetNr):
        self.__tutLastname = tutLastname
        self.__sheetNr = sheetNr
        self.__exts = ['zip', 'tar', 'tar.gz', 'rar']

        pat = self.__tutLastname.upper()\
                + '_Blatt' + str(sheetNr).zfill(2)\
                + r'_(.+-.+(\(.+-.+\))*)+'\
                + r'\.(zip|tar\.gz|tar|rar)'

        self.__pattern = rcompile(pat)
Example #13
	def __init__(self,filter=None,interface=None,logs=None):
		self.current_ip = ifaddresses(interface)[AF_INET][0]['addr'].encode('utf-8')
		self.current_mac = ifaddresses(interface)[AF_LINK][0]['addr'].encode('utf-8')
		self.filter = filter
		self.interface = interface
		self.method = "TCPUDP"
		self.ICMP_codes = [
			(0, 0, 'Echo/Ping reply'),
			(3, 0, 'Destination network unreachable'),
			(3, 1, 'Destination host unreachable'),
			(3, 2, 'Destination protocol unreachable'),
			(3, 3, 'Destination port unreachable'),
			(3, 4, 'Fragmentation required'),
			(3, 5, 'Source route failed'),
			(3, 6, 'Destination network unknown'),
			(3, 7, 'Destination host unknown'),
			(3, 8, 'Source host isolated'),
			(3, 9, 'Network administratively prohibited'),
			(3, 10, 'Host administratively prohibited'),
			(3, 11, 'Network unreachable for TOS'),
			(3, 12, 'Host unreachable for TOS'),
			(3, 13, 'Communication administratively prohibited'),
			(3, 14, 'Host Precedence Violation'),
			(3, 15, 'Precedence cutoff in effect'),
			(4, 0, 'Source quench'),
			(5, 0, 'Redirect Datagram for the Network'),
			(5, 1, 'Redirect Datagram for the Host'),
			(5, 2, 'Redirect Datagram for the TOS & network'),
			(5, 3, 'Redirect Datagram for the TOS & host'),
			(8, 0, 'Echo/Ping Request'),
			(9, 0, 'Router advertisement'),
			(10, 0, 'Router discovery/selection/solicitation'),
			(11, 0, 'TTL expired in transit'),
			(11, 1, 'Fragment reassembly time exceeded'),
			(12, 0, 'Pointer indicates the error'),
			(12, 1, 'Missing a required option'),
			(12, 2, 'Bad length'),
			(13, 0, 'Timestamp'),
			(14, 0, 'Timestamp Reply'),
			(15, 0, 'Information Request'),
			(16, 0, 'Information Reply'),
			(17, 0, 'Address Mask Request'),
			(18, 0, 'Address Mask Reply'),
			(30, 0, 'Traceroute')]
		self.allowed_ports = []
		self.allowed_ips = []
		self.common = rcompile('pass|user|login') 
		self.setup_logger(logs)
Example #14
def stripansi(stripstr: Union[str, list]) -> Union[str, list]:
    ansi_escape = rcompile(r"\x1B\[[0-?]*[ -/]*[@-~]")
    if isinstance(stripstr, list):
        newlist = []
        for line in stripstr:
            if isinstance(line, bytes):
                line = line.decode()
            newlist.append(ansi_escape.sub("", line).strip())
        return newlist
    else:
        return ansi_escape.sub("", stripstr).strip()
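
Usage of the helper above on a colored string and on a list of raw terminal lines:

print(stripansi("\x1b[31merror\x1b[0m "))        # 'error'
print(stripansi([b"\x1b[1mbold\x1b[0m", "ok"]))  # ['bold', 'ok']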
Example #15
    def compiled_essid_regex(self):
        """
        Returns the compiled version of the ESSID regex.

        The value is cached once computed.
        """

        # If there is a regex in the configuration and it hasn't been compiled
        # yet.
        if self._compiled_essid_regex is None and self.essid_regex is not None:
            self.logger.debug("Compiling ESSID regex")

            if self.ignore_case:
                self.logger.debug("Ignoring case in ESSID regex")

                self._compiled_essid_regex = rcompile(self.essid_regex,
                                                      IGNORECASE)
            else:
                self._compiled_essid_regex = rcompile(self.essid_regex)

        return self._compiled_essid_regex
Example #16
 def find_it_by_hash(self, md5, data):
     '''
     look in the databases by hash
     '''
     items = []
     items = find_items("QBWindows", {"md5": rcompile(md5, I)})
     keys = [
         "Collection", "CompanyName", "FileDescription", "FileVersion",
         "InternalName", "LegalCopyright", "OriginalFilename",
         "ProductName", "ProductVersion", "md5", "entropy", "path"
     ]
     self.loop_wrapper(items, keys, data)
Example #17
    def test_compiled_essid_regex_with_a_case_sensitive_regex(self):
        """
        Tests 'compiled_essid_regex' with a case-sensitive regex.
        """

        config = Config()
        config.essid_regex = "Free Wi-Fi"

        with self.assertLogs(self.logger, level=logging.DEBUG):
            compiled_regex = config.compiled_essid_regex

        self.assertEqual(compiled_regex, rcompile(config.essid_regex))
Example #18
    def test_compile_essid_regex_with_a_case_sensitive_regex(self):
        """
        Tests 'complile_essid_regex' with a case-sensitive regex.
        """

        from re import compile as rcompile

        config = Config()
        config.essid_regex = "Free Wi-Fi"
        compiled_regex = config.complile_essid_regex()

        self.assertEqual(compiled_regex, rcompile(config.essid_regex))
Example #19
 def __init__(self):
     '''
     Initialize QBCreditcards, this has to pass
     '''
     self.datastruct = {
         "AMERICANEXPRESS": [],
         "VISA": [],
         "MASTERCARD": [],
         "DISCOVER": [],
         "JCB": [],
         "DINERSCLUB": [],
         "_AMERICANEXPRESS": ["Count", "AmericanExpress"],
         "_VISA": ["Count", "Visa"],
         "_MASTERCARD": ["Count", "MasterCard"],
         "_DISCOVER": ["Count", "Discover"],
         "_JCB": ["Count", "JCB"],
         "_DINERSCLUB": ["Count", "DinersClub"]
     }
     self.detectionamericanexpress = rcompile(r'\b(?:3[47][0-9]{13})\b', I)
     self.detectionvisa = rcompile(r'\b(?:4[0-9]{12})(?:[0-9]{3})?\b', I)
     self.detectionmastercard = rcompile(
         r'\b(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}\b',
         I)
     self.detectiondiscover = rcompile(r'\b(?:6011\d{12}|65\d{14})\b', I)
     self.detectionjcb = rcompile(r'\b(?:2131|1800|35[0-9]{3})[0-9]{11}?\b',
                                  I)
     self.detectiondinersclub = rcompile(
         r'\b3(?:0[0-5]|[68][0-9])[0-9]{11}\b', I)
     self.words = []
     self.wordsstripped = ""
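
A quick check of the Visa pattern above with the well-known 4111... test number (the groups are non-capturing, so findall returns whole matches):

from re import compile as rcompile, I

detectionvisa = rcompile(r'\b(?:4[0-9]{12})(?:[0-9]{3})?\b', I)
print(detectionvisa.findall("test pan: 4111111111111111"))  # ['4111111111111111']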
Example #20
    def __init__(self):
        '''
        Initialize QBDGA, this has to pass
        '''
        self.datastruct = {
            "Repeated": [],
            "LowFreqLetters": [],
            "ConsonantsRow": [],
            "Consonants": [],
            "Encryption": [],
            "Symbols": [],
            "Numbers": [],
            "Long": [],
            "Entropy": [],
            "_Repeated": ["Length", "Repeated"],
            "_LowFreqLetters": ["Count", "Letters", "URL"],
            "_ConsonantsRow": ["Groups", "Row", "URL"],
            "_Consonants": ["Count", "Letters", "URL"],
            "_Encryption": ["Type", "Detected", "URL"],
            "_Symbols": ["Count", "Symbols", "URL"],
            "_Numbers": ["Count", "Numbers", "URL"],
            "_Long": ["Length", "URL"],
            "_Entropy": ["Entropy", "URL"]
        }

        self.detectionlowfreq = rcompile(r"[vkjxqz]")
        self.detectionconsonantslettersinrow = rcompile(
            r"[bcdfghjklmnpqrstvwxyz]{4,}")
        self.detectionconsonants = rcompile(r"[bcdfghjklmnpqrstvwxyz]")
        self.detectionhex = rcompile(r'([0-9a-fA-F]{4,})', I)
        self.detectionsymbols = rcompile(r'[_\-~]', I)
        self.detectionnumbers = rcompile(r'[\d]', I)
Example #21
        def __init__(self, new_packets, essid_filters=None, essid_regex=None,
                     ignore_case=False, display_func=lambda p: None,
                     storage_func=lambda p: None, debug=False):
            super().__init__()

            self.new_packets = new_packets
            self.essid_filters = essid_filters
            self.display_func = display_func
            self.storage_func = storage_func

            self.stop_parser = Event()

            if debug:
                print("[!] ESSID filters: " + str(self.essid_filters))
                print("[!] ESSID regex: " + str(essid_regex))
                print("[!] Ignore case: " + str(ignore_case))

            if essid_regex is not None:
                if ignore_case:
                    self.essid_regex = rcompile(essid_regex, IGNORECASE)
                else:
                    self.essid_regex = rcompile(essid_regex)
            else:
                self.essid_regex = None
Example #22
 def __init__(self):
     '''
     initialize class and make detections path
     '''
     self.intell = path.abspath(
         path.join(path.dirname(__file__), 'detections'))
     if not self.intell.endswith(path.sep):
         self.intell = self.intell + path.sep
     if not path.isdir(self.intell):
         mkdir(self.intell)
     self.ipv4privateonelinebad = rcompile(
         r"^(10|127|169\.254|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)\..*",
         I)
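
The private-range pattern above in isolation, against one private and one public address:

from re import compile as rcompile, I

priv = rcompile(r"^(10|127|169\.254|172\.1[6-9]|172\.2[0-9]|172\.3[0-1]|192\.168)\..*", I)
print(bool(priv.match("192.168.1.10")))  # True
print(bool(priv.match("8.8.8.8")))       # False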
Example #23
        def __init__(self, new_packets, essid_filters=None, essid_regex=None,
                     ignore_case=False, display_func=lambda p: None,
                     storage_func=lambda p: None, debug=False):
            super().__init__()

            self.new_packets = new_packets
            self.essid_filters = essid_filters
            self.display_func = display_func
            self.storage_func = storage_func

            self.stop_parser = Event()

            if debug:
                print("[!] ESSID filters: " + str(self.essid_filters))
                print("[!] ESSID regex: " + str(essid_regex))
                print("[!] Ignore case: " + str(ignore_case))

            if essid_regex is not None:
                if ignore_case:
                    self.essid_regex = rcompile(essid_regex, IGNORECASE)
                else:
                    self.essid_regex = rcompile(essid_regex)
            else:
                self.essid_regex = None
Example #24
    def test_compile_essid_regex_with_a_case_insensitive_regex(self):
        """
        Tests 'complile_essid_regex' with a case-insensitive regex.
        """

        from re import compile as rcompile, IGNORECASE

        config = Config()
        config.essid_regex = "Free Wi-Fi"
        config.ignore_case = True
        compiled_regex = config.complile_essid_regex()

        self.assertEqual(compiled_regex,
                         rcompile(config.essid_regex, IGNORECASE))
Example #25
 def find_it_from_words(self, data):
     '''
     look in the databases by words
     '''
     items = []
     keys = [
         "Collection", "FileDescription", "InternalName",
         "OriginalFilename", "ProductName", "md5", "entropy", "path"
     ]
     for word in self.words:
         # skip regex errors such as "unterminated character set"; some words are not escaped
         with ignore_excpetion(Exception):
             items = find_items(
                 "QBWindows", {
                     "$or": [{
                         "InternalName": rcompile(word, I)
                     }, {
                         "OriginalFilename": rcompile(word, I)
                     }, {
                         "md5": rcompile(word, I)
                     }]
                 })
             self.loop_wrapper(items, keys, data)
Example #26
    def listCoursesOf(self, pFacName='Informatik', pSFacName='Informatik'):
        subFacLink = self.getSubFacility(pFacName, pSFacName)['Link']
        r = self.__session.get(subFacLink)

        soup = BeautifulSoup(r.text, 'html.parser')
        elem = soup.find('div',
                         attrs={'class': 'course_category_tree clearfix '})
        rows = elem.findAll(
            'a',
            href=rcompile(
                r'https://elearning2\.uni-heidelberg\.de/course/view\.php\?id=\d+'
            ))

        return [x.text for x in rows]
Example #27
def createAllCredits(path):
    fm = FolderManager()
    subs, imp = findAllSubmissionFolders(path)
    total = []
    extra = []

    curParent = path
    sheetNr = ''
    while True:
        tmp = pdirname(curParent)
        if tmp == curParent:
            raise ValueError("U can't use this script here!")
        curParent = tmp
        if rcompile(r'Blatt_\d\d').match(pbasename(curParent)):
            sheetNr = pbasename(curParent).split('_')[1]
            break

    for subpath in subs:
        for tup in createCreditsText(subpath):
            total.append((tup[0], [('%.2f' % x).zfill(5) for x in tup[1]]))

    for subpath in imp:
        for tup in createCreditsText(subpath):
            searchRes = fm.findStudentByName(tup[0], status='Imported')
            if len(searchRes) == 1:
                extra.append(
                    (searchRes[0], [('%.2f' % x).zfill(5) for x in tup[1]]))
            else:
                total.append((tup[0], [('%.2f' % x).zfill(5) for x in tup[1]]))

    print('Create %s ...' % ('AllCredits_%s.txt' % sheetNr),
          end='',
          flush=True)
    writeNameCreditLists(path, 'AllCredits_%s.txt' % sheetNr, total)
    print('[OK]')

    extTutDict = {}
    for elem in extra:
        eTutor = elem[0]['ExtTut']
        if eTutor in extTutDict:
            extTutDict[eTutor].append((elem[0]['Name'], elem[1]))
        else:
            extTutDict[eTutor] = [(elem[0]['Name'], elem[1])]

    for k, v in extTutDict.items():
        fname = 'AllCredits_%s_%s_%s.txt' % (sheetNr, fm.getTFirstName(),
                                             k.replace(' ', '-'))
        print('Create %s ...' % fname, end='', flush=True)
        writeNameCreditLists(path, fname, v)
        print('[OK]')
Example #28
    def analyze(self, data, _data, filename):
        '''
        start analyzing logic
        '''
        listheaders = []
        listpayloads = []

        for _ in data:
            listheaders.append(str(_["fields"]))
            listpayloads.append(str(_["payload"]))

        headers = "".join(listheaders)
        content = "".join(listpayloads)

        with copen(self.intell + filename, "r", encoding='utf8') as file:
            for _ in loads(file.read()):
                with ignore_excpetion(Exception):
                    if "Type" in _ and "WQREGEX" in _["Type"]:
                        temp_var = None
                        if _["Options"]["Word"] == "Normal" and "Header_Detection" in _:
                            temp_var = search(
                                rcompile(r"{}".format(_["Header_Detection"]),
                                         _["Options"]["Flag"]), headers)
                        elif _["Options"]["Word"] == "Normal" and "Content_Detection" in _:
                            temp_var = search(
                                rcompile(r"{}".format(_["Content_Detection"]),
                                         _["Options"]["Flag"]), content)
                        if temp_var is not None:
                            _data.append({
                                "Matched": "1",
                                "Required": _["Options"]["Required"],
                                "WAF": _["Name"],
                                "Detected": temp_var.group()
                            })

        self.check_proxy_bypass(data, _data)
Example #29
    def __init__(self):
        '''
        Initialize QBPhishing, this has to pass
        '''
        self.datastruct = {"Suspicious":[],
                           "Spelling count":[],
                           "Spelling":[],
                           "Symbols":[],
                           "_Spelling count":["Total", "Misspelled"],
                           "_Spelling":["Count", "Word", "Misspelled"],
                           "_Suspicious":["Count", "Words"],
                           "_Symbols":["Count", "Symbol"]}

        self.suspiciouswords = rcompile(r"uniq|18\+|action|act|additional income|affordable|amazed|apply|avoid|babe|be amazed|beneficiary|billing|billion|bonus|boss|buy|call|cancel|cash|casino|certified|cheap|claim|clearance|click|collect|compare rates|confirm|congrat|congratulations|credit|cures|customer|deal|dear|debt|direct email|discount|don\'t delete|don\'t hesitate|double your income|earn|experience|expire|extra|fantastic|fgift|free|freedom|friend|get it|great|guarantee|hello|income|increase |instant|investment|iphone|junk|limited|lose|log|lowest price|lucky|luxury|make money|medicine|mobile|money|msg|name|no credit check|now|obligation|offer|only|open|order|password|please|presently|problem|promise|purchase|quote|rates|refinance|refund|remove|reply|request|risk-free|sales|satisfaction|save|score|serious|sex|sexy|sign|sms|spam|special|subscription|success|supplies|take action|terms|text|ticket|traffic|trial|txt|unlimited|update|urgent|weight|win|winner|won")
        self.wordsstripped = ""
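
A trimmed-down sketch of the word list above (the full pattern works the same way; findall reports hits left to right):

from re import compile as rcompile

suspicious = rcompile(r"free|bonus|winner|urgent|claim")
print(suspicious.findall("claim your free bonus now"))  # ['claim', 'free', 'bonus']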
Example #30
    def findCourse(self,
                   pFacName='Informatik',
                   pSFacName='Informatik',
                   course='Datenstrukt'):
        subFacLink = self.getSubFacility(pFacName, pSFacName)['Link']
        r = self.__session.get(subFacLink)
        soup = BeautifulSoup(r.text, 'html.parser')
        rows = soup.findAll(
            'a',
            href=rcompile(
                r'https://elearning2\.uni-heidelberg\.de/course/view\.php\?id=\d+'
            ))

        for row in rows:
            if course in row.text:
                return row['href']
Example #31
def markHyperlinks(txt):
    res = []
    pattern = rcompile('.*https?://.*')

    for line in txt.split('\n'):
        words = [toHyperLink(word) if pattern.match(word) else word
                 for word in line.split(' ')]
        res.append(' '.join(words))

    return '\n'.join(res)
Example #32
from dateutil.parser import parse as dateParse
import re
from time import mktime
from CONFIG import *
from urllib import quote,unquote
import subprocess
from os import devnull as dev_NULL
from difflib import ndiff
dev_NULL=file(dev_NULL)
from re import compile as rcompile
from re import I as ignorecase
rarchiveparse=rcompile(r'http\:\/\/web\.archive\.org\/web\/\d{14}/((http[s]?|ftp):\/)?\/?(?P<domain>[^:\/\s]+)(:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<file>[\w\-\.]+[^#?\s]+)(?P<parameters>\?([^#]*))?(#(.*))?')
rurlparse=rcompile(r'((http[s]?|ftp):\/)?\/?(?P<domain>[^:\/\s]+)(:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<file>[\w\-\.]+[^#?\s]+)(?P<parameters>\?([^#]*))?(#(.*))?')
rlink=rcompile(r'(http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?')
rdate=rcompile(r'[^\n\r]{0,130}(retrieved|downloaded|accessed)( on)?[ :](?P<date>((Jan(uary)?[ \.\-,\\]+([012]?\d|3[01])|Feb(ruary)?[ \.\-,\\]+([01]?\d|2[89])|Mar(ch)?[ \.\-,\\]+([012]?\d|3[01])|Apr(il)?[ \.\-,\\]+([012]?\d|30)|May[ \.\-,\\]+([012]?\d|3[01])|June?[ \.\-,\\]+([012]?\d|3[0])|July?[ \.\-,\\]+([012]?\d|3[01])|Aug(ust)?[ \.\-,\\]+([012]?\d|3[01])|Sep(tember)?[ \.\-,\\]+([012]?\d|30)|Oct(ober)?[ \.\-,\\]+([012]?\d|3[01])|Nov(ember)?[ \.\-,\\]+([012]?\d|3[0])|Dec(ember)?[ \.\-,\\]+([012]?\d|3[01]))[ \.\-,\\]+(19|20)?\d\d)|((([012]?\d|3[01])[ \.\-,\\]+Jan(uary)?|([01]?\d|2[89])[ \.\-,\\]+Feb(ruary)?|([012]?\d|3[01])[ \.\-,\\]+Mar(ch)?|([012]?\d|30)[ \.\-,\\]+Apr(il)?|([012]?\d|3[01])[ \.\-,\\]+May|([012]?\d|3[0])[ \.\-,\\]+June?|([012]?\d|3[01])[ \.\-,\\]+July?|([012]?\d|3[01])[ \.\-,\\]+Aug(ust)?|([012]?\d|30)[ \.\-,\\]+Sep(tember)?|([012]?\d|3[01])[ \.\-,\\]+Oct(ober)?|([012]?\d|30)[ \.\-,\\]+Nov(ember)?|([012]?\d|3[10])[ \.\-,\\]+Dec(ember)?)[ \.\-,\\]+(19|20)?\d\d)|(((1[3456789]|2\d|3[01])[\.\-,\\](0?\d|1[012]))[ \.\-,\\](19|20)\d\d)|(((0?\d|1[012])[\.\-,\\](1[3456789]|2\d|3[01]))[ \.\-,\\](19|20)\d\d))',ignorecase)
rarchivenear=rcompile('(archive|wayback|webcitation.org|webcite)')
rarchive=rcompile(r'([^ ]* [^\n\r\]]*\][\s\.\,\w])')
rurlstatus=rcompile(r'HTTP/[\d\.]+ (\d+)')
rurlnorm=rcompile(r'(https?://)?(www\.)?(.*)')

def isUsedInTextObject(textobjects,start,end):
	'''A simple function to search an array of textobjects for a string starting at *start* and ending at *end*, returning the first result.
	@param textobjects: An array of TextObjects that will be searched.
	@param start: The starting index of the string to be searched for.
	@param end: The ending index of the string to be searched for.'''
	for t in textobjects:
		if t.isContained(start,end):
			return t
	return None

Example #33
#!/usr/bin/env python
# encoding: utf-8

from time import time
from re import compile as rcompile
RAW_URL='http://en.wikipedia.org/w/index.php?title=%s&action=raw'
URL_REQUEST_TIMEOUT='10'
BOT_NAME='DASHBot'
SHUTOFF='User:DASHBot/Dead_Links/Shutoff'
PLURAL=lambda x: 's' if x>1 else ''
BLACKLIST=rcompile(r'(https?\://(web\.archive.*|www\.webcitation\.org))')
SETTINGS_PAGE='User:DASHBot/Dead_Links/Settings.yaml'
COMMENT_TEMPLET="Added archives to %d link%s. See [[User:DASHBot/Dead links]] for details, settings, shutoff."
CITEWEB_TEMPLATES='([wW]eb reference 4|[cC]itenewsauthor|[cC]ite newspaper|[wW]eb-reference|[wW]eb reference|[wW]eb citation|[cC]ite website|[cC]ite webpage|[cC]ite article|[cC]ite news-q|[cC]ite news2|[Cc]ite study|[cC]ute news|[cC]ite-news|[cC]ite blog|[Cc]ite news|[wW]eb cite|[lL]ien web|[cC]itenews|[cC]ite-web|[cC]ite url|[cC]ite new|[cC]ite Web|[cC]ita web|[cC]it news|[Cc]ite web|[Cc]itation|[cC]iteweb|[cC]it web|[cC] news|[cC] web|[Cc]ite|[wW]eb)'
HISTORY_SEARCH_WINDOW=500

###

s_global_editRate= 12 #seconds between edits
s_useWaybackMachine= True #Use the internet archive's wayback machine
s_nightlyEditLimit= 10 #The bot will limit itself to this many articles per run.
s_noHistorySearch= True #Turn off the function that will look through an article's history to find an appropriate access date.
s_editBareLinks= False #Allow the bot to add archives to some links (not used in reference or citeweb)
s_editBareReferences=False #allow bot to add archives to (non citeweb) references
s_deltaTime=12960000 #The maximum difference between access date and archive date, in seconds
s_noCheck_URL_status=True
Example #34
        for data in app(env, start_response):
            if data:
                httpfile.sendall(data, 15.+(len(data)>>10))
        httpfile.close(15.)
    return handler

def wsgihandler(app, env={}, **environ):
    handler = httphandler(wsgi(app), env, **environ)
    return handler

RESPONSES = dict((int(i.split(None, 1)[0]), i) for i in ('100 Continue,101 Switching '
'Protocols,200 OK,201 Created,202 Accepted,203 Non-Authoritative Information,204 No C'
'ontent,205 Reset Content,206 Partial Content,300 Multiple Choices,301 Moved Permanen'
'tly,302 Found,303 See Other,304 Not Modified,305 Use Proxy,307 Temporary Redirect,40'
'0 Bad Request,401 Unauthorized,402 Payment Required,403 Forbidden,404 Not Found,405 '
'Method Not Allowed,406 Not Acceptable,407 Proxy Authentication Required,408 Request '
'Timeout,409 Conflict,410 Gone,411 Length Required,412 Precondition Failed,413 Reques'
't Entity Too Large,414 Request-URI Too Long,415 Unsupported Media Type,416 Requested'
' Range Not Satisfiable,417 Expectation Failed,500 Internal Server Error,501 Not Impl'
'emented,502 Bad Gateway,503 Service Unavailable,504 Gateway Timeout,505 HTTP Version'
' Not Supported').split(','))
NOCACHEHEADERS = {'Pragma': 'no-cache', 'Cache-Control': 'no-cache, must-revalidate',
    'Expires': 'Mon, 26 Jul 1997 05:00:00 GMT'}
quoted_slash_split = rcompile("(?i)%2F").split
first  = rcompile(r'^(\w+)[\s\t]+([^\r\n]+)[\s\t]+(HTTP/[01]\.[0-9])\r?\n$', I).match
header = rcompile(r'^[\s\t]*([^\r\n:]+)[\s\t]*:[\s\t]*([^\r\n]+)[\s\t]*\r?\n$').match
WSGIServer = WsgiServer = wsgiserver
HTTPServer = HttpServer = httpserver
__all__    = ['mainloop', 'exit', 'timeout', 'httpserver', 'wsgiserver', 'HttpServer',
              'WsgiServer', 'HTTPServer', 'WSGIServer']
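
The precompiled quoted_slash_split helper above splits a URL-encoded path on %2F in either case:

print(quoted_slash_split("a%2Fb%2fc"))  # ['a', 'b', 'c']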
Example #35
from glob import glob
from re import compile as rcompile
from zipfile import is_zipfile, ZipFile
from tarfile import is_tarfile, open as taropen
from string import Template
from itertools import chain

DEVNULL = open(devnull, "w")
BASE = expanduser("~/www")
USERNAME = getpwuid(getuid())[0]
(PORT_MIN, PORT_MAX, PORT_STEP) = (10000, 10100, 3) # 0=HTTP, 1=HTTPS, 2=DB

(START, STOP, ISRUNNING) = ('start', 'stop', 'isrunning')
(WWW, DB) = ('www', 'db')

SITE_FORMAT = rcompile(r'^site-\w+$')
VALID_SITE_ID = rcompile(r'^[a-zA-Z]\w{0,23}$')
APPLICATION_FORMAT = rcompile(r'^(.*)\.(tar\.gz|tar\.bz2|zip)$')

def is_valid_site_id(site_id):
    """Check if a site identifier is valid."""
    return VALID_SITE_ID.match(site_id)

def bash_prelude():
    """Returns the full path to the Bash prelude script."""
    return join(BASE, 'bin', 'bashprelude')

def get_bin_directory(executable, hints):
    """Try to find the directory of an executable event if it's not in PATH."""
    bin_paths = environ["PATH"].split(pathsep)
    full_paths = [glob(join(path, executable)) for path in bin_paths + hints]
Example #36
#!/usr/bin/env python
# encoding: utf-8
"""
Parser.py

Created by Tim Sears on 2012-02-19.
Copyright (c) 2012. All rights reserved.
"""
from CONFIG import *
from Classes import *
from re import compile as rcompile
from re import I as ignorecase
citeweb_templates=CITEWEB_TEMPLATES
rtemplatename=rcompile(r'{{[\s\n\r]*%s'%citeweb_templates)
rbaretemplatename=rcompile(citeweb_templates)
rreference=rcompile(r'\<[\s\n\r]*ref[^\>]*\>.*?\<[\s\n\r]*/[\s\n\r]*ref[\s\n\r]*>',ignorecase)
rtname = rcompile(citeweb_templates)
rparameternamed = rcompile(r'([A-Za-z0-9]*)[\s\n\r]*=')
# if an "accessed on" is used on the same line as a URL, assume a proper access date
chars=['a', 'b', 'c', 'd', 'e', 'f', 'g','_','h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
spaces=' \t\n\r'


class TextObject(object):
	"""The base class for text objects"""
	def __init__(self, start, end, text):
		super(TextObject, self).__init__()
		self.text = text
		self.startIndex = start
		self.endIndex = end
		self.urlArray = None
Example #37
def retrieve():
	database = connect('database.db')
	topics,feeds,documents,titles,descriptions = [[],[],[],[],[]]
	links,datetimes,thumbnails,doc_topics = [[],[],[],[]]

	### GET DATABASE DATA
	for row in database.execute('SELECT * FROM topics;'):
		topics.append([row[0], str(row[1])])
	for row in database.execute('SELECT fds_topic, fds_link FROM feeds;'):
		feeds.append([row[0],str(row[1])])		
	for row in database.execute('SELECT doc_id, doc_datetime, doc_link FROM documents'):
		documents.append([row[0],str(row[1]),str(row[2]),[]])
		for row2 in database.execute('SELECT tpd_topic FROM tpc_doc WHERE tpd_document = '+str(row[0])+';'):
			documents[-1][3].append(row2[0])

	### GET RSS INFO
	for topic, link in feeds:
		html = urlopen(link).read()
		soup = BeautifulSoup(html)
		items = [item for item in soup.find_all('item')]
		for item in items:
			doc_topics.append(topic)
			if item.title is not None:
				title = item.title.findAll(text=True)
				if len(title) == 1: titles.append(title[0].encode('ascii',errors='ignore'))
				else:               titles.append('')
			if item.description is not None:
				desc = item.description.findAll(text=True)
				if len(desc) == 1: descriptions.append(desc[0].encode('ascii',errors='ignore'))
				else:              descriptions.append('')
			if item.guid is not None:
				link = item.guid.findAll(text=True)
				if len(link) == 1: links.append(link[0].encode('ascii',errors='ignore'))
				else:              links.append('')
			if item.pubdate is not None:
				date = item.pubdate.findAll(text=True)
				if len(date) == 1: datetimes.append(date[0].encode('ascii',errors='ignore'))
				else:              datetimes.append('')
			thumb = item.findChildren('media:thumbnail',{'width':'144'})
			if len(thumb) == 1: thumbnails.append(thumb[0]['url'].encode('ascii',errors='ignore'))
			else:               thumbnails.append('')

	### GET DOCUMENTS
	new = 0
	updated = 0
	for index in range(len(titles)):
		print('('+str(index+1).ljust(4) + str(doc_topics[index]).ljust(2) + ')'),
		
		datetime = parser.parse(datetimes[index])
		try:
			pos = [doc[2] for doc in documents].index(links[index])
		except ValueError:
			refresh = 0
		else:
			if doc_topics[index] not in documents[pos][3]:
				database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
					' ('+str(doc_topics[index])+', '+str(documents[pos][0])+');')
				documents[pos][3].append(doc_topics[index])
				database.commit()
				print('*'),
			if str(datetime) == str(documents[pos][1]):
				print('Unchanged Article')
				continue
			refresh = 1


		not_article = ('VIDEO','AUDIO','In pictures','Your pictures')
		if titles[index].startswith(not_article):
			print('Not an Article')
			continue

		html = urlopen(links[index]).read()
		soup = BeautifulSoup(html)
		title = str(soup.title)[7:-8].decode('utf-8').encode('ascii',errors='ignore')

		temp = ['BBC News','BBC History','BBC Science','BBC Consumer','BBC Arts','BBC Nature']
		if any(i in title for i in temp): division = 'story-body'
		elif 'BBC Sport' in title:        division = 'article'
		elif 'BBC - Capital' in title:    division = 'description|story-body'
		else:                             print('Website not known'); continue

		content = [div for div in soup.find_all('div',{'class':rcompile(division)})]
		soup = BeautifulSoup(' '.join(list(map(str,content))))
		paragraphs = [p for p in soup.findAll('p')]
		soup = BeautifulSoup(' '.join(list(map(str,paragraphs))))
		[p.extract() for p in soup.findAll('p') if str(p).startswith('<p><strong>')]
		[p.extract() for p in soup.findAll('p',{'class':rcompile('disclaimer|terms')})]

		text = soup.get_text().replace('\n',' ').replace('\t',' ').replace('\r',' ')
		text = text.encode('ascii', errors='ignore')
		if text == '':
			print('Empty Text')
			continue

		text = rsub(' +',' ',text)
		text = text.strip()
		text = '\n'.join([sentence for sentence in sent_tokenize(text)])

		if refresh == 1:
			documents[pos][1] = str(datetime)
			database.execute('DELETE FROM entities WHERE ent_document = '+str(documents[pos][0])+';')
			database.execute('UPDATE documents SET doc_processed = 0,'+\
				' doc_datetime = \''+str(datetime)+'\','+\
				' doc_thumbnail = \''+thumbnails[index]+'\','+\
				' doc_title = \''+titles[index].replace('\'','\'\'')+'\','+\
				' doc_description = \''+descriptions[index].replace('\'','\'\'')+'\','+\
				' doc_text = \''+text.replace('\'','\'\'')+'\''+\
				' WHERE doc_link = \''+links[index]+'\';')
			print('Update - '+titles[index])
			updated += 1
		else:
			documents.append([len(documents)+1, datetime, links[index],[doc_topics[index]]])
			database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
				' ('+str(doc_topics[index])+', '+str(documents[-1][0])+');')
			database.execute('INSERT INTO documents (doc_datetime, doc_link, doc_thumbnail,'+\
				' doc_title, doc_description, doc_text) VALUES (\''+\
				str(datetime)+'\',\''+links[index]+'\',\''+thumbnails[index]+'\',\''+\
				titles[index].replace('\'','\'\'')+'\',\''+\
				descriptions[index].replace('\'','\'\'')+'\',\''+\
				text.replace('\'','\'\'')+'\');')
			print('Insert - '+titles[index])
			new += 1

		database.commit()
		
	print new,"new,", updated,"updated."
Example #38
 def __init__(self, regex, callback):
     self.regex = regex
     self.compiled = rcompile(self.regex)
     self.callback = callback
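
A minimal dispatch sketch around this regex/callback pair (the Route class name and URL scheme are illustrative, not from the original project):

from re import compile as rcompile

class Route:
    def __init__(self, regex, callback):
        self.regex = regex
        self.compiled = rcompile(self.regex)
        self.callback = callback

route = Route(r'^/users/(\d+)$', lambda m: m.group(1))
match = route.compiled.match('/users/42')
if match:
    print(route.callback(match))  # 42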