Ejemplo n.º 1
0
    def __init__(self, categories=None, parser_ids=None):
        """
        Register the magic patterns of the selected parsers.

        A query is built from ``categories`` (as ``("category", name)`` pairs)
        and ``parser_ids`` (as ``("id", ident)`` pairs) and handed to
        ``QueryParser``. For every parser returned, each entry of its
        "magic" tag is added as a string pattern and each entry of its
        "magic_regex" tag as a regex pattern; the user data attached to a
        pattern is the ``(offset, parser)`` pair. All string patterns are
        registered before any regex pattern, then the patterns are committed.
        """
        PatternMatching.__init__(self)

        # Build the parser query
        query = []
        if categories:
            query.extend(("category", name) for name in categories)
        if parser_ids:
            query.extend(("id", ident) for ident in parser_ids)
        if query:
            # NOTE(review): the trailing None is presumably a marker understood
            # by QueryParser -- confirm against its implementation.
            query.append(None)
        parsers = QueryParser(query)

        # One full pass over the parsers per tag: strings first, then regexes
        registrations = (
            ("magic", self.addString),
            ("magic_regex", self.addRegex),
        )
        for tag_name, register in registrations:
            for parser in parsers:
                for pattern, offset in parser.getParserTags().get(tag_name, ()):
                    register(pattern, (offset, parser))
        self.commit()
Ejemplo n.º 2
0
 def search(self, data):
     """
     Yield one ``(user[1], start * 8 - user[0])`` pair per pattern match.

     Matches come from ``PatternMatching.search``; ``user`` is the user data
     attached to the matched pattern and must be indexable at 0 and 1.
     NOTE(review): the factor 8 presumably converts a byte position into a
     bit address, with ``user[0]`` an offset in bits -- confirm.
     """
     matches = PatternMatching.search(self, data)
     for begin, _end, match in matches:
         target = match.user[1]
         address = begin * 8 - match.user[0]
         yield (target, address)
Ejemplo n.º 3
0
    def __add_items_2_sqlite(self, fd, category, is_black, itype):
        """
        Read a list file and store its entries in SQLite as batched regexes.

        Every line of ``fd`` is stripped of CR, LF, spaces and quote
        characters. The entries are sorted, accumulated in a
        ``PatternMatching`` object and written out through
        ``__insert_domain_into_sqlite`` / ``__insert_url_into_sqlite`` each
        time the text of the accumulated regex grows past a size threshold,
        plus once at the end for whatever is left.

        Parameters:
            fd: object providing ``readlines()``, one entry per line.
            category: category name forwarded to the insert helpers.
            is_black: flag forwarded to the insert helpers. When it is truthy
                and a URL list is processed, the host part of every URL is
                additionally stored as a domain pattern under the
                "may_url_blocked" category.
            itype: "domain" for a list of domains; any other value is
                handled as a list of URLs.

        The last two arguments given to the insert helpers are a running
        entry counter and the total number of entries.
        """
        # Batching thresholds. The sizes are measured on str(matcher.regex).
        # NOTE(review): presumably these keep each stored regex below a size
        # limit of the consumer -- confirm before changing them.
        min_batch = 1500       # never test the size before this many entries
        coarse_step = 500      # size is tested every 500 entries at first...
        fine_step = 100        # ...and every 100 once it is getting close
        near_limit = 20000     # above this, switch to the fine step
        coarse_limit = 24000   # above this (coarse step), write the batch
        flush_limit = 25000    # above this (fine step / URLs), write the batch

        def clean(line):
            # Drop line endings, spaces and both kinds of quotes.
            return (line.replace("\r", "").replace("\n", "").replace(" ", "")
                        .replace('"', '').replace("'", ''))

        def reverse_labels(host):
            # "www.example.com" -> "com.example.www"
            return ".".join(reversed(host.split(".")))

        def store_domains(domains, domain_category, current, total):
            # Write the sorted domains as a sequence of size-bounded regexes.
            matcher = PatternMatching()
            added = 0
            fine_grained = False
            for domain in domains:
                text = try_to_str(domain)
                if text is None:
                    continue

                matcher.addString(text)
                added += 1
                current += 1
                if added < min_batch:
                    continue

                if fine_grained:
                    flush = (added % fine_step == 0
                             and len(str(matcher.regex)) > flush_limit)
                else:
                    if added % coarse_step != 0:
                        continue
                    size = len(str(matcher.regex))
                    flush = size > coarse_limit
                    if not flush and size > near_limit:
                        fine_grained = True

                if flush:
                    self.__insert_domain_into_sqlite(
                        domain_category, str(matcher.regex), is_black,
                        current, total)
                    matcher = PatternMatching()
                    added = 0
                    fine_grained = False

            # Remainder; it is reported with the counter set to the total.
            pattern = str(matcher.regex)
            if pattern:
                self.__insert_domain_into_sqlite(
                    domain_category, pattern, is_black, total, total)

        def store_urls(urls, total):
            # Write the sorted URLs as a sequence of size-bounded regexes and
            # return how many of them were actually added.
            matcher = PatternMatching()
            added = 0
            current = 0
            for url in urls:
                text = try_to_str(url)
                if text is None:
                    continue

                matcher.addString(text)
                added += 1
                current += 1

                if added % fine_step == 0:
                    pattern = str(matcher.regex)
                    if len(pattern) > flush_limit:
                        self.__insert_url_into_sqlite(
                            category, pattern, is_black, current, total)
                        matcher = PatternMatching()
                        added = 0

            pattern = str(matcher.regex)
            if pattern:
                self.__insert_url_into_sqlite(
                    category, pattern, is_black, len(urls), total)
            return current

        entries = [clean(line) for line in fd.readlines()]

        if itype == "domain":
            domains = sorted(reverse_labels(entry) for entry in entries)
            store_domains(domains, category, 0, len(domains))
            return

        urls = sorted(entries)
        if not is_black:
            store_urls(urls, len(urls))
            return

        # Blacklisted URLs: also record the (reversed) host of every URL so
        # that it can be stored under the "may_url_blocked" category.
        hosts = set(reverse_labels(entry.split("/")[0]) for entry in entries)
        total = len(urls) + len(hosts)
        current = store_urls(urls, total)
        store_domains(sorted(hosts), "may_url_blocked", current, total)