def buildpcfg(passwd_dictionary, start=0, end=-1):
    """Build a PCFG rule set from a frequency-annotated password list.

    Each input line is expected to look like ``<count> <password>``. Lines
    that do not start with a numeric count followed by at least one more
    field are skipped, as are passwords that cannot be decoded as ASCII.

    Args:
        passwd_dictionary: value handed to ``open_`` to read the list.
        start: index of the first line to process; earlier lines are skipped.
        end: index of the last line to process, inclusive. A negative value
            (the default) means there is no upper bound.

    Returns:
        The accumulated ``RuleSet`` when ``end > 0``. Otherwise the rule set
        is saved to ``GRAMMAR_DIR + '/grammar.cfg.bz2'`` and ``None`` is
        returned.
    """
    # MIN_COUNT=1000
    R = RuleSet()
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    # Line index after which check_resource() is consulted again.
    resource_tracker = 5240
    with open_(passwd_dictionary) as pw_file:
        for n, line in enumerate(pw_file):
            if n < start:
                continue
            # Enforce the upper bound only when one was given: the previous
            # unconditional `n > end` test stopped at line 0 for end=-1, so
            # a default call processed nothing.
            if 0 <= end < n:
                break
            if n > resource_tracker:
                extra = check_resource(n)
                if not extra:
                    break
                resource_tracker += extra
            fields = line.strip().split()
            if len(fields) < 2 or not fields[0].isdigit():
                continue
            w, c = ' '.join(fields[1:]), int(fields[0])
            try:
                w.decode('ascii')
            except UnicodeDecodeError:
                continue  # not ASCII, skip this password
            if c < MIN_COUNT:
                msg = "Word frequency dropped to %d for %s" % (c, w)
                print("%s %s" % (msg, n))
                # Careful: stops the whole scan, not just this line.
                # NOTE(review): presumably relies on the input being sorted
                # by descending count -- confirm.
                break
            T = parse(w)
            R.update_set(T.rule_set(), with_freq=True, freq=c)
    if end > 0:
        return R
    R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def rule_set(self):
    """Return a RuleSet describing this node's production.

    Always contains the rule ``('L', self.prod)``. When the production is
    ``'l33t'``, one ``'L_<c>'`` rule is added for every pair zipped from
    ``self.l`` and ``self.r``.
    """
    rs = RuleSet()
    rs.add_rule('L', self.prod)
    # Compare by value: `is` only matched when both strings happened to be
    # the same interned object.
    if self.prod == 'l33t':
        for c, d in zip(self.l, self.r):
            rs.add_rule('L_%s' % c, d)
    return rs
def __init__(self, base_pcfg):
    """Seed a fresh RuleSet with the 'L' and 'L_<letter>' entries of base_pcfg.

    Args:
        base_pcfg: grammar indexed by rule name; it must provide 'L' and
            'L_a' .. 'L_z'.
    """
    self.cal_cdf = False
    self.base_pcfg = base_pcfg
    seeded = RuleSet()
    # 'L' first, then one entry per lowercase ASCII letter.
    seed_keys = ['L'] + ['L_%s' % letter for letter in string.ascii_lowercase]
    for key in seed_keys:
        seeded.update_set(RuleSet(d={key: self.base_pcfg[key]}))
    self.R = seeded
    self.G = seeded.G
    self.date = Date()
    self.freeze = False
def parallel_buildpcfg(password_dictionary):
    """Build the grammar in parallel over the first 10**6 lines and save it.

    The input is cut into chunks of ``load_each`` lines, each chunk is
    handed to ``wraper_buildpcfg`` in a worker process, and the returned
    rule sets are merged and written to ``GRAMMAR_DIR + '/grammar.cfg.bz2'``.

    Args:
        password_dictionary: forwarded unchanged to every worker.
    """
    from multiprocessing import Pool

    load_each = 10000
    # buildpcfg() only stops once the line index exceeds `end`, i.e. `end`
    # is inclusive. Ending each chunk at c + load_each - 1 keeps the first
    # line of the next chunk from being counted twice.
    # NOTE(review): assumes wraper_buildpcfg unpacks the tuple into
    # buildpcfg(passwd_dictionary, start, end) -- confirm.
    chunks = [(password_dictionary, c, c + load_each - 1)
              for c in range(0, 10**6, load_each)]
    p = Pool()
    try:
        partial_sets = p.map(wraper_buildpcfg, chunks)
    finally:
        # map() has returned or raised, so no result is still needed;
        # shut the workers down instead of leaking them.
        p.terminate()
        p.join()
    Complete_grammar = RuleSet()
    for r in partial_sets:
        Complete_grammar.update_set(r, with_freq=True)
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def rule_set(self):
    """Collect the rules contributed by this node.

    For a ``NonT`` instance the rule ``('G', self.sym)`` is always recorded
    first. Then, depending on ``self.prod``:
      * a ``str``  -> the rule ``(self.sym, self.prod)`` is added;
      * a ``list`` -> the rule sets of all elements are merged in;
      * otherwise  -> the production's own rule set is returned directly.
    A node that is not a ``NonT`` yields an empty RuleSet.
    """
    rules = RuleSet()
    if not isinstance(self, NonT):
        # Nothing to add: return the empty rule set.
        return rules
    rules.add_rule('G', self.sym)
    production = self.prod
    if isinstance(production, str):
        rules.add_rule(self.sym, production)
        return rules
    if isinstance(production, list):
        for child in production:
            rules.update_set(child.rule_set())
        return rules
    # Any other production object supplies its rule set itself.
    return production.rule_set()
def rule_set(self):
    """Collect the rules contributed by this node.

    For a ``NonT`` instance the rule ``('G', self.sym)`` is always recorded
    first. Then, depending on ``self.prod``:
      * a string -> the rule ``(self.sym, self.prod)`` is added;
      * a list   -> the rule sets of all elements are merged in;
      * otherwise -> the production's own rule set is returned directly.
    A node that is not a ``NonT`` yields an empty RuleSet.
    """
    rules = RuleSet()
    if not isinstance(self, NonT):
        return rules
    rules.add_rule('G', self.sym)
    production = self.prod
    if isinstance(production, basestring):
        rules.add_rule(self.sym, production)
        return rules
    if isinstance(production, list):
        for child in production:
            rules.update_set(child.rule_set())
        return rules
    # Any other production object supplies its rule set itself.
    return production.rule_set()
def rule_set(self):
    """Return a RuleSet describing this node's production.

    Always contains the rule ``('L', self.prod)``. When the production is
    ``'l33t'``, one ``'L_<c>'`` rule is added for every pair zipped from
    ``self.l`` and ``self.r``.
    """
    rs = RuleSet()
    rs.add_rule('L', self.prod)
    # Compare by value: `is` only matched when both strings happened to be
    # the same interned object.
    if self.prod == 'l33t':
        for c, d in zip(self.l, self.r):
            rs.add_rule('L_%s' % c, d)
    return rs
def buildpcfg(passwd_dictionary, start=0, end=-1):
    """Build a PCFG rule set from a frequency-annotated password list.

    Each input line is expected to look like ``<count> <password>``. Lines
    that do not start with a numeric count followed by at least one more
    field are skipped, as are passwords that cannot be decoded as ASCII.

    Args:
        passwd_dictionary: value handed to ``open_`` to read the list.
        start: index of the first line to process; earlier lines are skipped.
        end: index of the last line to process, inclusive. A negative value
            (the default) means there is no upper bound.

    Returns:
        The accumulated ``RuleSet`` when ``end > 0``. Otherwise the rule set
        is saved to ``GRAMMAR_DIR + '/grammar.cfg.bz2'`` and ``None`` is
        returned.
    """
    # MIN_COUNT=1000
    R = RuleSet()
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    # Line index after which check_resource() is consulted again.
    resource_tracker = 5240
    with open_(passwd_dictionary) as pw_file:
        for n, line in enumerate(pw_file):
            if n < start:
                continue
            # Enforce the upper bound only when one was given: the previous
            # unconditional `n > end` test stopped at line 0 for end=-1, so
            # a default call processed nothing.
            if 0 <= end < n:
                break
            if n > resource_tracker:
                extra = check_resource(n)
                if not extra:
                    break
                resource_tracker += extra
            fields = line.strip().split()
            if len(fields) < 2 or not fields[0].isdigit():
                continue
            w, c = ' '.join(fields[1:]), int(fields[0])
            try:
                w.decode('ascii')
            except UnicodeDecodeError:
                continue  # not ASCII, skip this password
            if c < MIN_COUNT:
                msg = "Word frequency dropped to %d for %s" % (c, w)
                print("%s %s" % (msg, n))
                # Careful: stops the whole scan, not just this line.
                # NOTE(review): presumably relies on the input being sorted
                # by descending count -- confirm.
                break
            T = parse(w)
            R.update_set(T.rule_set(), with_freq=True, freq=c)
    if end > 0:
        return R
    R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def buildOurpcfg(filename):
    """Train a PCFG rule set from a password file and save it to temp1.cfg.

    Every line of the training file is expected to hold a password followed
    by a single space and its occurrence count. Lines with an index above
    ``max_line`` are not read.

    Args:
        filename: path of the training file.

    Raises:
        ValueError: if a line has no space-separated count, or the count is
            not an integer.
    """
    # Container for the learned rules and their frequencies.
    rule_set = RuleSet()
    # Highest line index that is still processed.
    max_line = 2555000
    # `with` guarantees the training file is closed even if parsing fails.
    with open(filename, 'r') as fp:
        for i, line in enumerate(fp):
            if i > max_line:
                break
            # The count is the last field; splitting once from the right
            # keeps passwords that themselves contain spaces intact.
            password, count = line.strip().rsplit(' ', 1)
            p = parse(password)
            rule_set.update_set(p.rule_set(), with_freq=True, freq=int(count))
    # Training finished: persist the grammar.
    rule_set.save(bz2.BZ2File("temp1.cfg", "wb"))
def parallel_buildpcfg(password_dictionary):
    """Build the grammar in parallel over the first 10**6 lines and save it.

    The input is cut into chunks of ``load_each`` lines, each chunk is
    handed to ``wraper_buildpcfg`` in a worker process, and the returned
    rule sets are merged and written to ``GRAMMAR_DIR + '/grammar.cfg.bz2'``.

    Args:
        password_dictionary: forwarded unchanged to every worker.
    """
    from multiprocessing import Pool

    load_each = 10000
    # buildpcfg() only stops once the line index exceeds `end`, i.e. `end`
    # is inclusive. Ending each chunk at c + load_each - 1 keeps the first
    # line of the next chunk from being counted twice.
    # NOTE(review): assumes wraper_buildpcfg unpacks the tuple into
    # buildpcfg(passwd_dictionary, start, end) -- confirm.
    chunks = [(password_dictionary, c, c + load_each - 1)
              for c in range(0, 10**6, load_each)]
    p = Pool()
    try:
        partial_sets = p.map(wraper_buildpcfg, chunks)
    finally:
        # map() has returned or raised, so no result is still needed;
        # shut the workers down instead of leaking them.
        p.terminate()
        p.join()
    Complete_grammar = RuleSet()
    for r in partial_sets:
        Complete_grammar.update_set(r, with_freq=True)
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def __init__(self, base_pcfg):
    """Seed a RuleSet from base_pcfg and remember which keys were seeded.

    Copies the 'L' and 'L_<letter>' entries, then, for every key of
    ``base_pcfg['G']`` ending in ',G', copies that key, the key without its
    ',G' suffix, and the suffix-less key's own entry. The names of all
    seeded keys are kept in ``self._default_keys``.

    Args:
        base_pcfg: grammar indexed by rule name.
    """
    self.cal_cdf = False
    self.base_pcfg = base_pcfg
    grammar = RuleSet()
    seeded = []  # names of every key copied from the base grammar

    # 'L'
    grammar.update_set(RuleSet(d={'L': self.base_pcfg['L']}))
    seeded.append('L')

    # 'L_a' .. 'L_z'
    for letter in string.ascii_lowercase:
        key = 'L_%s' % letter
        seeded.append(key)
        grammar.update_set(RuleSet(d={key: self.base_pcfg[key]}))

    for rhs, freq in self.base_pcfg['G'].items():
        if not rhs.endswith(',G'):
            continue
        # e.g. 'W1,G' -> 'W1' (likewise D1, Y1)
        head = rhs[:-2]
        grammar.G['G'][rhs] = freq
        grammar.G['G'][head] = self.base_pcfg['G'][head]
        seeded.extend([rhs, head])
        grammar.update_set(RuleSet(d={head: self.base_pcfg[head]}))

    self._default_keys = set(seeded)
    self.R = grammar
    self.G = grammar.G
    self.date = Date()
    self.freeze = False
def rule_set(self, word):
    """Return a RuleSet holding every entry of the parse tree of *word*.

    Each element of ``self.l_parse_tree(word).tree`` is unpacked into the
    arguments of ``RuleSet.add_rule``.
    """
    collected = RuleSet()
    for entry in self.l_parse_tree(word).tree:
        collected.add_rule(*entry)
    return collected
def rule_set(self):
    """Return the rule set for this node.

    A string production yields ``RuleSet(self.sym, self.prod)``; any other
    production is asked for its own rule set.
    """
    production = self.prod
    if not isinstance(production, basestring):
        return production.rule_set()
    return RuleSet(self.sym, production)
a = [(password_dictionary, c, c+load_each) for c in range(0, 10**6, load_each)] R = p.map(wraper_buildpcfg, a) for r in R: Complete_grammar.update_set(r, with_freq=True) out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb')) if __name__ == "__main__": if sys.argv[1] == '-buildG': print buildpcfg(sys.argv[2], 0, 100) elif sys.argv[1] == '-buildparallelG': parallel_buildpcfg(sys.argv[2]) elif sys.argv[1]=='-file': R = RuleSet() with open_(sys.argv[2]) as f: for i, line in enumerate(f): if i<5000: continue l = line.strip().split() w, c = ' '.join(l[1:]), int(l[0]) try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if not w or len(w.strip())<1: continue T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if i%100==0: print i if i>5200: break print R
def rule_set(self): rs = RuleSet() rs.add_rule(self.sym, self.prod) rs.update_set(self.L.rule_set()) return rs
a = [(password_dictionary, c, c + load_each) for c in range(0, 10**6, load_each)] R = p.map(wraper_buildpcfg, a) for r in R: Complete_grammar.update_set(r, with_freq=True) out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb')) if __name__ == "__main__": if sys.argv[1] == '-buildG': print buildpcfg(sys.argv[2], 0, 100) elif sys.argv[1] == '-buildparallelG': parallel_buildpcfg(sys.argv[2]) elif sys.argv[1] == '-file': R = RuleSet() with open_(sys.argv[2]) as f: for i, line in enumerate(f): if i < 5000: continue l = line.strip().split() w, c = ' '.join(l[1:]), int(l[0]) try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if not w or len(w.strip()) < 1: continue T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if i % 100 == 0: print i if i > 5200: break