Ejemplo n.º 1
0
    def sample_trim(self):
        """
        Draw one raw grammar and reduce it to its trim part.

        Returns a new CFG that keeps the start symbol of the sampled grammar,
        uses its trim set as nonterminals, and holds only the productions
        reported as usable for that trim set (plus the terminals those
        lexical productions mention).

        Raises ValueError when the trim set is empty.
        """
        raw = self.sample_raw()
        logging.info(
            "CFG nominally has %d nonterminals, %d terminals, %d binary_rules and %d lexical rules",
            self.number_nonterminals, self.number_terminals, self.binary_rules,
            self.lexical_rules)

        trim_set = raw.compute_trim_set()
        if not len(trim_set):
            # Nothing survives trimming: the language is empty.
            raise ValueError("Empty language")

        usable = raw.compute_usable_productions(trim_set)
        # Productions of length 2 are lexical: (lhs, terminal).
        surviving_terminals = {rule[1] for rule in usable if len(rule) == 2}

        trimmed = cfg.CFG()
        trimmed.start = raw.start
        trimmed.terminals = surviving_terminals
        trimmed.nonterminals = trim_set
        trimmed.productions = set(usable)

        binary_count = sum(1 for rule in trimmed.productions if len(rule) == 3)
        lexical_count = sum(1 for rule in trimmed.productions if len(rule) == 2)
        logging.info(
            "Final CFG has %d nonterminals, %d terminals, %d binary_rules and %d lexical rules",
            len(trimmed.nonterminals), len(trimmed.terminals),
            binary_count, lexical_count)
        return trimmed
Ejemplo n.º 2
0
    def sample_raw(self):
        """
        Sample a random grammar and return it as a CFG.

        Distinct lexical rules ``(lhs, terminal)`` are drawn until there are
        ``self.lexical_rules`` of them, and distinct binary rules
        ``(a, b, c)`` until there are ``self.binary_rules`` of them. When
        ``self.strict_cnf`` is set, the first nonterminal (used as the start
        symbol) is never drawn for the right-hand side of a binary rule.

        Raises ValueError when more rules are requested than can exist, since
        the sampling loops could otherwise never terminate.
        """
        lexicon = list(utility.generate_lexicon(self.number_terminals))
        # Originally added while debugging; kept because it fixes the order
        # of the terminals that the random draws below index into.
        lexicon.sort()
        nonterminals = self.generate_nonterminals()
        lprods = set()
        bprods = set()
        lexicon_size = len(lexicon)
        n_nonterminals = len(nonterminals)

        # Upper bounds on the number of distinct rules. Asking for more would
        # spin forever because the sets below cannot grow any further.
        if self.lexical_rules > n_nonterminals * lexicon_size:
            raise ValueError(
                "Cannot sample %s distinct lexical rules from %d nonterminals and %d terminals"
                % (self.lexical_rules, n_nonterminals, lexicon_size))
        rhs_pool = n_nonterminals - 1 if self.strict_cnf else n_nonterminals
        if self.binary_rules > n_nonterminals * rhs_pool ** 2:
            raise ValueError(
                "Cannot sample %s distinct binary rules from %d nonterminals"
                % (self.binary_rules, n_nonterminals))

        while len(lprods) < self.lexical_rules:
            lhs = numpy.random.choice(nonterminals)
            rhs = lexicon[numpy.random.choice(range(lexicon_size))]
            lprods.add((lhs, rhs))

        while len(bprods) < self.binary_rules:
            if self.strict_cnf:
                a = numpy.random.choice(nonterminals)
                # The start symbol is excluded from right-hand sides.
                b, c = numpy.random.choice(nonterminals[1:], size=2)
            else:
                a, b, c = numpy.random.choice(nonterminals, size=3)
            bprods.add((a, b, c))

        # The leftover debug prints were removed: they read loop variables
        # that are unbound when a loop runs zero times, and indexed an
        # empty lexicon.
        my_cfg = cfg.CFG()
        my_cfg.start = nonterminals[0]
        my_cfg.nonterminals = set(nonterminals)
        my_cfg.terminals = set(lexicon)
        my_cfg.productions = lprods | bprods
        return my_cfg
Ejemplo n.º 3
0
def constructCFGs(insns, call_sites=None):
    """Build one overall CFG from *insns* and return its partitions.

    Args:
        insns: instructions handed to ``CFG.construct_new``.
        call_sites: call sites handed to ``CFG.construct_new``. Defaults to a
            fresh empty list on every call, so nothing is shared between
            calls.

    Returns:
        Whatever ``CFG.partition()`` returns. Every basic block of every
        returned CFG has ``cfg`` set to its owning CFG and ``iterations``
        set to 1.
    """
    if call_sites is None:
        # A literal ``[]`` default would be one list shared by all calls.
        call_sites = []

    overall_cfg = cfg.CFG()
    overall_cfg.construct_new(insns, call_sites)

    cfgs = overall_cfg.partition()

    for c in cfgs:
        for bb in c.basicblocks:
            bb.cfg = c
            bb.iterations = 1

    return cfgs
Ejemplo n.º 4
0
 def sample_full(self):
     """
     Return the CFG that contains every possible rule.

     Every nonterminal gets a lexical rule for every terminal, and a binary
     rule for every ordered pair of nonterminals other than the first one
     (the first nonterminal is the start symbol).
     """
     lexicon = list(utility.generate_lexicon(self.number_terminals))
     nonterminals = self.generate_nonterminals()
     # Symbols allowed on the right-hand side of binary rules.
     rhs_symbols = nonterminals[1:]

     lexical = {(lhs, word) for lhs in nonterminals for word in lexicon}
     binary = {
         (lhs, left, right)
         for lhs in nonterminals
         for left in rhs_symbols
         for right in rhs_symbols
     }

     full = cfg.CFG()
     full.start = nonterminals[0]
     full.nonterminals = set(nonterminals)
     full.terminals = set(lexicon)
     full.productions = lexical | binary
     return full
Ejemplo n.º 5
0
    def sample_uniform(self, lp=0.5, bp=0.5):
        """
        Sample a CFG by an independent Bernoulli draw per candidate rule.

        Each lexical rule (nonterminal, terminal) is kept with probability
        ``lp``; each binary rule whose right-hand side avoids the first
        nonterminal is kept with probability ``bp``. Both default to 0.5.
        The productions of the returned CFG are a list, lexical rules first.
        """
        lexicon = list(utility.generate_lexicon(self.number_terminals))
        nonterminals = self.generate_nonterminals()
        rhs_symbols = nonterminals[1:]

        # One random draw per candidate, in the same nesting order as the
        # candidates are enumerated.
        kept = [
            (lhs, word)
            for lhs in nonterminals
            for word in lexicon
            if numpy.random.random() < lp
        ]
        kept += [
            (lhs, left, right)
            for lhs in nonterminals
            for left in rhs_symbols
            for right in rhs_symbols
            if numpy.random.random() < bp
        ]

        sampled = cfg.CFG()
        sampled.start = nonterminals[0]
        sampled.nonterminals = set(nonterminals)
        sampled.terminals = set(lexicon)
        sampled.productions = kept
        return sampled
Ejemplo n.º 6
0
##
##g = CFG()
##g.nonterminals = ["S", "A"]
##g.alphabet = {"f":1, "a":0}
##g.productions = [["S", "f", ["A"]], ["A", "a", []]]
##g.costs = {"f":(lambda x: x[0] + 1), "a":(lambda x: 10)}
##(mu, minprods, order) = g.Knijkstra()
##assert mu["S"] == 11
##
##trees = g.getTreesFromProds(minprods, order)
###print trees
###enums = g.EnumerateStrings(3)
##
####
# Test grammar g2: S should ignore A.
# S can be produced directly (S -> a) or through A (S -> f(A), A -> a).
# With cost(a) = 10 and cost(f) = cost of its argument + 1, the direct rule
# costs 10 while the route through A costs 11.
g2 = cfg.CFG()
g2.nonterminals = ["S", "A"]
# symbol -> number of arguments
g2.alphabet = {"f": 1, "a": 0}
# each production is [lhs, symbol, [argument nonterminals]]
g2.productions = [["S", "f", ["A"]], ["A", "a", []], ["S", "a", []]]
# symbol -> function from the list of argument costs to the resulting cost
g2.costs = {"f": (lambda x: x[0] + 1), "a": (lambda x: 10)}
# presumably element 0 of Knijkstra()'s result maps nonterminal -> minimal
# cost (the disabled `g` example unpacks it as `mu`) -- confirm in cfg
assert g2.Knijkstra()[0]["S"] == 10  # should equal 10
##
##g3 = CFG()
##g3.nonterminals = ["S", "A", "B"]
##g3.alphabet = {"f":1, "a":0, "b":0}
##g3.productions = [["S", "f", ["A"]], ["A", "a", []], ["A", "f", ["B"]], ["B", "b", []]]
##g3.costs = {"f":(lambda x: x[0] + 1), "a":(lambda x: 10), "b":(lambda x: 1)}
##assert g3.Knijkstra()[0]["S"]==3 #should be 3
##

##
Ejemplo n.º 7
0
# Draws figures with seaborn
# Ji Hongchen
# 20200911
# ==================================

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import satamethod
import shutil
import cfg
from scipy import stats

# Project configuration object; data_loader reads CFG.clicfeat_dict from it.
CFG = cfg.CFG()
# Number of clusters, and therefore of class labels below.
N_CLUSTER = 7
# Labels 'Class_1' .. 'Class_<N_CLUSTER>'.
CLASS_LIST = ['Class_' + str(i + 1) for i in range(N_CLUSTER)]
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
PATH = '/Users/freud/Documents/MANU/lstmsom_data/exp20200617/analysis_patient_20200617/'
# Column name -> kind of statistic; data_loader branches on the value 'chi'.
# presumably 'avr' = average and 'chi' = chi-square -- TODO confirm
CLAC_DICT = {'patient_age': 'avr', 'patient_weight': 'avr', 'patient_gender': 'chi',
             'ajcc_stage': 'chi', 't_stage': 'chi', 'n_stage': 'chi', 'm_stage': 'chi'}
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
OUTPATH = '/Users/freud/Documents/MANU/lstmsom_data/exp20200617/cluster/'


def data_loader(dataframe, col_item):
    dataframe.drop(dataframe[dataframe[col_item].isin(
        CFG.clicfeat_dict[col_item]['delete'])].index, inplace=True)
    if CLAC_DICT[col_item] == 'chi':
        for i in CFG.clicfeat_dict[col_item]:
Ejemplo n.º 8
0
##                    ["S", "f", ["B", "A"]],
##                    #["A", "f", ["B", "A"]],
##                    #["S", "g", ["S"]],
##                    #["S", "f", ["S", "S"]],
##                    ["A", "g", ["B"]],
##                    ["B", "g", ["A"]]]
##tg5.alphabet = {p[1]:len(p[2]) for p in tg2.productions}
##tg5.costs = {'f':(lambda x: max(x)), 'g':(lambda x: x[0] + 1), 'a':(lambda x:1)}
###test5 = tg5.EnumerateStrings(5)
##tg5.checkCostFrequency(test5['S'], {2:2, 3:2, 4:1})
##

#### Test Grammar 4: Nand ######
# The grammar class has no productions without a function symbol, so the
# variable and constant nonterminals are replaced by one zero-argument
# production per variable and per constant.
nand = cfg.CFG()
nand.root = "Start"
nand.nonterminals = ["Start", "StartAnd"]
nand.productions = (
    list(["Start", leaf, []]
         for leaf in ("a", "b", "c", "d", "true", "false"))
    + [["Start", "not", ["StartAnd"]],
       ["StartAnd", "and", ["Start", "Start"]]]
)
# symbol -> number of arguments, read off the productions
nand.alphabet = dict((rule[1], len(rule[2])) for rule in nand.productions)
# every symbol costs 1 plus the sum of its argument costs
nand.costs = dict((sym, (lambda x: 1 + sum(x))) for sym in nand.alphabet)
#nand.EnumerateStrings(10)

#####Test Grammar 5: ITE ############
iteg = cfg.CFG()
iteg.root = "Start"
iteg.nonterminals = ["Start", "BoolExpr"]
iteg.productions = [["Start", "0", []], ["Start", "1", []], ["Start", "2", []],
Ejemplo n.º 9
0
def parseString(filestring):
    """Build a grammar object from the text of a synthesis problem.

    The text is turned into nested lists by ``parse_parentheses`` and
    ``removeExcess``. The first top-level entry whose head is
    ``"synth-fun"`` supplies the grammar: its fifth element is read as a
    list of nonterminal definitions ``[name, type, [productions...]]``.

    Production shapes handled:
      * ``[symbol, type]`` with the nonterminal's own type: typed
        constant/variable, recorded with arity 0;
      * a string naming a nonterminal (``A -> B``): B's productions are
        appended to A's;
      * any other string: constant, recorded with arity 0;
      * ``[symbol, arg, ...]``: function application. A nested application
        used as an argument is replaced by a fresh helper nonterminal whose
        only production is that nested application.

    Returns:
        A ``cfg.CFG`` with ``root``, ``nonterminals``, ``alphabet``
        (symbol -> arity) and ``productions`` (``[lhs, symbol, [args]]``)
        filled in, or ``None`` when the ``synth-fun`` entry has fewer than
        five elements (no grammar given).

    Raises:
        AssertionError: no ``synth-fun`` entry exists, a symbol is used with
            two different arities, or a production has an unsupported shape.
    """
    parsed_file = parse_parentheses(filestring)
    parsed_file = removeExcess(parsed_file)

    # Positions of "define-fun" entries seen before the synth-fun entry.
    # TODO: these are collected but not handled yet.
    defined_funcs = []
    synth_pos = -1
    for idx, entry in enumerate(parsed_file):
        if isinstance(entry, list):
            if entry[0] == "synth-fun":
                synth_pos = idx
                break
            if entry[0] == "define-fun":
                defined_funcs.append(idx)
    assert synth_pos != -1

    synthG = cfg.CFG()
    synthfunc = parsed_file[synth_pos]
    if len(synthfunc) < 5:  # grammar isn't specified
        return None
    nont_defs = synthfunc[4]
    synthG.root = nont_defs[0][0]
    # All declared names must be known before productions are classified.
    synthG.nonterminals = [nontDef[0] for nontDef in nont_defs]

    # nont_defs grows while it is iterated: helper definitions created for
    # nested applications are appended and then processed by this same loop.
    for nontDef in nont_defs:
        # nontDef[2] may grow during iteration too (A -> B expansion).
        for prod in nontDef[2]:
            # production is just defining a typed constant / variable
            if isinstance(prod, list) and len(prod) == 2 and prod[1] == nontDef[1]:
                synthG.alphabet[prod[0]] = 0
                synthG.productions.append([nontDef[0], prod[0], []])
            # production is a constant or a single nonterminal
            elif isinstance(prod, str):
                if prod in synthG.nonterminals:
                    # A -> B: append B's right-hand sides to A's. The last
                    # definition named B wins, as in a plain scan.
                    B_rhs = None
                    for other in nont_defs:
                        if other[0] == prod:
                            B_rhs = other[2]
                    assert B_rhs is not None
                    # Copy first so the extension works on a snapshot.
                    nontDef[2] += copy.copy(B_rhs)
                else:
                    synthG.alphabet[prod] = 0
                    synthG.productions.append([nontDef[0], prod, []])
            # production is a function application
            elif isinstance(prod, list):
                assert len(prod) > 1 and isinstance(prod[0], str)
                arity = len(prod) - 1
                if prod[0] not in synthG.alphabet:
                    synthG.alphabet[prod[0]] = arity
                else:
                    assert synthG.alphabet[prod[0]] == arity, "Multiple arities found in '" + str(prod[0]) + "'"

                args = []
                for p in prod[1:]:
                    # Nested application, e.g. S -> g(g(A)): introduce a
                    # helper nonterminal B so that S -> g(B) and B -> g(A).
                    if isinstance(p, list):
                        newNT = "[X[X" + nontDef[0] + str(p) + "X]X]"
                        # The name encodes owner and expression, so an equal
                        # name means the helper is already registered.
                        if newNT not in synthG.nonterminals:
                            synthG.nonterminals.append(newNT)
                            # The type of the helper is not known here.
                            nont_defs.append([newNT, "Unknown", [p]])
                        # The production must reference the helper, not the
                        # raw nested list.
                        p = newNT
                    # argument is a nonterminal or a terminal
                    elif isinstance(p, str):
                        if p not in synthG.nonterminals:
                            if p in synthG.alphabet:
                                assert synthG.alphabet[p] == 0
                            synthG.alphabet[p] = 0
                            if p[:1].isupper():
                                print("uppercase terminal '" + p + "' in " + str(prod))
                    args.append(p)
                synthG.productions.append([nontDef[0], prod[0], args])
            else:
                assert False, "no production type matches " + str(prod)
    return synthG
Ejemplo n.º 10
0
        else:
            print_help()
    else:
        print_help()
    return path, main


if __name__ == "__main__":

    path, main_func = get_op()
    sym_tab = build_symtab(path)
    with open(path, 'r') as f:
        lines = f.readlines()
    _cfg_ = {}
    for key in sym_tab.keys():
        _cfg_[key] = cfg.CFG(
            lines[sym_tab[key]["lines"][1]:sym_tab[key]["lines"][2]], key)

    assert main_func in _cfg_.keys(), ("Function %s not found!" % main_func)

    argument = []
    for i in sym_tab['foo']['decl'].get_args():
        key = i.get_name()
        tmp_l = float(input("Lower bound of %s << " % key))
        tmp_r = float(input("Upper bound of %s << " % key))
        argument.append([tmp_l, tmp_r, key])
    cg = FCG(_cfg_, main_func, sym_tab)
    cg.set_entry_range(argument)

    flags = [False for i in cg.get_constraint_nodes()]
    updated = widen(cg, cg.get_entry_nodes(), flags, False)
    while updated:
Ejemplo n.º 11
0
def _load_json_file(path):
	"""Return the decoded JSON content of the file at *path*."""
	# The with-block closes the file even when reading or decoding fails.
	with open(path, "r") as handle:
		return json.loads(handle.read())


def main():
	"""Run the whole analysis on the binary currently loaded in IDA.

	Steps: load the binary into angr, decide between PE and ELF, run
	``InforExtraction.main``, read the JSON files ``vftable``, ``vbtable``
	(PE) or ``VTT`` (ELF), ``ctor`` and ``symbol`` from the working
	directory, build a CFG from the constructor/destructor addresses, run
	the overwrite analysis, and finally build, print and draw the
	inheritance tree.
	"""
	# For command-line mode, enable the block below and comment out the IDA
	# python calls; InforExtraction then runs on its own as an IDA script and
	# the rest runs outside IDA. Useful when debugging.
	#
	# try:
	# 	options,args = getopt.getopt(sys.argv[1:],"hf:", ["help","file="])
	# except getopt.GetoptError:
	# 	sys.exit()
	# binary = None
	# for name,value in options:
	# 	if name in ("-h","--help"):
	# 		usage()
	# 		sys.exit()
	# 	if name in ("-f","--file"):
	# 		binary = value
	# if binary == None:
	# 	usage()
	# 	sys.exit()
	print("[+]log: Start analysis")
	binary = idc.GetInputFilePath()

	isPIE = idc.GetDisasm(0)
	# Base address starts at 0.
	if len(isPIE) == 0:
		proj = angr.Project(binary, load_options={'auto_load_libs': False,'extern_size': 0x800000})
	# Base address is non-zero. Some ELF files need the base address forced
	# to 0, otherwise the addresses seen by IDA and by angr do not match.
	else:
		# In the latest version: custom_base_addr -> base_addr
		proj = angr.Project(binary, load_options={'main_opts':{'custom_base_addr':0},'auto_load_libs': False,'extern_size': 0x800000})

	isPE = proj.loader.all_pe_objects
	if len(isPE) == 0:
		filetype = "ELF"
	else:
		filetype = "PE"

	InforExtraction.main(filetype)

	# presumably these files are produced by InforExtraction.main -- confirm
	vftable_list = _load_json_file("vftable")

	# PE binaries come with a vbtable file, ELF binaries with a VTT file;
	# the one that does not apply is passed on as None.
	if filetype == "PE":
		vbtable_list = _load_json_file("vbtable")
		VTT_list = None
	elif filetype == "ELF":
		VTT_list = _load_json_file("VTT")
		vbtable_list = None
	ctor_list = _load_json_file("ctor")
	symbol_list = _load_json_file("symbol")

	# Build the ctor CFG.
	start = time.time()
	# Entry points: every constructor address plus every non-zero destructor
	# address; both are stored as hexadecimal strings.
	start_points = []
	for ctor_addr in ctor_list:
		start_points.append(int(ctor_addr,16))
	for vftable in vftable_list:
		if vftable_list[vftable]["dtor"] != 0:
			start_points.append(int(vftable_list[vftable]["dtor"],16))

	mycfg = cfg.CFG(proj=proj,start_points=start_points,symbol_list=symbol_list,thread_num=1)
	end = time.time()
	print("[+]log: Build ctor cfg completion. Time:%fs" % (end-start))

	print("[*]log: The number of analysis functions:%d" % len(mycfg.functions))

	# Run the overwrite analysis.
	start = time.time()
	myoverwrite = StaticTaintAnalysis.StaticTaintAnalysis(proj,mycfg,vftable_list,vbtable_list,VTT_list,ctor_list,symbol_list,filetype)
	end = time.time()
	print("[+]log: Overwrite analysis completion. Time:%fs" % (end-start))

	# Build the inheritance tree.
	start = time.time()
	inheritance_tree = HeuristicReasoning.HeuristicReasoning(proj,mycfg,myoverwrite.ctor_list,vftable_list,symbol_list)
	end = time.time()
	print("[+]log: Build inherTree completion. Time:%fs" % (end-start))

	inheritance_tree.statistics()
	print_CHT(inheritance_tree)
	inheritance_tree.draw()