Example #1
0
class StartTool(CmdLineTool):
    """
    Command line tool that runs one Gremlin query against the database.

    The query is produced by `_constructQuery` and its result is passed to
    `_handleResult`; both are hooks marked for overriding. `_runImpl`
    connects the two.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Build the query that is sent to the database.

        This base implementation returns None.
        """
        return None

    # @Override
    def _handleResult(self, res):
        """
        Consume `res`, the value the database returned for the query.

        This base implementation ignores it and returns None.
        """
        return None

    def _runImpl(self):
        """
        Build the query, connect to the database, run the query and pass
        the result to `_handleResult`.

        The interface object is kept on `self.dbInterface`.
        """
        gremlinQuery = self._constructQuery()

        interface = DBInterface()
        self.dbInterface = interface
        interface.connectToDatabase()

        queryResult = interface.runGremlinQuery(gremlinQuery)
        self._handleResult(queryResult)
Example #2
0
class StartTool(CmdLineTool):
    """
    Skeleton for tools that send a single Gremlin query to the database.

    `_constructQuery` and `_handleResult` are the hooks marked for
    overriding; `_runImpl` calls them around the database round trip.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructQuery(self):
        """
        Return the query to run. The default implementation returns None.
        """
        return None

    # @Override
    def _handleResult(self, res):
        """
        Process `res`, the outcome of the query. The default implementation
        does nothing.
        """
        return None

    def _runImpl(self):
        """
        Construct the query, open the database connection (stored on
        `self.dbInterface`), execute the query and hand the outcome to
        `_handleResult`.
        """
        pendingQuery = self._constructQuery()

        connection = DBInterface()
        self.dbInterface = connection
        connection.connectToDatabase()

        outcome = connection.runGremlinQuery(pendingQuery)
        self._handleResult(outcome)
Example #3
0
    def _runImpl(self):
        """
        Build the query via `_constructQuery`, connect to the database, run
        the query and pass its result to `_handleResult`.

        The database interface is stored on `self.dbInterface`.
        """
        gremlinQuery = self._constructQuery()

        interface = DBInterface()
        self.dbInterface = interface
        interface.connectToDatabase()

        queryResult = interface.runGremlinQuery(gremlinQuery)
        self._handleResult(queryResult)
Example #4
0
def initialize():
    """
    Read ``result.txt`` and connect to the project it names.

    The first line of the file is taken as the project name; every
    following line is kept as an id (trailing newline removed).

    Returns:
        A two-element list ``[db, idList]``: the connected DBInterface and
        the list of id strings.

    Raises:
        IndexError: if ``result.txt`` is empty, so there is no project name.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open('result.txt') as resultFile:
        idList = [line.rstrip('\n') for line in resultFile]

    # The first line is the project name, the remainder are the ids.
    projectName = idList.pop(0)

    # Connect to project DB
    db = DBInterface()
    db.connectToDatabase(projectName)

    return [db, idList]
Example #5
0
def getVisibleNodes(projectName):
    """
    Return the result of the id query for all vertices of `projectName`
    whose type is one of the visible statement types.

    IdentifierDeclStatement vertices that have a ForInit or StructUnionEnum
    vertex on an incoming AST edge are excluded by the query.
    """
    connection = DBInterface()
    connection.connectToDatabase(projectName)

    visibleStatementTypes = [
        'CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef',
        'CompoundStatement', 'DeclStmt', 'StructUnionEnum', 'TryStatement',
        'CatchStatement', 'IfStatement', 'ElseStatement', 'SwitchStatement',
        'ForStatement', 'DoStatement', 'WhileStatement', 'BreakStatement',
        'ContinueStatement', 'GotoStatement', 'Label', 'ReturnStatement',
        'ThrowStatement', 'ExpressionStatement', 'IdentifierDeclStatement',
        'PreIfStatement', 'PreElIfStatement', 'PreElseStatement',
        'PreEndIfStatement', 'PreDefine', 'PreUndef', 'PreDiagnostic',
        'PreOther', 'PreInclude', 'PreIncludeNext', 'PreLine', 'PrePragma',
        'UsingDirective', 'BlockCloser', 'Comment',
    ]

    # The list is interpolated through its Python repr, i.e. as a bracketed,
    # single-quoted list inside within(...).
    queryTemplate = """g.V().has('type', within(%s))
                .not(has('type', 'IdentifierDeclStatement').in(AST_EDGE).has('type', within('ForInit','StructUnionEnum')))
                .id()"""
    idQuery = queryTemplate % (visibleStatementTypes,)

    return connection.runGremlinQuery(idQuery)
Example #6
0
    def _runImpl(self):
        """
        Connect to the database and process all ids chunk by chunk.

        After `_start`, the ids returned for `_constructIdQuery` are split
        into chunks of CHUNK_SIZE by the interface's `chunks`; for each
        chunk the query from `_constructQueryForChunk` is run and its result
        is passed to `_handleChunkResult`. `_stop` is called at the end.
        """
        interface = DBInterface()
        self.dbInterface = interface
        interface.connectToDatabase()

        self._start()

        idQuery = self._constructIdQuery()
        allIds = interface.runGremlinQuery(idQuery)

        for idChunk in interface.chunks(allIds, CHUNK_SIZE):
            chunkQuery = self._constructQueryForChunk(idChunk)
            chunkResult = interface.runGremlinQuery(chunkQuery)
            self._handleChunkResult(chunkResult, idChunk)

        self._stop()
Example #7
0
    def _runImpl(self):
        """
        Run the query returned by `_constructQuery` against the database and
        forward the outcome to `_handleResult`.

        The connection object is kept on `self.dbInterface`.
        """
        pendingQuery = self._constructQuery()

        connection = DBInterface()
        self.dbInterface = connection
        connection.connectToDatabase()

        outcome = connection.runGremlinQuery(pendingQuery)
        self._handleResult(outcome)
Example #8
0
class ChunkStartTool(CmdLineTool):
    """
    Command line tool that processes database ids in chunks.

    `_runImpl` fetches ids with the query from `_constructIdQuery`, splits
    them into chunks of CHUNK_SIZE, runs the query from
    `_constructQueryForChunk` for each chunk and passes the result to
    `_handleChunkResult`. `_start` and `_stop` bracket the whole run. All of
    these hooks do nothing in this base class.
    """

    def __init__(self, DESCRIPTION):
        CmdLineTool.__init__(self, DESCRIPTION)

    # @Override
    def _constructIdQuery(self):
        """Return the query whose result is the collection of ids to chunk."""
        pass

    # @Override
    def _constructQueryForChunk(self, chunk):
        """Return the query to run for one `chunk` of ids."""
        pass

    # @Override
    def handleChunkResult(self, res, chunk):
        """
        Legacy spelling of the per-chunk result hook.

        Kept so subclasses that override this name are still called; see
        `_handleChunkResult`.
        """
        pass

    # @Override
    def _handleChunkResult(self, res, chunk):
        """
        Process `res`, the result of the query that was run for `chunk`.

        This is the name `_runImpl` calls. Previously only
        `handleChunkResult` (no leading underscore) was declared, so the
        call raised AttributeError unless a subclass defined this method.
        The default forwards to the legacy name.
        """
        return self.handleChunkResult(res, chunk)

    # @Override
    def _start(self):
        """Called once after connecting, before any query is run."""
        pass

    def _stop(self):
        """Called once after the last chunk has been handled."""
        pass

    def _runImpl(self):
        """
        Connect to the database (stored on `self.dbInterface`) and drive the
        chunked processing described in the class docstring.
        """
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase()

        self._start()

        query = self._constructIdQuery()
        ids = self.dbInterface.runGremlinQuery(query)

        for chunk in self.dbInterface.chunks(ids, CHUNK_SIZE):
            query = self._constructQueryForChunk(chunk)
            res = self.dbInterface.runGremlinQuery(query)
            self._handleChunkResult(res, chunk)

        self._stop()
Example #9
0
    def run(self):
        """
        Run a Gremlin script against the project database and print each
        result.

        The script is read from ``self.args.file`` when that is set, and
        from standard input otherwise. It is passed through `_parseScript`
        and the returned lines are joined with newlines to form the query.
        JSON is disabled on the interface when ``self.args.no_json`` is set.

        Every element of the result is printed with `repr` when
        ``self.args.raw`` is set, pretty-printed when ``self.args.pretty``
        is set, and printed plainly otherwise. Finally the query ``"quit"``
        is sent to the database.
        """
        # Identity comparison is the correct test for None.
        if self.args.file is not None:
            f = open(self.args.file, "r")
        else:
            f = sys.stdin

        try:
            lines = __class__._parseScript(f)
            # Join while the stream is still open, in case _parseScript
            # yields its lines lazily.
            query = "\n".join(lines)
        finally:
            # Close only what was opened here; stdin is left alone.
            if f is not sys.stdin:
                f.close()

        db = DBInterface()
        if self.args.no_json:
            db.disable_json()
        db.connectToDatabase(self.args.project)

        result = db.runGremlinQuery(query)
        pp = pprint.PrettyPrinter(indent=4, compact=True)
        for x in result:
            if self.args.raw:
                print(repr(x))
            elif self.args.pretty:
                pp.pprint(x)
            else:
                print(x)
        db.runGremlinQuery("quit")
class APIEmbedder(object):
    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        self.outputDirectory = directory

    def run(self, tfidf=True):
        try:
            # Will throw error if output directory already exists
            self._initializeOutputDirectory()
        except:
            return
        self._connectToDatabase()
        functions = self._getAPISymbolsFromDatabase()
        featureArray = self._createFeatureArray(functions)
        self._finalizeOutputDirectory()
        self.termDocMatrix = self._createTermDocumentMatrix(featureArray)
        if tfidf:
            self.termDocMatrix.tfidf()
        self._outputInLIBSVMFormat(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _initializeOutputDirectory(self):
        directory = self.outputDirectory
        if os.path.exists(directory):
            raise
        os.makedirs(directory)
        self.tocFilename = os.path.join(directory, 'TOC')
        self.toc = open(self.tocFilename, 'w')

    def _finalizeOutputDirectory(self):
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):

        CHUNK_SIZE = 1024

        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []

        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))

            result.extend(self._runGremlinQuery(query))

        return result

    def chunks(self, l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)

    def _createFeatureArray(self, functions):

        featureArray = FeatureArray()
        for index, (funcId, symbols) in enumerate(functions):
            for i in range(len(symbols)):
                symbols[i] = symbols[i] + '\n'
            featureArray.add(index, symbols)  #label,items
            self.toc.write("%d\n" % (funcId))
        self.toc.flush()
        return featureArray

    def _createTermDocumentMatrix(self, featureArray):
        converter = FeatureArrayToMatrix()
        return converter.convertFeatureArray(featureArray)

    def _outputInLIBSVMFormat(self, directory):

        from scipy.sparse import csc_matrix

        if self.termDocMatrix.matrix == None: return

        m = csc_matrix(self.termDocMatrix.matrix)
        nCols = m.shape[1]

        outFilename = os.path.join(directory, 'embedding.libsvm')
        outFile = open(outFilename, 'w')

        for i in range(nCols):
            label = self.termDocMatrix.index2Doc[i]

            col = m.getcol(i)
            entries = [(i, col[i, 0]) for i in col.indices]
            entries.sort()
            features = " ".join(['%d:%f' % e for e in entries])
            row = '%s %s #%s\n' % (label, features, label)
            outFile.write(row)

        outFile.close()
Example #11
0
#!/usr/bin/env python3

from octopus.server.DBInterface import DBInterface

projectName = 'android.tar.gz'
query = "queryNodeIndex('type:Function').id"

db = DBInterface()
db.connectToDatabase(projectName)

ids = db.runGremlinQuery(query)

CHUNK_SIZE = 256
LOCATION = '/home/sid/RABBIT_HOLE/CODE_ANALYSIS/joern/projects/octopus/data/projects/'
for chunk in db.chunks(ids, CHUNK_SIZE):

    query = """
        getCallsToRegex(".*read(Int|Uint)(32|64)")
        .statements()
        .out("REACHES")
        .has("code",textRegex(".*(malloc|memcpy).*"))
        .functions()
        .functionToLocationStr()
    """

    query2 = """
       getNodesWithTypeAndName(TYPE_FUNCTION, '*onTransact*')
       .out(FUNCTION_TO_AST_EDGE)
       .getArguments('(memcpy OR malloc)', '2')
       .out(USES_EDGE)
       .filter{
Example #12
0
class APIEmbedder(object):
    """
    Dump the API symbols of all functions in the database and embed them.

    `run` creates the output directory with a ``TOC`` file (one function id
    per line) and a ``data`` directory (one file of symbols per function,
    named by a running number), then hands the directory to an Embedder.
    """

    def __init__(self):
        self._initializeDBConnection()

    def _initializeDBConnection(self):
        """Create the database interface; connecting happens in `run`."""
        self.dbInterface = DBInterface()

    def setOutputDirectory(self, directory):
        """Set the directory that `run` creates and writes into."""
        self.outputDirectory = directory

    def run(self):
        """
        Write all data points and run the embedder on the output directory.

        Returns silently without doing anything when the output directory
        cannot be initialized (in particular when it already exists).
        """
        try:
            # Raises if the output directory already exists
            self._initializeOutputDirectory()
        except Exception:
            # Deliberately best-effort, as before, but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare ``except:`` did.
            return

        try:
            self._connectToDatabase()

            functions = self._getAPISymbolsFromDatabase()
            self._writeDataPoints(functions)
        finally:
            # Close the TOC file even when a query or a write fails.
            self._finalizeOutputDirectory()

        self._embed()

    def _embed(self):
        # self.embedder = SallyBasedEmbedder()
        self.embedder = Embedder()
        self.embedder.embed(self.outputDirectory)

    def _connectToDatabase(self):
        self.dbInterface.connectToDatabase()

    def _writeDataPoints(self, functions):
        """
        For each ``(funcId, symbols)`` pair, append the id to the TOC file
        and write the symbols to the next data point file.
        """
        for (funcId, symbols) in functions:
            self.toc.write("%d\n" % (funcId))
            self._addDataPoint(symbols)

    def _addDataPoint(self, symbols):
        """
        Write `symbols`, one per line, to the file named after the current
        data point number inside the data directory, then advance the
        number.
        """
        datapointFilename = os.path.join(self.dataDir, str(self.curDatapoint))
        # ``open`` works on Python 2 and 3 (``file`` is Python 2 only); the
        # context manager closes the file even if a write fails.
        with open(datapointFilename, 'w') as f:
            f.writelines([x + "\n" for x in symbols])
        self.curDatapoint += 1

    def _initializeOutputDirectory(self):
        """
        Create the output directory with its ``data`` subdirectory, open the
        ``TOC`` file for writing and reset the data point counter to 0.

        :raises RuntimeError: if the output directory already exists.
        """
        directory = self.outputDirectory

        if os.path.exists(directory):
            # A bare ``raise`` without an active exception also produced a
            # RuntimeError on Python 3, but with an unrelated message.
            raise RuntimeError(
                "output directory already exists: %s" % (directory,))

        self.dataDir = os.path.join(directory, 'data')
        self.tocFilename = os.path.join(directory, 'TOC')
        os.makedirs(self.dataDir)
        self.toc = open(self.tocFilename, 'w')

        self.curDatapoint = 0

    def _finalizeOutputDirectory(self):
        """Close the ``TOC`` file."""
        self.toc.close()

    def _getAPISymbolsFromDatabase(self):
        """
        Query the API symbols of every function, 1024 function ids at a
        time.

        :returns: a list with the concatenated results of all chunk queries;
                  each query is written to return ``[funcId, symbols]``
                  pairs.
        """
        CHUNK_SIZE = 1024

        query = """queryNodeIndex('type:Function').id"""
        functionIds = self._runGremlinQuery(query)

        result = []

        for chunk in self.chunks(functionIds, CHUNK_SIZE):
            query = """
            _().transform{ %s }.scatter().transform{g.v(it)}
            .sideEffect{funcId = it.id}
            .transform{ [funcId, it.functionToAPISymbolNodes().code.toList()] }
            """ % (str(chunk))

            result.extend(self._runGremlinQuery(query))

        return result

    def chunks(self, l, n):
        """Yield consecutive slices of `l`, each at most `n` items long."""
        # ``range`` exists on Python 2 and 3; ``xrange`` is Python 2 only.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def _runGremlinQuery(self, query):
        return self.dbInterface.runGremlinQuery(query)
Example #13
0
# This is a collection of various queries.
# NOTE: every `query = ...` line below overwrites the previous one, so only
# the last assignment is in effect for whatever code follows.

# Define the target project (uncomment exactly one alternative)
#projectName = 'SPLC'
#projectName = 'Linux3'
#projectName = 'EvoDiss.tar.gz'
#projectName = 'JoernTest.tar.gz'
#projectName = 'Linux.tar.gz'
#projectName = 'Collection'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'PV_Current.tar.gz'
#projectName = 'DonorProject'
projectName = 'Ag'
# Connect to the database of the project
db = DBInterface()
db.connectToDatabase(projectName)

##### Normal Gremlin queries #####

# Get the vertex with id 147512
query = "g.V(147512)"
# Get the code of vertex 4256
query = "g.V(4256).values('code')"
# Get the code of all vertices of type Function
query = "g.V().has('type', 'Function').values('code')"
# Get the code of all File vertices
query = "g.V().has('type', 'File').values('code')"
# Get the ids of all Argument vertices
query = "g.V().has('type', 'Argument').id()"
# Get all code vertices of a function
Example #14
0
#!/usr/bin/env python3
from octopus.server.DBInterface import DBInterface

# Connect to project DB
projectName = 'EvoDiss.tar.gz'
db = DBInterface()
db.connectToDatabase(projectName)

# Each section below runs one id query and stores its result, rendered with
# str(), as a "# <result> <description>" line in `result`.

# Directory vertex of the src directory (matched by its absolute path)
query = "g.V().has('type', 'Directory').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src').id()"
result = ["# " + str(db.runGremlinQuery(query))  + " Directory src"]

# File vertex of C_Test.c (matched by its absolute path)
query = "g.V().has('type', 'File').has('code', '/home/lea/Downloads/Joern_Advanced/projects/octopus/data/projects/EvoDiss.tar.gz/src/home/lea/Downloads/EvoDiss/src/C_Test.c').id()"
result.append("# " + str(db.runGremlinQuery(query))  + " File C_Test.c")

# FunctionDef vertices whose code contains 'compareResults'
query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).id()"
result.append("# " + str(db.runGremlinQuery(query))  + " FunctionDef compareResults")

# functionId property of the first matching FunctionDef; it is used to
# select the statements of that function below.
# NOTE: `id` shadows the builtin id() for the rest of this script.
# NOTE(review): the [0] presumably fails if nothing matches - confirm what
# runGremlinQuery returns for an empty result.
query = "g.V().has('type', 'FunctionDef').has('code', textContains('compareResults')).values('functionId')"
id = db.runGremlinQuery(query)[0]

# IfStatement vertices with that functionId
query = "g.V().has('type', 'IfStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query))  + " IfStatements in compareResults")

# ElseStatement vertices with that functionId
query = "g.V().has('type', 'ElseStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query))  + " ElseStatement in compareResults")

# ForStatement vertices with that functionId
query = "g.V().has('type', 'ForStatement').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query))  + " ForStatement in compareResults")

# Condition vertices with that functionId
query = "g.V().has('type', 'Condition').has('functionId', '%s').id()" % (id)
result.append("# " + str(db.runGremlinQuery(query))  + " Conditions in compareResults")
Example #15
0
class ProgramGraph(JoernTool):
    """
    Tool that outputs the graph of a function in graphviz form.

    Each processed line is interpreted as a function id. The nodes and edges
    reachable from that function over the selected edge labels are fetched
    from the database, added to a directed pygraphviz graph with the labels
    and layout given by the plot configuration, and written to the output.
    """

    def __init__(self, DESCRIPTION):
        JoernTool.__init__(self, DESCRIPTION)

    # @Override
    def processLine(self, line):
        """
        Build and output the graph for the function whose id is `line`.

        The plot configuration is re-read from ``self.args.plot_config`` on
        every call.
        """
        self.plot_configuration = PlotConfiguration()
        # Close the configuration file once it has been parsed instead of
        # leaking one handle per call.
        # NOTE(review): assumes parse() reads the file eagerly - confirm.
        with open(self.args.plot_config, "r") as f:
            self.plot_configuration.parse(f)

        labels = self._getLabels()
        nodes = self._getNodes(int(line), labels)
        edges = self._getEdges(int(line), labels)

        G = pgv.AGraph(directed=True, strict=False)
        self._addNodes(G, nodes)
        self._addEdges(G, edges)
        self._outputGraph(G, line)

    def streamStart(self):
        """Connect to the database of the project in ``self.args.project``."""
        self.dbInterface = DBInterface()
        self.dbInterface.connectToDatabase(self.args.project)

    def _addNodes(self, G, nodes):
        """
        Add every node to `G` with the layout from the plot configuration.

        The 'label' property is only set when the generated label is not
        empty.
        """
        for v in nodes:
            nr = NodeResult(v)

            label = self._createGraphElementLabel(
                self.plot_configuration.getElementDisplayItems(nr))
            plot_properties = self.plot_configuration.getElementLayout(nr)
            if label:
                plot_properties['label'] = label
            G.add_node(nr.getId(), **plot_properties)

    def _addEdges(self, G, edges):
        """
        Add every edge to `G` with the layout from the plot configuration.

        Unlike for nodes, the 'label' property is always set, even when the
        generated label is empty.
        """
        for e in edges:
            er = EdgeResult(e)
            label = self._createGraphElementLabel(
                self.plot_configuration.getElementDisplayItems(er))
            plot_properties = self.plot_configuration.getElementLayout(er)
            plot_properties['label'] = label
            G.add_edge(er.getSrc(), er.getDest(), er.getId(),
                       **plot_properties)

    def _createGraphElementLabel(self, labeldata):
        """
        Join `labeldata`, an iterable of iterables, into one label: the
        escaped items of each inner iterable are joined with ':', and the
        resulting rows with newlines.
        """
        return "\n".join(
            ":".join(self._escape(e) for e in d) for d in labeldata)

    def _escape(self, label):
        """Return `label` as a string with every backslash doubled."""
        return str(label).replace("\\", "\\\\")

    def _outputGraph(self, G, identifier):
        """
        Output `G` preceded by a ``//<identifier>`` line and followed by a
        ``//###`` line.
        """
        outputString = '//' + identifier + '\n'
        outputString += str(G) + '\n'
        outputString += '//###' + '\n'
        self.output(outputString)

    def _getLabels(self):
        """
        Return the edge labels to follow, as selected by the command line
        arguments: all four when ``show_all`` is set, otherwise FLOWS_TO for
        ``show_control_flow``, USE and DEF for ``show_data_flow`` and
        IS_AST_PARENT for ``show_ast``.
        """
        if self.args.show_all:
            return ["FLOWS_TO", "USE", "DEF", "IS_AST_PARENT"]

        labels = []
        if self.args.show_control_flow:
            labels.append("FLOWS_TO")
        if self.args.show_data_flow:
            labels.extend(["USE", "DEF"])
        if self.args.show_ast:
            labels.append("IS_AST_PARENT")

        return labels

    def _getStartNode(self, functionId):
        """
        Return the Gremlin traversal prefix that selects the start nodes.

        The function is looked up by its '_key' property when
        ``self.args.id_property`` is set and by vertex id otherwise; from
        there the traversal steps over the IS_FUNCTION_OF_CFG and
        IS_FUNCTION_OF_AST edges.
        """
        if self.args.id_property:
            startnode = """g.V().has('type', 'Function').has('_key', {})""".format(
                functionId)
        else:
            startnode = """g.V({})""".format(functionId)
        startnode += """.union( out('IS_FUNCTION_OF_CFG'), out('IS_FUNCTION_OF_AST') )"""
        return startnode

    def _quoteLabels(self, labels):
        """Return `labels` as a comma separated list of single-quoted names."""
        return ','.join("'{}'".format(label) for label in labels)

    def _getNodes(self, functionId, labels):
        """
        Return the vertices of the subgraph reachable from the start nodes
        of `functionId` over edges with one of `labels`.
        """
        query = """
            {}.repeat(outE({}).subgraph('sg').inV().dedup().simplePath()).cap('sg').next().traversal().V()
            """.format(self._getStartNode(functionId),
                       self._quoteLabels(labels))
        return self._runGremlinQuery(query)

    def _getEdges(self, functionId, labels):
        """
        Return the edges of the subgraph reachable from the start nodes of
        `functionId` over edges with one of `labels`.
        """
        query = """
            {}.repeat(outE({}).subgraph('sg').inV().simplePath()).cap('sg').next().traversal().E()
            """.format(self._getStartNode(functionId),
                       self._quoteLabels(labels))
        return self._runGremlinQuery(query)
Example #16
0
 def streamStart(self):
     """
     Create the database interface, keep it on `self.dbInterface` and
     connect it to the project given in `self.args.project`.
     """
     interface = DBInterface()
     self.dbInterface = interface
     interface.connectToDatabase(self.args.project)
Example #17
0
custom = False
evaluation = False
###############################################

# Connect to project DB
#projectName = 'EvoDiss.tar.gz'
#projectName = 'Revamp'
#projectName = 'JoernTest.tar.gz'
#projectName = 'SPLC'
#projectName = 'ICSE'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'Collection'
projectName = 'DonorProject'
#projectName = 'PV_Current.tar.gz'
db = DBInterface()

####################################### Plotting ###############################################
result = set()
resultIDs = set()

customStatementTypes = [
    'CustomNode', 'ClassDef', 'FunctionDef', 'CompoundStatement', 'DeclStmt',
    'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement',
    'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement',
    'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label',
    'ReturnStatement', 'ThrowStatement', 'ExpressionStatement',
    'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement',
    'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef',
    'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext',
    'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Symbol',
 def _initializeDBConnection(self):
     """Create a DBInterface and keep it on `self.dbInterface`."""
     interface = DBInterface()
     self.dbInterface = interface
Example #19
0
    filename = 'SemanticUnit.dot'
    
    #Write to file
    print("Making of graph finished, creating "+filename+" ...")
    print("--------------------------------------------------------------------------------- \n")

    file = open("SemanticUnit/SemanticUnit.dot", 'w')
    file.write(outputString)
    file.close()
    
    # Use terminal output to convert .dot to .png
    os.system("dot -Tpng 'SemanticUnit/SemanticUnit.dot' -o 'SemanticUnit/SemanticUnit.png'")
    #Print status update
    print("Creation of plot was successfull!")    
    
    
################################################### Start of program #################################################################
# Initialize the DB interface (not connected yet)
db = DBInterface()

# Input of entry points
# `console`, `consoleInput`, `projectName` and `identifySemanticUnits` are
# defined outside this excerpt.
if (console):
    # NOTE(review): this branch does not call db.connectToDatabase itself;
    # presumably consoleInput() does - confirm.
    consoleInput()
else: 
    # projectName must be set manually
    db.connectToDatabase(projectName)
    print("Project is set to: "+projectName)

# Start identification process
identifySemanticUnits() 
    
Example #20
0
# Paths from a File node to the identifier of each kind of declaration:
#File -> #PreDefine -> PreMacroIdentifier -> Identifier
#File -> Function -IS_FUNCTION_OF_AST-> #FunctionDef -> Identifier
#File -> #StructUnionEnum -> Identifier
#File -> #DeclStatement -Declares-> Decl (first word is the identifier?)

# List of all types that can use identifiers to do something (sorted by declarations)
#Function: FunctionDef and CallExpression (need parent ExpressionStatement). Declares.
#Macro: MacroDef and Callee, or anywhere where we can identify a PreMacroIdentifier?
#Declares: ?
#Enum:?

prefix = "semanticUnit__"

print("Adding prefixes...")
# Connect to the SU project
db = DBInterface()
db.connectToDatabase("EvoDiss.tar.gz")

# Get the names of all functions
query = """g.V().has('type', 'FunctionDef').out('IS_AST_PARENT').has('type', 'Identifier').values('code').as('function')"""
functions = db.runGremlinQuery(query)

# Get the names of all macros
query = """g.V().has('type', 'PreDefine').out('IS_AST_PARENT').has('type','PreMacroIdentifier').values('code').as('macro')"""
macros = db.runGremlinQuery(query)

# Get the names of all declarations that can be declared on file scope
query = """g.V().has('type', 'DeclStmt').out('DECLARES').has('type', 'Decl').values('identifier').as('declaration')"""
declarations = db.runGremlinQuery(query)

# Get the names of all StructUnionEnums
Example #21
0
File: plotDB.py Project: LPhD/Jess
# Select the project and create the DB interface.
# The interface is only created here; db.connectToDatabase(projectName) is
# called later, in plotResults().
#projectName = 'EvoDiss.tar.gz'
#projectName = 'Revamp'
#projectName = 'JoernTest.tar.gz'
#projectName = 'SPLC'
#projectName = 'ICSE'
#projectName = 'expat'
#projectName = 'sample'
#projectName = 'Collection'
# NOTE: each assignment below overwrites the previous one, so only the last
# value ('Test3') is in effect.
projectName = 'DonorProject'
projectName = 'grep'
projectName = 'Test'
projectName = 'Example'
projectName = 'Test3'
#projectName = 'PV_Current.tar.gz'
db = DBInterface()

####################################### Plotting ###############################################   
result = set()
resultIDs = set()
 
customStatementTypes = ['CustomNode', 'ClassDef', 'FunctionDef', 'CompoundStatement', 'DeclStmt', 'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement', 'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement', 'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label', 'ReturnStatement', 'ThrowStatement', 'ExpressionStatement', 'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement', 'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef', 'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext', 'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Symbol', 'CFGEntryNode', 'CFGExitNode', 'Comment']
cNodeIDs = set()

visibleStatementTypes = ['CustomNode', 'ClassDef', 'DeclByClass', 'DeclByType', 'FunctionDef', 'CompoundStatement', 'DeclStmt', 'StructUnionEnum', 'FunctionPointerDeclare', 'TryStatement', 'CatchStatement', 'IfStatement', 'ElseStatement', 'SwitchStatement', 'ForStatement', 'DoStatement', 'WhileStatement', 'BreakStatement', 'ContinueStatement', 'GotoStatement', 'Label', 'ReturnStatement', 'ThrowStatement', 'ExpressionStatement', 'IdentifierDeclStatement', 'PreIfStatement', 'PreElIfStatement', 'PreElseStatement', 'PreEndIfStatement', 'PreDefine', 'PreUndef', 'MacroCall', 'PreDiagnostic', 'PreOther', 'PreInclude', 'PreIncludeNext', 'PreLine', 'PrePragma', 'UsingDirective', 'BlockCloser', 'Comment', 'File', 'Directory']


 
# Plots the results    
def plotResults ():
    db.connectToDatabase(projectName)