コード例 #1
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def loadLabelFromFile(fileName):
    """Load a cluster name and its parameters from a label or metadata file.

    Args:
        fileName: Path of the file to parse. A path containing ".label" is
            treated as a label file; any other path is treated as a
            metadata file.

    Returns:
        A (cluster, params) tuple:
          * missing file: ("", {}) after printing a warning;
          * label file: (first line of the file, {});
          * metadata file containing "Ident": ("Ident", {});
          * any other metadata file: cluster is the "_"-joined values of all
            tokens containing "Transform" and params maps the remaining
            "key=value" tokens to their values, skipping tokens that
            contain "Functions" or "out".
    """
    cluster, params = "", {}
    if not os.path.exists(fileName):
        prettyPrint("Could not find \"%s\". Skipping" % fileName, "warning")
        return cluster, params

    # Read through a context manager so the handle is always released
    with open(fileName) as labelFile:
        rawData = labelFile.read()

    if fileName.find(".label") != -1:
        # It's a label file: the cluster is its first line, taken verbatim
        return rawData.split('\n')[0], {}

    # It's a metadata file
    if rawData.find("Ident") != -1:
        return "Ident", {}

    # Drop the first and last characters (presumably the enclosing brackets)
    # and parse the comma-separated tokens
    for token in rawData[1:-1].split(","):
        if token.find("Transform") != -1:
            cluster = cluster + token.split("=")[1].replace("'", "") + "_"
        elif token.find("=") != -1 and token.find("Functions") == -1 and token.find("out") == -1:
            key = token.split('=')[0].replace('-', '').replace("'", "")
            value = token.split('=')[1].replace("'", "")
            params[key] = value

    # Clip any trailing underscore. endswith() is safe when no "Transform"
    # token was found, whereas indexing cluster[-1] raised IndexError on "".
    if cluster.endswith("_"):
        cluster = cluster[:-1]

    return cluster, params
コード例 #2
0
def loadArgumentsFromKLEE(fileName):
    """Parse a KLEE testcase, save its arguments to a file, and return them.

    A row that contains "arg" (but neither "n_args" nor "--sym-args") marks
    the next row containing "data: " as an argument value. Each value has its
    single quotes removed and its escape sequences decoded.

    Args:
        fileName: Path of the KLEE testcase text file.

    Returns:
        A (args, inputFileName) tuple, where args is the list of decoded
        arguments and inputFileName is fileName with ".txt" replaced by
        ".input"; the arguments are written to that file, each followed by a
        space. ([], "") is returned, and nothing is written, when the
        testcase file is empty.
    """
    with open(fileName, "rb") as testcase:
        rawContent = testcase.read()
    # Test the raw content: "".split("\n") yields [""], so checking the
    # length of the split rows could never detect an empty file
    if len(rawContent) < 1:
        prettyPrint("KLEE testcase file is empty", "warning")
        return [], ""

    args = []
    argFlag = False  # True while the value of an argument is still expected
    for row in rawContent.split("\n"):
        if row.find("arg") != -1 and row.find("n_args") == -1 and row.find("--sym-args") == -1:
            argFlag = True
        elif row.find("data: ") != -1 and argFlag:
            argIndex = row.find("data: ") + len("data: ")
            # NOTE: "string_escape" is a Python 2 codec
            args.append(row[argIndex:].replace("'", "").decode("string_escape"))
            argFlag = False

    # Now write arguments to file
    inputFileName = fileName.replace(".txt", ".input")
    with open(inputFileName, "wb") as inputFile:
        for arg in args:
            inputFile.write(arg)
            inputFile.write(" ")

    return args, inputFileName
コード例 #3
0
def loadLabelFromFile(fileName):
    """Load a cluster name and its parameters from a label or metadata file.

    Args:
        fileName: Path of the file to parse. A path containing ".label" is
            treated as a label file; any other path is treated as a
            metadata file.

    Returns:
        A (cluster, params) tuple:
          * missing file: ("", {}) after printing a warning;
          * label file: (first line of the file, {});
          * metadata file containing "Ident": ("Ident", {});
          * any other metadata file: cluster is the "_"-joined values of all
            tokens containing "Transform" and params maps the remaining
            "key=value" tokens to their values, skipping tokens that
            contain "Functions" or "out".
    """
    cluster, params = "", {}
    if not os.path.exists(fileName):
        prettyPrint("Could not find \"%s\". Skipping" % fileName, "warning")
        return cluster, params

    # Read through a context manager so the handle is always released
    with open(fileName) as labelFile:
        rawData = labelFile.read()

    if fileName.find(".label") != -1:
        # It's a label file: the cluster is its first line, taken verbatim
        return rawData.split('\n')[0], {}

    # It's a metadata file
    if rawData.find("Ident") != -1:
        return "Ident", {}

    # Drop the first and last characters (presumably the enclosing brackets)
    # and parse the comma-separated tokens
    for token in rawData[1:-1].split(","):
        if token.find("Transform") != -1:
            cluster = cluster + token.split("=")[1].replace("'", "") + "_"
        elif (token.find("=") != -1 and token.find("Functions") == -1
              and token.find("out") == -1):
            key = token.split('=')[0].replace('-', '').replace("'", "")
            value = token.split('=')[1].replace("'", "")
            params[key] = value

    # Clip any trailing underscore. endswith() is safe when no "Transform"
    # token was found, whereas indexing cluster[-1] raised IndexError on "".
    if cluster.endswith("_"):
        cluster = cluster[:-1]

    return cluster, params
コード例 #4
0
def loadAlphaSequences(fileName, sequenceSize=0):
    """Load an alpha sequence from a file into a list of characters.

    Args:
        fileName: Path of the file holding the sequence.
        sequenceSize: Maximum number of characters to return. 0, or a value
            larger than the sequence, returns the whole sequence.

    Returns:
        The characters of the file, newlines excluded, truncated to
        sequenceSize when applicable. An empty list is returned, after a
        warning, if the file does not exist.
    """
    if not os.path.exists(fileName):
        prettyPrint("File \"%s\" was not found" % fileName, "warning")
        # Stop here: falling through to open() would raise on the missing file
        return []

    with open(fileName) as sequenceFile:
        rawSequence = sequenceFile.read()

    # Iterating a string never yields '', so only newlines need filtering
    alphaSequence = [alpha for alpha in rawSequence if alpha != '\n']

    if sequenceSize == 0 or sequenceSize > len(alphaSequence):
        return alphaSequence
    return alphaSequence[:sequenceSize]
コード例 #5
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def loadAlphaSequences(fileName, sequenceSize=0):
    """Load an alpha sequence from a file into a list of characters.

    Args:
        fileName: Path of the file holding the sequence.
        sequenceSize: Maximum number of characters to return. 0, or a value
            larger than the sequence, returns the whole sequence.

    Returns:
        The characters of the file, newlines excluded, truncated to
        sequenceSize when applicable. An empty list is returned, after a
        warning, if the file does not exist.
    """
    if not os.path.exists(fileName):
        prettyPrint("File \"%s\" was not found" % fileName, "warning")
        # Stop here: falling through to open() would raise on the missing file
        return []

    with open(fileName) as sequenceFile:
        rawSequence = sequenceFile.read()

    # Iterating a string never yields '', so only newlines need filtering
    alphaSequence = [alpha for alpha in rawSequence if alpha != '\n']

    if sequenceSize == 0 or sequenceSize > len(alphaSequence):
        return alphaSequence
    return alphaSequence[:sequenceSize]
コード例 #6
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def sequenceToAlpha(behavior):
    """Convert an instruction trace into an alphabet sequence.

    Every distinct action is assigned the next unused entry of the
    module-level availableLetters list; the assignment is remembered in the
    module-level sequenceAlphaMap, so an action maps to the same letter
    across calls.

    Args:
        behavior: Either a list of actions or a comma-separated string of
            actions.

    Returns:
        The concatenation of the letters assigned to the actions. If an error
        occurs during the conversion (for instance availableLetters running
        empty), it is reported and the sequence built so far is returned.
    """
    alphaSequence = ""
    global availableLetters
    global sequenceAlphaMap

    try:
        if isinstance(behavior, str):
            behavior = behavior.split(',')

        for action in behavior:
            # Test membership on the dict itself: .keys() builds a list on
            # Python 2, turning every lookup into a linear scan
            if action not in sequenceAlphaMap:
                sequenceAlphaMap[action] = availableLetters.pop(0)
            alphaSequence += sequenceAlphaMap[action]

    except Exception as e:
        prettyPrint("Error encountered while converting trace into alpha sequence: %s" % e, "error")
        prettyPrint("Length of current sequence is \"%s\"" % len(alphaSequence))

    return alphaSequence
コード例 #7
0
def sequenceToAlpha(behavior):
    """Convert an instruction trace into an alphabet sequence.

    Every distinct action is assigned the next unused entry of the
    module-level availableLetters list; the assignment is remembered in the
    module-level sequenceAlphaMap, so an action maps to the same letter
    across calls.

    Args:
        behavior: Either a list of actions or a comma-separated string of
            actions.

    Returns:
        The concatenation of the letters assigned to the actions. If an error
        occurs during the conversion (for instance availableLetters running
        empty), it is reported and the sequence built so far is returned.
    """
    alphaSequence = ""
    global availableLetters
    global sequenceAlphaMap

    try:
        if isinstance(behavior, str):
            behavior = behavior.split(',')

        for action in behavior:
            # Test membership on the dict itself: .keys() builds a list on
            # Python 2, turning every lookup into a linear scan
            if action not in sequenceAlphaMap:
                sequenceAlphaMap[action] = availableLetters.pop(0)
            alphaSequence += sequenceAlphaMap[action]

    except Exception as e:
        prettyPrint(
            "Error encountered while converting trace into alpha sequence: %s"
            % e, "error")
        prettyPrint("Length of current sequence is \"%s\"" %
                    len(alphaSequence))

    return alphaSequence
コード例 #8
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def loadArgumentsFromKLEE(fileName):
    """Parse a KLEE testcase, save its arguments to a file, and return them.

    A row that contains "arg" (but neither "n_args" nor "--sym-args") marks
    the next row containing "data: " as an argument value. Each value has its
    single quotes removed and its escape sequences decoded.

    Args:
        fileName: Path of the KLEE testcase text file.

    Returns:
        A (args, inputFileName) tuple, where args is the list of decoded
        arguments and inputFileName is fileName with ".txt" replaced by
        ".input"; the arguments are written to that file, each followed by a
        space. ([], "") is returned, and nothing is written, when the
        testcase file is empty.
    """
    with open(fileName, "rb") as testcase:
        rawContent = testcase.read()
    # Test the raw content: "".split("\n") yields [""], so checking the
    # length of the split rows could never detect an empty file
    if len(rawContent) < 1:
        prettyPrint("KLEE testcase file is empty", "warning")
        return [], ""

    args = []
    argFlag = False  # True while the value of an argument is still expected
    for row in rawContent.split("\n"):
        if row.find("arg") != -1 and row.find("n_args") == -1 and row.find("--sym-args") == -1:
            argFlag = True
        elif row.find("data: ") != -1 and argFlag:
            argIndex = row.find("data: ") + len("data: ")
            # NOTE: "string_escape" is a Python 2 codec
            args.append(row[argIndex:].replace("'", "").decode("string_escape"))
            argFlag = False

    # Now write arguments to file
    inputFileName = fileName.replace(".txt", ".input")
    with open(inputFileName, "wb") as inputFile:
        for arg in args:
            inputFile.write(arg)
            inputFile.write(" ")

    return args, inputFileName
コード例 #9
0
def loadFeaturesFromList(dataFiles,
                         dataType,
                         labelExtension="metadata",
                         classReference=None):
    """Load feature vectors and integer class labels from a list of files.

    Args:
        dataFiles: Paths of the files to load. The extension of each path is
            replaced with ".<dataType>" before the file is read.
        dataType: Kind of features to load. Types containing "tfidf", as well
            as "freq", "util" and "hmm", are read as lists of floats;
            "triton" is read as a list of numeric values with "yes" mapped to
            1.0 and any other word to 0.0; "seq" and "parseq" are read as raw
            text.
        labelExtension: "label" to read the class from a ".label" file, or
            "metadata" to derive it from a ".metadata" file and its
            parameters.
        classReference: Optional list of already known class names. Classes
            met for the first time are appended to it. A new list is created
            when omitted.

    Returns:
        A (dataPoints, dataLabels, classReference) tuple of lists, where
        dataLabels holds, per processed file, the index of its class in
        classReference. If dataFiles is empty, a pair of empty numpy arrays
        is returned instead.
    """
    # Create the list per call: a mutable default argument would be shared
    # by, and keep growing across, all calls that omit classReference
    if classReference is None:
        classReference = []

    if len(dataFiles) < 1:
        prettyPrint("No data files of type \"%s\" were found." % dataType,
                    "warning")
        # NOTE(review): two numpy arrays here versus three lists below; kept
        # as is so existing callers are not affected
        return numpy.array([]), numpy.array([])

    # Iterate over files adding their values to an array
    dataPoints, dataLabels = [], []
    for dataFile in dataFiles:
        # Reset for every file so that a label file found for an earlier
        # "test" file is never reused for a later, unrelated file
        labelFile = ""  # TODO: Again for KLEE test files
        currentExtension = dataFile[dataFile.rfind("."):]

        if labelExtension == "label":
            # TODO: Accomodate for the KLEE files
            if dataFile.find("test") != -1:
                labelFile = dataFile[:dataFile.rfind("_test")] + ".label"
                if not os.path.exists(labelFile):
                    prettyPrint(
                        "Could not find a label file for \"%s\". Skipping" %
                        dataFile, "warning")
                    continue
            elif not os.path.exists(dataFile.replace(dataType, "label")):
                prettyPrint(
                    "Could not find a label file for \"%s\". Skipping" %
                    dataFile, "warning")
                continue

        # Make sure we're loading from the right extension
        dataFile = dataFile.replace(currentExtension, ".%s" % dataType)

        # Store the feature values of the file in dataPoints
        if dataType.find("tfidf") != -1 or dataType in ("freq", "util", "hmm"):
            # Load features as numerical
            with open(dataFile) as featureFile:
                content = featureFile.read()
            dataPoints.append([float(x) for x in content[1:-1].split(',')])
        elif dataType == "triton":
            # Load features as numerical/nominal
            with open(dataFile) as featureFile:
                content = featureFile.read()
            content = content.replace("\n", "").replace(" ", "")[1:-1]
            features = []
            for feature in content.split(","):
                feature = feature.replace("'", "")
                if feature.isdigit():
                    features.append(int(feature))
                elif feature.find(".") != -1:
                    features.append(float(feature))
                elif feature.lower() == "yes":
                    # Numerizing "Yes" and "No"
                    features.append(1.0)
                else:
                    features.append(0.0)
            dataPoints.append(features)
        elif dataType == "seq" or dataType == "parseq":
            # Load features as sequence of strings
            with open(dataFile) as featureFile:
                dataPoints.append(featureFile.read())

        # Also add the class label
        if labelExtension == "label":
            if labelFile != "":
                currentClass, currentParams = loadLabelFromFile(labelFile)
            else:
                currentClass, currentParams = loadLabelFromFile(
                    dataFile.replace(".%s" % dataType, ".label"))
        elif labelExtension == "metadata":
            # The metadata file is named after the part preceding "_test"
            currentClass, currentParams = loadLabelFromFile(
                dataFile[:dataFile.rfind("_test")] + ".metadata")
            for attribute in currentParams:
                currentClass += "_%s_%s" % (attribute,
                                            currentParams[attribute])
        currentClass = currentClass.replace(" ", "")  # Get rid of any spaces

        # Translate that to integers, registering classes seen for the first time
        if currentClass not in classReference:
            classReference.append(currentClass)
        dataLabels.append(classReference.index(currentClass))

    # Now return the data points and labels as lists
    return dataPoints, dataLabels, classReference
コード例 #10
0
def filterTraces(sourceDir,
                 inExtension,
                 filterMode,
                 outExtension,
                 targetFunction="main"):
    """Filter the GDB generated traces according to the supplied filterMode.

    For every "<sourceDir>/*.<inExtension>" file, the instructions that belong
    to targetFunction are extracted, filtered, and written, one per line and
    with commas replaced by spaces, to a file whose path is the input path
    with inExtension replaced by outExtension.

    Args:
        sourceDir: Directory holding the trace files.
        inExtension: Extension of the input files. An extension containing
            "objdump" selects the static (disassembly listing) parser; any
            other extension selects the dynamic (trace) parser.
        filterMode: Case-insensitive. "both" replaces immediate values with
            "imm" and memory values with "mem"; "mem" replaces memory values
            only, and leaves lines containing an immediate value untouched;
            "raw" leaves lines untouched. Any other value triggers a warning
            for every line, which is then written untouched.
        outExtension: Extension of the output files.
        targetFunction: Name of the function whose instructions are kept.

    Returns:
        False if no input file was found, True otherwise.
    """
    immReg = r'\$0x\w+'
    memReg = r'0x\w+'

    # Retrieve list of files from input dir
    allfiles = glob.glob("%s/*.%s" % (sourceDir, inExtension))
    if len(allfiles) < 1:
        prettyPrint(
            "Unable to retrieve \"*.%s\" from \"%s\"" %
            (inExtension, sourceDir), "warning")
        return False

    prettyPrint(
        "Successfully retrieved %s \"*.%s\" from \"%s\"" %
        (len(allfiles), inExtension, sourceDir), "debug")

    # Loop invariants
    mode = filterMode.lower()
    isStatic = inExtension.find("objdump") != -1  # Also matches "objdumps"

    filecounter = 0
    # Loop on retrieved file and filter their content
    for inputfile in allfiles:
        prettyPrint(
            "Processing file: %s, #%s out of %s" %
            (inputfile, filecounter + 1, len(allfiles)), "debug")
        with open(inputfile) as tracefile:
            alllines = tracefile.read().split('\n')

        rawlines = []
        if isStatic:
            # Static: keep only the instructions listed under targetFunction
            inMain = False
            for line in alllines:
                if line.find("<%s>" % targetFunction) != -1:
                    inMain = True
                elif line.find(">:") != -1:
                    # Header of another function
                    inMain = False
                if inMain and len(line.split('\t')) > 2:
                    if line.find("call") != -1:  # Also matches "callq"
                        functionName = line[line.rfind('<') +
                                            1:line.rfind('>')]
                        rawlines.append("%s()" % functionName)
                    else:
                        rawlines.append(line.split('\t')[-1])
        else:
            # Dynamic: keep only the instructions executed in targetFunction.
            # Call instructions outside of it were parsed by the previous
            # implementation, but never recorded; they are simply skipped.
            for line in alllines:
                if line.find("=>") != -1 and line.find(targetFunction) != -1:
                    rawlines.append(line[line.find(':') + 1:])

        # Now filter them and write the result to the target file. The
        # context manager closes every output file, not just the last one.
        with open(inputfile.replace(inExtension, outExtension),
                  "w") as outputfile:
            for templine in rawlines:
                # Match and replace immediate and memory values
                if mode == "both":
                    # Get rid of the immediate first (the more specific)
                    templine = re.sub(immReg, "imm", templine)
                    templine = re.sub(memReg, "mem", templine)
                elif mode == "mem":
                    # Lines holding an immediate value are left alone;
                    # otherwise, just filter the memory location
                    if not re.search(immReg, templine):
                        templine = re.sub(memReg, "mem", templine)
                elif mode != "raw":
                    # "raw" leaves both memory and immediate values alone
                    prettyPrint(
                        "Unknown filter mode \"%s\". Exiting." % filterMode,
                        "warning")

                # Remove commas and write the instruction to file
                outputfile.write("%s\n" % templine.replace(',', ' '))

        filecounter += 1

    prettyPrint(
        "Successfully processed %s \"*.%s\"." % (filecounter, inExtension),
        "debug")
    return True
コード例 #11
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def loadFeaturesFromList(dataFiles, dataType, labelExtension="metadata", classReference=None):
    """Load feature vectors and integer class labels from a list of files.

    Args:
        dataFiles: Paths of the files to load. The extension of each path is
            replaced with ".<dataType>" before the file is read.
        dataType: Kind of features to load. Types containing "tfidf", as well
            as "freq", "util" and "hmm", are read as lists of floats;
            "triton" is read as a list of numeric values with "yes" mapped to
            1.0 and any other word to 0.0; "seq" and "parseq" are read as raw
            text.
        labelExtension: "label" to read the class from a ".label" file, or
            "metadata" to derive it from a ".metadata" file and its
            parameters.
        classReference: Optional list of already known class names. Classes
            met for the first time are appended to it. A new list is created
            when omitted.

    Returns:
        A (dataPoints, dataLabels, classReference) tuple of lists, where
        dataLabels holds, per processed file, the index of its class in
        classReference. If dataFiles is empty, a pair of empty numpy arrays
        is returned instead.
    """
    # Create the list per call: a mutable default argument would be shared
    # by, and keep growing across, all calls that omit classReference
    if classReference is None:
        classReference = []

    if len(dataFiles) < 1:
        prettyPrint("No data files of type \"%s\" were found." % dataType, "warning")
        # NOTE(review): two numpy arrays here versus three lists below; kept
        # as is so existing callers are not affected
        return numpy.array([]), numpy.array([])

    # Iterate over files adding their values to an array
    dataPoints, dataLabels = [], []
    for dataFile in dataFiles:
        # Reset for every file so that a label file found for an earlier
        # "test" file is never reused for a later, unrelated file
        labelFile = ""  # TODO: Again for KLEE test files
        currentExtension = dataFile[dataFile.rfind("."):]

        if labelExtension == "label":
            # TODO: Accomodate for the KLEE files
            if dataFile.find("test") != -1:
                labelFile = dataFile[:dataFile.rfind("_test")] + ".label"
                if not os.path.exists(labelFile):
                    prettyPrint("Could not find a label file for \"%s\". Skipping" % dataFile, "warning")
                    continue
            elif not os.path.exists(dataFile.replace(dataType, "label")):
                prettyPrint("Could not find a label file for \"%s\". Skipping" % dataFile, "warning")
                continue

        # Make sure we're loading from the right extension
        dataFile = dataFile.replace(currentExtension, ".%s" % dataType)

        if dataType.find("tfidf") != -1 or dataType in ("freq", "util", "hmm"):
            # Load features as numerical
            with open(dataFile) as featureFile:
                content = featureFile.read()
            dataPoints.append([float(x) for x in content[1:-1].split(',')])
        elif dataType == "triton":
            # Load features as numerical/nominal
            with open(dataFile) as featureFile:
                content = featureFile.read()
            content = content.replace("\n", "").replace(" ", "")[1:-1]
            features = []
            for feature in content.split(","):
                feature = feature.replace("'", "")
                if feature.isdigit():
                    features.append(int(feature))
                elif feature.find(".") != -1:
                    features.append(float(feature))
                elif feature.lower() == "yes":
                    # Numerizing "Yes" and "No"
                    features.append(1.0)
                else:
                    features.append(0.0)
            dataPoints.append(features)
        elif dataType == "seq" or dataType == "parseq":
            # Load features as sequence of strings
            with open(dataFile) as featureFile:
                dataPoints.append(featureFile.read())

        # Also add the class label
        if labelExtension == "label":
            if labelFile != "":
                currentClass, currentParams = loadLabelFromFile(labelFile)
            else:
                currentClass, currentParams = loadLabelFromFile(dataFile.replace(".%s" % dataType, ".label"))
        elif labelExtension == "metadata":
            # The metadata file is named after the part preceding "_test"
            currentClass, currentParams = loadLabelFromFile(dataFile[:dataFile.rfind("_test")] + ".metadata")
            for attribute in currentParams:
                currentClass += "_%s_%s" % (attribute, currentParams[attribute])
        currentClass = currentClass.replace(" ", "")  # Get rid of any spaces

        # Translate that to integers, registering classes seen for the first time
        if currentClass not in classReference:
            classReference.append(currentClass)
        dataLabels.append(classReference.index(currentClass))

    # Now return the data points and labels as lists
    return dataPoints, dataLabels, classReference
コード例 #12
0
ファイル: data.py プロジェクト: tum-i22/Oedipus
def filterTraces(sourceDir, inExtension, filterMode, outExtension, targetFunction="main"):
    """Filter the GDB generated traces according to the supplied filterMode.

    For every "<sourceDir>/*.<inExtension>" file, the instructions that belong
    to targetFunction are extracted, filtered, and written, one per line and
    with commas replaced by spaces, to a file whose path is the input path
    with inExtension replaced by outExtension.

    Args:
        sourceDir: Directory holding the trace files.
        inExtension: Extension of the input files. An extension containing
            "objdump" selects the static (disassembly listing) parser; any
            other extension selects the dynamic (trace) parser.
        filterMode: Case-insensitive. "both" replaces immediate values with
            "imm" and memory values with "mem"; "mem" replaces memory values
            only, and leaves lines containing an immediate value untouched;
            "raw" leaves lines untouched. Any other value triggers a warning
            for every line, which is then written untouched.
        outExtension: Extension of the output files.
        targetFunction: Name of the function whose instructions are kept.

    Returns:
        False if no input file was found, True otherwise.
    """
    immReg = r'\$0x\w+'
    memReg = r'0x\w+'

    # Retrieve list of files from input dir
    allfiles = glob.glob("%s/*.%s" % (sourceDir, inExtension))
    if len(allfiles) < 1:
        prettyPrint("Unable to retrieve \"*.%s\" from \"%s\"" % (inExtension, sourceDir), "warning")
        return False

    prettyPrint("Successfully retrieved %s \"*.%s\" from \"%s\"" % (len(allfiles), inExtension, sourceDir), "debug")

    # Loop invariants
    mode = filterMode.lower()
    isStatic = inExtension.find("objdump") != -1  # Also matches "objdumps"

    filecounter = 0
    # Loop on retrieved file and filter their content
    for inputfile in allfiles:
        prettyPrint("Processing file: %s, #%s out of %s" % (inputfile, filecounter+1, len(allfiles)), "debug")
        with open(inputfile) as tracefile:
            alllines = tracefile.read().split('\n')

        rawlines = []
        if isStatic:
            # Static: keep only the instructions listed under targetFunction
            inMain = False
            for line in alllines:
                if line.find("<%s>" % targetFunction) != -1:
                    inMain = True
                elif line.find(">:") != -1:
                    # Header of another function
                    inMain = False
                if inMain and len(line.split('\t')) > 2:
                    if line.find("call") != -1:  # Also matches "callq"
                        functionName = line[line.rfind('<')+1:line.rfind('>')]
                        rawlines.append("%s()" % functionName)
                    else:
                        rawlines.append(line.split('\t')[-1])
        else:
            # Dynamic: keep only the instructions executed in targetFunction.
            # Call instructions outside of it were parsed by the previous
            # implementation, but never recorded; they are simply skipped.
            for line in alllines:
                if line.find("=>") != -1 and line.find(targetFunction) != -1:
                    rawlines.append(line[line.find(':')+1:])

        # Now filter them and write the result to the target file. The
        # context manager closes every output file, not just the last one.
        with open(inputfile.replace(inExtension, outExtension), "w") as outputfile:
            for templine in rawlines:
                # Match and replace immediate and memory values
                if mode == "both":
                    # Get rid of the immediate first (the more specific)
                    templine = re.sub(immReg, "imm", templine)
                    templine = re.sub(memReg, "mem", templine)
                elif mode == "mem":
                    # Lines holding an immediate value are left alone;
                    # otherwise, just filter the memory location
                    if not re.search(immReg, templine):
                        templine = re.sub(memReg, "mem", templine)
                elif mode != "raw":
                    # "raw" leaves both memory and immediate values alone
                    prettyPrint("Unknown filter mode \"%s\". Exiting." % filterMode, "warning")

                # Remove commas and write the instruction to file
                outputfile.write("%s\n" % templine.replace(',', ' '))

        filecounter += 1

    prettyPrint("Successfully processed %s \"*.%s\"." % (filecounter, inExtension), "debug")
    return True