Example #1
import nltk

import basicutils  # project-local helper module used by these examples


def bracket_strings(start, end, b_brack, e_brack):
    sep = "tzvlw"
    t = basicutils.CompileTextFromRange(start, end, sep)
    tokens = [tk.lower() for tk in t.split(sep)]

    b = []
    for tk in tokens:
        tk = tk.strip()

        if tk.startswith(b_brack):
            end_idx = tk.find(e_brack)
            if end_idx == -1:
                continue  # opening bracket with no closing bracket - skip
            b_contents = tk[1:end_idx]
            # Hack to get rid of [-], [+], [*] - could also try to remove non-alpha
            if len(b_contents) > 3:
                # Hack for debug prints that start with [0x%x]
                if b_contents != "0x%x":
                    b.append(b_contents)

    print "bracket_strings tokens:"
    print tokens
    print b

    u_gram = ""
    u_gram_score = 0
    if len(b) > 0:
        # most_common(1) returns [(token, count)] - unpack the single pair
        f = nltk.FreqDist(b)
        u_gram, u_gram_score = f.most_common(1)[0]

    return (u_gram, u_gram_score)
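
A minimal usage sketch, assuming an environment where basicutils can resolve string references for the given address range; the addresses below are hypothetical and the brackets target "[module]"-style debug-print prefixes:

# Hypothetical usage - the address range is made up for illustration.
start_ea = 0x401000
end_ea = 0x402000
name, score = bracket_strings(start_ea, end_ea, "[", "]")
if score > 0:
    print("most common bracketed prefix: %s (%d hits)" % (name, score))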
Example #2
import re

import basicutils  # project-local helper module used by these examples


def string_range_tokenize(start, end, sep):
    # get all string references in this range concatenated into a single string
    t = basicutils.CompileTextFromRange(start, end, sep)

    #Enable this if you already have a bunch of function names and want to include that in the mix
    #t+= basicutils.CompileFuncNamesFromRangeAsText(start,end,sep)

    #print "string_range_tokenize: raw text:"
    #print t
    #remove printf/sprintf format strings
    tc = re.sub("%[0-9A-Za-z]+", " ", t)
    #convert dash to underscore
    tc = re.sub("-", "_", tc)
    #replace _ and / with space - may want to turn this off sometimes
    #this will break up snake case and paths
    #problem is that if you have a path that is used throughout the binary it will probably dominate results
    tc = re.sub("_", " ", tc)
    #replace / and \\ with a space
    tc = re.sub("[/\\\\]", " ", tc)
    #remove anything except alphanumeric, spaces, . (for .c, .cpp, etc) and _
    tc = re.sub("[^A-Za-z0-9_\.\s]", " ", tc)

    #lowercase it - and store this as the original set of tokens to work with
    tokens = [tk.lower() for tk in tc.split()]

    #remove English stop words
    #this is the list from the MIT *bow project
    eng_stopw = {
        "about", "all", "am", "an", "and", "are", "as", "at", "be", "been",
        "but", "by", "can", "cannot", "did", "do", "does", "doing", "done",
        "for", "from", "had", "has", "have", "having", "if", "in", "is", "it",
        "its", "of", "on", "that", "the", "these", "they", "this", "those",
        "to", "too", "want", "wants", "was", "what", "which", "will", "with",
        "would"
    }
    #remove "code" stop words
    #e.g. common words in debugging strings
    code_sw = {
        "error", "err", "errlog", "log", "return", "returned", "byte", "bytes",
        "status", "len", "length", "size", "ok", "0x", "warning", "fail",
        "failed", "failure", "invalid", "illegal", "param", "parameter",
        "done", "complete", "assert", "assertion", "cant", "didnt", "class",
        "foundation", "cdecl", "stdcall", "thiscall"
    }
    stopw = eng_stopw.union(code_sw)

    # keep only tokens that are not stop words
    tokens_f = [tk for tk in tokens if tk not in stopw]

    return tokens_f
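
A short sketch of the expected behavior, with a hypothetical debug string to illustrate the filtering; the addresses are made up:

# Hypothetical usage - "tzvlw" matches the separator convention used by
# the other examples here.
tokens = string_range_tokenize(0x401000, 0x402000, "tzvlw")
# A range containing "usb_hub: reset failed, status=0x%x" would yield
# ["usb", "hub", "reset"]: the %x format is stripped, punctuation becomes
# whitespace, underscores split, and "failed"/"status"/"0x" are stop words.
print(tokens)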
Example #3
import re

import nltk

import basicutils  # project-local helper module used by these examples


def source_file_strings(start, end):
    sep = "tzvlw"  # separator unlikely to appear in real string data
    t = basicutils.CompileTextFromRange(start, end, sep)
    # normally we would lowercase here to normalize, but we would lose camel case that way
    tokens = t.split(sep)

    # for each string, remove quotes and commas, then tokenize on spaces to generate the final list
    tokens2 = []
    for tk in tokens:
        tk = tk.strip()
        # strip punctuation; leave in _ for filenames and / and \ for paths
        tk = re.sub("[\"',]", " ", tk)
        tokens2.extend(tk.split(" "))

    b = []
    for tk in tokens2:
        tk = tk.strip()
        if tk.endswith(".c") or tk.endswith(".cpp") or tk.endswith(".cc"):
            #If there's a dir path, only use the end filename
            #This could be tweaked if the directory structure is part of the software architecture
            #e.g. if there are multiple source directories with meaningful names
            if tk.rfind("/") != -1:
                ntk = tk[tk.rfind("/") + 1:]
            elif tk.rfind("\\") != -1:
                ntk = tk[tk.rfind("\\") + 1:]
            else:
                ntk = tk
            b.append(ntk)

    print "source_file_strings tokens:"
    #print tokens
    print b

    # A better way to do this (if there are multiple source files in the range)
    # would be to sort, uniquify, and then make the name foo.c_and_bar.c.
    u_gram = ""
    u_gram_score = 0
    if len(b) > 0:
        # most_common(1) returns [(filename, count)] - unpack the single pair
        f = nltk.FreqDist(b)
        u_gram, u_gram_score = f.most_common(1)[0]

    return (u_gram, u_gram_score)
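
A minimal usage sketch, again with hypothetical addresses; the comment walks through how a path-qualified source-file string is reduced:

# Hypothetical usage - addresses are made up. If the range references
# "drivers/net/e1000.c" three times, the path prefix is stripped and the
# most frequent filename wins, returning ("e1000.c", 3).
fname, count = source_file_strings(0x401000, 0x402000)
if count > 0:
    print("likely source file: %s (%d refs)" % (fname, count))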