Esempio n. 1
0
def labeler(vtreport):
    """Return the top-ranked family for the first report found in a file.

    ``vtreport`` is read as a text file holding one JSON report per line.
    Only the first non-blank line is labelled; the function returns as soon
    as that line has been handled.

    The alias and generic-token lists are the ``default.aliases`` and
    ``default.generics`` resources shipped in the ``avclass`` package.

    :param vtreport: path of the file with the JSON report(s)
    :return: the upper-cased first token of the family ranking,
        ``"UNKNOWN"`` when the ranking is empty, ``"ERROR"`` when the
        ranking step raises, or ``None`` when the file has no non-blank
        line
    :raises ValueError: if the first non-blank line is not valid JSON
        (raised by ``json.loads``)
    """
    alias = pkg_resources.resource_filename('avclass', 'default.aliases')
    gen = pkg_resources.resource_filename('avclass', 'default.generics')

    # Create AvLabels object (third argument deliberately left as None)
    av_labels = AvLabels(gen, alias, None)

    with open(vtreport, 'r') as fd:
        for line in fd:
            # Skip blank lines, including whitespace-only and '\r\n' ones
            if not line.strip():
                continue

            # Read JSON line and extract sample info (i.e., hashes and labels)
            vt_rep = json.loads(line)
            # NOTE(review): the file path is passed as the second argument;
            # presumably it only acts as a truthy flag -- confirm against
            # AvLabels.get_sample_info.
            sample_info = av_labels.get_sample_info(vt_rep, vtreport)

            try:
                # Materialise the ranking: on Python 3 ``dict.items()`` is a
                # view and cannot be indexed.
                tokens = list(
                    av_labels.get_family_ranking(sample_info).items())

                # Top candidate is most likely family name
                if tokens:
                    return tokens[0][0].upper()
                return "UNKNOWN"
            except Exception:
                # Best-effort labelling: report the failure as a label, but
                # let KeyboardInterrupt / SystemExit propagate.
                return "ERROR"

    # No non-blank line in the file
    return None
Esempio n. 2
0
    def get_family(self, param1):
        """
        Build the sample information used to compute a family name.

        An ``AvLabels`` object is created from the instance's default
        generic, alias and category files, and the sample information
        extracted from ``param1`` is stored in ``self.sample_info``.

        NOTE(review): only the set-up part of this method is visible here.
        On success the visible code falls through without an explicit
        return; the family computation itself is not shown.

        :param param1: lb format {md5,sha1,sha256,scan_date,av_labels}
        :type param1: dict
        :return: on failure, a dict with empty ``unique_name``, ``pup`` and
            ``category`` values
        """

        # Placeholder result handed back when the set-up fails
        data = {'unique_name': '', 'pup': '', 'category': ''}

        try:
            av_labels = AvLabels(gen_file=self.__default_gen_file,
                                 alias_file=self.__default_alias_file,
                                 av_file=None,
                                 cat_file=self.__default_category_file)
            self.sample_info = av_labels.get_sample_info(param1, "")
        # ``except X as name`` is valid on Python 2.6+ and Python 3; the old
        # comma form is a syntax error on Python 3.
        except Exception as exception:
            log.critical(
                'Error in json loadstrings call: {0}'.format(exception))
            return data
Esempio n. 3
0
def extract_avclass_labels(vt_rep, vtapi, av_file=None, hash_type='sha256'):
    '''
    Given a VT report it returns the AVClass labels

    :param vt_rep: the VT report, handed unchanged to the AvLabels parser
    :param vtapi: VT API version of the report, either 'v2' or 'v3'
    :param av_file: optional third argument for the AvLabels constructor
    :param hash_type: name of the sample_info attribute used to name
        singletons (default 'sha256')
    :return: an AVCInfo. FAILURE with PARSER_ERROR_MSG when the report
        cannot be parsed, FAILURE with NO_LABELS_MSG when it carries no AV
        labels, otherwise SUCCESS with the family, the PUP flag and the
        full token ranking
    :raises ValueError: if vtapi is neither 'v2' nor 'v3'
    '''
    # Reject unknown API versions before doing any work; previously this
    # surfaced later as an UnboundLocalError on the parser variable.
    if vtapi not in ('v2', 'v3'):
        raise ValueError(
            "Unsupported VT API version %r (expected 'v2' or 'v3')"
            % (vtapi,))

    # Create AvLabels object
    av_labels = AvLabels(default_gen_file, default_alias_file, av_file)

    # Parse the report with the parser matching the API version
    if vtapi == 'v3':
        sample_info = av_labels.get_sample_info_vt_v3(vt_rep)
    else:
        sample_info = av_labels.get_sample_info_vt_v2(vt_rep)
    if sample_info is None:
        return AVCInfo(None, None, None, None, None, None, FAILURE,
                       PARSER_ERROR_MSG)

    name = getattr(sample_info, hash_type)

    # If the VT report has no AV labels, report the failure
    if not sample_info.labels:
        return AVCInfo(sample_info.md5, sample_info.sha1, sample_info.sha256,
                       None, None, None, FAILURE, NO_LABELS_MSG)

    # Get distinct tokens from AV labels
    tokens = list(av_labels.get_family_ranking(sample_info).items())

    # Top candidate is most likely family name
    if tokens:
        family = tokens[0][0]
    else:
        family = "SINGLETON:" + name

    # Check if sample is PUP (same labels field as the check above)
    is_pup = av_labels.is_pup(sample_info.labels)
    return AVCInfo(sample_info.md5, sample_info.sha1, sample_info.sha256,
                   family, is_pup, tokens, SUCCESS, SUCCESS_MSG)
Esempio n. 4
0
def main(args):
    """Label every sample of the input files and write the requested reports.

    Each non-blank line of each input file is parsed as a JSON report. For
    every report with AV labels one tab-separated line is printed to stdout:
    sample hash, family (or ``SINGLETON:<hash>``), then the ground-truth
    family and the PUP flag when those options are active. Progress,
    warnings and summary statistics go to stderr.

    Optional output files are named after the first input file (base name
    without extension): ``.verbose`` (token rankings), ``.gen`` (generic
    token data), ``.alias`` (token pair statistics) and ``.families``
    (per-family counts).

    The code is written to run on both Python 2 and Python 3.

    :param args: parsed options; the attributes read are ``hash``, ``gt``,
        ``gen``, ``alias``, ``av``, ``vt``, ``lb``, ``vtdir``, ``lbdir``,
        ``verbose``, ``aliasdetect``, ``gendetect``, ``pup``, ``fam`` and
        ``eval``
    """
    # Select hash used to identify sample, by default MD5
    hash_type = args.hash if args.hash else 'md5'

    # If ground truth provided, read it from file
    gt_dict = {}
    if args.gt:
        with open(args.gt, 'r') as gt_fd:
            for line in gt_fd:
                line = line.strip()
                # Blank lines carry no <hash, family> pair
                if not line:
                    continue
                gt_hash, family = map(str.lower, line.split('\t', 1))
                gt_dict[gt_hash] = family

        # Guess type of hash in ground truth file (skipped if it is empty)
        if gt_dict:
            hash_type = guess_hash(next(iter(gt_dict)))

    # Create AvLabels object
    av_labels = AvLabels(args.gen, args.alias, args.av)

    # Build list of (input file, is VT report) pairs. The flag is kept per
    # file so that mixing VT and lb inputs parses each file correctly.
    # NOTE: duplicate input files are not removed
    ifile_l = []
    if args.vt:
        ifile_l += [(f, True) for f in args.vt]
    if args.lb:
        ifile_l += [(f, False) for f in args.lb]
    if args.vtdir:
        ifile_l += [(os.path.join(args.vtdir, f), True)
                    for f in os.listdir(args.vtdir)]
    if args.lbdir:
        ifile_l += [(os.path.join(args.lbdir, f), False)
                    for f in os.listdir(args.lbdir)]

    if not ifile_l:
        sys.stderr.write('[-] No input files provided\n')
        sys.exit(1)

    # Select output prefix
    out_prefix = os.path.basename(os.path.splitext(ifile_l[0][0])[0])

    # If verbose, open log file
    if args.verbose:
        log_filename = out_prefix + '.verbose'
        verb_fd = open(log_filename, 'w+')

    # Initialize state
    first_token_dict = {}
    token_count_map = {}
    pair_count_map = {}
    token_family_map = {}
    fam_stats = {}
    vt_all = 0
    vt_empty = 0
    singletons = 0

    # Process each input file
    for ifile, ifile_is_vt in ifile_l:
        with open(ifile, 'r') as fd:
            # Debug info, file processed
            sys.stderr.write('[-] Processing input file %s\n' % ifile)

            # Process all lines in file
            for line in fd:

                # If blank line, skip
                if not line.strip():
                    continue

                # Debug info
                if vt_all % 100 == 0:
                    sys.stderr.write('\r[-] %d JSON read' % vt_all)
                    sys.stderr.flush()
                vt_all += 1

                # Read JSON line and extract sample info
                # (i.e., hashes and labels)
                vt_rep = json.loads(line)
                sample_info = av_labels.get_sample_info(vt_rep, ifile_is_vt)
                if sample_info is None:
                    try:
                        name = vt_rep['md5']
                        sys.stderr.write('\nNo AV labels for %s\n' % name)
                    except KeyError:
                        sys.stderr.write('\nCould not process: %s\n' % line)
                    sys.stderr.flush()
                    vt_empty += 1
                    continue

                # Sample's name is selected hash type (md5 by default)
                name = getattr(sample_info, hash_type)

                # If the VT report has no AV labels, continue
                if not sample_info[3]:
                    vt_empty += 1
                    sys.stderr.write('\nNo AV labels for %s\n' % name)
                    sys.stderr.flush()
                    continue

                # Ground truth family of this sample, None if unknown
                gt_fam = gt_dict.get(name)

                try:
                    # Get distinct tokens from AV labels. Materialised as a
                    # list because Python 3 dict views cannot be indexed.
                    tokens = list(
                        av_labels.get_family_ranking(sample_info).items())

                    # If alias detection, populate maps: count every token
                    # and every unordered pair of tokens seen together
                    if args.aliasdetect:
                        prev_tokens = set()
                        for entry in tokens:
                            curr_tok = entry[0]
                            token_count_map[curr_tok] = \
                                token_count_map.get(curr_tok, 0) + 1
                            for prev_tok in prev_tokens:
                                if prev_tok < curr_tok:
                                    pair = (prev_tok, curr_tok)
                                else:
                                    pair = (curr_tok, prev_tok)
                                pair_count_map[pair] = \
                                    pair_count_map.get(pair, 0) + 1
                            prev_tokens.add(curr_tok)

                    # If generic token detection, populate map
                    if args.gendetect and args.gt and gt_fam:
                        for entry in tokens:
                            # add() keeps the family as ONE element;
                            # set(family) would split it into characters
                            token_family_map.setdefault(
                                entry[0], set()).add(gt_fam)

                    # Top candidate is most likely family name
                    if tokens:
                        family = tokens[0][0]
                        is_singleton = False
                    else:
                        family = "SINGLETON:" + name
                        is_singleton = True
                        singletons += 1

                    # Check if sample is PUP, if requested
                    if args.pup:
                        is_pup = av_labels.is_pup(sample_info[3])
                        is_pup_str = "\t1" if is_pup else "\t0"
                    else:
                        is_pup = None
                        is_pup_str = ""

                    # Build family map for precision, recall, computation
                    first_token_dict[name] = family

                    # Get ground truth family, if available
                    gt_family = '\t' + gt_fam if gt_fam is not None else ""

                    # Print family (and ground truth if available) to stdout
                    print('%s\t%s%s%s' % (name, family, gt_family,
                                          is_pup_str))

                    # If verbose, print tokens (and ground truth if
                    # available) to log file
                    if args.verbose:
                        verb_fd.write('%s\t%s%s%s\n' % (
                            name, tokens, gt_family, is_pup_str))

                    # Store family stats (if required)
                    if args.fam:
                        ff = 'SINGLETONS' if is_singleton else family
                        num_all, num_mal, num_pup = fam_stats.get(
                            ff, (0, 0, 0))
                        num_all += 1
                        if args.pup:
                            if is_pup:
                                num_pup += 1
                            else:
                                num_mal += 1
                        fam_stats[ff] = (num_all, num_mal, num_pup)

                except Exception:
                    # Best effort: report the failing sample and keep going
                    traceback.print_exc(file=sys.stderr)
                    continue

            # Debug info
            sys.stderr.write('\r[-] %d JSON read' % vt_all)
            sys.stderr.flush()
            sys.stderr.write('\n')

    # Print statistics
    sys.stderr.write(
        "[-] Samples: %d NoLabels: %d Singletons: %d "
        "GroundTruth: %d\n" % (
            vt_all, vt_empty, singletons, len(gt_dict)))

    # If ground truth, print precision, recall, and F1-measure
    if args.gt and args.eval:
        precision, recall, fmeasure = \
            ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
        sys.stderr.write(
            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" %
            (precision, recall, fmeasure))

    # If generic token detection, print map
    if args.gendetect:
        gen_filename = out_prefix + '.gen'
        with open(gen_filename, 'w+') as gen_fd:
            # Output header line
            gen_fd.write("Token\t#Families\n")
            # Tokens seen with the most ground-truth families come first
            sorted_pairs = sorted(token_family_map.items(),
                                  key=lambda x: len(x[1]),
                                  reverse=True)
            for (t, fset) in sorted_pairs:
                gen_fd.write("%s\t%d\n" % (t, len(fset)))
        sys.stderr.write('[-] Generic token data in %s\n' % (gen_filename))

    # If alias detection, print map
    if args.aliasdetect:
        alias_filename = out_prefix + '.alias'
        with open(alias_filename, 'w+') as alias_fd:
            # Sort token pairs by number of times they appear together
            sorted_pairs = sorted(
                pair_count_map.items(), key=itemgetter(1))
            # Output header line
            alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n")
            # Compute token pair statistic and output to alias file
            for (t1, t2), c in sorted_pairs:
                n1 = token_count_map[t1]
                n2 = token_count_map[t2]
                # x is the token with the smaller count (ties go to t2)
                if n1 < n2:
                    x, y, xn, yn = t1, t2, n1, n2
                else:
                    x, y, xn, yn = t2, t1, n2, n1
                f = float(c) / float(xn)
                alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" % (
                    x, y, xn, yn, c, f))
        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))

    # If family statistics, output to file
    if args.fam:
        fam_filename = out_prefix + '.families'
        with open(fam_filename, 'w+') as fam_fd:
            # Output header line
            if args.pup:
                fam_fd.write("# Family\tTotal\tMalware\tPUP\tFamType\n")
            else:
                fam_fd.write("# Family\tTotal\n")
            # Sort map by the (total, malware, pup) tuples, largest first
            sorted_pairs = sorted(fam_stats.items(), key=itemgetter(1),
                                  reverse=True)
            # Print map contents
            for (f, fstat) in sorted_pairs:
                if args.pup:
                    if fstat[1] > fstat[2]:
                        fam_type = "malware"
                    else:
                        fam_type = "pup"
                    fam_fd.write("%s\t%d\t%d\t%d\t%s\n" % (
                        f, fstat[0], fstat[1], fstat[2], fam_type))
                else:
                    fam_fd.write("%s\t%d\n" % (f, fstat[0]))
        sys.stderr.write('[-] Family data in %s\n' % (fam_filename))

    # Close log file
    if args.verbose:
        sys.stderr.write('[-] Verbose output in %s\n' % (log_filename))
        verb_fd.close()
Esempio n. 5
0
def main(args):
    """Label every sample of one input file and write the requested reports.

    Each non-blank line of the input file (``args.vt`` if set, otherwise
    ``args.lb``) is parsed as a JSON report. For every report with AV
    labels one tab-separated line is printed to stdout: sample hash, family
    (or ``SINGLETON:<hash>``), then the ground-truth family and the PUP
    flag when those options are active. Progress, warnings and summary
    statistics go to stderr.

    Optional output files are named after the input file (base name without
    extension): ``.verbose`` (token rankings), ``.gen`` (generic token
    data) and ``.alias`` (token pair statistics).

    The code is written to run on both Python 2 and Python 3.

    :param args: parsed options; the attributes read are ``hash``, ``gt``,
        ``gen``, ``alias``, ``av``, ``vt``, ``lb``, ``verbose``,
        ``aliasdetect``, ``gendetect``, ``pup`` and ``eval``
    """
    # Select hash used to identify sample, by default MD5
    hash_type = args.hash if args.hash else 'md5'

    # If ground truth provided, read it from file
    gt_dict = {}
    if args.gt:
        with open(args.gt, 'r') as gt_fd:
            for line in gt_fd:
                line = line.strip()
                # Blank lines carry no <hash, family> pair
                if not line:
                    continue
                gt_hash, family = map(str.lower, line.split('\t', 1))
                gt_dict[gt_hash] = family

        # Guess type of hash in ground truth file (skipped if it is empty)
        if gt_dict:
            hash_type = guess_hash(next(iter(gt_dict)))

    # Create AvLabels object
    av_labels = AvLabels(args.gen, args.alias, args.av)

    # Select input file with AV labels
    ifile = args.vt if args.vt else args.lb

    # All output files share the input file's base name
    out_prefix = os.path.basename(os.path.splitext(ifile)[0])

    # If verbose, open log file
    if args.verbose:
        log_filename = out_prefix + '.verbose'
        verb_fd = open(log_filename, 'w+')

    # Initialize state
    first_token_dict = {}
    token_count_map = {}
    pair_count_map = {}
    token_family_map = {}
    vt_all = 0
    vt_empty = 0
    singletons = 0

    # Process each JSON
    with open(ifile, 'r') as fd:
        for line in fd:

            # If blank line, skip
            if not line.strip():
                continue

            # Debug info
            if vt_all % 100 == 0:
                sys.stderr.write('\r[-] %d JSON read' % vt_all)
                sys.stderr.flush()
            vt_all += 1

            # Read JSON line and extract sample info (i.e., hashes and labels)
            vt_rep = json.loads(line)
            sample_info = av_labels.get_sample_info(vt_rep, args.vt)

            # Defensive guard: without sample info there is nothing to label
            if sample_info is None:
                vt_empty += 1
                sys.stderr.write('\nCould not process: %s\n' % line)
                sys.stderr.flush()
                continue

            name = getattr(sample_info, hash_type)

            # If the VT report has no AV labels, continue
            if not sample_info[3]:
                vt_empty += 1
                sys.stderr.write('\nNo AV labels for %s\n' % name)
                sys.stderr.flush()
                continue

            # Ground truth family of this sample, None if unknown
            gt_fam = gt_dict.get(name)

            try:
                # Get distinct tokens from AV labels. Materialised as a list
                # because Python 3 dict views cannot be indexed.
                tokens = list(
                    av_labels.get_family_ranking(sample_info).items())

                # If alias detection, populate maps: count every token and
                # every pair of tokens adjacent in the ranking
                if args.aliasdetect:
                    prev_tok = ""
                    for entry in tokens:
                        curr_tok = entry[0]
                        token_count_map[curr_tok] = \
                            token_count_map.get(curr_tok, 0) + 1
                        if prev_tok != "":
                            if prev_tok < curr_tok:
                                pair = (prev_tok, curr_tok)
                            else:
                                pair = (curr_tok, prev_tok)
                            pair_count_map[pair] = \
                                pair_count_map.get(pair, 0) + 1
                        prev_tok = curr_tok

                # If generic token detection, populate map
                if args.gendetect and args.gt and gt_fam:
                    for entry in tokens:
                        # add() keeps the family as ONE element;
                        # set(family) would split it into characters
                        token_family_map.setdefault(
                            entry[0], set()).add(gt_fam)

                # Top candidate is most likely family name
                if tokens:
                    family = tokens[0][0]
                else:
                    family = "SINGLETON:" + name
                    singletons += 1

                # Check if sample is PUP, if requested
                if args.pup:
                    if av_labels.is_pup(sample_info[3]):
                        is_pup_str = "\t1"
                    else:
                        is_pup_str = "\t0"
                else:
                    is_pup_str = ""

                # Build family map for precision, recall, computation
                first_token_dict[name] = family

                # Get ground truth family, if available
                gt_family = '\t' + gt_fam if gt_fam is not None else ""

                # Print family (and ground truth if available) to stdout
                print('%s\t%s%s%s' % (name, family, gt_family, is_pup_str))

                # If verbose, print tokens (and ground truth if available)
                # to log file
                if args.verbose:
                    verb_fd.write('%s\t%s%s%s\n' %
                                  (name, tokens, gt_family, is_pup_str))

            except Exception:
                # Best effort: report the failing sample and keep going
                traceback.print_exc(file=sys.stderr)
                continue

        # Debug info
        sys.stderr.write('\r[-] %d JSON read' % vt_all)
        sys.stderr.flush()
        sys.stderr.write('\n')

    # Print statistics
    sys.stderr.write("[-] Samples: %d NoLabels: %d Singletons: %d "
                     "GroundTruth: %d\n" %
                     (vt_all, vt_empty, singletons, len(gt_dict)))

    # If ground truth, print precision, recall, and F1-measure
    if args.gt and args.eval:
        precision, recall, fmeasure = \
            ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
        sys.stderr.write(
            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" %
            (precision, recall, fmeasure))

    # If generic token detection, print map
    if args.gendetect:
        gen_filename = out_prefix + '.gen'
        with open(gen_filename, 'w+') as gen_fd:
            # Output header line
            gen_fd.write("Token\t#Families\n")
            # Tokens seen with the most ground-truth families come first
            sorted_pairs = sorted(token_family_map.items(),
                                  key=lambda x: len(x[1]),
                                  reverse=True)
            for (t, fset) in sorted_pairs:
                gen_fd.write("%s\t%d\n" % (t, len(fset)))

    # If alias detection, print map
    if args.aliasdetect:
        alias_filename = out_prefix + '.alias'
        with open(alias_filename, 'w+') as alias_fd:
            # Sort token pairs by number of times they appear together
            sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1))
            # Output header line
            alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n")
            # Compute token pair statistic and output to alias file
            for (t1, t2), c in sorted_pairs:
                n1 = token_count_map[t1]
                n2 = token_count_map[t2]
                # x is the token with the smaller count (ties go to t2)
                if n1 < n2:
                    x, y, xn, yn = t1, t2, n1, n2
                else:
                    x, y, xn, yn = t2, t1, n2, n1
                f = float(c) / float(xn)
                alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" %
                               (x, y, xn, yn, c, f))

    # Close log file
    if args.verbose:
        sys.stderr.write('[-] Verbose output in %s\n' % (log_filename))
        verb_fd.close()