def send_debug_printer(pid, syscall_object):
    """Log the buffer a traced send() socketcall attempted to transmit.

    Reads the socketcall parameter block pointed to by ECX, extracts the
    buffer address and size, copies that range out of the tracee, and logs
    it string-escaped (Python 2 codec).
    """
    param_ptr = cint.peek_register(pid, cint.ECX)
    args = extract_socketcall_parameters(pid, param_ptr, 4)
    buf_addr = args[1]
    buf_size = args[2]
    payload = cint.copy_address_range(pid, buf_addr, buf_addr + buf_size)
    logging.debug('This call tried to send: %s',
                  payload.encode('string-escape'))
Beispiel #2
0
 def generateDict(self) :
     """Build self.dictionary from the pipe-delimited corpus file.

     Each line of self.corpusFile is lower-cased and split on '|' to form
     one document.  Tokens appearing in fewer than 5 documents are removed
     and the token ids are compacted to fill the gaps.
     """
     self.dictionary = corpora.Dictionary(line.lower().split('|') for line in open(self.corpusFile))
     # Python 2 API (dfs.iteritems); docfreq < 5 marks a token as rare.
     rare_tokens = [tokenid for tokenid, docfreq in self.dictionary.dfs.iteritems() if docfreq < 5]
     logging.debug('=====The number of tokens to be removed is %d =====' % len(rare_tokens))
     self.dictionary.filter_tokens(rare_tokens)
     logging.debug('=====Total %d tokens=====' % len(self.dictionary.dfs) )
     self.dictionary.compactify()
def shutdown_debug_printer(pid, syscall_object):
    """Log the fd and 'how' command of a traced shutdown() socketcall."""
    param_ptr = cint.peek_register(pid, cint.ECX)
    args = extract_socketcall_parameters(pid, param_ptr, 2)
    sock_fd = args[0]
    how = args[1]
    logging.debug('This call tried to shutdown: %d', sock_fd)
    logging.debug('Command: %d: %s', how, SHUTDOWN_INT_TO_CMD[how])
Beispiel #4
0
def dataCleaning():
    """Run the data-cleaning pipeline over the training sample.

    Steps:
      1. Rank ads by click count; dump to topAdClickCnt.dict.
      2. Read the ad ids back from that dump.
      3. Build ad -> users restricted to those ads; dump it.
      4. Invert to user -> ads (adViewThreshold=10); dump it.
      5. Dump raw features for every user that survived the inversion.

    Fix: the user-set size was previously logged *before* the loop that
    populates userSet, so it always reported 0; the log call now runs
    after the set is filled.
    """
    logging.info('===Data Cleaning Processing===')
    input_file = DATA_TRAINING_SAMPLE
    adClickCntList = generateTopAdsUsersByClick(input_file)
    dumpList2File(adClickCntList, TMP_DATA_DIR_PATH + 'topAdClickCnt.dict')

    adSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'topAdClickCnt.dict'):
        cnt, adid = line.strip().split()
        adSet.add(adid)
    logging.debug(len(adSet))
    ad2Users = generateAd2UsersGivenAdSet(input_file, adSet)
    dumpDict2File(ad2Users, TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict')
    userDict = generateUser2AdGivenAd2User(TMP_DATA_DIR_PATH +
                                           'ad2UsersGivenAdSet.dict',
                                           adViewThreshold=10)
    dumpDict2File(userDict, TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict')
    userSet = set()
    # Records are userID\x01ads; we only need the user id here.
    for line in file(TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict'):
        user, ads = line.strip().split('\x01')
        userSet.add(user)
    logging.debug(len(userSet))

    dumpUserRawFeatureGivenUserSet(input_file, userSet,
                                   TMP_DATA_DIR_PATH + 'userRawFeature.dict')
def listen_entry_handler(syscall_id, syscall_object, pid):
    """Replay a listen() socketcall entry.

    Validates the sockfd argument (argument 0) against the trace, then
    delegates to subcall_return_success_handler to replay the call.
    """
    logging.debug('Entering listen entry handler')
    p = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, p, 1)
    # The previously-computed fd_from_trace local was never used;
    # validate_integer_argument performs the trace-vs-execution comparison.
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    logging.debug('Replaying this system call')
    subcall_return_success_handler(syscall_id, syscall_object, pid)
Beispiel #6
0
def query_Filter(topAdSet, fn_rawData):
    """Return the set of QueryIDs on training rows whose AdID is in topAdSet."""
    logging.debug('generate Query Filter')
    matched_query_ids = set()
    for row in file(fn_rawData):
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
        Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(row)
        if AdID in topAdSet:
            matched_query_ids.add(QueryID)
    return matched_query_ids
Beispiel #7
0
def dumpCTRDistributionPlot(fn_ad2userCTR, output_dir = '/Users/zhanglixin/research/kdd_cup/advertisingLab/data/tmp_data/plot_out/') :
    """Render one CTR line plot per ad and save it as <output_dir><adid>.png.

    Each input line is adid and a tab-separated CTR list, joined by '\\01'.
    """
    for record in file(fn_ad2userCTR):
        tool = MiniPlotTool(baseConfig)
        adid, ctr_str = record.strip().split('\01')
        logging.debug(adid)
        ctr_values = [float(v) for v in ctr_str.split('\t')]
        tool.addline({'X': range(len(ctr_values)), 'Y': ctr_values})
        tool.plot()
        tool.save(output_dir + adid + '.png')
def query_Filter(topAdSet, fn_rawData) :
    """Return the set of QueryIDs on training rows whose AdID is in topAdSet."""
    logging.debug('generate Query Filter')
    queryIdSet = set()
    for line in file(fn_rawData) :
        # parseTrainData splits a tab-separated training row into 12 fields.
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
        Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(line)
        if AdID not in topAdSet : continue
        queryIdSet.add(QueryID)
    return queryIdSet
Beispiel #9
0
 def run(lda, num_topics=200, raw_corpus='corpus', fn_bow='corpus.svmlight', fn_out_topic='LDA_corpus.svmlight') :
     """Build lda's dictionary and serialize the bag-of-words corpus.

     NOTE(review): the bare `return` below short-circuits the function, so
     the LDA topic-modelling steps after it are currently dead code --
     presumably disabled on purpose; confirm before re-enabling.
     """
     lda.generateDict()
     logging.debug('=====start generateDict=====')
     corpora.SvmLightCorpus.serialize(fn_bow, lda.__iter__(raw_corpus))
     return 
     # --- unreachable from here down ---
     bow_corpus = corpora.SvmLightCorpus(fn_bow) 
     logging.debug('=====Topic Processing=====')
     lda_model = models.ldamodel.LdaModel(bow_corpus, id2word=lda.dictionary, num_topics=num_topics)
     corpus_lda = lda_model[bow_corpus]
     corpora.SvmLightCorpus.serialize(fn_out_topic, corpus_lda) 
Beispiel #10
0
 def generateDict(self):
     """Build self.dictionary from the pipe-delimited corpus file.

     Lines are lower-cased and split on '|'; tokens in fewer than 5
     documents are dropped, then ids are compacted.
     """
     documents = (doc.lower().split('|') for doc in open(self.corpusFile))
     self.dictionary = corpora.Dictionary(documents)
     # Python 2 API: dfs.iteritems yields (token_id, document_frequency).
     infrequent = [
         tid for tid, df in self.dictionary.dfs.iteritems() if df < 5
     ]
     logging.debug('=====The number of tokens to be removed is %d =====' %
                   len(infrequent))
     self.dictionary.filter_tokens(infrequent)
     logging.debug('=====Total %d tokens=====' % len(self.dictionary.dfs))
     self.dictionary.compactify()
def accept_exit_handler(syscall_id, syscall_object, pid):
    """Validate and rewrite the fd returned by a replayed accept().

    Compares the fd the execution produced (EAX) against the offset form
    of the fd recorded in the trace, records the os<->trace fd mapping for
    successful accepts, and pokes the trace fd into EAX so the tracee
    observes the traced value.

    Raises:
        ReplayDeltaError: execution fd does not match the offset trace fd.
    """
    logging.debug('Entering accept exit handler')
    fd_from_execution = cint.peek_register(pid, cint.EAX)
    fd_from_trace = int(syscall_object.ret[0])
    # The replayer hands out offset descriptors, so compare in offset space.
    if offset_file_descriptor(fd_from_trace) != fd_from_execution:
        raise ReplayDeltaError('File descriptor from execution ({}) '
                               'differs from file descriptor from '
                               'trace ({})'.format(fd_from_execution,
                                                   fd_from_trace))
    if fd_from_execution >= 0:
        add_os_fd_mapping(fd_from_execution, fd_from_trace)
    cint.poke_register(pid, cint.EAX, fd_from_trace)
Beispiel #12
0
def dumpCTRDistributionPlot(
    fn_ad2userCTR,
    output_dir='/Users/zhanglixin/research/kdd_cup/advertisingLab/data/tmp_data/plot_out/'
):
    """Plot each ad's CTR series and save it as <output_dir><adid>.png.

    Input lines are '\01'-separated: adid, then tab-separated CTR floats.
    NOTE(review): output_dir defaults to a user-specific absolute path;
    pass an explicit directory on other machines.
    """
    for line in file(fn_ad2userCTR):
        plotTool = MiniPlotTool(baseConfig)
        adid, ctrs = line.strip().split('\01')
        logging.debug(adid)
        ctrs = [float(ctr) for ctr in ctrs.split('\t')]
        plotTool.addline({'X': range(len(ctrs)), 'Y': ctrs})
        plotTool.plot()
        plotTool.save(output_dir + adid + '.png')
def socketcall_debug_printer(pid, orig_eax, syscall_object):
    """Dispatch to the debug printer registered for this socketcall subcall.

    Raises:
        KeyError: no printer is registered for the subcall id.
    """
    printers = {
        1: socket_debug_printer,
        9: send_debug_printer,
        13: shutdown_debug_printer,
    }
    subcall_id = cint.peek_register(pid, cint.EBX)
    logging.debug('Got subcall {} {}'.format(subcall_id,
                                             SOCKET_SUBCALLS[subcall_id]))
    try:
        printers[subcall_id](pid, syscall_object)
    except KeyError as err:
        logging.warning(
            'This subcall ({}) has no debug printer'.format(subcall_id))
        raise err
Beispiel #14
0
 def run(lda,
         num_topics=200,
         raw_corpus='corpus',
         fn_bow='corpus.svmlight',
         fn_out_topic='LDA_corpus.svmlight'):
     """Build lda's dictionary and serialize the bag-of-words corpus.

     NOTE(review): the bare `return` below makes the LDA topic-modelling
     steps after it unreachable -- presumably disabled on purpose; confirm
     before re-enabling.
     """
     lda.generateDict()
     logging.debug('=====start generateDict=====')
     corpora.SvmLightCorpus.serialize(fn_bow, lda.__iter__(raw_corpus))
     return
     # --- unreachable from here down ---
     bow_corpus = corpora.SvmLightCorpus(fn_bow)
     logging.debug('=====Topic Processing=====')
     lda_model = models.ldamodel.LdaModel(bow_corpus,
                                          id2word=lda.dictionary,
                                          num_topics=num_topics)
     corpus_lda = lda_model[bow_corpus]
     corpora.SvmLightCorpus.serialize(fn_out_topic, corpus_lda)
def shutdown_subcall_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    return value: 0 (success) or -1 (error)
    errno

    The call is always no-op'd and answered from the trace.
    """
    logging.debug('Entering shutdown entry handler')
    # Pull out the info we can check
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 2)
    # NOTE(review): fd_from_trace is never read after this; the fd
    # comparison happens inside validate_integer_argument.
    fd_from_trace = syscall_object.args[0].value
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # TODO: We need to check the 'how' parameter here
    # Check to make sure everything is the same
    # Decide if we want to replay this system call
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
def connect_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    2: The length of the sockaddr structure pointed to by 1
    Sets:
    return value: file descriptor of the new socket -1 (error)
    errno

    Not Implemented:
    * Determine what is not implemented
    """

    logging.debug('Entering connect entry handler')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    validate_integer_argument(pid, syscall_object, 2, 2, params=params)
    # NOTE(review): trace_fd is computed but never used below.
    trace_fd = int(syscall_object.args[0].value)
    # Suppress the real connect and hand back the traced result.
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
Beispiel #17
0
def generateTransferAdQueryTokenPair(topAdSet, fn_rawData, fn_rawQuery, fn_out,
                                     query_filter):
    """Write per-(ad, token) transfer weights to fn_out.

    The weight for a pair is count(ad, token) / count(token), where counts
    come from the query tokens of training rows whose AdID is in topAdSet.
    """
    logging.debug('Loading Query Map')
    #query_filter = set(line.strip() for line in file(TMP_DATA_DIR_PATH+'queryID.set'))
    query_map = dict([(line.strip().split('\t')) for line in file(fn_rawQuery)
                      if line.strip().split('\t')[0] in query_filter])
    ad_token_counts = {}
    token_counts = {}
    logging.debug('Generating Ad_QueryToken_map')
    for row in file(fn_rawData):
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
        Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(row)
        if AdID not in topAdSet:
            continue
        per_ad = ad_token_counts.setdefault(AdID, {})
        for token in query_map[QueryID].split('|'):
            token_counts[token] = token_counts.get(token, 0) + 1
            per_ad[token] = per_ad.get(token, 0) + 1
    logging.debug('Dumping Transfer info to file')
    writer = file(fn_out, 'w')
    for ad in ad_token_counts:
        for token in ad_token_counts[ad]:
            writer.write('%s\t%s\t%f\n' %
                         (ad, token, ad_token_counts[ad][token] * 1.0 /
                          token_counts[token]))
    writer.close()
def generateTransferAdQueryTokenPair(topAdSet, fn_rawData, fn_rawQuery, fn_out, query_filter) :
    """Write per-(ad, token) transfer weights to fn_out.

    Weight = count(ad, token) / count(token), counted over the query tokens
    of training rows whose AdID is in topAdSet.
    """
    logging.debug('Loading Query Map')
    #query_filter = set(line.strip() for line in file(TMP_DATA_DIR_PATH+'queryID.set'))
    query_map = dict([(line.strip().split('\t')) for line in file(fn_rawQuery) if line.strip().split('\t')[0] in query_filter])
    Ad_QueryToken_map = {}
    token_map = {}
    logging.debug('Generating Ad_QueryToken_map')
    for line in file(fn_rawData) :
        Click, Impression, Display_url, AdID, AdvertiserID, Depth, \
        Position, QueryID, KeywordID, TitleID, DescriptionID, UserID = dataParser.parseTrainData(line)
        if AdID not in topAdSet : continue
        if AdID not in Ad_QueryToken_map :
            Ad_QueryToken_map[AdID] = {}
        # Queries are stored as '|'-delimited token strings.
        tokens = query_map[QueryID].split('|')
        for token in tokens :
            if token not in token_map :
                token_map[token] = 0
            token_map[token] += 1
            if token not in Ad_QueryToken_map[AdID] :
                Ad_QueryToken_map[AdID][token] = 0
            Ad_QueryToken_map[AdID][token] += 1
    logging.debug('Dumping Transfer info to file')
    writer = file(fn_out, 'w')
    for Ad in Ad_QueryToken_map :
        for token in Ad_QueryToken_map[Ad]:
            #print token, Ad_QueryToken_map[Ad][token],token_map[token]
            writer.write('%s\t%s\t%f\n' % (Ad, token, Ad_QueryToken_map[Ad][token] * 1.0 / token_map[token]))
    writer.close()
Beispiel #19
0
def dataCleaning() :
    """Run the data-cleaning pipeline over the training sample.

    Ranks ads by clicks, builds ad->users and user->ads maps, and dumps
    each intermediate result under TMP_DATA_DIR_PATH.
    """
    logging.info('===Data Cleaning Processing===')
    input_file = DATA_TRAINING_SAMPLE
    adClickCntList = generateTopAdsUsersByClick(input_file)
    dumpList2File(adClickCntList, TMP_DATA_DIR_PATH + 'topAdClickCnt.dict')

    adSet = set()
    for line in file(TMP_DATA_DIR_PATH + 'topAdClickCnt.dict') :
        cnt, adid = line.strip().split()
        adSet.add(adid)
    logging.debug(len(adSet))
    ad2Users = generateAd2UsersGivenAdSet(input_file, adSet)
    dumpDict2File(ad2Users, TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict')
    userDict = generateUser2AdGivenAd2User(TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict', adViewThreshold = 10)
    dumpDict2File(userDict, TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict')
    userSet = set()
    # NOTE(review): this logs before the loop fills userSet, so it always
    # reports 0 -- probably meant to run after the loop below.
    logging.debug(len(userSet))
    for line in file(TMP_DATA_DIR_PATH + 'user2AdGivenAd2User.dict') :
        user, ads = line.strip().split('\x01')
        userSet.add(user)

    dumpUserRawFeatureGivenUserSet(input_file, userSet, TMP_DATA_DIR_PATH + 'userRawFeature.dict')
def setsockopt_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    optval: out parameter
    return value: 0 (success) or -1 (error)
    errno

    Not Implemented: More checking

    """
    logging.debug('Entering setsockopt handler')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 5)
    # NOTE(review): fd_from_trace and optval_addr are never read below.
    fd_from_trace = int(syscall_object.args[0].value)
    optval_addr = params[3]
    # We don't check param[3] because it is an address of an empty buffer
    # We don't check param[4] because it is an address of an empty length
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Suppress the real syscall and answer from the trace.
    noop_current_syscall(pid)
    apply_return_conditions(pid, syscall_object)
def getsockopt_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    Sets:
    optval: The value being retrieved
    optval_len: The length of the value being retrieved
    return value: 0 (success) or 1 (failure)
    errno

    Not Implemented:
    * Use the address validator to check addresses
    """
    logging.debug('Entering getsockopt handler')
    # Pull out what we can compare
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 5)
    fd_from_trace = int(syscall_object.args[0].value)
    optval_addr = params[3]
    optval_len_addr = params[4]
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # This if is sufficient for now for the implemented options
    if params[1] != 1 or params[2] != 4:
        raise NotImplementedError('Unimplemented getsockopt level or optname')
    optval_len = int(syscall_object.args[4].value.strip('[]'))
    if optval_len != 4:
        raise NotImplementedError('getsockopt() not implemented for '
                                  'optval sizes other than 4')
    optval = int(syscall_object.args[3].value.strip('[]'))
    logging.debug('Optval: %s', optval)
    logging.debug('Optval Length: %s', optval_len)
    logging.debug('Optval addr: %x', optval_addr & 0xffffffff)
    # Fixed: message typo ('Lenght') and %d -> %x so both addresses are
    # logged consistently as hex.
    logging.debug('Optval Length addr: %x', optval_len_addr & 0xffffffff)
    # Suppress the real call, then write the traced optval/optlen into the
    # tracee's buffers and apply the traced return value.
    noop_current_syscall(pid)
    cint.populate_int(pid, optval_addr, optval)
    cint.populate_int(pid, optval_len_addr, 4)
    apply_return_conditions(pid, syscall_object)
def expandFeatureId2Tokens(aggregateUserfile, expandId2TokensResultFile, query_set, desc_set, title_set) :
    """Join query/title/description token strings onto each user record.

    Loads only the id->tokens rows whose ids appear in the given filter
    sets, then writes one record per user: userID, query tokens, title
    tokens and description tokens, separated by the \\x01 control char.
    """
    logging.info('=========start expandFeatureId2Tokens processing=========')
    description_map = dict([(line.strip().split('\t')) for line in file(DATA_DESCRIPTION) if line.split('\t',1)[0] in desc_set])
    logging.debug('Read %s Done.' % DATA_DESCRIPTION)
    query_map = dict([(line.strip().split('\t')) for line in file(DATA_QUERY) if line.split('\t',1)[0] in query_set])
    logging.debug('Read %s Done.' % DATA_QUERY)
    title_map = dict([(line.strip().split('\t')) for line in file(DATA_TITLE) if line.split('\t',1)[0] in title_set])
    logging.debug('Read %s Done.' % DATA_TITLE)

    #profile_map = dict([(line.strip().split('\t', 1)) for line in file(DATA_PROFILE) if line.split('\t')])
    dump_format = '%s\x01%s\x01%s\x01%s\n'
    expandId2TokensResult = file(expandId2TokensResultFile, 'w') 
    logging.debug('start joining tokens')
    for line in file(aggregateUserfile) :
        # Record layout: userID \x01 (queryIDs \x02 titleIDs \x02 descIDs),
        # each id list tab-separated.
        userID, tmp_str = line.strip().split('\x01')
        queryIDlist, titleIDlist, descIDList = tmp_str.split('\x02')
        queryExpandTokensStr = '|'.join([query_map[queryId] for queryId in queryIDlist.split('\t') if queryId != ''])
        titleExpandTokensStr = '|'.join([title_map[titleId] for titleId in titleIDlist.split('\t') if titleId != ''])
        descExpandTokensStr = '|'.join([description_map[descId] for descId in descIDList.split('\t') if descId != ''])
        expandId2TokensResult.write( dump_format % \
               (userID, queryExpandTokensStr, titleExpandTokensStr, descExpandTokensStr))
    expandId2TokensResult.close()
Beispiel #23
0
def expandFeatureId2Tokens(aggregateUserfile, expandId2TokensResultFile,
                           query_set, desc_set, title_set):
    """Join query/title/description token strings onto each user record.

    Loads only the id->tokens rows whose ids appear in the given filter
    sets, then writes one record per user: userID, query tokens, title
    tokens and description tokens, separated by the \\x01 control char.
    """
    logging.info('=========start expandFeatureId2Tokens processing=========')
    description_map = dict([(line.strip().split('\t'))
                            for line in file(DATA_DESCRIPTION)
                            if line.split('\t', 1)[0] in desc_set])
    logging.debug('Read %s Done.' % DATA_DESCRIPTION)
    query_map = dict([(line.strip().split('\t')) for line in file(DATA_QUERY)
                      if line.split('\t', 1)[0] in query_set])
    logging.debug('Read %s Done.' % DATA_QUERY)
    title_map = dict([(line.strip().split('\t')) for line in file(DATA_TITLE)
                      if line.split('\t', 1)[0] in title_set])
    logging.debug('Read %s Done.' % DATA_TITLE)

    #profile_map = dict([(line.strip().split('\t', 1)) for line in file(DATA_PROFILE) if line.split('\t')])
    dump_format = '%s\x01%s\x01%s\x01%s\n'
    expandId2TokensResult = file(expandId2TokensResultFile, 'w')
    logging.debug('start joining tokens')
    for line in file(aggregateUserfile):
        # Record layout: userID \x01 (queryIDs \x02 titleIDs \x02 descIDs),
        # each id list tab-separated.
        userID, tmp_str = line.strip().split('\x01')
        queryIDlist, titleIDlist, descIDList = tmp_str.split('\x02')
        queryExpandTokensStr = '|'.join([
            query_map[queryId] for queryId in queryIDlist.split('\t')
            if queryId != ''
        ])
        titleExpandTokensStr = '|'.join([
            title_map[titleId] for titleId in titleIDlist.split('\t')
            if titleId != ''
        ])
        descExpandTokensStr = '|'.join([
            description_map[descId] for descId in descIDList.split('\t')
            if descId != ''
        ])
        expandId2TokensResult.write( dump_format % \
               (userID, queryExpandTokensStr, titleExpandTokensStr, descExpandTokensStr))
    expandId2TokensResult.close()
def socket_debug_printer(pid, syscall_object):
    """Log domain, type and protocol of a traced socket() socketcall."""
    param_ptr = cint.peek_register(pid, cint.ECX)
    args = extract_socketcall_parameters(pid, param_ptr, 3)
    domain, sock_type, protocol = args[0], args[1], args[2]
    logging.debug('Domain: %s', ADDRFAM_INT_TO_FAM[domain])
    logging.debug('Type: %s', SOCKTYPE_INT_TO_TYPE[sock_type])
    logging.debug('Protocol: %s', PROTOFAM_INT_TO_FAM[protocol])
Beispiel #25
0
import __init__
import sys
sys.path.append('../')
from util import logging


class dataParser:
    """Static helpers for parsing KDD-Cup training rows."""

    @staticmethod
    def parseTrainData(line):
        """Split a tab-separated training row into typed fields.

        Returns a 12-tuple with Click, Impression, Depth and Position as
        int and the remaining fields as str, or None when the row does
        not contain exactly 12 fields.
        """
        fields = line.strip().split('\t')
        if len(fields) != 12:
            return None
        (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
         Position, QueryID, KeywordID, TitleID, DescriptionID,
         UserID) = fields
        return (int(Click), int(Impression), Display_url,
                AdID, AdvertiserID, int(Depth),
                int(Position), QueryID, KeywordID,
                TitleID, DescriptionID, UserID)


if __name__ == '__main__':
    # Smoke test: parse one sample training row and log the parsed tuple.
    example_row = '0\t1\t4298118681424644510\t7686695\t385\t3\t3\t1601\t5521\t7709\t576\t490234'
    logging.debug(dataParser.parseTrainData(example_row))
def socket_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The domain of the socket
    Sets:
    return value: file descriptor of the new socket -1 (error)
        (added as replay file descriptor)
    errno

    Not Implemented:
    * Determine what is not implemented
    """
    logging.debug('Entering socket subcall entry handler')

    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    # Only PF_INET and PF_LOCAL socket calls are handled
    execution_is_PF_INET = (params[0] == cint.PF_INET)
    trace_is_PF_INET = (str(syscall_object.args[0]) == '[\'PF_INET\']')
    execution_is_PF_LOCAL = (params[0] == 1)  # define PF_LOCAL 1
    trace_is_PF_LOCAL = (str(syscall_object.args[0]) == '[\'PF_LOCAL\']')
    logging.debug('Execution is PF_INET: %s', execution_is_PF_INET)
    logging.debug('Trace is PF_INET: %s', trace_is_PF_INET)
    # Fixed: log message typo ('Exeuction' -> 'Execution').
    logging.debug('Execution is PF_LOCAL: %s', execution_is_PF_LOCAL)
    logging.debug('Trace is PF_LOCAL: %s', trace_is_PF_LOCAL)
    # Execution and trace must agree on the protocol family, for both
    # families we care about.
    if execution_is_PF_INET != trace_is_PF_INET:
        raise ReplayDeltaError(
            'Encountered socket subcall with mismatch between '
            'execution protocol family and trace protocol family')
    if execution_is_PF_LOCAL != trace_is_PF_LOCAL:
        raise ReplayDeltaError(
            'Encountered socket subcall with mismatch between '
            'execution protocol family and trace protocol family')
    # Decide if we want to deal with this socket call or not
    if trace_is_PF_INET or \
       execution_is_PF_INET or \
       trace_is_PF_LOCAL or \
       execution_is_PF_LOCAL:
        # Suppress the real syscall and answer with the traced fd.
        noop_current_syscall(pid)
        fd = int(syscall_object.ret[0])
        logging.debug('File Descriptor from trace: %s', fd)
        apply_return_conditions(pid, syscall_object)
    else:
        logging.info('Ignoring non-PF_INET call to socket')
def accept_subcall_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: sockfd: the socket file descriptor
    Sets:
    return value: The file descriptor -1 (error)
    errno

    Not Implemented:
    * Implement a function to check null terminated strings to clean up this
      mess of checking
    """
    logging.debug('Checking if line from trace is interrupted accept')
    if syscall_object.ret[0] == '?':
        raise NotImplementedError('Interrupted accept()s not implemented')
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    sockaddr_addr = params[1]
    sockaddr_len_addr = params[2]
    # NOTE(review): fd_from_trace is never read after this; the fd check
    # happens inside validate_integer_argument.
    fd_from_trace = syscall_object.args[0].value
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Decide if this is a system call we want to replay
    noop_current_syscall(pid)
    # Populate the tracee's sockaddr out-parameter only for successful
    # accepts that actually supplied a sockaddr buffer.
    if syscall_object.ret[0] != -1 and syscall_object.args[1].value != 'NULL':
        sockfields = syscall_object.args[1].value
        family = sockfields[0].value
        port = int(sockfields[1].value)
        ip = sockfields[2].value
        sockaddr_length = int(syscall_object.args[2].value.strip('[]'))
        logging.debug('Family: %s', family)
        logging.debug('Port: %s', port)
        logging.debug('IP: %s', ip)
        logging.debug('sockaddr Length: %s', sockaddr_length)
        logging.debug('sockaddr addr: %x', sockaddr_addr & 0xffffffff)
        logging.debug('sockaddr length addr: %x',
                      sockaddr_len_addr & 0xffffffff)
        logging.debug('pid: %s', pid)
        cint.populate_af_inet_sockaddr(pid, sockaddr_addr, port, ip,
                                       sockaddr_len_addr, sockaddr_length)
    # NOTE(review): `ret` is assigned but never used below.
    if syscall_object.ret[0] != -1:
        ret = syscall_object.ret[0]
    apply_return_conditions(pid, syscall_object)
Beispiel #28
0
 def svm_rank_learn(features, output_model, args = '') :
     """Shell out to svm_rank_learn to train a model from `features`.

     NOTE(review): the command line is built by string interpolation and
     run through os.system, so shell metacharacters in any argument are
     interpreted by the shell -- only use with trusted paths.
     """
     logging.info(('=='*10 + '%s' + '=='*10) % ( 'START SVM LEARNING'))
     svm_rank_learn_format = '%s %s %s %s'
     cmd_text = svm_rank_learn_format % (SVM_RANK.svm_rank_learn_command, args, features, output_model)
     logging.debug(cmd_text)
     os.system(cmd_text)
Beispiel #29
0
def joinResult4SVMRanking(fn_trainFeature, fn_ad2userStatus, fn_out_SVMRanking,
                          fn_userRawExpandTokens, fn_userid4SVMRanking,
                          fn_ad2UsersGivenAdSet):
    '''Join user features and ad/user status into SVM-rank training data.

    fn_trainFeature=TMP_DATA_DIR_PATH+'LDA_corpus.svmlight'
    fn_ad2userStatus=TMP_DATA_DIR_PATH+'ad2userStatus.dict'
    fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'

    Writes one '<status> qid:<adIdx> <features>' line per (ad, user) pair
    that has both a feature vector and a status, mirrors the user ids to
    fn_userid4SVMRanking line-for-line, and dumps the ad->qid index map.
    '''

    logging.info('=====joinResult4SVMRanking Start=====')
    userFeature = {}
    userlist = []
    #fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'
    for line in file(fn_userRawExpandTokens):
        userid, query, title, desc = line.strip().split('\x01')
        userlist.append(userid)

    # Feature lines are paired with userlist by position: line i of the
    # feature file belongs to userlist[i].
    trainFeature = file(fn_trainFeature)
    for userid in userlist:
        fields = trainFeature.readline().strip().split(' ', 1)
        if len(fields) != 2: continue
        tmp, feature_str = fields
        # Drop users with 5 or fewer feature components.
        if len(feature_str.split()) <= 5: continue
        userFeature[userid] = feature_str

    logging.debug('=====load raw training Feature Done.=====')
    logging.debug('=====loading status map.=====')

    statusMap = {}
    for line in file(fn_ad2userStatus):
        adid, userid, click, impression = line.strip().split('\t')
        click = int(click)
        impression = int(impression)
        status = genStatus(click, impression)
        statusMap[(adid, userid)] = status

    logging.debug('=====join final data start=====')
    output = file(fn_out_SVMRanking, 'w')
    # NOTE(review): `format` shadows the builtin of the same name.
    format = '%d qid:%d %s\n'
    adid2Idx = {}

    #line number of userid4SVMRanking equals to output4SVMRanking's
    #fn_userid4SVMRanking = TMP_DATA_DIR_PATH+'userid4SVMRanking.dat'
    userid_output = file(fn_userid4SVMRanking, 'w')

    #fn_ad2UsersGivenAdSet = TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict'
    # qids are assigned in first-seen ad order, starting at 1.
    idx = 1
    for line in file(fn_ad2UsersGivenAdSet):
        adid, user_str = line.strip().split('\x01')
        if adid not in adid2Idx:
            adid2Idx[adid] = idx
            idx += 1
        userids = user_str.split('\t')
        for userid in userids:
            if userid not in userFeature or (adid, userid) not in statusMap:
                continue
            userid_output.write('%s\n' % userid)
            output.write(format % (statusMap[(adid, userid)], adid2Idx[adid],
                                   userFeature[userid]))

    output.close()
    userid_output.close()
    dumpDict2File(adid2Idx, TMP_DATA_DIR_PATH + 'adid2Idx.dict')
Beispiel #30
0
import __init__
import sys
sys.path.append('../')
from util import logging

class dataParser :
    """Static helpers for parsing KDD-Cup training rows."""

    @staticmethod
    def parseTrainData(line) :
        """Split a tab-separated training row into typed fields.

        Returns a 12-tuple with Click, Impression, Depth and Position as
        int and the rest as str, or None when the row does not contain
        exactly 12 fields.
        """
        fields = line.strip().split('\t')
        if len(fields) != 12:
            return None
        (Click, Impression, Display_url, AdID, AdvertiserID, Depth,
         Position, QueryID, KeywordID, TitleID, DescriptionID,
         UserID) = fields
        return (int(Click), int(Impression), Display_url,
                AdID, AdvertiserID, int(Depth),
                int(Position), QueryID, KeywordID,
                TitleID, DescriptionID, UserID)

if __name__ == '__main__' :
    # Smoke test: parse one sample training row and log the parsed tuple.
    example_row = '0\t1\t4298118681424644510\t7686695\t385\t3\t3\t1601\t5521\t7709\t576\t490234'
    logging.debug(dataParser.parseTrainData(example_row))
def getsockname_entry_handler(syscall_id, syscall_object, pid):
    """Replay Always
    Checks:
    0: The socket file descriptor
    Sets:
    addr: a struct sockaddr populated with the requested information
    addrlen: length of the sockaddr struct being populated
    return value: 0 (success) or -1 (failure)
    errno

    Not Implemented:
    * Use address validator to check the addresses
    """
    logging.debug('Entering getsockname handler')
    # Pull out the info that we can check
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    # We don't compare params[1] because it is the address of an empty buffer
    # We don't compare params[2] because it is the address of an out parameter
    # Get values from trace for comparison
    # NOTE(review): fd_from_trace is never read after this; the fd check
    # happens inside validate_integer_argument.
    fd_from_trace = syscall_object.args[0].value
    validate_integer_argument(pid, syscall_object, 0, 0, params=params)
    # Decide if this is a file descriptor we want to deal with
    noop_current_syscall(pid)
    if syscall_object.ret[0] != -1:
        logging.debug('Got successful getsockname call')
        addr = params[1]
        length_addr = params[2]
        length = int(syscall_object.args[2].value.strip('[]'))
        logging.debug('Addr: %d', addr & 0xffffffff)
        logging.debug('Length addr: %d', length_addr & 0xffffffff)
        logging.debug('Length: %d', length)
        # Traced sockaddr fields: family, port, ip (AF_INET layout only).
        sockfields = syscall_object.args[1].value
        family = sockfields[0].value
        port = int(sockfields[1].value)
        ip = sockfields[2].value
        logging.debug('Family: %s', family)
        logging.debug('Port: %d', port)
        logging.debug('Ip: %s', ip)
        if family != 'AF_INET':
            raise NotImplementedError('getsockname only supports ' 'AF_INET')
        # Write the traced sockaddr into the tracee's out-parameters.
        cint.populate_af_inet_sockaddr(pid, addr, port, ip, length_addr,
                                       length)
    else:
        logging.debug('Got unsuccessful getsockname call')
    apply_return_conditions(pid, syscall_object)
def joinResult4SVMRanking(fn_trainFeature, fn_ad2userStatus, fn_out_SVMRanking, fn_userRawExpandTokens, fn_userid4SVMRanking, fn_ad2UsersGivenAdSet) :
    '''Join user features and ad/user status into SVM-rank training data.

    fn_trainFeature=TMP_DATA_DIR_PATH+'LDA_corpus.svmlight'
    fn_ad2userStatus=TMP_DATA_DIR_PATH+'ad2userStatus.dict'
    fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'

    Writes one '<status> qid:<adIdx> <features>' line per (ad, user) pair
    that has both a feature vector and a status, mirrors the user ids to
    fn_userid4SVMRanking line-for-line, and dumps the ad->qid index map.
    '''

    logging.info('=====joinResult4SVMRanking Start=====')
    userFeature = {}
    userlist = []
    #fn_userRawExpandTokens = TMP_DATA_DIR_PATH + 'userRawExpandTokens.dict'
    for line in file(fn_userRawExpandTokens) :
        userid, query, title, desc = line.strip().split('\x01')
        userlist.append(userid)

    # Feature lines are paired with userlist by position: line i of the
    # feature file belongs to userlist[i].
    trainFeature = file(fn_trainFeature)
    for userid in userlist :
        fields = trainFeature.readline().strip().split(' ',1)
        if len(fields) != 2 : continue
        tmp, feature_str = fields
        # Drop users with 5 or fewer feature components.
        if len(feature_str.split()) <= 5 : continue
        userFeature[userid] = feature_str

    logging.debug('=====load raw training Feature Done.=====')
    logging.debug('=====loading status map.=====')

    statusMap = {}
    for line in file(fn_ad2userStatus) :
        adid, userid, click, impression = line.strip().split('\t')
        click = int(click)
        impression = int(impression)
        status = genStatus(click, impression)
        statusMap[(adid, userid)] = status

    logging.debug('=====join final data start=====')
    output = file(fn_out_SVMRanking, 'w')
    # NOTE(review): `format` shadows the builtin of the same name.
    format = '%d qid:%d %s\n'
    adid2Idx = {}

    #line number of userid4SVMRanking equals to output4SVMRanking's
    #fn_userid4SVMRanking = TMP_DATA_DIR_PATH+'userid4SVMRanking.dat'
    userid_output = file(fn_userid4SVMRanking, 'w')

    #fn_ad2UsersGivenAdSet = TMP_DATA_DIR_PATH + 'ad2UsersGivenAdSet.dict'
    # qids are assigned in first-seen ad order, starting at 1.
    idx = 1
    for line in file(fn_ad2UsersGivenAdSet) :
        adid, user_str = line.strip().split('\x01')
        if adid not in adid2Idx :
            adid2Idx[adid] = idx
            idx += 1
        userids = user_str.split('\t')
        for userid in userids :
            if userid not in userFeature or (adid, userid) not in statusMap:
                continue
            userid_output.write('%s\n' % userid)
            output.write(format % (statusMap[(adid, userid)], adid2Idx[adid], userFeature[userid]))

    output.close()
    userid_output.close()
    dumpDict2File(adid2Idx, TMP_DATA_DIR_PATH+'adid2Idx.dict')
Beispiel #33
0
 def svm_rank_classify(features, model, predictions):
     """Shell out to svm_rank_classify to score `features` with `model`.

     NOTE(review): the command line is built by string interpolation and
     run through os.system, so shell metacharacters in any argument are
     interpreted by the shell -- only use with trusted paths.
     """
     logging.info(('=='*10 + '%s' + '=='*10) % ( 'START SVM CLASSIFING'))
     svm_rank_classify_format = '%s %s %s %s'
     cmd_text = svm_rank_classify_format % (SVM_RANK.svm_rank_classify_command, features, model, predictions)
     logging.debug(cmd_text)
     os.system(cmd_text)
def getpeername_entry_handler(syscall_id, syscall_object, pid):
    """Replay a getpeername() socketcall from the trace.

    Checks the socket fd against the trace, no-ops the real call, and for
    successful traced calls writes the traced AF_INET sockaddr into the
    tracee's out-parameters before applying the traced return value.

    Raises:
        ReplayDeltaError: execution fd differs from the traced fd.
        NotImplementedError: traced address family is not AF_INET.
    """
    logging.debug('Entering getpeername handler')
    # Pull out the info that we can check
    ecx = cint.peek_register(pid, cint.ECX)
    params = extract_socketcall_parameters(pid, ecx, 3)
    fd = params[0]
    # We don't compare params[1] because it is the address of an empty buffer
    # We don't compare params[2] because it is the address of an out parameter
    # Get values from trace for comparison
    fd_from_trace = syscall_object.args[0].value
    # Check to make sure everything is the same
    # NOTE(review): sibling handlers use validate_integer_argument for this
    # comparison; this one checks the fd inline -- confirm whether the
    # helper applies extra offset logic before unifying.
    if fd != int(fd_from_trace):
        raise ReplayDeltaError(
            'File descriptor from execution ({}) '
            'does not match file descriptor from trace ({})'.format(
                fd, fd_from_trace))
    # Decide if this is a file descriptor we want to deal with
    noop_current_syscall(pid)
    if syscall_object.ret[0] != -1:
        logging.debug('Got successful getpeername call')
        addr = params[1]
        length_addr = params[2]
        length = int(syscall_object.args[2].value.strip('[]'))
        logging.debug('Addr: %d', addr)
        logging.debug('Length addr: %d', length_addr)
        logging.debug('Length: %d', length)
        # Traced sockaddr fields: family, port, ip (AF_INET layout only).
        sockfields = syscall_object.args[1].value
        family = sockfields[0].value
        port = int(sockfields[1].value)
        ip = sockfields[2].value
        logging.debug('Family: %s', family)
        logging.debug('Port: %d', port)
        logging.debug('Ip: %s', ip)
        if family != 'AF_INET':
            raise NotImplementedError('getpeername only ' 'supports AF_INET')
        cint.populate_af_inet_sockaddr(pid, addr, port, ip, length_addr,
                                       length)
    else:
        logging.debug('Got unsuccessful getpeername call')
    apply_return_conditions(pid, syscall_object)