Exemple #1
0
 def is_stopURL(self, url):
     '''Judge whether this url is a stop URL.

     The url counts as a stop URL when either the url itself or the value
     returned by ``up.drop_variation(url)`` is a member of
     ``self.stopURLset``.

     output: True if the url is a stop URL, False otherwise
     '''
     simplified = up.drop_variation(url)
     # The membership tests already yield booleans, so return them directly.
     return (simplified in self.stopURLset) or (url in self.stopURLset)
Exemple #2
0
    def is_stopURL(self, url):
        '''Judge whether this url is a stop URL.

        The url counts as a stop URL when either the url itself or the value
        returned by ``up.drop_variation(url)`` is a member of
        ``self.stopURLset``.

        output: True if the url is a stop URL, False otherwise
        '''
        simplified = up.drop_variation(url)
        # The membership tests already yield booleans, so return them directly.
        return (simplified in self.stopURLset) or (url in self.stopURLset)
Exemple #3
0
 def search_url_index(self, url):
     '''
     Find every node whose content equals the simplified form of ``url``.

     The url is first passed through ``up.drop_variation`` and the result
     is compared against each element of ``self.treeContent``.

     output: list of the matching indices into ``self.treeContent`` in
         ascending order; empty when there is no match.
         (presumably the same indices address the "treeRelation" list --
         TODO confirm against the owning class)
     '''
     url_tmp = up.drop_variation(url)
     # One pass over the list replaces the repeated slice + index() scans.
     return [ind for ind, content in enumerate(self.treeContent)
             if content == url_tmp]
Exemple #4
0
 def search_url_index(self, url):
     '''
     Find every node whose content equals the simplified form of ``url``.

     The url is first passed through ``up.drop_variation`` and the result
     is compared against each element of ``self.treeContent``.

     output: list of the matching indices into ``self.treeContent`` in
         ascending order; empty when there is no match.
         (presumably the same indices address the "treeRelation" list --
         TODO confirm against the owning class)
     '''
     url_tmp = up.drop_variation(url)
     # One pass over the list replaces the repeated slice + index() scans.
     return [ind for ind, content in enumerate(self.treeContent)
             if content == url_tmp]
Exemple #5
0
    def get_stop_urls(self):
        '''Fill ``self.stopURLset`` from the files named in ``self.file_list``.

        Each file is loaded with ``self.read_json`` from under ``self.path``.
        For every item in ``data['log']['entries']`` both the original
        request URL and its ``up.drop_variation`` form are added to
        ``self.stopURLset``.

        output: None (the set is updated in place)
        '''
        for f in self.file_list:
            data = self.read_json(self.path + '/' + f)
            for currentItem in data['log']['entries']:
                ori_requestURL = currentItem['request']['url']
                # Store both spellings so that either one matches later.
                self.stopURLset.add(up.drop_variation(ori_requestURL))
                self.stopURLset.add(ori_requestURL)
Exemple #6
0
    def get_stop_urls(self):
        '''Fill ``self.stopURLset`` from the files named in ``self.file_list``.

        Each file is loaded with ``self.read_json`` from under ``self.path``.
        For every item in ``data['log']['entries']`` both the original
        request URL and its ``up.drop_variation`` form are added to
        ``self.stopURLset``.

        output: None (the set is updated in place)
        '''
        for f in self.file_list:
            data = self.read_json(self.path + '/' + f)
            for currentItem in data['log']['entries']:
                ori_requestURL = currentItem['request']['url']
                # Store both spellings so that either one matches later.
                self.stopURLset.add(up.drop_variation(ori_requestURL))
                self.stopURLset.add(ori_requestURL)
Exemple #7
0
def get_Tree(PATH, dumpPATH, stop):
    '''
    input:
        PATH: .har file path. The .har file record the traffic
        dumpPATH: A .txt file. This file record a matrix whose fomat is 
            "treeplotVec; url; timestamp"
        stop: A Stop_url(stopURL.py) object.
        
    output:
        a Tree object
    '''
    #    onLine_re = r'((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?'

    data = up.readJason(PATH)
    '''
        currentItem = data['log']['entries'][i]
        
        treeContent -> up.drop_variation( currentItem['request']['url'] )
        original_treeContent -> currentItem['request']['url']
        indexList -> i(or -i if the node is root of a tree)
        wait_interval -> currentItem['timings']['wait'], (ms毫秒)
        mimeType -> currentItem['response']['content']['mimeType']
    '''
    treeRelation = []  #generate a matlab-treeplot()-like vector
    treeContent = [
    ]  #record the simplified URL corresponding to "treeRelation"
    original_treeContent = [
    ]  #record the original URL corresponding to "treeRelation"
    indexList = [
    ]  #record the index of 'entries' from whose content can find the url, negative element for root node
    wait_interval = [
    ]  #record the request-response interval of corresponding page
    mimeType = []  #record the mimeType of corresponding page
    treeTimestamp = []  #tuple elements here, (date, )
    positionInText = [
    ]  #tuple elements here represent the url ( begin, end) position
    #in text. ( -2, -2) for root node
    size = []

    for i in range(0, len(data['log']['entries'])):
        currentItem = data['log']['entries'][i]
        #        if currentItem['response']['content']['size'] > 102400:
        #            print i, 'size:',currentItem['response']['content']['size']
        #            input('big size')
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation(ori_requestURL)
        if stop.is_stopURL(ori_requestURL):
            print "StopURL:", ori_requestURL
            continue

        #process this request-response pair
        #process request part
        ifInTree, location = judge_if_existing(treeContent, wait_interval,
                                               requestURL)
        if ifInTree:  #if the requested content has pushed in the tree
            root = location + 1  # get the root index for urls in response text
            # section below is used to debug
            if wait_interval[location] >= 0:
                print 'PATH 0f file:\t', PATH
                print 'entity index:\t', i
                print 'requested URL:\t', ori_requestURL
                print 'URL of the existing node:\t', treeContent[location]
                print 'node location in array:\t', location
                print 'root of this node:\t', treeRelation[location]
                print 'value of the existing node:\t', wait_interval[location]
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            wait_interval[location] = currentItem['timings']['wait']
            mimeType[location] = currentItem['response']['content']['mimeType']
            treeTimestamp[location] = up.get_fiddle_timestamp(
                currentItem['startedDateTime'])
            size[location] = currentItem['response']['content']['size']
            treeContent[location] = requestURL
            original_treeContent[location] = ori_requestURL
        else:
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append(currentItem['timings']['wait'])
            mimeType.append(currentItem['response']['content']['mimeType'])
            treeTimestamp.append(
                up.get_fiddle_timestamp(currentItem['startedDateTime']))
            indexList.append(-i)
            positionInText.append((-2, -2))
            size.append(currentItem['response']['content']['size'])
            root = len(treeContent)

        #process response part
        if data['log']['entries'][i]['response']['content'].has_key('text'):
            string = data['log']['entries'][i]['response']['content']['text']
            for url, start_pos, end_pos, count in up.get_urlSet_from_text(
                    string):
                treeRelation.append(root)
                treeContent.append(up.drop_variation(url))  #[Q2]
                original_treeContent.append(url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'', -1, u''))
                positionInText.append((start_pos, end_pos))
                size.append(-2)


#            for item in subPatt:
#                treeRelation.append( root )
#                url = item[0] + item[2] + item[6] # don't aky why, I'll tell you "because of love  ╮( ̄▽ ̄)╭"
#                url = url.rstrip('\\')
#                treeContent.append( up.drop_variation(url) ) #[Q2]
#                original_treeContent.append( url )
#                wait_interval.append(-1)
#                indexList.append(i)
#                mimeType.append(u'')
#                treeTimestamp.append((u'',-1,u''))
        else:
            print i
            print currentItem['response']['content']['mimeType']

    tree_info_mat = {}
    tree_info_mat['treeRelation'] = copy.deepcopy(treeRelation)
    tree_info_mat['treeContent'] = copy.deepcopy(treeContent)
    tree_info_mat['indexList'] = copy.deepcopy(indexList)
    tree_info_mat['original_treeContent'] = copy.deepcopy(original_treeContent)
    tree_info_mat['wait_interval'] = copy.deepcopy(wait_interval)
    tree_info_mat['mimeType'] = copy.deepcopy(mimeType)
    tree_info_mat['filename'] = PATH
    tree_info_mat['treeTimestamp'] = copy.deepcopy(treeTimestamp)
    tree_info_mat['dumpPath'] = dumpPATH
    tree_info_mat['positionInText'] = copy.deepcopy(positionInText)
    tree_info_mat['size'] = copy.deepcopy(size)

    return Tree(tree_info_mat)
Exemple #8
0
def get_Tree(PATH, dumpPATH, stop):
    '''
    input:
        PATH: .har file path. The .har file record the traffic
        dumpPATH: A .txt file. This file record a matrix whose fomat is 
            "treeplotVec; url; timestamp"
        stop: A Stop_url(stopURL.py) object.
        
    output:
        a Tree object
    '''
#    onLine_re = r'((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?'
    
    data = up.readJason(PATH)
    
    '''
        currentItem = data['log']['entries'][i]
        
        treeContent -> up.drop_variation( currentItem['request']['url'] )
        original_treeContent -> currentItem['request']['url']
        indexList -> i(or -i if the node is root of a tree)
        wait_interval -> currentItem['timings']['wait'], (ms毫秒)
        mimeType -> currentItem['response']['content']['mimeType']
    '''
    treeRelation = [] #generate a matlab-treeplot()-like vector
    treeContent = [] #record the simplified URL corresponding to "treeRelation"
    original_treeContent = [] #record the original URL corresponding to "treeRelation"
    indexList = [] #record the index of 'entries' from whose content can find the url, negative element for root node
    wait_interval = [] #record the request-response interval of corresponding page
    mimeType = [] #record the mimeType of corresponding page
    treeTimestamp = [] #tuple elements here, (date, )
    positionInText = [] #tuple elements here represent the url ( begin, end) position
                #in text. ( -2, -2) for root node
    size = []
    
    for i in range(0,len(data['log']['entries'])):
        currentItem = data['log']['entries'][i]
#        if currentItem['response']['content']['size'] > 102400:
#            print i, 'size:',currentItem['response']['content']['size']
#            input('big size')
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation( ori_requestURL );
        if stop.is_stopURL(ori_requestURL):
            print "StopURL:",ori_requestURL
            continue
        
        #process this request-response pair
        #process request part
        ifInTree, location = judge_if_existing( treeContent, wait_interval, requestURL)
        if ifInTree: #if the requested content has pushed in the tree
            root = location + 1 # get the root index for urls in response text
            # section below is used to debug
            if wait_interval[location] >= 0:
                print 'PATH 0f file:\t',PATH
                print 'entity index:\t',i
                print 'requested URL:\t',ori_requestURL
                print 'URL of the existing node:\t',treeContent[location]
                print 'node location in array:\t',location
                print 'root of this node:\t',treeRelation[location]
                print 'value of the existing node:\t',wait_interval[location]
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            wait_interval[location] = currentItem['timings']['wait']
            mimeType[location] = currentItem['response']['content']['mimeType']
            treeTimestamp[location] = up.get_fiddle_timestamp(currentItem['startedDateTime'])
            size[location] = currentItem['response']['content']['size']
            treeContent[ location] = requestURL
            original_treeContent[ location] = ori_requestURL
        else:
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append( currentItem['timings']['wait'] )
            mimeType.append(currentItem['response']['content']['mimeType'])
            treeTimestamp.append(up.get_fiddle_timestamp(currentItem['startedDateTime']))
            indexList.append(-i)
            positionInText.append((-2,-2))
            size.append(currentItem['response']['content']['size'])
            root = len(treeContent)
            
        #process response part
        if data['log']['entries'][i]['response']['content'].has_key('text'):
            string = data['log']['entries'][i]['response']['content']['text']
            for url, start_pos, end_pos, count in up.get_urlSet_from_text( string):
                treeRelation.append( root )
                treeContent.append( up.drop_variation(url)) #[Q2]
                original_treeContent.append( url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'',-1,u''))
                positionInText.append(( start_pos, end_pos))
                size.append(-2)
#            for item in subPatt:
#                treeRelation.append( root )
#                url = item[0] + item[2] + item[6] # don't aky why, I'll tell you "because of love  ╮( ̄▽ ̄)╭"
#                url = url.rstrip('\\')
#                treeContent.append( up.drop_variation(url) ) #[Q2]
#                original_treeContent.append( url )
#                wait_interval.append(-1)
#                indexList.append(i)
#                mimeType.append(u'')
#                treeTimestamp.append((u'',-1,u''))
        else:
            print i
            print currentItem['response']['content']['mimeType']
    
    tree_info_mat = {}
    tree_info_mat['treeRelation'] = copy.deepcopy( treeRelation)
    tree_info_mat['treeContent'] = copy.deepcopy( treeContent)
    tree_info_mat['indexList'] = copy.deepcopy( indexList)
    tree_info_mat['original_treeContent'] = copy.deepcopy( original_treeContent)
    tree_info_mat['wait_interval'] = copy.deepcopy( wait_interval)
    tree_info_mat['mimeType'] = copy.deepcopy( mimeType)
    tree_info_mat['filename'] = PATH
    tree_info_mat['treeTimestamp'] = copy.deepcopy( treeTimestamp )
    tree_info_mat['dumpPath'] = dumpPATH
    tree_info_mat['positionInText'] = copy.deepcopy( positionInText)
    tree_info_mat['size'] = copy.deepcopy( size)

    return Tree(tree_info_mat)