def is_stopURL(self, url):
    '''Judge whether this url is a stop URL.

    input:
        url: the URL string to look up.
    output:
        True when either the URL itself or the form returned by
        up.drop_variation(url) is a member of self.stopURLset,
        otherwise False.
    '''
    # The normalized form is computed first, exactly as before, so any error
    # raised by up.drop_variation still surfaces before the lookups happen.
    normalized = up.drop_variation(url)
    known = self.stopURLset
    return normalized in known or url in known
def search_url_index(self, url):
    '''Find every node whose stored URL matches the given one.

    input:
        url: the URL to look for; it is passed through up.drop_variation
             before being compared.
    output:
        a list, in ascending order, of every position in self.treeContent
        whose element equals the normalized URL (the original note describes
        these as the node's index in the "treeRelation" list). The list is
        empty when nothing matches.
    '''
    url_tmp = up.drop_variation(url)
    # One pass over the list. The previous version re-sliced and re-scanned
    # the remainder of the list for every hit, and its "while" guard could
    # never become false, so only the inner "break" ended the loop.
    return [ind for ind, content in enumerate(self.treeContent)
            if content == url_tmp]
def search_url_index(self, url):
    '''Find every node whose stored URL matches the given one.

    input:
        url: the URL to look for; it is passed through up.drop_variation
             before being compared.
    output:
        a list, in ascending order, of every position in self.treeContent
        whose element equals the normalized URL (the original note describes
        these as the node's index in the "treeRelation" list). The list is
        empty when nothing matches.
    '''
    url_tmp = up.drop_variation(url)
    # One pass over the list. The previous version re-sliced and re-scanned
    # the remainder of the list for every hit, and its "while" guard could
    # never become false, so only the inner "break" ended the loop.
    return [ind for ind, content in enumerate(self.treeContent)
            if content == url_tmp]
def get_stop_urls(self):
    '''Fill self.stopURLset from the files listed in self.file_list.

    Each file name is joined to self.path with '/' and loaded through
    self.read_json. For every element of data['log']['entries'] the request
    URL is added to self.stopURLset twice: once as recorded and once in the
    form returned by up.drop_variation.

    output:
        None; the result is the updated self.stopURLset.
    '''
    for file_name in self.file_list:
        data = self.read_json(self.path + '/' + file_name)
        for entry in data['log']['entries']:
            ori_requestURL = entry['request']['url']
            # The URL used to be parsed with urlparse here as well, but the
            # parse result only fed a disabled hostname rule, so that unused
            # work is gone. Only URLs are stored, never bare hostnames.
            self.stopURLset.add(up.drop_variation(ori_requestURL))
            self.stopURLset.add(ori_requestURL)
    return
def get_stop_urls(self):
    '''Fill self.stopURLset from the files listed in self.file_list.

    Each file name is joined to self.path with '/' and loaded through
    self.read_json. For every element of data['log']['entries'] the request
    URL is added to self.stopURLset twice: once as recorded and once in the
    form returned by up.drop_variation.

    output:
        None; the result is the updated self.stopURLset.
    '''
    for file_name in self.file_list:
        data = self.read_json(self.path + '/' + file_name)
        for entry in data['log']['entries']:
            ori_requestURL = entry['request']['url']
            # The URL used to be parsed with urlparse here as well, but the
            # parse result only fed a disabled hostname rule, so that unused
            # work is gone. Only URLs are stored, never bare hostnames.
            self.stopURLset.add(up.drop_variation(ori_requestURL))
            self.stopURLset.add(ori_requestURL)
    return
def get_Tree(PATH, dumpPATH, stop):
    '''
    Build a Tree from the request/response entries of one .har capture.

    input:
        PATH: .har file path. The .har file records the traffic.
        dumpPATH: A .txt file path, stored unchanged in the result under
                  'dumpPath'. Per the original notes this file records a
                  matrix whose format is "treeplotVec; url; timestamp".
        stop: A Stop_url (stopURL.py) object. Entries whose request URL
              makes stop.is_stopURL() return true are skipped.
    output:
        a Tree object, constructed from a dict of parallel lists that hold
        one slot per node.
    '''
    data = up.readJason(PATH)

    # Parallel lists; slot k of each list describes node k.
    # For a requested entry (currentItem = data['log']['entries'][i]):
    #   treeContent          -> up.drop_variation(currentItem['request']['url'])
    #   original_treeContent -> currentItem['request']['url']
    #   indexList            -> i (or -i if the node is the root of a tree)
    #   wait_interval        -> currentItem['timings']['wait'] (ms, milliseconds)
    #   mimeType             -> currentItem['response']['content']['mimeType']
    treeRelation = []          # matlab-treeplot()-like vector: 0 for a root, else 1-based parent position
    treeContent = []           # simplified URL corresponding to "treeRelation"
    original_treeContent = []  # original URL corresponding to "treeRelation"
    indexList = []             # index of the entry whose content holds the url, negated for a root node
    wait_interval = []         # request-response interval of the page, -1 until it is requested
    mimeType = []              # mimeType of the page, u'' until it is requested
    treeTimestamp = []         # tuple elements; (u'', -1, u'') until the page is requested
    positionInText = []        # (begin, end) position of the url in the text, (-2, -2) for a root node
    size = []                  # response content size, -2 until the page is requested

    for i, currentItem in enumerate(data['log']['entries']):
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation(ori_requestURL)
        if stop.is_stopURL(ori_requestURL):
            print("StopURL: %s" % (ori_requestURL,))
            continue

        # ---- request part ----
        ifInTree, location = judge_if_existing(treeContent, wait_interval,
                                               requestURL)
        content = currentItem['response']['content']
        wait = currentItem['timings']['wait']
        timestamp = up.get_fiddle_timestamp(currentItem['startedDateTime'])

        if ifInTree:
            # The URL is already a node: fill in that node and attach the
            # URLs found in this response underneath it.
            root = location + 1  # 1-based position used as parent marker
            if wait_interval[location] >= 0:
                # Debugging aid: the matched node already carries a wait
                # value. Dump the details and pause.
                print('PATH 0f file:\t%s' % (PATH,))
                print('entity index:\t%s' % (i,))
                print('requested URL:\t%s' % (ori_requestURL,))
                print('URL of the existing node:\t%s' % (treeContent[location],))
                print('node location in array:\t%s' % (location,))
                print('root of this node:\t%s' % (treeRelation[location],))
                print('value of the existing node:\t%s' % (wait_interval[location],))
                # NOTE(review): under Python 2 input() evaluates whatever is
                # typed at the prompt; it is kept because it only serves as
                # an interactive pause - confirm it is still wanted.
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            wait_interval[location] = wait
            mimeType[location] = content['mimeType']
            treeTimestamp[location] = timestamp
            size[location] = content['size']
            treeContent[location] = requestURL
            original_treeContent[location] = ori_requestURL
        else:
            # First sighting of this URL: it becomes the root of a new tree.
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append(wait)
            mimeType.append(content['mimeType'])
            treeTimestamp.append(timestamp)
            # NOTE(review): -i equals 0 when i == 0, so a root created from
            # the first entry cannot be told apart by sign alone.
            indexList.append(-i)
            positionInText.append((-2, -2))
            size.append(content['size'])
            root = len(treeContent)

        # ---- response part ----
        if 'text' in content:
            # Every URL found in the body becomes a not-yet-requested child.
            found = up.get_urlSet_from_text(content['text'])
            for url, start_pos, end_pos, _count in found:
                treeRelation.append(root)
                treeContent.append(up.drop_variation(url))
                original_treeContent.append(url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'', -1, u''))
                positionInText.append((start_pos, end_pos))
                size.append(-2)
        else:
            # No body text recorded: report the entry index and its mimeType.
            print(i)
            print(content['mimeType'])

    # Tree receives its own copies of the lists.
    tree_info_mat = {
        'treeRelation': copy.deepcopy(treeRelation),
        'treeContent': copy.deepcopy(treeContent),
        'indexList': copy.deepcopy(indexList),
        'original_treeContent': copy.deepcopy(original_treeContent),
        'wait_interval': copy.deepcopy(wait_interval),
        'mimeType': copy.deepcopy(mimeType),
        'filename': PATH,
        'treeTimestamp': copy.deepcopy(treeTimestamp),
        'dumpPath': dumpPATH,
        'positionInText': copy.deepcopy(positionInText),
        'size': copy.deepcopy(size),
    }
    return Tree(tree_info_mat)
def get_Tree(PATH, dumpPATH, stop):
    '''
    Build a Tree from the request/response entries of one .har capture.

    input:
        PATH: .har file path. The .har file records the traffic.
        dumpPATH: A .txt file path, stored unchanged in the result under
                  'dumpPath'. Per the original notes this file records a
                  matrix whose format is "treeplotVec; url; timestamp".
        stop: A Stop_url (stopURL.py) object. Entries whose request URL
              makes stop.is_stopURL() return true are skipped.
    output:
        a Tree object, constructed from a dict of parallel lists that hold
        one slot per node.
    '''
    data = up.readJason(PATH)

    # Parallel lists; slot k of each list describes node k.
    # For a requested entry (currentItem = data['log']['entries'][i]):
    #   treeContent          -> up.drop_variation(currentItem['request']['url'])
    #   original_treeContent -> currentItem['request']['url']
    #   indexList            -> i (or -i if the node is the root of a tree)
    #   wait_interval        -> currentItem['timings']['wait'] (ms, milliseconds)
    #   mimeType             -> currentItem['response']['content']['mimeType']
    treeRelation = []          # matlab-treeplot()-like vector: 0 for a root, else 1-based parent position
    treeContent = []           # simplified URL corresponding to "treeRelation"
    original_treeContent = []  # original URL corresponding to "treeRelation"
    indexList = []             # index of the entry whose content holds the url, negated for a root node
    wait_interval = []         # request-response interval of the page, -1 until it is requested
    mimeType = []              # mimeType of the page, u'' until it is requested
    treeTimestamp = []         # tuple elements; (u'', -1, u'') until the page is requested
    positionInText = []        # (begin, end) position of the url in the text, (-2, -2) for a root node
    size = []                  # response content size, -2 until the page is requested

    for i, currentItem in enumerate(data['log']['entries']):
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation(ori_requestURL)
        if stop.is_stopURL(ori_requestURL):
            print("StopURL: %s" % (ori_requestURL,))
            continue

        # ---- request part ----
        ifInTree, location = judge_if_existing(treeContent, wait_interval,
                                               requestURL)
        content = currentItem['response']['content']
        wait = currentItem['timings']['wait']
        timestamp = up.get_fiddle_timestamp(currentItem['startedDateTime'])

        if ifInTree:
            # The URL is already a node: fill in that node and attach the
            # URLs found in this response underneath it.
            root = location + 1  # 1-based position used as parent marker
            if wait_interval[location] >= 0:
                # Debugging aid: the matched node already carries a wait
                # value. Dump the details and pause.
                print('PATH 0f file:\t%s' % (PATH,))
                print('entity index:\t%s' % (i,))
                print('requested URL:\t%s' % (ori_requestURL,))
                print('URL of the existing node:\t%s' % (treeContent[location],))
                print('node location in array:\t%s' % (location,))
                print('root of this node:\t%s' % (treeRelation[location],))
                print('value of the existing node:\t%s' % (wait_interval[location],))
                # NOTE(review): under Python 2 input() evaluates whatever is
                # typed at the prompt; it is kept because it only serves as
                # an interactive pause - confirm it is still wanted.
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            wait_interval[location] = wait
            mimeType[location] = content['mimeType']
            treeTimestamp[location] = timestamp
            size[location] = content['size']
            treeContent[location] = requestURL
            original_treeContent[location] = ori_requestURL
        else:
            # First sighting of this URL: it becomes the root of a new tree.
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append(wait)
            mimeType.append(content['mimeType'])
            treeTimestamp.append(timestamp)
            # NOTE(review): -i equals 0 when i == 0, so a root created from
            # the first entry cannot be told apart by sign alone.
            indexList.append(-i)
            positionInText.append((-2, -2))
            size.append(content['size'])
            root = len(treeContent)

        # ---- response part ----
        if 'text' in content:
            # Every URL found in the body becomes a not-yet-requested child.
            found = up.get_urlSet_from_text(content['text'])
            for url, start_pos, end_pos, _count in found:
                treeRelation.append(root)
                treeContent.append(up.drop_variation(url))
                original_treeContent.append(url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'', -1, u''))
                positionInText.append((start_pos, end_pos))
                size.append(-2)
        else:
            # No body text recorded: report the entry index and its mimeType.
            print(i)
            print(content['mimeType'])

    # Tree receives its own copies of the lists.
    tree_info_mat = {
        'treeRelation': copy.deepcopy(treeRelation),
        'treeContent': copy.deepcopy(treeContent),
        'indexList': copy.deepcopy(indexList),
        'original_treeContent': copy.deepcopy(original_treeContent),
        'wait_interval': copy.deepcopy(wait_interval),
        'mimeType': copy.deepcopy(mimeType),
        'filename': PATH,
        'treeTimestamp': copy.deepcopy(treeTimestamp),
        'dumpPath': dumpPATH,
        'positionInText': copy.deepcopy(positionInText),
        'size': copy.deepcopy(size),
    }
    return Tree(tree_info_mat)