def get_surround_text(self, index):
    '''
    Return the text surrounding a URL inside its response body.

    input: node's index in the "treeRelation" list.
    Returns a tuple (front, back, start):
        front/back -- up to self.SURROUND characters immediately before /
                      after the URL; when either side is short, the same
                      (smaller) amount is taken from both sides.
        start      -- 0  means the url is a root,
                      -1 means the url is not found (no response text),
                      otherwise the url's start position in the body text.

    NOTE(review): indexList stores -i for root nodes, but -0 == 0, so a
    root recorded at entries[0] does NOT satisfy `item_position < 0` here
    -- confirm against how indexList is populated in get_Tree.
    '''
    data = up.readJason(self.filename)
    item_position = self.indexList[index]
    if item_position < 0:
        # the url is a root: there is no surrounding response text
        return (" ", " ", 0)
    # hoist the repeated deep dict lookup; 'in' replaces the removed
    # Python-2-only dict.has_key() idiom (identical behavior)
    content = data['log']['entries'][item_position]['response']['content']
    if 'text' not in content:
        # response carries no body text: signal "not found"
        return ("", "", -1)
    text = content['text']
    start, end = self.positionInText[index]
    if start + 1 >= self.SURROUND and (len(text) - end - 1) >= self.SURROUND:
        # enough room on both sides: take the full window
        front = text[start - self.SURROUND:start]
        back = text[end:end + self.SURROUND]
    else:
        # short on at least one side: shrink both windows to the smaller
        # available room so front and back stay the same length
        min_num = min(start + 1, len(text) - end - 1)
        front = text[start - min_num:start]
        back = text[end:end + min_num]
    return (front, back, start)
def get_surround_text(self, index):
    '''
    input: node's index in the "treeRelation" list
    Get surrounding text: self.SURROUND chars for front and back
    respectively, plus "start":
        "start" == 0  -> the url is a root
        "start" == -1 -> the url is not found
        otherwise     -> the url start position in the response body text
    '''
    har = up.readJason(self.filename)
    entry_idx = self.indexList[index]
    if entry_idx < 0:
        # root node: no response body to extract context from
        return (" ", " ", 0)
    content = har['log']['entries'][entry_idx]['response']['content']
    if content.has_key('text'):
        body = content['text']
        begin, finish = self.positionInText[index]
        room_front = begin + 1
        room_back = len(body) - finish - 1
        # full window only when both sides have room; otherwise shrink
        # symmetrically to the smaller side
        if room_front >= self.SURROUND and room_back >= self.SURROUND:
            width = self.SURROUND
        else:
            width = min([room_front, room_back])
        return (body[begin - width:begin], body[finish:finish + width], begin)
    return ("", "", -1)
def get_Tree(PATH, dumpPATH, stop): ''' input: PATH: .har file path. The .har file record the traffic dumpPATH: A .txt file. This file record a matrix whose fomat is "treeplotVec; url; timestamp" stop: A Stop_url(stopURL.py) object. output: a Tree object ''' # onLine_re = r'((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?' data = up.readJason(PATH) ''' currentItem = data['log']['entries'][i] treeContent -> up.drop_variation( currentItem['request']['url'] ) original_treeContent -> currentItem['request']['url'] indexList -> i(or -i if the node is root of a tree) wait_interval -> currentItem['timings']['wait'], (ms毫秒) mimeType -> currentItem['response']['content']['mimeType'] ''' treeRelation = [] #generate a matlab-treeplot()-like vector treeContent = [ ] #record the simplified URL corresponding to "treeRelation" original_treeContent = [ ] #record the original URL corresponding to "treeRelation" indexList = [ ] #record the index of 'entries' from whose content can find the url, negative element for root node wait_interval = [ ] #record the request-response interval of corresponding page mimeType = [] #record the mimeType of corresponding page treeTimestamp = [] #tuple elements here, (date, ) positionInText = [ ] #tuple elements here represent the url ( begin, end) position #in text. 
( -2, -2) for root node size = [] for i in range(0, len(data['log']['entries'])): currentItem = data['log']['entries'][i] # if currentItem['response']['content']['size'] > 102400: # print i, 'size:',currentItem['response']['content']['size'] # input('big size') ori_requestURL = currentItem['request']['url'] requestURL = up.drop_variation(ori_requestURL) if stop.is_stopURL(ori_requestURL): print "StopURL:", ori_requestURL continue #process this request-response pair #process request part ifInTree, location = judge_if_existing(treeContent, wait_interval, requestURL) if ifInTree: #if the requested content has pushed in the tree root = location + 1 # get the root index for urls in response text # section below is used to debug if wait_interval[location] >= 0: print 'PATH 0f file:\t', PATH print 'entity index:\t', i print 'requested URL:\t', ori_requestURL print 'URL of the existing node:\t', treeContent[location] print 'node location in array:\t', location print 'root of this node:\t', treeRelation[location] print 'value of the existing node:\t', wait_interval[location] input("EXCEPTION:wait_interval[location] >= 0!!!\n") wait_interval[location] = currentItem['timings']['wait'] mimeType[location] = currentItem['response']['content']['mimeType'] treeTimestamp[location] = up.get_fiddle_timestamp( currentItem['startedDateTime']) size[location] = currentItem['response']['content']['size'] treeContent[location] = requestURL original_treeContent[location] = ori_requestURL else: treeRelation.append(0) treeContent.append(requestURL) original_treeContent.append(ori_requestURL) wait_interval.append(currentItem['timings']['wait']) mimeType.append(currentItem['response']['content']['mimeType']) treeTimestamp.append( up.get_fiddle_timestamp(currentItem['startedDateTime'])) indexList.append(-i) positionInText.append((-2, -2)) size.append(currentItem['response']['content']['size']) root = len(treeContent) #process response part if 
data['log']['entries'][i]['response']['content'].has_key('text'): string = data['log']['entries'][i]['response']['content']['text'] for url, start_pos, end_pos, count in up.get_urlSet_from_text( string): treeRelation.append(root) treeContent.append(up.drop_variation(url)) #[Q2] original_treeContent.append(url) wait_interval.append(-1) indexList.append(i) mimeType.append(u'') treeTimestamp.append((u'', -1, u'')) positionInText.append((start_pos, end_pos)) size.append(-2) # for item in subPatt: # treeRelation.append( root ) # url = item[0] + item[2] + item[6] # don't aky why, I'll tell you "because of love ╮( ̄▽ ̄)╭" # url = url.rstrip('\\') # treeContent.append( up.drop_variation(url) ) #[Q2] # original_treeContent.append( url ) # wait_interval.append(-1) # indexList.append(i) # mimeType.append(u'') # treeTimestamp.append((u'',-1,u'')) else: print i print currentItem['response']['content']['mimeType'] tree_info_mat = {} tree_info_mat['treeRelation'] = copy.deepcopy(treeRelation) tree_info_mat['treeContent'] = copy.deepcopy(treeContent) tree_info_mat['indexList'] = copy.deepcopy(indexList) tree_info_mat['original_treeContent'] = copy.deepcopy(original_treeContent) tree_info_mat['wait_interval'] = copy.deepcopy(wait_interval) tree_info_mat['mimeType'] = copy.deepcopy(mimeType) tree_info_mat['filename'] = PATH tree_info_mat['treeTimestamp'] = copy.deepcopy(treeTimestamp) tree_info_mat['dumpPath'] = dumpPATH tree_info_mat['positionInText'] = copy.deepcopy(positionInText) tree_info_mat['size'] = copy.deepcopy(size) return Tree(tree_info_mat)
def get_Tree(PATH, dumpPATH, stop):
    '''
    Build a Tree object from a .har traffic capture.

    input:
        PATH: .har file path. The .har file records the traffic.
        dumpPATH: A .txt file. This file records a matrix whose format is
            "treeplotVec; url; timestamp".
        stop: A Stop_url(stopURL.py) object.
    output: a Tree object
    '''
    # onLine_re = r'((http|ftp|https)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?'
    data = up.readJason(PATH)
    '''
    Mapping of the parallel arrays below to HAR fields:
    currentItem = data['log']['entries'][i]
    treeContent -> up.drop_variation( currentItem['request']['url'] )
    original_treeContent -> currentItem['request']['url']
    indexList -> i (or -i if the node is root of a tree)
    wait_interval -> currentItem['timings']['wait'], (milliseconds)
    mimeType -> currentItem['response']['content']['mimeType']
    '''
    treeRelation = []          # matlab-treeplot()-like parent vector (0 = root)
    treeContent = []           # simplified URL corresponding to "treeRelation"
    original_treeContent = []  # original URL corresponding to "treeRelation"
    indexList = []             # index of the 'entries' item whose content
                               # contains the url; negative element for roots
                               # NOTE(review): -0 == 0, so a root at entry 0
                               # is not actually negative -- confirm
    wait_interval = []         # request-response interval of the page; -1
                               # until the url is actually requested
    mimeType = []              # mimeType of the corresponding page
    treeTimestamp = []         # tuple elements, from up.get_fiddle_timestamp
    positionInText = []        # (begin, end) position of the url in the
                               # response text; (-2, -2) for root nodes
    size = []                  # response content size; -2 for urls only
                               # seen inside a body
    for i in range(0,len(data['log']['entries'])):
        currentItem = data['log']['entries'][i]
        ori_requestURL = currentItem['request']['url']
        requestURL = up.drop_variation( ori_requestURL );
        if stop.is_stopURL(ori_requestURL):
            print "StopURL:",ori_requestURL
            continue
        # process this request-response pair
        # process request part
        ifInTree, location = judge_if_existing( treeContent, wait_interval, requestURL)
        if ifInTree:
            # the requested url was already seen inside some response body:
            # fill in the existing placeholder node instead of adding one
            root = location + 1  # get the root index for urls in response text
            # section below is used to debug: a node should be requested at
            # most once, so a non-negative wait_interval here is an anomaly
            if wait_interval[location] >= 0:
                print 'PATH 0f file:\t',PATH
                print 'entity index:\t',i
                print 'requested URL:\t',ori_requestURL
                print 'URL of the existing node:\t',treeContent[location]
                print 'node location in array:\t',location
                print 'root of this node:\t',treeRelation[location]
                print 'value of the existing node:\t',wait_interval[location]
                input("EXCEPTION:wait_interval[location] >= 0!!!\n")
            wait_interval[location] = currentItem['timings']['wait']
            mimeType[location] = currentItem['response']['content']['mimeType']
            treeTimestamp[location] = up.get_fiddle_timestamp(currentItem['startedDateTime'])
            size[location] = currentItem['response']['content']['size']
            treeContent[ location] = requestURL
            original_treeContent[ location] = ori_requestURL
        else:
            # brand-new url: append a fresh root node
            treeRelation.append(0)
            treeContent.append(requestURL)
            original_treeContent.append(ori_requestURL)
            wait_interval.append( currentItem['timings']['wait'] )
            mimeType.append(currentItem['response']['content']['mimeType'])
            treeTimestamp.append(up.get_fiddle_timestamp(currentItem['startedDateTime']))
            indexList.append(-i)
            positionInText.append((-2,-2))
            size.append(currentItem['response']['content']['size'])
            root = len(treeContent)
        # process response part: every url found in the body becomes a
        # child placeholder node under `root`
        if data['log']['entries'][i]['response']['content'].has_key('text'):
            string = data['log']['entries'][i]['response']['content']['text']
            for url, start_pos, end_pos, count in up.get_urlSet_from_text( string):
                treeRelation.append( root )
                treeContent.append( up.drop_variation(url))  # [Q2]
                original_treeContent.append( url)
                wait_interval.append(-1)
                indexList.append(i)
                mimeType.append(u'')
                treeTimestamp.append((u'',-1,u''))
                positionInText.append(( start_pos, end_pos))
                size.append(-2)
        else:
            print i
            print currentItem['response']['content']['mimeType']
    # package the parallel arrays (deep-copied defensively) for Tree()
    tree_info_mat = {}
    tree_info_mat['treeRelation'] = copy.deepcopy( treeRelation)
    tree_info_mat['treeContent'] = copy.deepcopy( treeContent)
    tree_info_mat['indexList'] = copy.deepcopy( indexList)
    tree_info_mat['original_treeContent'] = copy.deepcopy( original_treeContent)
    tree_info_mat['wait_interval'] = copy.deepcopy( wait_interval)
    tree_info_mat['mimeType'] = copy.deepcopy( mimeType)
    tree_info_mat['filename'] = PATH
    tree_info_mat['treeTimestamp'] = copy.deepcopy( treeTimestamp )
    tree_info_mat['dumpPath'] = dumpPATH
    tree_info_mat['positionInText'] = copy.deepcopy( positionInText)
    tree_info_mat['size'] = copy.deepcopy( size)
    return Tree(tree_info_mat)
def read_json(self, path):
    '''Parse the JSON file at `path` by delegating to up.readJason.'''
    parsed = up.readJason(path)
    return parsed