def rank(self):
    """Keep the leading share of sample words that also occur in the dictionary.

    Reads the dictionary file at ``self.dict_dir`` (tab-separated, word in the
    first column) and the sample file at ``self.res_dir`` (colon-separated,
    word first, integer value second). Sample lines whose word is in the
    dictionary are kept in file order; the first
    ``round(len(kept) * self.ratio)`` of them are stored in ``self.dic_word``
    (word -> int value) and their words are written, one per line, to
    ``self.rank_dir``.

    Raises:
        IndexError: a selected sample line has no ':' separator.
        ValueError: the value after the first ':' is not an integer.
    """
    # split("\n")[0:-1] drops the element after the final newline.
    dictionary = rw.readFile(self.dict_dir).split("\n")[0:-1]
    sample = rw.readFile(self.res_dir).split("\n")[0:-1]

    dic = set(line.split("\t")[0] for line in dictionary)

    # Keep the already-split fields so each line is parsed only once.
    high_freq = []
    for line in sample:
        fields = line.split(":")
        if fields[0] in dic:
            high_freq.append(fields)

    limit = int(round(len(high_freq) * self.ratio))
    selected = []
    for fields in high_freq[:limit]:
        selected.append(fields[0])
        self.dic_word[fields[0]] = int(fields[1])

    rw.writeFile(self.rank_dir, "".join(word + "\n" for word in selected))
def readDictionary(self):
    """Load each ``.txt`` file in ``self.dic_dir`` into ``self.dic``.

    The key is the file name without its ``.txt`` suffix; keys already
    present in ``self.dic`` are left untouched. The value is the file
    content split on CRLF with the last element dropped.
    """
    for filename in os.listdir(self.dic_dir):
        if not filename.endswith('.txt'):
            continue
        key = filename[:-4]
        if key in self.dic:
            continue
        entries = rw.readFile(self.dic_dir + '/' + filename).split('\r\n')
        self.dic[key] = entries[:-1]
def get_text(self):
    """Return the content of ``total.txt`` found in ``self.src_dir``.

    Returns an empty string when the directory has no entry named
    ``total.txt``.
    """
    text = ""
    for entry in os.listdir(self.src_dir):
        if entry != 'total.txt':
            continue
        text += rw.readFile(self.src_dir + '/' + entry)
    return text
def addToDic(self, filename):
    """Accumulate the per-key integer values of one file into ``self.query_dic``.

    Each line of ``self.src_dir/filename`` is tab-separated: the first field
    is the key, the second an integer that is added to the key's running
    total (or becomes the total for a new key).
    """
    src_path = self.src_dir + '/' + filename
    for record in rw.readFile(src_path).split('\n')[:-1]:
        fields = record.split('\t')
        key = fields[0]
        if key not in self.query_dic:
            self.query_dic[key] = int(fields[1])
        else:
            self.query_dic[key] += int(fields[1])
def find_NE(self, filename):
    """Extract the lines that contain a bracketed span after key processing.

    Reads ``self.src_dir/filename``, passes the text through
    ``self.find_key`` once for every key in ``self.dic``, then collects every
    line matching ``.*\\[.+\\].*`` (a line containing ``[...]`` with at least
    one character between the brackets). The matching lines, each terminated
    by a newline, are written to ``self.res_dir/filename`` and returned.

    Returns:
        The text that was written to the result file.
    """
    src_path = self.src_dir + '/' + filename
    res_path = self.res_dir + '/' + filename

    txt = rw.readFile(src_path)
    for key in self.dic:
        txt = self.find_key(key, txt)

    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    bracketed = re.compile(r'.*\[.+\].*')
    content = "".join(line + '\n' for line in bracketed.findall(txt))

    rw.writeFile(res_path, content)
    return content
def analyze(self):
    """Count word occurrences over every regular entry in ``self.src_dir``.

    For each non-directory entry, the text is cut into segments with
    ``self.ch.findall`` and each segment into words with ``self.get_words``;
    every word increments its counter in ``self.dictionary``.

    Returns:
        ``self.dictionary`` after all files have been processed.
    """
    for filename in os.listdir(self.src_dir):
        path = self.src_dir + '/' + filename
        if os.path.isdir(path):
            continue
        for seg in self.ch.findall(rw.readFile(path)):
            for word in self.get_words(seg):
                if word not in self.dictionary:
                    self.dictionary[word] = 1
                else:
                    self.dictionary[word] += 1
    return self.dictionary
def sub(self, filename):
    """Run substitution over the leading queries of one file and save the hits.

    The source file is passed through ``pre.sort_txt`` and its first
    ``self.num`` lines are processed. For every line whose first
    tab-separated field yields a non-empty result from ``self.run``, a
    ``query<TAB>substituted<TAB>second-field`` line is written to
    ``self.res_dir/filename``.
    """
    print("********************")
    print("Substituting File: %s" % filename)

    src_path = self.src_dir + '/' + filename
    res_path = self.res_dir + '/' + filename
    txt = pre.sort_txt(rw.readFile(src_path), [0, 1], 20)

    pieces = []
    for line in txt.split('\n')[0:self.num]:
        temp = line.split('\t')
        substituted = self.run(temp[0])
        if substituted == "":
            continue
        print("Substituting: %s" % temp[0])
        pieces.append(temp[0] + '\t' + substituted + '\t' + temp[1] + '\n')

    rw.writeFile(res_path, "".join(pieces))
def sub(self, filename):
    """Run substitution over the leading queries of one file and save the hits.

    The source file is passed through ``pre.sort_txt`` and its first
    ``self.num`` lines are processed. For every line whose first
    tab-separated field yields a non-empty result from ``self.run``, a
    ``query<TAB>substituted<TAB>second-field`` line is written to
    ``self.res_dir/filename``.
    """
    print("********************")
    print("Substituting File: %s" % filename)

    src_path = self.src_dir + '/' + filename
    res_path = self.res_dir + '/' + filename
    txt = pre.sort_txt(rw.readFile(src_path), [0, 1], 20)

    pieces = []
    for line in txt.split('\n')[0:self.num]:
        temp = line.split('\t')
        substituted = self.run(temp[0])
        if substituted == "":
            continue
        print("Substituting: %s" % temp[0])
        pieces.append(temp[0] + '\t' + substituted + '\t' + temp[1] + '\n')

    rw.writeFile(res_path, "".join(pieces))
def segment(self, filename):
    """Segment the leading queries of one file and save the non-empty results.

    The first ``self.num`` lines of ``self.src_dir/filename`` are processed.
    For every line whose first tab-separated field yields a non-empty result
    from ``self.run``, a ``query<TAB>segmented<TAB>second-field`` line is
    written to ``self.res_dir/filename``.
    """
    print("********************")
    print("Segmenting File: %s" % filename)

    src_path = self.src_dir + '/' + filename
    res_path = self.res_dir + '/' + filename

    pieces = []
    for line in rw.readFile(src_path).split('\n')[0:self.num]:
        temp = line.split('\t')
        segmented = self.run(temp[0])
        if segmented == "":
            continue
        print("Segmenting: %s" % temp[0])
        pieces.append(temp[0] + '\t' + segmented + '\t' + temp[1] + '\n')

    rw.writeFile(res_path, "".join(pieces))
def sort_file(self, filename):
    """Filter one query log down to its long queries and report statistics.

    Each line of ``self.src_dir/filename`` is tab-separated with the query in
    the first field and an integer frequency in the last. Lines with a single
    field are ignored. Queries whose text is at least ``self.min_char`` long
    are written (``query<TAB>frequency``) to ``self.tar_dir/filename``, and
    for each of them the per-target counters ``self.target_num_l`` /
    ``self.target_freq_l`` are increased for every entry of ``self.targets``
    contained in the query. The running totals ``self.total_num``,
    ``self.total_num_sorted``, ``self.total_freq`` and
    ``self.total_freq_sorted`` are updated as well.

    Returns:
        A multi-line text summary of the counts and ratios for this file.

    Raises:
        ValueError: the last field of a multi-field line is not an integer.
    """
    src_path = self.src_dir + "/" + filename
    tar_path = self.tar_dir + "/" + filename
    query_list = rw.readFile(src_path).split("\n")

    # NOTE(review): the element after the final newline is counted here as
    # well, so ``num`` is one larger than the line count for a file ending
    # in "\n" -- confirm whether that is intended.
    num = len(query_list)
    freq = 0
    freq_sorted = 0
    query_list_sorted = []

    # Select the queries of at least min_char characters.
    for query in query_list:
        temp = query.split("\t")
        if len(temp) <= 1:
            continue
        query_freq = int(temp[-1])
        freq += query_freq
        if len(temp[0]) < self.min_char:
            continue
        query_list_sorted.append(temp[0] + "\t" + temp[-1])
        freq_sorted += query_freq
        # A target counts at most once per query, weighted by frequency.
        for index, target in enumerate(self.targets):
            if target in temp[0]:
                self.target_num_l[index] += 1
                self.target_freq_l[index] += query_freq

    num_sorted = len(query_list_sorted)
    self.total_num += num
    self.total_num_sorted += num_sorted
    self.total_freq += freq
    self.total_freq_sorted += freq_sorted

    # str.split always returns at least one element, so num >= 1; freq can
    # be 0 (empty log or all-zero frequencies) and must not be divided by.
    num_ratio = float(num_sorted) / num
    freq_ratio = float(freq_sorted) / freq if freq else 0.0

    result = ("Query Log: %s\n" % filename) \
        + ("Number of queries: %d\n" % num) \
        + ("Queries over %d bytes: %d\n" % (self.min_char, num_sorted)) \
        + ("Queries frequency: %d\n" % freq) \
        + ("Long queries frequency: %d\n" % freq_sorted) \
        + ("Long query ratio: %0.3f\n" % num_ratio) \
        + ("Long query frequency ratio: %0.3f\n" % freq_ratio)

    # Recombine the selected queries, one per line.
    content_sorted = "".join(query + "\n" for query in query_list_sorted)
    rw.writeFile(tar_path, content_sorted)
    return result
def sort_file(src_dir, res_dir, filename, cols, min_f):
    """Project a query log onto selected columns, keeping frequent rows only.

    Args:
        src_dir: directory containing the input file.
        res_dir: directory the filtered file is written to.
        filename: name of the file in both directories.
        cols: column indexes to keep, in output order; the last one must
            index the integer frequency column.
        min_f: minimum frequency a row needs to be kept.

    Returns:
        The filtered content that was written, one tab-joined row per line.
    """
    print("Pre-processing file: %s" % filename)
    src_path = src_dir + '/' + filename
    res_path = res_dir + '/' + filename

    kept = []
    for query in rw.readFile(src_path).split('\n')[0:-1]:
        fields = query.split("\t")
        if int(fields[cols[-1]]) < min_f:
            continue
        kept.append('\t'.join([fields[c] for c in cols]) + '\n')

    content_sorted = "".join(kept)
    rw.writeFile(res_path, content_sorted)
    return content_sorted
def run(self):
    """Serve peer messages forever on this peer's listening socket.

    Binds a socket to ``(hostname, 2222 + self.pid * 10)`` and handles one
    message (at most 1024 bytes) per accepted connection. A message is a
    comma-separated string whose first field is ``<label>:<mid>`` and whose
    second field is ``<action>:<name>``; further fields depend on the action:

    * ``search``  -- third field ``<label>=<ttl>``; forwards the message to
      ``self.neighbor`` with the TTL decreased unless the TTL is 1 or the
      message id was already seen, then answers with a ``response`` message
      if the file is held locally and valid.
    * ``update``  -- forwards unseen messages and marks the downloaded copy
      of ``name`` as ``invalid`` in ``metadata.txt``.
    * ``check``   -- third/fourth fields ``<label>:<version>`` and
      ``<label>:<TTR>``; replies with a ``checkresponse`` carrying either
      ``state:valid`` and a doubled TTR or ``state:invalid``.
    * ``checkresponse`` -- third field ``<label>:<state>``; applies the
      state (and the TTR, when valid) through ``pullrenew.renew``.
    * ``response`` -- starts a reconnect to fetch ``name`` if it is still
      listed in ``req_list.txt``.
    * ``obtain``  -- sends the file's metadata as JSON followed by the file
      content in 1024-byte chunks over the accepted connection.

    The accepted connection is always closed after the message is handled,
    including when handling raises.
    """
    # define the server thread
    self.s = socket.socket()
    self.host = socket.gethostname()
    self.port = 2222 + self.pid * 10
    self.s.bind((self.host, self.port))
    self.s.listen(5)

    while True:
        # listen for connection
        c, addr = self.s.accept()
        try:
            # Same output as a two-item print statement (items joined by a space).
            print('\nGot connection from ' + ' ' + str(addr))

            # initialize file index
            original_index = self.fileindex('original')
            download_index = self.fileindex('download')
            index = original_index + download_index

            # receive message
            msg = c.recv(1024)

            # parse & analyze message
            fields = msg.split(',')
            mid = fields[0].split(':')[1]
            action = fields[1].split(':')[0]
            name = fields[1].split(':')[1]

            if action == 'search':
                print('search ' + name)
                ttl = fields[2].split('=')[1]
                # read mid_list from file 'msg_list'
                mid_list = rw.readList('msg_list.txt')
                # decide broadcast or not
                if int(ttl) == 1 or mid in mid_list:
                    print('file dont need to pass')
                else:
                    # update mid_list in file 'msg_list'
                    mid_list.append(mid)
                    rw.write('msg_list.txt', mid_list)
                    # decrease the ttl value by 1 after broadcast for once:
                    # replace the last occurrence of the ttl text in the message
                    rmsg = msg.rsplit(ttl, 1)
                    msg = str(int(ttl) - 1).join(rmsg)
                    # start autobroadcast thread (details in autobroadcast.py module)
                    broadcast = autobroadcast.Auto(msg, self.neighbor)
                    broadcast.start()
                    broadcast.join()

                # The local index is checked whether or not the message was forwarded.
                if name in index:
                    # check file's state
                    meta_dict = rw.readDict('metadata.txt')
                    # NOTE(review): as written, only names in original_index can
                    # answer, and the 'download' entry is looked up for them when
                    # the original is not valid -- possibly meant to be
                    # "(original and valid) or (download and valid)"; confirm.
                    if (name in original_index and
                            (meta_dict['original'][name]['state'] == 'valid' or
                             meta_dict['download'][name]['state'] == 'valid')):
                        # start hitresponse thread (details in hitresponse.py module)
                        msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',response:' + name
                        hit = hitresponse.Hit(msg, mid)
                        hit.start()
                        hit.join()
                    else:
                        # Previously a bare string expression: nothing was printed.
                        print('file state is not qualified')
                else:
                    print('no file match, pass to neighbors')

            elif action == 'update':
                print('update ' + name)
                # read mid_list from file 'msg_list'
                mid_list = rw.readList('msg_list.txt')
                # decide broadcast or not
                if mid in mid_list:
                    print('update dont need to pass')
                else:
                    # update mid_list in file 'msg_list'
                    mid_list.append(mid)
                    rw.write('msg_list.txt', mid_list)
                    # start autobroadcast thread (details in autobroadcast.py module)
                    broadcast = autobroadcast.Auto(msg, self.neighbor)
                    broadcast.start()
                    broadcast.join()

                if name in download_index:
                    # set file state to invalid
                    meta_dict = rw.readDict('metadata.txt')
                    meta_dict['download'][name]['state'] = 'invalid'
                    rw.write('metadata.txt', meta_dict)
                else:
                    print('no file need to update, pass to neighbors')

            elif action == 'check':
                print('check ' + name)
                # check the original file's version
                version = int(fields[2].split(':')[1])
                TTR = int(fields[3].split(':')[1])
                meta_dict = rw.readDict('metadata.txt')
                if meta_dict['original'][name]['version'] == version:
                    # if same version, send a new TTR.
                    newTTR = 2 * TTR
                    msg = ('mid:' + str(self.pid) + '|' + str(self.port)
                           + ',checkresponse:' + name
                           + ',state:valid,TTR:' + str(newTTR))
                else:
                    # send Invalid & new version exist.
                    msg = ('mid:' + str(self.pid) + '|' + str(self.port)
                           + ',checkresponse:' + name + ',state:invalid')
                hit = hitresponse.Hit(msg, mid)
                hit.start()
                hit.join()

            elif action == 'checkresponse':
                print('checkresponse ' + name)
                # update the metadata
                state = str(fields[2].split(':')[1])
                if state == 'valid':
                    TTR = int(fields[3].split(':')[1])
                    # change TTR to 2TTR. (details in pullrenew.py module)
                    pullrenew.renew('TTR', name, TTR)
                    # change state from 'TTR Expired' back to 'valid'
                    # (details in pullrenew.py module)
                    pullrenew.renew('state', name, state)
                elif state == 'invalid':
                    # change state from 'TTR Expired' back to 'invalid'
                    # (details in pullrenew.py module)
                    pullrenew.renew('state', name, state)

            elif action == 'response':
                # start reconnect thread (details in reconnect.py module)
                print('response ' + name + ' from ' + mid)
                # read name_list from file 'req_list'
                name_list = rw.readList('req_list.txt')
                # decide reconnect or not
                if name in name_list:
                    msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',obtain:' + name
                    connect = reconnect.Connect(msg, mid, name)
                    connect.start()
                    connect.join()
                else:
                    print('file has been obtained')

            elif action == 'obtain':
                print('obtain ' + name)
                # start transfer directly to original peer
                if name in original_index:
                    folder = 'original'
                elif name in download_index:
                    folder = 'download'
                else:
                    folder = None

                if folder is None:
                    # Previously fell through with path/metadata undefined
                    # (or left over from an earlier connection).
                    print('no file match for obtain: ' + name)
                else:
                    path = os.path.join(os.getcwd(), 'files', folder, name)
                    metadata = rw.readDict('metadata.txt')[folder][name]
                    # send metadata & file
                    c.sendall(json.dumps(metadata))
                    # slice file into chunks by buffer
                    content = rw.readFile(path)
                    i = 0
                    # '<' rather than '<=': no empty trailing chunk when the
                    # length is a multiple of 1024.
                    while i < len(content):
                        chunk = buffer(content, i, 1024)
                        c.sendall(chunk)
                        i += 1024
                    print('send file: ' + name)
        finally:
            # Release the connection even if handling the message failed.
            c.close()
def init_dict(self):
    """Add every line of the file at ``self.dict_dir`` to ``self.dictionary``.

    The element after the final newline is dropped before adding.
    """
    add_word = self.dictionary.add
    for entry in rw.readFile(self.dict_dir).split('\n')[:-1]:
        add_word(entry)
def init_dic(self):
    """Fill ``self.dictionary`` from the tab-separated file at ``self.dic_dir``.

    The first field of each line becomes the key and the second field (kept
    as a string) the value.

    Returns:
        ``self.dictionary`` after all lines have been loaded.
    """
    for record in rw.readFile(self.dic_dir).split('\n')[:-1]:
        fields = record.split('\t')
        key, value = fields[0], fields[1]
        self.dictionary[key] = value
    return self.dictionary
def init_model(self):
    """Append the first tab-separated field of each line to ``self.models``.

    Lines are read from the file at ``self.src_dir``; the element after the
    final newline is dropped.
    """
    for record in rw.readFile(self.src_dir).split('\n')[:-1]:
        # partition() yields the text before the first tab, or the whole
        # line when there is none -- the same value as split('\t')[0].
        first_field = record.partition('\t')[0]
        self.models.append(first_field)