async def set_rule(sid, data):
    name, method, direction, criteria, exclude = (
        data["name"], data["method"], data["direction"],
        data["criteria"], data["exclude"])
    rules[name] = sampler(name, method, direction, criteria, exclude)
    await sio.emit('rules', json.dumps(rules, cls=AdvancedJSONEncoder),
                   room=sid)
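# The handler above serializes sampler objects with cls=AdvancedJSONEncoder,
# which is not defined in this file. A minimal sketch of what such an encoder
# commonly looks like, assuming sampler instances expose their state via
# __dict__ (the real implementation may differ):
import json


class AdvancedJSONEncoderSketch(json.JSONEncoder):
    """Fall back to an object's __dict__ for values json cannot serialize natively."""

    def default(self, obj):
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        return super(AdvancedJSONEncoderSketch, self).default(obj)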
def crawl(self):
    self.get_pattern(self.dataset, self.cluster_rank)
    self.a = annotator(self.dataset)
    write_file = open(
        "./results/vidal_{0}_{1}_{2}_size{3}.txt".format(
            self.dataset, self.date, self.cluster_rank, self.crawl_size), "w")
    num_web_crawl = 0
    entry, prefix = self.entry, self.prefix
    self.url_stack = [(entry, "", 0)]  # (url, parent_url, rule_id)
    self.final_list = []
    size, num = self.crawl_size, 0  # number of pages crawled so far
    s = sampler(self.dataset, self.entry, self.prefix, 0)
    while num < size and len(self.url_stack) > 0:
        first_url = self.url_stack[0][0]
        parent_url = self.url_stack[0][1]
        rule_id = self.url_stack[0][2]
        try:
            print "first url is ", first_url
        except:
            # printing the URL can fail (e.g. on encoding issues); keep crawling
            traceback.print_exc()
        if first_url not in self.history_set:
            num += 1
            try:
                url_list, new_rule_id = self.crawl_link(
                    first_url, rule_id, self.history_set, s)
                self.final_list.append((first_url, parent_url, rule_id))
            except:
                print "might miss something here"
                traceback.print_exc()
                flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                  self.history_set)
                if flag == 1:
                    url_list, new_rule_id = self.crawl_link(
                        first_url, rule_id, self.history_set, s)
                    self.final_list.append((first_url, parent_url, rule_id))
                    # throttle web requests; take a longer break every 10 fetches
                    random_time_s = random.randint(5, 10)
                    time.sleep(random_time_s)
                    num_web_crawl += 1
                    if num_web_crawl % 10 == 9:
                        random_time_s = random.randint(60, 90)
                        time.sleep(random_time_s)
                else:
                    num -= 1
                    print "crawl failure"
        if self.url_stack[0][0] == first_url:
            self.url_stack.pop(0)
        print " num is {}".format(num)
        sys.stdout.flush()
        self.history_set.add(first_url)
    print len(self.final_list), "length of final list"
    for pair in self.final_list:
        url, parent_url, cluster_id = pair[0], pair[1], pair[2]
        write_file.write(url + "\t" + str(parent_url) + "\t" +
                         str(cluster_id) + "\n")
    write_file.close()
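# crawl() above writes one tab-separated record per crawled page:
# url <TAB> parent_url <TAB> rule_id. A small helper (illustrative, not part
# of the original code) for loading such a results file back:
def load_crawl_results(path):
    """Return a list of (url, parent_url, rule_id) tuples from a results file."""
    records = []
    with open(path) as f:
        for line in f:
            url, parent_url, rule_id = line.rstrip("\n").split("\t")
            records.append((url, parent_url, rule_id))
    return records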
async def read_rule(sid):
    global rules
    try:
        # load the rule set from disk
        with open("ruleset.json", "r") as f:
            rules_in_json = json.load(f)
        # build a sampler for each rule
        rules = {
            key: sampler(data["name"], data["method"], data["direction"],
                         data["criteria"], [])
            for (key, data) in rules_in_json.items()
        }
        await sio.emit('rules', json.dumps(rules, cls=AdvancedJSONEncoder),
                       room=sid)
    except Exception as e:
        await sio.emit('error', str(e), room=sid)
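# Handlers such as set_rule and read_rule are typically registered on a
# python-socketio AsyncServer. A sketch of that wiring, assuming an aiohttp
# deployment; the event names and port below are illustrative assumptions,
# not taken from the original module:
#
#   import socketio
#   from aiohttp import web
#
#   sio = socketio.AsyncServer(async_mode='aiohttp')
#   app = web.Application()
#   sio.attach(app)
#   sio.on('set_rule', set_rule)
#   sio.on('read_rule', read_rule)
#   web.run_app(app, port=8080)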
def train_and_sample(gen,
                     dis,
                     dataset,
                     sample_dataset,
                     config,
                     stage=1,
                     pre_gen=None,
                     pri_img_every=100,
                     save_path='',
                     sample_every=10,
                     save_every=100):
    """
    Train the generator and discriminator for stage 1 or stage 2.

    gen: Generator object.
    dis: Discriminator object.
    dataset: Dataloader used to obtain images for training.
    sample_dataset: Dataset from which pre-trained sentence embeddings for sampling are obtained.
    config: Dictionary containing the hyperparameters.
    stage: Indicates the stage so that stage-specific operations can be done.
    pre_gen: For stage 2, the path to the trained stage 1 generator.
    pri_img_every: Iteration frequency at which the losses and generated fake images are displayed.
    save_path: Path under which the models and the images are stored.
    sample_every: Epoch frequency for sampling images from the trained generator.
    save_every: Epoch frequency at which the model is saved.
    """
    if stage == 2 and pre_gen is None:
        return 'Give the path to the trained stage 1 generator'
    elif stage == 2 and pre_gen is not None:
        gen.gen_1.load_state_dict(torch.load(pre_gen)['generator'])

    if stage == 1:
        save_path = os.path.join(save_path, '1')
    else:
        save_path = os.path.join(save_path, '2')

    params = config
    noise_dim = params['noise_dim']
    batch_size = params['batch_size']
    gen_lr = params['gen_lr']
    dis_lr = params['dis_lr']

    noise = torch.FloatTensor(batch_size, noise_dim).to(device)
    imgen_noise = torch.FloatTensor(batch_size, noise_dim).normal_(0, 1).to(device)
    real_labels = torch.FloatTensor(batch_size).fill_(1).to(device)
    fake_labels = torch.FloatTensor(batch_size).fill_(0).to(device)

    optimizer_dis = optim.Adam(dis.parameters(), lr=dis_lr, betas=(0.5, 0.999))

    # omit the frozen stage 1 generator layers in stage 2 when optimizing
    gen_layers = []
    for layer in gen.parameters():
        if layer.requires_grad:
            gen_layers.append(layer)
    optimizer_gen = optim.Adam(gen_layers, lr=gen_lr, betas=(0.5, 0.999))

    for epoch in range(params['epoch']):
        er_d = []
        er_g = []
        kl = []
        start = time.time()
        print('Epoch {}'.format(epoch + 1))

        # decay both learning rates after every specified interval
        if epoch > 0 and (epoch + 1) % params['lr_decay_epoch'] == 0:
            gen_lr *= 0.5
            for par in optimizer_gen.param_groups:
                par['lr'] = gen_lr
            dis_lr *= 0.5
            for par in optimizer_dis.param_groups:
                par['lr'] = dis_lr

        for i, data in enumerate(dataset, 0):
            real_image, embedding = data
            real_image = real_image.to(device)
            embedding = embedding.to(device)
            noise.data.normal_(0, 1)

            gen.train()
            # generate a fake image
            _, fake_image, mean, variance = gen(embedding, noise)

            # update the discriminator
            dis.zero_grad()
            error_d, real_error, wrong_error, fake_error = discriminator_loss(
                dis, fake_image, real_image, fake_labels, real_labels, mean,
                stage)
            er_d.append(error_d.item())
            error_d.backward()
            optimizer_dis.step()

            # update the generator
            gen.zero_grad()
            error_g = generator_loss(dis, fake_image, real_labels, mean)
            er_g.append(error_g.item())
            kl_los = kl_loss(mean, variance)
            kl.append(kl_los.item())
            total_error = error_g + kl_los * params['kl_coeff']
            total_error.backward()
            optimizer_gen.step()

            if (i + 1) % pri_img_every == 0:
                print('Discriminator_error: {}'.format(error_d.item()))
                print('Generator_error: {}'.format(error_g.item()))
                print('KL loss: {}'.format(kl_los.item()))
                print('Running discriminator loss: {}'.format(
                    sum(er_d) / len(er_d)))
                print('Running generator loss: {}'.format(
                    sum(er_g) / len(er_g)))
                print('Running KL loss: {}'.format(sum(kl) / len(kl)))
                previous, current, _, _ = gen(embedding, imgen_noise)
                save_image(real_image, current, epoch + 1,
                           os.path.join(save_path, 'images'))
                show = utils.make_grid(real_image[0:16])
                image_show(show)
                show = utils.make_grid(current[0:16])
                image_show(show)
                if previous is not None:
                    save_image(None, previous, epoch + 1,
                               os.path.join(save_path, 'images'))

        elapsed_time = time.time() - start
        print('Epoch {} completed in {:.0f} minutes {:.0f} seconds'.format(
            epoch + 1, elapsed_time // 60, elapsed_time % 60))
        print('Discriminator loss for this epoch: {}'.format(
            sum(er_d) / len(er_d)))
        print('Generator loss for this epoch: {}'.format(
            sum(er_g) / len(er_g)))
        print('KL loss for this epoch: {}'.format(sum(kl) / len(kl)))

        if (epoch + 1) % save_every == 0:
            save_model(gen, dis, optimizer_gen, optimizer_dis, epoch + 1,
                       os.path.join(save_path, 'model'), stage=stage)
        if (epoch + 1) % sample_every == 0:
            sampler(gen, sample_dataset, epoch + 1, noise=imgen_noise,
                    save_path=os.path.join(save_path, 'images'))

    save_model(gen, dis, optimizer_gen, optimizer_dis, params['epoch'],
               os.path.join(save_path, 'model'), stage=stage)
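# train_and_sample reads these keys from config; the key names come from the
# function above, but the values below are placeholders for illustration only,
# not the hyperparameters used originally.
example_config = {
    'noise_dim': 100,       # dimensionality of the latent noise vector
    'batch_size': 64,
    'gen_lr': 2e-4,         # generator learning rate
    'dis_lr': 2e-4,         # discriminator learning rate
    'epoch': 600,           # total training epochs
    'lr_decay_epoch': 100,  # halve both learning rates every this many epochs
    'kl_coeff': 2.0,        # weight of the KL term added to the generator loss
}
# train_and_sample(gen, dis, dataloader, sample_dataset, example_config,
#                  stage=1, save_path='./checkpoints')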
def crawling(self, crawl_size=1000):
    if not os.path.exists("./results/irobot/"):
        os.mkdir("./results/irobot/")
    write_file = open(
        "./results/irobot/{0}_irobot_size{1}.txt".format(
            self.dataset, crawl_size), "w")
    entry, prefix = self.entry, self.prefix
    self.url_stack = [(entry, "", 0)]  # (entry, parent_url, crawl_level)
    self.final_list = []
    size, num = crawl_size, 0  # number of pages crawled so far
    crawl_id = 0
    s = sampler(self.dataset, self.entry, self.prefix, 0)
    end = 0  # which end of the queue to pop from (0 = front, -1 = back)
    num_web_crawl = 0
    while num < size and len(self.url_stack) > 0:
        print self.url_stack[-1]
        print self.url_stack[0]
        first_url = self.url_stack[end][0]
        parent_url = self.url_stack[end][1]
        crawl_level = self.url_stack[end][2]
        try:
            print "first url is ", first_url
        except:
            traceback.print_exc()
        if first_url not in self.history_set:
            num += 1
            try:
                url_list = self.crawl_link(first_url, crawl_level,
                                           self.history_set, s)
                print "url list", len(url_list)
                self.url_stack.pop(end)
                self.url_stack += url_list
                self.final_list.append((first_url, parent_url, crawl_level))
            except:
                print "might miss something here"
                traceback.print_exc()
                flag = self.crawlUrl(first_url, self.dataset, self.url_stack,
                                     self.history_set)
                if flag == 1:
                    url_list = self.crawl_link(first_url, crawl_level,
                                               self.history_set, s)
                    self.url_stack.pop(end)
                    print "url list", len(url_list)
                    self.url_stack += url_list
                    self.final_list.append(
                        (first_url, parent_url, crawl_level))
                    # throttle web requests; take a longer break every 10 fetches
                    random_time_s = random.randint(5, 10)
                    time.sleep(random_time_s)
                    num_web_crawl += 1
                    if num_web_crawl % 10 == 9:
                        random_time_s = random.randint(60, 90)
                        time.sleep(random_time_s)
                else:
                    num -= 1
                    print "crawl failure"
        else:
            self.url_stack.pop(end)
        # alternate randomly between the front and the back of the queue
        end = random.choice([0, -1])
        print "end is ", end
        crawl_id += 1
        print " num is {}".format(num)
        sys.stdout.flush()
        if num >= size:
            print "crawl_id is {0} for size {1}".format(crawl_id, size)
        self.history_set.add(first_url)
    print len(self.final_list), "length of final list"
    for pair in self.final_list:
        url, parent_url, crawl_level = pair[0], pair[1], pair[2]
        write_file.write(url + "\t" + str(parent_url) + "\t" +
                         str(crawl_level) + '\n')
    write_file.close()
def crawling(self, num_crawl):
    counter = Counter(sitemap.UP_pages.category)
    self.c_prob = defaultdict(float)
    total = sum(counter.values())
    for key in counter:
        self.c_prob[key] = float(counter[key]) / float(total)
    # self.entry, self.prefix, self.dataset, self.trans_xpath_dict, target_cluster_id
    #self.target_cluster = self.get_sample_cluster()
    write_file = open(
        "./results/{0}_{1}_{2}_{3}_size{4}.txt".format(
            self.dataset, self.date, self.cluster_rank, self.rank_algo,
            self.crawl_size), "w")
    num_web_crawl = 0
    entry, prefix = self.entry, self.prefix
    self.url_stack = [(entry, "", "", self.max_score)]  # (url, parent_url, parent_xpath, score)
    self.final_list = []
    size, num = num_crawl, 0  # number of pages crawled so far
    crawl_id = 0
    s = sampler(self.dataset, self.entry, self.prefix, 0)
    while num < size and len(self.url_stack) > 0:
        first_url = self.url_stack[0][0]
        parent_url = self.url_stack[0][1]
        parent_xpath = self.url_stack[0][2]
        score = self.url_stack[0][3]
        print self.url_stack[0]
        print self.url_stack[-1]
        try:
            print "first url is " + first_url
        except:
            traceback.print_exc()
        if first_url not in self.history_set:
            num += 1
            try:
                url_list, cluster_id = self.crawl_link(
                    first_url, self.history_set, s)
                #print "url_list", url_list
                self.sort_queue(url_list, first_url, self.rank_algo)  # sort url_stack
                self.final_list.append((first_url, parent_url, parent_xpath,
                                        score, cluster_id))
            except:
                print "might miss something here"
                traceback.print_exc()
                flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                  self.history_set)
                if flag == 1:
                    url_list, cluster_id = self.crawl_link(
                        first_url, self.history_set, s)
                    self.sort_queue(url_list, first_url,
                                    rank_algo=self.rank_algo)
                    self.final_list.append(
                        (first_url, parent_url, parent_xpath, score,
                         cluster_id))
                    # throttle web requests; take a longer break every 10 fetches
                    random_time_s = random.randint(5, 10)
                    time.sleep(random_time_s)
                    num_web_crawl += 1
                    if num_web_crawl % 10 == 9:
                        random_time_s = random.randint(60, 90)
                        time.sleep(random_time_s)
                else:
                    num -= 1
                    print "crawl failure"
        if self.url_stack[0][0] == first_url:
            self.url_stack.pop(0)
        crawl_id += 1
        print " num is {}".format(num)
        sys.stdout.flush()
        if num >= size:
            print "crawl_id is {0} for size {1}".format(crawl_id, size)
        #print "first url comes from the {} th crawled page".format(self.page_num[first_url])
        self.history_set.add(first_url)
    print len(self.final_list), "length of final list"
    for pair in self.final_list:
        url, parent_url, parent_xpath, score, cluster_id = (
            pair[0], pair[1], pair[2], pair[3], pair[4])
        write_file.write(url + "\t" + str(parent_url) + "\t" +
                         str(parent_xpath) + "\t" + str(score) + "\t" +
                         str(cluster_id) + '\n')
    write_file.close()
def sampling(self, num_crawl, method="uniform"):
    # load the precomputed scores needed by the chosen sampling method
    if method == "pagerank":
        path = "./src/data/{0}/{0}.pr_dict".format(self.dataset)
        with open(path, "rb") as outfile:
            pr_dict = pickle.load(outfile)
        avg_pr = sum(pr_dict.values()) / len(pr_dict)
        print avg_pr, "average pagerank"
    elif method == "indegree":
        path = "./src/data/{0}/{0}.inlink_dict".format(self.dataset)
        with open(path, "rb") as outfile:
            inlink = pickle.load(outfile)
        indegree_dict = defaultdict(int)
        for key in inlink:
            indegree_dict[key] = len(inlink[key])
        avg_indegree = sum(indegree_dict.values()) / len(indegree_dict)
    elif method == "est_prob":
        counter = Counter(sitemap.UP_pages.category)
        self.c_prob = defaultdict(float)
        total = sum(counter.values())
        for key in counter:
            self.c_prob[key] = float(counter[key]) / float(total)
    self.crawl_history = Counter()
    for i in range(self.cluster_num):
        self.crawl_history[i] = 1
    write_file = open(
        "./results/sampling/random_{0}_{1}_size{2}.txt".format(
            method, self.dataset, self.crawl_size), "w")
    num_web_crawl = 0
    entry, prefix = self.entry, self.prefix
    self.url_stack, self.crawl_length = [entry], 0
    self.final_list, url_list, last_list = [], [], []
    size, num = num_crawl, 0  # number of pages sampled so far
    s = sampler(self.dataset, self.entry, self.prefix, 0)
    while num < size and len(self.url_stack) > 0:
        first_url = self.url_stack[0]
        print "first_url", first_url
        try:
            sys.stdout.write("num is {}\n".format(num))
            sys.stdout.flush()
            url_list, cluster_id = self.sample_link(first_url, s, method)
            if first_url not in self.history_set:
                self.final_list.append((first_url, cluster_id))
                num += 1
            # record the visit in the sample history either way
            self.crawl_history[cluster_id] += 1
            self.crawl_length += 1
        except:
            print "might miss something here"
            traceback.print_exc()
            flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                              self.history_set)
            if flag == 1:
                sys.stdout.write("num is {}\n".format(num))
                sys.stdout.flush()
                url_list, cluster_id = self.sample_link(first_url, s, method)
                print url_list
                if first_url not in self.history_set:
                    num += 1
                    self.final_list.append((first_url, cluster_id))
                random_time_s = random.randint(5, 10)
                time.sleep(random_time_s)
                #num_web_crawl += 1
                if num_web_crawl % 10 == 9:
                    random_time_s = random.randint(60, 90)
                    time.sleep(random_time_s)
            else:
                # change the first_url via the sampling step below
                print num, "num"
                traceback.print_exc()
        if self.url_stack[0] == first_url:
            self.url_stack.pop(0)
        self.history_set.add(first_url)
        # pick the next URL: with probability 0.15 restart from the history
        # set, otherwise follow an out-link according to the chosen method
        probability = 0.15
        if method == "uniform":
            if random.random() < probability:
                self.select_from_history_set()
            else:
                try:
                    print url_list
                    id = random.randrange(len(url_list))
                    self.url_stack.append(url_list[id])
                    print url_list[id], "select from out-links"
                except:
                    # no out-links - random sample from the history set
                    self.select_from_history_set()
        elif method == "pagerank":
            if random.random() < probability:
                url = random.sample(self.history_set, 1)[0]
                self.url_stack.append(url)
                print url, "random sampled from history set"
            else:
                try:
                    id = self.sample_from_dist(url_list, pr_dict, avg_pr)
                    self.url_stack.append(url_list[id])
                    print url_list[id], "select from out-links"
                except:
                    self.select_from_history_set()
        elif method == "indegree":
            print "sample from oracle indegree"
            if random.random() < probability:
                url = random.sample(self.history_set, 1)[0]
                self.url_stack.append(url)
                print url, "random sampled from history set"
            else:
                try:
                    id = self.sample_from_dist(url_list, indegree_dict,
                                               avg_indegree)
                    self.url_stack.append(url_list[id])
                    print url_list[id], "select from out-links"
                except:
                    traceback.print_exc()
                    self.select_from_history_set()
        else:  # our method (est_prob)
            if random.random() < probability:
                url = random.sample(self.history_set, 1)[0]
                self.url_stack.append(url)
                print url, "random sampled from history set"
            else:
                try:
                    id = self.sample_from_prob_list(url_list)
                    self.url_stack.append(url_list[id][0])
                    print url_list[id], "select from out-links"
                except:
                    traceback.print_exc()
                    self.select_from_history_set()
    print len(self.final_list), "length of final list"
    for pair in self.final_list:
        url, cluster_id = pair[0], pair[1]
        write_file.write(url + "\t" + str(cluster_id) + '\n')
    write_file.close()
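# sample_from_dist is not shown in this file. Given how it is called above
# (a list of candidate URLs, a score dictionary such as pr_dict or
# indegree_dict, and an average score used for unseen URLs), one plausible
# reading is a score-proportional draw that returns an index into url_list.
# This is a hypothetical helper for illustration only, not the original
# implementation:
def sample_from_dist_sketch(url_list, score_dict, avg_score):
    """Return an index into url_list, drawn proportionally to each URL's score."""
    weights = [score_dict.get(url, avg_score) for url in url_list]
    total = float(sum(weights))
    r = random.random() * total
    acc = 0.0
    for i, w in enumerate(weights):
        acc += w
        if r <= acc:
            return i
    return len(url_list) - 1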