def seg_train(path=config.train_dev_path):
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    indexs = list(range(2, max_row + 1))
    shuffle(indexs)
    wb_train = Workbook()
    sheet_train = wb_train.create_sheet('sheet1')
    wb_train.remove(wb_train['Sheet'])
    wb_dev = Workbook()
    sheet_dev = wb_dev.create_sheet('sheet1')
    wb_dev.remove(wb_dev['Sheet'])
    # Copy the four header cells into both workbooks.
    for i in range(4):
        sheet_train.cell(1, i + 1, ws.cell(1, i + 1).value)
        sheet_dev.cell(1, i + 1, ws.cell(1, i + 1).value)
    # 80/20 split of the shuffled rows.
    mid = len(indexs) // 10 * 8
    train_line = 2
    test_line = 2
    for i in range(len(indexs)):
        if i < mid:
            for col in range(1, 5):
                sheet_train.cell(train_line, col, ws.cell(indexs[i], col).value)
            train_line += 1
        else:
            for col in range(1, 5):
                sheet_dev.cell(test_line, col, ws.cell(indexs[i], col).value)
            test_line += 1
    wb_train.save('./sub_train.xlsx')
    wb_dev.save('./sub_dev.xlsx')
    logger.info("Finished seg train data")
def get_all_vocab():
    vocab_train_path = './task2_vocab.txt'
    vocab_val_path = './task2_vocab.val.txt'
    train_data_path = './task2_train_reformat.xlsx'
    all_vocab_path = './all_vocab.txt'
    vocab_list = []
    for path in [vocab_train_path, vocab_val_path]:
        with open(path, 'r', encoding='utf-8') as vocab:
            for x in vocab.readlines():
                vocab_list.append(x[:-1].replace('_x0004_', '').replace(' ', ''))
    wb = load_workbook(train_data_path)
    ws = wb['sheet1']
    max_row = ws.max_row
    for i in range(max_row - 1):
        line = i + 2
        # Column 2 holds primary sites, column 4 holds metastatic sites.
        for col in (2, 4):
            if ws.cell(line, col).value is not None:
                places = ws.cell(line, col).value.split(',')
                for place in places:
                    vocab_list.append(place.replace('_x0004_', '').replace(' ', ''))
    vocab_list = list(set(vocab_list))
    with open(all_vocab_path, 'w', encoding='utf-8') as all_vocab_file:
        for place in vocab_list:
            all_vocab_file.write(place + '\n')
    logger.info('Finished writing all_vocab')
async def run_api_server():
    start_http_server(8001)
    logger.info("run api_server")
    runner = web.AppRunner(api_server)
    await runner.setup()
    site = web.TCPSite(runner, '0.0.0.0', 8000)
    await site.start()
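# Hypothetical sketch, not part of the original project: `api_server` is
# referenced above but not defined in this snippet. Something along these
# lines is being assumed; the route and handler below are placeholders.
from aiohttp import web

api_server = web.Application()

async def get_proxy(request):
    # Placeholder handler: the real one presumably returns a usable proxy,
    # e.g. a STATUS_OK row from the Proxy table.
    return web.json_response({"proxy": "127.0.0.1:3128"})

api_server.add_routes([web.get('/proxy', get_proxy)])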
def spider_main(search_list, use_cache=False):
    if use_cache:
        proxy_df = SpiderProxy.read_csv()
    else:
        pxy = SpiderProxy()
        # pxy.spider_proxy360()
        pxy.spider_xicidaili()
        pxy.check_proxy()
        pxy.save_csv()
        proxy_df = pxy.proxy_df
    if len(proxy_df) <= 0:
        logger.info('No working proxies, aborting')
        return
    # The proxies are slow, so only a small number of worker processes are
    # started per run.
    n_jobs = PROCESS_MAX_COUNT
    if g_enable_debug:
        n_jobs = 1
    parallel = Parallel(n_jobs=n_jobs, verbose=0, pre_dispatch='2*n_jobs')
    parallel(
        delayed(do_spider_parallel)(proxy_df, ind, search_name)
        for ind, search_name in enumerate(search_list))
def sub_text_more(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    all_text = []
    all_origin = []
    all_size = []
    all_trans = []
    for i in range(max_row - 1):
        line = i + 2
        text = ws.cell(line, 1).value
        texts = tool.split_text(text)
        all_text.extend(texts)
        for j in range(3):
            if ws.cell(line, j + 2).value is not None:
                places = ws.cell(line, j + 2).value.split(',')
                for t in texts:
                    # Keep only the labels that actually occur in this sub-sentence.
                    place_in_text = [place for place in places if place in t]
                    if j == 0:
                        all_origin.append(','.join(place_in_text))
                    elif j == 1:
                        all_size.append(','.join(place_in_text))
                    else:
                        all_trans.append(','.join(place_in_text))
            else:
                for t in texts:
                    if j == 0:
                        all_origin.append('')
                    elif j == 1:
                        all_size.append('')
                    else:
                        all_trans.append('')
    assert len(all_trans) == len(all_size) and len(all_trans) == len(all_origin), \
        'len(all_trans) != len(all_size) or len(all_trans) != len(all_origin)'
    for i in range(len(all_text)):
        line = i + 2
        ws1.cell(line, 1, all_text[i])
        ws1.cell(line, 2, all_origin[i])
        ws1.cell(line, 3, all_size[i])
        ws1.cell(line, 4, all_trans[i])
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
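# Hypothetical sketch: `tool.split_text` is used by several of these
# preprocessing functions but is not shown in this snippet. Something along
# these lines is being assumed; the exact delimiters in the real helper may differ.
import re

def split_text(text):
    # Split a report on Chinese sentence delimiters and drop empty pieces.
    parts = re.split(r'[。;;]', text)
    return [p for p in parts if p.strip()]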
def _do_collect_work(self):
    with ThreadPoolExecutor(max_workers=len(self.back_proxys) * 3) as executor:
        # A thread pool is used (rather than unbounded threads) because the
        # proxies are of poor quality and the thread count must stay bounded.
        thread_lock = threading.RLock()
        all_same_cnt = 0
        while True:
            soup = BeautifulSoup(self.driver.page_source, "lxml")
            img_objs = soup.select('#imgid > div > ul > li[data-objurl]')
            sub_same_cnt = 0
            for img in img_objs:
                url = img['data-objurl']
                url_thumb = img['data-thumburl']
                if self.requested_url.count(url) > 0:
                    sub_same_cnt += 1
                    continue
                url_dict = {'url': url, 'url_thumb': url_thumb}
                if g_enable_debug:
                    self.down_load_img(url_dict, thread_lock)
                else:
                    executor.submit(self.down_load_img, url_dict, thread_lock)
                # Append here so the worker threads do not need extra synchronization.
                self.requested_url.append(url)
            js = "window.scrollTo({}, {})".format(
                self.current_pos, self.current_pos + MOVE_STEPS)
            self.current_pos += MOVE_STEPS
            self.driver.execute_script(js)
            time.sleep(MOVE_SLEEP_TIME)
            # If every image on the page is already in requested_url, count one
            # full round of duplicates; otherwise reset the counter.
            if sub_same_cnt == len(img_objs):
                all_same_cnt += 1
            else:
                all_same_cnt = 0
            # After enough consecutive duplicate rounds, assume the image
            # stream has reached the bottom.
            if all_same_cnt > 30:
                print('[Process] image stream finished, downloaded count=%d' % self.collect_cnt)
                break
            if self.collect_cnt >= IMAGE_MAX_COUNT:
                logger.info('collect_cnt > %d task end' % IMAGE_MAX_COUNT)
                print('[Process] downloaded image count exceeds %d' % IMAGE_MAX_COUNT)
                break
def sub_text(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    wb2 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    ws2 = wb2.create_sheet('sheet1')
    wb2.remove(wb2['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
        ws2.cell(1, i + 1, names[i])
    line_1_2 = 2
    for i in tqdm(range(max_row - 1)):
        p = [[] for _ in range(6)]
        line = i + 2
        text = ws.cell(line, 1).value
        # Split each report into two halves at the first Chinese full stop.
        middle = text.find('。')
        text1 = text[:middle + 1]
        text2 = text[middle + 1:]
        for j in range(3):
            if ws.cell(line, j + 2).value is not None:
                places = ws.cell(line, j + 2).value.split(',')
                for place in places:
                    if j == 1:
                        # Sizes are kept for both halves.
                        p[2 * j].append(place)
                        p[2 * j + 1].append(place)
                    else:
                        if place in text1:
                            p[2 * j].append(place)
                        if place in text2:
                            p[2 * j + 1].append(place)
            else:
                p[2 * j] = ''
                p[2 * j + 1] = ''
        ws1.cell(line_1_2, 1, text1)
        ws1.cell(line_1_2 + 1, 1, text2)
        ws1.cell(line_1_2, 2, ','.join(p[0]))
        ws1.cell(line_1_2 + 1, 2, ','.join(p[1]))
        ws1.cell(line_1_2, 3, ','.join(p[2]))
        ws1.cell(line_1_2 + 1, 3, ','.join(p[3]))
        ws1.cell(line_1_2, 4, ','.join(p[4]))
        ws1.cell(line_1_2 + 1, 4, ','.join(p[5]))
        line_1_2 += 2
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
def do_thread_work(self, proxy, checked_list, thread_lock):
    if proxy['type'] == 'HTTP':
        proxy_dict = dict(http='http://{}'.format(proxy['proxy']),
                          https='http://{}'.format(proxy['proxy']))
    else:
        proxy_dict = dict(http='socks5://{}'.format(proxy['proxy']),
                          https='socks5://{}'.format(proxy['proxy']))
    try:
        # r = requests.post("https://www.baidu.com/", headers=self.headers, proxies=proxy_dict,
        #                   timeout=15, verify=False)
        img_url = ('http://picm.bbzhi.com/dongwubizhi/labuladuoxunhuiquanbizhi/'
                   'animal_labrador_retriever_1600x1200_44243_m.jpg')
        enable_stream = False
        if enable_stream:
            response = requests.get(img_url, headers=self.headers, proxies=proxy_dict,
                                    timeout=15, stream=True)
            if response.status_code == 200:
                test_name = '../gen/check_proxy.jpg'
                with open(test_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
                            f.flush()
                check_img = PIL.Image.open(test_name)
                check_img.close()
        else:
            response = requests.get(img_url, headers=self.headers, proxies=proxy_dict,
                                    timeout=(10, 20))
            if response.status_code == 200:
                test_name = '../gen/check_proxy.jpg'
                with open(test_name, 'wb') as f:
                    f.write(response.content)
                    f.flush()
                check_img = PIL.Image.open(test_name)
                check_img.close()
    except Exception as e:
        # logger.exception(e)
        return
    with thread_lock:
        logger.info('{} check ok'.format(proxy['proxy']))
        checked_list.append(proxy)
async def verify_error_proxy_task():
    logger.info("run verify_error_proxy_task")
    s = sess_maker()
    c = s.query(Proxy).filter(Proxy.status == STATUS_OK).count()
    s.close()
    if c < VERIFY_ERROR_LIMIT:
        await verify_error_proxy()
    s = sess_maker()
    c = s.query(Proxy).filter(Proxy.status == STATUS_ERROR).count()
    if c > MAX_ERROR_PROXIES:
        # Drop the oldest error proxies so the table stays below MAX_ERROR_PROXIES.
        res = s.query(Proxy).filter(Proxy.status == STATUS_ERROR).order_by(
            asc(Proxy.updated_at)).limit(c - MAX_ERROR_PROXIES).from_self().all()
        for i in res:
            s.delete(i)
        s.commit()
def check_proxy(self):
    checked_list = list()
    thread_lock = threading.RLock()
    thread_array = []
    for proxy in self.proxy_list:
        # self.do_thread_work(proxy, checked_list, thread_lock)
        t = threading.Thread(target=self.do_thread_work,
                             args=(proxy, checked_list, thread_lock))
        t.daemon = True
        t.start()
        thread_array.append(t)
    for t in thread_array:
        t.join()
    self.proxy_list = checked_list
    logger.info('proxy_list len={}'.format(len(self.proxy_list)))
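# Design note, not from the original code: the same fan-out/join can be
# expressed with a bounded pool from concurrent.futures, which avoids creating
# one thread per proxy. Sketch only; it assumes the same do_thread_work signature.
from concurrent.futures import ThreadPoolExecutor

def check_proxy_with_pool(self, max_workers=50):
    checked_list = []
    thread_lock = threading.RLock()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for proxy in self.proxy_list:
            executor.submit(self.do_thread_work, proxy, checked_list, thread_lock)
        # Leaving the with-block waits for all submitted checks to finish.
    self.proxy_list = checked_list
    logger.info('proxy_list len={}'.format(len(self.proxy_list)))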
def createMenu(self, accessToken):
    logger.info("createMenu in...........")
    redirect_uri1 = 'http://www.dabooster.com/sport/reserve_middleware_wx.html?query=yes'
    redirect_uri2 = 'http://www.dabooster.com/sport/reserve_middleware_wx.html?query=no'
    redirect_uri3 = 'http://www.dabooster.com/sport/personal_center_wx.html'
    oauth_url = ("https://open.weixin.qq.com/connect/oauth2/authorize?appid=" + appid +
                 "&redirect_uri={}&response_type=code&scope=snsapi_base&state=OK#wechat_redirect")
    url1 = oauth_url.format(redirect_uri1)
    url2 = oauth_url.format(redirect_uri2)
    url3 = oauth_url.format(redirect_uri3)
    menu = '''{
        "button": [
            {
                "name": "在线预定",
                "sub_button": [
                    {"type": "view", "name": "场地预定",
                     "url": "http://www.dabooster.com/sport/reserve_wx.html?appid=wx95198705de430c74"},
                    {"type": "view", "name": "我的订单", "url": "%s"}
                ]
            },
            {"type": "view", "name": "活动", "url": "%s"},
            {"type": "view", "name": "个人中心", "url": "%s"}
        ]
    }''' % (url1, url2, url3)
    html = urllib2.urlopen(self.createUrl + accessToken, menu.encode("utf-8"))
    result = json.loads(html.read().decode("utf-8"))
    logger.info("html=" + str(result))
    return result["errcode"]
def get_data(self):
    db = Database()
    db.connect()
    for tab in self.tab_list:
        # MySQL table names cannot contain '-', so replace it.
        if '-' in tab:
            temp_tab = tab.replace('-', '_')
        else:
            temp_tab = tab
        table_name = 'etf_' + temp_tab
        url = self.root_url
        first_body = {
            "tab": tab,
            "only": ["meta", "data"],
        }
        resp = requests.post(url, headers=self.header, data=json.dumps(first_body)).json()
        # 'meta' holds a summary of the returned data, including the page count.
        total_pages = resp['meta']['total_pages']
        logger.info("{} has total {} pages!!".format(tab, total_pages))
        logger.info("getting {} page {} data".format(tab, '1'))
        # Write the first page of results into the database, then fetch the rest.
        self.spilt_data_by_name_and_insert(resp['data'], db, table_name)
        for i in range(2, total_pages + 1):
            logger.info("getting data of {}--page {} / {}".format(
                tab, str(i), str(total_pages)))
            payload = {
                "page": str(i),
                "tab": tab,
                "only": ["meta", "data"],
            }
            resp = requests.post(url, headers=self.header, data=json.dumps(payload)).json()
            self.spilt_data_by_name_and_insert(resp['data'], db, table_name)
    db.close()
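# Optional hardening, not in the original: the paginated POST loop above hits
# the same endpoint many times, so a requests.Session with automatic retries on
# transient 5xx responses is a reasonable companion. Sketch only, assuming
# requests/urllib3 are already dependencies.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session():
    # Retry up to three times with exponential backoff on common server errors.
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    # Usage: resp = session.post(url, headers=self.header, data=json.dumps(payload)).json()
    return session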
async def wrapper(*args, **kw):
    # Inner wrapper of a logging decorator: run the wrapped spider coroutine
    # and log how many results it returned.
    res = await fn(*args, **kw)
    logger.info(f"run spider {fn.__name__} get {len(res)} result")
    return res
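# The wrapper above only makes sense inside a decorator that closes over `fn`.
# A minimal sketch of that enclosing decorator; the name `log_spider_result`
# is hypothetical, only the wrapper body comes from the snippet.
import functools

def log_spider_result(fn):
    @functools.wraps(fn)
    async def wrapper(*args, **kw):
        res = await fn(*args, **kw)
        logger.info(f"run spider {fn.__name__} get {len(res)} result")
        return res
    return wrapper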
def train(self):
    max_f1 = -1
    max_dict = {}
    max_report = {}
    label_report = {}
    loss_list = []
    f1_list = []
    epoch_list = []
    if not os.path.exists('./result/classification_report/{}'.format(self.config.experiment_name)):
        os.mkdir('./result/classification_report/{}'.format(self.config.experiment_name))
        os.mkdir('./result/picture/{}'.format(self.config.experiment_name))
        os.mkdir('./result/data/{}'.format(self.config.experiment_name))
        os.mkdir('./result/data/{}/test_format'.format(self.config.experiment_name))
    logger.info('Loading data ...')
    train_data = self.tool.load_data(self.config.train_path, self.config.is_bioes)
    dev_data = self.tool.load_data(self.config.dev_path, self.config.is_bioes)
    logger.info('Finished load data')
    logger.info('Building vocab ...')
    if self.config.is_pretrained_model:
        with open(self.config.pretrained_vocab, 'r', encoding='utf-8') as vocab_file:
            vocab_list = vocab_file.readlines()
        if self.config.model_name == 'FLAT':
            self.bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
            self.lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
        else:
            self.word_vocab = self.tool.get_text_vocab(vocab_list)
    else:
        if self.config.model_name == 'FLAT':
            self.bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
            self.lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
        else:
            self.word_vocab = self.tool.get_text_vocab(train_data, dev_data)
    if self.config.model_name == 'FLAT':
        vectors = self.lattice_vocab.vectors
    else:
        vectors = self.word_vocab.vectors
    self.tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
    logger.info('Finished build vocab')
    if self.config.is_hidden_tag:
        self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(train_data, dev_data)
        model = self.init_model(self.config, len(self.word_vocab), len(self.tag_vocab),
                                len(self.hidden_tag_vocab), vectors=vectors, n_bigram=None)
    elif self.config.model_name == 'FLAT':
        model = self.init_model(self.config, len(self.bigram_vocab), len(self.lattice_vocab),
                                len(self.tag_vocab), vectors=vectors, n_bigram=None)
    else:
        model = self.init_model(self.config, len(self.word_vocab), len(self.tag_vocab), None,
                                vectors=vectors, n_bigram=None)
    # model.load_state_dict(torch.load(self.config.model_path.format(self.config.experiment_name)))
    self.model = model
    logger.info('Building iterator ...')
    train_iter = self.tool.get_iterator(train_data, batch_size=self.config.batch_size)
    dev_iter = self.tool.get_iterator(dev_data, batch_size=self.config.batch_size)
    logger.info('Finished build iterator')
    optimizer = optim.Adam(model.parameters(), lr=self.config.learning_rate, weight_decay=1e-5)
    logger.info('Beginning train ...')
    for epoch in range(self.config.epoch):
        model.train()
        acc_loss = 0
        dice_loss = 0
        for index, batch in enumerate(tqdm(train_iter)):
            if batch.tag.shape[1] == self.config.batch_size:
                optimizer.zero_grad()
                dice = None
                if self.config.model_name == 'FLAT':
                    bigram = batch.bigram[0]
                    lattice = batch.lattice[0]
                    lattice_len = batch.lattice[1]
                    tag = batch.tag
                    loss = model.loss(bigram, lattice, lattice_len, tag)
                else:
                    text = batch.text[0]
                    tag = batch.tag
                    text_len = batch.text[1]
                    if self.config.is_hidden_tag:
                        hidden_tag = batch.hidden_tag
                        loss = model.loss(text, text_len, tag, hidden_tag)
                    else:
                        # This branch returns both the main loss and an auxiliary dice loss.
                        loss, dice = model.loss(text, text_len, tag)
                acc_loss += loss.view(-1).cpu().data.tolist()[0]
                if dice is not None:
                    # Guard: only the plain branch returns a dice term.
                    dice_loss += dice.view(-1).cpu().data.tolist()[0]
                loss.backward()
                optimizer.step()
        f1, report_dict, entity_prf_dict = self.eval(dev_iter)
        loss_list.append(acc_loss)
        # f1 = report_dict['weighted avg']['f1-score']
        f1_list.append(f1)
        epoch_list.append(epoch + 1)
        logger.info('dice_loss:{}'.format(dice_loss))
        logger.info('epoch:{} loss:{} weighted avg:{}'.format(
            epoch, acc_loss, report_dict['weighted avg']))
        if f1 > max_f1:
            max_f1 = f1
            label_report = report_dict['weighted avg']
            max_dict = entity_prf_dict['average']
            max_report = entity_prf_dict
            torch.save(model.state_dict(),
                       './save_model/{}.pkl'.format(self.config.experiment_name))
            logger.info('The best model saved has entity-f1:{} label-f1:{}'.format(
                max_f1, label_report['f1-score']))
    logger.info('Finished train')
    logger.info('Max_f1 avg : {}'.format(max_dict))
    # with codecs.open('./result/classification_report/{}/pred_info.txt'.format(config.experiment_name), 'w',
    #                  encoding='utf-8') as f:
    #     f.write(max_dict + '\n' + label_report)
    self.tool.write_csv(max_report, label_report)
    self.tool.show_1y(epoch_list, loss_list, 'loss')
    self.tool.show_1y(epoch_list, f1_list, 'f1')
def write_val_true_pred(self, path=None, model_name=None, save_path=None):
    if path is None:
        model_name = self.config.model_path.format(self.config.experiment_name)
        save_path = self.config.analysis_path.format(self.config.experiment_name)
    train_data = self.tool.load_data(self.config.train_path, self.config.is_bioes)
    dev_data = self.tool.load_data(self.config.dev_path, self.config.is_bioes)
    logger.info('Finished load data')
    logger.info('Building vocab ...')
    if self.config.is_pretrained_model:
        with open(self.config.pretrained_vocab, 'r', encoding='utf-8') as vocab_file:
            vocab_list = vocab_file.readlines()
        word_vocab = self.tool.get_text_vocab(vocab_list)
    else:
        if self.config.model_name == 'FLAT':
            bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
            lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
        else:
            word_vocab = self.tool.get_text_vocab(train_data, dev_data)
    # vectors = lattice_vocab.vectors
    vectors = None
    tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
    logger.info('Finished build vocab')
    if self.config.is_hidden_tag:
        self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(train_data, dev_data)
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab),
                                len(self.hidden_tag_vocab), vectors=vectors)
    elif self.config.model_name == 'FLAT':
        model = self.init_model(self.config, len(bigram_vocab), len(lattice_vocab),
                                len(tag_vocab), vectors=vectors, n_bigram=None)
    else:
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab), None, vectors=vectors)
    model.load_state_dict(torch.load(model_name))
    # New workbook with seven columns: the three gold columns plus predictions.
    wb_analysis = Workbook()
    analysis_sheet = wb_analysis.create_sheet('sheet1')
    wb_analysis.remove(wb_analysis['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位',
             'pred_原发部位', 'pred_病灶大小', 'pred_转移部位']
    for i in range(len(names)):
        analysis_sheet.cell(1, i + 1, names[i])
    wb = load_workbook(filename=config.analysis_dev_path)
    ws = wb['sheet1']
    max_row = ws.max_row
    false_fill = PatternFill(fill_type='solid', fgColor='FFC125')
    for line_num in tqdm(range(max_row - 1)):
        line_num += 2
        sentence = ws.cell(line_num, 1).value
        sentence1 = []
        tag_pred = []
        if self.config.model_name == 'FLAT':
            # NOTE: the FLAT branch assumes w_trie (the lexicon trie built in
            # predict_test) is available in this scope.
            texts = self.tool.split_text(sentence)
            for text in texts:
                sentence1.extend(text)
                bigram1 = get_bigram(text)
                bigram = torch.tensor(numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                                  dtype='int64')).unsqueeze(1).expand(
                    len(bigram1), self.config.batch_size).to(device)
                lattice1 = list(text) + w_trie.get_lexicon(text)
                lattice = torch.tensor(numpy.array([lattice_vocab.stoi[word] for word in lattice1],
                                                   dtype='int64')).unsqueeze(1).expand(
                    len(lattice1), self.config.batch_size).to(device)
                lattice_len = torch.tensor(numpy.array([len(lattice1)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(bigram, lattice, lattice_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        else:
            texts = self.tool.split_text(sentence)
            for text in texts:
                sentence1.extend(text)
                text = torch.tensor(numpy.array([word_vocab.stoi[word] for word in text],
                                                dtype='int64')).unsqueeze(1).expand(
                    len(text), self.config.batch_size).to(device)
                text_len = torch.tensor(numpy.array([len(text)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(text, text_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        sentence1 = ''.join(sentence1)
        i = 0
        origin_places = []
        sizes = []
        transfered_places = []
        # Decode the predicted tag sequence into entity spans.
        while i < len(tag_pred):
            if self.config.is_bioes:
                start = end = 0
                if tag_pred[i][:1] == 'B':
                    kind = tag_pred[i][2:]
                    start = i
                    end = i
                    while end + 1 < len(sentence1) and (tag_pred[end + 1][0] == 'I' or
                                                        tag_pred[end + 1][0] == 'E') and tag_pred[end + 1][2:] == kind:
                        end += 1
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i = end + 1
                elif tag_pred[i][:1] == 'E':
                    kind = tag_pred[i][2:]
                    start = i
                    end = i
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i += 1
                else:
                    i += 1
            else:
                start = end = 0
                if tag_pred[i][:1] == 'B':
                    kind = tag_pred[i][2:]
                    start = end = i
                    while end + 1 < len(sentence1) and tag_pred[end + 1][0] == 'I' and tag_pred[end + 1][2:] == kind:
                        end += 1
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i = end + 1
                else:
                    i += 1
        for places in [origin_places, sizes, transfered_places]:
            for place in places:
                # note: place is a string here, so this check never matches.
                if place == []:
                    places.remove(place)
        analysis_sheet.cell(line_num, 1, ws.cell(line_num, 1).value)
        analysis_sheet.cell(line_num, 2, ws.cell(line_num, 2).value)
        analysis_sheet.cell(line_num, 3, ws.cell(line_num, 3).value)
        analysis_sheet.cell(line_num, 4, ws.cell(line_num, 4).value)
        analysis_sheet.cell(line_num, 5, ','.join(list(set(origin_places))))
        analysis_sheet.cell(line_num, 6, ','.join(list(set(sizes))))
        analysis_sheet.cell(line_num, 7, ','.join(list(set(transfered_places))))
        # Highlight cells where the prediction disagrees with the gold column.
        for i in range(2, 5):
            if analysis_sheet.cell(line_num, i).value is not None:
                if analysis_sheet.cell(line_num, i + 3).value is None:
                    analysis_sheet.cell(line_num, i).fill = false_fill
                    analysis_sheet.cell(line_num, i + 3).fill = false_fill
                else:
                    flag = False
                    s1 = analysis_sheet.cell(line_num, i).value.split(',')
                    s2 = analysis_sheet.cell(line_num, i + 3).value.split(',')
                    for x in s2:
                        if x not in s1:
                            flag = True
                            break
                    if flag:
                        analysis_sheet.cell(line_num, i).fill = false_fill
                        analysis_sheet.cell(line_num, i + 3).fill = false_fill
    wb_analysis.save(save_path)
    logger.info('Finished Predicting...')
async def verify_ok_proxy_task():
    logger.info("run verify_ok_proxy_task")
    await verifier.verify_ok_proxy()
    await verify_error_proxy_task()
    await update_squid_task()


@cron_wait
async def fetch_new_proxy_task():
    logger.info("run fetch_new_proxy_task")
    await spider.run_spider()
    await verifier.verify_new_proxy()
    # await verify_error_proxy_task()
    await update_squid_task()


if __name__ == '__main__':
    logger.info("start")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(update_squid_task())
    msh = Scheduler()
    msh.add_job(CronJob().every(10).minute.go(verify_ok_proxy_task))
    msh.add_job(CronJob().every(30).minute.go(fetch_new_proxy_task))
    try:
        loop.run_until_complete(asyncio.wait([
            msh.start(),
            run_api_server(),
        ]))
        loop.run_forever()
    except KeyboardInterrupt:
        print('exit')
def data_clean(path='./task2_train_reformat{}.xlsx'):
    wb = load_workbook(path.format(''))
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    place_num = 0
    size_num = 0
    for i in range(max_row - 1):
        line = i + 2
        new_sentence = ''
        chars = ['.', '*', '×', 'X', 'x', 'c', 'C', 'm', 'M', ' ']
        # o_chars = ['_x0004_', '�', ':', ',', ';']
        # t_chars = ['', '', ':', ',', ';']
        o_chars = ['�']
        t_chars = ['']
        for col in range(4):
            if col == 0:
                # Column 1: the report text. Lab-value reports ('检测值') keep their
                # spaces; everything else has spaces stripped.
                if '检测值' in ws.cell(line, col + 1).value:
                    new_sentence = ws.cell(line, col + 1).value
                    for k in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[k], t_chars[k])
                else:
                    new_sentence = ws.cell(line, col + 1).value.replace(' ', '')
                    for k in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[k], t_chars[k])
                # Normalise every size expression (e.g. 1.2*0.8cm) to 1.2CM×0.8CM / MM.
                j = 0
                while j < len(new_sentence):
                    while j < len(new_sentence) and not new_sentence[j].isdigit():
                        j += 1
                    start = j
                    end = start
                    while end + 1 < len(new_sentence) and (new_sentence[end + 1] in chars
                                                           or new_sentence[end + 1].isdigit()):
                        end += 1
                    if 'm' in new_sentence[start:end + 1] or 'M' in new_sentence[start:end + 1]:
                        old_size = new_sentence[start:end + 1]
                        new_size = ''
                        nums = ''
                        k = start
                        flag = False
                        while k <= end:
                            while new_sentence[k].isdigit() or new_sentence[k] == '.':
                                nums += new_sentence[k]
                                k += 1
                                flag = True
                            if flag:
                                nums += ','
                                flag = False
                            k += 1
                        nums = nums[:-1].split(',')
                        if 'c' in old_size or 'C' in old_size:
                            for num in nums:
                                new_size = new_size + num + 'CM' + '×'
                            new_size = new_size[:-1]
                        else:
                            for num in nums:
                                new_size = new_size + num + 'MM' + '×'
                            new_size = new_size[:-1]
                        j = end
                        new_sentence = new_sentence.replace(old_size, new_size)
                    j += 1
                ws1.cell(line, col + 1, new_sentence)
            elif col == 1 or col == 3:
                # Columns 2 and 4: primary / metastatic sites.
                places = ws.cell(line, col + 1).value
                if places is not None:
                    places = places.replace('_x0004_', '').replace(' ', '')
                    for place in places.split(','):
                        if place not in new_sentence:
                            place_num += 1
                            print('place not found in sentence, count {} {} {}'.format(
                                place_num, new_sentence, place))
                    ws1.cell(line, col + 1, places)
                else:
                    ws1.cell(line, col + 1, '')
            else:
                # Column 3: lesion sizes, normalised to the same CM/MM format.
                sizes = ws.cell(line, col + 1).value
                if sizes is not None:
                    sizes = sizes.replace('_x0004_', '').replace(' ', '').replace('�', '')
                    sizes_clean = ''
                    sizes_list = sizes.split(',')
                    for j in range(len(sizes_list)):
                        size = re.findall(r"\d+\.?\d*", sizes_list[j])
                        if 'c' in sizes_list[j] or 'C' in sizes_list[j]:
                            for k in range(len(size)):
                                sizes_clean = sizes_clean + size[k] + 'CM' + '×'
                            sizes_clean = sizes_clean[:-1] + ','
                        else:
                            for k in range(len(size)):
                                sizes_clean = sizes_clean + size[k] + 'MM' + '×'
                            sizes_clean = sizes_clean[:-1] + ','
                    sizes_clean = sizes_clean[:-1]
                    for size in sizes_clean.split(','):
                        if size not in new_sentence:
                            size_num += 1
                            print('size not found in sentence, count {} {} {}'.format(
                                size_num, new_sentence, size))
                    ws1.cell(line, col + 1, sizes_clean)
    wb1.save(path.format('_cleaned'))
    logger.info('Finished cleaning data')
def sub_text_condition(file='train'):
    if file == 'train':
        path = './sub_train.xlsx'
        save_path = './sub_cut_train1.xlsx'
    else:
        path = './sub_dev.xlsx'
        save_path = './sub_cut_dev1.xlsx'
    wb = load_workbook(path)
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '原发部位', '病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    all_text = []
    all_origin = []
    all_size = []
    all_trans = []
    for i in range(max_row - 1):
        line = i + 2
        text = ws.cell(line, 1).value
        texts = tool.split_text(text)
        all_text.extend(texts)
        # Metastatic sites (column 4): when the whole report mentions '转移',
        # only keep a label for sub-sentences that also mention it.
        if ws.cell(line, 4).value is not None and '转移' in text:
            places = ws.cell(line, 4).value.split(',')
            for t in texts:
                place_in_text = [place for place in places if place in t and '转移' in t]
                all_trans.append(','.join(place_in_text))
        elif ws.cell(line, 4).value is not None and '转移' not in text:
            places = ws.cell(line, 4).value.split(',')
            for t in texts:
                place_in_text = [place for place in places if place in t]
                all_trans.append(','.join(place_in_text))
        else:
            for t in texts:
                all_trans.append('')
        # Primary sites (column 2) and lesion sizes (column 3).
        for j in range(2):
            if ws.cell(line, j + 2).value is not None:
                places = ws.cell(line, j + 2).value.split(',')
                for t in texts:
                    place_in_text = [place for place in places if place in t]
                    if j == 0:
                        all_origin.append(','.join(place_in_text))
                    else:
                        all_size.append(','.join(place_in_text))
            else:
                for t in texts:
                    if j == 0:
                        all_origin.append('')
                    else:
                        all_size.append('')
    assert len(all_text) == len(all_trans) and len(all_trans) == len(all_size) \
        and len(all_trans) == len(all_origin), \
        'len(all_trans) != len(all_size) or len(all_trans) != len(all_origin)'
    for i in range(len(all_text)):
        line = i + 2
        ws1.cell(line, 1, all_text[i])
        ws1.cell(line, 2, all_origin[i])
        ws1.cell(line, 3, all_size[i])
        ws1.cell(line, 4, all_trans[i])
    wb1.save(save_path)
    logger.info('Finished cut {}.xlsx'.format(file))
def predict_test(self, path=None, model_name=None, save_path=None):
    if path is None:
        path = self.config.test_path
        model_name = self.config.model_path.format(self.config.experiment_name)
        save_path = self.config.unformated_val_path.format(self.config.experiment_name)
    train_data = self.tool.load_data(self.config.train_path, self.config.is_bioes)
    dev_data = self.tool.load_data(self.config.dev_path, self.config.is_bioes)
    logger.info('Finished load data')
    logger.info('Building vocab ...')
    model = None
    if self.config.is_pretrained_model:
        with open(self.config.pretrained_vocab, 'r', encoding='utf-8') as vocab_file:
            vocab_list = vocab_file.readlines()
        word_vocab = self.tool.get_text_vocab(vocab_list)
    else:
        if self.config.model_name == 'FLAT':
            bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
            lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
        else:
            word_vocab = self.tool.get_text_vocab(train_data, dev_data)
    # vectors = lattice_vocab.vectors
    vectors = None
    tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
    logger.info('Finished build vocab')
    if self.config.is_hidden_tag:
        self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(train_data, dev_data)
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab),
                                len(self.hidden_tag_vocab), vectors=vectors)
    elif self.config.model_name == 'FLAT':
        model = self.init_model(self.config, len(bigram_vocab), len(lattice_vocab),
                                len(tag_vocab), vectors=vectors, n_bigram=None)
    else:
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab), None, vectors=vectors)
    model.load_state_dict(torch.load(model_name))
    wb = load_workbook(filename=path)
    ws = wb['sheet1']
    max_row = ws.max_row
    # Build a word trie from the external lexicon; the FLAT branch uses it to
    # add lexicon words to the lattice.
    f = open(self.config.vocab_path, 'r')
    lines = f.readlines()
    w_list = []
    for line in lines:
        splited = line.strip().split(' ')
        w = splited[0]
        w_list.append(w)
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)
    for line_num in tqdm(range(max_row - 1)):
        line_num += 2
        sentence = ws.cell(line_num, 1).value
        sentence1 = []
        tag_pred = []
        if self.config.model_name == 'FLAT':
            texts = self.tool.split_text(sentence)
            for text in texts:
                sentence1.extend(text)
                bigram1 = get_bigram(text)
                bigram = torch.tensor(numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                                  dtype='int64')).unsqueeze(1).expand(
                    len(bigram1), self.config.batch_size).to(device)
                lattice1 = list(text) + w_trie.get_lexicon(text)
                lattice = torch.tensor(numpy.array([lattice_vocab.stoi[word] for word in lattice1],
                                                   dtype='int64')).unsqueeze(1).expand(
                    len(lattice1), self.config.batch_size).to(device)
                lattice_len = torch.tensor(numpy.array([len(lattice1)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(bigram, lattice, lattice_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        else:
            texts = self.tool.split_text(sentence)
            for text in texts:
                sentence1.extend(text)
                text = torch.tensor(numpy.array([word_vocab.stoi[word] for word in text],
                                                dtype='int64')).unsqueeze(1).expand(
                    len(text), self.config.batch_size).to(device)
                text_len = torch.tensor(numpy.array([len(text)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(text, text_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        sentence1 = ''.join(sentence1)
        i = 0
        origin_places = []
        sizes = []
        transfered_places = []
        # Decode the predicted tag sequence into entity spans.
        while i < len(tag_pred):
            if self.config.is_bioes:
                start = end = 0
                if tag_pred[i][:1] == 'B':
                    kind = tag_pred[i][2:]
                    start = i
                    end = i
                    while end + 1 < len(sentence1) and (tag_pred[end + 1][0] == 'I' or
                                                        tag_pred[end + 1][0] == 'E') and tag_pred[end + 1][2:] == kind:
                        end += 1
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i = end + 1
                elif tag_pred[i][:1] == 'E':
                    kind = tag_pred[i][2:]
                    start = i
                    end = i
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i += 1
                else:
                    i += 1
            else:
                start = end = 0
                if tag_pred[i][:1] == 'B':
                    kind = tag_pred[i][2:]
                    start = end = i
                    while end + 1 < len(sentence1) and tag_pred[end + 1][0] == 'I' and tag_pred[end + 1][2:] == kind:
                        end += 1
                    if kind == 'origin_place':
                        origin_places.append(sentence1[start:end + 1])
                    elif kind == 'size':
                        sizes.append(sentence1[start:end + 1])
                    else:
                        transfered_places.append(sentence1[start:end + 1])
                    i = end + 1
                else:
                    i += 1
        for places in [origin_places, sizes, transfered_places]:
            for place in places:
                # note: place is a string here, so this check never matches.
                if place == []:
                    places.remove(place)
        ws.cell(line_num, 2).value = ','.join(list(set(origin_places)))
        ws.cell(line_num, 3).value = ','.join(list(set(sizes)))
        ws.cell(line_num, 4).value = ','.join(list(set(transfered_places)))
    wb.save(filename=save_path)
    logger.info('Finished Predicting...')
async def fetch_new_proxy_task():
    logger.info("run fetch_new_proxy_task")
    await spider.run_spider()
    await verifier.verify_new_proxy()
    # await verify_error_proxy_task()
    await update_squid_task()
def test_format_result(self):
    self.train_data = self.tool.load_data(self.config.train_path)
    self.dev_data = self.tool.load_data(self.config.dev_path)
    tag_vocab = self.tool.get_tag_vocab(self.train_data, self.dev_data)
    self.predict_test(
        path=self.config.dev_path,
        save_path=self.config.test_unformated_val_path.format(self.config.experiment_name))
    tag_true = []
    tag_formated_pred = []
    tag_unformated_pred = []
    format_result(
        path=self.config.test_unformated_val_path.format(self.config.experiment_name),
        save_path=self.config.test_formated_val_path.format(self.config.experiment_name))
    dev_data = self.tool.load_data(self.config.dev_path)
    formated_dev_data = self.tool.load_data(
        self.config.test_formated_val_path.format(self.config.experiment_name))
    unformated_dev_data = self.tool.load_data(
        self.config.test_unformated_val_path.format(self.config.experiment_name))
    assert len(dev_data.examples) == len(unformated_dev_data.examples), \
        'train_dev_data:{} != unformated_train_dev_data:{}'.format(
            len(dev_data.examples), len(unformated_dev_data.examples))
    assert len(dev_data.examples) == len(formated_dev_data.examples), \
        'train_dev_data:{} != formated_train_dev_data:{}'.format(
            len(dev_data.examples), len(formated_dev_data.examples))
    for example1 in dev_data.examples:
        tag_true.extend(example1.tag)
    for example2 in formated_dev_data.examples:
        tag_formated_pred.extend(example2.tag)
    for example3 in unformated_dev_data.examples:
        tag_unformated_pred.extend(example3.tag)
    assert len(tag_true) == len(tag_unformated_pred), \
        'tag_true:{} != tag_pred:{}'.format(len(tag_true), len(tag_unformated_pred))
    assert len(tag_true) == len(tag_formated_pred), \
        'tag_true:{} != tag_pred:{}'.format(len(tag_true), len(tag_formated_pred))
    labels = []
    for index, label in enumerate(tag_vocab.itos):
        labels.append(label)
    labels.remove('O')
    # Evaluate both the raw and the post-formatted predictions against the gold tags.
    prf_dict_formated = classification_report(tag_true, tag_formated_pred,
                                              labels=labels, output_dict=True)
    prf_dict_unformated = classification_report(tag_true, tag_unformated_pred,
                                                labels=labels, output_dict=True)
    logger.info('unformated report{}'.format(prf_dict_unformated['weighted avg']))
    logger.info('formated report{}'.format(prf_dict_formated['weighted avg']))
def data_clean_test(path='./task2_no_val{}.xlsx'):
    wb = load_workbook(path.format(''))
    ws = wb['sheet1']
    max_row = ws.max_row
    wb1 = Workbook()
    ws1 = wb1.create_sheet('sheet1')
    wb1.remove(wb1['Sheet'])
    names = ['原文', '肿瘤原发部位', '原发病灶大小', '转移部位']
    for i in range(len(names)):
        ws1.cell(1, i + 1, names[i])
    for i in range(max_row - 1):
        line = i + 2
        new_sentence = ''
        chars = ['.', '*', '×', 'X', 'x', 'c', 'C', 'm', 'M']
        # o_chars = ['_x0004_', '�', ':', ',', ';']
        # t_chars = ['', '', ':', ',', ';']
        o_chars = ['�']
        t_chars = ['']
        for col in range(4):
            if col == 0:
                # Only the report text column exists in the unlabelled test set.
                if '检测值' in ws.cell(line, col + 1).value:
                    new_sentence = ws.cell(line, col + 1).value
                    for j in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[j], t_chars[j])
                else:
                    new_sentence = ws.cell(line, col + 1).value.replace(' ', '')
                    # new_sentence = ws.cell(line, col + 1).value
                    for j in range(len(o_chars)):
                        new_sentence = new_sentence.replace(o_chars[j], t_chars[j])
                # Normalise every size expression (e.g. 1.2*0.8cm) to 1.2CM×0.8CM / MM.
                j = 0
                while j < len(new_sentence):
                    while j < len(new_sentence) and not new_sentence[j].isdigit():
                        j += 1
                    start = j
                    end = start
                    while end + 1 < len(new_sentence) and (new_sentence[end + 1] in chars
                                                           or new_sentence[end + 1].isdigit()):
                        end += 1
                    if 'm' in new_sentence[start:end + 1] or 'M' in new_sentence[start:end + 1]:
                        old_size = new_sentence[start:end + 1]
                        new_size = ''
                        nums = ''
                        k = start
                        flag = False
                        while k <= end:
                            while new_sentence[k].isdigit() or new_sentence[k] == '.':
                                nums += new_sentence[k]
                                k += 1
                                flag = True
                            if flag:
                                nums += ','
                                flag = False
                            k += 1
                        nums = nums[:-1].split(',')
                        if 'c' in old_size or 'C' in old_size:
                            for num in nums:
                                new_size = new_size + num + 'CM' + '×'
                            new_size = new_size[:-1]
                        else:
                            for num in nums:
                                new_size = new_size + num + 'MM' + '×'
                            new_size = new_size[:-1]
                        j = end
                        new_sentence = new_sentence.replace(old_size, new_size)
                    j += 1
                ws1.cell(line, col + 1, new_sentence)
    wb1.save(path.format('_cleaned'))
    logger.info('Finished val cleaned data')
def predict_sentence(self, model_name=None):
    if model_name is None:
        model_name = self.config.model_path.format(self.config.experiment_name)
    train_data = self.tool.load_data(self.config.train_path, self.config.is_bioes)
    dev_data = self.tool.load_data(self.config.dev_path, self.config.is_bioes)
    logger.info('Finished load data')
    logger.info('Building vocab ...')
    model = None
    if self.config.is_pretrained_model:
        with open(self.config.pretrained_vocab, 'r', encoding='utf-8') as vocab_file:
            vocab_list = vocab_file.readlines()
        word_vocab = self.tool.get_text_vocab(vocab_list)
    else:
        if self.config.model_name == 'FLAT':
            bigram_vocab = self.tool.get_bigram_vocab(train_data, dev_data)
            lattice_vocab = self.tool.get_text_vocab(train_data, dev_data)
        else:
            word_vocab = self.tool.get_text_vocab(train_data, dev_data)
    vectors = None
    tag_vocab = self.tool.get_tag_vocab(train_data, dev_data)
    logger.info('Finished build vocab')
    if self.config.is_hidden_tag:
        self.hidden_tag_vocab = self.tool.get_hidden_tag_vocab(train_data, dev_data)
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab),
                                len(self.hidden_tag_vocab), vectors=vectors)
    elif self.config.model_name == 'FLAT':
        model = self.init_model(self.config, len(bigram_vocab), len(lattice_vocab),
                                len(tag_vocab), vectors=vectors, n_bigram=None)
    else:
        model = self.init_model(self.config, len(word_vocab), len(tag_vocab), None, vectors=vectors)
    model.load_state_dict(torch.load(model_name))
    # Build the lexicon trie used by the FLAT lattice.
    f = open(self.config.vocab_path, 'r')
    lines = f.readlines()
    w_list = []
    for line in lines:
        splited = line.strip().split(' ')
        w = splited[0]
        w_list.append(w)
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)
    while True:
        print('Please input a sentence:')
        sentence = input()
        sentence1 = []
        if self.config.model_name == 'FLAT':
            texts = self.tool.split_text(sentence)
            tag_pred = []
            for text in texts:
                sentence1.extend(text)
                bigram1 = get_bigram(text)
                bigram = torch.tensor(numpy.array([bigram_vocab.stoi[bi] for bi in bigram1],
                                                  dtype='int64')).unsqueeze(1).expand(
                    len(bigram1), self.config.batch_size).to(device)
                lattice1 = list(text) + w_trie.get_lexicon(text)
                lattice = torch.tensor(numpy.array([lattice_vocab.stoi[word] for word in lattice1],
                                                   dtype='int64')).unsqueeze(1).expand(
                    len(lattice1), self.config.batch_size).to(device)
                lattice_len = torch.tensor(numpy.array([len(lattice1)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(bigram, lattice, lattice_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        else:
            texts = self.tool.split_text(sentence)
            tag_pred = []
            for text in texts:
                sentence1.extend(text)
                text = torch.tensor(numpy.array([word_vocab.stoi[word] for word in text],
                                                dtype='int64')).unsqueeze(1).expand(
                    len(text), self.config.batch_size).to(device)
                text_len = torch.tensor(numpy.array([len(text)], dtype='int64')).expand(
                    self.config.batch_size).to(device)
                result = model(text, text_len)[0]
                for k in result:
                    tag_pred.append(tag_vocab.itos[k])
        sentence1 = ''.join(sentence1)
        i = 0
        origin_places = []
        sizes = []
        transfered_places = []
        # Greedy span decoding: consecutive tags of the same kind form one entity.
        while i < len(tag_pred):
            start = 0
            end = 0
            kind = None
            if tag_pred[i] != 'O':
                start = i
                kind = tag_pred[i][2:]
                while i + 1 < len(tag_pred) and tag_pred[i + 1][2:] == kind:
                    i += 1
                end = i + 1
                if kind == 'origin_place':
                    origin_places.append(sentence1[start:end])
                elif kind == 'size':
                    sizes.append(sentence1[start:end])
                else:
                    transfered_places.append(sentence1[start:end])
            i += 1
        # print(sentence1)
        # print(tag_pred)
        for i in range(len(sentence1)):
            print(sentence1[i], tag_pred[i])
        print(origin_places)
        print(sizes)
        print(transfered_places)
async def update_squid_task():
    logger.info("run update_squid_task")
    s = sess_maker()
    proxies = s.query(Proxy).filter(Proxy.status == STATUS_OK).all()
    s.close()
    squid.update_conf(proxies)
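# Hypothetical sketch of the `squid.update_conf` helper used above; the real
# one is project code not shown here. It assumes a config template on disk, a
# Proxy model exposing ip/port, and squid reachable via `squid -k reconfigure`.
import subprocess

SQUID_TEMPLATE = '/etc/squid/squid.conf.template'  # assumed path
SQUID_CONF = '/etc/squid/squid.conf'               # assumed path

def update_conf(proxies):
    # One cache_peer line per working proxy; round-robin lets squid rotate them.
    peers = ['cache_peer {} parent {} 0 no-query proxy-only round-robin\n'.format(p.ip, p.port)
             for p in proxies]
    with open(SQUID_TEMPLATE) as f:
        template = f.read()
    with open(SQUID_CONF, 'w') as f:
        f.write(template + ''.join(peers))
    # Ask squid to reload its configuration.
    subprocess.run(['squid', '-k', 'reconfigure'], check=False)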