def encode_corpus(self, documents: List[str]) -> Union[torch.Tensor, np.ndarray]:
    # Sort documents by length so each batch needs minimal padding
    length_sorted_idx = np.argsort([len(sen) for sen in documents])
    documents = [documents[idx] for idx in length_sorted_idx]
    encoded_documents = []
    for start_index in trange(0, len(documents), self.params.batch_size):
        sentences_batch = documents[start_index:start_index + self.params.batch_size]
        encoded_dict = self.params.tokenizer(
            text=sentences_batch,
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            max_length=self.params.sequence_max_len,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='np')
        inputs = {
            # reshape(1, -1) flattens the batch into a single row, as the
            # exported ONNX graph apparently expects
            'input_ids': encoded_dict["input_ids"].reshape(1, -1),
            'attention_mask': encoded_dict["attention_mask"].reshape(1, -1),
        }
        output = self.session.run(None, inputs)
        embeddings = output[0]
        encoded_documents.extend(embeddings)
    # Restore the original document order
    encoded_documents = [
        encoded_documents[idx] for idx in np.argsort(length_sorted_idx)
    ]
    return encoded_documents
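# The method above runs the encoder through an ONNX Runtime session instead of
# a torch module. A minimal sketch of building such a session (the model path
# and provider here are illustrative assumptions, not from the original code):
import onnxruntime as ort

session = ort.InferenceSession("encoder.onnx", providers=["CPUExecutionProvider"])
# The keys of `inputs` above must match the exported graph's input names:
print([inp.name for inp in session.get_inputs()])  # e.g. ['input_ids', 'attention_mask']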
def get_data(folderpath_origin):
    """Load all files to be processed.

    Args:
        :folderpath_origin (str): Folder, directory containing the files to load

    Returns:
        :new_all_data (list): List of all files, all of the data to process collected into a list
    """
    filenames = os.listdir(folderpath_origin)
    all_data = []
    for name in filenames:
        path = folderpath_origin + name
        if path[-5:] == '.xlsx':
            data = pd.read_excel(path)
            abstracts = data['摘要'].to_list()  # the '摘要' (abstract) column
            for abstract in abstracts:
                all_data.append(abstract)
        elif path[-4:] == '.csv':
            data = pd.read_csv(path)
            detaileds = data['详细'].to_list()  # the '详细' (details) column
            for detailed in detaileds:
                all_data.append(detailed)
    new_all_data = []
    print('Starting deduplication!!!')
    # Order-preserving deduplication (O(n^2), but keeps the original order)
    for data_index in trange(len(all_data)):
        data = all_data[data_index]
        if data not in new_all_data:
            new_all_data.append(data)
    return new_all_data
def save_mean_data(data, amount, folderpath_dest):
    """Split the recognized data evenly by the number of team members, so review can continue later.

    Args:
        :data (list): list of data, all recognized data
        :amount (int): number of people, how many equal parts to split into
        :folderpath_dest (str): path of folder, directory where the recognized data will be saved
    """
    all_data_length = len(data)
    mean_data_length = all_data_length // amount
    start = 0
    end = mean_data_length
    print('Splitting all data evenly')
    for index in trange(amount):
        # Advance `end` until the element just before it is empty, so each
        # part ends on a record boundary
        while len(data[end - 1]) != 0:
            end += 1
        sheet = pd.DataFrame(data[start:end - 1])
        filename = folderpath_dest + 'all_extract_part_' + str(index) + '.txt'
        sheet.to_csv(filename, index=None, header=None)
        start = end
        end += mean_data_length
        if end > all_data_length:
            end = all_data_length
def check_data(folderpath_origin, folderpath_dest):
    """Fix character-merging problems caused by manual annotation (single file).

    Args:
        :folderpath_origin (str): Folder, path of the file to process
        :folderpath_dest (str): Folder, path where the processed data is saved
    """
    all_data = pd.read_csv(folderpath_origin, header=None)
    all_data = all_data.values.tolist()
    new_all_data_list = []
    for data_index in trange(len(all_data)):
        data = all_data[data_index]
        if ((str(data[0]) != 'nan') and (not str(data[0]).isdigit())
                and (len(data[0]) > 1)):
            # A merged cell: split it into single characters, each tagged 'O'
            for char in data[0]:
                new_all_data_list.append([char, 'O'])
        else:
            new_all_data_list.append(data)
    sheet = pd.DataFrame(new_all_data_list)
    sheet.to_csv(folderpath_dest, index=None, header=None)
def check_datas(folderpath_origin, folderpath_dest):
    """Fix character-merging problems caused by manual annotation (multiple files in one directory).

    Args:
        :folderpath_origin (str): Folder, directory of the files to process
        :folderpath_dest (str): Folder, directory where the processed data is saved
    """
    filenames = os.listdir(folderpath_origin)
    for filename in filenames:
        if len(filename) != 13:  # only process files whose name is 13 characters long
            continue
        filepath = folderpath_origin + filename
        print(filepath)
        all_data = pd.read_csv(filepath, header=None)
        all_data = all_data.values.tolist()
        new_all_data_list = []
        for data_index in trange(len(all_data)):
            data = all_data[data_index]
            if ((str(data[0]) != 'nan') and (not str(data[0]).isdigit())
                    and (len(data[0]) > 1)):
                # A merged cell: split it into single characters, each tagged 'O'
                for char in data[0]:
                    new_all_data_list.append([char, 'O'])
            else:
                new_all_data_list.append(data)
        sheet = pd.DataFrame(new_all_data_list)
        dest_filename = folderpath_dest + 'G_' + filename
        sheet.to_csv(dest_filename, index=None, header=None)
def merge_data(all_data):
    """Merge tagged data (B, I).

    Args:
        :all_data (list): list of data, all loaded tagged data, including B and I tags
    """
    Merged_Data_List = []  # holds the entity dicts of all records
    print('Merging all adjacent tagged entities:')
    for data_index in trange(len(all_data['data'])):
        data = all_data['data'][data_index]
        item = ''
        data_length = len(data)
        Merged_Data_Dict = {}  # holds all merged entity pairs of this record
        merged_data_list = []  # holds the merged entity dicts
        merged_data_dict = {}  # holds one merged entity
        for index in range(data_length):
            if index == data_length - 1:
                # Last cell: flush whatever has accumulated
                if item == '':
                    merged_data_dict['item'] = data[index]['item']
                    merged_data_dict['marker'] = data[index]['marker']
                    merged_data_list.append(merged_data_dict)
                    merged_data_dict = {}
                else:
                    item += data[index]['item']
                    merged_data_dict['item'] = item
                    merged_data_dict['marker'] = data[index]['marker']
                    merged_data_list.append(merged_data_dict)
                    merged_data_dict = {}
                    item = ''
            elif data[index]['marker'] == 'O':
                merged_data_dict['item'] = data[index]['item']
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
            elif data[index]['marker'] != data[index + 1]['marker'] and item == '':
                # Single-cell entity
                merged_data_dict['item'] = data[index]['item']
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
            elif data[index]['marker'] == data[index + 1]['marker']:
                # Same marker as the next cell: keep accumulating
                item += data[index]['item']
            elif data[index]['marker'] != data[index + 1]['marker'] and item != '':
                # Marker changes: flush the accumulated entity
                item += data[index]['item']
                merged_data_dict['item'] = item
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
                item = ''
        Merged_Data_Dict['data'] = merged_data_list
        Merged_Data_List.append(Merged_Data_Dict)
    return Merged_Data_List
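# A toy input for merge_data, to make the expected shapes concrete (the dict
# layout is inferred from the function above): adjacent cells sharing a
# non-'O' marker are merged into one entity.
toy = {'data': [[
    {'item': '北', 'marker': 'LOC'},
    {'item': '京', 'marker': 'LOC'},
    {'item': '下', 'marker': 'O'},
    {'item': '雨', 'marker': 'O'},
]]}
merged = merge_data(toy)
# merged[0]['data'] -> [{'item': '北京', 'marker': 'LOC'},
#                       {'item': '下', 'marker': 'O'},
#                       {'item': '雨', 'marker': 'O'}]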
def saveData(self, data):
    print('Starting geocoding......')
    pos_dict = {}
    for n in trange(len(data)):
        pos_lon_lat = self.location(data[n])
        pos_dict.update({data[n]: pos_lon_lat})
    json_data = json.dumps(pos_dict, ensure_ascii=False)
    with open(self.filename, 'w', encoding='utf-8') as f:  # save the result as a JSON file
        f.write(json_data)
    print('Geocoding finished, file generated: %s' % self.filename)
    return pos_dict
def marker_data(all_data):
    """Convert tags and split strings.

    Args:
        :all_data (dict): dictionary, all loaded data
    """
    extract_data_lists = []
    print('Converting tags and splitting strings:')
    for item_index in trange(len(all_data['data'])):
        item = all_data['data'][item_index]
        for cell in item:
            if cell['marker'] == 'O':
                extract_data_list = [cell['item'], cell['marker']]
                if len(extract_data_list) == 2:
                    extract_data_lists.append(extract_data_list)
            else:
                marker_len = len(extract_data_lists)
                number_data = ''
                # Split the string into characters, merging consecutive digits
                for index in range(len(cell['item'])):
                    extract_data_list = []
                    # Check for a non-digit or for the last index, guarding
                    # against running past the end of the string
                    if (not str(cell['item'][index]).isdigit()) or (
                            index == len(cell['item']) - 1):
                        marker = 'I-' + cell['marker']
                        extract_data_list.append(cell['item'][index])
                        extract_data_list.append(marker)
                    elif str(cell['item'][index]).isdigit() and str(
                            cell['item'][index + 1]).isdigit():
                        number_data += str(cell['item'][index])
                    elif str(cell['item'][index]).isdigit() and not str(
                            cell['item'][index + 1]).isdigit():
                        number_data += str(cell['item'][index])
                        marker = 'I-' + cell['marker']
                        extract_data_list.append(number_data)
                        extract_data_list.append(marker)
                        number_data = ''
                    if len(extract_data_list) == 2:
                        extract_data_lists.append(extract_data_list)
                # Rewrite the first tag of this entity from I- to B-
                extract_data_lists[marker_len][1] = (
                    'B' + extract_data_lists[marker_len][1][1:])
        extract_data_lists.append('')
    return extract_data_lists
def sort_out_news(func, title_ulr_lists):
    """Organize the fetched event-detail list (title, href, publish time, details).

    Args:
        :func (str): name of the method to call
        :title_ulr_lists (list): list of page URLs to fetch

    Returns:
        :news_data_lists (list): organized event-detail list (title, href, publish time, details)
    """
    print("======>>>Start organizing the details of all news<<<======")
    news_data_lists = []
    news_lists = process_pools(func, title_ulr_lists)
    for news_list_index in trange(len(news_lists)):
        news_data_lists.append(news_lists[news_list_index])
    return news_data_lists
def saveToDB(self, flights):
    airport_dict = {}  # dict of airport geo data
    flight_geo = pd.DataFrame(self.flight_geo.find({}, {'_id': 0}))
    if flight_geo.shape[0] == 0:  # if the collection is empty, start with an empty code list
        airport_code_list = []
    else:
        airport_code_list = flight_geo.airport_code.tolist()
    # Extract the airport-code and departure-airport columns from the cleaned DataFrame
    code_and_address = flights[['airport_code', 'start_airport']]
    code_and_address = code_and_address.drop_duplicates(
        subset='start_airport')  # deduplicate by departure airport
    address_list = code_and_address.start_airport.tolist()  # departure airports as a list
    print('Starting geocoding......')
    invalid_address = []  # airports for which no location could be generated
    address_num = 0  # number of successfully generated locations
    for n in trange(len(address_list)):
        code = code_and_address[code_and_address['start_airport'].isin(
            [address_list[n]])].iat[0, 0]  # look up the airport code
        if code in airport_code_list:  # skip airports already in the database
            continue
        airport_geo = self.convertGeo(address_list[n], code)  # generate the location
        if airport_geo == 0:  # a return value of 0 means geocoding failed
            invalid_address.append(address_list[n])
            continue
        self.flight_geo.insert_one(airport_geo)  # otherwise insert the record into MongoDB
        airport_dict.update({
            address_list[n]: [airport_geo['pos_lon'], airport_geo['pos_lat']]
        })  # add to the geo dict
        address_num += 1  # count one more successful location
    print('Geocoding finished, generated geo data for %d airports.' % address_num)
    airport_json = json.dumps(airport_dict, ensure_ascii=False)  # convert the geo dict to JSON
    with open(self.filename, 'w', encoding='utf-8') as f:  # save to a JSON file
        f.write(airport_json)
    print('Geo data saved to JSON file: %s' % self.filename)
    if len(invalid_address) != 0:  # report any airports that could not be geocoded
        print('The following airports could not be geocoded; please add them manually:\n{}'
              .format(invalid_address))
    return
def save_merged_data(all_pre_data, folderpath_dest):
    """Save the merged data, removing ',' and blank lines.

    Args:
        :all_pre_data (list): list of pre data, the merged data
        :folderpath_dest (str): path of folder, directory where the processed data is saved
    """
    print('Saving all data')
    filename = folderpath_dest + 'all_marked_data.txt'
    with open(filename, 'w') as f:
        for data_index in trange(len(all_pre_data)):
            data = str(all_pre_data[data_index]).replace(',', ' ').replace('\n', '')
            if '。' in list(data):
                f.write(data + '\n')
                f.write('\n')
            elif (len(data) > 1) and (str(data)[0] != ' '):
                f.write(data + '\n')
def encode_text(self, documents: List[str],
                output_np: bool = False) -> Union[torch.Tensor, np.ndarray]:
    self.to(self.params.device)
    # Sort documents by length so each batch needs minimal padding
    length_sorted_idx = np.argsort([len(sen) for sen in documents])
    documents = [documents[idx] for idx in length_sorted_idx]
    encoded_documents = []
    self.eval()
    for start_index in trange(0, len(documents), self.params.batch_size):
        sentences_batch = documents[start_index:start_index + self.params.batch_size]
        encoded_dict = self.params.tokenizer(
            text=sentences_batch,
            add_special_tokens=True,
            padding='longest',
            truncation=True,
            max_length=self.params.sequence_max_len,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')
        input_ids = encoded_dict["input_ids"].to(self.params.device)
        attention_mask = encoded_dict["attention_mask"].to(self.params.device)
        features = EmbeddingsFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        with torch.no_grad():
            embeddings = self.encode(features, parallel_mode=False)
        embeddings = embeddings.detach()
        if output_np:
            embeddings = embeddings.cpu()
        encoded_documents.extend(embeddings)
    # Restore the original document order
    encoded_documents = [
        encoded_documents[idx] for idx in np.argsort(length_sorted_idx)
    ]
    if output_np:
        encoded_documents = np.asarray(
            [embedding.numpy() for embedding in encoded_documents])
        return encoded_documents
    return torch.stack(encoded_documents)
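# The length-sorting trick shared by encode_corpus and encode_text, shown in
# isolation: batch by ascending length to minimize padding, then restore the
# original order by argsort-ing the sort indices (toy stand-in for encoding):
import numpy as np

docs = ["bb", "a", "dddd", "ccc"]
order = np.argsort([len(d) for d in docs])            # shortest first
processed = [docs[i].upper() for i in order]          # stand-in for the encoder
restored = [processed[i] for i in np.argsort(order)]  # back to input order
assert restored == [d.upper() for d in docs]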
def sort_out_href(func, urls):
    """Organize the fetched title list (title, href, publish time).

    Args:
        :func (str): name of the method to call
        :urls (list): list of page URLs to fetch

    Returns:
        :title_ulr_lists (list): organized title list (title, href, publish time)
    """
    print("======>>>Start organizing the hyperlinks of all news<<<======")
    title_ulr_lists = []
    href_lists = process_pools(func, urls)
    for href_list_index in trange(len(href_lists)):
        href_list = href_lists[href_list_index]
        for href in href_list:
            title_ulr_lists.append(href)
    return title_ulr_lists
def save_data(self):
    if self.now_date in self.collection_list:  # check whether today's data is already in the database
        collection = self.db[self.now_date]
        print('The database already has {} records for today; please do not collect them again......'
              .format(collection.count_documents({})))
        return collection
    else:
        collection = self.db[self.now_date]
        data_china = json.loads(
            self.get_data_all()['data'])['areaTree'][0]  # locate the China data
        country = data_china['name']  # country name
        for prov_n in trange(len(data_china['children'])):  # tqdm progress over provinces
            data_province = data_china['children'][prov_n]  # locate the province-level data
            province = data_province['name']  # province name
            for city_n in range(len(data_province['children'])):
                data_city = data_province['children'][city_n]  # locate the city-level data
                city = data_city['name']  # city name
                isupdated = data_city['today']['isUpdated']  # whether updated today
                today_confirm = data_city['today']['confirm']  # confirmed cases today
                total_confirm = data_city['total']['confirm']  # total confirmed
                total_heal = data_city['total']['heal']  # total recovered
                total_dead = data_city['total']['dead']  # total deaths
                pos_lon_lat = self.location(province, city)
                if pos_lon_lat is None:  # if no coordinates are returned, fall back to the provincial capital
                    pos_lon_lat = self.location(province, province)
                item = {
                    'country': country,
                    'province': province,
                    'city': city,
                    'isupdated': isupdated,
                    'today_confirm': today_confirm,
                    'total_confirm': total_confirm,
                    'total_heal': total_heal,
                    'total_dead': total_dead,
                    'pos_lon': pos_lon_lat[0],
                    'pos_lat': pos_lon_lat[1]
                }
                collection.insert_one(item)
        print('\nToday\'s data has been collected: {} cities and regions in total......'
              .format(collection.count_documents({})))
        return
def house_data(self):
    dict_price, dict_position = {}, {}
    for i in trange(len(self.address)):
        url = ('https://restapi.amap.com/v3/geocode/geo?address=' +
               self.address[i] + '&key=' + self.key + '&city=沈阳')
        req = requests.get(url)
        data = json.loads(req.text)
        if data['count'] == '0':  # drop addresses that cannot be geocoded
            continue
        pos = data['geocodes'][0]['location'].split(',')
        if float(pos[0]) == 0 or float(pos[1]) == 0:  # drop zero-valued coordinates
            continue
        pos_lon_lat = [float(pos[0]), float(pos[1])]
        dict_position.update({self.name[i]: pos_lon_lat})
        dict_price.update(
            {self.name[i]: {'pos_lon_lat': pos_lon_lat, 'price': self.price[i]}})
    with open('position.json', 'w', encoding='utf-8') as f:  # write coordinates to a JSON file
        f.write(json.dumps(dict_position, ensure_ascii=False))
    with open('price.json', 'w', encoding='utf-8') as f:  # write prices to a JSON file
        f.write(json.dumps(dict_price, ensure_ascii=False))
def saveData(self):
    """
    For each station name, call the AMap API to get its province, city, and
    coordinates, then save them to the database.
    """
    stations_df = pd.DataFrame(self.station.find({}, {'_id': 0}))
    string = ['站', '火车站']  # suffixes tried in turn: 'station', 'railway station'
    for n in trange(stations_df.shape[0]):
        name = stations_df.iat[n, 0]
        telecode = stations_df.iat[n, 3]
        pinyin = stations_df.iat[n, 4]
        bureau = stations_df.iat[n, 1]
        location = self.amapLocation(name, string[0])
        if location == 'Error':
            location_1 = self.amapLocation(name, string[1])
            if location_1 == 'Error':
                self.writeLog(name)
            else:
                item = {
                    'name': name,
                    'telecode': telecode,
                    'pinyin': pinyin,
                    'province': location_1[0],
                    'city': location_1[1],
                    'bureau': bureau,
                    'lon': location_1[2],
                    'lat': location_1[3]
                }
                self.geo.insert_one(item)
        else:
            item = {
                'name': name,
                'telecode': telecode,
                'pinyin': pinyin,
                'province': location[0],
                'city': location[1],
                'bureau': bureau,
                'lon': location[2],
                'lat': location[3]
            }
            self.geo.insert_one(item)
    return
def update(self):
    with trange(max_gen) as t:
        for i in t:
            for j in range(pop_size):
                # Velocity update
                self.update_vel(self.Part.pop_v[j], self.Part.pop_x[j],
                                self.Part.p_best[j])
                # Position update
                self.update_pos(self.Part.pop_x[j], self.Part.pop_v[j])
                # Update the particle's personal best position
                if fitness(self.Part.pop_x[j], self.mode) > fitness(
                        self.Part.p_best[j], self.mode):
                    self.Part.p_best[j] = self.Part.pop_x[j]
                # Update the swarm's global best position
                if fitness(self.Part.pop_x[j], self.mode) > fitness(
                        self.g_best, self.mode):
                    self.g_best = self.Part.pop_x[j]
            self.result.append(fitness(self.g_best, self.mode))
            t.set_description("Generation: %i" % i)
            t.set_postfix(fitness=fitness(self.g_best, self.mode))
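# update_vel and update_pos are not shown above; a conventional PSO update,
# which this loop appears to rely on, is sketched below. The inertia weight
# `w` and acceleration coefficients `c1`/`c2` are illustrative assumptions,
# not values from the original code.
import random

def update_vel_sketch(v, x, p_best, g_best, w=0.7, c1=1.5, c2=1.5):
    # inertia + cognitive pull toward the personal best + social pull toward the global best
    return [w * vi
            + c1 * random.random() * (pi - xi)
            + c2 * random.random() * (gi - xi)
            for vi, xi, pi, gi in zip(v, x, p_best, g_best)]

def update_pos_sketch(x, v):
    # move each coordinate by its velocity component
    return [xi + vi for xi, vi in zip(x, v)]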
def get_data(folderpath_origin):
    """Load data.

    Args:
        :folderpath_origin (str): Folder, directory of the data files to load
    """
    filenames = os.listdir(folderpath_origin)
    merged_data_list = []
    print('Merging the data from all files')
    for name_index in trange(len(filenames)):
        name = filenames[name_index]
        if str(name)[-5:] == 'w.txt':
            path = folderpath_origin + name
            with open(path, 'r') as f:
                for chunk in f:
                    if str(chunk)[0] != ' ':
                        merged_data_list.append(chunk)
        else:
            print('\nNot a target file; skipping.')
    return merged_data_list
def eval_fn(trainer, env_config, hands) -> dict:
    """Evaluates the current policy under `evaluation_config` settings.

    Note that this default implementation does not do anything beyond
    merging `evaluation_config` with the normal trainer config.
    """
    # Call the `_before_evaluate` hook.
    trainer._before_evaluate()
    # Sync weights to the evaluation WorkerSet.
    trainer._sync_weights_to_workers(worker_set=trainer.evaluation_workers)
    trainer._sync_filters_if_needed(trainer.evaluation_workers)

    if trainer.config["evaluation_num_workers"] == 0:
        for _ in range(trainer.config["evaluation_num_episodes"]):
            trainer.evaluation_workers.local_worker().sample()
    else:
        num_rounds = int(
            math.ceil(trainer.config["evaluation_num_episodes"] /
                      trainer.config["evaluation_num_workers"]))
        num_workers = len(trainer.evaluation_workers.remote_workers())
        num_episodes = num_rounds * num_workers
        # `i` counts episodes in steps of `num_workers`, one step per round
        for i in trange(0, num_episodes, num_workers,
                        unit_scale=num_workers, leave=False):
            update_config()
            logger.info("Running round {} of parallel evaluation "
                        "({}/{} episodes)".format(i // num_workers,
                                                  i + num_workers,
                                                  num_episodes))
            ray.get([
                w.sample.remote()
                for w in trainer.evaluation_workers.remote_workers()
            ])

    metrics = collect_metrics(trainer.evaluation_workers.local_worker(),
                              trainer.evaluation_workers.remote_workers())
    return {"evaluation": metrics}
def eval(model, data_loader, device):
    batch_num = int(data_loader.dev_num / args.dev_batch_size)
    batch_num = batch_num if data_loader.dev_num % args.dev_batch_size == 0 else batch_num + 1
    predict_all = np.array([], dtype=int)
    label_all = np.array([], dtype=int)
    loss_mean = 0
    with torch.no_grad():
        # `step` is not defined before the loop starts, so it cannot appear in `desc`
        for step in trange(batch_num, desc='valid'):
            batch_data = data_loader.get_next_batch(args.dev_batch_size, 'dev')
            batch_text_list, batch_label_list, batch_seg_list, batch_type_list, batch_category_list, \
                batch_a_seg_list, batch_a_tree_list, batch_b_seg_list, batch_b_tree_list = batch_data
            batch_label_ids = torch.tensor(batch_label_list, dtype=torch.long).to(device)
            pred_output = model(batch_text_list, batch_seg_list, batch_type_list,
                                batch_a_seg_list, batch_a_tree_list,
                                batch_b_seg_list, batch_b_tree_list, fine_tune=True)
            logits = pred_output[0]
            loss = criterion(logits.view(-1, label_nums), batch_label_ids.view(-1))
            loss_mean += torch.sum(loss)
            predict = torch.max(logits.data, 1)[1].cpu().numpy()
            label_all = np.append(label_all, batch_label_ids.data.cpu().numpy())
            predict_all = np.append(predict_all, predict)
    acc = metrics.accuracy_score(label_all, predict_all)
    loss_mean /= data_loader.dev_num
    return acc, loss_mean
def tq_data(url, all_data):
    """Wrap the automatic data-processing API.

    Args:
        :url (str): URL, endpoint of the data-processing service
        :all_data (list): List, all loaded data

    Returns:
        :all_pre_data (list): list of all processed data
    """
    all_data_length = len(all_data)
    all_pre_data = []
    for index in trange(all_data_length):
        now_data = all_data[index]
        now_data_list_temp = list(filter(None, now_data.split('。')))
        now_data_list = []
        for temp_data in now_data_list_temp:
            if len(temp_data) > 10:  # keep only sentences longer than 10 characters
                now_data_list.append(temp_data)
        pre_data = make_request(url, now_data_list)
        all_pre_data.append(pre_data)
    return all_pre_data
def merge_all_data(folderpath_origin, folderpath_dest):
    """Merge all annotated data.

    Args:
        :folderpath_origin (str): Folder, path of the annotated data file
        :folderpath_dest (str): Destination, path where the merged data is saved
    """
    print('Merging all annotated data')
    all_data = []
    filename_origin = folderpath_origin + 'all_marked_data.txt'
    with open(filename_origin, 'r') as f:
        for data in f:
            if len(str(data)) > 1:
                all_data.append(data.replace('\n', '').split(' '))
    filename_dest = folderpath_dest + 'all_marked_data.txt'
    with open(filename_dest, 'w') as f:
        data_str = ''
        for data_index in trange(len(all_data)):
            data = all_data[data_index]
            if data[0] == '。' or data_index == len(all_data) - 1:
                data_str = data[0] + '/O' + '\n'
                f.write(data_str)
            elif data[1] == 'O':
                data_str = data[0] + '/O '
                f.write(data_str)
            elif (all_data[data_index + 1][1][0] == 'B') and (
                    data[1][2:] == all_data[data_index + 1][1][2:]):
                data_str = data[0] + '/' + data[1][2:] + ' '
                f.write(data_str)
            elif (all_data[data_index + 1][1][0] != 'B') and (
                    data[1][2:] == all_data[data_index + 1][1][2:]):
                data_str = data[0]
                f.write(data_str)
            elif data[1][2:] != all_data[data_index + 1][1][2:]:
                data_str = data[0] + '/' + data[1][2:] + ' '
                f.write(data_str)
def save_data_dict(folderpath_dest):
    """Save the data dictionary.

    Args:
        :folderpath_dest (str): folder, directory where the file is saved
    """
    print('Splitting the data dictionary')
    filename_origin = folderpath_dest + 'all_marked_data.txt'
    data = pd.read_csv(filename_origin, header=None)
    data = data.values.tolist()
    marked_data_dict = []  # search keys of the tagged data
    data_dict = []  # full entries of the tagged data
    for data_index in trange(len(data)):
        now_data = data[data_index][0]
        data_list = now_data.split(' ')
        for cell in data_list:
            cell_marked = str(cell).split('/')[0]
            if cell_marked not in marked_data_dict:
                marked_data_dict.append(cell_marked)
                data_dict.append(cell)
    sheet = pd.DataFrame(data_dict)
    filename_dest = folderpath_dest + 'dict.txt'
    sheet.to_csv(filename_dest, index=None, header=None)
def calculate_coverage_analytically(self,
                                    sample_size1: int,
                                    sample_size2: int,
                                    proportions: proportions_type,
                                    confidence: float,
                                    z_precision: Union[float, Literal['auto']] = 'auto'):
    """
    Calculates the true coverage of the confidence interval for the difference
    between two proportions produced by the `method` for the given desired
    `confidence`, using an indistinguishably precise approximation of the
    analytical solution.

    The optimal approximation precision is auto-picked for the specific case,
    but can be set manually in `z_precision`. This is a z-value for precision
    instead of p. Meaning, a `z_precision` of 1.96 is 95% precision (which is
    a terrible precision).

    The numbers of trials for the two samples are `sample_size1` and
    `sample_size2`. Two proportions for samples 1 and 2 are taken from the
    list `proportions`, each against each, producing a 2d square matrix of
    results, one value per pair of proportions. This 2d square matrix is
    `coverage`, and is saved to `self.coverage`.
    """
    self.confidence = confidence
    self.proportions = self.form_proportions_list(proportions)
    self.sample_size1 = sample_size1
    self.sample_size2 = sample_size2
    if z_precision == 'auto':
        z_precision = get_binomial_z_precision(confidence)

    if __debug__ is True:
        print(self.f.calculation_inputs() + ",\n"
              f"calculation_method = analytical approximation, " +
              f"z_precision = {z_precision:5.2f}")

    # `n` by `n` zero matrix, where `n` is the number of probabilities (population proportions)
    coverage = np_zeros((len(self.proportions), len(self.proportions)), dtype=longdouble)

    progress_bar_str = "p1={}; p2={} => cov={}%"

    """
    Here we loop through the cartesian square of the list `self.proportions`
    (the cartesian product of the list `self.proportions` with itself).
    But there's no need to loop through the entire "matrix": for each pair
    `(xi, xj)` the same result can be used for `(xj, xi)`. Therefore, only
    the lower-triangular elements of this "matrix" have to be computed.
    """
    t = trange(len(self.proportions), desc=progress_bar_str.format("***", "***", "***"))
    for i in t:
        for j in range(i, len(self.proportions)):
            (prob_x1, prob_x2) = self.proportions[i], self.proportions[j]
            delta = abs(prob_x2 - prob_x1)

            """The entire range of the binomial distribution could be used"""
            #x1_from, x1_to = (0, sample_size)
            #x2_from, x2_to = (0, sample_size)
            """
            But it is too computationally expensive to calculate the CI for the
            `y` value of each `x1` and `x2` of a 2-variate binomial distribution.
            Since most `y` values of the binomial distribution are very close to
            zero, we can use only a small part of the binomial distribution
            around the peak. Such a part of a binomial distribution can often be
            efficiently modeled with a normal distribution.

            Let's say we need to consider the span covering 99.999% of the mass
            of the two-variate binomial distribution. According to the normal
            distribution, this would be a range that spans 4.42 standard
            deviations from the mean on both sides for each of the two
            single-variate binomial distributions from which the two-variate one
            is constructed. The span of 4.42 sigma covers around 99.999% of a
            binomial distribution `Binom(n,p)` for most values of `n` and `p`.

            This would nail it for a 95% CI, but what if a user asks for a
            99.999% CI while we are only considering 99.999% of the binomial
            distribution? We'd need to consider a much more expansive range in
            our calculations. We would need something like this:

            for 95% confidence         => 99.995% of the distribution (4.056 sigma)
            for 99% confidence         => 99.999% of the distribution (4.417 sigma)
            for 99.9% confidence       => 99.9999% of the distribution (4.892 sigma)
            for 99.99% confidence      => 99.999_99% of the distribution (5.327 sigma)
            for significant range of 5 sigma:
            for 99.999_943% confidence => 99.999_999_943% of the distribution (6.199 sigma)
            etc.

            Thus, the precision is to be determined given the `confidence`.
            A specific formula is used to figure out the optimal `z_precision`.
            """
            x1_from, x1_to = binomial_distribution_two_tailed_range(
                n=sample_size1, p=prob_x1, sds=z_precision)
            x2_from, x2_to = binomial_distribution_two_tailed_range(
                n=sample_size2, p=prob_x2, sds=z_precision)
            x1s = range(x1_from, x1_to + 1)
            x2s = range(x2_from, x2_to + 1)

            CIs = [[self.method(x1, self.sample_size1, x2, self.sample_size2,
                                self.confidence)
                    for x2 in x2s] for x1 in x1s]

            # Array of `1`s and `0`s
            # the int constructor could be used, but longdouble is used to provide better precision
            covered = [[longdouble(CIs[i1][i2][0] < delta < CIs[i1][i2][1])
                        for i2 in range(len(CIs[i1]))] for i1 in range(len(CIs))]

            # multiplied by 100 in place for a better progress bar, and for a better figure later
            thiscoverage = 100 * np_sum(
                [covered[i][j] *
                 binomial_distribution_pmf(x1s[i], sample_size1, prob_x1) *
                 binomial_distribution_pmf(x2s[j], sample_size2, prob_x2)
                 for i in range(len(x1s)) for j in range(len(x2s))])

            coverage[i][j] = coverage[j][i] = thiscoverage
            t.set_description(progress_bar_str.format(
                self.f.proportion(prob_x1),
                self.f.proportion(prob_x2),
                self.f.coverage(thiscoverage)))

    self.coverage = coverage
    t.set_description(progress_bar_str.format(
        "*", "*", self.f.coverage(self.average_coverage)))
    print(f"average confidence level {self.f.coverage(self.average_coverage)}")
    print(f"average deviation from {self.f.confidence_percent} = "
          f"{self.f.coverage(self.average_deviation)} (coverage %)")
    print("")
    return self.coverage
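# binomial_distribution_two_tailed_range is defined elsewhere in this module;
# under the normal approximation described in the comments above, a sketch of
# such a helper (an assumption, not the original implementation) could be:
from math import ceil, floor, sqrt

def binomial_two_tailed_range_sketch(n: int, p: float, sds: float):
    # mean +/- `sds` standard deviations of Binom(n, p), clipped to [0, n]
    mean, sd = n * p, sqrt(n * p * (1 - p))
    return max(0, floor(mean - sds * sd)), min(n, ceil(mean + sds * sd))

# e.g. n=100, p=0.5, sds=4.42 gives roughly the range (27, 73)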
import random

import pandas as pd
from LAC import LAC
from tqdm.std import trange

# Use our own trained model
my_lac = LAC(model_path='Model_Code/model/')
my_lac.load_customization('Model_Code/data/dict.txt', sep=None)

data = pd.read_csv('Model_Code/data/G_all_marked_data.txt', header=None)
data = data.values.tolist()
texts = []
for data_index in trange(len(data)):
    line = data[data_index][0]
    texts.append(line)

# Run the model on 10 consecutive lines starting at a random offset
start = random.randint(0, len(data) - 10)
lac_result = my_lac.run(texts[start:start + 10])
sheet = pd.DataFrame(lac_result)
sheet.to_csv('Model_Code/result/result_lac.csv', index=None, header=None)

# Flatten the (words, tags) pairs into [word, tag] rows
items_list = []
for data_index in trange(len(lac_result)):
    word_list = lac_result[data_index][0]
    tag_list = lac_result[data_index][1]
    for word_index in range(len(word_list)):
        items_list.append([word_list[word_index], tag_list[word_index]])
def calculate_coverage_randomly(self,
                                sample_size: int,
                                proportions: proportions_type,
                                confidence: float,
                                n_of_experiments: int = 20000):
    """
    Calculates the true coverage of the confidence interval for a proportion
    produced by the `method` for the given desired `confidence`, using a
    simulation with a number of random experiments (`n_of_experiments`).

    The total number of trials in a sample is `sample_size`. The proportion
    for the sample is taken from the list `proportions`, producing a list of
    results, one value per proportion. This list is `coverage`, and is saved
    to `self.coverage`.
    """
    self.confidence = confidence
    self.proportions = self.form_proportions_list(proportions)
    self.sample_size = sample_size

    if __debug__ is True:
        print(self.f.calculation_inputs() + ",\n"
              f"calculation_method = random simulation, " +
              f"n_of_experiments = {n_of_experiments}")

    coverage = []

    # The return value of this function will be cached (this is not necessary)
    z = normal_z_score_two_tailed(p=confidence)

    progress_bar_str = "p={} => cov={}%"

    t = trange(len(self.proportions), desc=progress_bar_str.format("***", "***"))
    for i in t:
        prob = self.proportions[i]

        x = binomial_experiment(sample_size, prob, n_of_experiments)
        CIs = [
            self.method(x[j], sample_size, confidence)
            for j in range(0, n_of_experiments)
        ]
        covered = [int(CI[0] < prob < CI[1]) for CI in CIs]
        # multiplied by 100 in place for a better progress bar, and for a better figure later
        thiscoverage = (sum(covered) / n_of_experiments) * 100

        coverage.append(thiscoverage)
        t.set_description(
            progress_bar_str.format(self.f.proportion(prob),
                                    self.f.coverage(thiscoverage)))

    self.coverage = coverage
    # `progress_bar_str` has only two placeholders, so only two arguments are passed here
    t.set_description(
        progress_bar_str.format("*", self.f.coverage(self.average_coverage)))
    print(f"average confidence level {self.f.coverage(self.average_coverage)}")
    print(f"average deviation from {self.f.confidence_percent} = "
          f"{self.f.coverage(self.average_deviation)} (coverage %)")
    print("")
    return self.coverage
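# binomial_experiment is defined elsewhere; a sketch consistent with how it is
# used above (one success count per simulated experiment) could be:
import numpy as np

def binomial_experiment_sketch(sample_size: int, prob: float, n_of_experiments: int):
    # draw the number of "successes" out of `sample_size` trials, per experiment
    return np.random.binomial(sample_size, prob, size=n_of_experiments)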
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = BertAdam(optimizer_grouped_parameters,
#                      lr=args.learning_rate,
#                      warmup=args.warmup_proportion,
#                      t_total=num_train_optimization_steps)

global_step = 0
best_dev_acc = 0.0
for epoch in trange(int(args.train_epoch), desc='Epoch'):
    model.train()
    batch_num = int(data_loader.train_num / args.train_batch_size)
    batch_num = batch_num if data_loader.train_num % args.train_batch_size == 0 else batch_num + 1
    train_loss = 0
    # `step` is not defined before the loop starts, so it cannot appear in `desc`
    for step in trange(batch_num, desc='Training'):
        logits, loss = predict(model, data_loader, device, is_train=True)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
        if global_step % 100 == 0:
            dev_acc, loss = eval(model, data_loader, device)
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
prj.close()

## Read the data
resolution, edgeLength, shore, hydrology, cells, Ts = SaveFile.readDataModel(
    inputFile
)

realShape = shore.realShape

with shapefile.Writer(outputFile, shapeType=3) as w:
    # The only relevant field for rivers
    w.field('flow', 'F')

    # This loop adds rivers in the same way that they were created
    #for node in hydrology.allMouthNodes():
    for nidx in trange(len(hydrology)):
        node = hydrology.node(nidx)
        leaves = hydrology.allLeaves(node.id)
        for leafNode in leaves:
            # Path from the leaf to the sea
            path = hydrology.pathToNode(node.id, leafNode.id)
            path.reverse()

            # The flow of the river is the flow at the base of
            # its path to the sea, unless this stretch of the
            # river doesn't flow all the way to the sea
            riverFlow = path[len(path) - 1].flow
            for ni in range(1, len(path)):
                upstreamFlow = max([n.flow for n in hydrology.upstream(path[ni].id)])
                # If this river is merely a tributary to a larger
def calculate_coverage_analytically(
        self,
        sample_size: int,
        proportions: proportions_type,
        confidence: float,
        z_precision: Union[float, Literal['auto']] = 'auto'):
    """
    Calculates the true coverage of the confidence interval for a proportion
    produced by the `method` for the given desired `confidence`, using an
    indistinguishably precise approximation of the analytical solution.

    The optimal approximation precision is auto-picked for the specific case,
    but can be set manually in `z_precision`. This is a z-value for precision
    instead of p. Meaning, a `z_precision` of 1.96 is 95% precision (which is
    a terrible precision). See the comments below for the actual meaning of
    `z_precision`.

    The total number of trials in a sample is `sample_size`. The proportion
    for the sample is taken from the list `proportions`, producing a list of
    results, one value per proportion. This list is `coverage`, and is saved
    to `self.coverage`.
    """
    self.confidence = confidence
    self.proportions = self.form_proportions_list(proportions)
    self.sample_size = sample_size
    if z_precision == 'auto':
        z_precision = get_binomial_z_precision(confidence)

    if __debug__ is True:
        print(self.f.calculation_inputs() + ",\n"
              f"calculation_method = analytical approximation, " +
              f"z_precision = {z_precision:5.2f}")

    coverage = []

    # The return value of this function will be cached (this is not necessary)
    z = normal_z_score_two_tailed(p=confidence)

    progress_bar_str = "p={} => cov={}%"

    t = trange(len(self.proportions), desc=progress_bar_str.format("***", "***"))
    for i in t:
        prob = self.proportions[i]

        """The entire range of the binomial distribution could be used"""
        #x_from, x_to = (0, sample_size)
        """
        But it is too computationally expensive to calculate the CI for the
        `y` value of each `x` of a binomial distribution. Since most `y`
        values of the binomial distribution are very close to zero, we can use
        only a small part of the binomial distribution around the peak. Such a
        part of a binomial distribution can often be efficiently modeled with
        a normal distribution.

        Let's say we want to consider the span covering 99.999% of the mass of
        the binomial distribution. According to the normal distribution, this
        would be a range that spans 4.42 standard deviations from the mean on
        both sides. The span of 4.42 sigma covers around 99.999% of a binomial
        distribution `Binom(n,p)` for most values of `n` and `p`.

        This would nail it for a 95% CI, but what if a user asks for a 99.999%
        CI while we are only considering 99.999% of the binomial distribution?
        We'd need to consider a much more expansive range in our calculations.
        We would need something like this:

        for 95% confidence         => 99.995% of the distribution (4.056 sigma)
        for 99% confidence         => 99.999% of the distribution (4.417 sigma)
        for 99.9% confidence       => 99.9999% of the distribution (4.892 sigma)
        for 99.99% confidence      => 99.999_99% of the distribution (5.327 sigma)
        for significant range of 5 sigma:
        for 99.999_943% confidence => 99.999_999_943% of the distribution (6.199 sigma)
        etc.

        Thus, the precision is to be determined given the `confidence`.
        A specific formula is used to figure out the optimal `z_precision`.
        """
        x_from, x_to = binomial_distribution_two_tailed_range(
            n=sample_size, p=prob, sds=z_precision)
        xs = range(x_from, x_to + 1)

        CIs = [self.method(x, sample_size, confidence) for x in xs]
        # Array of `1`s and `0`s
        # the int constructor could be used, but longdouble is used to provide better precision
        covered = [longdouble(CI[0] < prob < CI[1]) for CI in CIs]
        # multiplied by 100 in place for a better progress bar, and for a better figure later
        thiscoverage = sum([
            covered[i] * binomial_distribution.pmf(xs[i], sample_size, prob)
            for i in range(len(xs))
        ]) * 100

        coverage.append(thiscoverage)
        t.set_description(
            progress_bar_str.format(self.f.proportion(prob),
                                    self.f.coverage(thiscoverage)))

    self.coverage = coverage
    # `progress_bar_str` has only two placeholders, so only two arguments are passed here
    t.set_description(
        progress_bar_str.format("*", self.f.coverage(self.average_coverage)))
    print(f"average confidence level {self.f.coverage(self.average_coverage)}")
    print(f"average deviation from {self.f.confidence_percent} = "
          f"{self.f.coverage(self.average_deviation)} (coverage %)")
    print("")
    return self.coverage
def calculate_coverage_randomly(self,
                                sample_size1: int,
                                sample_size2: int,
                                proportions: proportions_type,
                                confidence: float,
                                n_of_experiments: int = 10000):
    """
    Calculates the true coverage of the confidence interval for the difference
    between two proportions produced by the `method` for the given desired
    `confidence`, using a simulation with a number of random experiments
    (`n_of_experiments`).

    The numbers of trials for the two samples are `sample_size1` and
    `sample_size2`. Two proportions for samples 1 and 2 are taken from the
    list `proportions`, each against each, producing a 2d square matrix of
    results, one value per pair of proportions. This 2d square matrix is
    `coverage`, and is saved to `self.coverage`.
    """
    self.confidence = confidence
    self.proportions = self.form_proportions_list(proportions)
    self.sample_size1 = sample_size1
    self.sample_size2 = sample_size2

    if __debug__ is True:
        print(self.f.calculation_inputs() + ",\n"
              f"calculation_method = random simulation, " +
              f"n_of_experiments = {n_of_experiments}")

    # n by n zero matrix, where n is the number of tested probabilities (actual population proportions)
    coverage = np_zeros((len(self.proportions), len(self.proportions)), dtype=longdouble)

    # The return value of this function will be cached (this is not necessary)
    z = normal_z_score_two_tailed(p=confidence)

    progress_bar_str = "p1={}; p2={} => cov={}%"

    """
    Here we loop through the cartesian square of the list `self.proportions`
    (the cartesian product of the list `self.proportions` with itself).
    But there's no need to loop through the entire "matrix": for each pair
    `(xi, xj)` the same result can be used for `(xj, xi)`. Therefore, only
    the lower-triangular elements of this "matrix" have to be computed.
    """
    t = trange(len(self.proportions), desc=progress_bar_str.format("***", "***", "***"))
    for i in t:
        for j in range(i, len(self.proportions)):
            (prob_x1, prob_x2) = self.proportions[i], self.proportions[j]
            delta = abs(prob_x2 - prob_x1)

            x1 = binomial_experiment(sample_size1, prob_x1, n_of_experiments)
            x2 = binomial_experiment(sample_size2, prob_x2, n_of_experiments)
            CIs = [self.method(x1[k], sample_size1, x2[k], sample_size2, confidence)
                   for k in range(0, n_of_experiments)]
            covered = [int(CI[0] < delta < CI[1]) for CI in CIs]
            # multiplied by 100 in place for a better progress bar, and for a better figure later
            thiscoverage = (sum(covered) / n_of_experiments) * 100

            coverage[i][j] = coverage[j][i] = thiscoverage
            t.set_description(progress_bar_str.format(
                self.f.proportion(prob_x1),
                self.f.proportion(prob_x2),
                self.f.coverage(thiscoverage)))

    self.coverage = coverage
    t.set_description(progress_bar_str.format(
        "*", "*", self.f.coverage(self.average_coverage)))
    print(f"average confidence level {self.f.coverage(self.average_coverage)}")
    print(f"average deviation from {self.f.confidence_percent} = "
          f"{self.f.coverage(self.average_deviation)} (coverage %)")
    print("")
    return self.coverage