def encode_corpus(self,
                   documents: List[str]) -> Union[torch.Tensor, np.ndarray]:
     length_sorted_idx = np.argsort([len(sen) for sen in documents])
     documents = [documents[idx] for idx in length_sorted_idx]
     encoded_documents = []
     for start_index in trange(0, len(documents), self.params.batch_size):
         sentences_batch = documents[start_index:start_index +
                                     self.params.batch_size]
         encoded_dict = self.params.tokenizer(
             text=sentences_batch,
             add_special_tokens=True,
             padding='longest',
             truncation=True,
             max_length=self.params.sequence_max_len,
             return_attention_mask=True,
             return_token_type_ids=False,
             return_tensors='np')
         inputs = {
             'input_ids': encoded_dict["input_ids"].reshape(1, -1),
             'attention_mask':
             encoded_dict["attention_mask"].reshape(1, -1),
         }
         output = self.session.run(None, inputs)
         embeddings = output[0]
         encoded_documents.extend(embeddings)
     encoded_documents = [
         encoded_documents[idx] for idx in np.argsort(length_sorted_idx)
     ]
     return encoded_documents
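Note: the length-sorting above keeps padding per batch small, and the original order is restored with the inverse permutation. A minimal, self-contained sketch of that pattern (hypothetical data, not part of the original class):

import numpy as np

documents = ["a fairly long sentence", "short", "a medium one"]
order = np.argsort([len(d) for d in documents])        # shortest first
sorted_docs = [documents[i] for i in order]
results = [d.upper() for d in sorted_docs]             # stand-in for the batched encoding
restored = [results[i] for i in np.argsort(order)]     # inverse permutation restores order
assert restored == [d.upper() for d in documents]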
Example #2
def get_data(folderpath_origin):
    """加载所有待处理文件

    Args:
        :folderpath_origin (str): Folder,待加载的文件所在目录

    Returns:
        :new_all_data (list): List of all files,将所有的待处理数据加载到列表中
    """

    filename = os.listdir(folderpath_origin)
    all_data = []
    for name in filename:
        path = folderpath_origin + name
        if path[-5:] == '.xlsx':
            data = pd.read_excel(path)
            abstracts = data['摘要'].to_list()
            for abstract in abstracts:
                all_data.append(abstract)
        elif path[-4:] == '.csv':
            data = pd.read_csv(path)
            detaileds = data['详细'].to_list()
            for detailed in detaileds:
                all_data.append(detailed)
    new_all_data = []
    print('Starting de-duplication!!!')
    for data_index in trange(len(all_data)):
        data = all_data[data_index]
        if data not in new_all_data:
            new_all_data.append(data)
    return new_all_data
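The `data not in new_all_data` check above scans a growing list, so the de-duplication is quadratic; a hedged alternative with the same order-preserving behaviour (assuming the entries are hashable strings):

def dedupe_keep_order(items):
    # Sketch only: O(n) order-preserving de-duplication using a set.
    seen = set()
    out = []
    for item in items:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out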
Example #3
def save_mean_data(data, amount, folderpath_dest):
    """将已识别的数据按照小组人数进行切分,方便后续继续进行核查

    Args:
        :data (list): list of data,所有已识别数据列表
        :amount (int): amount to people,均分的次数
        :folderpath_dest (str): path of folder,已识别数据待保存文件目录的路径
    """

    all_data_length = len(data)
    mean_data_length = all_data_length // amount
    start = 0
    end = mean_data_length
    print('Splitting all data evenly')
    for index in trange(amount):
        while len(data[end - 1]) != 0:
            end += 1
        else:
            sheet = pd.DataFrame(data[start:end - 1])
            filename = folderpath_dest + 'all_extract_part_' + str(
                index) + '.txt'
            sheet.to_csv(filename, index=None, header=None)
            start = end
            end += mean_data_length
            if end > all_data_length:
                end = all_data_length
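For reference, the plain even-split idea behind save_mean_data (without the adjustment to the next empty entry that the function above performs) can be sketched as:

def split_evenly(items, amount):
    # Sketch only: cut `items` into `amount` roughly equal consecutive parts.
    size = len(items) // amount
    parts = [items[i * size:(i + 1) * size] for i in range(amount - 1)]
    parts.append(items[(amount - 1) * size:])  # last part takes the remainder
    return parts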
Example #4
def check_data(folderpath_origin, folderpath_dest):
    """处理因人工标注造成的字符合并问题(单个文件)

    Args:
        : folderpath_origin(str): Folder,待处理文件路径
        : folderpath_dest(str): Folder,已处理数据保存路径
    """

    all_data = pd.read_csv(folderpath_origin, header=None)
    all_data = all_data.values.tolist()
    new_all_data_list = []
    for data_index in trange(len(all_data)):
        data = all_data[data_index]
        if ((str(data[0]) != 'nan') and (not str(data[0]).isdigit())
                and (len(data[0]) > 1)):
            for i in range(len(data[0])):
                new_data_list = []
                new_data_list.append(data[0][i])
                new_data_list.append('O')
                new_all_data_list.append(new_data_list)
        else:
            new_all_data_list.append(data)

    sheet = pd.DataFrame(new_all_data_list)
    filename = folderpath_dest
    sheet.to_csv(filename, index=None, header=None)
Example #5
def check_datas(folderpath_origin, folderpath_dest):
    """处理因人工标注造成的字符合并问题(多个文件处于同一目录下)

    Args:
        : folderpath_origin(str): Folder,待处理文件目录路径
        : folderpath_dest(str): Folder,已处理数据保存目录路径
    """

    filenames = os.listdir(folderpath_origin)

    for filename in filenames:
        if len(filename) == 13:
            filename = folderpath_origin + filename
            print(filename)
            all_data = pd.read_csv(filename, header=None)
            all_data = all_data.values.tolist()
            new_all_data_list = []
            for data_index in trange(len(all_data)):
                data = all_data[data_index]
                if ((str(data[0]) != 'nan') and (not str(data[0]).isdigit())
                        and (len(data[0]) > 1)):
                    for i in range(len(data[0])):
                        new_data_list = []
                        new_data_list.append(data[0][i])
                        new_data_list.append('O')
                        new_all_data_list.append(new_data_list)
                else:
                    new_all_data_list.append(data)
            sheet = pd.DataFrame(new_all_data_list)
            filename = folderpath_dest + 'G_' + filename[-13:]
            sheet.to_csv(filename, index=None, header=None)
Example #6
def merge_data(all_data):
    """合并标记数据(B、I)

    Args:
        :all_data (list): list of data,加载的所有标记数据列表,包含B、I
    """

    Merged_Data_List = []  # holds all entity dicts
    print('Merging all adjacently tagged entities:')
    for data_index in trange(len(all_data['data'])):
        data = all_data['data'][data_index]
        item = ''
        data_length = len(data)
        Merged_Data_Dict = {}  # holds all merged entity pairs
        merged_data_list = []  # holds the merged entity dicts
        merged_data_dict = {}  # holds one merged entity object
        for index in range(data_length):
            if index == data_length - 1:
                if item == '':
                    merged_data_dict['item'] = data[index]['item']
                    merged_data_dict['marker'] = data[index]['marker']
                    merged_data_list.append(merged_data_dict)
                    merged_data_dict = {}
                else:
                    item += data[index]['item']
                    merged_data_dict['item'] = item
                    merged_data_dict['marker'] = data[index]['marker']
                    merged_data_list.append(merged_data_dict)
                    merged_data_dict = {}
                    item = ''
            elif data[index]['marker'] == 'O':
                merged_data_dict['item'] = data[index]['item']
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
            elif data[index]['marker'] != data[index +
                                               1]['marker'] and item == '':
                merged_data_dict['item'] = data[index]['item']
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
            elif data[index]['marker'] == data[index + 1]['marker']:
                item += data[index]['item']
            elif data[index]['marker'] != data[index +
                                               1]['marker'] and item != '':
                item += data[index]['item']
                merged_data_dict['item'] = item
                merged_data_dict['marker'] = data[index]['marker']
                merged_data_list.append(merged_data_dict)
                merged_data_dict = {}
                item = ''
        Merged_Data_Dict['data'] = merged_data_list
        Merged_Data_List.append(Merged_Data_Dict)
    return Merged_Data_List
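For illustration only, a hypothetical input and the result the merging above would produce for it (markers other than 'O' are entity labels; 'O' items are kept as single characters):

# Hypothetical data, not taken from the project:
data = [{'item': '北', 'marker': 'LOC'},
        {'item': '京', 'marker': 'LOC'},
        {'item': '下', 'marker': 'O'},
        {'item': '雨', 'marker': 'O'}]
# Expected merged_data_list:
# [{'item': '北京', 'marker': 'LOC'},
#  {'item': '下', 'marker': 'O'},
#  {'item': '雨', 'marker': 'O'}]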
Example #7
 def saveData(self, data):
     print('Starting geocoding......')
     pos_dict = {}
     for n in trange(len(data)):
         pos_lon_lat = self.location(data[n])
         pos_dict.update({data[n]: pos_lon_lat})
     json_data = json.dumps(pos_dict, ensure_ascii=False)
     with open(self.filename, 'w', encoding='utf-8') as f:  # save the result as a JSON file
         f.write(json_data)
     print('Geocoding finished, generated file: %s' % self.filename)
     return pos_dict
Example #8
def marker_data(all_data):
    """转换标记,并拆分字符串

    Args:
        :all_data (dict): dictionary,加载的所有数据字典

    """
    extract_data_lists = []
    print('Converting tags and splitting strings:')
    for item_index in trange(len(all_data['data'])):
        item = all_data['data'][item_index]
        for cell in item:
            if cell['marker'] == 'O':
                extract_data_list = []
                extract_data_list.append(cell['item'])
                extract_data_list.append(cell['marker'])
                number_data = ''
                if len(extract_data_list) == 2:
                    extract_data_lists.append(extract_data_list)
            else:
                marker_len = len(extract_data_lists)
                number_data = ''
                for index in range(0, len(cell['item'])):  # split the string and merge runs of digits
                    extract_data_list = []
                    # check for the start of the string and bound the index so it stays within the string
                    if (str(cell['item'][index]).isdigit() is
                            False) or (index == len(cell['item']) - 1):
                        marker = 'I-' + cell['marker']
                        extract_data_list.append(cell['item'][index])
                        extract_data_list.append(marker)
                    elif (str(cell['item'][index]).isdigit()) and (str(
                            cell['item'][index + 1]).isdigit()):
                        number_data += str(cell['item'][index])
                    elif str(cell['item'][index]).isdigit() and (str(
                            cell['item'][index + 1]).isdigit() is False):
                        number_data += str(cell['item'][index])
                        marker = 'I-' + cell['marker']
                        extract_data_list.append(number_data)
                        extract_data_list.append(marker)
                        number_data = ''
                    if len(extract_data_list) == 2:
                        extract_data_lists.append(extract_data_list)
                extract_data_lists[marker_len][1] = (
                    'B' + extract_data_lists[marker_len][1][1:])
        extract_data_lists.append('')
    return extract_data_lists
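Again for illustration only, a hypothetical cell and the rows the splitting above would emit for it (consecutive digits are merged into one token, and the first row is switched to the B- prefix):

# Hypothetical cell, not taken from the project:
cell = {'item': '12号楼', 'marker': 'LOC'}
# Expected rows appended to extract_data_lists:
# [['12', 'B-LOC'], ['号', 'I-LOC'], ['楼', 'I-LOC']]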
Example #9
def sort_out_news(func, title_ulr_lists):
    """整理已经获取到的事件详细 list (包含标题、href、发布时间、详细信息)

    Args:
        :func (str): 将要调用的方法名称
        :urls (list): 待获取网页 url 列表
        
    Returns:
        :news_data_lists (list): 已整理好的事件详细 list (包含标题、href、发布时间、详细信息)
    """

    print("======>>>开始整理所有新闻的详细信息<<<======")
    news_data_lists = []
    news_lists = process_pools(func, title_ulr_lists)
    for news_list_index in trange(len(news_lists)):
        news_data_lists.append(news_lists[news_list_index])
    return news_data_lists
Example #10
    def saveToDB(self, flights):
        airport_dict = {}  # dict of airport geo data
        flight_geo = pd.DataFrame(self.flight_geo.find({}, {'_id': 0}))
        if flight_geo.shape[0] == 0:  # check whether the collection is empty; if not, convert its codes to a list
            airport_code_list = []
        else:
            airport_code_list = flight_geo.airport_code.tolist()

        code_and_address = flights[['airport_code', 'start_airport'
                                    ]]  # extract the airport-code and departure-airport columns from the cleaned DataFrame
        code_and_address = code_and_address.drop_duplicates(
            subset='start_airport')  # deduplicate by departure airport
        address_list = code_and_address.start_airport.tolist()  # convert departure airports to a list
        print('Starting geocoding......')
        invalid_address = []  # airport addresses that could not be geocoded
        address_num = 0  # number of successfully geocoded airports, starts at 0
        for n in trange(len(address_list)):
            code = code_and_address[code_and_address['start_airport'].isin(
                [address_list[n]])].iat[0, 0]  # extract the airport code
            if code in airport_code_list:  # skip if this airport is already in the database
                continue
            else:
                airport_geo = self.convertGeo(address_list[n],
                                              code)  # call convertGeo to generate the location
                if airport_geo == 0:  # if the return value is 0, record the address as not geocodable
                    invalid_address.append(address_list[n])
                    continue
                else:
                    self.flight_geo.insert_one(
                        airport_geo)  # otherwise insert it into MongoDB
                    airport_dict.update({
                        address_list[n]:
                        [airport_geo['pos_lon'], airport_geo['pos_lat']]
                    })  # add it to the geo-data dict
                    address_num += 1  # increment the success count
        print('Geocoding finished, generated geo data for %d airports.' % address_num)

        airport_json = json.dumps(airport_dict,
                                  ensure_ascii=False)  # convert the geo dict to JSON
        with open(self.filename, 'w', encoding='utf-8') as f:  # write to a JSON file
            f.write(airport_json)
        print('Geo data saved to JSON file: %s' % self.filename)
        if len(invalid_address) != 0:  # print any addresses that could not be geocoded
            print('The following airports could not be geocoded, please add them manually:\n{}'.format(invalid_address))
        return
Example #11
def save_merged_data(all_pre_data, folderpath_dest):
    """保存合并后的数据,并去除','、空格(行)

    Args:
        :all_pre_data (list): list of pre data,合并后的数据列表
        :folderpath_dest (str): path of folder,处理好的数据保存目录
    """

    print('Saving all data')
    filename = folderpath_dest + 'all_marked_data.txt'
    with open(filename, 'w') as f:
        for data_index in trange(len(all_pre_data)):
            data = str(all_pre_data[data_index]).replace(',', ' ').replace('\n', '')
            if '。' in list(data):
                f.write(data + '\n')
                f.write('\n')
            elif (len(data) > 1) and (str(data)[0] != ' '):
                f.write(data + '\n')
Example #12
    def encode_text(self,
                    documents: List[str],
                    output_np: bool = False) -> Union[torch.Tensor, np.ndarray]:
        self.to(self.params.device)
        length_sorted_idx = np.argsort([len(sen) for sen in documents])
        documents = [documents[idx] for idx in length_sorted_idx]
        encoded_documents = []
        self.eval()
        for start_index in trange(0, len(documents), self.params.batch_size):
            sentences_batch = documents[start_index:start_index +
                                        self.params.batch_size]
            encoded_dict = self.params.tokenizer(
                text=sentences_batch,
                add_special_tokens=True,
                padding='longest',
                truncation=True,
                max_length=self.params.sequence_max_len,
                return_attention_mask=True,
                return_token_type_ids=False,
                return_tensors='pt')
            input_ids = encoded_dict["input_ids"].to(self.params.device)
            attention_mask = encoded_dict["attention_mask"].to(
                self.params.device)
            features = EmbeddingsFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            with torch.no_grad():
                embeddings = self.encode(features, parallel_mode=False)
            embeddings = embeddings.detach()

            if output_np:
                embeddings = embeddings.cpu()

            encoded_documents.extend(embeddings)
        encoded_documents = [
            encoded_documents[idx] for idx in np.argsort(length_sorted_idx)
        ]

        if output_np:
            encoded_documents = np.asarray(
                [embedding.numpy() for embedding in encoded_documents])
            return encoded_documents
        return torch.stack(encoded_documents)
Example #13
def sort_out_href(func, urls):
    """整理已经获取到的标题 list (包含标题、href、发布时间)

    Args:
        :func (str): 将要调用的方法名称
        :urls (list): 待获取网页 url 列表
        
    Returns:
        :title_ulr_lists (list): 已整理好的标题 list (包含标题、href、发布时间)
    """

    print("======>>>开始整理所有新闻的超链接地址<<<======")
    title_ulr_lists = []
    href_lists = process_pools(func, urls)
    for href_list_index in trange(len(href_lists)):
        href_list = href_lists[href_list_index]
        for href in href_list:
            title_ulr_lists.append(href)
    return title_ulr_lists
Example #14
    def save_data(self):
        if self.now_date in self.collection_list:  # check whether today's data is already in the database
            collection = self.db[self.now_date]
            print('The region database already has {} records for today, please do not collect again......'.format(
                collection.count_documents({})))
            return collection
        else:
            collection = self.db[self.now_date]

        data_china = json.loads(
            self.get_data_all()['data'])['areaTree'][0]  # locate the China data
        country = data_china['name']  # country name
        for prov_n in trange(len(data_china['children'])):  # use tqdm to show progress
            data_province = data_china['children'][prov_n]  # province-level data
            province = data_province['name']  # province name
            for city_n in range(len(data_province['children'])):
                data_city = data_province['children'][city_n]  # city-level data
                city = data_city['name']  # city name
                isupdated = data_city['today']['isUpdated']  # whether updated today
                today_confirm = data_city['today']['confirm']  # confirmed cases today
                total_confirm = data_city['total']['confirm']  # total confirmed
                total_heal = data_city['total']['heal']  # total healed
                total_dead = data_city['total']['dead']  # total deaths
                pos_lon_lat = self.location(province, city)
                if pos_lon_lat is None:  # if no coordinates are returned, fall back to the provincial capital
                    pos_lon_lat = self.location(province, province)
                item = {
                    'country': country,
                    'province': province,
                    'city': city,
                    'isupdated': isupdated,
                    'today_confirm': today_confirm,
                    'total_confirm': total_confirm,
                    'total_heal': total_heal,
                    'total_dead': total_dead,
                    'pos_lon': pos_lon_lat[0],
                    'pos_lat': pos_lon_lat[1]
                }
                collection.insert_one(item)
        print("\nToday's data has been collected, covering {} cities and regions......".format(
            collection.count_documents({})))
        return
Example #15
 def house_data(self):
     dict_price, dict_position = {}, {}
     for i in trange(len(self.address)):
         url = 'https://restapi.amap.com/v3/geocode/geo?address=' + \
               self.address[i] + '&key=' + self.key + '&city=沈阳'
         req = requests.get(url)
         data = json.loads(req.text)
         if data['count'] == '0':        # skip entries that cannot be geocoded
             continue
         pos = data['geocodes'][0]['location'].split(',')
         if float(pos[0]) == 0 or float(pos[1]) == 0:        # skip entries whose coordinates are 0
             continue
         pos_lon_lat = [float(pos[0]), float(pos[1])]
         dict_position.update({self.name[i]: pos_lon_lat})
         dict_price.update(
             {self.name[i]: {'pos_lon_lat': pos_lon_lat, 'price': self.price[i]}})
     with open('position.json', 'w', encoding='utf-8') as f:     # write the coordinates to a JSON file
         f.write(json.dumps(dict_position, ensure_ascii=False))
     with open('price.json', 'w', encoding='utf-8') as f:        # write the prices to a JSON file
         f.write(json.dumps(dict_price, ensure_ascii=False))
Example #16
 def saveData(self):
     """
     根据车站名称调用高德地图API获得省、市、经纬度信息,保存到数据库中
     """
     stations_df = pd.DataFrame(self.station.find({}, {'_id': 0}))
     string = ['站', '火车站']
     for n in trange(stations_df.shape[0]):
         name = stations_df.iat[n, 0]
         telecode = stations_df.iat[n, 3]
         pinyin = stations_df.iat[n, 4]
         bureau = stations_df.iat[n, 1]
         location = self.amapLocation(name, string[0])
         if location == 'Error':
             location_1 = self.amapLocation(name, string[1])
             if location_1 == 'Error':
                 self.writeLog(name)
             else:
                 item = {
                     'name': name,
                     'telecode': telecode,
                     'pinyin': pinyin,
                     'province': location_1[0],
                     'city': location_1[1],
                     'bureau': bureau,
                     'lon': location_1[2],
                     'lat': location_1[3]
                 }
                 self.geo.insert_one(item)
         else:
             item = {
                 'name': name,
                 'telecode': telecode,
                 'pinyin': pinyin,
                 'province': location[0],
                 'city': location[1],
                 'bureau': bureau,
                 'lon': location[2],
                 'lat': location[3]
             }
             self.geo.insert_one(item)
     return
Example #17
File: pso.py Project: noob3004/c
    def update(self):
        with trange(max_gen) as t:
            for i in t:
                for j in range(pop_size):
                    # velocity update
                    self.update_vel(self.Part.pop_v[j], self.Part.pop_x[j],
                                    self.Part.p_best[j])
                    # position update
                    self.update_pos(self.Part.pop_x[j], self.Part.pop_v[j])

                    # update the particle's personal best position
                    if fitness(self.Part.pop_x[j], self.mode) > fitness(
                            self.Part.p_best[j], self.mode):
                        self.Part.p_best[j] = self.Part.pop_x[j]
                    # update the swarm's global best position
                    if fitness(self.Part.pop_x[j], self.mode) > fitness(
                            self.g_best, self.mode):
                        self.g_best = self.Part.pop_x[j]
                self.result.append(fitness(self.g_best, self.mode))
                t.set_description("Generation: %i" % i)
                t.set_postfix(fitness=fitness(self.g_best, self.mode))
Example #18
def get_data(folderpath_origin):
    """加载数据

    Args:
        :folderpath_origin (str): Folder,待加载数据文件目录
    """

    filename = os.listdir(folderpath_origin)
    merged_data_list = []
    print('Merging data from all files')
    for name_index in trange(len(filename)):
        name = filename[name_index]
        if str(name)[-5:] == 'w.txt':
            path = folderpath_origin + name
            with open(path, 'r') as f:
                for chunk in f:
                    if str(chunk)[0] != ' ':
                        merged_data_list.append(chunk)
        else:
            print('\nNot a target file, skipping it.')
    return merged_data_list
Example #19
def eval_fn(trainer, env_config, hands) -> dict:
    """Evaluates current policy under `evaluation_config` settings.

    Note that this default implementation does not do anything beyond
    merging evaluation_config with the normal trainer config.
    """
    # Call the `_before_evaluate` hook.
    trainer._before_evaluate()
    # Sync weights to the evaluation WorkerSet.
    trainer._sync_weights_to_workers(worker_set=trainer.evaluation_workers)
    trainer._sync_filters_if_needed(trainer.evaluation_workers)

    if trainer.config["evaluation_num_workers"] == 0:
        for _ in range(trainer.config["evaluation_num_episodes"]):
            trainer.evaluation_workers.local_worker().sample()
    else:
        num_rounds = int(
            math.ceil(trainer.config["evaluation_num_episodes"] /
                      trainer.config["evaluation_num_workers"]))
        num_workers = len(trainer.evaluation_workers.remote_workers())
        num_episodes = num_rounds * num_workers

        for i in trange(0,
                        num_episodes,
                        num_workers,
                        unit_scale=num_workers,
                        leave=False):
            update_config()
            logger.info("Running round {} of parallel evaluation "
                        "({}/{} episodes)".format(i, (i + 1) * num_workers,
                                                  num_episodes))

            ray.get([
                w.sample.remote()
                for w in trainer.evaluation_workers.remote_workers()
            ])

    metrics = collect_metrics(trainer.evaluation_workers.local_worker(),
                              trainer.evaluation_workers.remote_workers())
    return {"evaluation": metrics}
Example #20
def eval(model, data_loader, device):
    batch_num = int(data_loader.dev_num / args.dev_batch_size)
    batch_num = batch_num if data_loader.dev_num % args.dev_batch_size == 0 else batch_num + 1
    predict_all = np.array([], dtype=int)
    label_all = np.array([], dtype=int)
    loss_mean = 0
    
    with torch.no_grad():
        for step in trange(batch_num, desc='valid'):
            batch_data = data_loader.get_next_batch(args.dev_batch_size, 'dev')
            batch_text_list, batch_label_list, batch_seg_list, batch_type_list, batch_category_list, \
            batch_a_seg_list, batch_a_tree_list, batch_b_seg_list, batch_b_tree_list = batch_data
            batch_label_ids = torch.tensor(batch_label_list, dtype=torch.long).to(device)
            pred_output = model(batch_text_list, batch_seg_list, batch_type_list, batch_a_seg_list, batch_a_tree_list, batch_b_seg_list, batch_b_tree_list, fine_tune=True)
            logits = pred_output[0]
            loss = criterion(logits.view(-1, label_nums), batch_label_ids.view(-1))
            loss_mean += torch.sum(loss)
            predict = torch.max(logits.data, 1)[1].cpu().numpy()
            label_all = np.append(label_all, batch_label_ids.data.cpu().numpy())
            predict_all = np.append(predict_all, predict)
        acc = metrics.accuracy_score(label_all, predict_all)
        loss_mean /= data_loader.dev_num
    return acc, loss_mean
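The two-step batch count at the top of eval is just ceiling division; an equivalent one-liner (sketch):

def ceil_div(n, d):
    # ceil_div(10, 4) == 3; same value as the batch_num computed above.
    return -(-n // d)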
Example #21
def tq_data(url, all_data):
    """封装自动处理数据API

    Args:
        :url (str): URL,数据处理的URL
        :all_data (list): List,所有已加载的数据

    Returns:
        :all_pre_data (list): Dictionary,所有已处理好的数据list
    """

    all_data_length = len(all_data)
    all_pre_data = []
    for index in trange(all_data_length):
        now_data = all_data[index]
        now_data_list_temp = list(filter(None, now_data.split('。')))
        now_data_list = []
        for temp_data in now_data_list_temp:
            if len(temp_data) > 10:
                now_data_list.append(temp_data)
        pre_data = make_request(url, now_data_list)
        all_pre_data.append(pre_data)
    return all_pre_data
Example #22
def merge_all_data(folderpath_origin, folderpath_dest):
    """ 合并所有已经标注好的数据

    Args:
        :folderpath_origin (str): Folder,已经标注好的数据文件的路径
        :folderpath_dest (str): Destination,合并后的数据将要保存的路径
    """

    print('Merging all annotated data')
    all_data = []
    filename_origin = folderpath_origin + 'all_marked_data.txt'
    with open(filename_origin, 'r') as f:
        for data in f:
            if len(str(data)) > 1:
                all_data.append(data.replace('\n', '').split(' '))
    filename_dest = folderpath_dest + 'all_marked_data.txt'
    with open(filename_dest, 'w') as f:
        data_str = ''
        for data_index in trange(len(all_data)):
            data = all_data[data_index]
            if data[0] == '。' or data_index == len(all_data) - 1:
                data_str = data[0] + '/O' + '\n'
                f.write(data_str)
            elif data[1] == 'O':
                data_str = data[0] + '/O '
                f.write(data_str)
            elif (all_data[data_index + 1][1][0]
                  == 'B') and (data[1][2:] == all_data[data_index + 1][1][2:]):
                data_str = data[0] + '/' + data[1][2:] + ' '
                f.write(data_str)
            elif (all_data[data_index + 1][1][0] !=
                  'B') and (data[1][2:] == all_data[data_index + 1][1][2:]):
                data_str = data[0]
                f.write(data_str)
            elif data[1][2:] != all_data[data_index + 1][1][2:]:
                data_str = data[0] + '/' + data[1][2:] + ' '
                f.write(data_str)
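A hypothetical before/after for the formatting above: the characters of one entity are joined, the last one carries the /TAG suffix, 'O' characters keep /O, and '。' ends the line.

# Hypothetical illustration, not taken from the project:
# input rows:  [['北', 'B-LOC'], ['京', 'I-LOC'], ['下', 'O'], ['雨', 'O'], ['。', 'O']]
# written out: "北京/LOC 下/O 雨/O 。/O" followed by a newline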
Example #23
def save_data_dict(folderpath_dest):
    """ 保存数据字典

    Args:
        :folderpath_dest (str): folder,文件的保存目录
    """

    print('Splitting the data dictionary')
    filename_origin = folderpath_dest + 'all_marked_data.txt'
    data = pd.read_csv(filename_origin, header=None)
    data = data.values.tolist()
    marked_data_dict = []  # lookup terms for the tagged data
    data_dict = []  # complete entries of the tagged data
    for data_index in trange(len(data)):
        now_data = data[data_index][0]
        data_list = now_data.split(' ')
        for cell in data_list:
            cell_marked = str(cell).split('/')[0]
            if cell_marked not in marked_data_dict:
                marked_data_dict.append(cell_marked)
                data_dict.append(cell)
    sheet = pd.DataFrame(data_dict)
    filename_dest = folderpath_dest + 'dict.txt'
    sheet.to_csv(filename_dest, index=None, header=None)
Example #24
    def calculate_coverage_analytically(self,
            sample_size1: int,
            sample_size2: int,
            proportions: proportions_type,
            confidence: float,
            z_precision: Union[float, Literal['auto']] = 'auto'
            ):
        """
        Calculates true coverage of confidence interval for the difference between two proportions
        produced by the `method` for the given desired `confidence` using
        an indistinguishably precise approximation for the analytical solution.

        Optimal approximation precision is auto-picked for the specific case,
        but can be set manually in `z_precision`. This is a z-value for precision instead of p.
        Meaning, `z_precision` of 1.96 is 95% precision (which is a terrible precision).

        Number of trials for both samples are `sample_size1` and `sample_size2`.

        Two proportions for samples 1 and 2 are taken from the list `proportions`,
        each against each, producing 2d square matrix of results,
        a value for each pair of proportions.

        This 2d square matrix is `coverage`, and is saved to `self.coverage`.
        """
        self.confidence = confidence
        self.proportions = self.form_proportions_list(proportions)
        self.sample_size1 = sample_size1
        self.sample_size2 = sample_size2

        if z_precision == 'auto':
            z_precision = get_binomial_z_precision(confidence)

        if __debug__ is True:
            print(
                self.f.calculation_inputs() + ",\n"
                f"calculation_method = analytical approximation, " +
                f"z_precision = {z_precision:5.2f}"
            )

        # `n` by `n` zero matrix, where `n` is the number of probabilities (population proportions)
        coverage = np_zeros((len(self.proportions),len(self.proportions)), dtype=longdouble)

        progress_bar_str = "p1={}; p2={} => cov={}%"
        """
        Here we loop through the cartesian square of the list `self.proportions`,
        (cartesian product of the list `self.proportions` with itself)

        But there's no need to loop through the entire "matrix":
        for each pair `(xi, xj)` the same result can be used for `(xj, xi)`.
        Therefore, only "left diagonal matrix" elements of this "matrix" have to be included
        """
        t = trange(len(self.proportions),
                   desc=progress_bar_str.format("***","***","***"))
        for i in t:
            for j in range(i, len(self.proportions)):
                (prob_x1, prob_x2) = self.proportions[i], self.proportions[j]
                delta = abs(prob_x2 - prob_x1)

                """The entire range of the binomial distribution could be used"""
                #x1_from, x1_to = (0, sample_size)
                #x2_from, x2_to = (0, sample_size)
                """
                This is too computationally expensive to calculate CI for `y` value of each `x1`
                and `x2` of a 2-variate binomial distribution.
                Since most `y` values of the binomial distribution are very close to zero,
                we can use only a small part of the binomial distribution around the peak.
                Such part of a binomial distribution can often be efficiently modeled
                with a normal distribution.

                Let's say we need to consider the span covering 99.999% percent of the mass
                of the two-variate binomial distribution. According to the normal distribution,
                this would be a range that spans 4.42 standard deviations from the mean on both
                sides for both single-variate binomial distribution from which the two-variate one
                is constructed. 
                The span of 4.42 sigma would cover around 99.999% of a binomial distribution
                `Binom(n,p)` for most values of `n` and `p`. This would nail it for 95%CI, but
                what if a user wants to ask for 99.999%CI, and we are only considering 99.999%
                of the binomial distribution? We'd need to consider much more expansive range
                in our calculations.

                We would need something like this:
                for 95% confidence         => 99.995%         of the distribution (4.056 sigma)
                for 99% confidence         => 99.999%         of the distribution (4.417 sigma)
                for 99.9% confidence       => 99.9999%        of the distribution (4.892 sigma)
                for 99.99% confidence      => 99.999_99%      of the distribution (5.327 sigma)
                for significant range of 5 sigma:
                for 99.999_943% confidence => 99.999_999_943% of the distribution (6.199 sigma)
                etc.

                Thus, precision is to be determined given the `confidence`. A specific formula
                is used to figure out the optimal `z_precision`.
                """
                x1_from, x1_to = binomial_distribution_two_tailed_range(n=sample_size1, p=prob_x1, sds=z_precision)
                x2_from, x2_to = binomial_distribution_two_tailed_range(n=sample_size2, p=prob_x2, sds=z_precision)
                x1s = range(x1_from, x1_to+1)
                x2s = range(x2_from, x2_to+1)

                CIs = [[
                    self.method(x1, self.sample_size1, x2, self.sample_size2, self.confidence)
                        for x2 in x2s] for x1 in x1s]

                # Array of `1`s and `0`s
                # int constructor could be used, but longdouble is used to provide better precision
                covered = [[longdouble(CIs[i1][i2][0] < delta < CIs[i1][i2][1])
                                                 for i2 in range(len(CIs[i1]))]
                                                         for i1 in range(len(CIs))]

                # multiplied by 100 in-place for better progress bar, and for a better figure later
                thiscoverage = 100 * (np_sum(
                    [covered[i][j] *
                     binomial_distribution_pmf(x1s[i], sample_size1, prob_x1) *
                     binomial_distribution_pmf(x2s[j], sample_size2, prob_x2)
                         for i in range(len(x1s)) for j in range(len(x2s))]
                ))

                coverage[i][j] = coverage[j][i] = thiscoverage
                t.set_description(progress_bar_str.format(
                    self.f.proportion(prob_x1), self.f.proportion(prob_x2),
                    self.f.coverage(thiscoverage)))

        self.coverage = coverage
        t.set_description(progress_bar_str.format(
            "*", "*", self.f.coverage(self.average_coverage)))
        print(f"average confidence level {self.f.coverage(self.average_coverage)}")
        print(f"average deviation from {self.f.confidence_percent} = {self.f.coverage(self.average_deviation)} (coverage %)")
        print("")
        return self.coverage
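The sigma-span table in the long comment above implies one possible confidence-to-z_precision mapping: leave roughly (1 - confidence)/1000 of the distribution's mass outside the considered span. A hedged sketch of such a mapping follows; the project's actual get_binomial_z_precision may be implemented differently.

from scipy.stats import norm

def z_precision_guess(confidence):
    # Sketch only: leave (1 - confidence) / 1000 of the mass uncovered,
    # split evenly over the two tails, and return the matching z-value.
    uncovered = (1 - confidence) / 1000
    return float(norm.ppf(1 - uncovered / 2))

# z_precision_guess(0.95)  -> ~4.056 sigma
# z_precision_guess(0.99)  -> ~4.417 sigma
# z_precision_guess(0.999) -> ~4.892 sigma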
Example #25
import random
import pandas as pd
from LAC import LAC
from tqdm.std import trange


# use our own trained model
my_lac = LAC(model_path='Model_Code/model/')
my_lac.load_customization('Model_Code/data/dict.txt', sep=None)

data = pd.read_csv('Model_Code/data/G_all_marked_data.txt', header=None)
data = data.values.tolist()
texts = []
for data_index in trange(len(data)):
    line = data[data_index][0]
    texts.append(line)
start = random.randint(0, len(data) - 10)
lac_result = my_lac.run(texts[start : start + 10])

sheet = pd.DataFrame(lac_result)
sheet.to_csv('Model_Code/result/result_lac.csv', index=None, header=None)

items_list = []
for data_index in trange(len(lac_result)):
    word_list = lac_result[data_index][0]
    tag_list = lac_result[data_index][1]
    for word_index in range(0, len(word_list)):
        item_list = []
        item_list.append(word_list[word_index])
        item_list.append(tag_list[word_index])
        items_list.append(item_list)
Example #26
    def calculate_coverage_randomly(self,
                                    sample_size: int,
                                    proportions: proportions_type,
                                    confidence: float,
                                    n_of_experiments: int = 20000):
        """
        Calculates true coverage of confidence interval for proportion
        produced by the `method` for the given desired `confidence` using a simulation
        with a number of random experiments (`n_of_experiments`).

        Total number of trials in a sample is `sample_size`.

        Proportion for the sample is taken from the list `proportions`,
        producing a list of results, a value for each proportion.

        This list is `coverage`, and is saved to `self.coverage`.
        """
        self.confidence = confidence
        self.proportions = self.form_proportions_list(proportions)
        self.sample_size = sample_size

        if __debug__ is True:
            print(self.f.calculation_inputs() + ",\n"
                  f"calculation_method = random simulation, " +
                  f"n_of_experiments = {n_of_experiments}")

        coverage = []

        # The return value of this function will be cached (this is not necessary)
        z = normal_z_score_two_tailed(p=confidence)

        progress_bar_str = "p={} => cov={}%"
        t = trange(len(self.proportions),
                   desc=progress_bar_str.format("***", "***"))
        for i in t:
            prob = self.proportions[i]
            x = binomial_experiment(sample_size, prob, n_of_experiments)

            CIs = [
                self.method(x[j], sample_size, confidence)
                for j in range(0, n_of_experiments)
            ]
            covered = [int(CI[0] < prob < CI[1]) for CI in CIs]

            # multiplied by 100 in-place for better progress bar, and for a better figure later
            thiscoverage = (sum(covered) / n_of_experiments) * 100

            coverage.append(thiscoverage)

            t.set_description(
                progress_bar_str.format(self.f.proportion(prob),
                                        self.f.coverage(thiscoverage)))

        self.coverage = coverage
        t.set_description(
            progress_bar_str.format("*", "*",
                                    self.f.coverage(self.average_coverage)))
        print(
            f"average confidence level {self.f.coverage(self.average_coverage)}"
        )
        print(
            f"average deviation from {self.f.confidence_percent} = {self.f.coverage(self.average_deviation)} (coverage %)"
        )
        print("")
        return self.coverage
Example #27
        optimizer = optim.AdamW(model.parameters(), lr=3e-5)

        # param_optimizer = list(model.named_parameters())
        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        # optimizer = BertAdam(optimizer_grouped_parameters,
        #                         lr=args.learning_rate,
        #                         warmup=args.warmup_proportion,
        #                         t_total=num_train_optimization_steps)

        global_step = 0
        best_dev_acc = 0.0
        for epoch in trange(int(args.train_epoch), desc='Epoch'):
            model.train()
            batch_num = int(data_loader.train_num/args.train_batch_size)
            batch_num = batch_num if data_loader.train_num % args.train_batch_size == 0 else batch_num + 1
            train_loss = 0
            for step in trange(batch_num, desc='Training'):
                logits, loss = predict(model, data_loader, device, is_train=True)
                loss.backward()
                train_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % 100 == 0:
                    dev_acc, loss = eval(model, data_loader, device)
                    if dev_acc > best_dev_acc:
                        best_dev_acc = dev_acc
Example #28
    prj.close()

## Read the data
resolution, edgeLength, shore, hydrology, cells, Ts = SaveFile.readDataModel(
    inputFile
)

realShape = shore.realShape

with shapefile.Writer(outputFile, shapeType=3) as w:
    # The only relevant field for rivers
    w.field('flow', 'F')

    # This loop adds rivers in the same way that they were created
    #for node in hydrology.allMouthNodes():
    for nidx in trange(len(hydrology)):
        node = hydrology.node(nidx)

        leaves = hydrology.allLeaves(node.id)
        for leafNode in leaves:
            # Path from the leaf to the sea
            path = hydrology.pathToNode(node.id, leafNode.id)
            path.reverse()

            # The flow of the river is the flow at the base of
            # its path to the sea, unless this stretch of the
            # river doesn't flow all the way to the sea
            riverFlow = path[len(path)-1].flow
            for ni in range(1,len(path)):
                upstreamFlow = max([n.flow for n in hydrology.upstream(path[ni].id)])
                # If this river is merely a tributary to a larger
Example #29
    def calculate_coverage_analytically(
            self,
            sample_size: int,
            proportions: proportions_type,
            confidence: float,
            z_precision: Union[float, Literal['auto']] = 'auto'):
        """
        Calculates true coverage of confidence interval for proportion
        produced by the `method` for the given desired `confidence` using
        an indistinguishably precise approximation for the analytical solution.

        Optimal approximation precision is auto-picked for the specific case,
        but can be set manually in `z_precision`. This is a z-value for precision instead of p.
        Meaning, `z_precision` of 1.96 is 95% precision (which is a terrible precision).
        See more comments below for the actual meaning of z_precision.

        Total number of trials in a sample is `sample_size`.

        Proportion for the sample is taken from the list `proportions`,
        producing a list of results, a value for each proportion.

        This list is `coverage`, and is saved to `self.coverage`.
        """
        self.confidence = confidence
        self.proportions = self.form_proportions_list(proportions)
        self.sample_size = sample_size

        if z_precision == 'auto':
            z_precision = get_binomial_z_precision(confidence)

        if __debug__ is True:
            print(self.f.calculation_inputs() + ",\n"
                  f"calculation_method = analytical approximation, " +
                  f"z_precision = {z_precision:5.2f}")

        coverage = []

        # The return value of this function will be cached (this is not necessary)
        z = normal_z_score_two_tailed(p=confidence)

        progress_bar_str = "p={} => cov={}%"
        t = trange(len(self.proportions),
                   desc=progress_bar_str.format("***", "***"))
        for i in t:
            prob = self.proportions[i]
            """The entire range of the binomial distribution could be used"""
            #x_from, x_to = (0, sample_size)
            """
            But this is too computationally expensive to calculate CI for `y` value of each `x`
            of a binomial distribution.
            Since most `y` values of the binomial distribution are very close to zero,
            we can use only a small part of the binomial distribution around the peak.
            Such part of a binomial distribution can often be efficiently modeled
            with a normal distribution.

            Let's say we want to consider the span covering 99.999% of the mass
            of the binomial distribution. According to the normal distribution, this would be
            a range that spans 4.42 standard deviations from the mean on both sides.
            The span of 4.42 sigma would cover around 99.999% of a binomial distribution
            `Binom(n,p)` for most values of `n` and `p`. This would nail it for 95%CI, but
            what if a user wants to ask for 99.999%CI, and we are only considering 99.999%
            of the binomial distribution? We'd need to consider much more expansive range
            in our calculations.

            We would need something like this:
            for 95% confidence         => 99.995%         of the distribution (4.056 sigma)
            for 99% confidence         => 99.999%         of the distribution (4.417 sigma)
            for 99.9% confidence       => 99.9999%        of the distribution (4.892 sigma)
            for 99.99% confidence      => 99.999_99%      of the distribution (5.327 sigma)
            for significant range of 5 sigma:
            for 99.999_943% confidence => 99.999_999_943% of the distribution (6.199 sigma)
            etc.

            Thus, precision is to be determined given the `confidence`. A specific formula is used
            to figure out the optimal `z_precision`.
            """
            x_from, x_to = binomial_distribution_two_tailed_range(
                n=sample_size, p=prob, sds=z_precision)
            xs = range(x_from, x_to + 1)

            CIs = [self.method(x, sample_size, confidence) for x in xs]

            # Array of `1`s and `0`s
            # int constructor could be used, but longdouble is used to provide better precision
            covered = [longdouble(CI[0] < prob < CI[1]) for CI in CIs]

            # multiplied by 100 in-place for better progress bar, and for a better figure later
            thiscoverage = sum([
                covered[i] *
                binomial_distribution.pmf(xs[i], sample_size, prob)
                for i in range(len(xs))
            ]) * 100

            coverage.append(thiscoverage)

            t.set_description(
                progress_bar_str.format(self.f.proportion(prob),
                                        self.f.coverage(thiscoverage)))

        self.coverage = coverage
        t.set_description(
            progress_bar_str.format("*", "*",
                                    self.f.coverage(self.average_coverage)))
        print(
            f"average confidence level {self.f.coverage(self.average_coverage)}"
        )
        print(
            f"average deviation from {self.f.confidence_percent} = {self.f.coverage(self.average_deviation)} (coverage %)"
        )
        print("")
        return self.coverage
Example #30
    def calculate_coverage_randomly(self,
            sample_size1: int,
            sample_size2: int,
            proportions: proportions_type,
            confidence: float,
            n_of_experiments: int = 10000
            ):
        """
        Calculates true coverage of confidence interval for the difference between two proportions
        produced by the `method` for the given desired `confidence` using a simulation
        with a number of random experiments (`n_of_experiments`).

        Number of trials for both samples are `sample_size1` and `sample_size2`.

        Two proportions for samples 1 and 2 are taken from the list `proportions`,
        each against each, producing 2d square matrix of results,
        a value for each pair of proportions.

        This 2d square matrix is `coverage`, and is saved to `self.coverage`.
        """
        self.confidence = confidence
        self.proportions = self.form_proportions_list(proportions)
        self.sample_size1 = sample_size1
        self.sample_size2 = sample_size2

        if __debug__ is True:
            print(
                self.f.calculation_inputs() + ",\n"
                f"calculation_method = random simulation, " +
                f"n_of_experiments = {n_of_experiments}"
            )

        # n by n zero matrix, where n is the number of tested probabilities (actual population proportions)
        coverage = np_zeros((len(self.proportions),len(self.proportions)), dtype=longdouble)

        # The return value of this function will be cached (this is not necessary)
        z = normal_z_score_two_tailed(p=confidence)

        progress_bar_str = "p1={}; p2={} => cov={}%"
        """
        Here we loop through the cartesian square of the list `self.proportions`,
        (cartesian product of the list `self.proportions` with itself)

        But there's no need to loop through the entire "matrix":
        for each pair `(xi, xj)` the same result can be used for `(xj, xi)`.
        Therefore, only the upper-triangular elements of this "matrix" have to be computed
        """
        t = trange(len(self.proportions),
                   desc=progress_bar_str.format("***","***","***"))
        for i in t:
            for j in range(i, len(self.proportions)):
                (prob_x1, prob_x2) = self.proportions[i], self.proportions[j]
                delta = abs(prob_x2 - prob_x1)
                x1 = binomial_experiment(sample_size1, prob_x1, n_of_experiments)
                x2 = binomial_experiment(sample_size2, prob_x2, n_of_experiments)

                CIs = [self.method(x1[k], sample_size1, x2[k], sample_size2, confidence)
                                                    for k in range(0, n_of_experiments)]
                covered = [int(CI[0] < delta < CI[1]) for CI in CIs]

                # multiplied by 100 in-place for better progress bar, and for a better figure later
                thiscoverage = (sum(covered)/n_of_experiments) * 100

                coverage[i][j] = coverage[j][i] = thiscoverage

                t.set_description(progress_bar_str.format(
                    self.f.proportion(prob_x1), self.f.proportion(prob_x2),
                    self.f.coverage(thiscoverage)))

        self.coverage = coverage
        t.set_description(progress_bar_str.format(
            "*", "*", self.f.coverage(self.average_coverage)))
        print(f"average confidence level {self.f.coverage(self.average_coverage)}")
        print(f"average deviation from {self.f.confidence_percent} = {self.f.coverage(self.average_deviation)} (coverage %)")
        print("")
        return self.coverage