Example #1
def catch_trick88():
    import os
    import re
    from htmlconvert2text import convert2txt
    def read_train_res():
        with open('/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train') as rf:
            train_res = rf.read()
        return train_res

    train_re = read_train_res()

    for i in os.listdir('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/')[0:2000:20]:
        # sss = convert2txt('/home/html/' + i)
        sss, ent_str = convert2txt("/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/" + i)

        match = re.search(r'{}[^\n。]+\n'.format(i.split(".")[0]), train_re)
        if match is None:  # no labeled row for this announcement
            continue
        row_train_re = match.group()[:-1]  # drop the trailing newline
        print("###########################################################{}".format(i))
        for index, res_enti in enumerate(row_train_re.split('\t')):

            if len(res_enti) > 1:
                print("@@@this is the {}th key_value{}".format(index, res_enti))
                res_find = re.findall(r'{}'.format(res_enti), ent_str)

                if len(res_find) > 0:
                    print(res_find)
                else:
                    print("@@@")
Example #2
def findall_reg():
    # assumes module-level imports of os/re/convert2txt and the
    # trick_precedences / list_keywords lists defined elsewhere
    for i in os.listdir('/home/html/')[100:2770:50]:
        # convert the HTML once per file instead of once per trigger word
        sss = convert2txt('/home/html/' + i)
        for l in trick_precedences:
            reg_out = re.findall(r'{}[^。|]*[。;|]'.format(l), sss, flags=re.X)
            reg_out_final = []
            for j in reg_out:
                # keep a hit only if it contains none of the blacklist keywords
                if not any(k in j for k in list_keywords):
                    reg_out_final.append(j)
            print(i)
            for line in reg_out_final:  # renamed from `i`, which shadowed the file-loop variable
                print(line)

            print('\n\n')
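
The filtering step above keeps a regex hit only when it contains none of the blacklist keywords. A self-contained sketch of just that step with made-up sample data (trick_precedences and list_keywords below are stand-ins, not the real lists):

import re

trick_precedences = ['承诺']  # hypothetical trigger words
list_keywords = ['业绩']      # hypothetical blacklist

text = '公司承诺如下内容。承诺业绩不低于去年。'
for trigger in trick_precedences:
    hits = re.findall(r'{}[^。|]*[。;|]'.format(trigger), text)
    kept = [h for h in hits if not any(k in h for k in list_keywords)]
    print(kept)  # ['承诺如下内容。'] -- the hit containing '业绩' is dropped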
Example #3
def catch_trick888():
    import os
    from htmlconvert2text import convert2txt
    # def read_train_res():
    #     with open('/home/mm/Documents/aliyun-FDDC-2018-Financial-Challenge-/chongzu.train') as rf:
    #         train_res = rf.read()
    #     return train_res
    #
    # train_re = read_train_res()

    for i in os.listdir('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/')[0:2688:18]:
        # sss = convert2txt('/home/html/' + i)
        sss, ent_str = convert2txt("/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/" + i)
        if len(ent_str) > 10:
            with open('checkregexentity.txt', 'a') as af:
                af.write(ent_str + "\n")
            print("OOOOOOOOOOO")
        else:
            print(i)
Example #4
0
    def tokenize_enti(self,path11):
        texx, entity_string = convert2txt(path11)
        sentences = re.split(r'。', texx)
        # sentences.sort(key=len, reverse=True)
        entities = list(set(re.split(r'[\s~、,;/]', entity_string)))
        entities.sort(key=len)
        entities_arrows_list = list(set([x if '~' in x else '' for x in re.split(r'\s', entity_string)]))
        entities_arrows_list.sort(key=len, reverse=True)
        entities_arrows_list = entities_arrows_list[:-1]
        # find the result data rows and strip the trailing newline
        patt_index = re.findall(r'\d{4,10}', path11)[0]
        res_rows = re.findall(r'(?<=\n){}[^\n]+(?=\n)'.format(patt_index), self.train_res)


        # The following cleans up train_res:
        # walk the results and, for rows with an abbreviation/full-name pair, add the matching other half.
        """The main goal is to fix the train_res file, which mixes abbreviations and full names
            inconsistently. To make both forms appear, a regex extracts the matching abbreviation
            or full name; strings joined by 、 are also split out as labeling targets. Shorter
            strings are matched first, and after tokenization the longer ones are rejoined first. This works."""
        res_paired = {}  # temporary dict of results, holding the corrected train_res values
        for x in range(len(res_rows)):
            res_row = res_rows[x]
            for y in range(6):
                res_paired[str(x)+str(y)] = [re.split(r'\t', res_row)[y]]

        for arrow_str in entities_arrows_list:

            for index, result_row in enumerate(res_rows):

                for indi, res_value in enumerate(re.split(r'\t', result_row)):
                    if indi in [0, 1, 4, 5]:
                        continue
                    res_value_list = res_value.split('、')

                    for res_value_split in res_value_list:
                        if res_value_split in entities and res_value_split in arrow_str:
                            # find the paired abbreviation or full name and add it; for
                            # equity / valuation-method / amount values, add directly and continue
                            niki, fullna = re.split(r'~', arrow_str)
                            fullna_first = fullna.split(',')[0]
                            niki_split_list = re.split(r'[/、]', niki)
                            # the matching full name must satisfy three conditions: length,
                            # the comma split, and containing the abbreviation's characters.
                            # r'\\s?' inserts a literal \s? so the pattern tolerates spaces added
                            # by tokenization (a bare '\s?' replacement is a bad regex escape)
                            if res_value_split in niki_split_list \
                                    and len(fullna_first) < 18 \
                                    and re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', res_value_split), fullna_first):
                                res_paired[str(index)+str(indi)].append(fullna_first)
                            """ When looking up the abbreviation from the full name, avoid
                                公司/本公司/上市公司/发起人/申请人; candidates containing these are dropped """
                            if res_value_split == fullna_first:
                                # the matching abbreviation must meet several conditions: contained
                                # in the full name, no longer than 4 characters, not equal to...
                                for niki_split in niki_split_list:
                                    if re.search(re.sub(r'(?<=[^屄\s])', r'\\s?', fullna_first), niki_split)\
                                            and not re.search(r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|对方|发行|对象|股东|对手|单位)', re.sub(r'\s', '', niki_split)):
                                        res_paired[str(index)+str(indi)].append(niki_split)


        # iterate over every sentence of the announcement and feed each one to the model.
        words_n_words = ''
        for i in sentences:
            words = self.segmentor.segment(i)
            words = ' '.join(words)
            words = words+' '+'。'+' '  # re-append the full stop and the space after it
            # tokenization needs a better strategy: longer spans, avoiding sentences so short that the repeated loops waste work
            # # The block below merges all target entities back together, first splitting off pieces like 55%股权.
            # for ent in entities:
            #     # remove the spaces inside every entity occurring in words, using two nested subs
            #     # regexes really do need plenty of comments
            #     """ re.sub(r'(?<=\w)(?=\w)', r'\s?', ent) inserts "\s?" between every pair of characters
            #     in the entity, so the pattern matches the sequence even when spaces appear inside it
            #     after the tokenizer joins words with spaces; every place in words where the sequence
            #     occurs is then replaced by the space-free form"""
            #     if len(ent) > 1:
            #         if not re.search(r'([\d.]+%的?(?:股权|股份|权益))', ent):  # no equity keyword: build the space-tolerant pattern directly
            #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent)
            #         elif len(ent) > 7:  # equity keyword and fairly long: strip the equity tail and pattern the leading subject alone
            #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', re.sub(r'的?[\d.]+%的?(股权|股份|权益)', '', ent))
            #         else:
            #             patt_ent = re.sub(r'(?<=\w)(?=\w)', r'\s?', ent)
            #         # the next line rejoins every entity-list item that segmentation split apart, putting it on its own line for test time
            #         words = re.sub(r'{}'.format(patt_ent), '\s' + ent + '\s', words)

            # then replace every space with a newline, so words stands vertically.
            # words = re.sub(r'\s', '\n', words)
            # words = re.sub(r'\n+', '\n', words)
            """把words中所有是结果键值的,后缀上tab键和结果索引号。否则后缀tab键和字母o
                目的是好的,就是让模型更容易找到目标,模型不需要判断开始和结束,
                但是这样的正则太难了, 我无法将所有合适的实体
                全部抽出来,而导致标注的缺失,那么还是把任务给模型了"""
            # for x in range(len(res_rows)):
            #     for y in range(6):
            #         index = str(x)+str(y)
            #         tags_list = res_paired[index]
            for index, tags_list in res_paired.items():
                # each sub-list may hold one or more members; iterating over it tags them all,
                # including the 、-separated ones, without disturbing already-merged entity strings.
                for sub_res in sorted(tags_list, key=len, reverse=True):
                    if not index.endswith('0') and len(sub_res) > 1:
                        # build a space-tolerant pattern; 屄 is a sentinel character assumed never
                        # to occur in real text (r'\\s?' inserts a literal \s?, '\s?' is a bad escape)
                        patt_sub_res = re.sub(r'(?<=[^屄\s])', r'\\s?', sub_res)
                        if re.search(r'{}'.format(patt_sub_res), words):
                            spliter = re.findall(patt_sub_res, words)[0]
                            words_split_list = re.split(spliter, words)
                            spliter_tagged = re.sub(r'\s', '屄{}'.format(index[1]), spliter)
                            words = spliter_tagged.join(words_split_list)
                            # print(words)


                        # words=re.sub(patt_sub_res, sub_res)
                        # words= re.sub(r'{}(?=\n)'.format(sub_res), '\n{}\t{}\n'.format(sub_res, index), words)
            # train_res tagging is done; now tag o: lines not ending in a digit get a tab and an o appended
            words = re.sub(r'\s', '\to\n', words)
            words = re.sub(r'(?<=屄\d)', '\n', words)
            words = re.sub(r'屄', '\t' , words)
            words_n_words += words

            # print(words)
        with open('/home/mm/FDDC_datasets_dir/tokenized_datasets_for_anago/chongzu/'+res_paired['00'][0]+'.txt', 'w') as af:
            af.write(words_n_words)
            print(path11.split("/")[-1])
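
The core trick in tokenize_enti is building a pattern that still matches an entity after word segmentation has inserted spaces into it: re.sub(r'(?<=[^屄\s])', r'\\s?', entity) appends a literal \s? after every character. A standalone sketch of just that step, with a made-up entity name:

import re

entity = '天山水泥'  # hypothetical entity name
# insert an optional-whitespace token after every character; 屄 is the sentinel
# character the code above assumes never occurs in real text
patt = re.sub(r'(?<=[^屄\s])', r'\\s?', entity)
print(patt)                                # 天\s?山\s?水\s?泥\s?
print(bool(re.search(patt, '天山 水泥')))  # True: matches across the inserted space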
Example #5
import os
from pyltp import NamedEntityRecognizer, Postagger, Segmentor
from htmlconvert2text import convert2txt

# assumes LTP_DATA_DIR, ner_model_path and recognizer = NamedEntityRecognizer() are set up earlier
recognizer.load(ner_model_path)
pos_model_path = os.path.join(LTP_DATA_DIR,
                              'pos.model')  # path to the POS-tagging model, file name `pos.model`

cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

source_path = "/home/mm/Downloads/round1_train_20180518/dingzeng/html/"
out_path = "/home/mm/aliyunChallenge/"
listdir = os.listdir(source_path)
postagger = Postagger()  # initialize the instance
postagger.load(pos_model_path)  # load the model

segmentor = Segmentor()  # initialize the instance
segmentor.load(cws_model_path)  # load the model
for i in listdir[0:1]:
    html_text = convert2txt(source_path + i)
    words = segmentor.segment(html_text)  # word segmentation
    postags = postagger.postag(words)  # POS tagging
    netags = recognizer.recognize(words, postags)  # named-entity recognition
    indices = [i for i, x in enumerate(list(netags)) if x.endswith("Ni")]

    temp_entity = ""
    new_list = []
    # merge runs of consecutive Ni-tagged tokens back into full entity names
    for idx, w in enumerate(words):  # renamed from `i, x` to avoid shadowing the file-loop variable
        if (idx in indices) and ((idx + 1) in indices) and (idx - 1 not in indices):
            temp_entity = w
        elif (idx - 1 in indices) and (idx + 1 in indices) and (idx in indices):
            temp_entity += w
        elif (idx - 1 in indices) and (idx in indices) and (idx + 1 not in indices):
            temp_entity += w
            new_list.append(temp_entity)
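
The three-branch merge above rebuilds multi-token organization names from LTP's positional Ni tags (B-Ni / I-Ni / E-Ni for the begin / inside / end pieces). The same logic in self-contained form, with hand-written words and netags instead of loaded models; the tag values are illustrative only, not real model output:

words = ['天山', '水泥', '股份', '公司', '发布', '公告']
netags = ['B-Ni', 'I-Ni', 'I-Ni', 'E-Ni', 'O', 'O']  # illustrative tags

indices = [i for i, t in enumerate(netags) if t.endswith('Ni')]

temp_entity, merged = '', []
for idx, w in enumerate(words):
    if idx in indices and (idx + 1) in indices and (idx - 1) not in indices:
        temp_entity = w        # first piece of a multi-token entity
    elif (idx - 1) in indices and idx in indices and (idx + 1) in indices:
        temp_entity += w       # middle piece
    elif (idx - 1) in indices and idx in indices and (idx + 1) not in indices:
        temp_entity += w       # last piece: flush the merged name
        merged.append(temp_entity)

print(merged)  # ['天山水泥股份公司']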
Example #6
def fill_table(path):
    # assumes module-level imports of re/collections/convert2txt and the
    # global true_res_str holding the labeled training rows

    # if index not in random_index:
    #     continue
    # list_true_res = re.findall(r'{}[^\n]+(?=\n)'.format(path.split(".")[0]), true_res_str)

    # text, entity_string = convert2txt('/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/762567.html')
    text, entity_string = convert2txt(
        '/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/' +
        path)
    official_res_row = re.findall(
        r'{path}[^\n]+\n'.format(path=path.split(".")[0]), true_res_str)
    answer_dic = {"公告ID": path.split(".")[0]}
    _, asset_string, eval_string, money_string = entity_string.split("|||")

    entities_arrows_list = list(
        set([x if 'H&#~' in x else ''
             for x in re.split(r'\s', entity_string)]))
    short_name_list = [re.split(r"H&#~", x)[0] for x in entities_arrows_list]
    reg_short_listr = ""
    for tiy in short_name_list:
        for tiny in re.split(r'[,、/]', tiy):
            if not re.search(
                    r'(^公司$|^本公司$|环境$|^上市公司$|人$|资产|标的|交易|审计|对方|发行|对象|股东|对手|单位|事务所|计划|分公司|日$|董事|独立|书$|承诺|机构|评估|交所|股|认购|局$|律|本次|国家|中央|中国|重组|重大|期$|^元$|^万元$|^亿元$|《|》|股份|股分|利润|报告)',
                    tiny):
                reg_short_listr += tiny
                reg_short_listr += "|"
    reg_short_listr = re.sub(r'^\||\|$', "", reg_short_listr)
    reg_short_listr = re.sub(r'\|\|', "|", reg_short_listr)

    answer_dic["估值方法"] = collections.Counter(
        eval_string.split(" ")).most_common(1)[0][0]
    answer_dic["交易金额"] = collections.Counter(
        money_string.split(" ")).most_common(1)[0][0]
    answer_dic["交易标的"] = collections.Counter(
        asset_string.split(" ")).most_common(1)[0][0]
    answer_dic["标的公司"] = ''
    answer_dic["交易对方"] = ''

    for row in entities_arrows_list:
        if len(row) < 2:
            continue
        short, long = row.split('H&#~')
        list_splits_short = re.split(r'/|、', short)
        for short_split in list_splits_short:
            if re.match(r'交易对[手方]|发行对象|认购人', short_split):
                answer_dic["交易对方"] = long
            if re.match(r'标的公司|目标公司', short_split):
                answer_dic["标的公司"] = long
                if "、" in long:
                    """ltp 识别实体, 针对顿号分开的pos,分别确认不含有动词副词,
                        然后在entity——string里面找相应的各自的股权信息/资产信息"""
                    asset_list = []
                    for long_split in long.split("、"):
                        # print("{} long_split is {}".format(path, long_split))
                        # asset_related_target = re.findall(r'{}[\d.%]+的?股[权分]|全部[股债分权]'.format(long_split), entity_string)
                        # if len(asset_related_target) > 0:
                        #     asset_list.append(re.findall(r'[\d.%]+的?股[权分]|全部[股债分权]', asset_related_target[0])[0])
                        # else:
                        if re.findall(r'{ls}[\d.%]+的?[股债分权份]{{2}}|{ls}全部的?[股债分权份资产负利和与]{{2,6}}'.format(ls=long_split), text) \
                                and re.findall(r'[\d.%]+的?[股债分权份资产负利和与]{2,5}|全部的?[股债分权份资产负利和与]{2,6}',
                                               re.findall(r'{ls}[\d.%]+的?[股债分权份资产负利和与]{{2,6}}|全部[股债分权份资产负利和与]{{2,6}}'.format(ls=long_split), text)[0]):
                            # asset_related_target = re.findall(r'{}[\d.%]+的?股[权分]|全部[股债分权]'.format(long_split), text)
                            asset_list.append(re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份资产负利和与]{2,6}',
                                                         re.findall(r'{ls}[\d.%]+的?[股债分权份]{{2,4}}|全部的?[股债分权份资产负利和与]{{2,5}}'.format(ls=long_split), text)[0])[0])

                    answer_dic["交易标的"] = '|'.join(asset_list)
                    answer_dic["标的公司"] = "|".join(long.split("、"))

            if re.match(r'本次交易|交易标的|标的资产|交易资产|目标资产|标的股权', short_split):
                if re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份]{2}', long):
                    list_ass = re.findall(r'[\d.%]+的?[股债分权份]{2}|全部的?[股债分权份]{2}',
                                          long)
                    # if len(list_ass) > 1:
                    answer_dic["交易标的"] = '|'.join(list_ass)
                    list_tar = re.findall(
                        r'({ls})(?=的?[\d.%]+的?[股债分权份]{{2,3}}|全部的?[股债分权份资产负利和与]{{2,6}})'
                        .format(ls=reg_short_listr), long)
                    answer_dic["标的公司"] = '|'.join(list_tar)

    if answer_dic["标的公司"] == "":
        guess_target = collections.Counter(
            re.findall(r'{ls}'.format(ls=reg_short_listr),
                       entity_string)).most_common(8)
        for tar in guess_target:
            if len(tar[0]) > 2:
                answer_dic["标的公司"] = guess_target
                pass

    # print(re.findall(r'(?<=[和及、,~的])[^\d和及、,~的股份分权]+(?=[\d.%的]+股[权分份])', entity_string))

    if answer_dic["标的公司"] == "":
        for post_fix in re.findall(
                r'(?<=[和及、,~的])[^\d和及、,~的股份分权资产负利与]+(?=的?[\d.%]+的?[股债分权份]{2,3}|全部的?[股债分权份资产负利和与]{2,6})',
                entity_string):
            if len(post_fix) in [3, 4, 5, 6]:
                answer_dic["标的公司"] = post_fix

    if answer_dic["标的公司"] != "" and answer_dic["交易标的"] != "":
        print("answer dict is ok ")
    else:
        print("f**k it {}".format(path))
    #                 # indicates the transaction
    #
    # # if "|" in answer_dic['标的公司'] and "|" not in answer_dic['交易标的']:
    # #     answer_dic['交易标的'] == ""
    # #     for target_split in re.split(r'|', answer_dic['标的公司']):
    # #         answer_dic['交易标的'] += re.findall()
    #
    #         if re.match(r'交易标的|标的资产|标的股权|目标资产', short_split):
    #             # if len(answer_dic['交易标的'])>1 and answer_dic['交易标的'] in long:
    #             #     print("the most frequent equity info is right in the 交易标的 definition {}".format(path))
    #             if re.search(r'的?[\d.%全部]+股[权分份]]', long):
    #                 answer_dic["标的公司"] ="|".join(re.split(r'的?[\d.%全部的]+股[权分份]、?', long))
    #                 answer_dic["交易标的"] ="|".join(re.findall(r'[\d.%全部的]+股[权分份]]', long))
    #             # elif len(answer_dic['交易标的'])>1:
    #             #     answer_dic['交易标的'] = re.findall(r'([\d.]+%的?(?:股权|股份|权益))', long)[0]
    submit_string = ''
    if '|' in answer_dic["标的公司"] or '|' in answer_dic["交易标的"]:
        target_list = answer_dic["标的公司"].split("|")
        asset_list = answer_dic["交易标的"].split("|")
        rows_to_gen = max(len(target_list), len(asset_list))
        for row in range(rows_to_gen):
            # clamp each index so the shorter list repeats its last element
            index_target = row if row < len(target_list) else len(target_list) - 1
            index_asset = row if row < len(asset_list) else len(asset_list) - 1
            submit_string += answer_dic["公告ID"]+ "\t" + asset_list[index_asset] \
                        + "\t" + target_list[index_target] + "\t" + answer_dic["交易对方"] \
                        + "\t" + answer_dic["交易金额"] + "\t" + answer_dic["估值方法"] + "\n"
    else:
        submit_string =  answer_dic["公告ID"]+ "\t" + answer_dic["交易标的"] \
                        + "\t" + answer_dic["标的公司"] + "\t" + answer_dic["交易对方"] \
                        + "\t" + answer_dic["交易金额"] + "\t" + answer_dic["估值方法"] + "\n"

    return submit_string
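
The submission rows at the end pair up the 标的公司 and 交易标的 lists even when their lengths differ, clamping the index into the shorter list. That pairing rule in isolation, with made-up values:

target_list = ['公司A', '公司B', '公司C']  # hypothetical values
asset_list = ['60%股权', '全部资产']

for row in range(max(len(target_list), len(asset_list))):
    # clamp the index so the shorter list repeats its final element
    t = target_list[min(row, len(target_list) - 1)]
    a = asset_list[min(row, len(asset_list) - 1)]
    print(t, a)
# 公司A 60%股权
# 公司B 全部资产
# 公司C 全部资产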
Example #7
import os
from pyltp import NamedEntityRecognizer, Postagger, Segmentor
from htmlconvert2text import convert2txt

# assumes LTP_DATA_DIR, ner_model_path and recognizer = NamedEntityRecognizer() are set up earlier
recognizer.load(ner_model_path)
pos_model_path = os.path.join(LTP_DATA_DIR,
                              'pos.model')  # path to the POS-tagging model, file name `pos.model`

cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

source_path = "/home/mm/FDDC_datasets_dir/FDDC_announcements_round2_train_html/"
out_path = "/home/mm/FDDC_datasets_text_dir/chongzu/"
listdir = os.listdir(source_path)
postagger = Postagger()  # initialize the instance
postagger.load(pos_model_path)  # load the model

segmentor = Segmentor()  # initialize the instance
segmentor.load(cws_model_path)  # load the model
for i in listdir[0:1]:
    html_text, entity_string = convert2txt(source_path + i)
    words = segmentor.segment(html_text)  # word segmentation
    postags = postagger.postag(words)  # POS tagging
    netags = recognizer.recognize(words, postags)  # named-entity recognition
    indices = [i for i, x in enumerate(list(netags)) if x.endswith("Ni")]

    temp_entity = ""
    new_list = []
    """以下是对字符串序列中含有实体名称的部分,重新结合在一起,去掉分词造成的间隔,然后在实体前后加缀一个特殊符号{NER#}"""
    for idx, w in enumerate(words):  # renamed from `i, x` to avoid shadowing the file-loop variable
        if (idx in indices) and ((idx + 1) in indices) and (idx - 1 not in indices):
            temp_entity = w
        elif (idx - 1 in indices) and (idx + 1 in indices) and (idx in indices):
            temp_entity += w
        elif (idx - 1 in indices) and (idx in indices) and (idx + 1 not in indices):
            temp_entity += w
            new_list.append(temp_entity)  # store the merged entity, as in Example #5
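
The docstring mentions affixing a special marker {NER#} around each entity, but that step is not shown in the snippet. A hedged sketch of one way it could be done, assuming the merged names in new_list should be wrapped wherever they occur in the space-joined segmenter output:

import re

segmented = '天山 水泥 股份 公司 发布 公告'  # hypothetical segmenter output
new_list = ['天山水泥股份公司']              # merged entities from the loop above

marked = segmented
for ent in new_list:
    # tolerate the spaces segmentation inserted, then wrap the match with the marker
    patt = r'\s?'.join(map(re.escape, ent))
    marked = re.sub(patt, '{{NER#}}{}{{NER#}}'.format(ent), marked)

print(marked)  # {NER#}天山水泥股份公司{NER#} 发布 公告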