Beispiel #1
0
def begin_stang_bid_new_job(cnn):
    """Run one incremental extraction pass over ``stang_bid_new``.

    Every id in the half-open range ``[start_id, last_id)`` is visited, where
    ``start_id`` comes from ``load_bid_new_start_id()`` (the checkpoint left
    by the previous run) and ``last_id`` comes from
    ``get_last_id('stang_bid_new', cnn)``. For each record accepted by
    ``BidData.is_valid_bid()`` the pair returned by ``ifm.get_information``
    (first winning company, project manager) is inserted into
    ``stang_bid_extract_zid`` and committed. After the loop, ``last_id`` is
    stored via ``write_bid_new_start_id`` as the next run's starting point.

    Args:
        cnn: Database wrapper exposing ``insert_data_with_table_name`` and a
            ``db`` attribute with ``commit()``.

    Returns:
        None. Returns early, without touching the checkpoint, when
        ``last_id == start_id`` (nothing new to process).

    Raises:
        AssertionError: If ``last_id`` is smaller than ``start_id``; guards
            against a hand-edited checkpoint that is ahead of the table.
    """
    source_table = 'stang_bid_new'
    start_id = load_bid_new_start_id()
    last_id = get_last_id(source_table, cnn)

    # No new rows since the previous run.
    if last_id == start_id:
        return
    # A checkpoint ahead of the table means it was edited incorrectly.
    if last_id < start_id:
        raise AssertionError('last_id < start_id stang_bid_new')

    for bid_id in range(start_id, last_id):
        print(bid_id, 'bid_new')
        record = BidData(cnn, pattern, bid_id, source_table)
        # Per the original note: validity covers a non-empty info field and
        # cate_id == 2 or the presence of the "中标" keyword.
        if record.is_valid_bid():
            first_bid_company, manager = ifm.get_information(record)
            row = {
                'bidid': bid_id,
                'first_bidcompany': str(first_bid_company),
                'manager': str(manager),
                'tablename': source_table,
            }
            cnn.insert_data_with_table_name('stang_bid_extract_zid', **row)
            # Commit per record so finished rows survive a later failure.
            cnn.db.commit()

    write_bid_new_start_id(last_id)
Beispiel #2
0
    def main_with_open(self):
        """Batch-extract every ``stang_cbid`` record whose ``cate_id`` is 2.

        The ids are fetched through ``self._get_ids``. For each id a
        ``BidData`` object is built, ``self.ifm.get_information`` is run on
        it, and elements 0 and 1 of the result are stored (as strings) in
        ``stang_bid_extract_zid`` as ``first_bidcompany`` and ``manager``.
        Each insert is committed individually.

        Any exception raised while handling one id is printed and the loop
        moves on to the next id, so a single bad record does not stop the
        batch.

        Returns:
            None.
        """
        bid_ids = self._get_ids("SELECT id FROM stang_cbid WHERE cate_id = 2")
        cnn = DataSQL()
        for bid_id in bid_ids:
            try:
                # Per the original note, BidData loads the row for this id on
                # construction and exposes id/title/info/cate_id/table_name.
                record = BidData(cnn, pattern, bid_id, 'stang_cbid')
                extracted = self.ifm.get_information(record)
                row = {
                    'bidid': bid_id,
                    'first_bidcompany': str(extracted[0]),
                    'manager': str(extracted[1]),
                    'tablename': 'stang_cbid',
                }
                cnn.insert_data_with_table_name('stang_bid_extract_zid', **row)
                # NOTE: the browser preview of the merged HTML
                # (self.open_html.open_html) is currently disabled here.
                cnn.db.commit()
            except Exception as err:
                # Best effort: report the failure and continue with the next id.
                print(err)
Beispiel #3
0
def open_nijian_data_summary(df, ifm, cnn, open):
    """Preview summaries for a random selection of up to 80 records.

    The rows of ``df`` are shuffled with ``sample(frac=1)`` and the first 80
    values of its ``'id'`` column are taken. For each id a ``BidData`` object
    is loaded from ``stang_bid_new``, ``ifm.get_summary`` is applied, and the
    stringified result is passed to ``open.open_html`` as ``extra_text``.

    Args:
        df: DataFrame with an ``'id'`` column.
        ifm: Object providing ``get_summary(bid_data)``.
        cnn: Database wrapper handed to ``BidData``.
        open: Object providing ``open_html(id, table_name, extra_text=...)``.
            Note that this parameter shadows the builtin ``open`` inside the
            function.

    Returns:
        None.
    """
    shuffled = df.sample(frac=1)
    for bid_id in shuffled['id'].head(80):
        record = BidData(cnn, pattern, bid_id, 'stang_bid_new')
        summary = ifm.get_summary(record)
        open.open_html(bid_id, 'stang_bid_new', extra_text=str(summary))
Beispiel #4
0
        if not data.is_valid():
            return None
        html_dfs = data.get_dfs()
        if html_dfs is None:
            return None
        # 筛选出有效的df
        html_dfs_filter_obj = filter(lambda df: self.DATA_FRAME_FILTER_KEY_WORDS.findall(df.to_string()), html_dfs)
        html_dfs = list(html_dfs_filter_obj)
        if not html_dfs:
            return None
        return html_dfs




# Manual smoke test: load a handful of records from ``stang_bid_new`` and
# print what ``DataFramePre.get_all_valid_pandas_df`` returns for each.
if __name__ == '__main__':
    # Imports are local to the script entry point.
    from extraction import pattern
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    from algorithm.feature_compute.compute import Computation
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os
    # Move the working directory up one level before connecting
    # (presumably so relative paths resolve from the project root - confirm).
    os.chdir(os.pardir)
    cnn = DataSQL()
    df_p = DataFramePre(pattern)
    # Hand-picked record ids used as sample inputs.
    test_ids = [8897, 23599, 9751767, 8703, 19599]
    for i in test_ids:
        bid_data = BidData(cnn, pattern, i, 'stang_bid_new')
        # print(bid_data.info)
        print(df_p.get_all_valid_pandas_df(bid_data))
Beispiel #5
0
 def test_one(self, id, table_name):
     """Return the first-bid amount candidates for a single record.

     Builds a ``BidData`` object for ``id`` in ``table_name`` using
     ``self.cnn`` and returns whatever ``self.ifm.get_first_bidmoney``
     produces for it.
     """
     bid_record = BidData(self.cnn, pattern, id, table_name)
     return self.ifm.get_first_bidmoney(bid_record)
Beispiel #6
0
    print(
        '这是附近查找单个',
        dst.get_surrounding_cell_text_from_one_df(df,
                                                  pattern.MORE_KEY_PATTERN))
    print(
        '这是附近查找多个',
        dst.get_surrounding_cell_text_from_dfs(list((df, df)),
                                               pattern.FIRST_KEY_PATTERN))
    # 测试位置
    # import doctest
    # print(doctest.testmod())
    from extraction import pattern
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    from algorithm.feature_compute.compute import Computation
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os
    from algorithm.bid_information import Information
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    cnn = DataSQL()
    cpt = Computation(pattern.KEY_PATTERN)
    ifm = Information(pattern)
    ids = [4000115, 125452, 8000303, 8000296, 8000147]
    for id in ids:
        bid_data = BidData(cnn, pattern, id, 'stang_bid_new')
        try:
            print(ifm.cpt.get_df_type_pro(bid_data.get_dfs()[0], True))
        except Exception as e:
            print(e)
Beispiel #7
0
from algorithm.sequence_search import SequenceSearch


class Res:
    """Small wrapper that owns a ``SequenceSearch`` instance as ``self.sqs``."""

    def __init__(self, pattern):
        """Build the wrapped searcher.

        Args:
            pattern: Passed unchanged to the ``SequenceSearch`` constructor.
        """
        searcher = SequenceSearch(pattern)
        self.sqs = searcher

    def main(self, id, table_name):
        """Delegate to ``self.sqs.get_target`` and return its result.

        NOTE(review): the script section of this file calls ``get_target``
        with three arguments (sequence, key pattern, extractor function),
        whereas this method forwards ``id`` and ``table_name`` only - confirm
        that this two-argument call is intended.
        """
        target = self.sqs.get_target(id, table_name)
        return target


# Manual smoke test: run the sequence search over the tag sequence of a few
# records and print the institutions it finds.
if __name__ == '__main__':
    import os
    from extraction import pattern
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    from extraction.bid_data import BidData
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    # Cannot be tested: the model fails to load.
    ner = Ner()
    # Move the working directory up two levels before connecting
    # (presumably so relative paths resolve from the project root - confirm).
    os.chdir(os.pardir)
    os.chdir(os.pardir)
    df_pre = Res(pattern)
    cnn = DataSQL()
    # NOTE(review): this object is not referenced again in this block.
    bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new')
    # Hand-picked record ids used as sample inputs.
    ids = [8160283, 10115126, 10080145, 9493171, 10125682, 10125613]
    for i in ids:
        data = BidData(cnn, pattern, i, 'stang_bid_new')
        # Calls the wrapped SequenceSearch directly rather than Res.main.
        print(
            df_pre.sqs.get_target(data.get_tag_sequence(),
                                  pattern.FIRST_KEY_PATTERN,
                                  ner.get_institution_from_string))
Beispiel #8
0
def test_summary_main(id, ifm, cnn, open):
    """Return the summary extracted for one ``stang_bid_new`` record.

    Args:
        id: Record id to load.
        ifm: Object providing ``get_summary(bid_data)``.
        cnn: Database wrapper handed to ``BidData``.
        open: Accepted but not used by this function.

    Returns:
        Whatever ``ifm.get_summary`` returns for the loaded record.
    """
    bid_record = BidData(cnn, pattern, id, 'stang_bid_new')
    return ifm.get_summary(bid_record)
Beispiel #9
0
            info_sequence, key_locs, get_target_function)
        res_next_tem = self._find_target_in_next_position(
            info_sequence, next_locs, get_target_function)
        if res_key_tem:
            res += res_key_tem
        if res_next_tem:
            res += res_next_tem
        if not res:
            return None
        return res


# Manual smoke test: run SequenceSearch.get_target on the info sequence of
# one record and print the institutions it finds.
if __name__ == '__main__':
    from extraction import pattern
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    from algorithm.feature_compute.compute import Computation
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os

    # Move the working directory up one level before connecting
    # (presumably so relative paths resolve from the project root - confirm).
    os.chdir(os.pardir)
    cnn = DataSQL()
    bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new')
    # NOTE(review): cpt is not referenced again in this block.
    cpt = Computation(pattern.KEY_PATTERN)
    ner = Ner()
    sequence_search = SequenceSearch(pattern)
    # Arguments: the record's info sequence, the key pattern to look for, and
    # the function applied to matching text to pull out institutions.
    print(
        sequence_search.get_target(bid_data.get_info_sequence(),
                                   pattern.FIRST_KEY_PATTERN,
                                   ner.get_institution_from_string))
Beispiel #10
0
 def main(self, id, table_name):
     """Extract the bid information fields for a single record.

     A ``BidData`` object is created for ``id`` in ``table_name`` using
     ``self.cnn``. Per the original author's note, it reads the row on
     construction and keeps id, title, info, cate_id and table_name as
     attributes, with helpers such as ``get_info_text`` returning the
     tag-stripped info content. The object is then handed to
     ``self.ifm.get_information``, the main extraction routine, and its
     result is returned unchanged.
     """
     bid_record = BidData(self.cnn, pattern, id, table_name)
     return self.ifm.get_information(bid_record)
Beispiel #11
0
        text_segs = info_soup.find_all(name=['p', 'div', 'span'])
        res = []
        for each in text_segs:
            each_text = each.text
            # 为了解决id为11269279这样的情况,所有的信息为一个大的txt, 所以必须要限定文本的长度。
            if key_pattern.findall(each_text) and len(str(each_text)) < 50:
                target = get_target_function(each.text)
                if target is not None and target:
                    res += target
        if not res:
            return None
        return res


# Manual smoke test: run TagSearch.get_target on the parsed info HTML of one
# record and print the institutions it finds.
if __name__ == '__main__':
    from extraction import pattern
    from algorithm.nlp_algorithm.ltp_algorithm import Ner
    from algorithm.feature_compute.compute import Computation
    from extraction.bid_data import BidData
    from algorithm.create_df.read_data_lib.data_base import DataSQL
    import os
    # Move the working directory up one level before connecting
    # (presumably so relative paths resolve from the project root - confirm).
    os.chdir(os.pardir)
    cnn = DataSQL()
    ner = Ner()
    bid_data = BidData(cnn, pattern, 23540, 'stang_bid_new')
    tag_search = TagSearch()
    # Arguments: the record's info soup, the key pattern to look for, and the
    # function applied to matching text to pull out institutions.
    print(
        tag_search.get_target(bid_data.get_info_soup(),
                              pattern.FIRST_KEY_PATTERN,
                              ner.get_institution_from_string))