Beispiel #1
0
    def text2dict(target_number: str, text_file_path: str) -> dict:
        """Build an attribute dict for one case from a converted text file.

        The case marker is ``target_number`` with ``県内`` removed.  Lines
        that do not contain the marker are skipped until a line is found in
        which the marker is not directly preceded by a list separator
        (``,`` or ``、``); after that every non-empty line is passed to
        ``StringUtil.set_key_value`` and the first value reported for each
        key is kept.

        The marker is matched as a literal substring, so characters that are
        special in regular expressions are compared verbatim.

        Args:
            target_number: Case label whose section should be read.
            text_file_path: Path of the UTF-8 text file to scan.

        Returns:
            Mapping of keys reported by ``StringUtil.set_key_value`` to the
            first value seen for each; empty if nothing was recognised.
        """
        book_mark = re.sub(r'県内', '', target_number)
        list_separators = (',', '、')
        data_dict = {}
        searching = True

        with open(text_file_path, 'rt', encoding='utf-8') as input_file:
            for raw_line in input_file:
                text = raw_line.strip()

                # Look for the line that opens this case's section.
                if searching:
                    position = text.find(book_mark)
                    if position < 0:
                        continue
                    # A marker written as part of an 'n-1,n例目' list does not
                    # open the section.  A marker at the very start of the
                    # line has no preceding character at all, so it must not
                    # be compared against text[-1] (the end of the line).
                    if position == 0 or text[position - 1] not in list_separators:
                        searching = False
                    # NOTE(review): a line rejected above is still offered to
                    # set_key_value below, exactly as before — confirm that
                    # this is intended.

                if text != '':
                    # Store age / sex / confirmation date / residence.
                    flg, key, value = StringUtil.set_key_value(text)
                    if flg and key not in data_dict:
                        data_dict[key] = value
        return data_dict
Beispiel #2
0
    def create_new_patient_dict(self, patient_soup, scr):
        """Collect patient records from the elements following the first h4.

        Starting at the element returned by ``scr.find_h4(patient_soup)``,
        the following siblings are walked in order.  Every ``h4`` closes the
        current record and opens a new one; every other sibling is passed to
        ``StringUtil.set_key_value`` and the first value per key is kept; an
        ``h2`` closes the current record and ends the walk.  Records whose
        ``No`` contains a digit are stored in ``self.patient_list``.

        Args:
            patient_soup: Parsed document handed to ``scr.find_h4``.
            scr: Scraper object providing ``find_h4``.
        """

        def finalize(record):
            # Fill in the fixed fields and normalise the date.  The date key
            # may be missing when a heading carried no patient details, so it
            # is looked up with .get() instead of raising KeyError.
            record["退院"] = None
            if record.get("発生判明日") is not None:
                record["発生判明日"] = TimeUtil().convert_wareki_to_ad(
                    record["発生判明日"])
            return self.sort_patients_dict(record)

        # Extract only the data at the target level of the document.
        data = scr.find_h4(patient_soup)
        patients = []
        patient = {"No": StringUtil.exclude_info_number(data.text)}
        # Used to drop cases from outside the prefecture.
        inside_checker = StringUtil()

        for sibling in data.next_siblings:
            target = sibling.text if hasattr(sibling, "text") else sibling

            # An h4 starts a new patient record.
            if sibling.name == 'h4':
                # Headings that carried no patient details are not stored.
                if len(patient) > 1 and inside_checker.exclude_outside(patient["No"]):
                    patients.append(finalize(patient))
                patient = {"No": StringUtil.exclude_info_number(target)}
                continue

            # Store age / sex / confirmation date / residence.
            flg, key, value = StringUtil.set_key_value(target)
            if flg and key not in patient:
                patient[key] = value

            # An h2 marks the end of the patient listing.
            # NOTE(review): unlike the h4 branch, the last record is stored
            # without the exclude_outside / detail-count guard — confirm.
            if sibling.name == 'h2':
                patients.append(finalize(patient))
                break

        print(patients)
        self.patient_list = [
            entry for entry in patients
            if re.search(r'\d', entry['No']) is not None
        ]
Beispiel #3
0
    def get_patient_dict(self, source_url, target_url, scr):
        """Download the case PDFs, parse them and build the patient data.

        Each table row accepted by ``StringUtil.exclude_outside`` has its PDF
        downloaded; after ``PdfParser.execute_pdf2text()`` has run, the text
        files are parsed with ``TextParser.text2dict`` and the results are
        appended to ``self.patient_list``.  ``self.patients['data']`` is set
        to the numbered cases in ascending order followed by the cases whose
        ``No`` has no digit, and the summary is rebuilt from that list.

        Args:
            source_url: Passed to ``scr.getTargetUrl`` as the first argument.
            target_url: Passed to ``scr.getTargetUrl`` as the second argument.
            scr: Scraper providing ``getTargetUrl``, ``getContent``,
                ``parseSingleTable`` and ``downloadPdf``.

        Returns:
            Tuple ``(self.patients, self.patients_summary_data)``.

        Raises:
            ValueError: Propagated unchanged from ``scr.parseSingleTable``.
        """

        def case_number(entry):
            # Numeric part of the label; the report suffix may have any
            # number of digits (e.g. 第10報), so it is stripped with \d+.
            return int(re.sub(r'県|内|例|目|第\d+報', '', entry['No']))

        # Fetch the page listing the cases and read its table.
        url = scr.getTargetUrl(source_url, target_url)
        soup = scr.getContent(url)
        dataset = scr.parseSingleTable(soup)

        # Create the folder the PDFs are stored in.
        path_op = PathOperator()
        path_op.create_path('pdf')

        # Used to drop cases from outside the prefecture.
        inside_checker = StringUtil()
        patient_data_tmp = []

        for data in dataset:
            if not inside_checker.exclude_outside(data[0]):
                continue

            # Build the download file name and the matching text file path.
            file_name = path_op.set_downlaod_file_name(
                'pdf', path_op.get_file_name(data[3]))
            scr.downloadPdf(data[3], file_name)
            output_file = os.path.splitext(os.path.basename(file_name))[0]
            output_path = os.path.join('./text', output_file + '.txt')
            is_duplicated, number_char = StringUtil().is_duplicate_data(data[0])

            # A row covering several cases yields one entry per case;
            # otherwise number_char is the single case label.
            numbers = number_char if is_duplicated else [number_char]
            for number in numbers:
                patient_data_tmp.append(
                    {"No": number, "発生判明日": data[2].strip(), "link": output_path})

        # Convert the downloaded PDFs to text.
        path_op.create_path('text')
        PdfParser.execute_pdf2text()

        # Build one patient record per entry from its text file.
        for tmp in patient_data_tmp:
            result = TextParser.text2dict(tmp['No'], tmp['link'])
            result.update(tmp)
            result.update({'退院': None})
            del result['link']
            self.patient_list.append(result)

        # Entries whose No has no digit (e.g. "再陽性") go after the
        # numerically sorted ones.
        nan_patient_list = [
            p for p in self.patient_list if re.search(r'\d', p['No']) is None]
        number_patient_list = [
            p for p in self.patient_list if re.search(r'\d', p['No']) is not None]
        insert_patient_list = sorted(number_patient_list, key=case_number)
        insert_patient_list.extend(nan_patient_list)
        self.patients['data'] = insert_patient_list

        # Build the patients summary.
        self.create_patients_summary_dict(insert_patient_list)
        return self.patients, self.patients_summary_data
Beispiel #4
0
def getPatientDict(index_html, scr, update_datetime):
    """Download the case PDFs and build the patient and summary documents.

    Each table row accepted by ``StringUtil.exclude_outside`` has its PDF
    downloaded; after ``PdfParser().executeConvert()`` has run, the text
    files are parsed with ``TextParser().text2dict``.  The per-day subtotal
    is the number of collected cases whose third table column equals the
    day's ``日付``.

    Args:
        index_html: Passed to ``scr.getTargetUrl`` as the first argument.
        scr: Scraper providing ``getTargetUrl``, ``getContent``,
            ``parseSingleTable`` and ``downloadPdf``.
        update_datetime: Value stored under ``date`` in both results.

    Returns:
        Tuple ``(patients, patients_summary_data)`` of dicts.

    Raises:
        ValueError: Propagated unchanged from ``scr.parseSingleTable``.
    """
    from collections import Counter

    def case_number(entry):
        # Numeric part of the label, with any report suffix (第N報) removed.
        return int(re.sub(r'県|内|例|目|第\d+報', '', entry['No']))

    # Fetch the page listing the cases and read its table.
    url = scr.getTargetUrl(index_html, 'info_coronavirus_prevention.html')
    soup = scr.getContent(url)
    dataset = scr.parseSingleTable(soup)

    # Create the folder the PDFs are stored in.
    path_op = PathOperator()
    path_op.create_path('pdf')

    # Used to drop cases from outside the prefecture.
    inside_checker = StringUtil()
    patient_data_tmp = []
    patients_summary_tmp = []
    for data in dataset:
        if not inside_checker.exclude_outside(data[0]):
            continue

        # Build the download file name and the matching text file path.
        file_name = path_op.set_downlaod_file_name(
            'pdf', path_op.get_file_name(data[3]))
        scr.downloadPdf(data[3], file_name)
        output_file = os.path.splitext(os.path.basename(file_name))[0]
        output_path = os.path.join('./text', output_file + '.txt')
        is_duplicated, number_char = StringUtil().is_duplicate_data(data[0])

        # A row covering several cases yields one entry per case;
        # otherwise number_char is the single case label.
        numbers = number_char if is_duplicated else [number_char]
        for number in numbers:
            patient_data_tmp.append({
                "No": number,
                "リリース日": data[1].strip(),
                "link": output_path
            })
            patients_summary_tmp.append(data[2].strip())

    # Convert the downloaded PDFs to text.
    path_op.create_path('text')
    convert_txt = PdfParser()
    convert_txt.executeConvert()

    # Build one patient record per entry from its text file.
    parser = TextParser()
    patient_list = []
    for tmp in patient_data_tmp:
        result = parser.text2dict(tmp['No'], tmp['link'])
        result.update(tmp)
        result.update({'退院': None})
        del result['link']
        patient_list.append(result)

    patients = {}
    patients['__comments'] = "陽性患者の属性"
    patients['date'] = update_datetime
    # Entries whose No has no digit (e.g. "再陽性") go after the numerically
    # sorted ones.
    nan_patient_list = [
        p for p in patient_list if re.search(r'\d', p['No']) is None]
    number_patient_list = [
        p for p in patient_list if re.search(r'\d', p['No']) is not None]
    insert_patient_list = sorted(number_patient_list, key=case_number)
    insert_patient_list.extend(nan_patient_list)
    patients['data'] = insert_patient_list

    # Build the patients summary.
    patients_summary_data = {}
    patients_summary_data['__comments'] = "陽性患者数"
    patients_summary_data['date'] = update_datetime
    patients_summary = TimeUtil().create_dt_dict(datetime.datetime.now())
    # Count every occurrence of a date.  Grouping only adjacent equal dates
    # would let a later run of the same date overwrite an earlier one.
    daily_counts = Counter(patients_summary_tmp)
    if daily_counts:
        patients_summary = [
            {
                "日付": row["日付"],
                "小計": daily_counts.get(row["日付"], row["小計"])
            }
            for row in patients_summary
        ]

    # Keep the list only up to the latest date with a non-zero subtotal.
    patients_summary = sorted(patients_summary, key=lambda x: x['日付'])
    jc = JsonChecker()
    patients_summary = jc.exclude_zero_max_date(patients_summary)
    patients_summary_data['data'] = patients_summary

    return patients, patients_summary_data
Beispiel #5
0
def stu_object():
    """Yield a newly constructed ``StringUtil`` instance."""
    yield StringUtil()
Beispiel #6
0
 def test_english(self):
     """``is_ascii`` is truthy for the code point of the letter 'a'."""
     code_point = ord('a')
     assert StringUtil().is_ascii(code_point)
Beispiel #7
0
 def test_chinese(self):
     """``is_ascii`` is falsy for the code point of '中'."""
     code_point = ord('中')
     is_ascii = StringUtil().is_ascii(code_point)
     assert not is_ascii
Beispiel #8
0
 def test_chinese_puntuation(self):
     """``is_ascii`` is falsy for the code point of '。'."""
     code_point = ord('。')
     is_ascii = StringUtil().is_ascii(code_point)
     assert not is_ascii
Beispiel #9
0
 def test_english_puntuation(self):
     """``is_ascii`` is truthy for the code point of ','."""
     code_point = ord(',')
     assert StringUtil().is_ascii(code_point)
 def test_all_chinese(self):
     """Check ``get_cut_position`` on '中国,怎么办' for second arguments 1..11."""
     text = '中国,怎么办'
     # expected_positions[i] is the required result for second argument i + 1.
     expected_positions = [0, 0, 1, 1, 1, 1, 1, 1, 3, 3, 3]
     for width, expected in enumerate(expected_positions, start=1):
         assert StringUtil().get_cut_position(text, width) == expected
 def test_all_english(self):
     """For 'hello world', ``get_cut_position`` returns its second argument."""
     text = 'hello world'
     # Second argument runs from 1 up to and including len(text).
     for width, _ in enumerate(text, start=1):
         assert StringUtil().get_cut_position(text, width) == width
 def test_chinese_with_english(self):
     """Check ``get_cut_position`` on 'h我1是, 这,end' for second arguments 1..19."""
     text = 'h我1是, 这,end'
     # expected_positions[i] is the required result for second argument i + 1.
     expected_positions = [
         1, 1, 1, 2, 3, 3, 3, 3, 5, 6,
         6, 6, 6, 6, 6, 8, 9, 10, 11,
     ]
     for width, expected in enumerate(expected_positions, start=1):
         assert StringUtil().get_cut_position(text, width) == expected