Beispiel #1
0
def insert_global_covid():
    """Crawl global COVID data and insert it into the ``GlobalCOVID`` database.

    ``crawling_global_covid()`` supplies two datasets (daily and cumulative,
    per the variable names). Each is handed to ``insert_data`` together with
    the matching collection (``daily`` / ``cumul``), the key fields ``'date'``
    and ``'country'`` and ``check=True``. The current time and the value
    returned by each ``insert_data`` call are printed.

    Returns:
        True once both inserts have completed.
    """
    global_daily_df, global_cumul_df = crawling_global_covid()

    # Insert data - MongoDB, database name: 'GlobalCOVID'
    conn = conn_db()
    try:
        jobs = (
            ("insert_global_covid daily Updated: ", global_daily_df,
             conn.GlobalCOVID.daily),
            ("insert_global_covid cumul Updated: ", global_cumul_df,
             conn.GlobalCOVID.cumul),
        )

        print(datetime.now())
        for label, frame, collection in jobs:
            print(label,
                  insert_data(frame, collection, 'date', 'country',
                              check=True))
    finally:
        # Release the connection even when an insert raises.
        conn.close()

    return True
def upsert_inference_data(ADJ_CONST):
    """Store the latest forecast in ``DomesticCOVID.inference_FBProphet``.

    Runs ``predict_FBProphet_model(ADJ_CONST)``, upserts the returned
    prediction frame keyed on ``'date'`` (with ``reset=True``) and then adds
    one extra document ``{'date': ..., 'real': ...}`` built from the second
    value returned by the prediction function.

    Args:
        ADJ_CONST: Adjustment constant forwarded to
            ``predict_FBProphet_model``.

    Returns:
        True once the data has been written.
    """
    pred_df, prev_num = predict_FBProphet_model(ADJ_CONST)

    conn = conn_db()
    try:
        inference_FBProphet_collection = conn.DomesticCOVID.inference_FBProphet

        upsert_data(pred_df, inference_FBProphet_collection, ['date'],
                    reset=True)

        prev_doc = {'date': prev_num[0], 'real': prev_num[1]}
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the single-document replacement.
        inference_FBProphet_collection.insert_one(prev_doc)
    finally:
        # Release the connection even when a write raises.
        conn.close()

    return True
def predict_FBProphet_model(ADJ_CONST=1.):
    """Forecast seven future periods and convert them to per-step increments.

    Reads every ``[date, confirmed]`` pair from
    ``DomesticCOVID.domestic_cumul``, trains a model through
    ``train_FBProphet_model`` and predicts seven periods past the data.
    The seven forecast rows are then transformed as follows:

    * ``yhat`` is truncated to int; ``yhat``, ``yhat_lower`` and
      ``yhat_upper`` are re-based on the last stored ``confirmed`` value,
      multiplied by ``ADJ_CONST`` and rounded to whole numbers;
    * every row after the first is replaced by its difference to the
      previous row, and the bounds are rebuilt as
      ``difference * (bound / yhat)``;
    * the first row keeps its ``yhat``; its bounds are scaled by 1.1
      (lower) and 0.9 (upper).

    Args:
        ADJ_CONST: Multiplier applied to the re-based forecast values.

    Returns:
        tuple: ``(ret_df, prev_num)`` where ``ret_df`` has the columns
        ``date`` (``YYYY-MM-DD`` strings), ``yhat``, ``yhat_lower`` and
        ``yhat_upper``, and ``prev_num`` is the last ``[date, confirmed]``
        pair with its second element overwritten by 0.
    """
    conn = conn_db()
    try:
        domestic_cumul_collection = conn.DomesticCOVID.domestic_cumul
        total_data = [[elem['date'], elem['confirmed']]
                      for elem in domestic_cumul_collection.find({})]
    finally:
        conn.close()

    prev_num = total_data[-1]

    model = train_FBProphet_model(total_data)

    future = model.make_future_dataframe(periods=7)
    forecast = model.predict(future)

    value_cols = ['yhat', 'yhat_lower', 'yhat_upper']

    # Work on an explicit copy so the column assignments below never write
    # through a slice of ``forecast`` (SettingWithCopy).
    ret_df = forecast.iloc[-7:][['ds'] + value_cols].copy()
    ret_df['yhat'] = ret_df['yhat'].astype(int)
    ret_df = ret_df.rename(columns={'ds': 'date'})
    ret_df['date'] = ret_df['date'].apply(lambda x: str(x).split()[0])

    # Re-base on the last observed cumulative count and apply the adjustment.
    for col in value_cols:
        ret_df[col] = ((ret_df[col] - prev_num[-1]) * ADJ_CONST).round(0)

    # Bound/estimate ratios are taken before ``yhat`` is turned into steps.
    lower_ratio = (ret_df['yhat_lower'] / ret_df['yhat']).tolist()
    upper_ratio = (ret_df['yhat_upper'] / ret_df['yhat']).tolist()

    yhat = ret_df['yhat'].tolist()
    first_lower = ret_df['yhat_lower'].iloc[0] * 1.1
    first_upper = ret_df['yhat_upper'].iloc[0] * 0.9

    # Step-to-step differences of the re-based forecast.
    steps = [cur - prev for prev, cur in zip(yhat, yhat[1:])]

    ret_df['yhat'] = yhat[:1] + steps
    ret_df['yhat_lower'] = [first_lower] + [
        step * ratio for step, ratio in zip(steps, lower_ratio[1:])
    ]
    ret_df['yhat_upper'] = [first_upper] + [
        step * ratio for step, ratio in zip(steps, upper_ratio[1:])
    ]

    prev_num[1] = 0
    return ret_df, prev_num
Beispiel #4
0
def insert_domestic_covid_cumul():
    """Crawl the domestic cumulative data and insert it into MongoDB.

    The result of ``crawling_domestic_covid_cumul()`` is passed to
    ``insert_data`` for ``DomesticCOVID.domestic_cumul`` with the key field
    ``'date'`` and ``check=True``; the value returned by ``insert_data`` is
    printed.

    Returns:
        True once the insert has completed.
    """
    domestic_cumul_df = crawling_domestic_covid_cumul()

    conn = conn_db()
    try:
        domestic_cumul_collection = conn.DomesticCOVID.domestic_cumul

        print(
            "insert_domestic_covid_cumul Updated: ",
            insert_data(domestic_cumul_df,
                        domestic_cumul_collection,
                        'date',
                        check=True))
    finally:
        # Release the connection even when the insert raises.
        conn.close()

    return True
def insert_validate_FBProphet_data(ADJ_CONST=1.):
    """Recompute the validation frame and replace ``validate_FBProphet``.

    Calls ``validate_FBProphet_model(ADJ_CONST)``, drops the
    ``DomesticCOVID.validate_FBProphet`` collection and inserts the new
    validation frame with ``insert_data(..., check=False)``.

    Args:
        ADJ_CONST: Adjustment constant forwarded to
            ``validate_FBProphet_model``; the value that function returns
            is passed back to the caller.

    Returns:
        ``(True, ADJ_CONST)`` on success, or the bare value ``False`` when
        any step raised.
        NOTE(review): the two return shapes differ, so a caller that
        unpacks the result fails on the error path; kept unchanged because
        callers are not visible here.
    """
    print("ADJ_CONST(R0): ", ADJ_CONST)

    try:
        val_df, ADJ_CONST = validate_FBProphet_model(ADJ_CONST)

        conn = conn_db()
        try:
            validate_FBProphet_collection = (
                conn.DomesticCOVID.validate_FBProphet)
            validate_FBProphet_collection.drop()

            insert_data(val_df, validate_FBProphet_collection, check=False)
        finally:
            # Release the connection even when the drop/insert raises.
            conn.close()

        return True, ADJ_CONST

    except Exception as exc:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        print("Error Occured: insert_validate_FBProphet_data Function")
        print(repr(exc))
        return False
Beispiel #6
0
def insert_domestic_detailed_covid():
    """Crawl the detailed domestic data and upsert it into MongoDB.

    ``crawling_domestic_detailed_covid()`` returns
    ``((area_df, gender_df, age_df), cum_df)``. Each frame is upserted into
    the matching ``DomesticDetailedCOVID`` collection (``area``, ``gender``,
    ``age``, ``cumul``) with ``skip_na=True``; the value returned by each
    ``upsert_data`` call is printed.

    Returns:
        True once all four upserts have completed.
    """
    (area_df, gender_df, age_df), cum_df = crawling_domestic_detailed_covid()

    conn = conn_db()
    try:
        db = conn.DomesticDetailedCOVID

        # (log label, frame, target collection, key fields), in write order.
        jobs = (
            ("insert_domestic_detailed_covid - area: ", area_df, db.area,
             ['date', 'area']),
            ("insert_domestic_detailed_covid - gender: ", gender_df,
             db.gender, ['date', 'gender']),
            ("insert_domestic_detailed_covid - age: ", age_df, db.age,
             ['date', 'age']),
            ("insert_domestic_detailed_covid - cumul: ", cum_df, db.cumul,
             ['type', 'attr']),
        )

        for label, frame, collection, keys in jobs:
            print(label, upsert_data(frame, collection, keys, skip_na=True))
    finally:
        # Release the connection even when an upsert raises.
        conn.close()

    return True
def get_R0():
    """Compute a reproduction-number estimate from the stored totals.

    Reads the first ``DomesticDetailedCOVID.cumul`` document whose ``attr``
    is ``'소계'`` ("subtotal") and uses its ``confirmed``, ``death`` and
    ``released`` fields, which may contain thousands separators.

    The result is ``beta * s / gamma`` rounded to three decimals, where
    ``s = S / (S + I + R)``, ``S = TOTAL - I``, ``I`` is the confirmed count
    and ``R`` is released plus deaths.

    Returns:
        float: the rounded estimate.

    Raises:
        IndexError: if no matching document exists.
    """
    conn = conn_db()
    try:
        data = conn.DomesticDetailedCOVID.cumul.find({'attr': '소계'})[0]
    finally:
        # Release the connection even when the lookup raises.
        conn.close()

    # Confirmed, Death, Released. ``str()`` lets values already stored as
    # ints pass through the comma stripping unchanged.
    I, D, R = (int(str(data[key]).replace(',', ''))
               for key in ('confirmed', 'death', 'released'))
    R += D

    gamma = round(1 / 14, 2)
    beta = 0.1389

    # NOTE(review): presumably the total population used as the model's
    # population size -- confirm the source of this figure.
    TOTAL = 51780579

    S = TOTAL - I
    s = S / (S + I + R)
    Ro = round((beta * s) / gamma, 3)

    return Ro
def validate_FBProphet_model(ADJ_CONST):
    """Back-test the forecast against the last seven stored records.

    Reads every ``[date, confirmed]`` pair from
    ``DomesticCOVID.domestic_cumul``, trains on everything except the last
    seven records and predicts seven periods. The forecast columns are
    re-based on the ``confirmed`` value of the last training record, scaled
    by ``ADJ_CONST`` and converted to step-to-step increments in the same
    way as the live prediction; the observed values are converted to
    increments as well and stored in ``real``.

    Args:
        ADJ_CONST: Multiplier for the re-based forecast. The sentinel ``-1``
            requests a value from ``_find_opt_CONST`` instead.

    Returns:
        tuple: ``(result, ADJ_CONST)`` where ``result`` has the columns
        ``date``, ``yhat``, ``yhat_lower``, ``yhat_upper`` and ``real``, and
        ``ADJ_CONST`` is the multiplier actually applied.

    Raises:
        IndexError: if fewer than eight records are stored.
    """
    conn = conn_db()
    try:
        domestic_cumul_collection = conn.DomesticCOVID.domestic_cumul
        total_data = [[elem['date'], elem['confirmed']]
                      for elem in domestic_cumul_collection.find({})]
    finally:
        conn.close()

    train_data = total_data[:-7]
    prev_num = total_data[-8][1]

    # Observed values of the hold-out window, relative to the last
    # training value.
    val_data = [(elem[1] - prev_num) for elem in total_data[-7:]]

    model = train_FBProphet_model(train_data)

    future = model.make_future_dataframe(periods=7)
    forecast = model.predict(future)

    col_list = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
    value_cols = col_list[1:]

    # Work on an explicit copy so the column assignments below never write
    # through a slice of ``forecast`` (SettingWithCopy).
    result = forecast.tail(7)[col_list].copy()
    result = result.rename(columns={'ds': 'date'})
    result['date'] = result['date'].apply(lambda x: x.strftime("%Y-%m-%d"))

    # ``real`` must be present before _find_opt_CONST sees the frame.
    result['real'] = val_data

    for col in value_cols:
        result[col] = (result[col] - prev_num).round(0)

    if ADJ_CONST == -1:
        ADJ_CONST = _find_opt_CONST(result)

    for col in value_cols:
        result[col] = (result[col] * ADJ_CONST).round(0)

    # Turn the observed values into step-to-step increments.
    result['real'] = val_data[:1] + [
        cur - prev for prev, cur in zip(val_data, val_data[1:])
    ]

    # Bound/estimate ratios are taken before ``yhat`` is turned into steps.
    lower_ratio = (result['yhat_lower'] / result['yhat']).tolist()
    upper_ratio = (result['yhat_upper'] / result['yhat']).tolist()

    yhat = result['yhat'].tolist()
    first_lower = result['yhat_lower'].iloc[0] * 1.1
    first_upper = result['yhat_upper'].iloc[0] * 0.9

    steps = [cur - prev for prev, cur in zip(yhat, yhat[1:])]

    result['yhat'] = yhat[:1] + steps
    result['yhat_lower'] = [first_lower] + [
        step * ratio for step, ratio in zip(steps, lower_ratio[1:])
    ]
    result['yhat_upper'] = [first_upper] + [
        step * ratio for step, ratio in zip(steps, upper_ratio[1:])
    ]

    return result, ADJ_CONST
def insert_brefing_data(doc, collection, reset=True):
    """Insert ``doc`` into ``collection``, optionally dropping it first.

    Args:
        doc: A single document (mapping) or an iterable of documents.
        collection: Target collection object; must provide ``drop``,
            ``insert_one`` and ``insert_many``.
        reset: When true, the collection is dropped before inserting.

    Returns:
        True once the insert call has returned.
    """
    from collections.abc import Mapping

    if reset:
        collection.drop()

    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4. It accepted either one document or a list of documents,
    # so dispatch to the matching replacement.
    if isinstance(doc, Mapping):
        collection.insert_one(doc)
    else:
        collection.insert_many(doc)
    return True

# Usage example: python news_API.py 코로나 200 100
#   (the query word "코로나" means "corona")
# Collects 200 news articles and keeps only 100 of the collected articles.
if __name__ == '__main__': 
    
    # 1. Crawling and insert news data 
    # query = sys.argv[1] if len(sys.argv) > 1 else None 
    # if not query: print("Error occuered! => failed to get query word"); exit() 

    # news_query_num = sys.argv[2] if len(sys.argv) > 2 else 100
    # NOTE(review): the rest of this script is not visible here; within the
    # visible lines ``conn`` is never closed and neither
    # ``raw_news_collection`` nor ``query_list`` is used yet.
    conn = conn_db() 
    raw_news_collection = conn.NewsData.raw_news
    
    # Newline-separated search keywords (Korean COVID-related terms).
    query_list = '''코로나19
        바이러스
        감염증
        방역
        확진자
        확산
        접촉
        증상
        마스크
        격리
        거리두기'''

    # One keyword per line; strip() removes the literal's indentation.
    query_list = [elem.strip() for elem in query_list.split('\n')]
# Metric columns of the nationwide block and of every region block, in
# sheet order.
# NOTE(review): presumably ``_n`` = new cases and ``_c`` = cumulative cases
# -- confirm against the source sheet.
_COVID19_KOR_FULL_METRICS = (
    'overseas', 'domestic', 'confirmed_n', 'confirmed_c', 'iso_n', 'iso_c',
    'iso_rel_n', 'iso_rel_c', 'death_n', 'death_c',
)

# Metric columns of every gender and age block, in sheet order.
_COVID19_KOR_SHORT_METRICS = (
    'confirmed_n', 'confirmed_c', 'death_n', 'death_c',
)

# (column prefix, label stored in MongoDB), in sheet order.
_COVID19_KOR_GENDERS = (("m", "남성"), ("f", "여성"))

_COVID19_KOR_AGES = (
    ("00s", "0-9세"), ("10s", "10-19세"), ("20s", "20-29세"),
    ("30s", "30-39세"), ("40s", "40-49세"), ("50s", "50-59세"),
    ("60s", "60-69세"), ("70s", "70-79세"), ("80s", "80세 이상"),
)

_COVID19_KOR_AREAS = (
    ("seoul", "서울"), ("bs", "부산"), ("tk", "대구"), ("ic", "인천"),
    ("kj", "광주"), ("dj", "대전"), ("ulsan", "울산"), ("sj", "세종"),
    ("gg", "경기"), ("kw", "강원"), ("cb", "충북"), ("cn", "충남"),
    ("jb", "전북"), ("jn", "전남"), ("kb", "경북"), ("kn", "경남"),
    ("jj", "제주"), ("qs", "검역"),
)


def _covid19_kor_column_names():
    """Return the 234 positional column names of the transposed sheet."""
    names = list(_COVID19_KOR_FULL_METRICS)
    for prefix, _ in _COVID19_KOR_GENDERS + _COVID19_KOR_AGES:
        names.extend(prefix + '_' + metric
                     for metric in _COVID19_KOR_SHORT_METRICS)
    for prefix, _ in _COVID19_KOR_AREAS:
        names.extend(prefix + '_' + metric
                     for metric in _COVID19_KOR_FULL_METRICS)
    return names


def _covid19_kor_extract(df, prefix, label_column, label):
    """Return the ``<prefix>_*_n`` columns of ``df`` as a labelled frame.

    The result has a ``date`` column (the former index), one column per
    metric with the prefix and the ``_n`` suffix removed, and
    ``label_column`` filled with ``label``.
    """
    head = prefix + '_'
    # Match on prefix/suffix rather than on substrings so that e.g. 'ic'
    # can never pick up a column such as 'domestic'.
    col_list = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith(head)
        and col.endswith('_n')
    ]
    renamed = {col: col[len(head):-len('_n')] for col in col_list}

    ret_df = df[col_list].reset_index().rename(columns=renamed)
    ret_df[label_column] = label
    return ret_df


def covid19_kor_insert(file_name, encoding='cp949'):
    """Load the domestic COVID-19 sheet and upsert it into MongoDB.

    The CSV is transposed so that every row is one date, its 234 columns are
    named positionally, and three long-format frames are built from the
    ``*_n`` columns: per area (``confirmed``, ``isolated``, ``released``,
    ``death``), per gender and per age group (``confirmed``, ``death``).
    Each frame is printed and upserted into the matching
    ``DomesticDetailedCOVID`` collection (``area``, ``gender``, ``age``).

    Args:
        file_name: Path of the CSV file (data from 01.31 through
            2020.10.31, per the original note).
        encoding: Text encoding of the CSV. The previous hard-coded value
            ``'ANSI'`` is only a valid codec name on Windows; ``'cp949'`` is
            the Korean Windows code page and works on every platform.
    """
    # Load the data.
    df = pd.read_csv(file_name, encoding=encoding)

    # Move the index into a column; it becomes the first row after the
    # transpose and is dropped there.
    df = df.reset_index()

    # Remove the descriptive columns that are not needed.
    df = df.drop(['구분별', '상태별', '항목', '단위'], axis=1)

    # Transpose: one row per original column.
    df = df.transpose()

    df = df.drop(df.index[0])
    # NOTE(review): hard-coded position; presumably a trailing non-date
    # column after the 275 dated ones -- confirm against the sheet.
    df = df.drop(df.index[275])

    # Name the columns by position.
    df = df.rename(dict(enumerate(_covid19_kor_column_names())),
                   axis='columns')

    # Replace missing values.
    df = df.fillna(0)

    # Name the index 'date'.
    df = df.reset_index()
    df = df.rename({'index': 'date'}, axis='columns')
    df = df.set_index('date')

    def _with_dashed_dates(frame):
        # regex=False: '. ' is a literal separator, not a pattern. The
        # default of ``regex`` differs between pandas versions.
        frame = frame.copy()
        frame['date'] = frame['date'].str.replace('. ', '-', regex=False)
        return frame

    # Area data.
    covid19_area = pd.concat([
        _covid19_kor_extract(df, prefix, 'area', label)
        for prefix, label in _COVID19_KOR_AREAS
    ])
    covid19_area = covid19_area[[
        'date', 'area', 'confirmed', 'iso', 'iso_rel', 'death'
    ]]
    covid19_area = _with_dashed_dates(covid19_area).rename(
        columns={'iso': 'isolated', 'iso_rel': 'released'})
    print(covid19_area)

    # Gender data.
    covid19_gender = pd.concat([
        _covid19_kor_extract(df, prefix, 'gender', label)
        for prefix, label in _COVID19_KOR_GENDERS
    ]).reset_index(drop=True)
    covid19_gender = _with_dashed_dates(
        covid19_gender[['date', 'gender', 'confirmed', 'death']])
    print(covid19_gender)

    # Age data.
    covid19_age = pd.concat([
        _covid19_kor_extract(df, prefix, 'age', label)
        for prefix, label in _COVID19_KOR_AGES
    ])
    covid19_age = _with_dashed_dates(
        covid19_age[['date', 'age', 'confirmed', 'death']])
    print(covid19_age)

    # Insert the data into MongoDB.
    conn = conn_db()
    try:
        db = conn.DomesticDetailedCOVID

        print("crawling_covid19_area Updated: ",
              upsert_data(covid19_area, db.area, ['date', 'area']))
        print("crawling_covid19_gender Updated: ",
              upsert_data(covid19_gender, db.gender, ['date', 'gender']))
        print("crawling_covid19_age Updated: ",
              upsert_data(covid19_age, db.age, ['date', 'age']))
    finally:
        # Release the connection even when an upsert raises.
        conn.close()