import re from pprint import pprint from datetime import datetime from collections import OrderedDict import pandas as pd from pymongo import UpdateOne, DeleteMany, InsertOne from db.mongo import MyMongo from lib.scrape import get_soup_from_url from lib.util import print_bulk_result # Get Total From Web url_total = 'http://watch.peoplepower21.org/?act=&mid=AssemblyMembers&vid=&mode=search&name=&party=®ion=&sangim=&gender=&age=&elect_num=&singlebutton=' soup = get_soup_from_url(url_total) text = soup.get_text() re_obj = re.search(r'총\s(\d+)명', text) count_total_from_web = int(re_obj.group(1).replace(',', '')) # print(count_total_from_web) if not count_total_from_web: raise ValueError('count_total_from_web = 0') # Get Total from db with MyMongo() as db: print('Get row count from db.') member_table = db.get_df_from_table('assembly', 'watch_member') # count_total_from_db = member_table.count() try:
# Scrape the organisation list from the ALIO management-item page and walk
# every organisation whose code is not yet present in the local
# ``welfare.tsv`` file.
import os
import re

import pandas as pd

from lib.scrape import get_soup_from_url

url = 'http://www.alio.go.kr/managementItem.do'
soup = get_soup_from_url(url)
all_a = soup.select('.left a')

# Map each link's text to the organisation code ("C" followed by four digits)
# found in the link's markup.
org_name_and_code = {}
for a in all_a:
    code_match = re.search(r'C\d\d\d\d', str(a))
    if code_match is None:
        # An anchor without a code would otherwise abort the whole run with
        # an AttributeError on ``None.group``; presumably such anchors are
        # not organisation entries -- TODO confirm against the page.
        continue
    org_name_and_code[a.text] = code_match.group(0)

# Create the result file with just its header row on the first run.
# The encoding is pinned to UTF-8 because the header is Korean and
# ``pd.read_csv`` decodes as UTF-8; relying on the platform default encoding
# makes the read below fail on non-UTF-8 locales.
if not os.path.exists('welfare.tsv'):
    with open('welfare.tsv', 'w', encoding='utf-8') as f:
        f.write('기관명\tcode\t신분\t1인당 복리후생비\n')

df = pd.read_csv('welfare.tsv', sep='\t', encoding='utf-8')

for org_name, code in org_name_and_code.items():
    # The membership test is re-evaluated on every iteration on purpose: the
    # rest of the loop body is not visible here and may add rows to ``df``.
    if code in df['code'].tolist():
        print(org_name, ': already exists, skipped.')
        continue
    url2 = f'http://www.alio.go.kr/popReportTerm.do?apbaId={code}&reportFormRootNo=20801#toc-127'
# Read the committee ids offered on the Committee page and hand them, as a
# DataFrame, to the project's archive-and-dump helper for the
# ``assembly.watch_committee`` table.
from datetime import datetime

import pandas as pd

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url

url_committee = 'http://watch.peoplepower21.org/Committee'
soup = get_soup_from_url(url_committee)

# Collect the non-empty ``value`` of every <option> in the "sangim" dropdown.
select = soup.find('select', id='sangim')
committee_ids = []
for option in select.find_all('option'):
    if option['value']:
        committee_ids.append(option['value'])

new_com = pd.DataFrame(data=committee_ids, columns=['committee_id'])

# Display name: the id with spaces removed, followed by the '위원회' suffix.
compact_ids = new_com['committee_id'].str.replace(' ', '')
new_com['committeename'] = compact_ids + '위원회'
new_com['update_on'] = datetime.now()

with MyMongo() as db:
    prev_com = db.get_df_from_table('assembly', 'watch_committee')
    # Project helper keyed on 'committee_id'; presumably it archives rows
    # absent from the new scrape and stores the new frame -- confirm in
    # db.mongo.
    db.archive_complement_and_dump_new(
        prev_com,
        new_com,
        'committee_id',
        'assembly',
        'watch_committee',
        'watch_committee_archive',
    )
# cur_dir = os.path.split(os.getcwd())[0]
# if cur_dir not in sys.path:
#     sys.path.append(cur_dir)

# Compare the number of conferences listed on the RollBook page with the
# number of documents stored in ``assembly.watch_conference``.
import re
from datetime import datetime

import pandas as pd
from pymongo import UpdateOne

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

url_total = 'http://watch.peoplepower21.org/RollBook'
soup = get_soup_from_url(url_total)
text = soup.get_text()

# The total is read from the text between "전체 " and " 건"; thousands
# separators are stripped before conversion.
re_obj = re.search(r'전체\s(.+)\s건', text)
if re_obj is None:
    # Fail with a clear message instead of an AttributeError on None.group
    # when the page layout changes or the request returned something else.
    raise ValueError(f'total count pattern not found at {url_total}')
count_total_from_web = int(re_obj.group(1).replace(',', ''))
if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

with MyMongo() as db:
    print('Get row count from db.')
    confer = db.get_table_obj('assembly', 'watch_conference')
    # Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0.
    # Prefer count_documents() when the object's class provides it; checking
    # the class avoids attribute-style sub-collection lookup on the instance.
    # Assumes get_table_obj returns a pymongo Collection -- otherwise the
    # original count() call is kept.
    if hasattr(type(confer), 'count_documents'):
        count_total_from_db = confer.count_documents({})
    else:
        count_total_from_db = confer.count({})
    print(f'Conferences from web: {count_total_from_web}')
    print(f'Conferences from db: {count_total_from_db}')