Python get_soup_from_url Exemples, lib.scrape.get_soup_from_url Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : watch_c0_member.py Projet : danbi2990/workspace

import re
from pprint import pprint
from datetime import datetime
from collections import OrderedDict

import pandas as pd
from pymongo import UpdateOne, DeleteMany, InsertOne

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

# Get the total count shown on the search result page (text of the form
# "총 N명").
url_total = 'http://watch.peoplepower21.org/?act=&mid=AssemblyMembers&vid=&mode=search&name=&party=&region=&sangim=&gender=&age=&elect_num=&singlebutton='
soup = get_soup_from_url(url_total)
text = soup.get_text()
# Accept thousands separators inside the number: the commas are stripped
# below, but a plain \d+ would never let a value such as "1,234" match.
re_obj = re.search(r'총\s(\d[\d,]*)명', text)
if re_obj is None:
    # re.search returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    raise ValueError('count_total_from_web not found in page text')
count_total_from_web = int(re_obj.group(1).replace(',', ''))

# A total of zero is treated as a failed scrape.
if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

# Get Total from db
with MyMongo() as db:
    print('Get row count from db.')
    # Presumably loads the 'watch_member' table of the 'assembly' database as
    # a DataFrame (inferred from the helper's name) — TODO confirm.
    member_table = db.get_df_from_table('assembly', 'watch_member')
    # count_total_from_db = member_table.count()

    # NOTE(review): the snippet is cut off here; the body of this try block
    # is not visible.
    try:

Exemple #2

0

Afficher le fichier

import re
import os

import pandas as pd

from lib.scrape import get_soup_from_url

# Fetch the management-item page and collect every anchor under '.left'.
url = 'http://www.alio.go.kr/managementItem.do'
soup = get_soup_from_url(url)
all_a = soup.select('.left a')

# Map each anchor's text to the first 'C' + four-digit code found in the
# anchor's markup. An anchor without such a code raises AttributeError.
org_name_and_code = dict(
    (anchor.text, re.search(r'C\d\d\d\d', str(anchor)).group(0))
    for anchor in all_a
)
# print(org_name_and_code)

# df = pd.DataFrame(columns=['기관명', 'code', '신분', '1인당 복리후생비'])

# Seed the output file with a header row on first run so that read_csv below
# always finds the expected columns.
if not os.path.exists('welfare.tsv'):
    # Write UTF-8 explicitly: the header is Korean and pd.read_csv decodes as
    # UTF-8, so a locale-dependent default encoding (e.g. cp949 on Windows)
    # would make the freshly written file unreadable.
    with open('welfare.tsv', 'w', encoding='utf-8') as f:
        f.write('기관명\tcode\t신분\t1인당 복리후생비\n')

df = pd.read_csv('welfare.tsv', sep='\t', encoding='utf-8')

# Walk every scraped organisation; those whose code already appears in the
# 'code' column of df are reported and skipped.
for org_name, code in org_name_and_code.items():
    known_codes = df['code'].tolist()
    if code in known_codes:
        print(org_name, ': already exists, skipped.')
        continue

    # Report page URL for this organisation code.
    url2 = f'http://www.alio.go.kr/popReportTerm.do?apbaId={code}&reportFormRootNo=20801#toc-127'

Exemple #3

0

Afficher le fichier

from datetime import datetime
import pandas as pd

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url

# Scrape the committee ids offered by the <select id="sangim"> element.
url_committee = 'http://watch.peoplepower21.org/Committee'
soup = get_soup_from_url(url_committee)
select = soup.find('select', id='sangim')
if select is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('select#sangim not found on the Committee page')
options = select.find_all('option')
# Tag.get returns None for a missing attribute, so <option> elements without
# a value are skipped together with empty placeholder values instead of
# raising KeyError.
df_body = [o['value'] for o in options if o.get('value')]
new_com = pd.DataFrame(columns=['committee_id'], data=df_body)
# Committee name = id with spaces removed, followed by the suffix '위원회'.
new_com['committeename'] = new_com['committee_id'].str.replace(' ', '') + '위원회'
new_com['update_on'] = datetime.now()

with MyMongo() as db:
    prev_com = db.get_df_from_table('assembly', 'watch_committee')
    # NOTE(review): presumably archives previous rows that are absent from
    # the new scrape and stores the new ones, keyed by 'committee_id' —
    # confirm against MyMongo.archive_complement_and_dump_new.
    db.archive_complement_and_dump_new(prev_com, new_com, 'committee_id',
                                       'assembly', 'watch_committee',
                                       'watch_committee_archive')

Exemple #4

0

Afficher le fichier

#     cur_dir = os.path.split(os.getcwd())[0]
#     if cur_dir not in sys.path:
#         sys.path.append(cur_dir)

from datetime import datetime
import re
import pandas as pd

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

from pymongo import UpdateOne

# Read the total count shown on the RollBook page (text of the form
# "전체 N 건").
url_total = 'http://watch.peoplepower21.org/RollBook'
soup = get_soup_from_url(url_total)
text = soup.get_text()
# Capture only digits and thousands separators. The previous greedy (.+)
# could swallow unrelated text up to a later '건' on the same line and make
# int() fail.
re_obj = re.search(r'전체\s+(\d[\d,]*)\s+건', text)
if re_obj is None:
    # re.search returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on None.
    raise ValueError('count_total_from_web not found in page text')
count_total_from_web = int(re_obj.group(1).replace(',', ''))

# A total of zero is treated as a failed scrape.
if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

with MyMongo() as db:
    print('Get row count from db.')
    confer = db.get_table_obj('assembly', 'watch_conference')
    # NOTE(review): if confer is a pymongo Collection, count() is deprecated
    # and was removed in PyMongo 4; count_documents({}) is the replacement —
    # confirm the object type and migrate.
    count_total_from_db = confer.count({})

# Report both totals side by side.
print(f'Conferences from web: {count_total_from_web}')
print(f'Conferences from db: {count_total_from_db}')