Python get_soup_from_url Examples

Programming Language: Python

Namespace/Package Name: lib.scrape

Method/Function: get_soup_from_url

Examples at hotexamples.com: 4

Python get_soup_from_url - 4 examples found. These are the top-rated real-world Python examples of lib.scrape.get_soup_from_url, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: watch_c0_member.py Project: danbi2990/workspace

import re
from pprint import pprint
from datetime import datetime
from collections import OrderedDict

import pandas as pd
from pymongo import UpdateOne, DeleteMany, InsertOne

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

# Get Total From Web
# Fetch the member search page and read the total shown as "총 <N>명".
url_total = 'http://watch.peoplepower21.org/?act=&mid=AssemblyMembers&vid=&mode=search&name=&party=&region=&sangim=&gender=&age=&elect_num=&singlebutton='
soup = get_soup_from_url(url_total)
text = soup.get_text()
# Allow thousands separators ("총 1,234명"). The previous `\d+` pattern could
# never match a comma, so the `.replace(',', '')` below was dead code and a
# comma-grouped total produced no match at all.
re_obj = re.search(r'총\s(\d[\d,]*)명', text)
if re_obj is None:
    # Fail with a clear message instead of AttributeError on `None.group`.
    raise ValueError('count_total_from_web: total count not found on page')
count_total_from_web = int(re_obj.group(1).replace(',', ''))

# A zero total is treated as a scrape failure rather than a valid result.
if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

# Get Total from db
with MyMongo() as db:
    print('Get row count from db.')
    member_table = db.get_df_from_table('assembly', 'watch_member')
    # count_total_from_db = member_table.count()

    try:

Example #2

Show file

import re
import os

import pandas as pd

from lib.scrape import get_soup_from_url

# Collect {anchor text: code} for every link under `.left` on the page at
# `url`; the code is the first "C" + four digits found in the anchor's markup.
url = 'http://www.alio.go.kr/managementItem.do'
soup = get_soup_from_url(url)
all_a = soup.select('.left a')
org_name_and_code = {}
for a in all_a:
    code_match = re.search(r'C\d\d\d\d', str(a))
    if code_match is None:
        # Anchors without a code (e.g. plain navigation links) are skipped
        # instead of crashing on `None.group`.
        continue
    org_name_and_code[a.text] = code_match.group(0)

# Create the output file with its header row on first run.
# The encoding is pinned to UTF-8 because the header is Korean text and
# `pd.read_csv` decodes as UTF-8 by default; relying on the platform default
# for the write breaks the read on non-UTF-8 locales.
if not os.path.exists('welfare.tsv'):
    with open('welfare.tsv', 'w', encoding='utf-8') as f:
        f.write('기관명\tcode\t신분\t1인당 복리후생비\n')

df = pd.read_csv('welfare.tsv', sep='\t', encoding='utf-8')

for org_name, code in org_name_and_code.items():
    if code in df['code'].tolist():
        print(org_name, ': already exists, skipped.')
        continue

    url2 = f'http://www.alio.go.kr/popReportTerm.do?apbaId={code}&reportFormRootNo=20801#toc-127'

Example #3

Show file

from datetime import datetime
import pandas as pd

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url

# Scrape the committee ids offered by the <select id="sangim"> drop-down and
# hand them to the database layer together with the previously stored table.
url_committee = 'http://watch.peoplepower21.org/Committee'
soup = get_soup_from_url(url_committee)
select = soup.find('select', id='sangim')
if select is None:
    # Fail with a clear message instead of AttributeError on `None.find_all`.
    raise ValueError("select#sangim not found on committee page")
options = select.find_all('option')
# `.get` tolerates <option> tags that carry no `value` attribute at all;
# empty values (such as a placeholder entry) are dropped as before.
df_body = [o['value'] for o in options if o.get('value')]
new_com = pd.DataFrame(columns=['committee_id'], data=df_body)
# Literal (non-regex) removal of spaces, independent of the pandas version's
# default for `regex`.
new_com['committeename'] = (
    new_com['committee_id'].str.replace(' ', '', regex=False) + '위원회'
)
new_com['update_on'] = datetime.now()

with MyMongo() as db:
    prev_com = db.get_df_from_table('assembly', 'watch_committee')
    # NOTE(review): presumably archives rows missing from `new_com` and stores
    # the new ones, keyed on 'committee_id' - confirm against MyMongo.
    db.archive_complement_and_dump_new(prev_com, new_com, 'committee_id',
                                       'assembly', 'watch_committee',
                                       'watch_committee_archive')

Example #4

Show file

#     cur_dir = os.path.split(os.getcwd())[0]
#     if cur_dir not in sys.path:
#         sys.path.append(cur_dir)

from datetime import datetime
import re
import pandas as pd

from db.mongo import MyMongo
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

from pymongo import UpdateOne

# Compare the number of conferences listed on the web with the number of
# documents stored in the database.
url_total = 'http://watch.peoplepower21.org/RollBook'
soup = get_soup_from_url(url_total)
text = soup.get_text()
# Capture only digits and thousands separators. The previous greedy `(.+)`
# could swallow everything up to the last " 건" on the line, which then made
# `int()` fail on non-numeric text.
re_obj = re.search(r'전체\s(\d[\d,]*)\s건', text)
if re_obj is None:
    # Fail with a clear message instead of AttributeError on `None.group`.
    raise ValueError('count_total_from_web: total count not found on page')
count_total_from_web = int(re_obj.group(1).replace(',', ''))

# A zero total is treated as a scrape failure rather than a valid result.
if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

with MyMongo() as db:
    print('Get row count from db.')
    confer = db.get_table_obj('assembly', 'watch_conference')
    # NOTE(review): if `confer` is a pymongo Collection, `count()` was removed
    # in PyMongo 4 - switch to `count_documents({})` once confirmed.
    count_total_from_db = confer.count({})

print(f'Conferences from web: {count_total_from_web}')
print(f'Conferences from db: {count_total_from_db}')