def parse(pageString):
    """Pull the date, address and text sections out of a page's HTML.

    Args:
        pageString: HTML markup of the page; parsed with ``html.parser``.

    Returns:
        A dict with these keys:

        * ``date`` and ``addr`` -- derived from the text of
          ``<div id="qtDay">``. This part is best effort: if anything in it
          fails, the exception is printed and whichever of the two keys had
          not been set yet is simply absent from the result.
        * ``box2Content`` -- text of the first ``<div class="box2Content">``.
        * ``content`` -- text of the fifth ``<p>`` inside
          ``<div id="content">``.
        * ``bx2`` -- text of the first ``<div class="bx2">`` passed through
          ``addLine``.

    Raises:
        AttributeError: if the ``box2Content``, ``content`` or ``bx2`` div is
            not in the page (``find`` returns ``None`` in that case).
        IndexError: if ``<div id="content">`` holds fewer than five ``<p>``.
    """
    result = {}
    soup = BeautifulSoup(pageString, "html.parser")

    qt_day = soup.find("div", {"id": "qtDay"})
    try:
        # Raw strings: "\s" and "\(" are invalid escapes in a normal literal.
        date_matches = findMatchedTexts(qt_day.text, r"201[\s\S]+")
        pieces = date_matches[0].split("\r\n ")
        # First piece with its newlines turned into ". ", then the second.
        result['date'] = "{} {}".format(pieces[0].replace("\n", ". "), pieces[1])

        # The address is taken from the first parenthesised run of text.
        paren_matches = findMatchedTexts(qt_day.text, r"\(.+\)")
        result['addr'] = getAddr(paren_matches[0])
    except Exception as e:
        # Deliberately tolerant: a missing or oddly shaped qtDay block must
        # not prevent the remaining sections from being returned.
        print(e)

    result['box2Content'] = soup.find("div", {"class": "box2Content"}).text

    paragraphs = soup.find("div", {"id": "content"}).find_all("p")
    result['content'] = paragraphs[4].text

    guide_text = soup.find("div", {"class": "bx2"}).text
    result['bx2'] = addLine(guide_text)
    return result
def get_row(tr):
    """Extract one entry from a table row.

    Args:
        tr: parsed row element exposing ``find_all('td')``. The first cell
            carries the title link, the fourth the count and the sixth the
            ``datatype`` spans.

    Returns:
        A dict with keys ``api_id``, ``title``, ``subtitle``, ``count`` and
        ``service_types``. Every field except ``count`` is best effort: when
        its extraction fails a marker line is printed and the field keeps its
        empty default (``''``, or ``[]`` for ``service_types``).

    Raises:
        IndexError: if the row has no cells at all, or fewer than four
            (``count`` is read from the fourth cell without a guard).
    """
    tds = tr.find_all('td')
    # Markup before the "tit_info" span holds the title; the markup right
    # after the span's opening tag starts with the subtitle.
    anchor_parts = str(tds[0].find('a')).split('<span class="tit_info">')

    first = ''
    try:
        # Remove tab-delimited runs, then keep the second line of the markup.
        first = re.compile('\t.*\t').sub('', anchor_parts[0]).split('\n')[1]
        first = first.replace('R&amp;amp;amp;D ', '')
    except Exception:
        # Was a bare ``except``, which also trapped KeyboardInterrupt/SystemExit.
        print('----------')

    second = ''
    try:
        second = anchor_parts[1].split('</span>')[0]
        second = second.replace('R&amp;D ', '')
    except Exception:
        print('---------')

    api_id = ''
    try:
        href = tds[0].find('h4').find('a')['href']
        # Raw string: "\(" is an invalid escape in a normal literal.
        api_id = findMatchedTexts(href, r"javascript:view\('[0-9]+")[0]
        # Strip the call prefix so only the digits remain.
        api_id = api_id.replace("javascript:view('", "")
    except Exception:
        print('----api id exception -----')

    service_types = []
    try:
        type_spans = tds[5].find('div', {'class': 'datatype'}).find_all('span')
        service_types = [span.text for span in type_spans]
    except Exception:
        print('----- serivce types exception -------')

    return {
        'api_id': api_id,
        'title': first,
        'subtitle': second,
        'count': tds[3].text,
        'service_types': service_types,
    }
from bs4 import BeautifulSoup

from libs.crawler import crawl
from libs.patternMatcher import findMatchedTexts

# Company-name autocomplete endpoint; textCrpNm is the percent-encoded UTF-8
# form of "셀트리온".
# NOTE(review): the trailing "_=" value looks like a cache-busting timestamp
# -- confirm whether the endpoint actually needs it.
url = "http://dart.fss.or.kr/corp/searchAutoComplete.do?textCrpNm=%EC%85%80%ED%8A%B8%EB%A6%AC%EC%98%A8&_=1561171426973"

pageString = crawl(url)
bsObj = BeautifulSoup(pageString, "html.parser")

# Names that start with "셀트리온" followed by Hangul syllables, digits or
# ASCII letters. The class previously ended in "A-z", a range that also
# covers the punctuation between "Z" and "a" ([ \ ] ^ _ `); "A-Z" is what
# was meant.
names = findMatchedTexts(bsObj.text, "셀트리온[가-힣0-9a-zA-Z]*")

print(names)
for name in names:
    print(name)