def f():
    """Copy raw ETRI SRL results from the '뉴스_ETRI언어분석_원본' collection
    into the ETRIAI('WiseNLU', 'srl') model, one document per docid."""
    tbl = mongo1.db['뉴스_ETRI언어분석_원본']
    etri = models.ETRIAI('WiseNLU', 'srl')
    docids = etri.distinct(key='docid',
                           filter={'colname': 'bodytext', 'results': None})
    loop = dbg.Loop('뉴스_ETRI언어분석_원본', len(docids))
    for docid in docids:
        cursor = tbl.find({'뉴스id': docid},
                          {'_id': 0, '뉴스id': 1, '뉴스본문srl_res': 1})
        docs = list(cursor)
        if len(docs) == 1:  # `is 1` compares identity, not value; use `==`.
            d = docs[0]
            etri.modelnm = 'Article__네이버_모바일홈'
            etri.docid = d['뉴스id']
            etri.colname = 'bodytext'
            etri.results = d['뉴스본문srl_res']
            etri.update_doc(
                {
                    'modelnm': etri.modelnm,
                    'docid': etri.docid,
                    'colname': etri.colname
                }, True)
        loop.report()

def collect():
    self = Assemblymen()
    party = Party().get()
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}", len(party))
    for d in party.to_dict('records'):
        party.attributize(d)
        self.collect(party.name, party.code).parse()
        loop.report(addi_info=f" partyname : {party.name}")

def parse(self):
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        lay = MobileNewsHomeLayoutParser(self.html, self.pageuri)
        self.layout = lay.parse().docs
        self.update_doc({'_id': self._id}, True)
        loop.report(addi_info=f" snapshot_dt : {self.snapshot_dt}")
    return self

def collect():
    page = models.NewsPage()
    page.load()
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        ss = SnapshotCollector(page.pressname, page.name, page.url)
        ss.collect()
        loop.report(addi_info=page.url)

def collect(pressname=None, pagename=None):
    fr = dbg.Function(inspect.currentframe()).report_init()
    query = {}  # renamed from `filter` to avoid shadowing the built-in.
    if isinstance(pressname, str) and isinstance(pagename, str):
        query.update({'pressname': pressname, 'name': pagename})
    page = models.NewsPage().load(query)
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        c = Collector(page.pressname, page.name)
        c.collect()
        loop.report(
            addi_info=f" pressname : {page.pressname}, pagename : {page.name}")
    fr.report_fin()

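# Usage sketch (assumption, not from the source): this collector can be run for
# every registered NewsPage or narrowed to a single press/page pair. The names
# below are hypothetical placeholders.
#
#   collect()                                             # all pages
#   collect(pressname='SomePress', pagename='SomePage')   # one press/page pair
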
def parse(self):
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        soup = BeautifulSoup(d['html'], 'html.parser')
        self.detect_right_panel(soup)
        self.top_card()
        self.job_summary()
        self.job_description()
        self.how_you_match()
        self.competitive_intelligence_about_applicants()
        self.insight_look_at_company()
        self.commute()
        self.about_us()
        self.update_doc({'_id': d['_id']}, False)
        # break
        loop.report()

def collect(self):
    self.get_targets()
    if hasattr(self, 'article'):
        article = copy.copy(self.article)
        loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                        len(article.docs))
        for d in article.docs:
            article.attributize(d)
            self.docid = article._id
            jsondata = self.etriapi.api(text=getattr(article, self.targetcol))
            if isinstance(jsondata, dict):
                self.results = [jsondata]
                self.update_doc({'docid': self.docid}, True)
            loop.report(
                addi_info=(f" article {self.targetcol} excerpt : "
                           f"{getattr(article, self.targetcol)[:30]}"))

def collect(self):
    target_urls = self.get_targets()
    if target_urls is not None:
        loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                        len(target_urls))
        for url in target_urls:
            self.url = url
            try:
                r = requests.get(self.url)
            except Exception as e:
                print(f"\n Exception :\n{e}\n")
            else:
                # `is` checks identity, not value: use `==`/`!=` for comparisons.
                if (r.status_code == 200) and (len(r.text) != 0):
                    self.html = r.text
                    self.update_doc({'url': self.url}, True)
                else:
                    print(f"\n{'#'*60}\n Naver seems to have flagged the request as abnormal.")
                    dbg.obj(r, f"{self.__class__} | {inspect.stack()[0][3]}")
            loop.report(addi_info=f" url : {self.url}")

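# Optional hardening sketch (assumption, not part of the original collector):
# requests.get() also accepts `timeout` and `headers`, so the call above could
# avoid hanging indefinitely and present a browser-like User-Agent, e.g.
#
#   r = requests.get(self.url, timeout=10,
#                    headers={'User-Agent': 'Mozilla/5.0'})
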
def parse(self):
    self.get_targets()
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        soup = BeautifulSoup(self.html, 'html.parser')
        s = soup.find('div', class_='responsive_col1')
        if s is None:
            errmsg = ("soup.find('div', class_='responsive_col1') is None.\n"
                      " Shell page: the article page's core content tag "
                      "(responsive_col1) is missing.")
            print(f"\n{'#'*60}\n{self.__class__} | {inspect.stack()[0][3]}"
                  f"\n {errmsg}")
        else:
            self.parse_uppertier(s)
            self.parse_middletier(s)
            self.parse_lowertier(s)
            if len(list(self.schematize().doc)) > 3:
                self.update_doc({'_id': self._id})
        loop.report(addi_info=self.url)

def divide_NewsPageSnapshot():
    tbl = mongo.db['NewsPageSnapshot']
    # Load the NewsPage list so the work can proceed url by url.
    page = models.NewsPage()
    page.load()
    loop = dbg.Loop('divide_NewsPageSnapshot', len(page.docs))
    for d in page.docs:
        # Load the snapshot data belonging to this pageid.
        page.attributize(d)
        cursor = tbl.find({'pageid': page._id},
                          {'_id': 0, 'collect_dt': 1, 'html': 1})
        # Save the loaded data into the data model that is split into one
        # table per news page.
        ss = models.NewsPageSnapshot(page.pressname, page.name)
        ss.docs = list(cursor)
        for sd in ss.docs:  # renamed from `d` to avoid shadowing the outer loop variable.
            ss.attributize(sd)
            ss.update_doc({'collect_dt': ss.collect_dt}, True)
        loop.report(addi_info=page.url)

def migrate_screen_tbl():
    # Load the list of URLs so the migration can proceed url by url.
    tbl = mongo1.db['screen']
    urls = tbl.distinct(key='url')
    loop = dbg.Loop('migrate_screen_tbl', len(urls))
    for url in urls:
        # Look up the pageid that corresponds to this url.
        page = models.NewsPage().load({'url': url})
        if len(page.docs) == 1:  # `is 1` compares identity, not value; use `==`.
            page.attributize(page.docs[0])
            # Load the documents to migrate from the screen table.
            cursor = tbl.find({'url': url}, {'_id': 0})
            scrdf = pd.DataFrame(list(cursor))
            cols_map = {'r_txt': 'html', '수집일시': 'collect_dt'}
            scrdf = scrdf.rename(columns=cols_map).reindex(
                columns=['html', 'collect_dt'])
            # Migrate into the Snapshot table.
            ss = models.NewsPageSnapshot(page.pressname, page.name)
            ss.docs = scrdf.to_dict('records')
            for d in ss.docs:
                ss.attributize(d)
                ss.update_doc({'collect_dt': ss.collect_dt}, True)
        loop.report(addi_info=url)

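# Column-mapping sketch (illustration only; the values are made up): rename() plus
# reindex() map a raw `screen` record onto the snapshot schema and drop every
# other column, e.g.
#
#   pd.DataFrame([{'r_txt': '<html>', '수집일시': '2020-01-01', 'url': 'u'}]) \
#       .rename(columns={'r_txt': 'html', '수집일시': 'collect_dt'}) \
#       .reindex(columns=['html', 'collect_dt'])
#   # -> one row with columns ['html', 'collect_dt']; 'url' is discarded.
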
def collect_1page(sleepsecs=10):
    """Collect/parse the 25 job postings listed on one search-results URL.

    Iterate over the list and collect/analyze each job posting's details.
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    uo = urlparse(driver.current_url)
    if ('keywords' in uo.query) and ('location' in uo.query):
        # jobcards = driver.find_elements_by_class_name('occludable-update')
        jobcards = driver.find_elements_by_class_name('artdeco-list__item')
        collect_dt = datetime.today().astimezone()
        loop = dbg.Loop(f"{inspect.stack()[0][3]} | jobcard-index progress",
                        len(jobcards))
        for i, jobcard in enumerate(jobcards):
            time.sleep(sleepsecs)
            ############################################################
            jobcard_job_details(jobcard)
            ############################################################
            loop.report()
        fr.report_fin()
    else:
        print(f"\n keywords and location are missing from the URL.\n"
              f" driver.current_url : {driver.current_url}")
        time.sleep(sleepsecs)

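# Minimal usage sketch (assumptions: `driver` is the module-level Selenium WebDriver
# this function reads, and the search URL is a hypothetical placeholder):
#
#   from selenium import webdriver
#   driver = webdriver.Chrome()
#   driver.get('https://www.linkedin.com/jobs/search/?keywords=python&location=Seoul')
#   collect_1page(sleepsecs=10)
#
# Note: newer Selenium (4.x) replaces find_elements_by_class_name() with
# driver.find_elements(By.CLASS_NAME, 'artdeco-list__item').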