Example #1
def f():
    tbl = mongo1.db['뉴스_ETRI언어분석_원본']
    etri = models.ETRIAI('WiseNLU', 'srl')
    docids = etri.distinct(key='docid',
                           filter={
                               'colname': 'bodytext',
                               'results': None
                           })
    loop = dbg.Loop('뉴스_ETRI언어분석_원본', len(docids))
    for docid in docids:
        cursor = tbl.find({'뉴스id': docid}, {
            '_id': 0,
            '뉴스id': 1,
            '뉴스본문srl_res': 1
        })
        docs = list(cursor)
        if len(docs) == 1:
            d = docs[0]
            etri.modelnm = 'Article__네이버_모바일홈'
            etri.docid = d['뉴스id']
            etri.colname = 'bodytext'
            etri.results = d['뉴스본문srl_res']
            etri.update_doc(
                {
                    'modelnm': etri.modelnm,
                    'docid': etri.docid,
                    'colname': etri.colname
                }, True)
        loop.report()
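Every example on this page drives a dbg.Loop progress reporter, but the class itself is never defined here. A minimal sketch of what it plausibly looks like, inferred only from the call sites (a label-and-total constructor plus a report() method with an optional addi_info string; the internals below are assumptions, not the original implementation):

import sys
import time


class Loop:
    """Progress reporter sketch matching the dbg.Loop call sites above."""

    def __init__(self, label, total):
        self.label = label
        self.total = total
        self.count = 0
        self.start = time.time()

    def report(self, addi_info=''):
        # One call per iteration: print counter, percentage, elapsed time.
        self.count += 1
        pct = self.count / self.total * 100 if self.total else 100.0
        elapsed = time.time() - self.start
        sys.stdout.write(f"\r{self.label} | {self.count}/{self.total} "
                         f"({pct:.1f}%) | {elapsed:.1f}s |{addi_info}")
        sys.stdout.flush()
        if self.count >= self.total:
            sys.stdout.write('\n')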
Example #2
def collect():
    self = Assemblymen()
    party = Party().get()
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}", len(party))
    for d in party.to_dict('records'):
        party.attributize(d)
        self.collect(party.name, party.code).parse()
        loop.report(addi_info=f" partyname : {party.name}")
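Each model object above calls attributize(d) to expose a MongoDB document's fields as attributes (page.pressname, party.name, and so on). Its body is not shown anywhere on this page; a plausible sketch, assuming it simply copies dict entries onto the instance:

def attributize(self, d):
    # Expose every document field as an instance attribute,
    # e.g. d['pressname'] becomes self.pressname. Non-identifier
    # keys such as '뉴스id' remain reachable via getattr().
    for key, value in d.items():
        setattr(self, key, value)
    return self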
Example #3
def parse(self):
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        lay = MobileNewsHomeLayoutParser(self.html, self.pageuri)
        self.layout = lay.parse().docs
        self.update_doc({'_id': self._id}, True)
        loop.report(addi_info=f" snapshot_dt : {self.snapshot_dt}")
    return self
Example #4
def collect():
    page = models.NewsPage()
    page.load()
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        ss = SnapshotCollector(page.pressname, page.name, page.url)
        ss.collect()
        loop.report(addi_info=page.url)
Example #5
def collect(pressname=None, pagename=None):
    fr = dbg.Function(inspect.currentframe()).report_init()
    filter = {}
    if isinstance(pressname, str) and isinstance(pagename, str):
        filter.update({'pressname': pressname, 'name': pagename})
    page = models.NewsPage().load(filter)
    loop = dbg.Loop(
        f"{sys.modules[__name__].__file__} | {inspect.stack()[0][3]}",
        len(page.docs))
    for d in page.docs:
        page.attributize(d)
        c = Collector(page.pressname, page.name)
        c.collect()
        loop.report(
            addi_info=f" pressname : {page.pressname}, pagename : {page.name}")
    fr.report_fin()
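Example #5 additionally brackets the function body with dbg.Function(inspect.currentframe()).report_init() and fr.report_fin(). Only that interface is visible here; a hedged sketch of such a helper, assuming it merely logs entry and exit with the caller's name and elapsed time:

import time


class Function:
    """Entry/exit logger sketch matching the dbg.Function call sites above."""

    def __init__(self, frame):
        # inspect.currentframe() is passed in; the function name lives
        # on the frame's code object.
        self.name = frame.f_code.co_name

    def report_init(self):
        self.start = time.time()
        print(f"{self.name} | started.")
        return self

    def report_fin(self):
        print(f"{self.name} | finished in {time.time() - self.start:.1f}s.")
        return self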
Example #6
def parse(self):
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        soup = BeautifulSoup(d['html'], 'html.parser')
        self.detect_right_panel(soup)
        self.top_card()
        self.job_summary()
        self.job_description()
        self.how_you_match()
        self.competitive_intelligence_about_applicants()
        self.insight_look_at_company()
        self.commute()
        self.about_us()
        self.update_doc({'_id': d['_id']}, False)
        loop.report()
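update_doc(filter, ...) appears in nearly every example, usually with True as the second argument and with False in Example #6, which suggests the flag is pymongo's upsert option. A hedged sketch of such a wrapper, assuming the model holds its current field values in a doc dict and its collection handle in self.tbl (both attribute names are assumptions, not taken from the examples):

def update_doc(self, filter, upsert=False):
    # Set the model's current fields on the matching document;
    # with upsert=True a missing document is inserted instead,
    # mirroring pymongo's update_one(filter, update, upsert=...).
    self.tbl.update_one(filter, {'$set': self.doc}, upsert=upsert)
    return self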
Example #7
def collect(self):
    self.get_targets()
    if hasattr(self, 'article'):
        article = copy.copy(self.article)
        loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                        len(article.docs))
        for d in article.docs:
            article.attributize(d)
            self.docid = article._id
            jsondata = self.etriapi.api(
                text=getattr(article, self.targetcol))
            if isinstance(jsondata, dict):
                self.results = [jsondata]
                self.update_doc({'docid': self.docid}, True)
            loop.report(
                addi_info=f" article {self.targetcol} excerpt : "
                f"{getattr(article, self.targetcol)[:30]}")
Example #8
def collect(self):
    target_urls = self.get_targets()
    if target_urls is not None:
        loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                        len(target_urls))
        for url in target_urls:
            self.url = url
            try:
                r = requests.get(self.url)
            except Exception as e:
                print(f"\n Exception :\n{e}\n")
            else:
                if (r.status_code == 200) and (len(r.text) != 0):
                    self.html = r.text
                    self.update_doc({'url': self.url}, True)
                else:
                    print(f"\n{'#'*60}\n Naver seems to have detected an anomaly.")
                    dbg.obj(r,
                            f"{self.__class__} | {inspect.stack()[0][3]}")
            loop.report(addi_info=f" url : {self.url}")
Example #9
def parse(self):
    self.get_targets()
    loop = dbg.Loop(f"{self.__class__} | {inspect.stack()[0][3]}",
                    len(self.docs))
    for d in self.docs:
        self.attributize(d)
        soup = BeautifulSoup(self.html, 'html.parser')
        s = soup.find('div', class_='responsive_col1')
        if s is None:
            errmsg = ("soup.find('div', class_='responsive_col1') is None.\n"
                      " An empty shell page missing the article page's "
                      "core content tag (responsive_col1).")
            print(
                f"\n{'#'*60}\n{self.__class__} | {inspect.stack()[0][3]}\n {errmsg}"
            )
        else:
            self.parse_uppertier(s)
            self.parse_middletier(s)
            self.parse_lowertier(s)
            if len(list(self.schematize().doc)) > 3:
                self.update_doc({'_id': self._id})
        loop.report(addi_info=self.url)
Example #10
def divide_NewsPageSnapshot():
    tbl = mongo.db['NewsPageSnapshot']
    # Load the pages list so we can work url by url.
    page = models.NewsPage()
    page.load()
    loop = dbg.Loop('divide_NewsPageSnapshot', len(page.docs))
    for d in page.docs:
        # Load the snapshot data matching this pageid.
        page.attributize(d)
        cursor = tbl.find({'pageid': page._id}, {
            '_id': 0,
            'collect_dt': 1,
            'html': 1
        })
        # Store the loaded data into the data model split per newspage.
        ss = models.NewsPageSnapshot(page.pressname, page.name)
        ss.docs = list(cursor)
        for sd in ss.docs:  # avoid shadowing the outer loop variable d
            ss.attributize(sd)
            ss.update_doc({'collect_dt': ss.collect_dt}, True)
        loop.report(addi_info=page.url)
Example #11
def migrate_screen_tbl():
    """Load the urls list so we can work url by url."""
    tbl = mongo1.db['screen']
    urls = tbl.distinct(key='url')
    loop = dbg.Loop('migrate_screen_tbl', len(urls))
    for url in urls:
        # Look up the pageid matching this url.
        page = models.NewsPage().load({'url': url})
        if len(page.docs) == 1:
            page.attributize(page.docs[0])
            # Load the documents to migrate from the screen tbl.
            cursor = tbl.find({'url': url}, {'_id': 0})
            scrdf = pd.DataFrame(list(cursor))
            cols_map = {'r_txt': 'html', '수집일시': 'collect_dt'}
            scrdf = scrdf.rename(columns=cols_map).reindex(
                columns=['html', 'collect_dt'])
            # Migrate into the Snapshot tbl.
            ss = models.NewsPageSnapshot(page.pressname, page.name)
            ss.docs = scrdf.to_dict('records')
            for d in ss.docs:
                ss.attributize(d)
                ss.update_doc({'collect_dt': ss.collect_dt}, True)
        loop.report(addi_info=url)
Example #12
def collect_1page(sleepsecs=10):
    """Collect/parse 25 job postings per url.
    Iterate over the list, collecting and analyzing each Job Posting's details.
    """
    fr = dbg.Function(inspect.currentframe()).report_init()
    uo = urlparse(driver.current_url)
    if ('keywords' in uo.query) and ('location' in uo.query):
        # jobcards = driver.find_elements_by_class_name('occludable-update')
        jobcards = driver.find_elements_by_class_name('artdeco-list__item')
        collect_dt = datetime.today().astimezone()
        loop = dbg.Loop(f"{inspect.stack()[0][3]} | jobcard-index progress",
                        len(jobcards))
        for jobcard in jobcards:
            time.sleep(sleepsecs)
            ############################################################
            jobcard_job_details(jobcard)
            ############################################################
            loop.report()
        fr.report_fin()
    else:
        print(
            f"\n keywords and location are missing from the url.\n driver.current_url : {driver.current_url}"
        )
        time.sleep(sleepsecs)
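Example #12 also depends on find_elements_by_class_name, which was removed in Selenium 4. Against a current Selenium, the equivalent locator call is:

from selenium.webdriver.common.by import By

# Selenium 4 replacement for driver.find_elements_by_class_name(...)
jobcards = driver.find_elements(By.CLASS_NAME, 'artdeco-list__item')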