def kai_scan():
    """Scrape the AI industry association (지능정보산업협회) notice board and store new posts."""
    name = '지능정보산업협회'
    try:
        # Fetch the list of open-application notices.
        response = requests.get('http://www.k-ai.or.kr/kr/information/notice.php')
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        # Pull the listing apart via CSS selectors.
        pinned = soup.select('span.notice-icon')   # markers on pinned notices
        titles = soup.select('td > a')
        dates = soup.select('tr > td:nth-child(3)')

        # Load the previous checkpoint, then store the newest non-pinned title
        # as the new checkpoint (pinned notices occupy the first len(pinned) rows).
        check_point = mongo.check_point_read(name)['title']
        mongo.check_point_save(name, titles[len(pinned)].text.strip())

        for idx, anchor in enumerate(titles):
            title = anchor.text.strip()
            if check_point == title:
                # Everything from here on was already seen in a previous run.
                break
            link = 'http://www.k-ai.or.kr' + anchor.get('href')
            date = dates[idx].text
            pieces = date.split("~")
            try:
                edate = pieces[1]
                sdate = pieces[0]
            # Date text without a "start ~ end" range.
            except Exception:
                sdate = date
                edate = ''
            # Pinned notices may already be in the DB; skip duplicates.
            if mongo.is_saved(title) is None:
                mongo.post_save(name, title, link, sdate, edate)
                print('이름: ' + name + '\n제목:' + title + '\n링크: ' + link +
                      '\n날짜: ' + date + '\n')
    except Exception:
        message.site_error_push(name)
def btp_scan():
    """Scrape the Busan Technopark (부산테크노파크) business and notice boards.

    Walks pages 1-2 of each of the two boards; posts already present in the
    DB (``mongo.is_saved``) are skipped.  Any failure is reported through
    ``message.site_error_push``.
    """
    try:
        names = ['부산테크노파크_사업공고', '부산테크노파크_공지사항']
        urls = [
            'http://www.btp.or.kr/index.php?action=BD0000M&pagecode=P000000010&language=KR',
            'http://www.btp.or.kr/index.php?action=BD0000M&pagecode=P000000013&language=KR'
        ]
        for j in range(2):
            for pageindex in range(1, 3):  # pages 1-2
                req = requests.get(urls[j] + "&pageIndex=" + str(pageindex))
                req.encoding = 'utf-8'
                soup = BeautifulSoup(req.text, 'html.parser')

                titles = soup.select(' tr > td.ui-pleft20 > a')
                # The date column differs between the two boards.
                if j == 0:
                    dates = soup.select('tr > td:nth-child(3)')
                else:
                    dates = soup.select('tr > td:nth-child(4)')

                for i in range(len(titles)):
                    title = titles[i].text.strip()
                    # Pinned notices may already be stored; skip duplicates.
                    if mongo.is_saved(title) is None:
                        # href only carries the numeric post id; rebuild a view link.
                        # FIX: raw string for the regex — "\d" in a plain string is an
                        # invalid escape sequence (SyntaxWarning on modern Python).
                        param = re.findall(r"\d+", titles[i].get('href'))
                        link = urls[j] + '&command=View&idx=' + param[0]
                        # Strip a trailing "(...)" qualifier before parsing the range.
                        date = dates[i].text.split("(").pop(0).strip()
                        try:
                            edate = date.split("~").pop(1)
                            sdate = date.split("~").pop(0)
                        # No "start ~ end" range: treat the whole text as the end date.
                        except Exception:
                            sdate = ''
                            edate = date
                        mongo.post_save(names[j], title, link, sdate, edate)
                        print('이름: ' + names[j] + '\n제목:' + title + '\n링크: ' +
                              link + '\n날짜: ' + sdate + edate + '\n')
    except Exception:
        # FIX: typo in the site name ("부산테크노파트" -> "부산테크노파크") so the
        # error alert matches the board names used above.
        message.site_error_push(name="부산테크노파크")
def kotra_scan():
    """Scrape the KOTRA business board and store posts not yet in the DB."""
    name = 'Kotra'
    try:
        base = 'http://www.kotra.or.kr'
        # Fetch the list of open-application notices.
        response = requests.get(
            'http://www.kotra.or.kr/kh/business/busiList.do?&MENU_CD=T0503&TOP_MENU_CD=T0500&LEFT_MENU_CD=T0503&PARENT_MENU_CD=&CO_TYPE=undefined&boardType=0'
        )
        soup = BeautifulSoup(response.text, 'html.parser')

        # Pull the listing apart via CSS selectors.
        titles = soup.select('td > a')
        dates = soup.select('tr > td:nth-child(3)')

        # Remember the newest post as the checkpoint for the next run.
        mongo.check_point_save(name, titles[0].text.strip())

        for idx, anchor in enumerate(titles):
            title = anchor.text
            # href is a JS call; the real path is its first quoted argument.
            link = base + anchor.get('href').split("'").pop(1)
            date = dates[idx].text
            try:
                sdate = date.split(" ~ ").pop(0)
                edate = date.split(" ~ ").pop(1)
            # No range separator: rolling (open-ended) recruitment.
            except Exception:
                sdate = date
                edate = ''
            if mongo.is_saved(title) is None:
                mongo.post_save(name, title, link, sdate, edate)
                print('이름: ' + name + '\n제목: ' + title + '\n링크: ' + link +
                      '\n신청기간: ' + date + '\n')
    except Exception:
        message.site_error_push(name)
def nipa_scan():
    """Scrape both NIPA (정보통신산업진흥원) boards and store new posts."""
    names = ['정보통신산업진흥원1', '정보통신산업진흥원2']
    try:
        base = 'http://www.nipa.kr'
        uris = ['', '/biz/']
        urls = [
            'http://www.nipa.kr/board/boardList.it?boardNo=103&menuNo=32&page=1',
            'http://www.nipa.kr/biz/bizNotice.it?menuNo=18&page=1'
        ]
        for board_name, uri, list_url in zip(names, uris, urls):
            response = requests.get(list_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Pull the listing apart via CSS selectors.
            titles = soup.select('td > a')
            dates = soup.select('tr > td.date')

            # NOTE(review): check_point is read but never consulted below; it is
            # kept so a missing checkpoint record still fails the same way as
            # before — confirm before removing.
            check_point = mongo.check_point_read(board_name)['title']
            mongo.check_point_save(board_name, titles[0].text)

            for idx, anchor in enumerate(titles):
                title = anchor.text
                # onclick is a JS call; the target path is its first quoted argument.
                link = base + uri + anchor.get('onclick').split("'").pop(1)
                date = dates[idx].text
                if mongo.is_saved(title) is None:
                    mongo.post_save(board_name, title, link, date, '')
                    print('이름: ' + board_name + '\n제목: ' + title + '\n링크: ' +
                          link + '\n등록일: ' + date + '\n')
    except Exception:
        message.site_error_push(names[0])
def kstartup_scan(driver):
    """Scrape the K-Startup announcement board (pinned + regular notices) via Selenium.

    ``driver`` is a Selenium WebDriver already configured for the site.  New
    posts are stored through ``mongo.post_save``; a failure in either section
    is reported via ``message.site_error_push`` without aborting the other.
    """
    name = 'kstartup'
    # List page (filtered to open applications) and detail-page URL prefix.
    url = 'http://www.k-startup.go.kr/common/announcement/announcementList.do?mid=30004&bid=701&searchAppAt=A'
    board_url = 'http://www.k-startup.go.kr/common/announcement/announcementDetail.do?mid=30004&bid=701&searchPrefixCode=BOARD_701_001&searchPostSn='

    driver.get(url)
    driver.implicitly_wait(3)
    driver.find_element_by_tag_name('body').send_keys(Keys.END)

    # NOTE(review): read but never used below; kept so a missing checkpoint
    # record still fails the same way as before — confirm before removing.
    # (The unused `point_flag` local that sat next to it has been dropped.)
    check_point = mongo.check_point_read(name)['title']

    # Click "more" five times so five extra pages of notices are loaded.
    for _ in range(5):
        driver.find_element_by_xpath('//*[@id="listPlusAdd"]/a').click()
        driver.find_element_by_tag_name('body').send_keys(Keys.END)
        time.sleep(1)

    try:
        # Pinned (important) notices: ~10 entries, not in chronological order,
        # so every entry is checked against the DB rather than stopping early.
        impo_board = driver.find_element_by_class_name('ann_list_impor')
        for item in impo_board.find_elements_by_xpath('./li'):
            title = item.find_element_by_tag_name('a').text.strip()
            if mongo.is_saved(title) is None:
                _kstartup_save_post(name, board_url, item, title)
    except Exception:
        message.site_error_push(name + " < 중요공지부분 > ")

    try:
        # Regular notices: chronological, so stop at the first known/empty title.
        ann_board = driver.find_element_by_class_name('ann_list')
        for item in ann_board.find_elements_by_xpath('./li'):
            title = item.find_element_by_tag_name('a').text.strip()
            if mongo.is_saved(title) is None and title != "":
                _kstartup_save_post(name, board_url, item, title)
            else:
                break
    except Exception:
        message.site_error_push(name + " < 일반 공지 부분 > ")


def _kstartup_save_post(name, board_url, item, title):
    """Extract due date and link from one notice <li> and persist it (shared by both loops)."""
    # Due date may be absent — treated as rolling recruitment ("상시모집").
    try:
        due_date = re.findall(
            r"\d{4}-\d{2}-\d{2}",  # raw string: avoids invalid-escape warning
            item.find_element_by_xpath('./ul/li[3]').text.strip())
        date = due_date[0]
    except Exception:
        date = "상시모집"

    # The anchor's href is a JS call; its numeric arguments decide the target:
    # exactly two numbers -> bi.go.kr post (seq + regist date), more -> k-startup post.
    params = re.findall(r"\d+",
                        item.find_element_by_tag_name('a').get_attribute('href'))
    if len(params) == 2:
        # BUG FIX: the "&registDate=" query parameter had been mojibake'd to
        # "(R)istDate=" (HTML entity "&reg" decoded to the registered-sign
        # character), producing broken bi.go.kr links.
        link = ("http://www.bi.go.kr/board/editView.do?boardVO.viewFlag=view"
                "&boardID=NOTICE&postSeq=" + params[0] +
                "&registDate=" + params[1])
    elif len(params) > 2:
        link = board_url + params[2]
    else:
        link = "링크오류"

    mongo.post_save(name, title, link, '', date)
    print('이름: ' + name + '\n제목:' + title + '\n링크: ' + link +
          '\n마감일: ' + date + '\n')