Example #1
 def onegoogolePR(self, url):
     '''Return the Google PageRank for a single URL'''
     prUrl = 'http://pr.chinaz.com'  # Google PR lookup service
     driver = PhantomJS()
     driver.get(prUrl)
     driver.find_element_by_id('PRAddress').send_keys(url)
     driver.find_element_by_class_name('search-write-btn').click()
     try:
         imgsrc = driver.find_element_by_css_selector('span#pr>img').get_attribute('src')
         pr = search(r'\d', imgsrc).group()
     except Exception:  # element missing or no digit in the badge URL
         pr = 'No data'
     driver.quit()
     return pr
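
The PR value on pr.chinaz.com is rendered as a badge image, so the snippet above pulls the first digit out of the badge's "src" attribute. A standalone sketch of that extraction (the image URL below is a made-up illustration of the shape, not captured from the site):

from re import search

imgsrc = 'http://pr.chinaz.com/images/pr6.gif'  # hypothetical badge URL, for illustration only
match = search(r'\d', imgsrc)
print(match.group() if match else 'No data')
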
Example #2
class plugin:
    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        self.__init__()

    def frame_search(self, path):
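        """Recursively map the page's <frame> hierarchy: each entry records the
        list of parent frame names ('framepath') and a nested dict of children."""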
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])

            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
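        # Log in to Ticket Monster, open the mobile attendance event page,
        # click the check-in button and keep the result text in self.tmon_ret.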
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )

        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
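        # Log in to ondisk.co.kr, open the event page, stub out window.alert,
        # click the attendance button and store the captured alert text.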
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to.frame(1)
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()

            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except Exception:
            alert_text = None  # keep the attribute defined even if the flow above fails
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
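        # Log in to OK Cashbag through the Facebook OAuth flow with plain
        # requests, then post to the attendance endpoint and report the result.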
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url":
            "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim":
            "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)

        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + " logged in")
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + " points")
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + " points earned")
        elif len(postdata.text.split("success")) > 1:
            print("Attendance check complete")
            self.ok_ret = "Attendance check complete"
        else:
            print('Attendance already checked today')
            self.ok_ret = "Attendance already checked today"
Example #3
class RouteStatistic(object):
    def __init__(self,
                 url,
                 phantomjs=None,
                 resolution=None,
                 ya_class=None,
                 screen_path=None,
                 screen_pattern=None,
                 csv_path=None):
        self.url = url

        self.phantomjs = phantomjs or DEFAULT_PHANTOMJS
        assert os.path.isfile(self.phantomjs), "phantomjs not found"

        resolution = resolution or FULLHD
        assert isinstance(resolution, (list, tuple))
        assert len(resolution) == 2

        self.ya_class = ya_class or DEFAULT_YA_CLASS
        self.screen_path = screen_path or PATH

        self.screen_pattern = screen_pattern or '%s.png'
        assert '%s' in self.screen_pattern

        self.csv_path = csv_path or os_join(PATH, 'statistic.csv')

        self.driver = PhantomJS(self.phantomjs)
        self.driver.set_window_size(*resolution)

    def track(self):
        self.driver.get(self.url)
        WebDriverWait(self.driver, 5).until(is_class_exist(self.ya_class))
        time = self.driver.find_element_by_class_name(self.ya_class).text
        now = datetime.now()
        self._save_screenshot(now)
        self._update_file(now, *[t.strip() for t in time.split(',')])

    def _save_screenshot(self, now):
        if '%s' in self.screen_pattern:
            file_name = self.screen_pattern % (now, )
        else:
            file_name = self.screen_pattern
        file_name = os_join(self.screen_path, file_name)
        self.driver.save_screenshot(file_name)

    def _update_file(self, now, time, distance):
        with open(self.csv_path, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=str('\t'))
            writer.writerow([
                now,
                time,
                distance,
            ])

    def __call__(self):
        return self.track()

    def __del__(self):
        if hasattr(self, 'driver') and self.driver:
            self.driver.service.process.send_signal(signal.SIGTERM)
            self.driver.quit()
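
A hypothetical usage sketch for the class above (the route URL and the screenshot pattern are placeholders, not from the original project; DEFAULT_PHANTOMJS, FULLHD, DEFAULT_YA_CLASS and PATH are constants from the example's own module):

stats = RouteStatistic('https://yandex.ru/maps/',  # placeholder route URL
                       screen_pattern='route_%s.png')
stats()  # __call__ runs track(): wait for the time element, save a screenshot, append a CSV row
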
Example #4
driver = PhantomJS(
    './phantomjs')  # in case of PhantomJS not available, we can use Firefox
for line in tqdm(inputfile,
                 total=numLines,
                 desc='Crawling Instagram',
                 leave=True):
    try:
        idtweet, url = line.replace('\n', '').split(',')
        if idtweet in setUrlDefined:
            continue
    except IndexError:
        print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET
        continue
    try:
        driver.get(url)
        placetag = driver.find_element_by_class_name('_kul9p')
        placeurl = placetag.get_attribute('href').encode('utf-8')
        placename = placetag.get_attribute('title').encode('utf-8')

        usernametag = driver.find_element_by_class_name('_4zhc5')
        username = usernametag.get_attribute('title').encode('utf-8')

    except selenium.common.exceptions.NoSuchElementException:
        try:
            error = driver.find_element_by_class_name('error-container')
            print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET
            outputfile.write(idtweet + ',' + url + ',404\n')
            continue
        except selenium.common.exceptions.NoSuchElementException:
            print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET
            outputfile.write(idtweet + ',' + url + ',NoCoords\n')
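
The comment after the PhantomJS() call suggests falling back to Firefox when the PhantomJS binary cannot be started. A minimal sketch of that fallback (assuming geckodriver and Firefox are installed, and that the installed selenium release still ships both driver classes):

from selenium.webdriver import Firefox, PhantomJS

try:
    driver = PhantomJS('./phantomjs')  # headless PhantomJS binary next to the script
except Exception:                      # binary missing or driver not usable
    driver = Firefox()                 # fall back to Firefox, as the comment suggests
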
Example #5
outputfile = open(cityName + '-instagram-output.csv', 'a', 0)

print colorama.Back.RED+colorama.Fore.YELLOW+str(len(setUrlDefined))+' URLs already defined! Lets Rock more now...'+colorama.Back.RESET+colorama.Fore.RESET

driver = PhantomJS('./phantomjs') # in case of PhantomJS not available, we can use Firefox
for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True):
	try:
		idtweet, url = line.replace('\n', '').split(',')
		if idtweet in setUrlDefined:
			continue
	except IndexError:
		print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET
		continue
	try:
		driver.get(url)
		placetag = driver.find_element_by_class_name('_kul9p')
		placeurl = placetag.get_attribute('href').encode('utf-8')
		placename = placetag.get_attribute('title').encode('utf-8')

		usernametag = driver.find_element_by_class_name('_4zhc5')
		username = usernametag.get_attribute('title').encode('utf-8')
		
	except selenium.common.exceptions.NoSuchElementException:
		try:
			error = driver.find_element_by_class_name('error-container')
			print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET
			outputfile.write(idtweet + ',' + url + ',404\n')
			continue
		except selenium.common.exceptions.NoSuchElementException:
			print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET
			outputfile.write(idtweet + ',' + url + ',NoCoords\n')
Example #6
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
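        # Pair each result link with an md5 uid built from its timestamp, title
        # and the query word; only links with unseen uids are returned.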
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()

        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNext page element did not appear, breaking'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()

        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))

        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Example #7
class WeixinPhantomjs(Base):
    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()

        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]

        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait until the element with this id is present, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)

        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNext page element did not appear, breaking'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)

            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Example #8
from selenium.webdriver import Firefox, PhantomJS


driver = PhantomJS()

url = ('https://www.google.com/finance?start=0&num=5000&q=%5B(exchange%20%3D'
       '%3D%20"{}")%20%26%20(last_price%20>%200.1)%20%26%20(last_price%20<'
       '%201500)%5D&restype=company&noIL=1')

driver.get(url.format('NYSE'))
nyse = (elem.text for elem in driver.find_elements_by_class_name('symbol'))
driver.get('https://www.google.com/finance?q=NYSE%3A{}'.format(list(nyse)[0]))
print driver.find_element_by_class_name('pr').text


# driver.get(url.format('NASDAQ'))
# nasdaq = (elem.text for elem in driver.find_elements_by_class_name('symbol'))
# print '\n'.join(list(nasdaq))