class ChromeTestCase(unittest.TestCase):
    def setUp(self):
        self.s = Session(
            'chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless', 'disable-gpu']})

    def test_cookie_transfer_to_requests(self):
        """Tested on http://testing-ground.scraping.pro/login"""
        self.s.driver.get('http://testing-ground.scraping.pro/login')
        self.s.driver.find_element_by_id('usr').send_keys('admin')
        self.s.driver.ensure_element_by_id('pwd').send_keys('12345', Keys.ENTER)
        self.s.driver.ensure_element_by_xpath(
            '//div[@id="case_login"]/h3[@class="success"]')
        self.s.transfer_driver_cookies_to_session()
        response = self.s.get(
            'http://testing-ground.scraping.pro/login?mode=welcome')
        success_message = response.xpath(
            '//div[@id="case_login"]/h3[@class="success"]/text()'
        ).extract_first()
        self.assertEqual(
            success_message, 'WELCOME :)',
            'Failed to transfer cookies from Selenium to Requests')

    def test_cookie_transfer_to_selenium(self):
        self.s.get('http://testing-ground.scraping.pro/login')
        self.s.cookies.set('tdsess', 'TEST_DRIVE_SESSION',
                           domain='testing-ground.scraping.pro')
        self.s.transfer_session_cookies_to_driver()
        self.s.driver.get(
            'http://testing-ground.scraping.pro/login?mode=welcome')
        success_message = self.s.driver.xpath(
            '//div[@id="case_login"]/h3[@class="success"]/text()'
        ).extract_first()
        self.assertEqual(
            success_message, 'WELCOME :)',
            'Failed to transfer cookies from Requests to Selenium')

    def tearDown(self):
        self.s.driver.close()
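
# A minimal sketch of how this test case could be run; assumes unittest,
# requestium's Session, and selenium's Keys are imported at module level.
if __name__ == '__main__':
    unittest.main()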
def main():
    try:
        start = sys.argv[1]
    except IndexError:
        print('ERROR: Requires URL as the first argument.')
        quit(1)

    # Constants
    ALLDROPDOWN = '//*[@id="selectReadType"]/option[2]'
    ACTUALIMAGES = '//*[@id="divImage"]//img'
    IMGGROUPS = '.listing a'
    TITLE = '.bigChar'
    NEXT = '//*[(@id = "btnNext")]//src'

    s = Session(
        webdriver_path='C:\\Webdrivers\\chromedriver',
        browser='chrome'
    )  # ,webdriver_options={'arguments': ['headless', 'disable-gpu']}
    s.driver.get(start)
    s.driver.ensure_element_by_css_selector(TITLE)
    title = s.driver.find_element_by_css_selector(TITLE).text
    groups = s.driver.find_elements_by_css_selector(IMGGROUPS)
    s.transfer_driver_cookies_to_session()
    begin = to_attribute_list(groups, 'href').pop()
    response = s.get(begin).xpath(ACTUALIMAGES)
    print(response)
    s.close()
    quit(2)
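
# main() relies on a to_attribute_list() helper that is not shown here; a
# minimal sketch of what it presumably does (the name and signature are
# inferred from the call site above):
def to_attribute_list(elements, attribute):
    # Collect the given attribute from each WebElement.
    return [element.get_attribute(attribute) for element in elements]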
def main(url):
    session = Session(
        webdriver_path='../Chrome Canary/chromedriver.exe',
        browser='chrome',
        default_timeout=6,
        webdriver_options={'arguments': ['disable-logging', 'headless']})
    session.driver.get(url)
    div_content = WebDriverWait(session.driver, 5).until(
        EC.presence_of_element_located((By.XPATH, "//div[@id='content']")))
    print('######## FROM SELENIUM ########')
    print(div_content.text)

    print('######## COPYING SESSION FROM SELENIUM TO REQUESTS ########')
    session.transfer_driver_cookies_to_session()
    final_response = session.get(url,
                                 headers={'user-agent': 'custom requestium'})
    soup = BeautifulSoup(final_response.text, 'html.parser')
    print('######## FROM REQUESTS ########')
    body_text = soup.find(id="content")
    print(body_text.text)
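
# A minimal entry-point sketch for main(); the fallback URL is a placeholder.
if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else 'https://example.com')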
class RugratsBot:
    def __init__(self, userLogin: str, userPass: str) -> None:
        self._rugratSession = Session("./chromedriver",
                                      browser="chrome",
                                      default_timeout=15)
        self._userLogin = userLogin
        self._userPassword = userPass
        self._isLogged = False
        # default/recommended ranges in seconds between actions
        self._rangeTimeBetComments = 290
        self._rangeTimeBetFollow = 400

    def setLoginInfo(self, userLogin: str, userPass: str) -> None:
        self._userLogin = userLogin
        self._userPassword = userPass

    def setInstagramPageUrl(self, instaPageUrl: str) -> None:
        self._instagramPageUrl = instaPageUrl

    def setListOfComments(self, listOfComments: List) -> None:
        self._listOfComments = listOfComments

    def setTimeBetComments(self, timeBetweenComments: int) -> None:
        self._rangeTimeBetComments = timeBetweenComments

    def isInternetOn(self) -> bool:
        url = "https://duckduckgo.com/"
        timeout = 5
        try:
            _ = self._rugratSession.get(url, timeout=timeout)
            return True
        except ConnectionError:
            print("No connection available")
            return False

    def login(self, saveLoginInformation: bool = True) -> None:
        if self._userLogin == "" or self._userPassword == "":
            return

        # Sign in on Instagram **outset**
        self._rugratSession.driver.get(
            "https://www.instagram.com/accounts/login/?hl=pt-br")
        sleep(5)
        self._rugratSession.driver.ensure_element_by_css_selector(
            "input[name='username']").send_keys(self._userLogin)
        self._rugratSession.driver.ensure_element_by_css_selector(
            "input[name='password']").send_keys(self._userPassword)
        sleep(5)
        self._rugratSession.driver.ensure_element_by_xpath(
            "/html/body/div[1]/section/main/div/article/div/div[1]/div/form/div/div[3]/button/div"
        ).click()
        # Sign in on Instagram **end**

        if saveLoginInformation:
            # Save login information on Chromium driver **outset**
            sleep(5)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/main/div/div/div/section/div/button"
            ).click()
            sleep(5)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[4]/div/div/div/div[3]/button[1]").click()
            # Save login information on Chromium driver **end**
        self._isLogged = True

    def logout(self) -> None:
        if self._isLogged:
            self._rugratSession.driver.get("https://www.instagram.com")
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/nav/div[2]/div/div/div[3]/div/div[5]/span/img"
            ).click()
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[1]/section/nav/div[2]/div/div/div[3]/div/div[5]/div[2]/div/div[2]/div[2]/div/div/div/div/div/div/div"
            ).click()
            sleep(5)

    def followProfiles(self, targetUser: str) -> None:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. "
                "Before starting to follow, run 'yourBabyRugrat.signIn()'")
        self._rugratSession.driver.get("https://www.instagram.com/" +
                                       targetUser)
        self._rugratSession.driver.ensure_element_by_xpath(
            "/html/body/div[1]/section/main/div/header/section/ul/li[3]/a"
        ).click()
        self._rugratSession.transfer_driver_cookies_to_session()
        numberOfFollowers = int(
            self.getNumberOfFollowers(targetUser).replace(",", ""))
        followersContainerScroll = self._rugratSession.driver.ensure_element_by_xpath(
            "//div[@class='isgrP']")
        counter = 0
        while counter < int(numberOfFollowers / 7):
            self._rugratSession.driver.execute_script(
                "arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;",
                followersContainerScroll,
            )
            sleep(2)
            counter += 1
        # self._rugratSession.driver.execute_script("window.scrollIntoView();")
        for userToFollow in range(1, numberOfFollowers):
            sleep(20)
            self._rugratSession.driver.ensure_element_by_xpath(
                "/html/body/div[4]/div/div/div[2]/ul/div/li[" +
                str(userToFollow) + "]/div/div[3]/button").click()

    def getNumberOfFollowers(self, targetUser: str) -> str:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. "
                "Before starting to comment, run 'yourBabyRugrat.signIn()'")
        profileResponse = self._rugratSession.get(
            "https://www.instagram.com/" + targetUser)
        soupResponse = BeautifulSoup(profileResponse.text, "html.parser")
        metaTags = soupResponse.find_all("meta")
        numberOfFollowers = str()
        for tag in metaTags:
            if str(tag).lower().find("followers") != -1:
                numberOfFollowers = tag
        numberOfFollowers = str(numberOfFollowers).split()
        numberOfFollowers = numberOfFollowers[3]
        return numberOfFollowers

    def commentingByScrapingStuff(self, instagramUrlToComment: str,
                                  subjectToComment: str) -> None:
        # not implemented yet
        pass

    def commentingByList(self, instagramUrlToComment: str,
                         listOfComments: List) -> None:
        if not self._isLogged:
            raise Exception(
                "First, you should be logged in. "
                "Before starting to comment, run 'yourBabyRugrat.signIn()'")
        # Load target Instagram page **outset**
        self._rugratSession.driver.get(instagramUrlToComment)
        # Load target Instagram page **end**
        # start commenting
        while True:
            maxTimeToComment = self._rangeTimeBetComments + 100
            try:
                index = randrange(0, len(listOfComments))
                sleepTime = randrange(self._rangeTimeBetComments,
                                      maxTimeToComment)
                commentArea = self._rugratSession.driver.ensure_element_by_xpath(
                    "/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[3]/div[1]/form/textarea"
                )
                commentArea.click()
                commentArea = self._rugratSession.driver.ensure_element_by_xpath(
                    "/html/body/div[1]/section/main/div/div[1]/article/div[3]/section[3]/div[1]/form/textarea"
                )
                if not self.isInternetOn():
                    continue
                commentArea.send_keys(listOfComments[index])
                commentArea.submit()
                sleep(sleepTime)
            except KeyboardInterrupt as interrupted:
                try:
                    print(interrupted)
                    sys.exit(0)
                except SystemExit:
                    os._exit(0)
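
# A minimal usage sketch for RugratsBot; the credentials, post URL, and
# comments are placeholders. Note that commentingByList() loops until
# interrupted with Ctrl-C.
rugrat = RugratsBot("my_login", "my_password")
rugrat.login(saveLoginInformation=True)
rugrat.commentingByList("https://www.instagram.com/p/SOME_POST/",
                        ["Nice shot!", "Great post!"])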
class Downloader():
    def __init__(self, username, password, driver_path=None,
                 download_path=None, headless=True, logger=None):
        if not logger:
            logging.basicConfig(level=logging.DEBUG)
            self.logger = logging.getLogger('odigo_downloader.downloader')
            self.logger.setLevel('DEBUG')
        else:
            self.logger = logger
        self._username = username
        self._password = password
        self.driver_path = driver_path
        self.download_path = download_path
        self.url = 'https://enregistreur.prosodie.com/odigo4isRecorder/EntryPoint?serviceName=LoginHandler'
        self.headless = headless
        self.validated = False
        self.active = False

    def __str__(self):
        return f"\nDOWNLOAD PATH: {self.download_path}\nHEADLESS: {self.headless}\n" \
               f"DRIVER PATH: {self.driver_path}\nUSERNAME: {self._username}\nURL: {self.url}"

    def setup_selenium_browser(self):
        if self.active:
            return "Session/Browser already active. Cannot have two concurrent sessions/browsers"
        options = webdriver.ChromeOptions()
        prefs = {
            'download.default_directory': self.download_path,
            'download.prompt_for_download': False,
            'download.directory_upgrade': True,
            'safebrowsing.enabled': False,
            'safebrowsing.disable_download_protection': True
        }
        options.add_experimental_option('prefs', prefs)
        if self.headless:
            options.add_argument('--headless')
        self.browser = webdriver.Chrome(self.driver_path, options=options)
        if self.headless:
            # headless Chrome needs an explicit DevTools command to allow downloads
            self.browser.command_executor._commands["send_command"] = (
                "POST", '/session/$sessionId/chromium/send_command')
            params = {
                'cmd': 'Page.setDownloadBehavior',
                'params': {
                    'behavior': 'allow',
                    'downloadPath': self.download_path
                }
            }
            command_result = self.browser.execute("send_command", params)
            for key in command_result:
                self.logger.debug("result:" + key + ":" +
                                  str(command_result[key]))
        self.active = True

    def setup_requestium_session(self):
        if self.active:
            return "Session/Browser already active. Cannot have two concurrent sessions/browsers"
        if self.headless:
            webdriver_options = {'arguments': ['headless']}
        else:
            webdriver_options = {}
        self.logger.debug(
            f"Creating Session object with values: {webdriver_options}")
        self.session = Session(webdriver_path=self.driver_path,
                               browser='chrome',
                               default_timeout=15,
                               webdriver_options=webdriver_options)
        self.active = True

    def login_requestium(self):
        if self.active:
            raise CustomException("Cannot have two active sessions/browsers")
        self.setup_requestium_session()
        self.logger.debug(f"Going to URL: {self.url}")
        self.session.driver.get(self.url)
        self.logger.debug("Entering credentials")
        self.session.driver.ensure_element_by_name('mail').send_keys(
            self._username)
        self.session.driver.ensure_element_by_name('password').send_keys(
            self._password)
        self.session.driver.ensure_element_by_name('valider').click()
        self.validated = True

    def login_selenium(self):
        if self.active:
            raise CustomException("Cannot have two active sessions/browsers")
        self.setup_selenium_browser()
        self.browser.get(self.url)
        self.browser.find_element_by_name('mail').send_keys(self._username)
        self.browser.find_element_by_name('password').send_keys(self._password)
        self.browser.find_element_by_name('valider').click()
        return

    def download_mp3(self, path=None, ref=None, xpath=None):
        self.logger.info(
            f"\ndownload_mp3 called with:\nPATH: {path},\nREF: {ref},\nXPATH: {xpath}")
        if ref is not None and xpath is None:
            self.session.driver.ensure_element_by_class_name(
                'x-action-col-icon').click()
        elif xpath is not None and ref is None:
            self.session.driver.ensure_element_by_xpath(xpath).click()
        else:
            self.logger.error("Provide exactly one of ref or xpath, not both")
            return
        self.session.driver.switch_to.frame('result_frame')
        time.sleep(1)
        # Get URL of mp3 file
        src = self.session.driver.ensure_element_by_id(
            'messagePlayer').get_attribute('src')
        # Selenium --> Requests
        self.session.transfer_driver_cookies_to_session()
        # Download
        r = self.session.get(src, stream=True)
        if path is None:
            if ref is None:
                # Get ref number
                soup = BeautifulSoup(self.session.driver.page_source, 'lxml')
                ref = soup.findAll('div', class_='x-grid-cell-inner')[1].text
            path = '%s.mp3' % ref
        if r.status_code == 200:
            with open(path, 'wb') as f:
                for chunk in r.iter_content(1024 * 2014):
                    f.write(chunk)
        else:
            return 1
        # Requests --> Selenium
        self.session.transfer_session_cookies_to_driver()
        self.session.driver.switch_to.default_content()
        return

    def download_mp3_by_ref(self, ref, path=None):
        self.login_requestium()
        self.search_by_ref(ref)
        result = self.download_mp3(path, ref)
        if result == 1:
            return 1
        self.session.driver.close()

    def download_mp3_by_csv(self, csv_path, download_dir=None):
        if download_dir is None:
            download_dir = self.download_path
        self.login_requestium()
        refs = pd.read_csv(csv_path, sep=';').Name
        length = len(refs)
        for i, ref in enumerate(refs):
            sys.stdout.write('\r')
            sys.stdout.write('downloading: %s/%s' % (i + 1, length))
            sys.stdout.flush()
            self.search_by_ref(ref)
            mp3_path = None
            if download_dir is not None:
                file_name = '%s.mp3' % ref
                mp3_path = os.path.join(download_dir, file_name)
            result = self.download_mp3(path=mp3_path, ref=ref)
            if result == 1:
                return 1
        sys.stdout.write('\n')
        sys.stdout.flush()
        self.session.driver.close()
        return "Finished"

    def search_by_ref(self, ref):
        self.session.driver.get(self.url)
        self.session.driver.ensure_element_by_name('refEr').send_keys(ref)
        self.session.driver.ensure_element_by_id('button-1009').click()

    def change_date_format(self, date):
        try:
            correct_string = date.strptime(str(date.date()),
                                           '%Y-%m-%d').strftime('%d-%m-%Y')
            return correct_string
        except Exception as e:
            raise e

    def change_time_format(self, date):
        try:
            correct_string = date.strptime(
                str(date.hour) + ':' + str(date.minute),
                "%H:%M").strftime("%I:%M %p")
            if correct_string[0] == "0":
                return correct_string[1:]
            else:
                return correct_string
        except Exception as e:
            raise e

    def ceil_dt(self, dt, delta):
        """Round up to the nearest half hour"""
        return dt + (datetime.datetime.min - dt) % delta

    def set_range(self, now):
        """
        Takes the current datetime and finds the nearest previous half hour.
        Returns the appropriate start and end times and dates.
        """
        # Date format: '10-19-2018'
        # Time format: '12:00 AM'
        hour_ago = now - datetime.timedelta(minutes=60)
        rounded = self.ceil_dt(hour_ago, datetime.timedelta(minutes=30))
        start_date = self.change_date_format(rounded)
        start_time = self.change_time_format(rounded)
        thirty_mins = datetime.timedelta(minutes=30)
        end_date = start_date
        end_time = self.change_time_format(rounded + thirty_mins)
        return (start_date, start_time, end_date, end_time)

    def search_by_range(self, start_date, start_time, end_date, end_time):
        """
        Doesn't work correctly: the date filter seems to work, but the time
        filter does not.

        Search records on www.prosodie.com by date range.

        Input:
        start_date -- start date (optional | type: str). Format:
                      'mm-dd-yyyy'. Example: '03-05-1991';
        start_time -- start time (optional | type: str). Example: '12:00 AM';
        end_date   -- end date (optional | type: str). Format: 'mm-dd-yyyy'.
                      Example: '03-05-1991';
        end_time   -- end time (optional | type: str). Example: '12:00 PM'.
        """
        if start_date:
            self.browser.find_element_by_name('dateDebut').send_keys(start_date)
        if start_time:
            self.browser.find_element_by_name('heureDebut').send_keys(start_time)
        if end_date:
            self.browser.find_element_by_name('dateFin').send_keys(end_date)
        if end_time:
            self.browser.find_element_by_name('heureFin').send_keys(end_time)
        self.browser.find_element_by_id('button-1009').click()
        return

    def download_all_half_hour(self):
        self.logger.debug("Downloading calls from the last half hour")
        self.logger.debug("Login check...")
        if not self.validated:
            self.logger.debug("Not logged in. Validating")
            self.login_selenium()
        self.logger.debug("Logged in.")
        self.logger.debug("Getting search range")
        search_range = self.set_range(datetime.datetime.now())
        sleep(2)
        self.logger.debug("Applying filters")
        self.browser.find_element_by_id("criteres-inputEl").send_keys('_EN')
        self.search_by_range(*search_range)
        sleep(5)
        self.logger.debug(f"Downloading results to {self.download_path}")
        csvB = self.browser.find_element_by_id("csvButton")
        csvB.click()
        self.browser.find_element_by_id("button-1006").click()
        self.browser.switch_to.window(self.browser.window_handles[1])
        sleep(5)
        self.logger.debug("Ending session")
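
# A minimal usage sketch for Downloader; the credentials and paths are
# placeholders, not values from the original project.
downloader = Downloader('user@example.com', 'secret',
                        driver_path='/usr/local/bin/chromedriver',
                        download_path='/tmp/recordings')
downloader.download_mp3_by_ref('REF12345')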
print('Waiting for elements to load...')
s.driver.ensure_element_by_class_name(
    "desktop-onboarding-sign-up__form-toggler", state='visible').click()
if reddit_user_name:
    s.driver.ensure_element_by_id('user_login').send_keys(reddit_user_name)
    s.driver.ensure_element_by_id('passwd_login').send_keys(Keys.BACKSPACE)
print('Please log-in in the chrome browser')

s.driver.ensure_element_by_class_name("desktop-onboarding__title",
                                      timeout=60, state='invisible')
print('Thanks!')

if not reddit_user_name:
    reddit_user_name = s.driver.xpath(
        "//span[@class='user']//text()").extract_first()

if reddit_user_name:
    s.transfer_driver_cookies_to_session()
    response = s.get(
        "https://www.reddit.com/user/{}/".format(reddit_user_name))
    cmnt_karma = response.xpath(
        "//span[@class='karma comment-karma']//text()").extract_first()
    reddit_golds_given = response.re_first(r"(\d+) gildings given out")
    print("Comment karma: {}".format(cmnt_karma))
    print("Reddit golds given: {}".format(reddit_golds_given))
else:
    print("Couldn't get user name")
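
# The script above assumes a requestium Session `s` and an optional
# reddit_user_name were created earlier; a minimal sketch of that setup
# (the driver path and empty user name are assumptions):
from requestium import Session, Keys

reddit_user_name = ''  # leave empty to pick it up from the browser session
s = Session('./chromedriver', browser='chrome', default_timeout=15)
s.driver.get('https://www.reddit.com')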
class HTMLParser(object):
    """
    Assign parsing tasks to it. It will maintain a queue and parse websites
    in multiple threads, switching proxies at random.

    TODO: make headless optional?
    """
    def __init__(
            self,
            mode: str = 'requestium',
            use_cache: bool = True,
            max_cache_size: int = 10000,
            timeout: int = 15,
            browser: str = 'chrome',
            loading_time: int = 3,  # delay to wait for the webpage to load
            webdriver_path: str = os.path.join(curr_dir, 'chromedriver')):
        assert mode in ['requests', 'selenium', 'requestium']
        assert browser in ['chrome']
        self.mode = mode
        self.loading_time = loading_time
        self.timeout = timeout
        self.use_cache = use_cache
        if use_cache:
            self.html_cache = LRUCache(maxsize=max_cache_size)

        if mode == 'requests':
            pass
        elif mode == 'selenium':
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            self.driver = webdriver.Chrome(webdriver_path,
                                           chrome_options=chrome_options)
        elif mode == 'requestium':
            from requestium import Session, Keys
            self.session = Session(
                webdriver_path=webdriver_path,
                browser='chrome',
                default_timeout=timeout,
                webdriver_options={'arguments': ['headless']})
        else:
            assert False, '"mode" must be either requests, selenium, or requestium.'

    def _get_html(self, url: str, use_driver: bool = False,
                  check_status: bool = False) -> str:
        """
        TODO: Add an asynchronous queue.

        use_driver is only used for requestium.

        TODO: check_status -- when using the driver we don't know the HTML
        status code:
        https://stackoverflow.com/questions/5799228/how-to-get-status-code-by-using-selenium-py-python-code
        """
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        }
        if self.mode == 'requests':
            raw_html = requests.get(url, headers=headers)
            if raw_html.status_code == 200:
                return raw_html.text
        elif self.mode == 'selenium':
            self.driver.get(url)
            # give the driver some time to load the webpage
            time.sleep(self.loading_time)
            return self.driver.page_source
        elif self.mode == 'requestium':
            if use_driver:
                self.session.driver.get(url)
                # give the driver some time to load the webpage
                time.sleep(self.loading_time)
                return self.session.driver.page_source
            else:
                raw_html = self.session.get(url, headers=headers)
                if raw_html.status_code == 200:
                    return raw_html.text

    def get_html_directly(self, url: str, use_driver: bool = False,
                          check_status: bool = False) -> str:
        """
        Cache wrapper.

        TODO: auto-fix the URL schema, i.e. add http or https
        (requests.exceptions.MissingSchema).
        """
        if self.use_cache:
            if url not in self.html_cache:
                html = self._get_html(url, use_driver, check_status)
                if not html:
                    return None
                self.html_cache[url] = html
            return self.html_cache[url]
        else:
            return self._get_html(url, use_driver, check_status)
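
# A minimal usage sketch for HTMLParser in its default requestium mode;
# the URL is a placeholder.
parser = HTMLParser(mode='requestium', use_cache=True)
html = parser.get_html_directly('https://example.com')
if html:
    print(html[:200])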
class API:
    def __init__(self, path):
        self.last_json = ""
        self.last_response = None
        self.IG_SIG_KEY = '4f8732eb9ba7d1c8e8897a75d6474d4eb3f5279137431b2aafb71fafe2abe178'
        self.SIG_KEY_VERSION = '4'
        self.USER_AGENT = 'Instagram 10.26.0 Android ({android_version}/{android_release}; 640dpi; 1440x2560; {manufacturer}; {device}; {model}; samsungexynos8890; en_US)'.format(
            **DEVICE_SETTINTS)
        self.s = Session(webdriver_path=path,
                         browser='chrome',
                         default_timeout=15)
        self.logger = logging.getLogger('[instatesi_{}]'.format(id(self)))
        self.privateUsers = {}
        self.users = {}

        fh = logging.FileHandler(filename='instatesi.log')
        fh.setLevel(logging.INFO)
        fh.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(fh)
        self.logger.addHandler(ch)
        self.logger.setLevel(logging.DEBUG)
        self.lastUserHandled = None

    def saveScrapedFollowers(self):
        import json
        self.logger.info("Saving the scraped followers...")
        if not os.path.exists(os.getcwd() + "/ScrapedFollowers/" +
                              self.lastUserHandled + ".txt"):
            with open(os.getcwd() + "/ScrapedFollowers/" +
                      self.lastUserHandled + ".txt", "w") as f:
                """
                f.write("Scraped following from " + self.lastUserHandled + "\n")
                f.write("-------Non private users-------\n")
                f.write(json.dumps(self.users[''], indent=2))
                f.write("\n-------Private users-------\n")
                f.write(json.dumps(self.privateUsers, indent=2))
                """
                for k, v in self.users.items():
                    f.write(k + ',\n')
            self.logger.info("Followers successfully saved!")
            self.users = dict()
            self.privateUsers = dict()
        else:
            self.logger.warning(
                "Warning! The user is already present in the database. Overwrite?")
            # Define some logic for file overwriting

    def saveScrapedFollowing(self):
        import json
        self.logger.info("Saving the following's following...")
        if not os.path.exists(os.getcwd() + "/ScrapedFollowing/" +
                              self.lastUserHandled + ".txt"):
            with open(os.getcwd() + "/ScrapedFollowing/" +
                      self.lastUserHandled + ".txt", "w") as f:
                f.write("Scraped following from " + self.lastUserHandled + "\n")
                f.write("-------Non private users-------\n")
                f.write(json.dumps(self.users, indent=2))
                f.write("\n-------Private users-------\n")
                f.write(json.dumps(self.privateUsers, indent=2))
            self.logger.info("Following successfully saved!")
            self.users = dict()
            self.privateUsers = dict()
        else:
            self.logger.warning(
                "Warning! The user is already present in the database. Overwrite?")
            # Define some logic for file overwriting

    def getUserFollowers(self, userID, rank_token, selection="followers"):
        self.logger.info("User ID follower scraping started " + str(userID))
        followers = self.getTotalFollowers(userID, rank_token,
                                           fromInput=selection)
        return [str(item['username'])
                for item in followers][::-1] if followers else []

    def __getUsernameInfo(self, usernameId):
        return self.__send_request('users/' + str(usernameId) + '/info/')

    def __send_request_for_user_followers(self, user_id, rank_token,
                                          max_id='', selection="followers"):
        url = ('friendships/{user_id}/followers/?rank_token={rank_token}'
               if selection == "followers" else
               'friendships/{user_id}/following/?max_id={max_id}&ig_sig_key_version={sig_key}&rank_token={rank_token}')
        url = (url.format(user_id=user_id, rank_token=rank_token)
               if selection == "followers" else
               url.format(user_id=user_id,
                          max_id=max_id,
                          sig_key=self.SIG_KEY_VERSION,
                          rank_token=rank_token))
        if max_id:
            url += '&max_id={max_id}'.format(max_id=max_id)
        return self.__send_request(url)

    def searchUsername(self, username):
        url = 'users/{username}/usernameinfo/'.format(username=username)
        self.logger.info("Looking for user information " + username)
        return self.__send_request(url)

    def getUsernameFromID(self, user_id):
        url = 'users/{user_id}/info/'.format(user_id=user_id)
        self.__send_request(url)
        self.logger.info("Returning the requested username: " +
                         str(self.last_json['user']['username']))
        return self.last_json['user']['username']

    def __generateSignature(self, data, IG_SIG_KEY, SIG_KEY_VERSION):
        body = hmac.new(IG_SIG_KEY.encode('utf-8'), data.encode('utf-8'),
                        hashlib.sha256).hexdigest() + '.' + urllib.parse.quote(data)
        signature = 'ig_sig_key_version={sig_key}&signed_body={body}'
        return signature.format(sig_key=SIG_KEY_VERSION, body=body)

    def castUsernameToUserID(self, usernameToLook):
        self.lastUserHandled = usernameToLook
        userID = ""
        self.searchUsername(usernameToLook)
        if "user" in self.last_json:
            userID = str(self.last_json["user"]["pk"])
            self.logger.info("The username " + usernameToLook +
                             " corresponds to the ID " + userID)
        return userID

    def seeStories(self):
        self.__send_request("feed/reels_tray/")
        return self.last_json

    def getTotalFollowers(self, usernameId, rank_token, fromInput="followers"):
        sleep_track = 0
        followers = []
        next_max_id = ''
        self.__getUsernameInfo(usernameId)
        if "user" in self.last_json:
            total_followers = (self.last_json["user"]['follower_count']
                               if fromInput == "followers" else
                               self.last_json["user"]['following_count'])
            if total_followers > 200000:
                self.logger.warning(
                    "There are over 200,000 followers. It may take a while.")
        else:
            return False
        with tqdm(total=total_followers,
                  desc="Retrieving followers",
                  leave=False) as pbar:
            while True:
                self.__send_request_for_user_followers(usernameId,
                                                       rank_token,
                                                       next_max_id,
                                                       selection=fromInput)
                temp = self.last_json
                try:
                    pbar.update(len(temp["users"]))
                    for item in temp["users"]:
                        if item['is_private']:
                            self.privateUsers[item['username']] = {
                                'ID': item['pk'],
                                'user_handle': item['username'],
                                'is_verified': item['is_verified'],
                                'is_private': item['is_private'],
                                'profile pic': item['profile_pic_url'],
                                'Full Name': item['full_name']
                            }
                        else:
                            self.users[item['username']] = {
                                'ID': item['pk'],
                                'user_handle': item['username'],
                                'is_private': item['is_private'],
                                'is_verified': item['is_verified'],
                                'profile pic': item['profile_pic_url'],
                                'Full Name': item['full_name']
                            }
                        followers.append(item)
                        sleep_track += 1
                        if sleep_track >= 20000:
                            import random
                            sleep_time = random.randint(120, 180)
                            self.logger.info(
                                "Waiting for " + str(float(sleep_time / 60)) +
                                " minutes due to the request volume.")
                            time.sleep(sleep_time)
                            sleep_track = 0
                    if len(temp["users"]) == 0 or len(followers) >= total_followers:
                        self.logger.info(
                            "Returning the followers scraped so far, i.e. " +
                            str(len(followers[:total_followers])))
                        return followers[:total_followers]
                except Exception:
                    self.logger.error(
                        "Returning the followers scraped so far, i.e. " +
                        str(len(followers[:total_followers])))
                    return followers[:total_followers]
                if temp["big_list"] is False:
                    self.logger.info(
                        "Returning the followers scraped so far, i.e. " +
                        str(len(followers[:total_followers])))
                    return followers[:total_followers]
                next_max_id = temp["next_max_id"]

    def __send_request(self, endpoint, post=None, login=False,
                       with_signature=True):
        self.s.headers.update({
            'Connection': 'close',
            'Accept': '*/*',
            'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie2': '$Version=1',
            'Accept-Language': 'en-US',
            'User-Agent': self.USER_AGENT
        })
        try:
            if post is not None:  # POST
                if with_signature:
                    post = self.__generateSignature(post, self.IG_SIG_KEY,
                                                    self.SIG_KEY_VERSION)
                response = self.s.post(
                    'https://i.instagram.com/api/v1/' + endpoint, data=post)
            else:  # GET
                response = self.s.get(
                    'https://i.instagram.com/api/v1/' + endpoint)
        except Exception as e:
            self.logger.error("Exception due to endpoint " + endpoint)
            self.logger.error(e)
            return False
        if response.status_code == 200:
            self.logger.info("The request to the endpoint " + endpoint +
                             " has been successful")
            self.last_response = response
            self.last_json = json.loads(response.text)
            return True
        else:
            return False
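
# A minimal usage sketch for API; the chromedriver path, target username,
# and rank token value are placeholders.
api = API('./chromedriver')
user_id = api.castUsernameToUserID('instagram')
followers = api.getUserFollowers(user_id, rank_token='0000-placeholder-token')
api.saveScrapedFollowers()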
class Github:
    def __init__(self, proxy=None):
        self.cookies = None
        self.sess = Session(webdriver_path='/usr/local/bin/chromedriver',
                            browser='phantomjs',
                            default_timeout=15,
                            webdriver_options={'arguments': ['headless']})
        if proxy:
            self.sess.proxies['http'] = proxy
            self.sess.proxies['https'] = proxy
        self.proxy = proxy
        self.user = None

    def save_session(self, name, password, cookie):
        gprint("save cred and session")
        with open(GITHUB_LOGIN, "wb") as fp:
            u = {"user": name, "pass": password}
            pickle.dump(u, fp)
        with open(GITHUB_SESSION, 'wb') as fp:
            pickle.dump(cookie, fp)

    def load_session(self):
        gprint("load session from github")
        if os.path.exists(GITHUB_SESSION):
            with open(GITHUB_SESSION, 'rb') as fp:
                self.cookies = pickle.load(fp)
            self.sess.cookies.update(self.cookies)
            self.sess.get("https://github.com")
            self.sess.transfer_session_cookies_to_driver()
            with open(GITHUB_LOGIN, 'rb') as fp:
                u = pickle.load(fp)
                self.user = u['user']
        elif os.path.exists(GITHUB_LOGIN):
            with open(GITHUB_LOGIN, 'rb') as fp:
                u = pickle.load(fp)
            self.login(name=u['user'], password=u['pass'])
        else:
            name = input('Github name:')
            passwd = getpass.getpass("Github pass:")
            self.login(name=name, password=passwd)

    def login(self, name, password):
        self.sess.driver.get("https://github.com/login")
        self.sess.driver.find_element_by_css_selector(
            "input[name=login]").send_keys(name)
        self.sess.driver.find_element_by_css_selector(
            "input[name=password]").send_keys(password)
        self.sess.driver.find_element_by_css_selector(
            "input[name=commit]").click()
        self.sess.transfer_driver_cookies_to_session()
        self.cookies = self.sess.cookies.get_dict()
        gprint(str(self.cookies))
        self.save_session(name, password, self.cookies)

    def weak_search(self, key):
        self.load_session()
        self.search(key, "smtp")
        self.search(key, "ssh")
        # with ThreadPoolExecutor(max_workers=10) as exe:
        #     for k in ['smtp', 'ssh', 'email']:
        #         s1 = exe.submit(self.search, key, k)
        #         s1.add_done_callback(print)

    def search(self, *key):
        gprint(key[-1])
        if not self.cookies:
            self.load_session()
        res = requests.get("https://github.com/{}/product".format(self.user))
        self.cookies = res.cookies.get_dict()
        gprint(str(self.cookies))
        url = "https://github.com/search?q={}&type=code".format("+".join(key))
        self.sess.driver.get(url)
        res = self.sess.driver.page_source
        b = BeautifulSoup(res, 'lxml')
        codes = b.select(".code-list-item")
        if len(codes) > 0:
            gprint("Found : %d" % len(codes))
        else:
            gprint("Not found:")
            rprint(b.text.replace("\n", ""))
            # for i in b.select("a"):
            #     gprint(str(i))
        ss = {}
        for code in codes:
            k = code.select(".text-bold")[0].text
            v = {
                colored(str(n), 'green'): i.text.replace("\n", "")
                for n, i in enumerate(code.select("td.blob-code"))
            }
            gprint(colored(k, "blue"))
            Tprint(v)
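
# A minimal usage sketch for Github code search; the query term is a
# placeholder, and load_session() will prompt for credentials if none are
# saved.
gh = Github(proxy=None)
gh.weak_search('example.com')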
class Monster():
    api_throttle_secs = 3

    def __init__( self ):
        self.verbose = False
        self._session = Session(
            webdriver_path=''
            ,browser='chrome'
            ,default_timeout=15
            ,webdriver_options={ 'arguments' : [ 'headless' ] }
        )

    @sleep_and_retry
    @limits( calls=1, period=api_throttle_secs )
    def apply( self, job_link ):
        '''Apply to the job at the given job link for Monster.com.

        Args:
            job_link (str_or_SearchResult): the speed apply link for the job
                to apply to.

        Returns:
            bool: True if successful, False otherwise.
        '''
        if isinstance( job_link, SearchResult ):
            job_link = job_link.ApplyLink
        apply_result = self._session.get( job_link )
        if apply_result.status_code == 200:
            if apply_result.json()['success'] == True:
                return True
            elif self.verbose:
                print( job_link )
                print( apply_result.json() )
        return False

    def batchApply( self, job_links ):
        '''Apply to all jobs in the list of job links given.

        Args:
            job_links (list_or_generator): List, tuple, or generator of job
                links.

        Returns:
            jobs_applied_to (int): The number of jobs applied to successfully.
        '''
        jobs_quantity = 0
        if not isinstance( job_links, types.GeneratorType ):
            jobs_quantity = len( job_links )
        progress_bar = tqdm( total=jobs_quantity, desc='Applying', unit='Jobs' )
        for job_link in job_links:
            if isinstance( job_links, types.GeneratorType ):
                progress_bar.total += 1
            if self.apply( job_link ):
                progress_bar.update( 1 )
        jobs_applied_to = progress_bar.n
        return jobs_applied_to

    @sleep_and_retry
    @limits( calls=1, period=api_throttle_secs )
    def login( self, email, password ):
        '''Login to the Monster.com job board site.

        Args:
            email (str): Email address for logging into Monster.com.
            password (str): Password corresponding to the email address.

        Returns:
            bool: True if successful, False otherwise.
        '''
        # GOTO LOGIN PAGE TO CHECK IF AVAILABLE & GET COOKIES
        login_page = self._session.get( SITE['login'] )
        if login_page.status_code != 200:
            raise Exception( 'ERROR: COULD NOT GET LOGIN PAGE FOR MONSTER.COM : '
                             + SITE['login'] )
        # BUILD FORM DATA
        login_data = {
            'AreCookiesEnabled'          : True
            ,'EmailAddress'              : email
            ,'IsComingFromProtectedView' : False
            ,'IsKeepMeLoggedInEnabled'   : True
            ,'Password'                  : password
            ,'PersistLogin'              : True
        }
        request_verification_token = \
            login_page.xpath('//input[@name="__RequestVerificationToken"]/@value').extract()[0]
        login_data.update( { '__RequestVerificationToken' : request_verification_token } )
        # LOGIN
        login_result = self._session.post( SITE['login'], data=login_data )
        if login_result.status_code == 200:
            return True
        else:
            return False

    @sleep_and_retry
    @limits( calls=1, period=api_throttle_secs )
    def getJobDetails( self, job_link ):
        '''Get a dictionary of details of the job, such as title and
        description.

        Args:
            job_link (str or int): Either a url containing the job id in the
                format of jobid={}, such as the apply link or the job page
                link. Or, directly supply the job id if it is available.

        Returns:
            job_dict (dict): Dictionary of the job link, job title, company
                name, job address, and job description.
        '''
        job_link = str( job_link )
        if not 'jobid' in job_link:
            job_id = job_link
        else:
            job_id = parse.parse_qs( parse.urlparse( job_link ).query )['jobid'][0]
        job_url = SITE['job'].format( job_id )
        job_page = self._session.get( job_url )
        job_json = job_page.json()
        job_description = job_json['jobDescription']
        job_title = job_json['companyInfo']['companyHeader']
        company_name = job_json['companyInfo']['name']
        job_address = job_json['companyInfo']['jobLocation']
        job_dict = {
            'job_link'         : job_link
            ,'job_title'       : job_title
            ,'job_address'     : job_address
            ,'company_name'    : company_name
            ,'job_description' : job_description
        }
        return job_dict

    def search( self, quantity=25, filter_out_recruiting_agencies=True, **kwargs ):
        '''Search Monster.com with the given filters and yield job links.

        Args:
            quantity (int): The max number of results to return.
            kwargs (dict): Dictionary of filters, such as keywords,
                type (full_time, part_time), and posteddaysago.

        Returns:
            SearchResult (namedtuple): generator of named tuples, each
                containing an ApplyLink and a DetailsLink. The ApplyLink,
                when followed, will apply for the job automatically. The
                DetailsLink will return json data about the job.
        '''
        search_url = SITE['search']['root']
        # HANDLE SPECIAL CASE OF JOB TYPE, WHICH MUST PRECEDE QUERY
        job_type_value = ''
        if 'type' in kwargs:
            job_type = kwargs['type']
            options = SITE['search']['type']['options']
            job_type_value = options[job_type] if job_type in options else ''
            kwargs.pop( 'type' )
        search_url = search_url.format( type=urllib.parse.quote_plus( job_type_value ) )
        # FORMAT URL WITH REMAINING FILTERS
        for search_field, search_value in kwargs.items():
            if search_field in SITE['search']:
                if isinstance( SITE['search'][search_field], dict ):
                    options = SITE['search'][search_field]['options']
                    if search_value in options:
                        options_value = options[search_value]
                        search_url += '+' + urllib.parse.quote_plus( options_value )
                else:
                    search_format = SITE['search'][search_field]
                    search_url += \
                        '&{0}'.format( search_format.format( urllib.parse.quote_plus( search_value ) ) )

        @sleep_and_retry
        @limits( calls=1, period=self.api_throttle_secs )
        def getPage( page ):
            paged_search_url = search_url + '&page=' + str( page )
            search_page = self._session.get( paged_search_url )
            return search_page

        # GET AND PROCESS RETURNED JSON
        quantity_returned = 0
        page = 1
        while quantity_returned < quantity:
            search_page = getPage( page )
            if search_page.status_code != 200:
                break
            search_json = search_page.json()
            for app_dict in search_json:
                if all( key in app_dict for key in ( 'MusangKingId', 'ApplyType' ) ):
                    if app_dict['MusangKingId'] != 0 and app_dict['ApplyType'] != None:
                        # filter jobs that are missing data / poorly formatted
                        if any( x in app_dict['ApplyType']
                                for x in QUICK_APPLY_KEYWORDS ):
                            # filter to include quick apply jobs only
                            if not any( x.lower() in app_dict['Company']['Name'].lower()
                                        for x in RECRUITING_AGENCY_KEYWORDS ) or \
                               not filter_out_recruiting_agencies:
                                # filter out jobs from recruiting agencies
                                job_id = app_dict['MusangKingId']
                                apply_url = SITE['speedapply'].format( job_id )
                                details_url = SITE['job'].format( job_id )
                                search_result = SearchResult( apply_url, details_url )
                                quantity_returned += 1
                                yield search_result
                                if quantity_returned >= quantity:
                                    break
            page += 1
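
# A minimal usage sketch for Monster; the credentials and search keywords are
# placeholders, and SITE must be configured as in the original project.
monster = Monster()
monster.login( 'user@example.com', 'secret' )
results = monster.search( quantity=10, keywords='python developer' )
applied = monster.batchApply( results )
print( applied, 'applications submitted' )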
class EFundsInfo:
    def __init__(self):
        self.session = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless']})

    def __enter__(self):
        self.session = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            default_timeout=15,
            webdriver_options={'arguments': ['headless']})
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.session.driver.quit()

    def e_funds_plan(self):
        self.session.driver.get("https://qieman.com/longwin/index")
        plan_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='plan-asset']")
        plan_list = []
        for i, tr in enumerate(
                plan_div.find_elements_by_xpath("div//table[2]//tr")[1:],
                start=1):
            summary_list = tr.text.splitlines()
            abbreviation = summary_list[0]
            fund_name = summary_list[1][:-8]
            fund_code = summary_list[1][-7:-1]
            own_amount = re.compile(r"[持有](\d+)[份]").search(
                summary_list[2]).group(1)
            proportion = re.compile(r"[:]([-\d\.]+)").search(
                summary_list[2]).group(1)
            floating_pl = re.compile(r"[:]([-\d\.]+)").search(
                summary_list[3]).group(1)
            plan_list.append({
                'key': i,
                'abbreviation': abbreviation,
                'fund_name': fund_name,
                'fund_code': fund_code,
                'own_amount': own_amount,
                'proportion': proportion,
                'floating_pl': floating_pl,
            })
        df = pd.DataFrame(plan_list)
        df.key = pd.to_numeric(df.key)
        df.own_amount = pd.to_numeric(df.own_amount)
        df.proportion = pd.to_numeric(df.proportion)
        df.floating_pl = pd.to_numeric(df.floating_pl)
        return df

    def transaction_history(self, func_code):
        history = []
        today = arrow.now().format('YYYY-MM-DD')
        self.session.driver.get(
            "https://qieman.com/longwin/funds/{func_code}".format(
                func_code=func_code))
        history_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='history']")
        detail_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='details']")
        amount_div_list = detail_div.find_elements_by_xpath(
            "div//span[@class='qm-amount']")
        average_price, latest_price = (amount_div_list[0].text,
                                       amount_div_list[1].text)
        history.append({
            'key': 'a',
            'date': today,
            'price': average_price,
            'action': 'a'
        })
        history.append({
            'key': 'y',
            'date': today,
            'price': latest_price,
            'action': 'y'
        })
        for idx, td in enumerate(
                history_div.find_elements_by_xpath("table/tbody/tr")):
            deal_date = td.find_element_by_xpath(
                "td//div[@class='variety-title']").text
            deal_price = td.find_element_by_xpath(
                "td//span[@class='qm-amount']").text
            action_text = td.find_element_by_xpath(
                "td//div[@class='order-action']").text
            action = "b" if "买" in action_text else "s"
            amount = pd.to_numeric(
                re.compile(r"[入|出](\d+)[份]").search(action_text).group(1))
            history.extend([{
                'key': '{index}{action}{count}'.format(index=idx,
                                                       action=action,
                                                       count=i),
                "date": deal_date,
                "price": deal_price,
                "action": action
            } for i in range(amount)])
        # df = pd.DataFrame(history).set_index("date")
        df = pd.DataFrame(history)
        # df.index = pd.to_datetime(df.index)
        df.price = pd.to_numeric(df.price)
        return df

    def e_fund_cost(self, func_code):
        self.session.driver.get(
            "https://qieman.com/longwin/funds/{func_code}".format(
                func_code=func_code))
        detail_div = self.session.driver.ensure_element_by_xpath(
            "//section[@class='details']")
        cost = detail_div.find_element_by_xpath(
            "div//span[@class='qm-amount']").text
        return pd.to_numeric(cost)

    def fund_value_history(self, fund_code, duration='1m'):
        """
        Query fund trading history data from Sina finance.

        :param duration: string, default '1m' (one month of history), or one
            of the following:
            '1m' - one month of history,
            '3m' - three months of history,
            '6m' - six months of history,
            '1y' - one year of history,
            '2y' - two years of history,
            '3y' - three years of history.
        :param fund_code: string specifying the code of the fund to query
        :return: DataFrame:
            date - index, trading date,
            value - fund net / annual income,
            total - accumulated net value / fund million return,
            change - fund net growth rate
        """
        kv = {'1m': -1, '3m': -3, '6m': -6, '1y': -12, '2y': -24, '3y': -36}
        duration_arrow = self.get_last_trading_info(fund_code)['date'].shift(
            months=kv.get(duration, -1))
        df = ts.get_nav_history(fund_code, duration_arrow).reset_index()
        df.date = df.date.astype(str)
        return df

    def get_last_trading_date(self, fund_code):
        today = arrow.now().shift(months=-1)
        while True:
            latest_df = ts.get_nav_history(fund_code,
                                           today.format('YYYY-MM-DD'))
            if latest_df is not None:
                return arrow.get(latest_df.index[0])
            else:
                today = today.shift(months=-1)

    def get_last_trading_info(self, fund_code):
        today = arrow.now().shift(months=-1)
        while True:
            latest_df = ts.get_nav_history(fund_code,
                                           today.format('YYYY-MM-DD'))
            if latest_df is not None:
                return {
                    'date': arrow.get(latest_df.index[0]),
                    'price': latest_df.value[0]
                }
            else:
                today = today.shift(months=-1)

    def real_time_valuation(self, fund_code: str):
        if fund_code == '001061':
            latest_info = self.get_last_trading_info(fund_code)
            # there is no real time valuation api for 001061
            valuation_date = latest_info['date']
            real_time_value_list = [['0930', latest_info['price']],
                                    ['1500', latest_info['price']]]
        elif fund_code.startswith('16'):
            # res = self.session.get("http://qt.gtimg.cn/q=sz{func_code}".format(func_code=func_code))
            res = self.session.get(
                "http://data.gtimg.cn/flashdata/hushen/minute/sz{func_code}.js"
                .format(func_code=fund_code))
            real_time_value_list = []
            data_list = res.text.replace('\\n\\', '').splitlines()
            valuation_date = '{year}-{month}-{day}'.format(
                year='20' + data_list[1][-6:-4],
                month=data_list[1][-4:-2],
                day=data_list[1][-2:])
            for i in data_list[2:-1]:
                time, value, _ = i.split()
                real_time_value_list.append([time, value])
        else:
            res = self.session.get(
                "http://web.ifzq.gtimg.cn/fund/newfund/fundSsgz/getSsgz?app=web&symbol=jj{func_code}"
                .format(func_code=fund_code))
            json_dict = json.loads(res.text)['data']
            valuation_date = json_dict['date']
            real_time_value_list = json_dict['data']
        result = []
        for i in real_time_value_list:
            result.append({
                # 'time': '{date} {hour}:{minute}:00'.format(date=valuation_date, hour=i[0][:2], minute=i[0][2:]),
                'time': '{hour}:{minute}'.format(hour=i[0][:2],
                                                 minute=i[0][2:]),
                'value': i[1],
            })
        df = pd.DataFrame(result)
        df.value = pd.to_numeric(df.value)
        return df


# f100032 = EFundsInfo()
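
# A minimal usage sketch for EFundsInfo as a context manager (it defines
# __enter__/__exit__); the fund code is a placeholder.
with EFundsInfo() as info:
    plan = info.e_funds_plan()
    history = info.transaction_history('000968')
    print(plan.head())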
class Driver(object):
    def __init__(self):
        # Use requestium's Session (requests + Selenium); headless mode optional
        self.s = Session(
            webdriver_path='./chromedriver',
            browser='chrome',
            default_timeout=15,
            # webdriver_options={'arguments': ['headless']}
        )
        # process_cookies() expects this attribute to exist
        self.category_mapping = None
        path = os.path.join(os.getcwd(), FILENAME)
        if os.path.exists(path):
            self.category_mapping = ujson.load(open(path))
            # pprint(self.category_mapping)

    def close(self):
        if self.s.driver is not None:
            self.s.driver.quit()
        if self.s is not None:
            self.s.close()

    def login(self):
        """Log in to Qixin with the driver"""
        login_url = 'http://www.qixin.com/auth/login?return_url=%2F'
        self.s.driver.get(login_url)
        # Locate elements with requestium's ensure_* helpers
        user_element = self.s.driver.ensure_element_by_xpath(
            LOGIN_XPATH['username'])
        for c in USERNAME:  # type username and password with random pauses
            user_element.send_keys(c)
            time.sleep(random.randint(0, 2))
        password_element = self.s.driver.ensure_element_by_xpath(
            LOGIN_XPATH['password'])
        for c in PASSWORD:
            password_element.send_keys(c)
            time.sleep(random.random())
        password_element.send_keys(Keys.ENTER)
        self.s.driver.implicitly_wait(20)

    def process_cookies(self):
        """Fetch pages with requests"""
        # Hand the driver's cookies to the requests session
        tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1'
        self.s.driver.get(tmp_url)
        self.s.transfer_driver_cookies_to_session()
        self.s.copy_user_agent_from_driver()
        # Build the category mapping if it does not exist yet
        if self.category_mapping is None:
            req = self.s.get('http://www.qixin.com')
            self.category_mapping = {}
            for element in req.xpath(CATEGORY_XPATH['info']):
                category_l1 = element.xpath(
                    CATEGORY_XPATH['l1']).extract_first().strip()
                category_l2 = element.xpath(CATEGORY_XPATH['l2']).extract()
                self.category_mapping[category_l1] = category_l2
            ujson.dump(self.category_mapping,
                       open(os.path.join(os.getcwd(), FILENAME), 'w'))

    def fetch_page_with_chrome(self, url):
        self.s.transfer_session_cookies_to_driver()
        self.s.driver.get(url)

    def fetch_page_with_requests(self, url):
        """
        url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page

        :param url: the URL to request
        :return: a list of parsed results
        """
        # With the cookies in place, scrape data using the requests session
        self.s.proxies.update({
            'http': 'http://forward.xdaili.cn:80',
            'https': 'https://forward.xdaili.cn:80'
        })
        self.s.headers.update({'Proxy-Authorization': sign()})
        req = self.s.get(url)
        result = parse_list(req)
        return result
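
# A minimal usage sketch for this Driver; USERNAME, PASSWORD, LOGIN_XPATH,
# CATEGORY_XPATH, FILENAME, sign() and parse_list() must be configured
# elsewhere, as in the original project.
driver = Driver()
driver.login()
driver.process_cookies()
rows = driver.fetch_page_with_requests(
    'http://www.qixin.com/search?area.province=12&page=1&scope[]=1&sorter=4')
driver.close()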
class HuPu:
    commentaries = [
        '朋友圈每日更新 各种秒价第一时间了解:clpro7',
        '回收各种球鞋 aj 喷泡 椰子 实战 急用鞋换钱 闲置清理空间 全新二手皆可 打包优先 寻求多方合作 更多精彩尽在: clpro7',
        '最大限度发挥球鞋价值 接各种套现寄卖 全新二手都可以 加微信:clpro7 秒价实时更新'
    ]
    mail = {
        'recipient': '*****@*****.**',
        'subject': 'HuPu',
        'content': '请重新登录 http://39.107.86.245:8080'
    }

    def __init__(self, comment_count=30, commentaries=None,
                 start_at=8, end_with=23):
        self.s = Session(
            './chromedriver', 'chrome', default_timeout=60,
            webdriver_options={'arguments': ['headless', 'disable-gpu',
                                             f'user-agent={user_agent}']}
        )
        self.s.headers.update(s_headers)
        self.comment_count = comment_count
        self.commentaries = commentaries
        self.start_at = start_at
        self.end_with = end_with
        self.posts = Queue()
        self.exception_recoder = []

    @ExceptionReporter
    def login(self, third_party):
        """third party can be vx or qq"""
        third_parties = {'vx': 0, 'qq': 1}
        resp = self.s.get('https://passport.hupu.com/pc/login')
        qrcode_urls = resp.xpath(
            '//div[@class="login-method"]/a/@data-href').extract()
        qrcode_url = qrcode_urls[third_parties.get(third_party)]
        if third_party == 'qq':
            qrcode_url = 'https://passport.hupu.com' + qrcode_url
        self.s.driver.get(qrcode_url)
        self.s.driver.get_screenshot_as_file('qrcode.png')
        logger.info('qrcode saved!')

    @ExceptionReporter
    def get_topic_url(self):
        """Get the link to the user's topic page"""
        self.s.driver.get('https://www.hupu.com')
        iuid = self.s.driver.ensure_element_by_id('g_m').get_attribute('iuid')
        self.topic_url = f'https://my.hupu.com/{iuid}/topic'

    @ExceptionReporter
    def get_posts(self):
        """Only comment in the second-hand trading board"""
        logger.info('updating posts......')
        self.s.driver.get(self.topic_url)
        posts = self.s.driver.find_elements_by_xpath(
            '//table[@class="mytopic topiclisttr"]//a')[:self.comment_count * 2]
        links, plates = posts[::2], posts[1::2]
        for link, plate in zip(links, plates):
            if plate.text == '二手交易区':
                self.posts.put(link.get_attribute('href'))

    def up_post(self, post_url):
        """Bump one post; send an alert mail after ten consecutive failures"""
        try:
            self.s.driver.get(post_url)
            self.s.driver.ensure_element_by_id('atc_content').send_keys(
                choice(self.commentaries))
            self.s.driver.ensure_element_by_id('fastbtn').ensure_click()
            time.sleep(randrange(60, 120))
            if 'post.php?action=reply' in self.s.driver.current_url:
                logger.error('up post error! %s', post_url)
                self.exception_recoder.append(False)
            else:
                logger.info('up post success! %s', post_url)
                self.exception_recoder.append(True)
        except Exception:
            self.exception_recoder.append(False)
        if len(self.exception_recoder) < 10:
            return
        if any(self.exception_recoder):
            self.exception_recoder.pop(0)
        else:
            send_mail()

    def is_boundary(self):
        """Check whether we are inside the allowed time window"""
        now = arrow.now()
        if now.hour >= self.end_with:
            logger.info('%s sleeping, will resume at %s tomorrow',
                        now, self.start_at)
            time.sleep((now.shift(days=1).replace(hour=self.start_at, minute=0)
                        - now).seconds)
        elif now.hour < self.start_at:
            time.sleep((now.replace(hour=self.start_at, minute=0)
                        - now).seconds)
            logger.info('%s sleeping, will resume at %s', now, self.start_at)

    def up_posts(self):
        """Bump posts within the allowed time window"""
        while True:
            self.is_boundary()
            while self.posts.empty():
                self.get_posts()
            self.up_post(self.posts.get())
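
# A minimal usage sketch for HuPu; login is interactive: scan the saved
# qrcode.png to finish logging in.
hupu = HuPu(comment_count=30, commentaries=HuPu.commentaries)
hupu.login('vx')
hupu.get_topic_url()
hupu.up_posts()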
class Driver(object):
    def __init__(self):
        # Use requestium's Session (requests + Selenium); headless mode optional
        self.s = Session(
            webdriver_path='./chromedriver',
            browser='chrome',
            default_timeout=15,
            # webdriver_options={'arguments': ['headless']}
        )
        self.category_mapping = None
        path = os.path.join(os.getcwd(), FILENAME)
        if os.path.exists(path):
            self.category_mapping = ujson.load(open(path))
            # pprint(self.category_mapping)

    def close(self):
        if self.s.driver is not None:
            self.s.driver.quit()
        if self.s is not None:
            self.s.close()

    def login(self):
        """Log in to Qixin with the driver"""
        login_url = 'http://www.qixin.com/auth/login?return_url=%2F'
        self.s.driver.get(login_url)
        # Locate elements with requestium's ensure_* helpers
        username_xpath = '//input[@class="form-control input-lg input-flat input-flat-user"]'
        user_element = self.s.driver.ensure_element_by_xpath(username_xpath)
        for c in USERNAME:  # type username and password with random pauses
            user_element.send_keys(c)
            time.sleep(random.randint(0, 2))
        password_xpath = '//input[@class="form-control input-lg input-flat input-flat-lock"]'
        password_element = self.s.driver.ensure_element_by_xpath(
            password_xpath)
        for c in PASSWORD:
            password_element.send_keys(c)
            time.sleep(random.random())
        password_element.send_keys(Keys.ENTER)
        self.s.driver.implicitly_wait(10)

    def process_cookies(self):
        """Fetch pages with requests"""
        # Hand the driver's cookies to the requests session
        tmp_url = 'http://www.qixin.com/search?area.province=12&page=1&scope[]=1'
        self.s.driver.get(tmp_url)
        self.s.transfer_driver_cookies_to_session()
        self.s.copy_user_agent_from_driver()
        # Build the category mapping if it does not exist yet
        if self.category_mapping is None:
            req = self.s.get('http://www.qixin.com')
            self.category_mapping = {}
            for element in req.xpath('//div[@class="grid-item"]'):
                category_l1 = element.xpath(
                    './div/text()').extract_first().strip()
                category_l2 = element.xpath('./a/text()').extract()
                self.category_mapping[category_l1] = category_l2
            ujson.dump(self.category_mapping,
                       open(os.path.join(os.getcwd(), FILENAME), 'w'))

    def fetch_page(self):
        # With the cookies in place, scrape data using the requests session
        result = []
        self.s.proxies.update({
            'http': 'http://forward.xdaili.cn:80',
            'https': 'https://forward.xdaili.cn:80'
        })
        for page in range(1, 11):
            url = 'http://www.qixin.com/search?area.province=12&page=%s&scope[]=1&sorter=4' % page
            self.s.headers.update({'Proxy-Authorization': sign()})
            req = self.s.get(url)
            for element in req.xpath(
                    "//div[contains(@class, 'company-item')]"):
                result.append({
                    'title':
                    element.xpath(".//div[@class='company-title']/a/text()"
                                  ).extract_first().strip(),
                    'legal_owner':
                    element.xpath(".//div[@class='legal-person'][1]/text()"
                                  ).re_first(r'法定代表人:(\w*)').strip(),
                    'status':
                    element.xpath(
                        ".//div[@class='company-tags']/span[1]/text()"
                    ).extract_first().strip(),
                    'capital':
                    element.xpath(".//div[contains(@class, 'col-3-1')]/text()"
                                  ).extract_first().strip(),
                    'date':
                    element.xpath(".//div[contains(@class, 'col-3-2')]/text()"
                                  ).extract_first().strip(),
                    'url':
                    element.xpath(".//div[@class='company-title']/a/@href"
                                  ).extract_first().strip()
                })
            time.sleep(10)
        return result

    def process_search_condition(self):
        """
        Build the search conditions.

        * URL: http://www.qixin.com/search?
        * param area: area.province=12, area.district=120101-120119
        * param search scope: scope[]=1
        * param sort order: sorter=3 | 4
        * param registered capital: capital: 1-5
        * param industry: industry.l1 first-level industry, industry.l2 second-level industry
        * param registration year: year: 1-5
        * param page: page number, at most 500; only 5000 search results are visible

        http://www.qixin.com/search?area.district=120101&area.province=12&capital=2&industry.l1=%E5%86%9C%E3%80%81%E6%9E%97%E3%80%81%E7%89%A7%E3%80%81%E6%B8%94%E4%B8%9A&industry.l2=%E5%86%9C%E4%B8%9A&page=1&scope[]=1&sorter=4&year=5
        """
        pass
class AuM(object):
    def __init__(self):
        # Create a session and authenticate
        self._s = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome',
            webdriver_options={"arguments": ["--headless"]})
        self._s.headers.update({
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
        })
        self._s.get('https://www.adopteunmec.com')  # Maybe not needed
        # Register a new account
        rand_s = ''.join(
            random.choice(string.ascii_lowercase + string.digits)
            for _ in range(6))
        print('email used: francois_%[email protected]' % rand_s)
        r = self._s.post('https://www.adopteunmec.com/register/index',
                         data={
                             'sex': '1',
                             'day': '03',
                             'month': '4',
                             'year': '1997',
                             'email': '*****@*****.**' % rand_s,
                             'password': '******',
                             'password_check': 'Adottami1',
                             'country': 'fr',
                             'zipcode': '06000',
                             'city': 'Nice',
                             'confirm_city': '0',
                             'pseudo': 'RedoAA',
                             'cgu': '1',
                             'reg_submit': '',
                             'by_popup': '1',
                             'PreventChromeAutocomplete': ''
                         },
                         headers={
                             "X-Requested-With": "XMLHttpRequest",
                             "Origin": "https://www.adopteunmec.com/",
                             "Referer": "https://www.adopteunmec.com/"
                         })
        status = r.json()
        # If registration was successful, go to the redirect page to confirm
        # the account
        if status['success'] == 1:
            self._s.get(status['redirect'])
        else:
            print('Something went wrong....')
        self._common_names = (
            'loic', 'marc', 'anthony', 'tom', 'jordan', 'florian', 'jean',
            'manu', 'seb', 'alex', 'lilian', 'angelo', 'fred', 'valent',
            'fabrice', 'fabien', 'nico', 'thomas', 'sylvain', 'tim', 'karim',
            'robin', 'pierre', 'arnaud', 'max', 'luc', 'mike', 'yann', 'oliv',
            'yvan', 'jerem', 'michel', 'mat', 'kev', 'damien', 'vinc', 'eric',
            'gilles', 'jona', 'bruno', 'simon', 'adri', 'serge', 'tony',
            'jul', 'quentin', 'leo', 'step', 'gab', 'david', 'paul',
            'killian', 'alvaro', 'ronan', 'anto', 'jb', 'jp', 'jon',
            'patrick', 'virgile', 'juju', 'stef', 'franck', 'alan', 'alain',
            'albin', 'alban', 'fran', 'cyril', 'laure', 'phil', 'jacques',
            'jack', 'ludo', 'chris', 'vic', 'jo', 'charles', 'geoffrey',
            'igor', 'ciro', 'erwan', 'fabio', 'guillaume', 'thibaut',
            'romain', 'rafa', 'lionel', 'cedric', 'xavier')

    def _common_name(self, name):
        # True if the profile name contains one of the common first names
        return any(n in name.lower() for n in self._common_names)

    def search_by_region(self, age_min=20, age_max=30, region=1, sex=0):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'region',
            'region': region,
            "sex": sex
        })

    def search_by_distance(self, age_min=20, age_max=30, distance=40, sex=0):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'distance',
            'distance[max]': distance,
            "sex": sex
        })

    def search(self, criteria=None):
        if criteria is None:
            return []
        # Go to the search page
        self._s.get('https://www.adopteunmec.com/mySearch')
        # POST the search request
        r = self._s.post('https://www.adopteunmec.com/mySearch/save',
                         data=criteria)
        time.sleep(3)  # Wait a bit...
        # Transfer cookies to selenium, refresh the page, scroll to the end
        # 10 times, and collect the profiles
        self._s.transfer_session_cookies_to_driver()
        self._s.driver.get('https://www.adopteunmec.com/mySearch/results')
        for i in range(10):
            self._s.driver.find_element_by_tag_name('html').send_keys(Keys.END)
            time.sleep(.1)
        html = BeautifulSoup(
            self._s.driver.execute_script("return document.body.innerHTML"),
            'lxml')
        self._s.transfer_driver_cookies_to_session()
        self._s.driver.close()  # Might be done before?
        # Look for <div> tags containing user info
        blocks = html.find_all('div', {'class': 'user-infos'})
        # Flatten all <a> tags into a single list
        all_a = [a for sl in [b.find_all('a') for b in blocks] for a in sl]
        # Extract profile IDs, filtering out common first names to avoid
        # visiting too many profiles later
        profiles = [
            l.get('href').split('/')[-1] for l in all_a
            if isinstance(l.get('href'), str)
            and l.get('href').find('profile') > 0
            and len(l.get_text()) > 2
            and not self._common_name(l.get_text())
        ]
        return profiles

    def update_db(self, profiles=[], max_p=None,
                  filename='data/justemenemoi.json'):
        db = {}
        try:
            with open(filename, 'r') as in_f:
                db = json.load(in_f)
        except (IOError, ValueError):
            pass
        visited = 0
        for uid in profiles:
            # Skip profiles already in the db
            if uid not in db:
                if max_p is not None and visited >= max_p:
                    break
                visited += 1
                url = "https://www.adopteunmec.com/profile/" + uid
                page = self._s.get(url)
                html = BeautifulSoup(
                    page.content.decode('utf-8', 'xmlcharrefreplace'), 'lxml')
                name = html.find('div', {'class': 'username'}).get_text()
                desc = html.find(text='Description').find_parent('div').find(
                    'p').get_text()
                shop = html.find(text='Shopping List').find_parent('div').find(
                    'p').get_text()
                # Profile filtering
                if desc.find("non renseign") >= 0 or shop.find(
                        "non renseign") >= 0 or len(desc) < 20 or len(shop) < 20:
                    continue
                img_url = html.find(id='img-current-pic')['src']
                img_name = img_url.split('/')[-1]
                db[uid] = {
                    "profile": url,
                    "name": name,
                    "img": img_name,
                    "age": html.find('span', {'class': 'age'}).get_text(),
                    "city": html.find('span', {'class': 'city'}).get_text(),
                    "desc": desc,
                    "shop": shop
                }
                # Download and save the profile pic
                pic = self._s.get(img_url, stream=True)
                pic.raw.decode_content = True
                with open("data/pics/" + img_name, 'wb') as f:
                    shutil.copyfileobj(pic.raw, f)
                time.sleep(.5)  # Bit of rest...
        # Write back the json, both to a file and as a JS var
        json_s = json.dumps(db)
        with open(filename, 'w') as out_f:
            out_f.write(json_s)
        with open(filename + '.js', 'w') as out_f:
            out_f.write("data = ")
            out_f.write(json_s)
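
# A minimal usage sketch for AuM; the search criteria mirror the defaults
# above, and max_p caps the number of profiles visited.
aum = AuM()
profiles = aum.search_by_region(age_min=20, age_max=30, region=1)
aum.update_db(profiles, max_p=5)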
class Charme(object):
    def __init__(self):
        # Create a session and authenticate
        self._s = Session(
            webdriver_path='/usr/lib/chromium-browser/chromedriver',
            browser='chrome')
        # webdriver_options={"arguments": ["--headless"]})
        self._s.headers.update({
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0'
        })
        # Login
        r = self._s.post('https://www.adopteunmec.com/auth/login',
                         data={
                             'username': '******',
                             'password': '******'
                         })
        if not r.ok:
            raise Exception('Something wrong in login')
        else:
            time.sleep(2)

    def search_by_distance(self, age_min=20, age_max=30, distance=40, sex=1):
        return self.search({
            'age[min]': age_min,
            'age[max]': age_max,
            'by': 'distance',
            'distance[max]': distance,
            "sex": sex
        })

    def search(self, criteria=None):
        if criteria is None:
            return []
        # Go to the search page
        self._s.get('https://www.adopteunmec.com/mySearch')
        time.sleep(1)
        # POST the search request
        r = self._s.post('https://www.adopteunmec.com/mySearch/save',
                         data=criteria)
        time.sleep(3)  # Wait a bit...
        # Transfer cookies to selenium, refresh the page, scroll to the end
        # 10 times, and collect the profiles
        self._s.transfer_session_cookies_to_driver()
        self._s.driver.get('https://www.adopteunmec.com/mySearch/results')
        for i in range(10):
            self._s.driver.find_element_by_tag_name('html').send_keys(Keys.END)
            time.sleep(.1)
        html = BeautifulSoup(
            self._s.driver.execute_script("return document.body.innerHTML"),
            'lxml')
        self._s.transfer_driver_cookies_to_session()
        self._s.driver.close()  # Might be done before?
        # Look for <div> tags containing user info
        blocks = html.find_all('div', {'class': 'user-infos'})
        # Flatten all <a> tags into a single list
        all_a = [a for sl in [b.find_all('a') for b in blocks] for a in sl]
        # Extract profile IDs
        profiles = [
            l.get('href').split('/')[-1] for l in all_a
            if isinstance(l.get('href'), str)
            and l.get('href').find('profile') > 0 and len(l.get_text()) > 2
        ]
        return profiles

    def charme(self, profiles=[], max_p=10, filename='data/charme.json'):
        db = {}
        try:
            with open(filename, 'r') as in_f:
                db = json.load(in_f)
        except (IOError, ValueError):
            pass
        visited = 0
        for uid in profiles:
            # Skip profiles already in the db
            if uid not in db:
                if max_p is not None and visited >= max_p:
                    break
                visited += 1
                url = "https://www.adopteunmec.com/profile/" + uid
                print("Visiting", url)
                page = self._s.get(url)
                html = BeautifulSoup(
                    page.content.decode('utf-8', 'xmlcharrefreplace'), 'lxml')
                img_url = html.find(id='img-current-pic')['src']
                img_name = img_url.split('/')[-1]
                date = datetime.datetime.now().strftime("%m-%d %H:%M")
                db[uid] = {
                    "profile": url,
                    "name": html.find('div', {'class': 'username'}).get_text(),
                    "img": img_name,
                    "age": html.find('span', {'class': 'age'}).get_text(),
                    "city": html.find('span', {'class': 'city'}).get_text(),
                    "desc": html.find(text='Description').find_parent(
                        'div').find('p').get_text(),
                    "shop": html.find(text='Shopping List').find_parent(
                        'div').find('p').get_text(),
                    "charmed": date
                }
                # Download and save the profile pic
                pic = self._s.get(img_url, stream=True)
                pic.raw.decode_content = True
                with open("data/pics/" + img_name, 'wb') as f:
                    shutil.copyfileobj(pic.raw, f)
                time.sleep(20)  # Bit of rest...
                # Send a charme
                url = "https://www.adopteunmec.com/events/charm?id=" + uid
                r = self._s.get(url)
                if r.json()['member']['id'] != uid:
                    raise Exception('Something wrong in response')
        # Write back the json
        json_s = json.dumps(db)
        with open(filename, 'w') as out_f:
            out_f.write(json_s)