def createSession(crawlingWebsite):
    print "\nCrawler Initiated. Searching in '" + crawlingWebsite + "' for domains.\n\n"
    dryscrape.start_xvfb()
    # Begin new session with loaded scripts
    try:
        session = dryscrape.Session()
        session.visit(crawlingWebsite)
        response = session.body()
        # Reset session for handling memory issues
        session.reset()
    except InvalidResponseError as e:
        print "Cannot open " + crawlingWebsite + "\n"
        print 'InvalidResponseError:', e
        quit()

    soup = BeautifulSoup(response, "html.parser")

    # Searches for hyperrefs in a page. This is the hardcoded bit.
    # Searching for items is webpage-specific. For a different website, please refer to its HTML content
    # to find out which tags are needed to obtain a list of domains, if any.
    tableFound = soup.findAll("a", {"target": "_blank"})
    if len(tableFound) == 0:
        print "Nothing found. Terminating crawler."
        quit()
    else:
        for row in tableFound:
            # Add found domains to the list of sites
            siteList.append(row.get('href'))
def findImdbUrl(movie_title, movie_writer):
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    link = "http://www.imdb.com/find?q=" + urllib.quote(movie_title) + "&s=all"
    session.visit(link)
    response = session.body()
    soup = BeautifulSoup(response)
    div = soup.find(lambda tag: tag.name == 'div' and tag.has_key('class') and tag['class'] == 'findSection')
    if (div):
        div_content = "".join([unicode(x) for x in div.contents])
        title_search = re.compile('/title/tt\d+')
        search_results = re.findall(title_search, div_content)
        for movie_url in search_results:
            try:
                names = extractPeopleBehind("http://www.imdb.com" + movie_url + "/")
                if not set(movie_writer).isdisjoint(names):
                    return "http://www.imdb.com" + movie_url + "/"
                #soup_search = BeautifulSoup(resp_search)
                #people_behind = soup_search.findall(lambda tag: tag.name=='div' and tag.has_key('class') and tag['class']=='credit_summary_item')
                #for people in people_behind: print people.text
            except:
                pass
    return None
def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()
    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name == 'p')
    links = {}
    writers = {}
    for movie in movies:
        #<p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name == 'a').text
        if (movie_title.endswith(", The")):
            movie_title = "The " + movie_title.replace(", The", "")
        movie_url = "http://www.imsdb.com" + urllib.quote(movie.find(lambda tag: tag.name == 'a').get("href"))
        movie_writer = movie.find(lambda tag: tag.name == 'i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))
        #print movie_title, movie_url, movie_writer_list
        links[movie_title] = movie_url
        writers[movie_title] = movie_writer_list
    return (links, writers)
def get_jobs(url):
    ret = {}
    jobs = []
    rake_object = rake.Rake("/root/freshack/Jobscraper/freshdeskhack/SmartStoplist.txt", 3, 2, 1)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    html_page = session.body()
    soup = BeautifulSoup(html_page, 'lxml')
    master_tag = soup.find_all("div", class_="fd-posdesc")
    for tag in master_tag:
        job = {}
        job["title"] = tag.h3.string
        div_list = tag.find_all("div")
        job_desc = ""
        for childdiv in div_list:
            text = childdiv.string
            if text:
                job_desc = job_desc + text
        keywords = rake_object.run(job_desc)
        words = []
        for word in keywords:
            if "year" not in word[0]:
                words.append(word[0])
            else:
                job["experience"] = word[0]
        job["keywords"] = words
        jobs.append(job)
    ret["jobs"] = jobs
    return json.dumps(ret)
def start_sess():
    dryscrape.start_xvfb()
    sess = dryscrape.Session()
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"
    )
    sess.set_attribute('auto_load_images', 0)
    print('Visiting skytorrents')
    sess.visit('https://www.skytorrents.lol/')
    print('Waiting out the Cloudflare protection...')
    # count down 10 seconds while Cloudflare's JavaScript challenge runs
    for a in range(10, 0, -1):
        if a < 10:
            b = '0' + str(a)
        else:
            b = str(a)
        stdout.write("\r" + str(b))
        stdout.flush()
        sleep(1)
    stdout.write("\033[K")  # clear line
    stdout.flush()
    print()
    print('The inline Telegram search is now ready to use')
    return (sess)
def postear_en_twitter(mensaje):
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()
    sess = dryscrape.Session(base_url='https://www.twitter.com')
    sess.set_header("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")
    # True = load images
    # False = skip images
    sess.set_attribute('auto_load_images', False)
    email = 'Your Twitter email here'  # must stay quoted
    password = '******'                # must stay quoted
    try:
        sess.visit('/')
        q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')
        q.set(email)
        q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')
        q.set(password)
        q.form().submit()
        q = sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
        q.click()
        q = sess.at_xpath('/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea')
        q.set(mensaje)
        q = sess.at_xpath('//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')
        q.click()
        sleep(1)
        # sess.render('twitter.png')
    except Exception as e:
        print(e)
def __init__(self):
    self.cat_path = '/courses'
    dryscrape.start_xvfb()
    self.session = dryscrape.Session(base_url=ECOMMERCE_URL)
    # No need to load images.
    self.session.set_attribute('auto_load_images', False)
def get_courses(searchPattern, school, prof_dict):
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    url = "https://www.coursicle.com/" + school + "/#search=" + searchPattern
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(url)
    response = session.body()
    soup = BeautifulSoup(response, 'lxml')
    table = soup.find('div', attrs={'id': 'show_results'})

    codes = []
    names = []
    instructors = []
    days = []
    times = []
    is_lecture = []

    for courseNumber in table.findAll('div', attrs={'class': 'courseNumber'}):
        codes.append(courseNumber.text)
        cNum = courseNumber.text[-3:]
        if (int(cNum) < 600):
            is_lecture.append(True)
        else:
            is_lecture.append(False)
    for name in table.findAll('div', attrs={'class': 'courseName'}):
        #print(name.text)
        names.append(name.text)
    for instructor in table.findAll('div', attrs={'class': 'instructor'}):
        #print(instructor.text)
        instructors.append(instructor.text)
    for day in table.findAll('div', attrs={'class': 'days'}):
        #print(day.text)
        days.append(day.text)
    for time in table.findAll('div', attrs={'class': 'time'}):
        #print(time.text)
        times.append(time.text)

    output = []
    for i in range(len(instructors)):
        if (is_lecture[i]):
            rating = get_rating(instructors[i], prof_dict)
            cBlock = course_block(codes[i], names[i], days[i], times[i], is_lecture[i], instructors[i], rating)
            output.append(cBlock)
    return output
def request_body(url):
    global session
    if session is None:
        dryscrape.start_xvfb()
        session = dryscrape.Session(base_url="http://www.ffvoile.fr/ffv/")
        session.set_attribute("auto_load_images", False)
    response = session.visit(url)
    body = PyQuery(session.body())
    return body
def start():
    import sys
    try:
        import dryscrape
    except:
        raise Exception('dryscrape not found')
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()
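# Hypothetical usage sketch (not part of the original snippet; the URL is
# illustrative only): start() only prepares xvfb, so a caller still imports
# dryscrape itself before opening a session.
def example_fetch():
    start()
    import dryscrape
    sess = dryscrape.Session()
    sess.visit('http://example.com/')
    return sess.body()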
def get_url(url):
    try:
        if 'linux' in sys.platform:
            # start xvfb in case no X is running. Make sure xvfb
            # is installed, otherwise this won't work!
            dryscrape.start_xvfb()
        session = dryscrape.Session()
        session.visit(url)
        return session.body()
    except Exception as e:
        logging.exception("No get %s" % url)
def __init__(self, settings):
    super().__init__(settings)

    # start xvfb to support headless scraping
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    self.dryscrape_session = dryscrape.Session(base_url='http://zipru.to')
    for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
        # seems to be a bug with how webkit-server handles accept-encoding
        if key.lower() != 'accept-encoding':
            self.dryscrape_session.set_header(key, value)
def checkCourse(course_name, course_id, course_sec):
    dryscrape.start_xvfb()
    sess = dryscrape.Session()
    sess.set_attribute('auto_load_images', False)
    sess.visit("https://courses.students.ubc.ca/cs/main?sessyr=2015&sesscd=W")
    sess.at_xpath('//*[@id="ubc7-unit-navigation"]/ul/li[1]/div/a').click()
    sess.at_xpath('//*[@id="ubc7-unit-navigation"]/ul/li[1]/div/ul/li[2]/a').click()
    sess.visit("https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=5&dept=%s&course=%d&section=%s" % (course_name, course_id, course_sec))
    pagehtml = sess.source()
    regex = 'The requested section is either no longer offered at UBC Vancouver or is not being offered this session.'
    result = re.search(regex, pagehtml)
    return result
def main():
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    try:
        opts, args = getopt.getopt(sys.argv[1:], "s:")
    except getopt.GetoptError as err:
        print str(err)
        print "probe_tez_ui.py -s UI endpoint"
        sys.exit(2)

    ui_server = None
    for o, a in opts:
        if o == "-s":
            ui_server = a
    if ui_server == None:
        assert False, "probe_tez_ui.py -s UI endpoint"

    suffix = "/tez-ui/#/?rowCount=100"
    url = "http://{0}/{1}".format(ui_server, suffix)
    print "Probing", url
    sess = dryscrape.Session(base_url=url)
    sess.visit(suffix)
    time.sleep(10.0)

    # Extract DAG links.
    targets = []
    for link in sess.xpath('//a[@href]'):
        link = link['href']
        if link.startswith("#/dag/"):
            targets.append(link)

    for link in targets:
        id = link.split('/')[-1]
        print(link)
        sess.visit(link)
        cumulative = 0
        while cumulative <= 20:
            sleep_time = 0.25
            cumulative += sleep_time
            print "Sleeping ({0})".format(cumulative)
            time.sleep(sleep_time)
            new_file_text = "{0}_{1:04d}s.html".format(id, int(cumulative * 100))
            text = sess.body().encode('utf-8')
            fd = open(new_file_text, "w")
            fd.write(text)
            fd.close()
            if text.find("Loading") == -1:
                break
def login(self, credentials=None):
    """
    Log the given credentials in and authenticate this service with cookies.

    @param credentials : Credentials used to log the user into a Microsoft
        Account. By default, this will use the last used credentials (which
        may be the credentials used to start the service). This should
        contain both username and password keys.

    @throws InvalidCredentialsException : Thrown if the given credentials
        don't authenticate the session.
    """
    # load up the session and log the user in
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit('https://login.live.com/login.srf')

    if (not credentials):
        # if no new credentials were provided
        credentials = self.credentials
    else:
        self.credentials = credentials

    # enter the user's credentials into the login page
    session.at_css('#i0116').set(credentials['username'])
    session.at_css('#i0118').set(credentials['password'])
    session.at_css('#idSIButton9').click()

    # stupid trick to force the page to finish loading
    try:
        session.at_css('#FMht')
    except Exception:
        pass

    # make sure the login was successful
    auth_cookies = {}
    # grab the authentication cookies from the session and save them for later
    for cookie in session.cookies():
        cookie_info = cookie.split(';')[0].split('=')
        if ('AMC' in cookie_info[0]):
            auth_cookies[cookie_info[0]] = cookie_info[1]
    if (not 'AMCSecAuth' in auth_cookies):
        # login was not successful
        raise InvalidCredentialsException(credentials['username'])

    self.auth_cookies = auth_cookies
    self.credentials = credentials
def __init__(self, metadata, output_dir=None, strict_validation=None, fastmode=False):
    """
    Create a new Scraper instance.

    :param metadata: metadata for this scraper
    :param output_dir: the data directory to use
    :param strict_validation: exit immediately if validation fails
    """
    super(Scraper, self).__init__()

    # scrapelib overrides
    self.timeout = settings.SCRAPELIB_TIMEOUT
    self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
    self.requests_per_minute = settings.SCRAPELIB_RPM
    self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
    self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

    if fastmode:
        self.requests_per_minute = 0
        self.cache_write_only = False

    # if scraper uses dryscrape, set up session
    if settings.USES_DRYSCRAPE:
        dryscrape.start_xvfb()
        self.session = dryscrape.Session()

    self.metadata = metadata
    self.output_dir = output_dir
    self.output_names = set()

    # make output_dir
    os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

    # validation
    self.strict_validation = strict_validation
    self.validator = DatetimeValidator()
    self._schema = {}
    self._load_schemas()

    # logging convenience methods
    self.logger = logging.getLogger("billy")
    self.log = self.logger.info
    self.info = self.logger.info
    self.debug = self.logger.debug
    self.warning = self.logger.warning
    self.error = self.logger.error
    self.critical = self.logger.critical
def __init__(self, settings):
    """
    Initialize a dryscrape session.

    The session acts like a browser tab: it does everything a normal browser
    would do (fetch external resources, run scripts). We can navigate to new
    URLs in that tab, click buttons, enter text, and perform all sorts of
    other actions.

    Note: Scrapy handles requests and item processing concurrently, but
    response processing is single-threaded, so we can use this single
    dryscrape session without worrying about thread safety.

    :param settings:
    """
    super().__init__(settings)

    # start xvfb to support headless scraping
    # build a headless webkit instance with dryscrape
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    self.dryscrape_session = dryscrape.Session(base_url='http://demo.com')
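# A minimal, self-contained sketch (an assumption, not code from the original
# project) of how such a session is typically used from a Scrapy downloader
# middleware's process_request: render the page in headless webkit and hand
# Scrapy back an HtmlResponse built from the rendered body. The class name is
# hypothetical.
import sys
import dryscrape
from scrapy.http import HtmlResponse

class DryscrapeRenderMiddleware:
    def __init__(self):
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()
        self.dryscrape_session = dryscrape.Session(base_url='http://demo.com')

    def process_request(self, request, spider):
        # let webkit fetch and execute the page instead of Scrapy's downloader
        self.dryscrape_session.visit(request.url)
        body = self.dryscrape_session.body()
        return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)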
def pollrssi(url, outfn, interval):
    outfn = Path(outfn).expanduser()
    dryscrape.start_xvfb()
    sess = setuphtml(url)
    # write the CSV header once, then append one line per poll
    outfn.write_text('time,status,rssi,sinr,bars\n')
    while True:
        html = sess.body()
        status, rssi, sinr, bars = parsehtml(html)
        line = '{},{},{},{},{}\n'.format(datetime.utcnow().strftime('%xT%X'), status, rssi, sinr, bars)
        with open(str(outfn), 'a') as f:
            f.write(line)
        sleep(interval)
def save_exercise(isbn, url, chapter, section, exercise, solutions):
    import dryscrape
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    chapter = chapter.strip().lower().replace(' ', '_')
    exercise = exercise.strip().replace('.', '')
    isbn = str(isbn)

    if not os.path.exists(SLADER_DIR + isbn):
        os.makedirs(SLADER_DIR + isbn)
    if not os.path.exists(SLADER_DIR + isbn + "/" + chapter):
        os.makedirs(SLADER_DIR + isbn + "/" + chapter)

    # /home/user/.slader/34564512443/chapter_6/6.1.2.png
    filename = "%s%s/%s/%s.%s.png" % (SLADER_DIR, isbn, chapter, section.strip(), exercise)

    js = open('simplify.js', 'r').read()
    try:
        session = dryscrape.Session()
        session.set_timeout(DRYSCRAPE_TIMEOUT)
        session.visit(url)
        session.eval_script(js)
        js = open('hide_all.js', 'r').read()
        session.eval_script(js)
        # toggle each solution on. take a screenshot. toggle off. repeat.
        for i in range(solutions):
            js = "document.getElementsByClassName(\"solution user-content\")[%d].style.visibility=\"visible\"" % i
            session.eval_script(js)
            session.render(filename.replace(".png", ".s%d.png" % i))
            js = "document.getElementsByClassName(\"solution user-content\")[%d].style.visibility=\"hidden\"" % i
            session.eval_script(js)
        time.sleep(1)
        return True
    except socket.error:
        # "Connection refused. Exit to reset webkit."
        exit(1)
    except:
        return False
def get_js_session(url, wait=5, viewport=(1024, 768), render_fn=None):
    import dryscrape
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()
    sess = dryscrape.Session()
    sess.set_viewport_size(width=viewport[0], height=viewport[1])
    sess.visit(url)
    sleep(wait)
    if render_fn:
        if not render_fn.endswith('.png'):
            render_fn += '.png'
        sess.render(render_fn)
    return sess
def verify_signature(url, sign):
    """
    Check for the presence of a specific signature in the source code of your page.

    :param url:
    :return:
    """
    dryscrape.start_xvfb()
    sess = dryscrape.Session()
    # We don't need the images
    sess.set_attribute('auto_load_images', False)
    # visit the site
    sess.visit(url)
    corp = sess.body()
    if sign in corp:
        print('\n[Ok] * Signature found in the source code of the web page\n')
    else:
        print('\n[Warning] * No signature found in the source code of your web page')
        print()
def get_username_dynamic_cookie(self):
    try:
        url = "https://weibo.cn/{}/info".format(self.user_id)
        user_agent = ('Mozilla/5.0 (X11; Linux x86_64; rv:12.0) '
                      'Gecko/20100101 Firefox/12.0')
        default_request_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,*/*;q=0.8',
            'User-Agent': user_agent,
            'Connection': 'Keep-Alive',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,*',
        }

        # start xvfb to support headless scraping
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()

        # create the session before setting headers on it
        self.dryscrape_session = dryscrape.Session(base_url=url)
        for key, value in default_request_headers.items():
            # seems to be a bug with how webkit-server handles
            # accept-encoding
            if key.lower() != 'accept-encoding':
                self.dryscrape_session.set_header(key, value)

        cookies = {}
        for cookie_string in self.dryscrape_session.cookies():
            if 'domain=zipru.to' in cookie_string:
                key, value = cookie_string.split(';')[0].split('=')
                cookies[key] = value
        self.cookie = cookies

        html = requests.get(url, cookies=self.cookie, headers=default_request_headers).content
        selector = etree.HTML(html)
        username = selector.xpath("//title/text()")[0]
        self.username = username[:-3]
        print("User name: " + self.username)
    except Exception:
        logging.exception("message")
def main():
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    # sPage = requests.get(startUrl)
    # sHtml = sPage.text
    # sPage.raise_for_status()
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit('/sector/industrial-machinery-equipment-companies')
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()
    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked

    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    # writeData(sdf, 'companies')
    writeData(sdf, 'runcompanies')  # Mocked

    n = resultsInfo.nResults()
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))

    for v in range(2, int(n / 15)):
        nextone = '/sector/industrial-machinery-equipment-companies?p=%s' % (v)
        print(nextone)
        # page = requests.get(nextpage)
        # page.raise_for_status()
        # html = page.text
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        # info = Extractor(sHtml)  # Mocked
        df = info.getData()
        # writeData(df, 'companies')
        writeData(df, 'runcompanies')  # Mocked
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))
def get_profs():
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    prof_dict = {}

    # Read the HTML File
    url = "https://www.ratemyprofessors.com"
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(url)
    url = "https://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=University+of+North+Carolina+at+Chapel+Hill&schoolID=1232&queryoption=TEACHER"
    session.visit(url)

    # <div class="content" onclick="javascript:mtvn.btg.Controller.sendLinkEvent({ linkName:'PROFMIDPANE:LoadMore', linkType:'o' } );">Load More</div>
    load_more = session.at_xpath("//*[@id='mainContent']/div[1]/div/div[5]/div/div[1]")
    for i in range(292):
        load_more.click()

    page = session.body()
    parsed_html = BeautifulSoup(page, 'lxml')
    #print(parsed_html)
    list_of_profs = parsed_html.find('div', attrs={'id': 'body'})
    list_of_profs = list_of_profs.find('div', attrs={'id': 'mainContent'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'left-panel'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'side-panel'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'result-list'})
    print(list_of_profs)
    list_of_profs = list_of_profs.find_all('li', attrs={'id': re.compile('my-professor*')})

    for x in list_of_profs:
        name = x.find('span', attrs={'class': 'name'}).text
        name = name.split(" ")
        name[1] = re.sub('\s+', '', name[1])
        name = name[0] + " " + name[1][:len(name[1]) - 2]
        name = re.sub(r'\d+', '', name)
        rating = x.find('span', attrs={'class': 'rating'}).text
        prof_dict[str(name)] = float(str(rating))
    return prof_dict
def get_body_response(self, url, is_count_items_exists=False):
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()
    self.session = dryscrape.Session()
    self.session.set_attribute('auto_load_images', False)
    self.session.set_header('User-agent', 'Google Chrome')
    self.session.visit(url)
    if is_count_items_exists:
        value = self.session.at_xpath('//span[@class="ProfileNav-value"]')
        self.count_items = int(value.get_attr('data-count')) if value else None
    for i in range(math.ceil(self.limit / PAGE_SIZE)):
        if self._is_last_tweet():
            break
        self._load_more_results()
    return self.session.body()
class PageFetcher(object):
    logger = logging.getLogger(__name__)

    dryscrape.start_xvfb()

    def _can_fetch(self, url):
        # robots.txt checking is currently short-circuited; the code below is kept for reference
        return True
        robots_file = self._get_robots_file_url(url)
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_file)
        rp.read()
        return rp.can_fetch(USER_AGENT, url)

    def _get_robots_file_url(self, url):
        hostname = urlparse.urlparse(url).hostname
        robots_file = "http://{hostname}/robots.txt".format(hostname=hostname)
        self.logger.debug("Robots file: {r}".format(r=robots_file))
        return robots_file

    def _get_content(self, response):
        if response.info().get('Content-Encoding') == 'gzip':
            self.logger.debug("Decompressing gzip content")
            buf = StringIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            return f.read()
        return response.read()

    def fetch_page(self, url):
        corrected_url = url
        if not url.startswith("http://") and not url.startswith("https://"):
            corrected_url = "http://{u}".format(u=url)
        self.logger.debug("Fetching page: {u}".format(u=corrected_url))

        cache = get_page_cache()
        cached_content = cache.get_cached_content(corrected_url)
        if cached_content:
            self.logger.debug("Page served from cache")
            return cached_content

        if not self._can_fetch(corrected_url):
            self.logger.warn("Unable to fetch, disallowed by robots.txt")
            raise FetchException("Disallowed by robots.txt")

        try:
            parsed_url = urlparse.urlparse(url)
            base_url = parsed_url.scheme + "://" + parsed_url.hostname
            path = parsed_url.path
            sess = dryscrape.Session(base_url=base_url)
            sess.set_attribute('auto_load_images', False)
            sess.visit(path)
            content = sess.body()
            cache.save_content(corrected_url, content)
            return content
        except Exception as e:
            raise FetchException("Failed to load the page", e)
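# Hypothetical usage sketch (the URL is illustrative, and the page-cache and
# FetchException helpers are assumed to exist elsewhere in this module, so
# this is a sketch rather than a runnable snippet):
#
#     fetcher = PageFetcher()
#     try:
#         html = fetcher.fetch_page("example.com/some/page")
#     except FetchException as err:
#         print("fetch failed:", err)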
def get_website_dryscrape():
    try:
        dryscrape.start_xvfb()
        session = dryscrape.Session()
        sleep(3)
        session.visit(url)
        sleep(3)
        response = session.body()
        soup = BeautifulSoup(response, "html.parser").find("div", {"id": "news-wrapper"}).find("ul")
        liste = []
        for i in soup.findAll("li"):
            exam = str(i).replace("<li>", "").replace("</li>", "").split("(")[0]
            if len(exam) > 10:
                if not "\\" in exam:
                    liste.append(exam)
        return liste
    except:
        return []
def parse(self, response):
    # start session
    os.system("echo 'Starting xvfb instance...'")
    dryscrape.start_xvfb()
    session = dryscrape.Session()  # start session
    self.session_properties(session)

    os.system("echo 'Crawling...'")
    # visit url
    session.visit(self.start_urls[0])  # visit website
    response = session.body()
    os.system("echo 'Done crawling.'")

    # pkill xvfb
    os.system("echo 'Closing xvfb instance...'")
    os.system("sudo pkill Xvfb")

    # scraper objects
    self.scraper = PollenScraper()
    self.soup = BeautifulSoup(response, 'lxml')

    # extract
    if (self.date != ""):
        self.extract_date(self.date)
        # yield
        yield self.scraper
    else:
        for date in range(3):
            self.extract_date(date)
            # yield
            yield self.scraper
def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()
    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name == 'p')
    outlist = open("imsdb_urls_imdb.csv", "w")
    for movie in movies:
        #<p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name == 'a').text
        if (movie_title.endswith(", The")):
            movie_title = "The " + movie_title.replace(", The", "")
        print movie_title
        movie_url = "http://www.imsdb.com" + urllib.quote(movie.find(lambda tag: tag.name == 'a').get("href"))
        movie_writer = movie.find(lambda tag: tag.name == 'i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))
        dir_scripts = "Imsdb_scripts/"
        ensure_dir(dir_scripts)
        session.visit(movie_url)
        response = session.body()
        imsdb_url = ""  # default in case no script link is found on the page
        m = re.search(r'<a href="?\'?(/scripts/[^"\'>]*)', response)
        if m:
            filename = m.group(1)
            if filename.endswith(".html"):
                outscript = open(dir_scripts + movie_title.replace(" ", "_") + ".txt", "w")
                imsdb_url = "http://www.imsdb.com" + m.group(1)
                session.visit(imsdb_url)
                response = session.body()
                pre = re.search(r'<pre>(.*?)</pre>', response, re.DOTALL)
                if pre:
                    script = pre.group(1)
                    #print re.search(r'<script>(.*?)<pre>', script, re.DOTALL)
                    rem = re.compile(r'<script>(.*?)<pre>', re.DOTALL)
                    rem2 = re.compile(r'<title>(.*?)<pre>', re.DOTALL)
                    rem3 = re.compile(r'<b>\s+\.*\d*\.*\s*</b>', re.MULTILINE)
                    script = re.sub(rem, '', script)
                    script = re.sub(rem2, '', script)
                    script = re.sub(rem3, '\n', script)
                    outscript.write(script.encode('utf8'))
                outscript.close()
        imdb_url = findImdbUrl(movie_title, movie_writer_list)
        if (not imdb_url):
            imdb_url = ""
        outlist.write(movie_title + "," + movie_writer + "," + imsdb_url + "," + imdb_url + "," + imsdb_url + "\n")
        time.sleep(60)
    outlist.close()
def postear_en_twitter(mensaje):
    #===================================================
    # PART 1: SETUP
    #===================================================

    #----------------------------------------------------
    # When running on Linux we must call start_xvfb()
    #----------------------------------------------------
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    #----------------------------------------------------
    # Tell dryscrape which page we want to visit
    #----------------------------------------------------
    sess = dryscrape.Session(base_url='https://www.twitter.com')

    #----------------------------------------------------
    # Add a header claiming we are using a regular
    # browser; otherwise Twitter sees a bot and will
    # probably refuse access
    #----------------------------------------------------
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")

    #----------------------------------------------------
    # Choose whether images should be downloaded while
    # scraping the content.
    #
    # This bot does not need them, because all it does
    # is post a message.
    #----------------------------------------------------
    # True = load images
    # False = skip images
    sess.set_attribute('auto_load_images', False)

    #----------------------------------------------------
    # Enter the credentials used to log into your
    # Twitter account
    #----------------------------------------------------
    email = 'Your Twitter email here'  # keep the quotes
    password = '******'                # keep the quotes

    try:
        #===================================================
        # PART 2: LOGIN
        #===================================================

        #----------------------------------------------------
        # Visit https://www.twitter.com
        #----------------------------------------------------
        sess.visit('/')

        #----------------------------------------------------
        # Locate the username text box
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')

        #----------------------------------------------------
        # Type the email defined above
        #----------------------------------------------------
        q.set(email)

        #----------------------------------------------------
        # Locate the password text box
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')

        #----------------------------------------------------
        # Type the password defined above
        #----------------------------------------------------
        q.set(password)

        #----------------------------------------------------
        # Click the "Log in" button (submit the form)
        #----------------------------------------------------
        q.form().submit()

        #===================================================
        # PART 3: WRITE THE MESSAGE
        #===================================================

        #----------------------------------------------------
        # Click the text box used to compose the tweet
        #----------------------------------------------------
        q = sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
        q.click()

        #----------------------------------------------------
        # Type the message passed to this function
        #----------------------------------------------------
        q = sess.at_xpath(
            '/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea'
        )
        q.set(mensaje)

        #----------------------------------------------------
        # Click the "Tweet" button
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')
        q.click()

        #----------------------------------------------------
        # Pause for one second before leaving the function
        #----------------------------------------------------
        sleep(1)
        # sess.render('twitter.png')
    except Exception as e:
        print(e)
def extract_acts(site):
    """
    @brief: Extract acts content to csv file
    @param site: First site in search result
    @note: Requires webkit_server (Unix only)
    """
    # Start webkit server and session
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()
    sess = dryscrape.Session()
    # Load site
    sess.visit(site)

    csv_exists = os.path.exists(settings.CSV_PATH)
    with open(settings.CSV_PATH, 'a', encoding='utf8') as output_file:
        csv_writer = csv.writer(output_file, delimiter=';', quoting=csv.QUOTE_ALL)
        if not csv_exists:
            csv_writer.writerow(settings.HEADERS)

        i = 1
        prev_progress = ''
        # Iterate while "Next" button is enabled
        while True:
            try:
                progress = sess.wait_for(lambda: sess.at_css('.text.result-counter')).text()
                if prev_progress == progress:
                    continue

                row = [i]
                elements = sess.wait_for(lambda: sess.css('.docValue'))
                # Iterate through all fields except the last one (34 - 1 = 33)
                for element in elements[:33]:
                    text = element.text().strip()
                    row.append(text if text != 'Не заполнено' else '')

                frame = sess.eval_script("$('.field-iframe').contents().find('body').html();")
                try:
                    tree = html.fromstring(frame)
                    elements = tree.xpath('//p')
                except Exception:  # pylint: disable=broad-except
                    elements = []
                row.append(find_name(elements))

                csv_writer.writerow(row)
                prev_progress = progress
                print(i)
                i += 1

                button = sess.wait_for(lambda: sess.at_css('.card-paginator .to-right-red'))
                if 'yui-button-disabled' in button['class']:
                    break
                button.children()[0].click()
            except dryscrape.mixins.WaitTimeoutError:
                sess.exec_script('location.reload();')
            except Exception:  # pylint: disable=broad-except
                continue
def initialise(self):
    logging.info("URL:%s", self.url)
    #session = dryscrape.Session(base_url = self.url)
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb is installed, otherwise this won't work!
        dryscrape.start_xvfb()
import mysql.connector
import re
import sys
import HTMLParser
from time import sleep
import dryscrape
from bs4 import BeautifulSoup

try:
    read = str(sys.argv[1])
    #read = str(input("enter the isbn number:"))
    #reader=["9350293471","9388369157","9385724060","9386797186","9386228343","9381626685","9385724061"]
    Titledb = Authordb = Pagesdb = Publisherdb = Languagedb = ISBN2db = Detailsdb = Subjectdb = None
    #for read in reader:
    ISBN1db = read
    #print("Books isbn-10:",ISBN1db)
    url = "https://www.amazon.in/dp/" + read
    print(url)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    sleep(2)
    response = session.body()
    soup = BeautifulSoup(response, "lxml")
    try:
        extract_title = soup.find('span', {'id': 'productTitle'})
        Title = extract_title.get_text()
        if Title:
            print("Book Title:", Title)
            Titledb = Title
        else:
            pass
    except:
        try:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import dryscrape

# make sure you have xvfb installed
dryscrape.start_xvfb()

root_url = 'YOUR_BASE_URL'

if __name__ == '__main__':
    # set up a web scraping session
    sess = dryscrape.Session(base_url=root_url)

    # we don't need images
    sess.set_attribute('auto_load_images', False)

    # visit webpage
    sess.visit('YOUR_RELATIVE_PATH_TO_BASE_URL')

    # search for iframe with id="mainframe"
    frame = sess.at_xpath('//*[@id="mainframe"]')

    # get the URL of the iframe
    frameURL = root_url + frame['src']

    # visit the URL of the iframe
    sess2 = dryscrape.Session()
    sess2.visit(frameURL)

    # fill in the form in the iframe
    name = sess2.at_xpath('//*[@id="username"]')
    name.set("John")
def __init__(self):
    self.url = "http://www.flashscore.com/"
    self.timeout = 20
    self.useragent = "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
    dryscrape.start_xvfb()
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import dryscrape
from bs4 import BeautifulSoup
import re
import sys
import json
import time
import xlsxwriter
import pdb

from kununu import kununu

workbook = xlsxwriter.Workbook('stuttgart.xlsx')  # Create Excel file
ws = workbook.add_worksheet()

dryscrape.start_xvfb()  # Start dryscrape session
session = dryscrape.Session()
session.visit(
    "https://www.dhbw-stuttgart.de/themen/internationales/internationale-studiengaenge/informatik/duale-partner/?tx_cronbafirmen_pi%5Boffset%5D=0&cHash=99f439f6a246d843d3a32e86bb8b32ca"
)  # Visit DHBW site
response = session.body()
soup = BeautifulSoup(response, "lxml")


def has_colspan(tag):
    return tag.has_attr('colspan')


ws.set_row(0, 24)
ws.set_column(0, 0, 35)
ws.set_column(1, 1, 20)
ws.set_column(2, 2, 40)
ws.set_column(3, 3, 40)
ws.set_column(4, 4, 22)