Example #1
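# Note: dryscrape, BeautifulSoup, and the module-level siteList list this function appends to are assumed to be imported/defined elsewhere in the module.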
def createSession(crawlingWebsite):
    print "\nCrawler Initiated. Searching in '" + crawlingWebsite + "' for domains.\n\n"
    dryscrape.start_xvfb()
    # Begin a new session with loaded scripts
    try:
        session = dryscrape.Session()
        session.visit(crawlingWebsite)
        response = session.body()
        # Reset the session to avoid memory issues
        session.reset()
    except InvalidResponseError as e:
        print "Cannot open " + crawlingWebsite + "\n"
        print 'InvalidResponseError:', e
        quit()
    soup = BeautifulSoup(response, "html.parser")
    # Searches for hyperlinks in the page. This is the hardcoded bit:
    # the tags to look for are webpage-specific. For a different website, inspect its HTML
    # to find out which tags are needed to obtain a list of domains, if any.
    tableFound = soup.findAll("a", {"target": "_blank"})

    if len(tableFound) == 0:
        print "Nothing found. Terminating crawler."
        quit()
    else:
        for row in tableFound:
            # Add found domains to the list of sites
            siteList.append(row.get('href'))
Example #2
def findImdbUrl(movie_title, movie_writer):

    dryscrape.start_xvfb()
    session = dryscrape.Session()

    link = "http://www.imdb.com/find?q=" + urllib.quote(movie_title) + "&s=all"
    session.visit(link)
    response = session.body()

    soup = BeautifulSoup(response)
    div = soup.find(lambda tag: tag.name == 'div' and tag.has_key('class') and
                    tag['class'] == 'findSection')
    if (div):
        div_content = "".join([unicode(x) for x in div.contents])

        title_search = re.compile('/title/tt\d+')
        search_results = re.findall(title_search, div_content)

        for movie_url in search_results:
            try:
                names = extractPeopleBehind("http://www.imdb.com" + movie_url +
                                            "/")
                if not set(movie_writer).isdisjoint(names):
                    return "http://www.imdb.com" + movie_url + "/"

                #soup_search = BeautifulSoup(resp_search)
                #people_behind = soup_search.findall(lambda tag: tag.name=='div' and tag.has_key('class') and tag['class']=='credit_summary_item')
                #for people in people_behind: print people.text
            except:
                pass

    return None
Example #3
def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    
    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()
    
    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name=='p')

    links = {}
    writers = {}
    
    for movie in movies:
        #<p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name=='a').text
        if (movie_title.endswith(", The")): movie_title = "The " + movie_title.replace(", The", "")

        movie_url = "http://www.imsdb.com" + urllib.quote(movie.find(lambda tag: tag.name=='a').get("href"))
        
        movie_writer = movie.find(lambda tag: tag.name=='i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))

        #print movie_title, movie_url, movie_writer_list
        links[movie_title] = movie_url
        writers[movie_title] = movie_writer_list
        
    return (links, writers)
Example #4
def get_jobs(url):
    ret = {}
    jobs = []
    rake_object = rake.Rake(
        "/root/freshack/Jobscraper/freshdeskhack/SmartStoplist.txt", 3, 2, 1)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    html_page = session.body()
    soup = BeautifulSoup(html_page, 'lxml')
    master_tag = soup.find_all("div", class_="fd-posdesc")

    for tag in master_tag:
        job = {}
        job["title"] = tag.h3.string
        div_list = tag.find_all("div")
        job_desc = ""
        for childdiv in div_list:
            text = childdiv.string
            if text:
                job_desc = job_desc + text

        keywords = rake_object.run(job_desc)
        words = []
        for word in keywords:
            if "year" not in word[0]:
                words.append(word[0])
            else:
                job["experience"] = word[0]
        job["keywords"] = words
        jobs.append(job)
    ret["jobs"] = jobs
    return json.dumps(ret)
Example #5
def start_sess():
    dryscrape.start_xvfb()
    sess = dryscrape.Session()
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0"
    )
    sess.set_attribute('auto_load_images', 0)
    print('Visiting skytorrents')
    sess.visit('https://www.skytorrents.lol/')
    print('Waiting out the Cloudflare protection...')
    for a in range(10, 0, -1):
        if a < 10:
            b = '0' + str(a)
        else:
            b = str(a)

        stdout.write("\r" + str(b))
        stdout.flush()
        sleep(1)
    stdout.write("\033[K")  #clear line
    stdout.flush()
    print()
    print('You can now use the inline search in Telegram')
    return (sess)
Example #6
def postear_en_twitter(mensaje):

	if 'linux' in sys.platform:
		dryscrape.start_xvfb()
    
	sess = dryscrape.Session(base_url = 'https://www.twitter.com')
	sess.set_header("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")
	# True = load images
	# False = do not load images
	sess.set_attribute('auto_load_images', False)

	email='Enter your twitter email here'         # Must be kept in quotes
	password='******'  # Must be kept in quotes

	try:		
		sess.visit('/')
		q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')
		q.set(email)
		q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')
		q.set(password)
		q.form().submit()	

		q=sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
		q.click()
		q=sess.at_xpath('/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea')
		q.set(mensaje)		
		q = sess.at_xpath('//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')		
		q.click()
		sleep(1)
		# sess.render('twitter.png')
	except Exception as e:
		print (e)			
Example #7
    def __init__(self):
        self.cat_path = '/courses'

        dryscrape.start_xvfb()
        self.session = dryscrape.Session(base_url=ECOMMERCE_URL)

        # No need to load images.
        self.session.set_attribute('auto_load_images', False)
Example #8
def get_courses(searchPattern, school, prof_dict):

    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    url = "https://www.coursicle.com/" + school + "/#search=" + searchPattern

    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(url)

    response = session.body()

    soup = BeautifulSoup(response, 'lxml')
    table = soup.find('div', attrs={'id': 'show_results'})

    codes = []
    names = []
    instructors = []
    days = []
    times = []
    is_lecture = []

    for courseNumber in table.findAll('div', attrs={'class': 'courseNumber'}):
        codes.append(courseNumber.text)
        cNum = courseNumber.text[-3:]
        if(int(cNum) < 600):
            is_lecture.append(True)
        else:
            is_lecture.append(False)

    for name in table.findAll('div', attrs={'class': 'courseName'}):
        #print(name.text)
        names.append(name.text)

    for instructor in table.findAll('div', attrs={'class': 'instructor'}):
        #print(instructor.text)
        instructors.append(instructor.text)

    for day in table.findAll('div', attrs={'class': 'days'}):
        #print(day.text)
        days.append(day.text)

    for time in table.findAll('div', attrs={'class': 'time'}):
        #print(time.text)
        times.append(time.text)

    output = []

    for i in range(len(instructors)):
        if (is_lecture[i]):
            rating = get_rating(instructors[i], prof_dict)
            cBlock = course_block(codes[i], names[i], days[i], times[i], is_lecture[i], instructors[i], rating)
            output.append(cBlock)

    return output
Example #9
def request_body(url):
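    # Lazily creates a single module-level dryscrape session on first call; session is assumed to start out as None at module scope.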
    global session
    if session is None:
        dryscrape.start_xvfb()
        session = dryscrape.Session(base_url="http://www.ffvoile.fr/ffv/")
        session.set_attribute("auto_load_images",False)

    response = session.visit(url)
    body = PyQuery(session.body())
    return body
Example #10
def request_body(url):
    global session
    if session is None:
        dryscrape.start_xvfb()
        session = dryscrape.Session(base_url="http://www.ffvoile.fr/ffv/")
        session.set_attribute("auto_load_images", False)

    response = session.visit(url)
    body = PyQuery(session.body())
    return body
Example #11
def start():
	import sys

	try:
		import dryscrape
	except:
		raise Exception('dryscrape not found')

	if 'linux' in sys.platform:
		# start xvfb in case no X is running. Make sure xvfb 
		# is installed, otherwise this won't work!
		dryscrape.start_xvfb()	
Example #12
def get_url(url):
    try:
        if 'linux' in sys.platform:
            # start xvfb in case no X is running. Make sure xvfb
            # is installed, otherwise this won't work!
            dryscrape.start_xvfb()

        session = dryscrape.Session()
        session.visit(url)
        return session.body()
    except Exception as e:
        logging.exception("Could not get %s" % url)
Example #13
    def __init__(self, settings):
        super().__init__(settings)

        # start xvfb to support headless scraping
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()

        self.dryscrape_session = dryscrape.Session(base_url='http://zipru.to')
        for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
            # seems to be a bug with how webkit-server handles accept-encoding
            if key.lower() != 'accept-encoding':
                self.dryscrape_session.set_header(key, value)
Example #14
def checkCourse(course_name, course_id, course_sec):
	dryscrape.start_xvfb()
	sess = dryscrape.Session()
	sess.set_attribute('auto_load_images', False)
	sess.visit("https://courses.students.ubc.ca/cs/main?sessyr=2015&sesscd=W")
	sess.at_xpath('//*[@id="ubc7-unit-navigation"]/ul/li[1]/div/a').click()
	sess.at_xpath('//*[@id="ubc7-unit-navigation"]/ul/li[1]/div/ul/li[2]/a').click()
	sess.visit("https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=5&dept=%s&course=%d&section=%s" % (course_name, course_id, course_sec))
	pagehtml = sess.source()
	regex = 'The requested section is either no longer offered at UBC Vancouver or is not being offered this session.'
	result = re.search(regex, pagehtml)
	return result
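
A hypothetical caller (not part of the original snippet): checkCourse() returns a re.Match object when the "no longer offered" banner is found on the page and None otherwise, so it can be used directly in a condition.

if checkCourse("CPSC", 210, "101"):
	print("Section is not offered this session")
else:
	print("Section appears in the course schedule")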
Example #15
def main():
	if 'linux' in sys.platform:
		# start xvfb in case no X is running. Make sure xvfb 
		# is installed, otherwise this won't work!
		dryscrape.start_xvfb()

	try:
		opts, args = getopt.getopt(sys.argv[1:], "s:")
	except getopt.GetoptError as err:
		print str(err)
		print "probe_tez_ui.py -s UI endpoint"
		sys.exit(2)

	ui_server = None
	for o, a in opts:
		if o == "-s":
			ui_server = a
	if ui_server == None:
		assert False, "probe_tez_ui.py -s UI endpoint"

	suffix = "/tez-ui/#/?rowCount=100"
	url = "http://{0}/{1}".format(ui_server, suffix)
	print "Probing", url
	sess = dryscrape.Session(base_url = url)

	sess.visit(suffix)
	time.sleep(10.0)

	# Extract DAG links.
	targets = []
	for link in sess.xpath('//a[@href]'):
		link = link['href']
		if link.startswith("#/dag/"):
			targets.append(link)

	for link in targets:
		id = link.split('/')[-1]
		print(link)
		sess.visit(link)
		cumulative = 0
		while cumulative <= 20:
			sleep_time = 0.25
			cumulative += sleep_time
			print "Sleeping ({0})".format(cumulative)
			time.sleep(sleep_time)
			new_file_text = "{0}_{1:04d}s.html".format(id, int(cumulative*100))
			text = sess.body().encode('utf-8')
			fd = open(new_file_text, "w")
			fd.write(text)
			fd.close()
			if text.find("Loading") == -1:
				break
Example #16
    def login(self, credentials=None):
        """
        Log in with the given credentials and authenticate this service with cookies.

        @params
        credentials : Credentials used to log the user into a Microsoft Account.
                      By default, this will use the last used credentials (which may
                        be the credentials used to start the service).
                      This should contain both username and password keys.

        @throws
        InvalidCredentialsException : Thrown if the given credentials don't authenticate the session.
        """

        # load up the session and log the user in
        dryscrape.start_xvfb()
        session = dryscrape.Session()

        session.set_attribute('auto_load_images', False)
        session.visit('https://login.live.com/login.srf')

        if (not credentials): # if no new credentials were provided
            credentials = self.credentials
        else:
            self.credentials = credentials

        # enter the user's credentials into the login page
        session.at_css('#i0116').set(credentials['username'])
        session.at_css('#i0118').set(credentials['password'])

        session.at_css('#idSIButton9').click()

        # stupid trick to force the page to finish loading
        try:
            session.at_css('#FMht')
        except Exception:
            pass

        # make sure the login was successful
        auth_cookies = {}

        # grab the authentication cookies from the session and save them for later
        for cookie in session.cookies():
            cookie_info = cookie.split(';')[0].split('=')
            if ('AMC' in cookie_info[0]):
                auth_cookies[cookie_info[0]] = cookie_info[1]

        if (not 'AMCSecAuth' in auth_cookies): # login was not successful
            raise InvalidCredentialsException(credentials['username'])
        
        self.auth_cookies = auth_cookies
        self.credentials = credentials
Example #17
    def login(self, credentials=None):
        """
        Log in with the given credentials and authenticate this service with cookies.

        @params
        credentials : Credentials used to log the user into a Microsoft Account.
                      By default, this will use the last used credentials (which may
                        be the credentials used to start the service).
                      This should contain both username and password keys.

        @throws
        InvalidCredentialsException : Thrown if the given credentials don't authenticate the session.
        """

        # load up the session and log the user in
        dryscrape.start_xvfb()
        session = dryscrape.Session()

        session.set_attribute('auto_load_images', False)
        session.visit('https://login.live.com/login.srf')

        if (not credentials):  # if no new credentials were provided
            credentials = self.credentials
        else:
            self.credentials = credentials

        # enter the user's credentials into the login page
        session.at_css('#i0116').set(credentials['username'])
        session.at_css('#i0118').set(credentials['password'])

        session.at_css('#idSIButton9').click()

        # stupid trick to force the page to finish loading
        try:
            session.at_css('#FMht')
        except Exception:
            pass

        # make sure the login was successful
        auth_cookies = {}

        # grab the authentication cookies from the session and save them for later
        for cookie in session.cookies():
            cookie_info = cookie.split(';')[0].split('=')
            if ('AMC' in cookie_info[0]):
                auth_cookies[cookie_info[0]] = cookie_info[1]

        if (not 'AMCSecAuth' in auth_cookies):  # login was not successful
            raise InvalidCredentialsException(credentials['username'])

        self.auth_cookies = auth_cookies
        self.credentials = credentials
Example #18
    def __init__(self,
                 metadata,
                 output_dir=None,
                 strict_validation=None,
                 fastmode=False):
        """
        Create a new Scraper instance.

        :param metadata: metadata for this scraper
        :param output_dir: the data directory to use
        :param strict_validation: exit immediately if validation fails
        """
        super(Scraper, self).__init__()

        # scrapelib overrides
        self.timeout = settings.SCRAPELIB_TIMEOUT
        self.cache_storage = scrapelib.FileCache(settings.BILLY_CACHE_DIR)
        self.requests_per_minute = settings.SCRAPELIB_RPM
        self.retry_attempts = settings.SCRAPELIB_RETRY_ATTEMPTS
        self.retry_wait_seconds = settings.SCRAPELIB_RETRY_WAIT_SECONDS

        if fastmode:
            self.requests_per_minute = 0
            self.cache_write_only = False

        # if scraper uses dryscrape, set up session
        if settings.USES_DRYSCRAPE:
            dryscrape.start_xvfb()
            self.session = dryscrape.Session()

        self.metadata = metadata
        self.output_dir = output_dir
        self.output_names = set()

        # make output_dir
        os.path.isdir(self.output_dir) or os.makedirs(self.output_dir)

        # validation
        self.strict_validation = strict_validation
        self.validator = DatetimeValidator()
        self._schema = {}
        self._load_schemas()

        # logging convenience methods
        self.logger = logging.getLogger("billy")
        self.log = self.logger.info
        self.info = self.logger.info
        self.debug = self.logger.debug
        self.warning = self.logger.warning
        self.error = self.logger.error
        self.critical = self.logger.critical
Example #19
    def __init__(self, settings):
        """
        Initialise a dryscrape session:
            The session acts like a browser tab: it will do everything a browser normally does
            (fetch external resources, run scripts). We can navigate the tab to new URLs, click
            buttons, enter text, and perform all sorts of other tasks.
        Note:
            Scrapy supports concurrent requests and item processing, but response handling is
            single-threaded, so this single dryscrape session can be used without worrying about
            thread safety.
        :param settings:
        """
        super().__init__(settings)

        # start xvfb to support headless scraping
        # build a headless webkit instance with dryscrape
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()

        self.dryscrape_session = dryscrape.Session(base_url='http://demo.com')
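
The docstring above describes reusing a single dryscrape session from Scrapy's single-threaded response handling. A minimal sketch of what that might look like in a downloader middleware follows; the class name and the "js-challenge" heuristic are assumptions for illustration, only the process_response hook and scrapy.http.HtmlResponse come from Scrapy itself.

from scrapy.http import HtmlResponse

class DryscrapeRenderSketch(object):
    """Hypothetical downloader middleware that re-renders JS-heavy pages with dryscrape."""

    def __init__(self, dryscrape_session):
        self.dryscrape_session = dryscrape_session

    def process_response(self, request, response, spider):
        # Assumed heuristic: only re-render pages that look like a JavaScript challenge.
        if b'js-challenge' not in response.body:
            return response
        # Reusing one session is safe here: Scrapy handles responses on a single thread.
        self.dryscrape_session.visit(request.url)
        body = self.dryscrape_session.body().encode('utf8')
        return HtmlResponse(request.url, body=body, request=request, encoding='utf8')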
Example #20
def pollrssi(url,outfn,interval):
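    # setuphtml() and parsehtml() are assumed to be helper functions defined elsewhere in this module.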
    outfn = Path(outfn).expanduser()

    dryscrape.start_xvfb()

    sess = setuphtml(url)

    outfn.write_text('time,status,rssi,sinr,bars\n')

    while True:
        html = sess.body()
        status,rssi,sinr,bars = parsehtml(html)
        line = '{},{},{},{},{}\n'.format(datetime.utcnow().strftime('%xT%X'),status,rssi,sinr,bars)
        with open(str(outfn),'a') as f:
            f.write(line)

        sleep(interval)
Example #21
def save_exercise(isbn, url, chapter, section, exercise, solutions):
    import dryscrape

    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    chapter = chapter.strip().lower().replace(' ', '_')
    exercise = exercise.strip().replace('.', '')
    isbn = str(isbn)

    if not os.path.exists(SLADER_DIR + isbn):
        os.makedirs(SLADER_DIR + isbn)

    if not os.path.exists(SLADER_DIR + isbn + "/" + chapter):
        os.makedirs(SLADER_DIR + isbn + "/" + chapter)

    # /home/user/.slader/34564512443/chapter_6/6.1.2/png
    filename = "%s%s/%s/%s.%s.png" % (SLADER_DIR, isbn, chapter,
                                      section.strip(), exercise)
    js = open('simplify.js', 'r').read()

    try:
        session = dryscrape.Session()
        session.set_timeout(DRYSCRAPE_TIMEOUT)
        session.visit(url)
        session.eval_script(js)

        js = open('hide_all.js', 'r').read()
        session.eval_script(js)

        # toggle each solution on. take a screenshot. toggle off. repeat.
        for i in range(solutions):
            js = "document.getElementsByClassName(\"solution user-content\")[%d].style.visibility=\"visible\"" % i
            session.eval_script(js)
            session.render(filename.replace(".png", ".s%d.png" % i))
            js = "document.getElementsByClassName(\"solution user-content\")[%d].style.visibility=\"hidden\"" % i
            session.eval_script(js)

        time.sleep(1)
        return True
    except socket.error:
        #  "Connection refused. Exit to reset webkit."
        exit(1)
    except:
        return False
Example #22
def get_js_session(url, wait=5, viewport=(1024, 768), render_fn=None):

    import dryscrape

    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    sess = dryscrape.Session()
    sess.set_viewport_size(width=viewport[0], height=viewport[1])
    sess.visit(url)
    sleep(wait)
    if render_fn:
        if not render_fn.endswith('.png'):
            render_fn += '.png'
        sess.render(render_fn)
    return sess
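
A hypothetical usage example (not from the original source), assuming BeautifulSoup is available to parse the rendered HTML:

from bs4 import BeautifulSoup

sess = get_js_session('https://example.com', wait=3, render_fn='snapshot')
soup = BeautifulSoup(sess.body(), 'lxml')
print(soup.title)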
Example #23
def veriify_signature(url, sign):
    """
    Verification de lapresence d'une signature spécifique dans le code source de votre page
    :param url: 
    :return: 
    """

    dryscrape.start_xvfb()
    sess = dryscrape.Session()
    # Nous n'avons pas besoin des images
    sess.set_attribute('auto_load_images', False)
    # visite du site
    sess.visit(url)
    corp = sess.body()
    if sign in corp:
        print('\n[Ok] * Signature présent dans le code source de la page Web\n')
    else:
        print('\n[Warning] * Pas de signature trouver dans le code source de votre page Web')
        print
Example #24
    def get_username_dynamic_cookie(self):
        try:
            url = "https://weibo.cn/{}/info".format(self.user_id)
            user_agent = ('Mozilla/5.0 (X11; Linux x86_64; rv:12.0) '
                          'Gecko/20100101 Firefox/12.0')
            default_request_headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;'
                'q=0.9,*/*;q=0.8',
                'User-Agent': user_agent,
                'Connection': 'Keep-Alive',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'en-US,*',
            }

            # start xvfb to support headless scraping
            if 'linux' in sys.platform:
                dryscrape.start_xvfb()

            self.dryscrape_session = dryscrape.Session(base_url=url)

            for key, value in default_request_headers.items():
                # seems to be a bug with how webkit-server handles
                # accept-encoding
                if key.lower() != 'accept-encoding':
                    self.dryscrape_session.set_header(key, value)

            cookies = {}
            for cookie_string in self.dryscrape_session.cookies():
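                # NOTE: the 'zipru.to' domain filter below looks like a leftover from another scraper; weibo.cn cookies will likely never match it.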
                if 'domain=zipru.to' in cookie_string:
                    key, value = cookie_string.split(';')[0].split('=')
                    cookies[key] = value
            self.cookie = cookies

            html = requests.get(url,
                                cookies=self.cookie,
                                headers=default_request_headers).content
            selector = etree.HTML(html)
            username = selector.xpath("//title/text()")[0]
            self.username = username[:-3]
            print("User name: " + self.username)
        except Exception:
            logging.exception("message")
Example #25
def main():
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()
    # sPage = requests.get(startUrl)
    # sHtml = sPage.text
    # sPage.raise_for_status()
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit('/sector/industrial-machinery-equipment-companies')
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()
    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked
    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    # writeData(sdf, 'companies')
    writeData(sdf, 'runcompanies')  # Mocked
    n = resultsInfo.nResults()
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))
    for v in range(2, int(n / 15)):
        nextone = '/sector/industrial-machinery-equipment-companies?p=%s' % (v)
        print(nextone)
        # page = requests.get(nextpage)
        # page.raise_for_status()
        # html = page.text
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        # info = Extractor(sHtml)  # Mocked
        df = info.getData()
        # writeData(df, 'companies')
        writeData(df, 'runcompanies')  # Mocked
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))
Example #26
def get_profs():
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    prof_dict = {}

    # Read the HTML File
    url = "https://www.ratemyprofessors.com"
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(url)
    url = "https://www.ratemyprofessors.com/search.jsp?queryBy=schoolId&schoolName=University+of+North+Carolina+at+Chapel+Hill&schoolID=1232&queryoption=TEACHER"
    session.visit(url)
    # <div class="content" onclick="javascript:mtvn.btg.Controller.sendLinkEvent({ linkName:'PROFMIDPANE:LoadMore', linkType:'o' } );">Load More</div>
    load_more = session.at_xpath(
        "//*[@id='mainContent']/div[1]/div/div[5]/div/div[1]")
    for i in range(292):
        load_more.click()

    page = session.body()
    parsed_html = BeautifulSoup(page, 'lxml')

    #print(parsed_html)
    list_of_profs = parsed_html.find('div', attrs={'id': 'body'})
    list_of_profs = list_of_profs.find('div', attrs={'id': 'mainContent'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'left-panel'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'side-panel'})
    list_of_profs = list_of_profs.find('div', attrs={'class': 'result-list'})
    print(list_of_profs)
    list_of_profs = list_of_profs.find_all(
        'li', attrs={'id': re.compile('my-professor*')})

    for x in list_of_profs:
        name = x.find('span', attrs={'class': 'name'}).text
        name = name.split(" ")
        name[1] = re.sub('\s+', '', name[1])
        name = name[0] + " " + name[1][:len(name[1]) - 2]
        name = re.sub(r'\d+', '', name)
        rating = x.find('span', attrs={'class': 'rating'}).text
        prof_dict[str(name)] = float(str(rating))
    return prof_dict
Example #27
    def get_body_response(self, url, is_count_items_exists=False):
        if 'linux' in sys.platform:
            dryscrape.start_xvfb()

        self.session = dryscrape.Session()
        self.session.set_attribute('auto_load_images', False)
        self.session.set_header('User-agent', 'Google Chrome')

        self.session.visit(url)

        if is_count_items_exists:
            value = self.session.at_xpath('//span[@class="ProfileNav-value"]')
            self.count_items = int(
                value.get_attr('data-count')) if value else None

        for i in range(math.ceil(self.limit / PAGE_SIZE)):
            if self._is_last_tweet():
                break
            self._load_more_results()

        return self.session.body()
Example #28
class PageFetcher(object):

    logger = logging.getLogger(__name__)
    dryscrape.start_xvfb()

    def _can_fetch(self, url):
        return True
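        # NOTE: the unconditional "return True" above disables the robots.txt check; the lines below are unreachable as written.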
        robots_file = self._get_robots_file_url(url)
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_file)
        rp.read()
        return rp.can_fetch(USER_AGENT, url)

    def _get_robots_file_url(self, url):
        hostname = urlparse.urlparse(url).hostname
        robots_file = "http://{hostname}/robots.txt".format(hostname=hostname)
        self.logger.debug("Robots file: {r}".format(r=robots_file))
        return robots_file

    def _get_content(self, response):
        if response.info().get('Content-Encoding') == 'gzip':
            self.logger.debug("Decompressing gzip content")
            buf = StringIO( response.read())
            f = gzip.GzipFile(fileobj=buf)
            return f.read()
        return response.read()

    def fetch_page(self, url):
        corrected_url = url
        if not url.startswith("http://") and not url.startswith("https://"):
            corrected_url = "http://{u}".format(u=url)
        self.logger.debug("Fetching page: {u}".format(u = corrected_url))
        cache = get_page_cache()
        cached_content = cache.get_cached_content(corrected_url)
        if cached_content:
            self.logger.debug("Page served from cache")
            return cached_content
        if not self._can_fetch(corrected_url):
            self.logger.warn("Unable to fetch, disallowed by robots.txt")
            raise FetchException("Disallowed by robots.txt")
        try:
            parsed_url = urlparse.urlparse(url)
            base_url = parsed_url.scheme + "://" + parsed_url.hostname
            path = parsed_url.path
            sess = dryscrape.Session(base_url=base_url)
            sess.set_attribute('auto_load_images', False)
            sess.visit(path)
            content = sess.body()
            cache.save_content(corrected_url, content)
            return content
        except Exception as e:
            raise FetchException("Failed to load the page", e)
Example #29
def get_website_dryscrape():
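    # url is assumed to be a module-level global defined elsewhere.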
    try:
        dryscrape.start_xvfb()
        session = dryscrape.Session()
        sleep(3)
        session.visit(url)
        sleep(3)
        response = session.body()
        soup = BeautifulSoup(response,
                             "html.parser").find("div", {
                                 "id": "news-wrapper"
                             }).find("ul")
        liste = []
        for i in soup.findAll("li"):
            exam = str(i).replace("<li>", "").replace("</li>",
                                                      "").split("(")[0]
            if len(exam) > 10:
                if not "\\" in exam:
                    liste.append(exam)
        return liste
    except:
        return []
Example #30
    def parse(self, response):
        # start session
        os.system("echo 'Starting xvfb instance...'")
        dryscrape.start_xvfb()
        session = dryscrape.Session() # start session
        self.session_properties(session)

        os.system("echo 'Crawling...'")

        # visit url
        session.visit(self.start_urls[0]) # visit website
        response = session.body()

        os.system("echo 'Done crawling.'")

        # pkill xvfb
        os.system("echo 'Closing xvfb instance...'")
        os.system("sudo pkill Xvfb")

        # scraper objects
        self.scraper = PollenScraper()
        self.soup = BeautifulSoup(response, 'lxml')

        # extract
        if (self.date != ""):
            self.extract_date(self.date)

            # yield
            yield self.scraper

        else:
            for date in range(3):
                self.extract_date(date)

                # yield
                yield self.scraper
Example #31
    def parse(self, response):
        # start session
        os.system("echo 'Starting xvfb instance...'")
        dryscrape.start_xvfb()
        session = dryscrape.Session()  # start session
        self.session_properties(session)

        os.system("echo 'Crawling...'")

        # visit url
        session.visit(self.start_urls[0])  # visit website
        response = session.body()

        os.system("echo 'Done crawling.'")

        # pkill xvfb
        os.system("echo 'Closing xvfb instance...'")
        os.system("sudo pkill Xvfb")

        # scraper objects
        self.scraper = PollenScraper()
        self.soup = BeautifulSoup(response, 'lxml')

        # extract
        if (self.date != ""):
            self.extract_date(self.date)

            # yield
            yield self.scraper

        else:
            for date in range(3):
                self.extract_date(date)

                # yield
                yield self.scraper
Example #32
def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()

    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()

    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name == 'p')

    outlist = open("imsdb_urls_imdb.csv", "w")

    for movie in movies:
        #<p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name == 'a').text
        if (movie_title.endswith(", The")):
            movie_title = "The " + movie_title.replace(", The", "")

        print movie_title

        movie_url = "http://www.imsdb.com" + urllib.quote(
            movie.find(lambda tag: tag.name == 'a').get("href"))

        movie_writer = movie.find(lambda tag: tag.name == 'i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))

        dir_scripts = "Imsdb_scripts/"
        ensure_dir(dir_scripts)

        session.visit(movie_url)
        response = session.body()
        m = re.search(r'<a href="?\'?(/scripts/[^"\'>]*)', response)
        if m:
            filename = m.group(1)

            if filename.endswith(".html"):

                outscript = open(
                    dir_scripts + movie_title.replace(" ", "_") + ".txt", "w")
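            # NOTE: outscript is only (re)opened for ".html" filenames; the write further below assumes it exists.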
            imsdb_url = "http://www.imsdb.com" + m.group(1)

            session.visit(imsdb_url)
            response = session.body()
            pre = re.search(r'<pre>(.*?)</pre>', response, re.DOTALL)

            if pre:
                script = pre.group(1)
                #print re.search(r'<script>(.*?)<pre>', script, re.DOTALL)
                rem = re.compile(r'<script>(.*?)<pre>', re.DOTALL)
                rem2 = re.compile(r'<title>(.*?)<pre>', re.DOTALL)
                rem3 = re.compile(r'<b>\s+\.*\d*\.*\s*</b>', re.MULTILINE)
                script = re.sub(rem, '', script)
                script = re.sub(rem2, '', script)
                script = re.sub(rem3, '\n', script)

                outscript.write(script.encode('utf8'))
                outscript.close()

                imdb_url = findImdbUrl(movie_title, movie_writer_list)
                if (not imdb_url): imdb_url = ""

                outlist.write(movie_title + "," + movie_writer + "," +
                              imsdb_url + "," + imdb_url + "," + imsdb_url +
                              "\n")

        time.sleep(60)

    outlist.close()
Example #33
def postear_en_twitter(mensaje):

    #===================================================
    # PART 1: SETUP
    #===================================================

    #----------------------------------------------------
    # If running on Linux, we must call the
    # start_xvfb() function
    #----------------------------------------------------
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()

    #----------------------------------------------------
    # Tell it which page we want to visit
    #----------------------------------------------------
    sess = dryscrape.Session(base_url='https://www.twitter.com')
    #----------------------------------------------------
    # Add a header saying we are using a browser;
    # otherwise you will hit twitter as a bot and
    # most likely won't be able to access it
    #----------------------------------------------------
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")
    #----------------------------------------------------
    # Choose whether images should be fetched while
    # scraping the content or not.
    #
    # This bot does not need them, because all it is
    # going to do is send a message
    #----------------------------------------------------
    # True = load images
    # False = do not load images
    sess.set_attribute('auto_load_images', False)

    #----------------------------------------------------
    # Enter the credentials for your Twitter account
    #----------------------------------------------------
    email = 'Enter your twitter email here'  # Keep the quotes
    password = '******'  # Keep the quotes

    try:
        #===================================================
        # PART 2: LOG IN
        #===================================================
        #----------------------------------------------------
        # Visit https://www.twitter.com
        #----------------------------------------------------
        sess.visit('/')
        #----------------------------------------------------
        # Go to the username text box
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')
        #----------------------------------------------------
        # Type in the email given above
        #----------------------------------------------------
        q.set(email)
        #----------------------------------------------------
        # Go to the password text box
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')
        #----------------------------------------------------
        # Type in the password given above
        #----------------------------------------------------
        q.set(password)
        #----------------------------------------------------
        # Click the Log in button
        #----------------------------------------------------
        q.form().submit()

        #===================================================
        # PART 3: WRITE THE MESSAGE
        #===================================================
        #----------------------------------------------------
        # Click the text box used to write the message
        #----------------------------------------------------
        q = sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
        q.click()
        #----------------------------------------------------
        # Type the message passed to the function
        #----------------------------------------------------
        q = sess.at_xpath(
            '/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea'
        )
        q.set(mensaje)
        #----------------------------------------------------
        # Click the Tweet button
        #----------------------------------------------------
        q = sess.at_xpath(
            '//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')
        q.click()
        #----------------------------------------------------
        # Pause one second before leaving the function
        #----------------------------------------------------
        sleep(1)
        # sess.render('twitter.png')
    except Exception as e:
        print(e)
Example #34
def extract_acts(site):
    """
    @brief: Extract acts content to csv file
    @param site: First site in search result
    @note: Requires webkit_server (Unix only)
    """
    # Start webkit server and session
    if 'linux' in sys.platform:
        dryscrape.start_xvfb()
    sess = dryscrape.Session()
    # Load site
    sess.visit(site)

    csv_exists = os.path.exists(settings.CSV_PATH)
    with open(settings.CSV_PATH, 'a', encoding='utf8') as output_file:
        csv_writer = csv.writer(output_file,
                                delimiter=';',
                                quoting=csv.QUOTE_ALL)
        if not csv_exists:
            csv_writer.writerow(settings.HEADERS)

        i = 1
        prev_progress = ''
        # Iterate while "Next" button is enabled
        while True:
            try:
                progress = sess.wait_for(
                    lambda: sess.at_css('.text.result-counter')).text()
                if prev_progress == progress:
                    continue

                row = [i]
                elements = sess.wait_for(lambda: sess.css('.docValue'))
                # Iterate through all fields except last one (34 - 1 = 33)
                for element in elements[:33]:
                    text = element.text().strip()
                    row.append(text if text != 'Не заполнено' else '')

                frame = sess.eval_script(
                    "$('.field-iframe').contents().find('body').html();")
                try:
                    tree = html.fromstring(frame)
                    elements = tree.xpath('//p')
                except Exception:  # pylint: disable=broad-except
                    elements = []

                row.append(find_name(elements))

                csv_writer.writerow(row)
                prev_progress = progress
                print(i)
                i += 1

                button = sess.wait_for(
                    lambda: sess.at_css('.card-paginator .to-right-red'))
                if 'yui-button-disabled' in button['class']:
                    break
                button.children()[0].click()
            except dryscrape.mixins.WaitTimeoutError:
                sess.exec_script('location.reload();')
            except Exception:  # pylint: disable=broad-except
                continue
Example #35
    def initialise(self):
        logging.info("URL:%s", self.url)
        #session = dryscrape.Session(base_url = self.url)
        if 'linux' in sys.platform:
            # start xvfb in case no X is running. Make sure xvfb is installed, otherwise this won't work!
            dryscrape.start_xvfb()
Example #36
import mysql.connector
import re
import sys
import HTMLParser
import dryscrape
from bs4 import BeautifulSoup
from time import sleep
try:
    read = str(sys.argv[1])
    #read = str(input("enter the isbn number:"))
    #reader=["9350293471","9388369157","9385724060","9386797186","9386228343","9381626685","9385724061"]
    Titledb = Authordb = Pagesdb = Publisherdb = Languagedb = ISBN2db = Detailsdb = Subjectdb = None
    #for read in reader:
    ISBN1db = read
    #print("Books isbn-10:",ISBN1db)
    url = "https://www.amazon.in/dp/" + read
    print(url)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    sleep(2)
    response = session.body()
    soup = BeautifulSoup(response, "lxml")
    try:
        extract_title = soup.find('span', {'id': 'productTitle'})
        Title = extract_title.get_text()
        if Title:
            print("Book Title:", Title)
            Titledb = Title
        else:
            pass
    except:
        try:
Example #37
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import dryscrape

# make sure you have xvfb installed
dryscrape.start_xvfb()

root_url = 'YOUR_BASE_URL'

if __name__ == '__main__':
  # set up a web scraping session
  sess = dryscrape.Session(base_url = root_url)

  # we don't need images
  sess.set_attribute('auto_load_images', False)

  # visit webpage
  sess.visit('YOUR_RELATIVE_PATH_TO_BASE_URL')
  # search for iframe with id="mainframe"
  frame = sess.at_xpath('//*[@id="mainframe"]')

  # get the URL of iframe
  frameURL = root_url + frame['src']
  # visit the URL of iframe
  sess2 = dryscrape.Session()
  sess2.visit(frameURL)

  # fill in the form in iframe
  name = sess2.at_xpath('//*[@id="username"]')
  name.set("John")
Example #38
	def __init__(self):
		self.url="http://www.flashscore.com/"
		self.timeout=20
		self.useragent="Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0"
		dryscrape.start_xvfb()
Example #39
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import dryscrape
from bs4 import BeautifulSoup
import re
import sys
import json
import time
import xlsxwriter
import pdb
from kununu import kununu
workbook = xlsxwriter.Workbook('stuttgart.xlsx')  #Create Excel File
ws = workbook.add_worksheet()
dryscrape.start_xvfb()  # Start dryscrape session
session = dryscrape.Session()
session.visit(
    "https://www.dhbw-stuttgart.de/themen/internationales/internationale-studiengaenge/informatik/duale-partner/?tx_cronbafirmen_pi%5Boffset%5D=0&cHash=99f439f6a246d843d3a32e86bb8b32ca"
)  #Visit DHBW Site
response = session.body()
soup = BeautifulSoup(response, "lxml")


def has_colspan(tag):
    return tag.has_attr('colspan')


ws.set_row(0, 24)
ws.set_column(0, 0, 35)
ws.set_column(1, 1, 20)
ws.set_column(2, 2, 40)
ws.set_column(3, 3, 40)
ws.set_column(4, 4, 22)