Example #1
    def __init__(self, app, username, password, debug_dir=None):
        self.app = app
        self.username = username
        self.password = password
        self.debug_dir = debug_dir

        self.session = dryscrape.Session()
        self.session_sub_cat = dryscrape.Session()

        if self.debug_dir:
            if not os.path.exists(self.debug_dir):
                os.mkdir(self.debug_dir)
Example #2
 def readDIs(self, sleeptime, session_time):
     session = dryscrape.Session()
     session.set_timeout(session_time)
     url = 'http://' + self.ip + '/login.cgi?webpwd=Admin&Submit=Submit'
     try:
         session.visit(url)
         time.sleep(sleeptime)
         self.link = True
         self.timestamp = time.time()
     except:
         newError = 'Destination Host Unreachable ' + self.name + ' IP: ' + self.ip
         if newError not in self.ERRORS:
             logger.warning(newError)
             self.ERRORS[newError] = time.time()
         return False
     response = session.body()
     soup = BeautifulSoup(response, 'lxml')
     cDIs = soup.findAll(id=re.compile(r'DI\d+'))
     self.currentDIs = []
     for DI in cDIs:
         if DI.text == 'ON':
             self.currentDIs.append(1)
         else:
             self.currentDIs.append(0)
     return True
Example #3
def findImdbUrl(movie_title, movie_writer):

    dryscrape.start_xvfb()
    session = dryscrape.Session()

    link = "http://www.imdb.com/find?q=" + urllib.quote(movie_title) + "&s=all"
    session.visit(link)
    response = session.body()

    soup = BeautifulSoup(response)
    div = soup.find(lambda tag: tag.name == 'div' and tag.has_attr('class') and
                    'findSection' in tag['class'])
    if (div):
        div_content = "".join([unicode(x) for x in div.contents])

        title_search = re.compile(r'/title/tt\d+')
        search_results = re.findall(title_search, div_content)

        for movie_url in search_results:
            try:
                names = extractPeopleBehind("http://www.imdb.com" + movie_url +
                                            "/")
                if not set(movie_writer).isdisjoint(names):
                    return "http://www.imdb.com" + movie_url + "/"

                #soup_search = BeautifulSoup(resp_search)
                #people_behind = soup_search.findall(lambda tag: tag.name=='div' and tag.has_key('class') and tag['class']=='credit_summary_item')
                #for people in people_behind: print people.text
            except:
                pass

    return None
Example #4
def get_many_usernames(artist_urls, verbose=0):
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    t1 = time.time()
    count = 0
    for url in artist_urls:
        username_urls = []
        session.visit(url)
        time.sleep(1)
        page = session.body()
        soup = BeautifulSoup(page)
        usernames = soup.find_all(class_="username")
        for un in usernames:
            username_urls.append(un.a.get('href'))
        slidernames = soup.find_all(class_="nowslider_name")
        for sn in slidernames:
            username_urls.append(sn.get('href'))
        #slightly minimize repeats
        #(not all of them, though, so do this again later!)
        username_urls = set(username_urls)
        with open("username_urls2.txt", 'a') as uuf:
            for uu in username_urls:
                uuf.write(uu + "\n")
        if verbose > 0:
            print count,
            print "elapsed seconds:", time.time() - t1
            sys.stdout.flush()
        count += 1
Example #5
def read_DIs(ip, name, normal_state):
    session = dryscrape.Session()
    url = 'http://' + ip + '/login.cgi?webpwd=Admin&Submit=Submit'
    print("")

    print(name + ' - ' + time.ctime())
    session.visit(url)
    time.sleep(0.1)
    response = session.body()
    soup = BeautifulSoup(response, 'lxml')
    DIs = soup.findAll(id=re.compile(r'DI\d+'))
    sensors = ["220V", "SMOK", "MUXL", "DOOR", "", ""]
    i = 0
    for DI in DIs:
        if DI.text != '-':
            if DI.text == 'ON':
                DIT = 1
            else:
                DIT = 0
            if str(DIT) != normal_state[i]:
                STATE = "ALARM"
            else:
                STATE = "OK"
            if i < 4:
                print('DI' + str(i) + ' ' + sensors[i] + ' ' + str(DIT) + ' ' +
                      normal_state[i] + ' ' + STATE)
            i = i + 1
    return
Example #6
def getAd(url):
    url = "https:" + url
    session = dryscrape.Session()
    session.visit(url)
    res = session.body()
    print(url)
    return BeautifulSoup(res, "html.parser").find(id="adview")
Example #7
def get_jobs(url):
    ret = {}
    jobs = []
    rake_object = rake.Rake(
        "/root/freshack/Jobscraper/freshdeskhack/SmartStoplist.txt", 3, 2, 1)
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.visit(url)
    html_page = session.body()
    soup = BeautifulSoup(html_page, 'lxml')
    master_tag = soup.find_all("div", class_="fd-posdesc")

    for tag in master_tag:
        job = {}
        job["title"] = tag.h3.string
        div_list = tag.find_all("div")
        job_desc = ""
        for childdiv in div_list:
            text = childdiv.string
            if text:
                job_desc = job_desc + text

        keywords = rake_object.run(job_desc)
        words = []
        for word in keywords:
            if "year" not in word[0]:
                words.append(word[0])
            else:
                job["experience"] = word[0]
        job["keywords"] = words
        jobs.append(job)
    ret["jobs"] = jobs
    return json.dumps(ret)
Example #8
    def hasPopUpWindow(self):
        """
        This method uses dryscrape, which implements WebKit and can scrape a web page for JavaScript as well as HTML.
        JavaScript has alert, confirm, prompt, and window.open methods.
        """
        from bs4 import BeautifulSoup

        retVal = 0
        try:
            sess = dryscrape.Session()
            sess.visit(self.url)
            response = sess.body()
            soup = BeautifulSoup(response)
            data = soup.find('script')
            for tag in soup.findAll('script'):
                stringTag = str(tag)
                matchObj = re.search(
                    r'.*open\(|alert\(|confirm\(|prompt\(.*',
                    stringTag)  # look for alert,confirm,prompt,open
                if matchObj:
                    retVal = 1
                else:
                    retVal = -1
        except:
            printFormat("exc", "hasPopUpWindow", "Pop up window exception")
        self.phishScore['popUpWindow'] = retVal
        return retVal
Example #9
    def __init__(self, uri, soup=True):
        if not uri.startswith('http'):
            uri = 'http://{0}'.format(uri)
        data_path = os.path.join(os.path.dirname(__file__), 'data')
        url_parts = urlparse(uri)
        local_page = url_parts.netloc
        page = url_parts.path if url_parts.path else '/'
        base_url = '{0}://{1}'.format(url_parts.scheme, url_parts.netloc)

        if url_parts.path:
            local_page += '-' + os.path.basename(url_parts.path)

        if url_parts.query:
            local_page += '-' + url_parts.query.replace('=', '-')
            page = '{0}?{1}'.format(page, url_parts.query)

        try:
            f = open(os.path.join(data_path, local_page), 'r')
            data = f.read()
            f.close()
        except IOError as e:
            sess = dryscrape.Session(base_url=base_url)
            sess.set_attribute('auto_load_images', True)
            sess.visit(page)
            data = sess.driver.body()

            local_file = os.path.join(data_path, local_page)

            with open(local_file, 'w') as data_file:
                data_file.write(data.encode('utf-8'))
Example #10
 def scrape_Taobao_dev(url, alertPrice, getName, getIsPriceUnder):
     session = dryscrape.Session()
     session.visit(url)
     response = session.body()
     soup = bs4.BeautifulSoup(response, "html.parser")
     try:
         price = float(soup.find(id='J_PromoPriceNum').getText())
     except AttributeError as error:
         price = float(
             soup.find('input', {
                 'name': 'current_price'
             }).get('value'))
     #print(price)
     if getName and (not getIsPriceUnder):
         name = str.strip(
             soup.find("h3", {
                 "class": "tb-main-title"
             }).getText())
         return name
     elif getIsPriceUnder and (not (getName)):
         if price <= alertPrice:
             return True
         else:
             return False
     elif getName and getIsPriceUnder:
         name = str.strip(
             soup.find("h3", {
                 "class": "tb-main-title"
             }).getText())
         if price <= alertPrice:
             return (name, True)
         else:
             return (name, False)
Example #11
def createSession(crawlingWebsite):
    print "\nCrawler Initiated. Searching in '" + crawlingWebsite + "' for domains.\n\n"
    dryscrape.start_xvfb()
    #Begin new session with loaded scripts
    try:
        session = dryscrape.Session()
        session.visit(crawlingWebsite)
        response = session.body()
        #Reset session for handling memory issues
        session.reset()
    except InvalidResponseError as e:
        print "Cannot open " + crawlingWebsite + "\n"
        print 'InvalidResponseError:', e
        quit()
    soup = BeautifulSoup(response, "html.parser")
    #Searches for hyperrefs in a page. This is the hardcoded bit.
    #Searching for items is webpage-specific. For a different website, please refer to its HTML content
    #to find out which tags are needed to obtain a list of domains if any.
    tableFound = soup.findAll("a", {"target": "_blank"})

    if len(tableFound) == 0:
        print "Nothing found. Terminating crawler."
        quit()
    else:
        for row in tableFound:
            #Add found domains to the list of sites
            siteList.append(row.get('href'))
Example #12
def postear_en_twitter(mensaje):

	if 'linux' in sys.platform:
		dryscrape.start_xvfb()
    
	sess = dryscrape.Session(base_url = 'https://www.twitter.com')
	sess.set_header("User-Agent", "Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0")
	# True = show images
	# False = hide images
	sess.set_attribute('auto_load_images', False)

	email='Enter your Twitter email here'         # Must be a quoted string
	password='******'  # Must be a quoted string

	try:		
		sess.visit('/')
		q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[1]/input')
		q.set(email)
		q = sess.at_xpath('//*[@id="doc"]/div/div[1]/div[1]/div[1]/form/div[2]/input')
		q.set(password)
		q.form().submit()	

		q=sess.at_xpath('//*[@id="tweet-box-home-timeline"]')
		q.click()
		q=sess.at_xpath('/html/body/div[2]/div[3]/div/div[2]/div[2]/div/form/div[2]/textarea')
		q.set(mensaje)		
		q = sess.at_xpath('//*[@id="timeline"]/div[2]/div/form/div[3]/div[2]/button')		
		q.click()
		sleep(1)
		# sess.render('twitter.png')
	except Exception as e:
		print (e)			
Example #13
class Nykaa():
    url = "https://www.makeupalley.com/product/showreview.asp/ItemId=192371/Daily-Superfoliant/Dermalogica/Scrubs"
    session = dryscrape.Session()
    session.visit(url)
    page = session.body()
    soup = BeautifulSoup(page, "html.parser")

    data = soup.find("div", {"id": "reviews-wrapper"})
    xyz = []
    for item in data.findAll("div", {"class": "comments"}):
        dict = {}
        dict['author'] = item.find("div", {
            "class": "user-name"
        }).find("p").find(text=True).strip('\t')
        dict['date'] = item.find("div", {
            "class": "date"
        }).find("p").text.strip('on')
        dict['user_details'] = item.find("div", {
            "class": "important"
        }).find("p").find(text=True)
        dict['comment'] = item.find("div", {
            "class": "comment-content"
        }).find("p").find(text=True)
        xyz.append(dict)
    print(xyz)
Example #14
class Adorebeauty(object):
    url = "https://www.adorebeauty.com.au/dermalogica/dermalogica-daily-superfoliant.html"
    session = dryscrape.Session()
    session.visit(url)
    page = session.body()
    soup = BeautifulSoup(page, "html.parser")
    analyzer = SentimentIntensityAnalyzer()
    data = soup.find("div", {"id": "customer-reviews"})
    xyz = []
    for item in data.findAll("div", {"itemprop": "review"}):
        dict = {}
        dict['user'] = item.find("span", {
            "itemprop": "author"
        }).find(text=True)
        dict['review_title'] = item.find("span", {
            "class": "review-title"
        }).find(text=True)
        dict['date'] = item.find("span", {"class": "date"}).find(text=True)
        dict['comments'] = item.find("p", {
            "itemprop": "description"
        }).find(text=True)
        vs = analyzer.polarity_scores(dict['comments'])
        #print(str(vs))
        dict['sentiment'] = str(vs)
        xyz.append(dict)
    print(xyz)
Example #15
 def fetch_page(self, url):
     corrected_url = url
     if not url.startswith("http://") and not url.startswith("https://"):
         corrected_url = "http://{u}".format(u=url)
     self.logger.debug("Fetching page: {u}".format(u = corrected_url))
     cache = get_page_cache()
     cached_content = cache.get_cached_content(corrected_url)
     if cached_content:
         self.logger.debug("Page served from cache")
         return cached_content
     if not self._can_fetch(corrected_url):
         self.logger.warn("Unable to fetch, disallowed by robots.txt")
         raise FetchException("Disallowed by robots.txt")
     try:
         parsed_url = urlparse.urlparse(url)
         base_url = parsed_url.scheme + "://" + parsed_url.hostname
         path = parsed_url.path
         sess = dryscrape.Session(base_url=base_url)
         sess.set_attribute('auto_load_images', False)
         sess.visit(path)
         content = sess.body()
         cache.save_content(corrected_url, content)
         return content
     except Exception as e:
         raise FetchException("Failed to load the page", e)
Example #16
    def __init__(self, p_from=0, p_to=10000):
        self.__session = dryscrape.Session()
        self.__main = "http://auto.ria.com"
        self.db = sqlite3.connect('cars.db',timeout=1).cursor()
        try:
            self.db.execute('''CREATE TABLE cars (name text, price integer, currency text, url text)''')
        except:
            print "Couldn't create new table"

        params = {
            'target': 'search',
            'event': 'little',
            'category_id':1,
            'bodystyle[0]':3,
            'bodystyle[1]':4,
            'bodystyle[2]':6,
            'chooseTypeSearchAuto':'oldAutos',
            'marka':58,
            'model':0,
            'state':0,
            's_yers':2004,
            'po_yers':0,
            'price_ot':p_from,
            'price_do':p_to,
            'currency':1
        }

        self.__params = urllib.urlencode(params)
Example #17
def get_pgo_events(request):
    # events:
    # events-list__event__date__day : class for the days (span)
    # events-list__event__date__month : class for the month
    # events-list__event__content (CONTENT, SPLIT INTO TITLE AND TEXT)
    # events-list__event__title : class for the event title
    # events-list__event__body : event body / description
    sess = dryscrape.Session()
    sess.visit('https://pokemongolive.com/en/events/')
    body = sess.body()

    soup = BeautifulSoup(body, features="lxml")
    dias = list(soup.find_all('span', class_='events-list__event__date__day'))
    meses = list(
        soup.find_all('span', class_='events-list__event__date__month'))
    titulos = list(soup.find_all('div', class_='events-list__event__title'))
    descripciones = list(
        soup.find_all('div', class_='events-list__event__body'))

    if dias and meses and titulos and descripciones:
        for i in range(5):
            if not Event.objects.filter(days=dias[i].text,
                                        title=titulos[i].text).exists():
                Event.objects.create(title=titulos[i].text,
                                     description=descripciones[i].text,
                                     days=dias[i].text + ' ' + meses[i].text)

    return HttpResponse('Got them!')
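get_pgo_events is a Django view: HttpResponse and the Event model come from the surrounding project and are not shown here. A minimal sketch of the model that the Event.objects.filter/create calls imply; only the field names (title, description, days) are taken from the example, the field types and lengths are assumptions.

# Hypothetical Django model backing get_pgo_events(); field names taken from
# the calls above, everything else is assumed.
from django.db import models


class Event(models.Model):
    title = models.CharField(max_length=200)
    description = models.TextField()
    days = models.CharField(max_length=100)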
Example #18
def getResumen(soup, url):
    # session = dryscrape.Session()
    # session.visit(url)
    # response = session.body()
    # soup = BeautifulSoup(response)
    htmlResumen = soup.find_all(class_='adp-summary')

    while (len(htmlResumen) == 0):
        print 'Error: the summary is empty. Waiting 15 seconds before continuing'
        time.sleep(15)
        session = dryscrape.Session()
        session.visit(url)
        response = session.body()
        soupinterno = BeautifulSoup(response)
        htmlResumen = soupinterno.find_all(class_='adp-summary')
        del soupinterno
        del response
        del session

    resumenParseado = parseaResumen(str(htmlResumen[0]))
    resumenParseadaJSON = {
        'distancia': resumenParseado[0],  # TODO store this distance in km so the averages can be computed without pain
        'tiempo': resumenParseado[1]  # TODO store the time in minutes so the averages can be computed without pain
    }

    return json.dumps(resumenParseadaJSON)
Example #19
    def onMouseOver(self):
        """
        This method looks for on-mouse-over rewriting of links in the status bar. This type of ruse has become less
        effective, as browsers usually ignore it.
        """

        from bs4 import BeautifulSoup

        retVal = 0
        try:
            sess = dryscrape.Session()
            sess.visit(self.url)
            response = sess.body()
            soup = BeautifulSoup(response)
            for tag in soup.findAll('a'):
                if tag.has_attr('onmouseover'):
                    match = re.search(r'window.status', tag['onmouseover'])
                    if match:
                        retVal = 1
                    else:
                        retVal = -1
                if tag.has_attr('href'):  #matches the href=javascript tag
                    hrefMatch = re.search(r'javascript', tag['href'])
                    if hrefMatch:
                        retVal = 1
                    else:
                        retVal = -1
        except:
            printFormat("exc", "onMouseOver", "On mouse over exception")
        self.phishScore['onMouseOver'] = retVal
        return retVal
Example #20
def getRuta(soup, url):
    # session = dryscrape.Session()
    # session.visit(url)
    # response = session.body()
    # soup = BeautifulSoup(response)
    htmlRuta = soup.find_all(class_='adp-directions')
    while (len(htmlRuta) == 0):
        print 'Error: the route is empty. Waiting 15 seconds before retrying'
        time.sleep(15)
        session = dryscrape.Session()
        session.visit(url)
        response = session.body()
        soupinterno = BeautifulSoup(response)
        htmlRuta = soupinterno.find_all(class_='adp-directions')
        del soupinterno
        del response
        del session
    rutaParseada = ruta(str(htmlRuta[0]))
    rutaPorPasosJSON = []

    for paso in rutaParseada:

        if 'El destino' in paso[1]:
            paso[1] = paso[1].replace('El destino', '. El destino')
        pasoJSON = {
            'numero_paso': paso[0].replace('.', ''),
            'descripcion_recorrido': paso[1],
            'distancia_recorrida': paso[2]
        }
        rutaPorPasosJSON.append(pasoJSON)
    return json.dumps(rutaPorPasosJSON)
Example #21
	def find(self, string):
		url = "https://duckduckgo.com/?q=procon.org"

		for word in string.split(" "):
			url += "+" + word
		
		url += "&t=h_&ia=web"

		website_string = requests.get(url).text

		session = dryscrape.Session()
		session.visit(url)
		website_string = session.body()

		search_results = re.findall('<span class="result__url__domain">(.*?)</span>', website_string, flags=re.DOTALL)
		base_url = search_results[0]
		
		links = re.findall("<a.*?newblue-get-started-(.*?)'", requests.get(base_url).text)

		self.background_url = base_url

		if links != []:
			self.argument_url = base_url + links[1]
		else:
			self.argument_url = base_url
Example #22
def listMovieScripts():
    dryscrape.start_xvfb()
    session = dryscrape.Session()
    
    imsdbLink = "http://www.imsdb.com/all scripts/"
    session.visit(imsdbLink)
    webContent = session.body()
    
    bs = BeautifulSoup(webContent)
    movies = bs.findAll(lambda tag: tag.name=='p')

    links = {}
    writers = {}
    
    for movie in movies:
        #<p><a href="/Movie Scripts/Boyhood Script.html" title="Boyhood Script">Boyhood</a> (Undated Draft)<br><i>Written by Richard Linklater</i><br></p>
        movie_title = movie.find(lambda tag: tag.name=='a').text
        if (movie_title.endswith(", The")): movie_title = "The " + movie_title.replace(", The", "")

        movie_url = "http://www.imsdb.com" + urllib.quote(movie.find(lambda tag: tag.name=='a').get("href"))
        
        movie_writer = movie.find(lambda tag: tag.name=='i').text
        movie_writer = movie_writer.replace("Written by ", "")
        movie_writer_list = getlastNames(movie_writer.split(","))

        #print movie_title, movie_url, movie_writer_list
        links[movie_title] = movie_url
        writers[movie_title] = movie_writer_list
        
    return (links, writers)
Example #23
 def get_dryscape_session(self):
     if self.session:
         self.session.reset()
         return self.session
     else:
         self.session = dryscrape.Session()
         return self.session
Example #24
    def __init__(self, url):
        self.__url = url
        self.__haveResult = True
        self.__pageNum = 0
        self.__dataList = []

        sess = dryscrape.Session()
        sess.set_attribute('auto_load_images', False)
        sess.visit(url)
        time.sleep(.5)
        content = sess.body()
        self.soup = bs4.BeautifulSoup(content, "lxml")
        if (
                'delivery' in self.__url
        ):  #set mode by looking for substring in url: 'delivery' or 'pickup' (different html format)
            self.__mode = 'delivery'
        else:
            self.__mode = 'pickup'

        #check if any restaurant data in this area
        resultCheck = self.soup.select('.no-results-h3')
        if (len(resultCheck)):  #no result
            self.__haveResult = False
        else:  #got result, then find the total page number
            self.__pageNum = int(
                self.soup.select('.searchResults-footer-stats')[0].find_all(
                    'span')[3].string)
            #print '----total items: ' + str(self.soup.select('.searchResultsSnippet-text')[1].string)
        print '----' + self.__mode + ': total pages number:' + str(
            self.__pageNum)
Example #25
def get_premarket_volume():
    session = dryscrape.Session()
    session.visit(url)
    response = session.body()
    soup = BeautifulSoup(response, 'html.parser')
    right_table = soup.find('table', {'id': 'preOpenNiftyTab'})
    rows = []
    for row in right_table.find_all('tr'):
        rows.append([
            " ".join(
                val.text.encode('utf8').replace(" NFO", "").replace(
                    " NSE", "").replace(" MCX", "").replace(",", "").split())
            for val in row.find_all('td')
        ])

    sorted_rows = sorted(rows[2:], key=lambda x: float(x[7]), reverse=True)
    top_stocks = []
    for row in sorted_rows[:10]:
        top_stocks.append(row[0])
        sc.api_call("chat.postMessage",
                    channel="@dexter",
                    text='Volume - Stock:{} Volume:{} Change%:{}'.format(
                        row[0], row[7], row[5]),
                    as_user=False)
    return top_stocks
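get_premarket_volume relies on a module-level url and a Slack client sc, neither of which is defined in the snippet. A minimal sketch of that assumed setup using the legacy slackclient package; both values below are placeholders.

# Assumed module-level configuration used by get_premarket_volume().
from slackclient import SlackClient  # legacy Slack client; the example only shows sc.api_call

url = 'https://www.nseindia.com/live_market/dynaContent/live_analysis/pre_open_market_cm.htm'  # placeholder
sc = SlackClient('xoxb-your-bot-token')  # placeholder token

if __name__ == '__main__':
    print(get_premarket_volume())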
Example #26
def getClothes():
    """Fetch a list of clothes from a saga-falabella's url

    Args:
        url: Base men url address
        
    Returns:
        This script create json filter with clothe's cathegories

    Example:
        python3 firebase_script_clothes.py <url>

    """

    sess = dryscrape.Session()
    sess.set_attribute('auto_load_images', False)
    sess.visit(sys.argv[1])
    source = sess.body()
    soup = bs.BeautifulSoup(source,'lxml')
    b = soup.select('div.fb-pod__item a')  # select() returns a list, so grab the anchors directly
    #vertical-filters-custom

    for index, clothe in enumerate(b):
      try:
          print(clothe['href'])
          sess.visit(clothe['href'])#dryscrape
          source = sess.body()#dryscrape
          soup = bs.BeautifulSoup(source, 'lxml')
          # x = soup.select('div.row.transcripts.video-transcripts')#principal parent tag
          # for subtitle_block in x[0].findAll(name='span'):
          #     saveFile(name_clothe, subtitle_block)
      except:
          print("error")
def testURL(section_url):
    # # Put the stuff you see when using Inspect Element in a variable called html.
    # html = urlopen(section_url).read()
    # # Parse the stuff.
    # print html[:100]
    # soup = BeautifulSoup(html, "lxml")
    # # The next two lines will change depending on what you're looking for. This
    # # line is looking for <dl class="boccat">.
    # boccat = soup.find("dl", "ba-Eb-ba")
    # # This line organizes what is found in the above line into a list of
    # # hrefs (i.e. links).
    # print boccat
    # category_links = [BASE_URL + dd.a["href"] for dd in boccat.findAll("dd")]
    # return category_links

    session = dryscrape.Session(base_url=BASE_URL)
    session.visit(section_url)
    print(type(session))
    response = session.body()
    print(type(response))
    print response
    soup = BeautifulSoup(response, 'lxml')
    # results = soup.findAll("div", {"class": "ba-Eb-ba"})
    results = soup.findAll("div", class_="ba-Eb-ba")
    # results = soup.findAll(text='ga:type=\"Comment\"')
    print results
    print len(results)
    # print(soup)
    # print len(soup)
    return 0
Example #28
def main():
    url = input('URL: ')
    name = input('Name to save images: ')

    session = dryscrape.Session()
    session.visit(url)
    response = session.body()
    soup = BeautifulSoup(response, "html.parser")
    if not os.path.isdir('img'):
        os.mkdir('img')

    # Find images in links
    links = soup.findAll('a', {'href': True})
    for link in links:
        if link.findAll('img'):
            img_url = link['href']
            queue.put(img_url)

    # Find images in <img> tags
    imgs = soup.findAll('img', {'src': True})
    for img in imgs:
        img_url = img['src']
        queue.put(img_url)

    create_workers(url, name)
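main() fills a shared queue and then calls create_workers, but neither is defined in the snippet. A rough sketch of the assumed module-level pieces (Python 3 is assumed because of input()); the worker logic and file naming are purely illustrative.

import os
import threading
from queue import Queue, Empty

import requests

queue = Queue()  # shared work queue that main() fills with image URLs


def create_workers(base_url, name, num_workers=4):
    # Hypothetical implementation: a few threads drain the queue and save each image.
    lock = threading.Lock()
    counter = [0]

    def worker():
        while True:
            try:
                img_url = queue.get_nowait()
            except Empty:
                return
            # Resolve protocol-relative and site-relative links.
            if img_url.startswith('//'):
                img_url = 'http:' + img_url
            elif img_url.startswith('/'):
                img_url = base_url.rstrip('/') + img_url
            with lock:
                counter[0] += 1
                index = counter[0]
            try:
                data = requests.get(img_url).content
                with open(os.path.join('img', '{}_{}.jpg'.format(name, index)), 'wb') as f:
                    f.write(data)
            except Exception:
                pass  # skip images that fail to download
            finally:
                queue.task_done()

    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()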
Example #29
def scrape():

    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)
    session.visit(URL)

    response = session.body()
    global_tree = html.fromstring(response)
    products_links = global_tree.xpath('//div[@id="J_goodsList"]/ul/li/div/div[1]/a/@href')
    output = []

    validator = Validator(ScrapeStrategy(scrape_first_type_object), ScrapeStrategy(scrape_second_type_object))

    for i, products_link in enumerate(products_links):
        t0 = datetime.now()

        session.visit('http:' + products_link)
        response = session.body()
        tree = html.fromstring(response)

        strategy = validator.choose_strategy(tree)
        output.append(strategy.scrape(tree, products_link))
        print datetime.now() - t0


    with open('products.csv', 'w') as csvfile:
        fieldnames = ['Brand', 'MPN', 'URL', 'Name', 'Price', 'Stock']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for obj in output:
            writer.writerow(obj.as_dict())
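scrape() depends on URL, Validator, ScrapeStrategy, and two scrape_*_type_object functions that are not shown. A rough sketch of the strategy-pattern plumbing the calls imply; the layout heuristic in choose_strategy, the placeholder URL, and anything else not referenced by the snippet are assumptions.

# Hypothetical stand-ins for the helpers scrape() expects.
URL = 'https://example.com/catalog'  # placeholder listing page


class ScrapeStrategy(object):
    """Wraps one layout-specific scraping function."""

    def __init__(self, scrape_func):
        self.scrape_func = scrape_func

    def scrape(self, tree, link):
        # Delegates to the wrapped function, which should return an object with as_dict().
        return self.scrape_func(tree, link)


class Validator(object):
    """Picks the strategy that matches the product page layout."""

    def __init__(self, first_strategy, second_strategy):
        self.first_strategy = first_strategy
        self.second_strategy = second_strategy

    def choose_strategy(self, tree):
        # Assumed heuristic: use the first layout when its marker element is present.
        if tree.xpath('//div[@class="first-layout-marker"]'):
            return self.first_strategy
        return self.second_strategy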
Example #30
    def process_sites(self):

        # Determine if a dryscrape session is required for a js-reliant site
        need_session = any([x.require_js for x in self.sites])
        if need_session:
            print "Dryscrape session required, creating now.."
            session = dryscrape.Session()
            print "Dryscrape session created"

        for site in self.sites:
            # If it requires JS, use dryscrape
            if site.require_js:
                session.visit(site.url)
                page = session.body()
            # Otherwise just use requests
            else:
                page = requests.get(site.url).text

            # Processing is done the same regardless
            soup = BeautifulSoup(page, "lxml")
            processed_data = site.process_page(soup)

            self.processed_auctions[site.base_url] = processed_data

        print ""
        print "Done processing sites"
        PP.pprint(self.processed_auctions)
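Taken together, nearly every example above follows the same skeleton: start Xvfb on a headless Linux host, open a dryscrape.Session, optionally disable image loading, visit the page, and hand session.body() to BeautifulSoup. A minimal, self-contained sketch of that shared pattern; the URL is a placeholder.

import sys

import dryscrape
from bs4 import BeautifulSoup


def fetch_soup(url):
    """Render a JavaScript-heavy page with dryscrape and return a BeautifulSoup tree."""
    if 'linux' in sys.platform:
        # Headless servers need a virtual display for the WebKit backend.
        dryscrape.start_xvfb()
    session = dryscrape.Session()
    session.set_attribute('auto_load_images', False)  # skip images to speed up rendering
    session.visit(url)
    return BeautifulSoup(session.body(), 'lxml')


if __name__ == '__main__':
    soup = fetch_soup('https://example.com/')  # placeholder URL
    print(soup.title)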