Example #1
3
def scraper():
    '''
        This script scrapes http://usesthis.com/interviews/
        and, for each interview, saves the interviewee's name,
        the product name, product description, and a link to the product
        to a csv file named everyone.csv
    '''
    outputFile = open('everyone.csv', 'a')
    scrapecount = 0
    response = requests.request('get', 'http://usesthis.com/interviews/')
    html = soup(response.text)
    interviewLinks = html.findSelect('#interviews li h2 a')
    linkLength = len(interviewLinks)
    while scrapecount < (linkLength):
        response = requests.request('get', interviewLinks[scrapecount]['href'])
        html = soup(response.text)
        person = html.findSelect('.person')[0].text
        product = html.findSelect('#contents article.contents p a')
        productLength = len(product)
        csvWriter = csv.writer(outputFile)
        for x in range(0, productLength, 1):
            try:
                print person, product[x].text, product[x]['title'], product[x]['href']
                csvWriter.writerow([person, product[x].text, product[x]['title'], product[x]['href']])
            except Exception as e:
                print '%s, %s, %s, %s' % ('Exception', 'Exception', 'Exception', e)
        scrapecount += 1
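Note: findSelect() is not part of BeautifulSoup itself; it appears to come from the soupselect monkey-patch for BeautifulSoup 3. Below is a minimal sketch of the same scrape using BeautifulSoup 4's built-in select(), assuming the bs4 and requests packages and keeping the selectors from the example above (they may no longer match the live site).

# A rough BeautifulSoup 4 equivalent of Example #1 (a sketch, not the
# original code): select() replaces the non-standard findSelect() helper.
import csv
import requests
from bs4 import BeautifulSoup

def scrape_usesthis(outfile='everyone.csv'):
    listing = BeautifulSoup(
        requests.get('http://usesthis.com/interviews/').text, 'html.parser')
    with open(outfile, 'a') as f:
        writer = csv.writer(f)
        for link in listing.select('#interviews li h2 a'):
            page = BeautifulSoup(
                requests.get(link['href']).text, 'html.parser')
            person = page.select('.person')[0].get_text()
            for a in page.select('#contents article.contents p a'):
                writer.writerow([person, a.get_text(),
                                 a.get('title', ''), a.get('href', '')])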
Example #2
0
def __actinit(mech,paswd):
    mech.open(PLAT)
    
    mech.select_form(nr=0)
    mech["user"] = "******"
    mech["passwd"] = b6d(paswd)
    results = mech.submit().read()
    soup(results)
    print (PLAT+'file/memit/')
Example #3
0
	def getLinks(self,page): #in later versions i plan to make a more comprehensive interpretation of the page ;)
		exploit_table = soup(page).findAll("tr")
		for exploit in exploit_table:
			ex=exploit.contents[1]
			ex=soup(str(ex))
			anchor=str(ex.a)
			if anchor != 'None':
				descr = anchor[anchor.index('>')+1:-4]
				link = anchor.split(" ")[1].replace("href=",'').replace("\"",'')
				sys.stderr.write("[%s]%shttp://www.1337day.com%s\n" % (descr,' '*(90-(len(descr)+len(link))),link))
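Note: the string slicing on str(ex.a) above is fragile. A sketch of the same extraction done through tag attributes with BeautifulSoup 4 follows; the output format is kept, but the row handling is simplified.

import sys
from bs4 import BeautifulSoup

def get_links(page):
    # Walk every table row and pull the first anchor, if any.
    for row in BeautifulSoup(page, 'html.parser').find_all('tr'):
        anchor = row.find('a', href=True)
        if anchor is None:
            continue
        descr = anchor.get_text(strip=True)
        link = anchor['href']
        sys.stderr.write("[%s] http://www.1337day.com%s\n" % (descr, link))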
Example #4
0
def __actinit(mech, paswd):
    cj = cookielib.LWPCookieJar()
    mech.set_cookiejar(cj)
    mech.open(PLAT)
    mech.select_form(nr=0)
    mech["user"] = "******"
    mech["passwd"] = b6d(paswd)
    results = mech.submit().read()
    cookies = mech._ua_handlers['_cookies'].cookiejar
    soup(results)
    xsrf = [ck for ck in cookies if ck.name == '_xsrf'][0]
    print (PLAT+STORAGE, xsrf)
    return xsrf.value
Example #5
0
def readhorse(horsename):
    ''' This function reads the primary web page for each horse. '''
    horsename1 = horsename.replace(" ","+")
    webstring = "http://www.pedigreequery.com/"+horsename1
    horsename=horsename.strip()
    print repr(horsename)
    hsoup = soup(urllib.urlopen(webstring)).findAll(text=re.compile("{2-d} DP =",re.IGNORECASE))
    hsoup2 = soup(urllib.urlopen(webstring)).findAll(text=re.compile("Earnings",re.IGNORECASE))
    #hsoup = soup(urllib.urlopen(webstring))
    hsoup = str(hsoup)
    hsoup2 = str(hsoup2)
    dhorsename = horsename.rstrip()+".txt"
    with open(dhorsename,'w') as webout:
        webout.write(hsoup)
        webout.write(hsoup2)
Example #6
0
 def search(self, item):
     item = quote(item)
     my_url = self.url + "search/?query=" + item
     response = requests.get(
         my_url)  # Opens connection, grabs the webpage and downloads it
     page_html = response.content
     #Parsing html
     page_soup = soup(page_html)
     #grabs each product
     containers = page_soup.findAll(
         "div", {"class": "search-result-gridview-item-wrapper"})
     # print(len(containers))
     storeObj = Store()
     for container in containers:
         out_of_stock = len(container.findAll(
             "div",
             {"class": "product-sub-title-block product-out-of-stock"
              })) != 0
         if not out_of_stock:
             store = Store()
             store.store_name = 'Walmart'
             store.title = container.img["alt"]
             store.image_url = container.img["data-image-src"]
             store.product_url = self.url + container.a["href"]
             store.price = container.findAll(
                 "span", {"class": "visuallyhidden"})[-1].text
             storeObj.add_item(store)
     return storeObj.generate_json()
Example #7
0
    def parse_packet(self):
        '''
        This function will parse the needed data from the packet PSML XML
        definition and send the data to the API.
        '''
        # If the counter timer is set to 0, then this is the first packet
        # we have parsed.  Set the counter to the current time so that we
        # don't send a single packet stat to the API.
        if self.counter == 0:
            self.counter = int(time.time())

        # Next we instantiate a BeautifulSoup object to parse the packet and
        # pull out the protocol name.
        packet = soup(self.packet)
        proto = packet.findAll('section')[4].text


        # If we don't see the protocol yet in the protos dictionary, we need
        # to initialize it.  After that, we can then increment regardless.
        if proto not in self.protos:
            self.protos[proto] = 0
        self.protos[proto] += 1

        # Once we reach 60 seconds, we need to purge out the protocol counts
        # that we have counted.  Make an API call for each proto we have,
        # then reset the counter timer and the protos dictionary.
        if (int(time.time()) - self.counter) >= 60:
            for proto in self.protos:
                log.debug('TSHARK: sending %s=%s' % (proto, self.protos[proto]))
                self.api.stat(proto, self.protos[proto])
            self.counter = int(time.time())
            self.protos = {}
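The counting-and-flush pattern inside parse_packet() can be isolated. Below is a sketch using collections.Counter, with a generic send(proto, count) callback standing in for the API client; the callback name is made up here.

import time
from collections import Counter

class ProtoStats(object):
    def __init__(self, send, interval=60):
        self.send = send            # callback, e.g. lambda p, c: api.stat(p, c)
        self.interval = interval
        self.counter = 0            # start of the current window
        self.protos = Counter()

    def add(self, proto):
        # First packet: start the window so we never flush a single packet.
        if self.counter == 0:
            self.counter = int(time.time())
        self.protos[proto] += 1
        # Flush once the window is over, then reset.
        if int(time.time()) - self.counter >= self.interval:
            for name, count in self.protos.items():
                self.send(name, count)
            self.counter = int(time.time())
            self.protos = Counter()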
Example #8
0
    def get_soup_from_url(self, url_in):
        """
        Return data loaded from a URL, as a BeautifulSoup(3) object.

        Wrapper helper function around self.get_data_from_url()
        """
        return soup(self.get_data_from_url(url_in))
Example #9
0
    def get_input_register(self):
        """ Getting input name list in form register

        :return:
        """
        response = requests.get(self.form_url)
        resp = soup(response.text)

        password = resp.find('input', {'type': 'password'})
        if self.form_name:
            form = resp.find('form', {'name': self.form_name})
        else:
            form = password.findParent('form')

        self.attrs = {}

        for input_text in form.findAll('input'):
            input_name = input_text.get('name', None)
            value = input_text.get('value', None)
            if input_text and input_name:
                if input_text['type'] == 'checkbox':
                    self.attrs.update({input_name: ['on']})
                elif input_text['type'] == 'radio':
                    self.attrs.update({input_name: [value]})
                else:
                    self.attrs.update({input_name: value})

        for input_select in form.findAll('select'):
            values = input_select.findAll('option')[1:]
            self.attrs.update({input_select.get('name', None): random.choice([[value['value']] for value in values])})
Example #10
0
def proxy(request, path):
    """
    Answer requests for webalizer images and monthly reports.
    If an image is requested, let django's static.serve view do the work;
    if an html file is requested, just insert the content of
    the <body> into the django template.
    
    """
    context = {'title': 'Webalizer'}
    if path is None or path == u'':
        path = 'index.html'
    if webalizer_dir is not None:
        if path.endswith('.png'): # webalizer generates png images
            return serve(request, path, document_root=webalizer_dir)
        else:
            try:
                webalizer_index = open(os.path.join(webalizer_dir, path)).read()
                webalizer_soup = soup(webalizer_index)
                context.update({'data': ' '.join([unicode(x) for x in webalizer_soup.body.contents])})
            except:
                context.update({'data': None})
    return direct_to_template(request, 'webalizer/index.html', context)
Example #11
0
    def get_soup_from_url(self, url_in):
        """
        Return data loaded from a URL, as a BeautifulSoup(3) object.

        Wrapper helper function around self.get_data_from_url()
        """
        return soup(self.get_data_from_url(url_in))
Example #12
0
def top_ten():
    httpreq = requests.get('https://news.ycombinator.com')
    dom = soup(httpreq.text)

    outer = dom.find("table")
    inner = outer.findAll("table")[1]

    rowlist = inner.findAll("tr")
    list = []
    for row in range(0,len(rowlist)-3,3):
        rowmain = rowlist[row].findAll("td")
        rowsub = rowlist[row+1].findAll("td")

        listitem = {"link": rowmain[2].find("a")["href"], "title": rowmain[2].find("a").string, "domain": rowmain[2].findAll("span")[1].string}
        try:
            listitem["poster"] = rowsub[1].findAll('a')[0].string
            listitem["posted"] = rowsub[1].findAll('a')[1].string
            listitem["comment"] = re.findall(r'\d+', rowsub[1].findAll('a')[2].string)[0]
        except:
            continue

        list.append(listitem)



    response = "HackerNews Top 10\n"
    for i in range(0,10):
        response += '['+str(i+1)+'] ' + list[i]['title'] + list[i]['domain'] + ' ('+list[i]["posted"]+' | ' +list[i]["comment"] +' comment)' + '\n'

    return response
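The fixed row offsets above break whenever the HackerNews markup shifts. As an alternative, the public HackerNews Firebase API (documented at github.com/HackerNews/API) returns the same data as JSON; the sketch below assumes requests and uses the field names from that documentation.

import requests

HN = 'https://hacker-news.firebaseio.com/v0'

def top_ten_api():
    ids = requests.get(HN + '/topstories.json').json()[:10]
    lines = ['HackerNews Top 10']
    for rank, item_id in enumerate(ids, 1):
        item = requests.get('%s/item/%d.json' % (HN, item_id)).json()
        lines.append('[%d] %s (%d comments) %s' % (
            rank, item.get('title', ''), item.get('descendants', 0),
            item.get('url', '')))
    return '\n'.join(lines)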
Example #13
0
    def parse_packet(self):
        '''
        This function will parse the needed data from the packet PSML XML
        definition and send the data to the API.
        '''
        # If the counter timer is set to 0, then this is the first packet
        # we have parsed.  Set the counter to the current time so that we
        # don't send a single packet stat to the API.
        if self.counter == 0:
            self.counter = int(time.time())

        # Next we instantiate a BeautifulSoup object to parse the packet and
        # pull out the protocol name.
        packet = soup(self.packet)
        proto = packet.findAll('section')[4].text

        # If we don't see the protocol yet in the protos dictionary, we need
        # to initialize it.  After that, we can then increment regardless.
        if proto not in self.protos:
            self.protos[proto] = 0
        self.protos[proto] += 1

        # Once we reach 60 seconds, we need to purge out the protocol counts
        # that we have counted.  Make an API call for each proto we have,
        # then reset the counter timer and the protos dictionary.
        if (int(time.time()) - self.counter) >= 60:
            for proto in self.protos:
                log.debug('TSHARK: sending %s=%s' %
                          (proto, self.protos[proto]))
                self.api.stat(proto, self.protos[proto])
            self.counter = int(time.time())
            self.protos = {}
Example #14
0
def get_th(_num):
    num = 1
    if _num - 1 in range(0, 29):
        num = _num - 1
    else:
        num = randint(0,29)

    httpreq = requests.get('https://news.ycombinator.com')
    dom = soup(httpreq.text)

    outer = dom.find("table")
    inner = outer.findAll("table")[1]

    row = inner.findAll("tr")

    rowmain = row[(num*3)-3].findAll("td")
    rowsub = row[(num*3)-2].findAll("td")
    returnitem = {"link": rowmain[2].find("a")["href"], "title": rowmain[2].find("a").string, "domain": rowmain[2].findAll("span")[1].string}
    try:
        returnitem["poster"] = rowsub[1].findAll('a')[0].string
        returnitem["posted"] = rowsub[1].findAll('a')[1].string
        returnitem["comment"] = re.findall(r'\d+', rowsub[1].findAll('a')[2].string)[0]
        returnitem["posttype"] = 's'
    except:
        returnitem["posttype"] = 'j'

    response = '[->] ' + returnitem['title'] + returnitem['domain']
    if returnitem["posttype"] == 'j':
        response += ' (Job) \n'
    else:
        response += ' ('+returnitem["posted"]+' | ' +returnitem["comment"] +' comment)' + '\n'

    response += returnitem["link"]
    return response
Example #15
0
    def anagram(self):
        if not self.values:
            self.chat("Enter a word or phrase")
            return

        word = '+'.join(self.values)
        url = "http://wordsmith.org/anagram/anagram.cgi?anagram=" + word + "&t=50&a=n"

        urlbase = pageopen(url)
        if not urlbase:
            self.chat("Fail")
            return

        cont = soup(urlbase)

        if len(cont.findAll("p")) == 6:
            self.chat("No anagrams found.")
            return

        try:
            paragraph = cont.findAll("p")[3]
            content = ','.join(paragraph.findAll(text=True))
            content = content[2:-4]
            content = content.replace(": ,", ": ")
            self.chat(content)
        
        # Usually not concerned with exceptions
        # in mongo, but this is bound to come up
        # again.
        except Exception as e:
            print e
            self.chat("Got nothin")
def handle_ly(filename, orderno):
    site = "http://www.ly.gov.tw"

    htmlfile = open(filename,encoding='UTF-8')
    start_keystring = u'屆 立法委員名單'
    end_keystring = u'查詢結果共'
    state = "none" # none, inline
    print u'第' + str(orderno) + u'屆 清單'
    state = "none" # none, inline
    cnt = 0
    for line in htmlfile:
        if line.count(start_keystring) >= 1:
            state = "inline"
        if line.count(end_keystring) >= 1 and state == "inline":
            state = "none"

        if state == "inline":
            cnt+= 1
            html_line = soup(line, fromEncoding="UTF-8")
            for tag in html_line.findAll('a', {'href': True}) :
                if(tag.string != None and tag['href'] != None ):
                    url = site + tag['href']
                    legislator_name = tag.string.encode('UTF-8')
               #     print type(legislator_name.encode("UTF-8"))
                    inject_legislator_tag(legislator_name, url, orderno) 
Example #17
0
	def extract(self):
		_html = soup(self.RESPONSE)
		for cite in _html.findAll("p", attrs={'class': 'PartialSearchResults-item-url'}):
			sub = re.search(self.regexp, cite.text, re.IGNORECASE)
			if sub:
				_sub = sub.group()
				if _sub not in self.SUBDOMAINS:
					self.SUBDOMAINS.append(_sub)
Example #18
0
	def extract(self):
		_html = soup(self.RESPONSE)
		for cite in _html.findAll("cite"):
			sub = re.search(self.regexp, cite.text, re.IGNORECASE)
			if sub:
				_sub = sub.group()
				if _sub not in self.SUBDOMAINS:
					self.SUBDOMAINS.append(_sub)
Example #19
0
 def _get_all_slunk_installer_links(self, page_text):
     logger.debug('Parsing page in search for links')
     html = soup(page_text)
     for tag in html.findAll('a', {'data-link': True}):
         link = tag.attrMap['data-link']
         logger.debug('Found link: %s' % link[:-4])
         if link.startswith(self.link_prefix):
             yield link
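tag.attrMap is BeautifulSoup 3 only; in BeautifulSoup 4 the tag itself is subscriptable. A minimal bs4 version of the same generator follows, with link_prefix passed in as a parameter instead of read from self.

from bs4 import BeautifulSoup

def get_installer_links(page_text, link_prefix):
    html = BeautifulSoup(page_text, 'html.parser')
    # attrs= is needed because "data-link" is not a valid keyword argument.
    for tag in html.find_all('a', attrs={'data-link': True}):
        link = tag['data-link']
        if link.startswith(link_prefix):
            yield link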
Example #20
0
def findFontGUID(fontTable):
    xmlStr = open(fontTable, "r").read()
    xml    = soup(xmlStr)

    for i, k in enumerate(xml.findAll("w:font")):
        fName = k["w:name"]
        x     = soup(str(k))

        try:
            fontKey =  x.findAll("w:embedregular")[0]["w:fontkey"]
            fontKey = "".join(fontKey[1:-1].split("-"))
        except:
            continue

    print "\tFont: {0}\n\tKey : {1}".format(fName, fontKey)
    print

    return fontKey
def findFontGUID(fontTable):
    xmlStr = open(fontTable, "r").read()
    xml = soup(xmlStr)

    for i, k in enumerate(xml.findAll("w:font")):
        fName = k["w:name"]
        x = soup(str(k))

        try:
            fontKey = x.findAll("w:embedregular")[0]["w:fontkey"]
            fontKey = "".join(fontKey[1:-1].split("-"))
        except:
            continue

    print "\tFont: {0}\n\tKey : {1}".format(fName, fontKey)
    print

    return fontKey
def _getData(url):
    try:
        data = download_page(url)
        data = data.decode('utf-8')
        data = soup(data)
    except Exception as e:
        print '[Letterboxd][_getData] %s' % (e)
        return None, None
    else:
        return data, _getNextPage(data)
 def homepage_is_in_linkspanel(self, response):
     """
     Checks whether the homepage link is displayed
     """
     html = soup(response.content)
     links = html.findAll("a")
     for l in links:
         if self.homepage in l['href']:
             return True
     return False
Example #24
0
    def linker(self, urls):
        for url in urls:
            # Special behaviour for Twitter URLs
            match_twitter_urls = re.compile("http[s]?://(www.)?twitter.com/.+/status/([0-9]+)")

            twitter_urls = match_twitter_urls.findall(url)
            if len(twitter_urls):
                self.tweet(twitter_urls)
                return

            fubs = 0
            title = "Couldn't get title"
            roasted = "Couldn't roast"

            urlbase = pageopen(url)
            if not urlbase:
                fubs += 1

            try:
                opener = urllib2.build_opener()
                roasted = opener.open(SHORTENER + url).read()
            except:
                fubs += 1

            ext = url.split(".")[-1]
            images = ["gif", "png", "jpg", "jpeg"]

            if ext in images:
                title = "Image"
            elif ext == "pdf":
                title = "PDF Document"
            else:
                try:
                    cont = soup(urlbase, convertEntities=soup.HTML_ENTITIES)
                    title = cont.title.string
                except:
                    self.chat("Page parsing error")
                    return

            deli = "https://api.del.icio.us/v1/posts/add?"
            data = urllib.urlencode({"url": url, "description": title, "tags": "okdrink," + self.lastsender})

            if DELICIOUS_USER:
                base64string = base64.encodestring("%s:%s" % (DELICIOUS_USER, DELICIOUS_PASS))[:-1]
                try:
                    req = urllib2.Request(deli, data)
                    req.add_header("Authorization", "Basic %s" % base64string)
                    send = urllib2.urlopen(req)
                except:
                    self.chat("(delicious is down)")

            if fubs == 2:
                self.chat("Total fail")
            else:
                self.chat(unescape(title) + " @ " + roasted)
 def panel_is_in_response(self, response):
     """
     Checks whether the versioned links panel is found in the rendered HTML
     response.
     """
     html = soup(response.content)
     panels = html.findAll("div", {'class': 'panel-heading'})
     for panel in panels:
         if 'versioned links' in str(panel):
             return True
     return False
Example #26
0
 def scrape(self, url):
     if not any(url in s for s in self.visited):
         print("scanning %s", url)
         r = requests.get(url)
         self.visited.append(url)
         if r.status_code != 200:
             self.dead.append(url)
         else:
             s = soup(r.content)
             for link in s.findAll('a'):
                 self.unvisited.append(link)
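The crawler above appends whole Tag objects to self.unvisited rather than their href strings. Below is a standalone sketch of the same step that collects the href values instead (bs4 and requests assumed; the original is a method with object state, which is passed in as lists here).

import requests
from bs4 import BeautifulSoup

def crawl_step(url, visited, dead, unvisited):
    if url in visited:
        return
    print("scanning %s" % url)
    r = requests.get(url)
    visited.append(url)
    if r.status_code != 200:
        dead.append(url)
        return
    for a in BeautifulSoup(r.content, 'html.parser').find_all('a', href=True):
        unvisited.append(a['href'])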
 def get_general_info_link_panel(self, response):
     """
     Checks whether the links panel is found in the rendered HTML
     response.
     """
     html = soup(response.content)
     panels = html.findAll("div", {'class': 'panel-heading'})
     for panel in panels:
         if 'links' in str(panel) and not 'versioned links' in str(panel):
             return panel
     return False
def build_metadata(filename):
    list_html_file = open(filename, encoding="UTF-8")

    keyword = "public/Attachment"
    for line in list_html_file:
        if line.count(keyword) > 0:
            html_line = soup(line)
            print html_line.a["href"]
            print html_line.a.img["title"]
            wget_pdf(html_line)

    list_html_file.close()
Example #29
0
    def GetFileHosts(self, url, list, lock, message_queue):
        import base64
        import re
        from entertainment.net import Net
        
        net = Net(do_not_cache_if_any=do_no_cache_keywords_list)

        def unpack_js(p, k):
            k = k.split('|')
            for x in range(len(k) - 1, -1, -1):
                if k[x]:
                    p = re.sub('\\b%s\\b' % base36encode(x), k[x], p)
            return p

        def base36encode(number, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
            if not isinstance(number, (int, long)):
                raise TypeError('number must be an integer')
     
            if number == 0:
                return alphabet[0]
            base36 = ''
            sign = ''
            if number < 0:
                sign = '-'
                number = - number
            while number != 0:
                number, i = divmod(number, len(alphabet))
                base36 = alphabet[i] + base36
            return sign + base36
        
        sources = [] ; final = []
        html = soup(net.http_GET(url).content)


        links = html.findAll('script')
        for link in links:
            try:
                if 'eval' in link.contents[0]:
                    r = re.search('return p}\(\'(.+?);\',\d+,\d+,\'(.+?)\'\.split',link.contents[0])
                    if r: p, k = r.groups()
                    try: sources.append((base64.b64decode(unpack_js(p, k).split('"')[1]).split('>')[1].split('<')[0]))
                    except:pass
            except IndexError : pass
        for link in sources:
            if 'www' not in link.split('//')[1].split('.')[0]:
                final.append((link.split('//')[1].split('.')[0],link))
            else: final.append((link.split('//')[1].split('.')[1],link))
        for title,blob in final:

            if not 'watchfreeinhd' in title.lower():

                self.AddFileHost(list, 'SD', blob)
Example #30
0
def main():
  files = os.popen('dir /B *.html').read()
  files = files.split('\n') 

  for file in files:
    if len(file) > 0:
      infile = open(file,'r')
      data = infile.read()
      infile.close()
      soupped = soup(data)
      outfile = open(file,'w')
      outfile.write(soupped.prettify())
      outfile.close()
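The dir /B shell call limits Example #30 to Windows. A portable sketch of the same prettify-in-place loop using glob, assuming BeautifulSoup 4:

import glob
from bs4 import BeautifulSoup

def prettify_all(pattern='*.html'):
    for name in glob.glob(pattern):
        with open(name) as infile:
            data = infile.read()
        with open(name, 'w') as outfile:
            outfile.write(BeautifulSoup(data, 'html.parser').prettify())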
Example #31
0
def term_to_link(term):
    urlopener = urllib2.build_opener()
    urlopener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                             'AppleWebKit/535.19 (KHTML, like Gecko) '
                             'Chrome/18.0.1025.168 Safari/535.19')]
    if term.find(' ') > 0:
        term = ('"%s"' % term).replace(' ', '+')
    google_results = urlopener.open('https://www.google.com/search?q=%s' % term)
    page = soup(google_results.read())
    try:
        first_result = page.find('div', id='ires').find('h3', 'r').find('a')['href']
    except AttributeError:
        return None
    return first_result
Example #32
0
 def run(
     self,
     target_directory,
     start_chapter=1,
     start_page=1,
     failure_threshold=2):
     """
     :param target_directory:
     """
     if not exists(target_directory):
         makedirs(target_directory)
     current_chapter = start_chapter
     current_page = start_page
     current_directory = join(target_directory, str(current_chapter))
     if not exists(current_directory):
         makedirs(current_directory)
     failure = 0
     visited = []
     _logger.info('Downloading chapter %s' % current_chapter)
     next_page_url = self._url_builder(current_chapter, current_page)
     while next_page_url is not None:
         response = get(next_page_url)
         if response.status_code == 200:
             html = soup(response.text)
             image_url = self._image_extractor(html)
             if image_url is None or image_url in visited:
                 failure += 1
             else:
                 path = join(current_directory, str(current_page)) + '.jpg'
                 success = _download(image_url, path)
                 if success:
                     failure = 0
                     visited.append(image_url)
                 else:
                     _logger.warn('Failed downloading page %s' % current_page)
                     failure += 1
         else:
             failure += 1
             _logger.warn('Failed downloading page %s' % current_page)
         if failure == 0:
             current_page += 1
         if failure > failure_threshold:
             current_chapter += 1
             current_page = start_page
             visited = []
             current_directory = join(target_directory, str(current_chapter))
             if not exists(current_directory):
                 makedirs(current_directory)
             _logger.info('Downloading chapter %s' % current_chapter)
         next_page_url = self._url_builder(current_chapter, current_page)
Example #33
0
def __parse_meal_ratty(html):
    parsed = soup(html)
    table  = parsed.find('table', {'id':'tblMain'})
    rows   = table.findAll('tr')[1:]
    cols = [urllib2.unquote(col.text) for col in rows[0].findAll('td')[1:]]
    print 'Cols: ' + str(cols)
    data = {col:[] for col in cols}
    for row in rows[1:-1]:
        row_cols = row.findAll('td')[1:]
        for ix, c in enumerate(row_cols):
            if c.text:
                data[cols[ix]].append(c.text)
    data['Other'] = [col.text for col in rows[-1].findAll('td') if col.text and col.text != '.']
    return data
Example #34
0
 def getDom(self, url, host, data):
     opener = urllib2.build_opener(urllib2.HTTPHandler)
     opener.addheaders = [('User-Agent', ua), ('Host', host)]
     urllib2.install_opener(opener)
     try:
         response = urllib2.urlopen(url, data, timeout=120)
         result = response.read()
     except:
         try:
             response = urllib2.urlopen(url, data, timeout=120)
             result = response.read()
         except:
             return False
     return soup(result)
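A requests-based sketch of the same retry-once-then-give-up fetch. The ua user-agent string comes from the surrounding module and is treated as a parameter here; urlopen(url, data) issues a POST when data is given, which requests.request mirrors below.

import requests
from bs4 import BeautifulSoup

def get_dom(url, host, data=None, ua='Mozilla/5.0'):
    headers = {'User-Agent': ua, 'Host': host}
    method = 'POST' if data is not None else 'GET'
    for _ in range(2):                      # one retry, like the original
        try:
            r = requests.request(method, url, data=data,
                                 headers=headers, timeout=120)
            return BeautifulSoup(r.text, 'html.parser')
        except requests.RequestException:
            continue
    return False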
Example #35
0
 def parse_packet(self):
     '''This function will parse the needed data from the packet XML
     definition and send the data to the API.
     '''
     packet = soup(self.packet)  # The BeautifulSoup parser object of the XML
     proto = packet.findAll('section')[-2].text
     if proto not in self.protos:
         self.protos[proto] = 0
     self.protos[proto] += 1
     if (int(time.time()) - self.wait_timer) >= self.interval:
         for proto in self.protos:
             log.debug('%s Stats: %s: %s' % (self.stanza, proto, self.protos[proto]))
             dofler.api.stat(proto, self.protos[proto])
         self.wait_timer = int(time.time())
         self.protos = {}
Example #36
0
    def episodes(self, show_title):
        show_url = SHOW_URL + show_title.lower().replace(' ', '-')
        self.browser.open(show_url)

        show_html = soup(self.browser.response(), convertEntities=soup.HTML_ENTITIES)
        episodes = []
        for season in show_html('tr', {'class': 'episodes-list'}):
            for episode in season('td'):
                if not episode.has_key('data-id'):
                    continue
                episodes.append( {  'wid': episode['data-id'], 
                                    'id': episode('span', {'class': 'e-count'})[0].text,
                                    'name': episode('span', {'class': 'e-title'})[0].text
                    } )
        return episodes
Example #37
0
 def parse_links(self, source):
     """
     A bit hacky: parses all href attributes from the html,
     then "finds all urls" in them.
     The second step is needed because some href attributes in templates
     can be placeholders etc., which we don't need.
     """
     all_links = set()
     for tag in soup(source).findAll("a", {"href": True}):
         val = tag.attrMap["href"]
         urls = re.findall("""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""", val)
         if len(urls) == 1:
             all_links.add(urls[0])
     return sorted(list(all_links))
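The same harvesting can be done without the URL regex by filtering on the scheme. A Python 3 / BeautifulSoup 4 sketch using urllib.parse:

from urllib.parse import urlparse
from bs4 import BeautifulSoup

def parse_links(source):
    all_links = set()
    for tag in BeautifulSoup(source, 'html.parser').find_all('a', href=True):
        href = tag['href']
        # Keep only absolute http(s) links; template placeholders have no scheme.
        if urlparse(href).scheme in ('http', 'https'):
            all_links.add(href)
    return sorted(all_links)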
Example #38
0
def main(argv):
	global verbose
	
	argc = len(argv)
	html = ''

	if argc and not os.path.exists(argv[0]):
		print __doc__
		return 20
	
	if argc>0:
		html = open(argv[0]).read()
	else:
		while 1:
			try:
				line = sys.stdin.readline()
			except KeyboardInterrupt:
				sys.stderr.write("Interrupted by user!\n")
				return 5
		
			if not line:
				break
			html += line

	lcount0 = html.count('\n') + (1 if html[-1:]!='\n' else 0)
	ccount0 = len(html)

	if argc>1 and os.path.exists(argv[1]):
		root, ext = os.path.splitext(argv[1])
		backup = "%s.bak" % root
		if verbose:
			print "Overwriting %s, original backedup to %s" % (argv[1], backup)
		oldhtml = open(argv[1]).read()
		open(backup,'w').write(oldhtml)

	formatted = soup(html).prettify()
	lcount1 = formatted.count('\n') + (1 if formatted[-1:]!='\n' else 0)
	ccount1 = len(formatted)
	
	if argc>1:
		open(argv[1],'w').write(formatted)
	else:
		sys.stdout.write(formatted)
		
	if verbose: 
		print "Original:   %d lines, %d characters" % (lcount0,ccount0)
		print "Prettified: %d lines, %d characters" % (lcount1,ccount1)
	return 0
Example #39
0
    def maybe_get_icon(self):
        u"""
        Get icon for the site as a QImage if we haven’t already.

        Get the site icon, either the 'rel="icon"' or the favicon, for
        the web page at url or passed in as page_html and store it as
        a QImage. This function can be called repeatedly and loads the
        icon only once.
        """
        if self.site_icon:
            return
        if not with_pyqt:
            self.site_icon = None
            return
        page_request = urllib2.Request(self.icon_url)
        if self.user_agent:
            page_request.add_header('User-agent', self.user_agent)
        page_response = urllib2.urlopen(page_request)
        if 200 != page_response.code:
            self.get_favicon()
            return
        page_soup = soup(page_response)
        try:
            icon_url = page_soup.find(name='link', attrs={'rel':
                                                          'icon'})['href']
        except (TypeError, KeyError):
            self.get_favicon()
            return
        # The url may be absolute or relative.
        if not urlparse.urlsplit(icon_url).netloc:
            icon_url = urlparse.urljoin(self.url,
                                        urllib.quote(icon_url.encode('utf-8')))
        icon_request = urllib2.Request(icon_url)
        if self.user_agent:
            icon_request.add_header('User-agent', self.user_agent)
        icon_response = urllib2.urlopen(icon_request)
        if 200 != icon_response.code:
            self.site_icon = None
            return
        self.site_icon = QImage.fromData(icon_response.read())
        max_size = QSize(self.max_icon_size, self.max_icon_size)
        icon_size = self.site_icon.size()
        if icon_size.width() > max_size.width() \
                or icon_size.height() > max_size.height():
            self.site_icon = self.site_icon.scaled(max_size,
                                                   Qt.KeepAspectRatio,
                                                   Qt.SmoothTransformation)
Example #40
0
 def parse_links(self, source):
     """
     A bit hacky: parses all href attributes from the html,
     then "finds all urls" in them.
     The second step is needed because some href attributes in templates
     can be placeholders etc., which we don't need.
     """
     all_links = set()
     for tag in soup(source).findAll("a", {"href": True}):
         val = tag.attrMap["href"]
         urls = re.findall(
             """http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""",
             val)
         if len(urls) == 1:
             all_links.add(urls[0])
     return sorted(list(all_links))
Example #41
0
def getContent(url, patterns={}, ignore_pattern=[]):
    """
    Arguments:
    url(String): target url
    patterns(dict): name -> (tag, attrs) extraction mapping
    ignore_pattern(list): [tag, attrs] whose presence marks a 404 page
    """
    ret = {}
    url_text = requests.get(url).text
    bs = soup(url_text)
    # 404 case
    if not bs.find(ignore_pattern[0], attrs=ignore_pattern[1]) == None:
        return None
    if not type(patterns) == dict:
        raise TypeError(
            "Error type of patterns, should be (name,extract_function)!")
    for p in patterns.items():
        ret[p[0]] = bs.find(p[1][0], attrs=p[1][1]).text
    return ret
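A hypothetical call to getContent(); the URL, tag names and CSS classes below are made up purely for illustration.

patterns = {
    'title': ('h1', {'class': 'article-title'}),   # hypothetical selector
    'body':  ('div', {'class': 'article-body'}),   # hypothetical selector
}
ignore = ['div', {'class': 'error-404'}]           # hypothetical 404 marker
result = getContent('http://example.com/some-page', patterns, ignore)
if result is not None:
    print(result['title'])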
Example #42
0
def grab_listings(query_url):

    raw_html = urlopen(query_url)
    parsed_html = soup(raw_html)
    postings = parsed_html('p')

    all_details = []

    for post in postings:

        new_id = str(post('a')[1]['data-id'])

        if len(all_ids) == 1:
            all_ids.append(new_id)
            return []

        if new_id in all_ids:
            break

        else:

            all_ids.append(new_id)

            details = {}

            details['id'] = new_id

            details['title'] = post('a')[1].contents[0]
            #details['time'] = post('span')[0]('time')[0]

            try:
                details['nhood'] = scrap_html(post('span')[0]('small')[0])
            except:
                details['nhood'] = 'Not listed'

            details['price'] = scrap_html(post.find('span',
                                                    {'class': 'price'}))

            all_details.append(details)

            if len(all_details) > 19:
                break

    return all_details
Example #43
0
 def buildDict(self, urlx):
     jr = jutils.getHttpJson(urlx)
     rdict = {}
     vmap = {}
     prefix = '/taxonomy/term/'
     countDataset = 0
     for item in jr:
         atag = soup(item[u'提供機關名稱']).a
         cd = int(item[u'資料集總數'])
         countDataset += cd
         vmap[atag['href'].replace(prefix,'')]={'name':atag.string,'dataset':cd}
     rdict['data'] = vmap
     rdict['meta'] = {'countTerm':len(jr),
     'countDataset':countDataset,
     'source':'http://data.gov.tw/data_usage/orgfullname/json',
       'host':'http://data.gov.tw',
       'build':'http://console.tw',
       'script':'https://github.com/y12studio/console-tw/tree/master/projects/datachart/build-org-term.py',
       'prefix':prefix,
       'time':datetime.datetime.utcnow().isoformat()}
     return rdict
Example #44
0
def filter_html(in_html):
    doc = soup(in_html)

    #recs = doc.findAll("div", { "class": "class_name"})

    # remove unwanted tags
    for div in doc.findAll('head'):
        div.extract()
    for div in doc.findAll(['i', 'h1', 'script']):
        div.extract()
    for div in doc.findAll('div', 'top'):
        div.extract()
    for div in doc.findAll('div', 'bot'):
        div.extract()
    for div in doc.findAll('div', 'line'):
        div.extract()
    for div in doc.findAll('div', 'mainmenu'):
        div.extract()
    for div in doc.findAll('div', 'banner'):
        div.extract()
    for div in doc.findAll('div', 'maintainers'):
        div.extract()

    #for div in doc.findAll('div', {'style':'clear:both;margin-left:200px;'}):
    #    div.extract()

    # remove html comments
    comments = doc.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]

    out_html = doc.body.prettify()
    #out_html = re.sub('(\\n *)','\\n',out_html)

    # a little more cleaning
    out_html = re.sub('(<dd>)\\n', '', out_html)
    out_html = re.sub('(</dd>)\\n', '', out_html)
    out_html = re.sub('<br />', '<br/>', out_html)

    return out_html
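In BeautifulSoup 4 the same cleanup is usually written with decompose() and a string filter for comments; a compact sketch of the idea behind filter_html():

from bs4 import BeautifulSoup, Comment

def strip_html(in_html, kill_tags=('head', 'script', 'i', 'h1')):
    doc = BeautifulSoup(in_html, 'html.parser')
    for tag in doc.find_all(list(kill_tags)):
        tag.decompose()                      # remove tag and its children
    for comment in doc.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()                    # drop HTML comments
    return doc.prettify()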
Example #45
0
def GPLastImg(rename=""):
    url = "http://10.5.5.9:8080/DCIM/100GOPRO/"
    page = urllib2.urlopen(url).read()
    html = soup(page)
    jpegs = []
    for href in html.findAll("a", {"class": "link"}):
        x = re.search('.*href="(.+\.JPG)".*', str(href))
        if x: jpegs.append(x.group(1))

    lastImg = jpegs[-1]
    imgUrl = url + "/" + lastImg

    import PIL
    from PIL import Image
    import urllib2 as urllib
    import io

    image_file = io.BytesIO(urllib2.urlopen(imgUrl).read())
    img = Image.open(image_file)

    img = img.resize((400, 300), PIL.Image.ANTIALIAS)
    img.save(lastImg)
    return lastImg
Example #46
0
    def child_html_parse(data, web_url):
        """
        About: This is a helper function which parses the html
               using the keys and values for the python dict
               named html_classes
        Args: data (str), web_url (str)
        Returns: None
        """
        client_info = {
            'name': None,
            'address': None,
            'phone_number': None,
            'link': None
        }
        #t=  BeautifulSoup(html).findChildren("p", { "class" : "sc-Exhibitor_PhoneFax" })
        page_soup = soup(data)
        for key in client_info.keys():
            try:
                client_info[key] = page_soup.findAll(
                    html_classes[key][1],
                    {"class": html_classes[key][0]})[0].text.encode('utf-8')
            except Exception as e:
                print "{0} key failed for link {1}".format(key, web_url)
        client_data.append(client_info)
Example #47
0
 def parse_table(self):
     docsoup = soup(self.region_page)
     table = docsoup.body.findAll('table')
     data = table[4]
     self.tr_data = data.findAll('tr')
     self.num_records = len(self.tr_data)
Example #48
0
    def _doSearch(self,
                  search_strings,
                  search_mode='eponly',
                  epcount=0,
                  age=0,
                  epObj=None):

        results = []
        items = {'Season': [], 'Episode': [], 'RSS': []}

        if not self._doLogin():
            return results

        for mode in search_strings.keys():
            logger.log(u"Search Mode: %s" % mode, logger.DEBUG)
            for search_string in search_strings[mode]:

                if mode != 'RSS':
                    searchURL = self.urls['search'] % (urllib.quote_plus(
                        search_string.replace('.', ' ')), self.categories)
                else:
                    searchURL = self.urls['rss'] % self.categories

                logger.log(u"Search URL: %s" % searchURL, logger.DEBUG)
                if mode != 'RSS':
                    logger.log(u"Search string: %s" % search_string,
                               logger.DEBUG)

                data = self.getURL(searchURL)
                if not data:
                    logger.log("No data returned from provider", logger.DEBUG)
                    continue

                html = soup(data)
                if not html:
                    logger.log("No html data parsed from provider",
                               logger.DEBUG)
                    continue

                empty = html.find('No torrents here')
                if empty:
                    logger.log(
                        u"Data returned from provider does not contain any torrents",
                        logger.DEBUG)
                    continue

                tables = html.find('table',
                                   attrs={'class': 'mainblockcontenttt'})
                if not tables:
                    logger.log(
                        u"Could not find table of torrents mainblockcontenttt",
                        logger.ERROR)
                    continue

                torrents = tables.findChildren('tr')
                if not torrents:
                    continue

                # Skip column headers
                for result in torrents[1:]:
                    try:
                        cells = result.findChildren(
                            'td',
                            attrs={
                                'class':
                                re.compile(
                                    r'(green|yellow|red|mainblockcontent)')
                            })
                        if not cells:
                            continue

                        title = download_url = seeders = leechers = None
                        size = 0
                        for cell in cells:
                            try:
                                if None is title and cell.get(
                                        'title') and cell.get(
                                            'title') in 'Download':
                                    title = re.search(
                                        'f=(.*).torrent',
                                        cell.a['href']).group(1).replace(
                                            '+', '.')
                                    download_url = self.urls['home'] % cell.a[
                                        'href']
                                if None is seeders and cell.get(
                                        'class')[0] and cell.get('class')[
                                            0] in 'green' 'yellow' 'red':
                                    seeders = int(cell.text)
                                    if not seeders:
                                        seeders = 1
                                elif None is leechers and cell.get(
                                        'class')[0] and cell.get('class')[
                                            0] in 'green' 'yellow' 'red':
                                    leechers = int(cell.text)
                                    if not leechers:
                                        seeders = 0

                                # Need size for failed downloads handling
                                if re.match(
                                        r'[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+',
                                        cells[7].text):
                                    size = self._convertSize(cells[7].text)
                                    if not size:
                                        size = -1

                            except:
                                logger.log(
                                    u"Failed parsing provider. Traceback: %s" %
                                    traceback.format_exc(), logger.ERROR)

                        if not all([title, download_url]):
                            continue

                        #Filter unseeded torrent
                        if seeders < self.minseed or leechers < self.minleech:
                            if mode != 'RSS':
                                logger.log(
                                    u"Discarding torrent because it doesn't meet the minimum seeders or leechers: {0} (S:{1} L:{2})"
                                    .format(title, seeders,
                                            leechers), logger.DEBUG)
                            continue

                        item = title, download_url, size, seeders, leechers
                        if mode != 'RSS':
                            logger.log(u"Found result: %s " % title,
                                       logger.DEBUG)

                        items[mode].append(item)

                    except (AttributeError, TypeError, KeyError, ValueError):
                        continue

            #For each search mode sort all the items by seeders if available
            items[mode].sort(key=lambda tup: tup[3], reverse=True)

            results += items[mode]

        return results
Example #49
0
    def _doSearch(self, search_params, search_mode='eponly', epcount=0, age=0, epObj=None):

        results = []

        if not self._doLogin():
            return results

        for search_string in search_params if search_params else '':
            if isinstance(search_string, unicode):
                search_string = unidecode(search_string)


            searchURL = self.urls['search'] % (urllib.quote_plus(search_string.replace('.', ' ')), self.categories)
            logger.log(u"Search string: " + searchURL, logger.DEBUG)
            data = self.getURL(searchURL)
            if not data:
                logger.log(u'No grabs for you', logger.DEBUG)
                continue

            html = soup(data)
            if not html:
                continue

            empty = html.find('No torrents here')
            if empty:
                logger.log(u"Could not find any torrents", logger.ERROR)
                continue

            tables = html.find('table', attrs={'class': 'mainblockcontenttt'})
            if not tables:
                logger.log(u"Could not find table of torrents mainblockcontenttt", logger.ERROR)
                continue

            torrents = tables.findChildren('tr')
            if not torrents:
                 continue

            # Skip column headers
            for result in torrents[1:]:
                try:
                    cells = result.findChildren('td', attrs={'class': re.compile(r'(green|yellow|red|mainblockcontent)')})
                    if not cells:
                        continue

                    title = url = seeders = leechers = None
                    size = 0
                    for cell in cells:
                        try:
                            if None is title and cell.get('title') and cell.get('title') in 'Download':
                                title = re.search('f=(.*).torrent', cell.a['href']).group(1).replace('+', '.')
                                url = self.urls['home'] % cell.a['href']
                            if None is seeders and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
                                seeders = int(cell.text)
                            elif None is leechers and cell.get('class')[0] and cell.get('class')[0] in 'green' 'yellow' 'red':
                                leechers = int(cell.text)

                            # Skip torrents released before the episode aired (fakes)
                            if re.match('..:..:..  ..:..:....', cells[6].text):
                                if (datetime.strptime(cells[6].text, '%H:%M:%S  %m/%d/%Y') -
                                    datetime.combine(epObj.airdate, datetime.min.time())).days < 0:
                                    continue

                            # Need size for failed downloads handling
                            if re.match('[0-9]+,?\.?[0-9]* [KkMmGg]+[Bb]+', cells[7].text):
                                size = self._convertSize(cells[7].text)

                            if not title or not url or not seeders or leechers is None or not size or \
                                    seeders < self.minseed or leechers < self.minleech:
                                continue

                            item = title, url, seeders, leechers, size
                            logger.log(u"Found result: " + title + " (" + searchURL + ")", logger.DEBUG)

                            results.append(item)

                        except:
                            raise

                except (AttributeError, TypeError, KeyError, ValueError):
                    continue

        results.sort(key=lambda tup: tup[3], reverse=True)
        return results
Example #50
0
# nope, doesn't work:
# coding: utf-8

# but this does:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from BeautifulSoup import BeautifulSoup as soup, Comment, Tag
import urllib

if 1:
    url = 'http://eracks.com/customers'
    print url
    s = urllib.urlopen(url).read()
    s = soup(s)

    cont = s.body

    # extract text and a few select tags:
    snip = ''
    for e in cont.recursiveChildGenerator():
        if isinstance (e, unicode) or \
           isinstance (e, Tag) and (str(e).startswith (u'<a ') or
                                    str(e).startswith (u'<b>') or
                                    str(e).startswith (u'<img')):

            s = unicode(e)
            if s.strip() or len(e):
                snip += s  # unicode(e) # .extract() nope, confuses the generator
            e.contents = []
Example #51
0
def main():

    urls = load_urls()
    state = load_state()
    i = 0
    while i < len(urls):
        url = urls[i]
        if isinstance(url, basestring):
            url = {
                'url': url,
                'type': 'html',
                'referer': url,
            }
            urls[i] = url
        i += 1
        try:
            body,hash = get(url)
        except Exception as e:
            continue

        if url['url'] not in state:
            state[url['url']] = {
                'previous_hash': None,
                'permutations': 0,
            }

        if hash != state[url['url']]['previous_hash']:
            state[url['url']]['previous_hash'] = hash
            state[url['url']]['permutations'] = state[url['url']]['permutations'] + 1
            print "Permutation %s: %s was updated on %s" % (state[url['url']]['permutations'],url['url'], time.ctime())
            store_body(url,body)
            if url['type'] == 'img':
                send_mail(config.email,'Updated image url %s' % url['url'],'Not sending the image, lolz')
                continue
            else:
                send_mail(config.email,'Updated site url %s' % url['url'],body)
            try:
                s = soup(body)
                for anchor in s('a'):
                    try:
                        new_url = {
                            'url': get_url(url['url'],anchor['href']),
                            'type':'html',
                            'referer':url['referer']
                        }
                        if new_url not in urls:
                            print "Permutation %s: Found URL in %s. New URL: %s" % (state[url['url']]['permutations'],url['url'], new_url['url'])
                            urls.append(new_url)
                            send_mail(config.email,'Found new website %s' % new_url['url'],'Hoi')
                            # Send e-mail
                    except Exception as e:
                        print e
                for img in s('img'):
                    try:
                        new_url = {
                            'url': get_url(url['url'],img['src']),
                            'type':'img',
                            'referer':url['referer']
                        }
                        if new_url not in urls:
                            print "Permutation %s: Found URL in %s. New URL: %s" % (state[url['url']]['permutations'],url['url'], new_url['url'])
                            urls.append(new_url)
                            send_mail(config.email,'Found new image %s' % new_url['url'],'Hoi')
                    except Exception as e:
                        print e
            except Exception as e:
                continue

    store_urls(urls)
    store_state(state)
import urllib2
from BeautifulSoup import BeautifulSoup as soup

teams = {'csk':"Chennai Super Kings","mi":"Mumbai Indians","kxip":"Kings XI Punjab","kkr":"Kolkata Knight Riders",\
         "rcb":"Royal Challengers Bangalore","srh":"Sunrisers Hyderabad","rr":"Rajasthan Royals","dd":"Delhi Daredevils"}
teamsAbbr = {"Chennai Super Kings":"csk","Mumbai Indians":"mi","Kings XI Punjab":"kxip","Kolkata Knight Riders":"kkr",\
         "Royal Challengers Bangalore":"rcb","Sunrisers Hyderabad":"srh","Rajasthan Royals":"rr","Delhi Daredevils":"dd"}

page = urllib2.urlopen(
    "http://www.cricbuzz.com/cricket-series/2676/indian-premier-league-2018/matches"
).read()
soupedPage = soup(page)
teamsPage = soupedPage.findAll("div", {"class": "cb-col-75 cb-col"})
#datePage = soupedPage.findAll("div", {"class": '"cb-col-25 cb-col pad10"'})
MatchesDataInOrder = []
for i in range(len(teamsPage)):
    AvsB = teamsPage[i].find("span")
    WinningTeam = teamsPage[i].find("a", {"class": "cb-text-link"})
    team1, team2 = map(
        teamsAbbr.get,
        map(str.strip, map(str, AvsB.contents[0].split(",")[0].split(" vs "))))
    if WinningTeam != None:
        winner = teamsAbbr[str(
            WinningTeam.contents[0].split("won")[0]).strip()]
    else:
        winner = "NP"
    MatchesDataInOrder.append([team1, team2, winner])
#for i in MatchesDataInOrder:
#    print i
pointsPerTeam = {}
for key in teams.keys():
def prettyhtml(value, autoescape=None):
    value = str(soup(value))
    if autoescape and not isinstance(value, SafeData):
        from django.utils.html import escape
        value = escape(value)
    return mark_safe(value)
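For context, a sketch of how a filter like prettyhtml() is typically registered in a Django app; the template-library wiring below is standard Django and bs4, not part of the snippet above.

from django import template
from django.utils.html import escape
from django.utils.safestring import SafeData, mark_safe
from bs4 import BeautifulSoup

register = template.Library()

@register.filter(needs_autoescape=True)
def prettyhtml(value, autoescape=None):
    value = str(BeautifulSoup(value, 'html.parser'))
    if autoescape and not isinstance(value, SafeData):
        value = escape(value)
    return mark_safe(value)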
Example #54
0
def get_source(link):
    r = requests.get(link)
    if r.status_code == 200:
        return soup(r.text)
    else:
        sys.exit("[~] Invalid Response Received.")
Example #55
0
import scraperwiki
from BeautifulSoup import BeautifulSoup as soup

urls = (
('event', 'http://www.leedsdigitalfestival.com/'),
('about', 'http://www.leedsdigitalfestival.com/about/'),
('news', 'http://www.leedsdigitalfestival.com/news/'),
('video', 'http://www.leedsdigitalfestival.com/video/'),
('location', 'http://www.leedsdigitalfestival.com/locations/'),
('resource', 'http://www.leedsdigitalfestival.com/resources/'),
)

data = []
for content_type, url in urls:
    s = soup(scraperwiki.scrape(url))
    page = s.find('div', 'page')
    for block in page.findAll('div', 'block'):
        if 'small_footer' in block['class'].split(' '):
            continue
        title = block.find('div', {'class': 'post-title'})
        title = title and title.h2.text
        ps = block.findAll('p')
        dt = ''
        venue = ''
        by = ''
        if hasattr(ps[0], 'span') and ps[0].span:
            dt = ps[0].span.text
            _venue = ps[0].text.replace(dt, '')
            if _venue.startswith('by '):
                by = _venue
            else:
Example #56
0
 def setUp(self):
     self.base_url = u'http://www.mddep.gouv.qc.ca/sol/terrains/terrains-contamines/recherche.asp'
     self.test_text = open('tests/fixtures/testfile.html', 'r').read()
     self.parsed_file = soup(StringIO(self.test_text))
Example #57
0
def scrape(host, number, paths, emails, wait, first):
    #setup signals and verbose flag
    signal(SIGINT, sig_handle)
    global verbose

    #check if the number of emails we requested has been met
    if len(emails) >= number:
        return True

    #get host site path
    hosturl = urlparse.urlparse(host)
    if hosturl.path in paths:
        return True
    paths.append(hosturl.path)

    #get page content
    sleep(wait)
    try:
        content = soup(urlopen(host).read())
        if verbose:
            print("Link read: {}".format(host))
    except Exception as e:
        if isinstance(e,
                      KeyboardInterrupt):  #write the file and quit if SigInt
            sig_handle(1, 2)
        else:
            if verbose:
                print("Link read failed: {}".format(host))
            return False

    #grab all email addresses from the page and then add it to the list
    pageemails = mailreg.findall(content.prettify())
    for pemail in pageemails:
        if not pemail in emails:
            print(pemail)
            emails.append(pemail)

    #scrape for URLs on the site
    links = []
    for x in content('a'):  #find all anchors
        try:
            links.append(x['href'])
        except:
            pass
    for link in links:
        plink = urlparse.urlparse(link)
        if plink.path == u'':  #if there is no path then just skip it
            continue
        if (hosturl.netloc in plink.netloc
                or plink.netloc is "") and plink.path not in paths:
            if plink.path[0] == '/':  #absolute path
                url = slashreg.sub("/", "{}/{}".format(hosturl.netloc,
                                                       plink.path))
            elif first is True:  #first page check
                url = slashreg.sub(
                    "/", "{}/{}/{}".format(hosturl.netloc, hosturl.path,
                                           plink.path))
                url = "{}://{}".format(hosturl.scheme, url)
                if (scrape(url, number, paths, emails, wait, False)):
                    continue
                else:  #we are at a full path, subtract the file name and start from the path
                    url = slashreg.sub(
                        "/", "{}/{}/{}".format(
                            hosturl.netloc,
                            "/".join(hosturl.path.split("/")[:-1]),
                            plink.path))
            else:  #all other pages, subtract filename and go from the path
                url = slashreg.sub(
                    "/",
                    "{}/{}/{}".format(hosturl.netloc,
                                      "/".join(hosturl.path.split("/")[:-1]),
                                      plink.path))
            url = "{}://{}".format(hosturl.scheme, url)
            scrape(url, number, paths, emails, wait, False)  #recursion!

    return True
Example #58
0
def get_post_params(post):
    post_updated = dateutils.from_iso_format(post['updated'])
    post_published = dateutils.from_iso_format(post['published'])
    post_id = post['id']
    permalink = post['url']
    item = post['object']

    if post['verb'] == 'post':

        content = [item['content']]

    elif post['verb'] == 'share':
        content = [post.get('annotation', 'Shared:')]

        if 'actor' in item:
            content.append('<br/><br/>')
            if 'url' in item['actor'] and 'displayName' in item['actor']:
                content.append(
                    '<a href="%s">%s</a>' %
                    (item['actor']['url'], item['actor']['displayName']))
                content.append(' originally shared this post: ')
            elif 'displayName' in item['actor']:
                content.append(item['actor']['displayName'])
                content.append(' originally shared this post: ')

        content.append('<br/><blockquote>')
        content.append(item['content'])
        content.append('</blockquote>')

    elif post['verb'] == 'checkin':
        content = [item['content']]
        place = post.get('placeName', '')
        if place:
            if item['content']:
                # Add some spacing if there's actually a comment
                content.append('<br/><br/>')
            content.append('Checked in at %s' % place)

    else:
        content = []

    if 'attachments' in item:  # attached content
        for attach in item['attachments']:

            content.append('<br/><br/>')
            if attach['objectType'] == 'article':
                # Attached link
                content.append('<a href="%s">%s</a>' %
                               (attach['url'],
                                attach.get('displayName', 'attached link')))
            elif attach['objectType'] == 'photo':
                # Attached image
                content.append('<img src="%s" alt="%s" />' %
                               (attach['image']['url'], attach['image'].get(
                                   'displayName', 'attached image'))
                               )  # G+ doesn't always supply alt text...
            elif attach['objectType'] == 'photo-album' or attach[
                    'objectType'] == 'album':
                # Attached photo album link
                content.append('Album: <a href="%s">%s</a>' %
                               (attach['url'],
                                attach.get('displayName', 'attached album')))
            elif attach['objectType'] == 'video':
                # Attached video
                content.append('Video: <a href="%s">%s</a>' %
                               (attach['url'],
                                attach.get('displayName', 'attached video')))
            else:
                # Unrecognized attachment type
                content.append('[unsupported post attachment of type "%s"]' %
                               attach['objectType'])

    # If no actual parseable content was found, just link to the post
    post_content = u''.join(content) or permalink

    # Generate the post title out of just text [max: 100 characters]
    post_title = u' '.join(x.string
                           for x in soup(post_content).findAll(text=True))
    post_title = space_compress_regex.sub(' ', post_title)
    if len(post_title) > 100:
        if post_title == permalink:
            post_title = u"A public G+ post"
        else:
            candidate_title = post_title[:97]
            if '&' in candidate_title[
                    -5:]:  # Don't risk cutting off HTML entities
                candidate_title = candidate_title.rsplit('&', 1)[0]
            if ' ' in candidate_title[
                    -5:]:  # Reasonably avoid cutting off words
                candidate_title = candidate_title.rsplit(' ', 1)[0]
            post_title = u"%s..." % candidate_title

    return {
        'title': post_title,
        'permalink': xhtml_escape(permalink),
        'postatomdate': dateutils.to_atom_format(post_updated),
        'postatompubdate': dateutils.to_atom_format(post_published),
        'postdate': post_published.strftime('%Y-%m-%d'),
        'id': xhtml_escape(post_id),
        'summary': xhtml_escape(post_content),
    }