Example #1
def index_allfiles():
	for x in file_names:
		file_data=""
		with open(x) as f:
			file_data = f.read()
		try:
			soup = BeautifulSoup(file_data)
		except UnicodeEncodeError:
			print "Soup error: " + x
			continue
		except TypeError:
			soup = BeautifulSoup(file_data.decode('utf-8','ignore'))
		if soup.title is None:
			page_title = " "
		else:
			page_title = soup.title.string
		for script in soup(["script", "style"]):
			script.extract()
		data = soup.getText(separator=u' ')
		try:
			writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
		except UnicodeDecodeError:
			print "Error in " + x
		except UnicodeEncodeError:
			print "Error in " + x
	writer.commit()
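Most of the examples on this page use the old BeautifulSoup 3 API (BeautifulSoup(html), findAll, getText). As a point of comparison, a minimal sketch of the same strip-scripts-then-extract-text step from Example #1 on bs4 and Python 3 might look like the following; the helper name html_to_text is made up for illustration, and it assumes bs4 is installed.

from bs4 import BeautifulSoup

def html_to_text(html):
    # "html.parser" is the stdlib parser; lxml would also work if installed
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()  # drop non-visible content before extracting text
    return soup.get_text(separator=u" ")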
Example #2
def get_chapters(chapter_url, fic, web_site): 
    content_tag = "div"
    content_class = {"class" : "list"}
    chapter_tag = "li"
    chapter_class = {}
    #build index
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    #get content
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        try:
            item_str = str(item)
        except:
            item_str = unicode(item).encode('utf-8')
        item_tag = BeautifulSoup(item_str)
        try:
            url = item_tag.a['href']
            if 'http' not in url and url[0] != '/':
                url = chapter_url + url
            chapter_title = item_tag.getText()
        except:
            continue
        ans.append((url, chapter_title))
    if not ans:
        fic.delete()
        return
    print '获取小说 %s 章节完毕' % fic.fiction_title  # "finished fetching chapters for novel %s"
    save_chapter(ans, fic, web_site)
    print '保存章节完毕'  # "finished saving chapters"
Example #3
def index_data():
	for x in file_array:
		file_data=""
		with open(x) as f:
			file_data = f.read()
		try:
			soup = BeautifulSoup(file_data)
		except UnicodeEncodeError:
			continue
		except TypeError:
			soup = BeautifulSoup(file_data.decode('utf-8','ignore'))
		if soup.title is None:
			page_title = " "
		else:
			page_title = soup.title.string
		for skip in soup(["script", "style"]):
			skip.extract()
		data = soup.getText(separator=u' ')
		try:
			writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
			stop_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
			stemming_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
			stemming_stop_writer.add_document(title=unicode(page_title), path=unicode(x), content=unicode(data))
		except UnicodeDecodeError:
			pass
		except UnicodeEncodeError:
			pass
	writer.commit()
	stop_writer.commit()
	stemming_writer.commit()
	stemming_stop_writer.commit()
Example #4
 def sanitize(self, data):
     soup = BeautifulSoup(data)
     for element in soup.findAll(['script', 'style']):
         element.extract()
     for comments in soup.findAll(text=lambda text: isinstance(text, Comment)):
         comments.extract()
     return html2text.html2text(soup.getText())
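The Comment class used in Example #4 comes from the BeautifulSoup package itself; a hedged bs4 sketch of the same comment-stripping idea (the function name strip_comments is made up) could be:

from bs4 import BeautifulSoup, Comment

def strip_comments(html):
    soup = BeautifulSoup(html, "html.parser")
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()  # remove the comment node from the tree
    return soup.get_text()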
Example #5
def view_feed_view(request):
    feed_requested = request.matchdict.get('feed')
    rss = DBSession.query(Flux).filter_by(id=feed_requested).first()
    rss_id = rss.id
    url = rss.text
    count = rss.count
    title = rss.title
    count = 10
    user = check_user_logged(request)
    if user is False:
        return dict(
            feed= "not logged",
            project= 'PyNews'
        )
    content = []
    try:
        feeds = feedparser.parse( url )
        title = feeds['feed']['title']
        feed_id = 0
        for items in feeds["items"]:
            text = items['summary_detail']['value']
            soup = BeautifulSoup(text, convertEntities=BeautifulSoup.HTML_ENTITIES)
            result = ""
            if count > 0:
                line = {
                    "head": items['title_detail']['value'],
                    "date": "sans date",
                    "feed": soup.getText(),
                    "link": items['link'],
                    "feed_id": count,
                }
                content.append(line)
                count -= 1
                feed_id += 1
        feed = {
            'url':url,
            'title':title,
            'count':count,
            'content': content,
            'id': rss_id,
        }
        return dict(
            feed= feed,
            title= feeds['feed']['title'],
            project= 'PyNews'
        )
    except:
        feed = {
            'url':url,
            'title':"",
            'count':"",
            'content': content,
            'id': rss_id,
        }
        return dict(
            feed= "error",
            title= "Error unable to fetch url: " + url,
            project= 'PyNews'
        )
Example #6
def LyricWikia(artist, title):
    proxy = urllib.request.getproxies()
    url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(
        artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15, proxies=proxy)
    # The API returns some badly formatted JSON data, so we need to fix it up :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)
    if returned["lyrics"] != "Not found":
        # set the url to the one we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15, proxies=proxy)
        soup = BeautifulSoup(r.text, "lxml")
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        #with old BeautifulSoup versions the following is needed; recent versions don't need it / it doesn't work there
        try:
            soup = BeautifulSoup(str(soup),
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)),
                             "lxml")
        [elem.extract() for elem in soup.findAll('script')]
        return (soup.getText())
    else:
        return ("error")
Example #7
def get_chapter_630(chapter_url, fic, web_site):
    content_tag = "div"
    content_class = {"class" : "zjbox"}
    chapter_tag = "dd"
    chapter_class = {}
    #build index
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    #get content
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        if isinstance(item, unicode):
            item_str = item.encode('utf-8')
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    if not ans:
        fic.delete()
        return
    save_chapter(ans, fic, web_site)
Example #8
def LyricWikia(artist, title):
	url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(artist=artist,
																													title=title).replace(" ","%20")
	r = requests.get(url, timeout=15)
	# The API returns some badly formatted JSON data, so we need to fix it up :/
	returned = r.text
	returned = returned.replace("\'", "\"")
	returned = returned.replace("song = ", "")
	returned = json.loads(returned)
	if returned["lyrics"] != "Not found":
		# set the url to the one we just received, and retrieve it
		r = requests.get(returned["url"], timeout=15)
		soup = BeautifulSoup(r.text)
		soup = soup.find("div", {"class": "lyricbox"})
		[elem.extract() for elem in soup.findAll('div')]
		[elem.replaceWith('\n') for elem in soup.findAll('br')]
		#with old BeautifulSoup versions the following is needed; recent versions don't need it / it doesn't work there
		try:
			soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
		except:
			pass
		soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)))
		[elem.extract() for elem in soup.findAll('script')]
		return(soup.getText())
	else:
		return()
Example #9
def init_file(file):
    dic = {
        'id': file[0],
        'name': file[1],
        'icon_link': file[2],
        'icon_path': file[3],
        'source': file[4],
        'source_link': file[5],
        'rating': file[6],
        'version': file[7],
        'developer': file[8],
        'sdk_support': file[9],
        'category': file[10],
        'screen_support': file[11],
        'apk_size': file[12],
        'language': file[13],
        'publish_date': file[14],
        'downloads': file[15],
        'description': file[16],
        'images': file[17],
        'images_path': file[18],
        'qr_link': file[19],
        'download_link': file[20],
        'last_crawl': file[21],
        'vol_id': file[22],
        'package_name': file[23],
        'version_code': file[24],
        'sig': file[25],
        'min_sdk_version': file[26],
        'is_break': file[27],
        'platform': file[28],
        'file_type': file[29],
        'package_hash': file[30],
    }
    try:
        if dic.get('description'):
            soup = BeautifulSoup(dic.get('description').decode('utf8'))
            dic['description'] = soup.getText('\n')
        else:
            dic['description'] = ''
        if dic['source'] == 'itunes.apple.com':
            if '游戏' in dic['sig']:
                if '网游' in dic['name'] or 'online' in dic['name']:
                    dic['category'] = '网络游戏'
                elif '飞机' in dic['name'] or '射击' in dic['name'] or '飞行' in dic[
                        'name']:
                    dic['category'] = '射击游戏'
                else:
                    dic['category'] = dic['sig']
            else:
                if '主题' in dic['name'] or '壁纸' in dic['name']:
                    dic['category'] = '主题美化'
                elif dic['category'] == '社交':
                    dic['category'] = random.choice(['社交一', '社交二'])
            dic['category'] = _adapt_cate_str(dic.get('category'))
    except Exception as e:
        print dic['source_link']
        print e
    return dic
Example #10
def slug(content):
    soup = BeautifulSoup(content)
    res = soup.getText()
    if len(res) > 100:
        res = res[:100]
        res += u'...'
    return res
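A short usage sketch for slug (the HTML fragments are made up): markup is dropped and anything longer than 100 characters is cut off with an ellipsis.

print slug(u'<p><b>Hello</b> world</p>')      # Hello world
print slug(u'<p>' + u'x' * 150 + u'</p>')     # first 100 characters followed by '...'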
Example #11
def _adapt_desc_str(desc_str):
    if not desc_str:
        return None
    soup = BeautifulSoup(desc_str)
    desc = soup.getText('\n')
    desc = __strip(desc)
    desc = __removeDuplicated(desc)
    desc = __cutTail(desc)
    return desc
Example #12
def init_file(file):
    dic = {'id': file[0],
           'name': file[1],
           'icon_link': file[2],
           'icon_path': file[3],
           'source': file[4],
           'source_link': file[5],
           'rating': file[6],
           'version': file[7],
           'developer': file[8],
           'sdk_support': file[9],
           'category': file[10],
           'screen_support': file[11],
           'apk_size': file[12],
           'language': file[13],
           'publish_date': file[14],
           'downloads': file[15],
           'description': file[16],
           'images': file[17],
           'images_path': file[18],
           'qr_link': file[19],
           'download_link': file[20],
           'last_crawl': file[21],
           'vol_id': file[22],
           'package_name': file[23],
           'version_code': file[24],
           'sig': file[25],
           'min_sdk_version': file[26],
           'is_break': file[27],
           'platform': file[28],
           'file_type': file[29],
           'package_hash': file[30],
           }
    try:
        if dic.get('description'):
            soup = BeautifulSoup(dic.get('description').decode('utf8'))
            dic['description'] = soup.getText('\n')
        else:
            dic['description'] = ''
        if dic['source'] == 'itunes.apple.com':
            if '游戏' in dic['sig']:
                if '网游' in dic['name'] or u'网游' in dic['description'] or 'online' in dic['name'] or u'online' in dic['description']:
                    dic['category'] = '网络游戏'
                elif '飞机' in dic['name'] or '射击' in dic['name'] or '飞行' in dic['name'] or u'飞机' in dic['description'] or u'射击' in dic['description'] or u'飞行' in dic['description']:
                    dic['category'] = '射击游戏'
                else:
                    dic['category'] = dic['sig']
            else:
                if '主题' in dic['name'] or '壁纸' in dic['name'] or u'主题' in dic['description'] or u'壁纸' in dic['description']:
                    dic['category'] = '主题美化'
                elif dic['category'] == '社交':
                    dic['category'] = random.choice(['社交一', '社交二'])
            dic['category'] = _adapt_cate_str(dic.get('category'))
    except Exception as e:
        print dic['source_link']
        print e
    return dic
def _adapt_desc_str(desc_str):
    if not desc_str:
        return None
    soup = BeautifulSoup(desc_str)
    desc = soup.getText('\n')
    desc = __strip(desc)
    desc = __removeDuplicated(desc)
    desc = __cutTail(desc)
    return desc
    def process(self, item, spider, matcher):
        if item['url']:
            item['url'] = item['url'].lower()
        if item['price'] != 'NA':
            item['price'] = utils.cleanNumberArray(item['price'], 'float')

        if item['brand']:
            temp = item['brand'][0]
            temp = matcher.dualMatch(temp)
            item['brand'] = temp
        if not item['brand']:
            logging.info(item['url'])
            raise DropItem("**** **** **** Missing brand in %s . Dropping" % item)

        if item['description']:
            temp = item['description']
            bad = BeautifulSoup(temp[0])
            item['description'] = bad.getText()

        if item['volume']:
            # volume can be a string or a list depending on the item count on a particular page
            temp = item['volume']
            if isinstance(temp, list):
                if len(temp) > 1:
                    temp = utils.getElementVolume(temp)
                    item['volume'] = temp
            else:
                item['volume'] = utils.extractVolume(item['volume'])

        if item['category']:
            tempCat = item['category']
            item['category'] = utils.cleanChars(tempCat[0])
        if item['image']:
            temp = item['image']
            temp = temp[0]
            item['image'] = temp
        if item['sku']:
            temp = item['sku']
            temp = temp[0]

        if item['comments']:
            comment_html = item['comments']
            try:
                item['comments'] = self.get_comments(comment_html, item['url'])
            except:
                exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
                logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s ' % (item['url'],
                             exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback)))

        temp = price_package(item)
        item['price'] = temp
        print ' big package: %s' % temp
        return item
Example #15
    def getKeyword(self):
        soup = BeautifulSoup(self.getContent())
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

        rmlist = ["script", "style", "img"]
        for tag in soup.findAll():
            if tag.name.lower() in rmlist:
                tag.extract()

        return "".join(soup.getText())
Example #16
 def get_short_content(self, obj):
     from BeautifulSoup import BeautifulSoup
     # soup = BeautifulSoup(obj.content)
     soup = BeautifulSoup(obj.content,
                          convertEntities=BeautifulSoup.HTML_ENTITIES)
     [s.extract() for s in soup('script')]
     data = soup.getText()
     data = re.sub(r'(\n)+|(\s)+', ' ', data)
     data = data.strip()
     return data
Example #17
 def play_byu_live(self):
     soup = BeautifulSoup(make_request(self.apiurl + 'GetLiveStreamUrl?context=Android%24US%24Release'))
     urlCode = soup.getText().strip('"')
     reqUrl = 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/Iyamk6YZTw8DxrC60h0fQipg3BfL/'+urlCode+'?device=android_3plus_sdk-hook&domain=www.ooyala.com&supportedFormats=mp4%2Cm3u8%2Cwv_hls%2Cwv_wvm2Cwv_mp4'
     data = json.loads(make_request(reqUrl))
     for stream in data['authorization_data'][urlCode]['streams']:
         url = b64decode(stream['url']['data'])
         item = xbmcgui.ListItem(path=url)
         try:
             xbmcplugin.setResolvedUrl(int(sys.argv[1]), True, item)
         except:
             continue
Example #18
def view_feed_view(request):
    feed_requested = request.matchdict.get('feed')
    rss = DBSession.query(Flux).filter_by(id=feed_requested).first()
    rss_id = rss.id
    url = rss.text
    count = rss.count
    title = rss.title
    count = 10
    user = check_user_logged(request)
    if user is False:
        return dict(feed="not logged", project='PyNews')
    content = []
    try:
        feeds = feedparser.parse(url)
        title = feeds['feed']['title']
        feed_id = 0
        for items in feeds["items"]:
            text = items['summary_detail']['value']
            soup = BeautifulSoup(text,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
            result = ""
            if count > 0:
                line = {
                    "head": items['title_detail']['value'],
                    "date": "sans date",
                    "feed": soup.getText(),
                    "link": items['link'],
                    "feed_id": count,
                }
                content.append(line)
                count -= 1
                feed_id += 1
        feed = {
            'url': url,
            'title': title,
            'count': count,
            'content': content,
            'id': rss_id,
        }
        return dict(feed=feed, title=feeds['feed']['title'], project='PyNews')
    except:
        feed = {
            'url': url,
            'title': "",
            'count': "",
            'content': content,
            'id': rss_id,
        }
        return dict(feed="error",
                    title="Error unable to fetch url: " + url,
                    project='PyNews')
Example #19
def get_one_cloud_hourly_rate(bot_hdr, website):
	soup = BeautifulSoup(website.read())

	# Using BeautifulSoup to extract the raw text from the web page, check whether the cloud we're looking at is
	# IaaS. If not, exit; otherwise, parse out the base plan price and the name of the cloud.
	#
	raw_txt = soup.getText()
	if raw_txt.find("Infrastructure as a Service") == -1 or raw_txt.find("Plan Price") == -1:
		return -1, -1
	cloud_name = website.geturl().split("/")[-1]
	cloud_hourly_rate = raw_txt.split("Plan Price")[1].split(" ")[0]
	cloud_hourly_rate = float(cloud_hourly_rate.encode("ascii").strip("$"))
	return cloud_name, cloud_hourly_rate
Example #20
    def _sanitize(self, html):
        """
        Clean html by removing tags, comments etc
        Returns: str
        """
        blacklist = ['style', 'script', '[document]', 'head', 'title', 'meta']
        soup = BeautifulSoup(html)

        for s in soup(blacklist):
            s.extract()
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        return soup.getText()
Example #21
def getReleases(page):
    latestPart = BeautifulSoup(page).findAll('div', {'class' : 'latest'})
    episodes = BeautifulSoup(str(latestPart)).findAll('div', {'class' : 'episode'})
    releases = []
    for episode in episodes:
        resolutionsblock = BeautifulSoup(str(episode)).findAll('div', {'class' : 'resolutions-block'})
        for res in resolutionsblock:
            resolutionblock = BeautifulSoup(str(res)).findAll('div', {'class' : 'linkful resolution-block'})
            allA = BeautifulSoup(str(BeautifulSoup(str(resolutionblock[len(resolutionblock) - 1])).findAll('a')))
            for a in allA:
                dLinks = BeautifulSoup(str(a))
                if (dLinks.getText() == 'Magnet'):
                    releases.append(dLinks.find('a').get('href'))
    return releases
Example #22
 def getLineItems(self, html):
     """Detects HTML list markups and returns a list of plaintext lines"""
     soup = BeautifulSoup(html)
     text = soup.getText("\n")  # will need to be updated for bs4
     if soup.findAll("ol"):
         self.markup = "ol"
     elif soup.findAll("ul"):
         self.markup = "ul"
     else:
         self.markup = "div"
     # remove empty lines:
     lines = re.sub(r"^(&nbsp;)+$", "", text,
                    flags=re.MULTILINE).splitlines()
     items = [line for line in lines if line.strip() != '']
     return items, None
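The comment in Example #22 notes that getText("\n") needs updating for bs4; there, the separator becomes a keyword argument. A minimal sketch, assuming bs4 is installed (the helper name lines_from_html is made up):

from bs4 import BeautifulSoup

def lines_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator="\n")
    return [line for line in text.splitlines() if line.strip()]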
Example #23
 def getStems(html, withUnicode=False, asString=False):
     stemmer = utils.getStems
     if withUnicode:
         stemmer = utils.getUnicodeStems
     parsed = BeautifulSoup(html)
     text = parsed.getText(" ")
     goodStems = []
     skipWords = ["is", "an", "the", "and", "but", "a", "i"]
     for stem in stemmer(text.lower().split(), skipWords=skipWords):
         if len(stem) == 1:
             continue
         goodStems.append(stem)
     result = [x for x in goodStems if x]
     if asString:
         result = ",".join(result)
     return result
Example #24
 def url2words(url):
   try:
     html = urllib2.urlopen(url).read()
   except HTTPError:
     html = ""
   #plain_text = nltk.clean_html(html).replace('\n','')  # nltk.clean_html is not implemented, so use BeautifulSoup instead
   soup = BeautifulSoup(html)
   
   #kill script tag and style tag
   for script in soup(["script","style"]):
     script.extract()
     
   plain_text = soup.getText()  # plain_text isn't working well...
   #print plain_text
   words = extract_words(plain_text)
   return words
Example #25
 def getStems(html, withUnicode=False, asString=False):
     stemmer = utils.getStems
     if withUnicode:
         stemmer = utils.getUnicodeStems
     parsed = BeautifulSoup(html)
     text = parsed.getText(" ")
     goodStems = []
     skipWords = ["is", "an", "the", "and", "but", "a", "i"]
     for stem in stemmer(text.lower().split(), skipWords=skipWords):
         if len(stem) == 1:
             continue
         goodStems.append(stem)
     result = [x for x in goodStems if x]
     if asString:
         result = ",".join(result)
     return result
Example #26
        def url2words(url):
            try:
                html = urllib2.urlopen(url).read()
            except HTTPError:
                html = ""
            #plain_text = nltk.clean_html(html).replace('\n','')  # nltk.clean_html is not implemented, so use BeautifulSoup instead
            soup = BeautifulSoup(html)

            #kill script tag and style tag
            for script in soup(["script", "style"]):
                script.extract()

            plain_text = soup.getText()  # plain_text isn't working well...
            #print plain_text
            words = extract_words(plain_text)
            return words
Example #27
	def process(self, item, spider, matcher):
		if item['url']:
			item['url'] = item['url'].lower()					
		if item['price']!= 'NA': 
			temp = item['price']
			clean = cleanNumberArray(temp, 'float')
			item['price'] = clean

		if item['description']:
			item['description'] = item['description'][0]
			soup = BeautifulSoup(item['description'])
			out = soup.getText()
			item['description'] = out		

		if item['name']:
			#temp = item['brand'][0]
			#temp = cleanChars(temp)
			brand = matcher.dualMatch(item['name'])
			item['brand'] = brand
			if not item['brand']:
				raise DropItem("******* Missing BRAND in %s . Dropping" % item['name'])
		#if item['description']:
			#temp = item['description']
			#temp = temp[0]
			#temp = cleanHtmlTags(temp)
			#item['description'] = temp 
		
		if item['category']:
			tempCat = item['category']
			item['category'] = tempCat[0]
			item['category'] = ''	
		if item['image']:
			temp = item['image'] 
			temp = temp[0]
			item['image'] = temp
		if item['volume']: 
			temp = item['name']
			item['volume'] = multiStateVolume(temp)
	
		if item['sku']: 
			temp = item['sku']
			temp = temp[0]
			item['sku'] = ''


		return item
Example #28
 def __douban(self):
     name = '豆瓣·事情'
     req = urllib2.Request(url = self.url, headers = self.__headers)
     res = urllib2.urlopen(req).read()
     subject = re.search(r'<title>([\w\W]*)</title>', res).groups()[0]
     subject = subject.strip()
     res = res.replace('<img src="', '[img]').replace('" alt=', '[/img]<span id=')
     res = res.replace('<br>', '\n')
     soup = BeautifulSoup(res)
     soup = soup.find('div', {'class': 'note', 'id': 'link-report'})
     content = soup.getText()
     content = content.replace('[/img]', '[/img]\n').replace('[img]', '\n[img]')
     h = HTMLParser.HTMLParser()
     content = h.unescape(content).encode('utf8')
     content = self.__replaceImgs(content)
     message = self.__message(content, 'http://thing.douban.com/', name)
     return subject + ' | ' + name , message
Example #29
def start_refresh():
    while True:
        apks = get_apks()
        if not apks:
            return
        report_list = []
        for apk in apks:
            try:
                if not apk[1]:
                    desc = ''
                else:
                    soup = BeautifulSoup(apk[1])
                    desc = soup.getText('\n')
                report_list.append((apk[0], desc))
            except Exception as e:
                print file
                print e
        report_status(report_list)
Example #30
 def getLineItems(self, html):
     """Detects HTML list markups and returns a list of plaintext lines"""
     if ANKI20:  # do not supply parser to avoid AttributeError
         soup = BeautifulSoup(html)
     else:
         soup = BeautifulSoup(html, "html.parser")
     text = soup.getText("\n")  # will need to be updated for bs4
     if soup.findAll("ol"):
         self.markup = "ol"
     elif soup.findAll("ul"):
         self.markup = "ul"
     else:
         self.markup = "div"
     # remove empty lines:
     lines = re.sub(r"^(&nbsp;)+$", "", text,
                    flags=re.MULTILINE).splitlines()
     items = [line for line in lines if line.strip() != ""]
     return items, None
Example #31
def main():
    if len(sys.argv) < 2:
        print 'python {0:s} <ted_url>'.format(__file__)
        return

    url = sys.argv[1]
    print url
    soup = BeautifulSoup(urlopen(url).read())
    attr_map = soup.find(id="share_and_save").attrMap
    langs = ['en', 'zh-cn']
    for lang in langs:
        subtitle_url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/html' % (attr_map['data-id'], lang)
        beautiful_soup = BeautifulSoup(urlopen(subtitle_url).read())

        subtitle = beautiful_soup.getText("\n").encode('utf-8')

        with open("%s-%s.txt" % ((attr_map['data-title']), lang), 'w') as f:
            f.write(subtitle)
Example #32
def start_refresh():
    while True:
        apks = get_apks()
        if not apks:
            return
        report_list = []
        for apk in apks:
            try:
                if not apk[1]:
                    desc = ''
                else:
                    soup = BeautifulSoup(apk[1])
                    desc = soup.getText('\n')
                report_list.append((apk[0], desc))
            except Exception as e:
                print file
                print e
        report_status(report_list)
Example #33
def has_badword(content, extra=u''):
    """
    function:检查html中是否含有关键字。
    params:
        content - 要检测的内容
        extra - 附加检查的内容
    return: True or False
    """
    keywords = [u'土巴兔', u'小兔', u'酷家乐', u'小酷', u'小乐', u'乐乐', u'kujiale', 'to8to', u'优居客', u'小优', u'youjuke', u'x团', u'X团']
    if isinstance(content, (str, unicode)):
        content = BeautifulSoup(content)
    html_text = content.getText()
    text = u'%s,%s' % (html_text, extra)
    for kw in keywords:
        if text.find(kw) > -1:
            return True
        
    return False
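A quick usage sketch for has_badword (the HTML fragments are made up for illustration):

print has_badword(u'<p>欢迎使用 to8to 平台</p>')   # True, 'to8to' is in the keyword list
print has_badword(u'<p>hello world</p>')           # False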
Example #34
    def __init__(self, hedef):
        from BeautifulSoup import BeautifulSoup
        import urllib
        text = ""
        url = hedef
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        if soup is not None:
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            text = soup.getText()
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)

        Derlem.__init__(self, text.encode("utf8").splitlines(True))
Example #35
    def __init__(self, hedef):
        from BeautifulSoup import BeautifulSoup
        import urllib
        text = ""
        url = hedef
        html = urllib.urlopen(url).read()
        soup = BeautifulSoup(html)
        if soup is not None:
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            text = soup.getText()
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines
                      for phrase in line.split("  "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)

        Derlem.__init__(self, text.encode("utf8").splitlines(True))
Example #36
def clean(document, document_type):
    if document_type == 0:
        soup = BeautifulSoup(document)
        document = soup.getText()
        document = document.replace('&#160;', ' ')
        document = document.replace('&#163;', ' ')
        document = document.replace('&#254;', ' x ')
        document = document.replace('&#253;', ' x ')
        document = document.replace('&#120;', ' x ')
        document = document.replace('&#9746;', ' x ')
        document = document.replace('&nbsp', ' ')
        document = document.replace(';', ' ')
    document = document.replace('\t', ' ')
    document = document.replace('\r\n', ' ')
    document = document.replace('\n', ' ')
    document = document.replace('\r', ' ')
    document = re.sub(' +', ' ', document)
    #document = document.lower()
    return document
Example #37
def prepare_for_markdown(string):
    soup = BeautifulSoup(string)
    for tag in soup.findAll():
        if tag.name == 'code':
            if tag.string is not None:
                tag.string = '```' + tag.string + '```'
        elif tag.name == 'a':
            if tag['href'] is not None:
                tag.replaceWith(tag['href'])
        elif tag.name == 'strong':
            if tag.string is not None:
                tag.string = '*' + tag.string + '*'

    htmlParser = HTMLParser()
    soup_text  = soup.getText('\n')
    soup_text = soup_text.replace('*', '\*')
    soup_text = soup_text.replace('_', '\_')
    plain_text = htmlParser.unescape(soup_text)
    plain_text = plain_text.replace('\n\n', '\n')
    return plain_text
Example #38
    def process(self, item, spider, matcher):
        if item['brand']:
            temp = item['brand'][0]
            temp = matcher.dualMatch(temp)
            item['brand'] = temp
        if not item['brand']:
            logging.info(item['url'])
            raise DropItem("**** **** **** Missing brand in %s . Dropping" % item)

        if item['price'] != 'NA':
            temp = item['price']
            item['price'] = utils.cleanNumberArray(temp, 'float')
        if item['description']:
            temp = item['description']
            soup = BeautifulSoup(temp[0])
            temp = soup.getText()
            if re.search(r'<xml>', temp) is None:
                item['description'] = temp
            else:
                print 'too long text going for else'
                a = soup.p.findChild('span')
                if a:
                    a = a.getText()
                    item['description'] = a
                else:
                    print 'no description extracted'

        if item['volume']:
            item['volume'] = utils.extractVolume(item['name'])
        if item['comments']:
            comment_html = item['comments']
            try:
                item['comments'] = self.get_comments(comment_html, item['url'])
            except:
                exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
                logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s ' % (item['url'],
                             exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback)))

        return item
Example #39
def get_chapter_aoye(chapter_url, fic, web_site):
    """获取xs8的某篇小说的所有的章节信息"""
    content_tag = "div"
    content_class = {"id": "detaillist"}
    chapter_tag = "li"
    chapter_class = {}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
Example #40
def get_chapter_aoye(chapter_url, fic, web_site):
    """获取xs8的某篇小说的所有的章节信息"""
    content_tag = "div"
    content_class = {"id" : "detaillist"}
    chapter_tag = "li"
    chapter_class = {}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
Example #41
    def save(self, domain_override=None,
             subject_template_name='registration/password_reset_subject.txt',
             email_template_name='registration/password_reset_email.html',
             use_https=False, token_generator=default_token_generator,
             from_email=None, request=None, html_email_template_name=None, extra_email_context={}, user=None):
        """
        Generates a one-use only link for resetting password and sends to the
        user.
        """
        from django.core.mail import send_mail
        email = self.cleaned_data["email"]
 
        if not domain_override:
            current_site = get_current_site(request)
            site_name = current_site.name
            domain = current_site.domain
        else:
            site_name = domain = domain_override
        c = {
            'email': user.email,
            'domain': domain,
            'site_name': site_name,
            'uid': urlsafe_base64_encode(force_bytes(user.pk)),
            'user': user,
            'token': token_generator.make_token(user),
            'protocol': 'https' if use_https else 'http',
            'extra' : extra_email_context,

        }
        subject = loader.render_to_string(subject_template_name, c)
        # Email subject *must not* contain newlines
        subject = ''.join(subject.splitlines())
        html_email = loader.render_to_string(email_template_name, c)
        soup = BeautifulSoup(html_email)
        email = soup.getText()

        
        send_mail(subject, email, from_email, [user.email], html_message=html_email, fail_silently=False)
Example #42
def get_chapter_longtengzw(chapter_url, fic, web_site):
    """获取龙腾中文网的某篇小说的所有的章节信息"""
    content_tag = "div"
    content_class = {"class" : "readerListShow"}
    chapter_tag = "td"
    chapter_class = {"class" : "ccss"}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            url = chapter_url + url
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
Example #43
def LyricWikia(artist,title):
	url = 'http://lyrics.wikia.com/api.php?artist={artist}&song={title}&fmt=json'.format(artist=artist,title=title).replace(" ", "%20")
	r = requests.get(url, timeout=15)
	# The API returns some badly formatted JSON data, so we need to fix it up :/
	returned = r.text
	returned = returned.replace("\'","\"")
	returned = returned.replace("song = ","")
	returned = json.loads(returned)
	if returned["lyrics"] != "Not found":
		# set the url to the one we just received, and retrieve it
		r = requests.get(returned["url"], timeout=15)
		#curl.setopt(curl.URL, str(curl_return["url"]))
		#curl.perform()
		soup = BeautifulSoup(r.text)
		soup = soup.find("div", {"class": "lyricbox"})
		[elem.extract() for elem in soup.findAll('div')]
		[elem.replaceWith('\n') for elem in soup.findAll('br')]
		soup = BeautifulSoup(str(soup),convertEntities=BeautifulSoup.HTML_ENTITIES)
		soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)','',str(soup)))
		[elem.extract() for elem in soup.findAll('script')]
		return soup.getText()
	else:
		return
Example #44
def get_chapter_longtengzw(chapter_url, fic, web_site):
    """获取龙腾中文网的某篇小说的所有的章节信息"""
    content_tag = "div"
    content_class = {"class": "readerListShow"}
    chapter_tag = "td"
    chapter_class = {"class": "ccss"}
    html_page = urllib2.urlopen(chapter_url)
    html_content = html_page.read()
    ans = []
    html_content = gzip_content(html_content)
    content = BeautifulSoup(html_content)
    out = content.findAll(content_tag, content_class)
    contents = ''.join([str(item) for item in out])
    chapters = BeautifulSoup(contents).findAll(chapter_tag, chapter_class)
    for item in chapters:
        item_str = str(item)
        item_tag = BeautifulSoup(item_str)
        if item_tag.a:
            url = item_tag.a['href']
            url = chapter_url + url
            chapter_title = item_tag.getText()
            ans.append((url, chapter_title))
    save_chapter(ans, fic, web_site)
Example #45
def main():
    # read dataset file
    f = 'dataset.csv'
    df = file_read(f)
    # check web site
    updateSites = []
    reg = []

    def _toMD5(text):
        return hashlib.md5(text).hexdigest()

    for i, row in df.iterrows():
        try:
            url = row[0]
            hash_ago = row[1]
        except:
            hash_ago = 0
        print url

        # read html
        x = AccessPage(url)
        soup = BeautifulSoup(x.html)
        soup = soup.body
        html = soup.getText().encode('ascii', errors='backslashreplace')

        hash_now = _toMD5(html)
        if hash_now != hash_ago and hash_ago != 0:
            updateSites.append(url)
        reg.append([url, hash_now])
        df = pd.DataFrame(reg)
    # recode
    file_write(f, df)
    # view page
    for i in updateSites:
        print i
        commands.getoutput('open ' + i)
Example #46
def importCity(cityname, url, package):
    if cityname == 'hamburg':
        # Only take 'open data'
        if package['type'] != 'dataset' or 'forward-reference' in package[
                'title']:
            return {}

    # There is a version of CKAN that can output private datasets, but DKAN uses this field for different purposes
    if package['private'] and cityname not in dkanCities:
        return {}

    resources = []
    formats = set()
    files = []
    # Key for the file link in the resource
    urlkeys = ['url']
    formatkey = 'format'

    if ('resources' in package):
        resources = package['resources']

    for file in resources:
        for urlkey in urlkeys:
            if (file[urlkey] not in [None, '']):
                if '://' not in file[urlkey]:
                    files.append(url + file[urlkey])
                else:
                    files.append(file[urlkey])
                break
        if formatkey in file and file[formatkey] not in [None, '']:
            format = file[formatkey]
            formats.add(format.upper())

    row = {}

    row[u'Stadt'] = cityname
    row[u'Dateibezeichnung'] = package['title']
    if 'name' in package:
        row[u'URL PARENT'] = url + '/dataset/' + package['name']
    elif 'url' in package:
        row[u'URL PARENT'] = package['url']
    else:
        row[u'URL PARENT'] = ''
    if cityname in v3cities:
        licensekey = 'license_id'
        vstellekey = 'author'
        catskey = 'groups'
        catssubkey = 'title'
        if cityname == 'berlin':
            catssubkey = 'name'
    elif cityname == 'muenchen':
        licensekey = 'license_id'
        vstellekey = 'maintainer'
        catskey = 'groups'
        catssubkey = 'title'
    elif cityname in dkanCities:
        licensekey = 'license_title'
        vstellekey = 'maintainer'
        catskey = 'tags'
        catssubkey = 'name'
    # Generate URL for the catalog page
    if 'notes' in package and package['notes'] != None:
        row[u'Beschreibung'] = package['notes']
        if cityname == 'koeln':
            soup = BeautifulSoup(row[u'Beschreibung'])
            row[u'Beschreibung'] = soup.getText('\n')
    else:
        row[u'Beschreibung'] = ''
    row[u'Zeitlicher Bezug'] = ''
    if licensekey in package and package[licensekey] != None:
        row[u'Lizenz'] = package[licensekey]
        # if not already short, try to convert
        if metautils.isopen(row[u'Lizenz']) == 'Unbekannt':
            row[u'Lizenz'] = metautils.long_license_to_short(row[u'Lizenz'])
    else:
        row[u'Lizenz'] = 'nicht bekannt'
    if vstellekey in package and package[vstellekey] != None:
        row[u'Veröffentlichende Stelle'] = package[vstellekey]
    else:
        row[u'Veröffentlichende Stelle'] = ''
        if 'extras' in package:
            print 'WARNING: No author/maintainer/publisher, checking extras'
            for extra in package['extras']:
                if extra['key'] == 'contacts':
                    print 'WARNING: No author, but amazingly there is possibly data in the contacts: ' + extra[
                        'value']
    cat_groups = metautils.setofvaluesasarray(package[catskey], catssubkey)
    if cityname != 'berlin':
        odm_cats = metautils.matchCategories(cat_groups)
    else:
        for group in cat_groups:
            odm_cats = berlin_to_odm(group)
    row[u'categories'] = odm_cats

    row[u'Format'] = formats
    row[u'files'] = files

    row['metadata'] = package

    row[u'original_metadata'] = {
        u'metadata_created': package['metadata_created'],
        u'metadata_modified': package['metadata_modified']
    }

    return row
Example #47
 def cleanup(self, msg):
     soup = BeautifulSoup(msg)
     return soup.getText().encode('utf-8').strip()
Example #48
print "Crawling Pages, please wait..."
with tqdm(total=retrieveLimit) as progress:
    for page in urlList:
        if docIDCounter > retrieveLimit:
            break  #quits crawling if retrieval limit is reached
        try:
            #---------- Page Crawler (gets words and links from each page ---------
            soup = ""
            browse.open(page)
            if page.endswith(".txt"):
                soup = browse.response().read()
            else:
                soup = BeautifulSoup(browse.response().read())  # if it can't parse, assume a binary file or 404
                soup = soup.getText()
            hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest()
            if hashTest not in duplicateDetect:
                duplicateDetect.append(hashTest)
                wordsInPage = soup.split()
                if not page.endswith(".txt"):

                    for link in browse.links():
                        tempURL = urlparse.urljoin(link.base_url, link.url)
                        #BELOW: gets rid of duplicate urls resulting from index.html/index.htm
                        if tempURL.endswith("index.html"):
                            tempURL = tempURL.replace("index.html", "")
                        elif tempURL.endswith("index.htm"):
                            tempURL = tempURL.replace("index.htm", "")

                        if tempURL not in urlList:
Example #49
 def get_text(self, html):
     s = BeautifulSoup(html)
     return s.getText()
Example #50
def buildpattern(html, debug):
    doc = {}
    docwords = {}
    structure = []
    fulltext = []
    title = ''
    attributes = {}
    x = []
    y = []
    
    # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"]
    html = re.sub(r'<\/script>', "</script>\n", html)
    html = re.sub(r'<meta ', "\n<meta ", html)
    html = re.sub(r'<\/title>', "</title>\n", html)
    html = re.sub(r'<\/div>', "</div>\n", html)
    html = re.sub(r'<\/p>', "</p>\n", html)
    html = re.sub(r'<\/li>', "</li>\n", html)
    html = re.sub(r'<\/style>', "</style>\n", html)
    html = re.sub(r'<\/dd>', "</dd>\n", html)

    htmlstrings = html.splitlines()

    if htmlstrings:
        lineID = 0
        for line in htmlstrings:
            lenstr = len(line)
            words = len(line.split())
            comas = len(line.split(","))
            dots = len(line.split("."))
            equal = len(line.split("="))
            soup = BeautifulSoup(line)
            if words:
                htmltags = []
                visiblecontent = soup.getText()
                for child in soup.recursiveChildGenerator():
                    name = getattr(child, "name", None)
                    if name is not None:
                         htmltags.append(name)
                    elif not child.isspace(): # leaf node, don't print spaces
                         donothing = 1
                matrix = {}
                visiblewords = len(visiblecontent.split())
                matrix['words'] = str(words)
                matrix['visiblewords'] = 0
                matrix['comas'] = comas
                matrix['dots'] = dots
                matrix['equal'] = equal
                matrix['html'] = line
                matrix['tags'] = str(visiblecontent)
                code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal)
                matrix['code'] = code
                if visiblewords > 10:
                    matrix['visiblewords'] = str(visiblewords)
                doc[lineID] = matrix
            lineID = lineID + 1    
        
    if debug:
        sorted(doc, key=int)
         
        #for lineID in doc:
        for lineID,item in doc.items():
        #lineID = 1003
            if lineID:
                code = item['code']
                line = str(item['html'])
                words = item['words']
                words = item['visiblewords']
                tags = item['tags']
                x.append(lineID)
                y.append(int(words))
                #print 'W' + str(words) + ' ' + line + ' ' + code
                if words:
                    print str(lineID) + ',' + code + ',' + line + '\t' + tags
    
    return (x,y,doc)
Example #51
def init_file(file):
    dic = {
        'id': file[0],
        'name': file[1],
        'icon_link': file[2],
        'icon_path': file[3],
        'source': file[4],
        'source_link': file[5],
        'rating': file[6],
        'version': file[7],
        'developer': file[8],
        'sdk_support': file[9],
        'category': file[10],
        'screen_support': file[11],
        'apk_size': file[12],
        'language': file[13],
        'publish_date': file[14],
        'downloads': file[15],
        'description': file[16],
        'images': file[17],
        'images_path': file[18],
        'qr_link': file[19],
        'download_link': file[20],
        'last_crawl': file[21],
        'vol_id': file[22],
        'package_name': file[23],
        'version_code': file[24],
        'sig': file[25],
        'min_sdk_version': file[26],
        'is_break': file[27],
        'platform': file[28],
        'file_type': file[29],
        'package_hash': file[30],
    }
    try:
        if dic['source'] == 'nduoa.com':
            dic['downloads'] = dic['downloads'].replace(u',', '')
            dic['apk_size'] = get_apk_size(dic.get('apk_size'))
        elif dic['source'] == 'hiapk.com':
            dic['apk_size'] = get_apk_size(dic.get('apk_size'))
        elif dic['source'] == 'goapk.com':
            if dic.get('downloads') and u'\u5927\u5c0f\uff1a' in dic[
                    'downloads'].decode('utf8'):
                dic['apk_size'] = dic['downloads'].decode('utf8')
                dic['downloads'] = 0
            if dic.get('category') and u'\u5927\u5c0f\uff1a' in dic[
                    'category'].decode('utf8'):
                dic['apk_size'] = dic['category'].decode('utf8')
                dic['category'] = ''
            if dic.get('category'
                       ) and u'\u7c7b\u522b' in dic['category'].decode('utf8'):
                dic['category'] = dic['category'].split(':')[1]
            dic['version'] = get_version(dic['version'])
            dic['apk_size'] = get_apk_size(dic.get('apk_size'))
        elif dic['source'] == 'appchina.com':
            dic['apk_size'] = get_apk_size(dic.get('apk_size'))
        elif dic['source'] == 'mumayi.com':
            if dic.get('apk_size'
                       ) and u'\u672a\u77e5' in dic['apk_size'].decode('utf8'):
                dic['apk_size'] = 0
            dic['apk_size'] = get_apk_size(dic.get('apk_size'))
        elif dic['source'] == 'as.baidu.com':
            dic['developer'] = None
        if dic.get('description'):
            soup = BeautifulSoup(dic.get('description').decode('utf8'))
            dic['description'] = soup.getText('\n')
        else:
            dic['description'] = ''
        dic['rating'] = get_raing(dic.get('rating'))
        dic['category'] = _adapt_cate_str(dic.get('category'))
    except Exception as e:
        print dic['source_link']
        print e
    return dic
Example #52
def unrenderhtml(html):
    soup = BeautifulSoup(html)
    return soup.getText('\n')
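Usage sketch for unrenderhtml: with '\n' as the separator, the visible text of each element ends up on its own line.

print unrenderhtml('<ul><li>first</li><li>second</li></ul>')
# first
# second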
Example #53
def buildpattern(html, debug):
    doc = {}
    docwords = {}
    structure = []
    fulltext = []
    title = ''
    attributes = {}
    x = []
    y = []
    
    # HTMLDELIM = ["</title>", "</div>", "</script>", "</p>", "</li>", "</html>"]
    html = re.sub(r'<script', "\n<script", html)
    html = re.sub(r'<style', "\n<style", html)
    html = re.sub(r'<\/script>', "\n</script>\n", html)
    html = re.sub(r'<meta ', "\n<meta ", html)
    html = re.sub(r'<\/title>', "</title>\n", html)
    html = re.sub(r'<\/div>', "</div>\n", html)
    html = re.sub(r'<\/p>', "</p>\n", html)
    html = re.sub(r'<\/li>', "</li>\n", html)
    html = re.sub(r'<\/style>', "\n</style>\n", html)
    html = re.sub(r'<\/dd>', "</dd>\n", html)

    htmlstrings = html.splitlines()

    if htmlstrings:
        lineID = 0
        active = 1
        for line in htmlstrings:
            lenstr = len(line)
            words = len(line.split())
            comas = len(line.split(","))
            dots = len(line.split("."))
            equal = len(line.split("="))
            soup = BeautifulSoup(line)
            
            if words:
                htmltags = []
                visiblecontent = soup.getText()
                for child in soup.recursiveChildGenerator():
                    name = getattr(child, "name", None)
                    if name is not None:
                         htmltags.append(name)
                    elif not child.isspace(): # leaf node, don't print spaces
                         donothing = 1
                matrix = {}
                visiblewords = len(visiblecontent.split())
                
                openignore = re.match(r'<style|<script', line)
                closeignore = re.match(r'<\/style|<\/script', line)
                urlstatus = re.findall(r'<a', line)
                timeflag = re.findall('([0-9]+:[0-9]+)', line)
                if openignore:
                    active = 0 
                            
                matrix['words'] = str(words)
                matrix['visiblewords'] = 0
                matrix['comas'] = comas
                matrix['dots'] = dots
                matrix['equal'] = equal
                matrix['html'] = line
                matrix['status'] = 'active'
                if timeflag:
                    matrix['timeflag'] = str(timeflag)
                else:
                    matrix['timeflag'] = ''
                matrix['tags'] = str(visiblecontent)
                if urlstatus:
                    matrix['urlstatus'] = 1
                else:
                    matrix['urlstatus'] = 0
                code = 'W' + str(visiblewords) + ',C' + str(comas) + ',D' + str(dots) + ',E' + str(equal) + ',U' + str(matrix['urlstatus']) + 'T' + matrix['timeflag']
                matrix['code'] = code
                if visiblewords > 0:
                    matrix['visiblewords'] = str(visiblewords)
                if active == 0:
                    matrix['visiblewords'] = 0
                    matrix['status'] = 'ignored'
                if visiblewords <= 1:
                    matrix['status'] = 'ignored'
                doc[lineID] = matrix

                if closeignore:
                    active = 1

            lineID = lineID + 1    
        
    if debug:
        sorted(doc, key=int)
        
        #for lineID in doc:
        for lineID,item in doc.items():
            line = str(item['html'])
            openignore = re.match(r'<style|<script', line)
            closeignore = re.match(r'<\/style|<\/script', line)
        #lineID = 1003
            if lineID:
                code = item['code']
                words = item['words']
                words = item['visiblewords']
                tags = item['tags']
                status = item['status']
                x.append(lineID)
                y.append(int(words))
                if status == 'active':
                    outstr = str(lineID) + ',' + code + ',' + line + '\t' + tags                    
                    #print outstr + '\n'
                    f.write(outstr + '\n') # python will convert \n to os.linesep

    return (x,y,doc)
Example #54
#Use print to debug
            #print filename
#writes results in .csv file
#toread contains the exact file path
            towrite = filename+"\\"+file+".csv"
            toread = targetpath+"\\"+file+"\\html"+"\\"+file+".htm"
            #Here we use toread to read the htm file in the given path
#I am using BeautifulSoup to read the htm files because it lets us work with the html tags directly, which makes writing a parser quite easy
            with codecs.open(toread, encoding='utf-8', errors='replace') as f:
                t = f.read().encode('utf-8')
#x variable has potential to access data in specific tags because we have used beautifulsoup
                x = BeautifulSoup(t)
#So, before writing a parser it is necessary to study a few of the documents you want to parse.
#Then you will get an idea of the common points in each document, and by using those common points you won't lose any data
#Here I am treating "Page" as a checkpoint, so it looks for "Page" in the extracted text.
                checkpoint = x.getText().find("Page")
#If it is true then it goes inside the if loop
                if checkpoint != -1:
#Here we split on the term "Page"; split produces a list, and we take the element at index 1
                    x = x.getText().split("Page",1)[1]
#endpoint is nothing but getting the last index of the text
                    endpoint = len(x)-1
#So, in the startpoint it finds ('----------')
                    startpoint =  x.find('----------')
                else:
#If there is no term called "Page" in the document you are trying to parse, then it directly tries to find ('----------') and gets the
#end point of the text
                    x = x.getText().encode('utf-8')
                    endpoint = len(x)-1
                    startpoint =  x.find('----------')
Example #55
print "Found " + str(len(links)) + " links!"

final_links = []
print "Searching links for matches..."

for i in range(len(links)):
    print str(i)
    link = links[i]
    if (not re.search("\.html$", link)
            and not re.search("/[^\.]*$", link, re.IGNORECASE)):
        continue
    # Try up to ten times to get the page; if every attempt fails, move on
    f = None
    for i in range(10):
        try:
            f = urllib2.urlopen(link, timeout=1)
            break
        except urllib2.URLError:
            pass
        except socket.timeout:
            pass
    if f == None:
        continue
    soup = BeautifulSoup(f)
    current_text = soup.getText()
    if (re.search(regex, current_text)):
        final_links.append(link)

for final_link in final_links:
    print final_link
Example #56
def cleantags(html):
    soup = BeautifulSoup(html)
    return soup.getText(separator=u' ')  #(soup.text)
Example #57
    def process(self, item, spider, matcher):
        if item['url']:
            item['url'] = item['url'].lower()
        if item['sku']:
            item['sku'] = utils.cleanSkuArray(item['sku'], 'string')
        if item['price'] != 'NA':
            temp = item['price']
            if len(temp) > 1:
                volarray = []
                parray = []
                for entry in temp:  # each entry is one raw price/volume dict string
                    if re.search(r'true', entry):
                        entry = entry.replace("'disponivel': true,", "")
                        dic = ast.literal_eval(entry)
                        price = dic['preco_promo']
                        volume = dic['descricao']
                        parray.append(price)
                        volarray.append(volume)
                    else:
                        entry = entry.replace("'disponivel': false,", "")
                        dic = ast.literal_eval(entry)
                        price = dic['preco_promo']
                        volume = dic['descricao']
                        parray.append(price)
                        volarray.append(volume)
                item['price'] = utils.cleanNumberArray(parray, 'float')
                item['volume'] = volarray
                print 'BELEZA MULTI PASS'
                print item['price']
                print item['volume']
            else:
                item['price'] = utils.cleanNumberArray(item['price'], 'float')

        if item['description']:
            temp = item['description']
            soup = BeautifulSoup(temp[0])
            text = soup.getText()
            item['description'] = text

        if item['brand']:
            tempBrand = item['brand']
            tempBrand = tempBrand[0]
            tempBrand = utils.extractBrand(tempBrand)
            tempBrand = utils.cleanChars(tempBrand)
            item['brand'] = tempBrand
        if item['volume']:
            # first check if the volume array exists (if not, getElementVolume returns empty and we see if the name contains volume information)
            print 'PIPELINE INPUT volume is %s' % item['volume']

            temp = item['volume']
            if isinstance(temp, list):
                length = len(temp)
                print "multi value volume %s" % temp

                item['volume'] = utils.getElementVolume(temp)
            else:
                print 'NON multi volume field %s' % item['volume']

        if item['category']:
            tempCat = item['category']
            item['category'] = tempCat[0]
        if item['image']:
            temp = item['image']
            temp = temp[0]
            item['image'] = temp
        if item['comments']:
            comment_html = item['comments']
            try:
                item['comments'] = self.get_comments(comment_html, item['url'])
            except:
                exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
                logger.error('Error getting comments %s , Exception information: %s, %s, Stack trace: %s ' % (item['url'],
                             exceptionType, exceptionValue, traceback.extract_tb(exceptionTraceback)))

        return item
Example #58
 def get_description(self, obj):
     from BeautifulSoup import BeautifulSoup
     soup = BeautifulSoup(obj.content)
     [s.extract() for s in soup('script')]
     return soup.getText()[:200]