Example #1
def countWords(c):
    # Tally word occurrences across all listings; assumes re, BeautifulSoup
    # and a STOP_WORDS set are available in the enclosing module.
    c.execute("SELECT title, content FROM listing")

    listings = c.fetchall()
    for title, content in listings:
        title = title + ' '  # pad so the title doesn't run into the content
        all_content = BeautifulSoup(title + content).getText()
        all_content = all_content.replace('\n', ' ')
        # drop everything between the START CLTAGS / END CLTAGS markers
        all_content = re.sub(r'START CLTAGS.*END CLTAGS', '', all_content)
        tokens = set(token.lower() for token in all_content.split())
        tokens -= STOP_WORDS
        for word in tokens:
            c.execute("INSERT INTO words (word, counter) "
                      "VALUES (%s, 1) "
                      "ON DUPLICATE KEY UPDATE counter = counter + 1", (word,))
Example #2
def crawlPost(link_id):
    exceptions = ["ipTRACKERonline.com"]
    core_url = "http://forum.419eater.com/forum/"
    page_name = "viewtopic.php"
    forum_args = "?t="
    response = None
    try:
        post = urllib2.urlopen(core_url + page_name + forum_args + link_id)
        soup = BeautifulSoup(post, convertEntities=BeautifulSoup.HTML_ENTITIES)
        try:  # hating BeautifulSoup
            response = soup.findAll("td", {"class": "postbody"})[1]
            response = response.renderContents()
            for elem in exceptions:
                if elem in response:
                    return None
            response = response.replace("<br />\n<br />\n", "<magic>\n")
            response = response.replace("<br />\n", "")
            response = response.replace("<magic>\n", "<br />\n")
            response = BeautifulSoup(response)
            response = response.findAll(text=True)
            response = "".join(response)
            response = response.encode("ascii", "ignore")
            response = response.replace("\r\n ", "\r\n")
            response = response.replace("\n\n  Quote:   ", "")
        except Exception:
            return None
    except Exception:
        print "Failing on:", core_url + page_name + forum_args + link_id
    return response  # WARNING: Unicode
Example #3
def cleanhtml(raw):
    """Strip tags and normalize whitespace and punctuation in an HTML fragment."""
    cleanr = re.compile('<.*?>|\\n')
    raw = re.sub(cleanr, ' ', raw)
    raw = BeautifulSoup(raw).getText()
    # fold curly quotes down to their ASCII equivalents
    raw = raw.replace(u"\u2018", "'").replace(u"\u2019", "'")
    raw = raw.replace(u"\u201c", '"').replace(u"\u201d", '"')
    raw = raw.replace(' .', '.').replace(' ,', ',')
    raw = ' '.join(raw.split())
    return raw
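A quick check of what the function produces (illustrative input):

print cleanhtml(u'<p>Hello ,\nworld .</p>')  # -> u'Hello, world.'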
Example #4
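Both parse_item callbacks below fill Scrapy items whose fields can be read off the assignments; a declaration along these lines is assumed (a sketch, not the projects' actual definitions; the CpuItem used in the second callback would additionally declare clock and core):

import scrapy

class GpuItem(scrapy.Item):
    link = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    other_name = scrapy.Field()
    g3d_mark = scrapy.Field()
    rank = scrapy.Field()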
    def parse_item(self, response):

        item = GpuItem()
        item['link'] = response.url
        item['name'] = response.css('span.cpuname').xpath(
            'text()').extract()[0]

        search = {
            'description': u'Description:',
            #'processzor_modell': u'Videocard Category:',
            'other_name': u'Other names:',
            #'memoria_merete': u'Videocard First Benchmarked:',
            'g3d_mark': u'G3DMark/$Price:',  # plain substring match; no regex escaping
            #'memoria_max_seb':u'Overall Rank:',
            #'memoria_foglalat':u'Last Price Change:',
        }

        rank = u'Samples:'

        for sel in response.css("table.desc tr")[1].xpath('td'):
            text = sel.extract()
            textSplit = text.split(u'<span style="font-weight: bold;">')
            for ii in textSplit:
                cleantext = BeautifulSoup(ii).text
                for si in search:
                    if search[si] in cleantext:
                        item[si] = cleantext.replace(search[si], "")
                if rank in cleantext:
                    item['rank'] = cleantext.split(rank)[0]

        yield item
Example #5
    def parse_item(self, response):

        item = CpuItem()
        item['link'] = response.url
        item['name'] = response.css('span.cpuname').xpath('text()').extract()[0]

        search = {
            'description': u'Description:',
            'other_name': u'Other names:',
            'g3d_mark': u'G3DMark/$Price:',  # plain substring match; no regex escaping
            'clock': u'Clockspeed:',
            'core': u'No of Cores:'
        }

        rank = u'Samples:'

        for sel in response.css("table.desc tr")[1].xpath('td'):
            text = sel.extract()
            textSplit = text.split(u'<span style="font-weight: bold;">')
            for ii in textSplit:
                cleantext = BeautifulSoup(ii).text
                for si in search:
                    if search[si] in cleantext:
                        item[si] = cleantext.replace(search[si], "")
                if rank in cleantext:
                    item['rank'] = cleantext.split(rank)[0]

        yield item
Example #6
def validate_urls(urls, template_dir):
    """Validates a list of urls using the online W3C validator, saving results to html file"""
    print 'Validating urls now...'
    with open(path.join(template_dir, 'validation-results.html'), 'w') as outfile:
        for url in urls:
            print url
            html = requests.post('http://validator.w3.org/check', {'uri': url})
            time.sleep(1)  # be polite to the W3C service
            results = BeautifulSoup(html.text).find('div', {'id': 'result'})
            if not results:
                continue
            del results['id']
            results['class'] = 'results'
            for img in results.findAll('img'):
                if img['alt'] not in ('Error', 'Warning', 'Info'):
                    img.extract()
            for element_with_id in results.findAll(True, {'id': True}):
                del element_with_id['id']
            for p in results.findAll('p', {'class': re.compile('helpwanted|backtop')}):
                p.extract()
            results = str(results)
            results = results.replace('images/info_icons/', '{{ STATIC_URL }}django_w3c_validator/')
            if not re.search('Congratulations', results):
                outfile.write('<div class="results-wrapper">\n')
                outfile.write('<h1>' + url + '</h1>\n')
                outfile.write(results)
                outfile.write('</div>\n')
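A hedged invocation sketch; it assumes requests, time, re, os.path (as path) and BeautifulSoup are imported, and that template_dir exists:

validate_urls(['http://example.com/', 'http://example.com/about/'],
              '/tmp/templates')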
Example #7
def ParseTTV(url):
    # parse once, strip <br /> tags from the serialized markup, then re-parse
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    soup = str(soup)
    soup = soup.replace("<br />", "")
    soup = BeautifulSoup(soup)
    return soup.find("div", {"class": "contents"}).findAll(
        "div", {"class": "bangumi_top"})
Example #8
    def parse_payment_message(self, body):
        soup = BeautifulSoup(body)  # parse once instead of re-parsing per lookup
        # Russian markers: u'Деньги успешно зачислены' = "money credited
        # successfully", u'Перевод от другого пользователя' = "transfer from
        # another user"
        if soup.find(text=u'Деньги успешно зачислены') or \
                soup.find(text=u'Перевод от другого пользователя'):
            # u'Сумма' = "Amount", u'Комментарий' = "Comment"
            amount = soup.find(text=u'Сумма').previous.previous.nextSibling.contents[0].text
            payer = soup.find(text=u'Комментарий').previous.previous.nextSibling.contents[0].text

            return BaseTransaction(None, None, amount.replace(u'&nbsp;', ''), payer, None, 'YM')

        return None
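A minimal call sketch; mail_body and the owning parser object are hypothetical stand-ins:

tx = parser.parse_payment_message(mail_body)
if tx is not None:
    print "payment parsed:", tx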
Example #9
def getEpsLegendados(url):
    link = openURL(url)
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    eps = soup.findAll("div", {"class": "well well-sm"})

    plotE = re.findall(r'<span itemprop="description">\s*(.*?)</span>', link,
                       re.DOTALL | re.MULTILINE)[0]
    plotE = unicode(
        BeautifulStoneSoup(
            plotE,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)).encode('utf-8')
    plotE = BeautifulSoup(plotE.replace("<br>", " ")).text

    totE = len(eps)

    try:
        # pagination links: 'Voltar' = back, 'Primeiro' = first,
        # 'Avançar' = forward, 'Último' = last
        anterior = re.findall('href="(.*?)">Voltar</a></li>', link)[0]
        primeira = re.findall('href="(.*?)">Primeiro</a></li>', link)[0]
        proxima = re.findall('href="(.*?)">Avançar</a></li>', link)[0]
        pa = re.findall('([0-9]+?)$', anterior)[0]
        pd = re.findall('([0-9]+?)$', primeira)[0]
        pp = re.findall('([0-9]+?)$', proxima)[0]
        if pp != '2':
            addDir('. Primeira Página', base + primeira, 31,
                   artfolder + 'pagantr.jpg')
            addDir('<< Página Anterior ' + pa, base + anterior, 31,
                   artfolder + 'pagantr.jpg')
    except Exception:
        pass

    for ep in eps:
        try:
            titE = ep.img["title"].encode('ascii', 'ignore')
            urlE = base + ep.a["href"]
            if ep.a.img.has_key("src"):
                imgE = ep.a.img["src"]
            else:
                imgE = ep.a.img["data-cfsrc"]  # Cloudflare-deferred image source
            addDir(titE, urlE, 100, imgE, False, totE, plotE)
        except Exception:
            pass

    try:
        ultima = re.findall('href="(.*?)">Último</a></li>', link)[0]
        pu = re.findall('([0-9]+?)$', ultima)[0]
        if pu != '1':
            addDir('Página Seguinte ' + pp + ' >>', base + proxima, 31,
                   artfolder + 'proxpag.jpg')
            addDir('Última Página ' + pu + ' >>', base + ultima, 31,
                   artfolder + 'proxpag.jpg')
    except Exception:
        pass
Example #10
    def add_newsitem(self, entry):
        """ Add news item
        """
        title = entry.get('title', '')
        title = title.replace('&nbsp;', ' ').strip()

        description = BeautifulSoup(entry.get('summary', ''))
        description = ''.join([e for e in description.recursiveChildGenerator()
                        if isinstance(e, unicode)]).strip()

        ptool = getToolByName(self.context, 'portal_properties')
        sanitize = getattr(ptool, 'sanitize', None)
        if sanitize:
            title_sanitize = sanitize.getProperty('subject', [])
            for expr in title_sanitize:
                title = title.replace(expr, '')
            desc_sanitize = sanitize.getProperty('body', [])
            for expr in desc_sanitize:
                description = description.replace(expr, '')

        body = description

        utils = getUtility(IText)
        description = utils.truncate(description, 20, 200)

        if not (title and description):
            return None

        url = entry.get('link', '#').strip()

        updated = entry.get('updated', None)
        if not updated:
            updated = datetime.now(bucharest)
        else:
            try:
                updated = parseDatetimetz(updated)
            except SyntaxError:
                updated = parseDatetimetz(updated.replace(' ', 'T', 1))
            except Exception:
                updated = datetime.now(bucharest)

            # Skip news older than 10 days
            plone_ro = 'plone.ro' in url
            if not plone_ro:
                try:
                    if updated < (datetime.now() - timedelta(10)):
                        return None
                except TypeError:
                    if updated < (datetime.now(bucharest) - timedelta(10)):
                        return None
                except Exception, err:
                    logger.exception(err)
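A hedged usage sketch: add_newsitem only calls entry.get(...), so a plain dict mimicking a feedparser entry is enough (all values invented; 'view' stands in for the enclosing browser view):

entry = {
    'title': 'PloneConf&nbsp;2013',
    'summary': '<p>Talks and <b>sprints</b>.</p>',
    'link': 'http://example.com/news/ploneconf',
    'updated': '2013-10-02 12:00:00+02:00',
}
view.add_newsitem(entry)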
Example #11
    def parse_payment_message(self, body):
        soup = BeautifulSoup(body)  # parse once instead of re-parsing per lookup
        # Russian markers: u'Деньги успешно зачислены' = "money credited
        # successfully", u'Перевод от другого пользователя' = "transfer from
        # another user"
        if soup.find(text=u'Деньги успешно зачислены') or \
                soup.find(text=u'Перевод от другого пользователя'):
            # u'Сумма' = "Amount", u'Комментарий' = "Comment"
            amount = soup.find(
                text=u'Сумма').previous.previous.nextSibling.contents[0].text
            payer = soup.find(
                text=u'Комментарий'
            ).previous.previous.nextSibling.contents[0].text

            return BaseTransaction(None, None, amount.replace(u'&nbsp;', ''),
                                   payer, None, 'YM')

        return None
Example #12
	def scrapeShowEpisodes(self, html, params = {}):
		get = params.get
		if self.__dbg__:
			print self.__plugin__ + " scrapeShowEpisodes"
		
		page = int(get("page", "0"))
		per_page = (10, 15, 20, 25, 30, 40, 50)[int(self.__settings__.getSetting("perpage"))]
		
		oldVideos = self.__settings__.getSetting("show_" + get("show") + "_season_" + get("season","0") )
		
		if ( page == 0 or not oldVideos):
			videos = re.compile('<a href="/watch\?v=(.*)&amp;feature=sh_e_sl&amp;list=SL"').findall(html)
			
			list = SoupStrainer(name="div", attrs = {'class':"show-more-ctrl"})
			nexturl = BeautifulSoup(html, parseOnlyThese=list)
			if (len(nexturl) > 0):
				nexturl = nexturl.find(name="div", attrs = {'class':"button-container"})
				if (nexturl.button):
					nexturl = nexturl.button["data-next-url"]
				else:
					nexturl = ""
			
			if nexturl.find("start=") > 0:
				fetch = True
				start = 20
				nexturl = nexturl.replace("start=20", "start=%s")
				while fetch:
					url = self.urls["main"] + nexturl % start
					html = self._fetchPage(url)
					
					if html:
						html = html.replace("\\u0026","&")
						html = html.replace("\\/","/")
						html = html.replace('\\"','"')
						html = html.replace("\\u003c","<")
						html = html.replace("\\u003e",">")
						more_videos = re.compile('data-video-ids="([^"]*)"').findall(html)
						
						if not more_videos:
							fetch = False
						else:
							videos += more_videos
							start += 20
			if self.__dbg__:
				print self.__plugin__ + "found " + str(len(videos)) + " videos: " + repr(videos)
			
			self.__settings__.setSetting("show_" + get("show") + "_season_" + get("season","0"), self.core.arrayToPipe(videos))
		else:
			videos = oldVideos.split("|")
		
		if per_page * (page + 1) < len(videos):
			has_next = 'true'
		else:
			has_next = 'false'
		
		subitems = videos[(per_page * page):(per_page * (page + 1))]
		
		( ytobjects, status ) = self.core._get_batch_details(subitems)

		if len(ytobjects) > 0:
			ytobjects[len(ytobjects) - 1]['next'] = has_next
		
		return (ytobjects, status)
Example #13
    def _text(self, value):
        value = BeautifulSoup(value.strip()).text
        return value.replace('&nbsp;', ' ')
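The helper spelled out on a literal: BeautifulSoup 3 keeps the &nbsp; entity verbatim in .text, which is why the manual replace is needed (illustrative values):

value = u' <b>Total:</b>&nbsp;42 '
print BeautifulSoup(value.strip()).text.replace('&nbsp;', ' ')  # u'Total: 42'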
Example #14
class Book(Item):
    def __init__(self, cf_title, bid, author):
        super(Book, self).__init__(cf_title)
        self.id = bid
        self.author = author

    def generate_title_candidates(self):
        """ generate title candidates for books"""
        for c in '{}[]\n.':
            self.cf_title = self.cf_title.replace(c, '')
        self.cf_title = self.cf_title.split(':')[0]
        self.cf_title = self.cf_title.split('(')[0]
        if len(self.cf_title) > 1:
            if self.cf_title[0] != self.cf_title[0].upper() or \
                    self.cf_title[1] != self.cf_title[1].lower():
                self.cf_title = self.cf_title[0].upper() +\
                    self.cf_title[1:].lower()
        ce = BeautifulSoup.HTML_ENTITIES
        self.cf_title = BeautifulSoup(self.cf_title, convertEntities=ce)
        self.cf_title = self.cf_title.contents[0]
        self.cf_title = self.cf_title.replace('reg;', '')
        self.cf_title = self.cf_title.replace(';', '')
        self.cf_title = self.cf_title.replace('(R)', '')
        self.cf_title = self.cf_title.replace('(r)', '')
        keys = {self.cf_title.strip()}

        # handle prefix/suffix swaps, e.g., "Haine, La"
        prefixes = {'The', 'A', 'An', 'La', 'Le', 'Les', 'Die', 'Das', 'Der',
                    'Ein', 'Il', "L'", 'Lo', 'I', 'El', 'Los', 'Las', 'O'}
        new_keys = set()
        for k in keys:
            parts = k.split(' ')
            if len(parts) > 1 and parts[0].strip() in prefixes:
                new_keys.add(' '.join(parts[1:]))
        keys |= new_keys

        # add "The" to the beginning, if it is not already there
        new_keys = set()
        for k in keys:
            p = k.split(' ')[0]
            if p not in prefixes:
                new_keys.add('The ' + k)
        keys |= new_keys

        # adapt capitalization to the Wikipedia Manual of Style
        # (this is only a heuristic)
        new_keys = set()
        minuscules = {'a', 'an', 'the', 'and', 'but', 'or', 'nor', 'for',
                      'yet', 'of', 'to', 'in', 'on', 'with'}

        for k in keys:
            parts = k.split(' ')
            parts = [p for p in parts if p]
            parts_new = [parts[0]]
            for p in parts[1:]:
                if p.lower() not in minuscules:
                    parts_new.append(p[0].upper() + p[1:])
                else:
                    parts_new.append(p)
            new_keys.add(' '.join(parts_new))
        keys |= new_keys

        author_last = self.author.rsplit(' ', 1)[-1]
        book = [k + ' (' + author_last + ' book)' for k in keys]
        booka = [k + ' (book)' for k in keys]
        novel = [k + ' (novel)' for k in keys]
        novela = [k + ' (' + author_last + ' novel)' for k in keys]
        keys.update(set(book), set(novel), set(booka), set(novela))
        self.title_candidates = {k: '' for k in keys}

    def select_title(self):
        """ select the title among the candidates
        and check if it's actually a book
        """
        super(Book, self).select_title(['books', 'novels', 'plays'])

        # sanity check - is this really a relevant article?
        if self.wikipedia_text:
            regex = re.compile(r'\[\[Category:([^#\|\]]+)', flags=re.IGNORECASE)
            data = self.title_candidates[self.wikipedia_title]
            categories = ' '.join(regex.findall(data))
            occurrences = categories.lower().count('books')
            occurrences += categories.lower().count('novels')
            occurrences += categories.lower().count('plays')
            occurrences += categories.lower().count('short story')
            if not occurrences:
                self.wikipedia_text = ''
                print('did not pass sanity check')
            if self.author.split()[-1].lower() not in self.wikipedia_text.lower():
                if DEBUG:
                    pdb.set_trace()
                self.wikipedia_text = ''
                print('author not in text')
            del self.title_candidates

    def obtain_categories(self):
        """scrape book categories from Google"""
        # sleep in-between to not get banned for too frequent requests
        if DEBUG:
            t = 1
        else:
            t = random.randint(10, 19)
        print('sleeping for', t, 'seconds')
        time.sleep(t)
        title = urllib.quote(urllib.unquote(self.wikipedia_title.encode()))
        query = '"' + title.replace('_', '+') + '"+' + 'genre'
        url = u"https://www.google.com/search?hl=en&biw=1195&bih=918" +\
              u"&sclient=psy-ab&q=" + query + u"&btnG=&oq=&gs_l=&pbx=1"
        try:
            request = urllib2.Request(url)
            # choose a random user agent
            ua = random.choice(Item.url_headers)
            request.add_header('User-agent', ua)
            data = Item.url_opener.open(request).read()
            data = data.decode('utf-8')
            if self.author.split()[-1].lower() not in data.lower():  # sanity check
                self.wikipedia_text = ''
                return []
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print('!+!+!+!+!+!+!+!+ URLLIB ERROR !+!+!+!+!+!+!+!+')
            print('URLError', e)
            pdb.set_trace()

        rexes = [
            # r'<span class="kno-a-v">([^</]+)',
            #  r'<span class="answer_slist_item_title nonrich">([^</]+)',
            #  r'<span class="answer_slist_item_title">([^</]+)',
            r'Genres\s*(?:</span>)?(?:</a>)?:\s*(?:</span>)?\s*<span class="[-\_\sa-zA-Z]+">([^</]+)',
            r'Genre</td><td(?:[^</]*)>([^</]+)',
            r'Genre</th></tr><td(?:[^</]*)>([^</]+)',
        ]
        re_cat = re.compile('|'.join(rexes))
        cats = [e for g in re.findall(re_cat, data) for e in g if e]
        # cats = [g for g in re.findall(re_cat, data) if g]
        print(self.wikipedia_title)
        print(cats)
        if DEBUG:
            pdb.set_trace()
        cats = list(set(cats))
        if not cats:  # sanity check
            self.wikipedia_text = ''
        return cats

    def write_to_database(self, db_file):
        super(Book, self).write_to_database('books', db_file)
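A rough driver for the candidate generator (title and author invented; Item, DEBUG and the module imports are assumed to exist as in the class above):

book = Book(u"The Hitchhiker's Guide to the Galaxy (novel)", 42,
            u'Douglas Adams')
book.generate_title_candidates()
print(sorted(book.title_candidates)[:3])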
Example #15
def get_basic_info(html_info):

    """
    Fetch the game's basic information.
    :param html_info: raw HTML of the detail page
    :return: dict
    """
    result = {}

    html = html_info

    if html:
        screenshot_url = ""
        developers = ""
        category = ""
        pname = ""
        download = ""
        install_num = ""
        version = ""
        size = 0
        api_level = ""
        icon_url = ""

        soup = BeautifulSoup(html)
        display_name = soup.find("h1", {"class": "app-name"})
        introduction = soup.find("div", {"class": "brief-long"})
        short_desc = soup.find("span", {"class": "head-content"})
        app_tags = soup.find("div", {"class": "nav"})
        screenshot_info = soup.find("div", {"class": "section-body"})
        star_percent = soup.find("span", {"class": "star-percent"})
        params_download_num = soup.find("span", {"class": "download-num"})
        params_platform = soup.find("span", {"class": "params-platform"})
        data_info = soup.find("a", {"class": "inst-btn-big highspeed"})

        if data_info:
            download = data_info["data_url"]
            data_size = data_info["data_size"]
            data_ver_name = data_info["data_versionname"]
            # data_ver_code = data_info["data_versioncode"]
            data_pkg_name = data_info["data_package"]
            icon_url = data_info["data_icon"]

            size = data_size or 0
            version = data_ver_name
            # version_code = data_ver_code or 0
            pname = data_pkg_name

        # display name
        if display_name:
            display_name = display_name.text

        # introduction / description
        if introduction:
            # print introduction
            introduction = str(introduction).replace("<br />", "$##$")
            introduction = BeautifulSoup(introduction)
            introduction = introduction.text.replace("$##$", "\n")
            introduction = introduction.replace(u"收起", "")

        # tag / category info
        if app_tags:
            tags_text = app_tags.text
            if tags_text:
                category = tags_text.split("&gt;")[1]

        # screenshot URLs
        if screenshot_info:
            for img in screenshot_info.findAll('img'):
                src = img['src']
                if src:
                    screenshot_url += src + "\n"

        # download URL, version, install count and size
        # if download_info:
        #     download = download_info["href"]
        if params_download_num:
            install_num = params_download_num.text
        if params_platform:
            api_level = params_platform.text

        # rating
        if star_percent:
            star_percent = star_percent.get("style")

        # short description
        if short_desc:
            short_desc = short_desc.text

        result["display_name"] = display_name
        result["introduction"] = introduction
        result["screenshot_url"] = screenshot_url
        result["developers"] = developers
        result["category"] = category
        result["icon_url"] = icon_url
        result["pkg_name"] = pname
        result["version"] = version
        result["url1"] = download
        result["language"] = ""
        result["version_code"] = 0
        result["install_num"] = utils.format_install_num(install_num)
        result["size"] = size
        result["min_sdk_version"] = utils.format_android_level(api_level)
        result["star_num"] = utils.format_star_num(star_percent)
        result["short_desc"] = short_desc or ""

    return result
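An end-to-end sketch; the URL is a placeholder, and urllib2 plus the module's utils helpers are assumed to be imported:

html = urllib2.urlopen('http://example.com/app/detail?docid=123').read()
info = get_basic_info(html)
print info['display_name'], info['version'], info['install_num']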
Example #16
def get_basic_info(html_info):
    """
    Fetch the game's basic information.
    :param html_info: raw HTML of the detail page
    :return: dict
    """
    result = {}

    html = html_info

    if html:
        soup = BeautifulSoup(html)
        display_name = soup.find(id="app-name")
        introduction = soup.find(id="html-brief")
        app_tags = soup.find("div", {"class": "app-tags"})
        icon_url = soup.find("dt")
        short_desc = soup.find("dl", {"class": "clearfix"})

        screenshot_url = ""
        developers = ""
        category = ""
        pname = ""
        download = ""
        install_num = ""
        language = ""
        version_name = ""
        version_code = 0
        size = 0
        star_num = 0
        min_sdk_version = 0

        # display name
        if display_name:
            display_name = display_name.find("span").text

        # if no introduction was found, try the second page layout
        if not introduction:
            introduction = soup.find("div", {"class": "infors"})

        # introduction, screenshots, developer and category
        if introduction:

            # screenshot URLs
            scroll = introduction.find(id="scrollbar")
            desc = introduction.find("div", {"class": "breif"})
            if scroll:
                imgs = scroll["data-snaps"]
                for img in imgs.split(","):
                    if img and "icon.png" not in img:
                        screenshot_url += img + "\n"
            else:
                for img in introduction.findAll("img"):
                    src = img['src']
                    if src and "icon.png" not in src:
                        screenshot_url += src + "\n"

            if desc:
                desc_str = str(desc).replace("<br />", "$##$")
                desc_str = desc_str.replace("</td>", "$##$</td>")
                desc_str = BeautifulSoup(desc_str)
                introduction = desc_str.text.replace("$##$", "\n").replace(
                    "&nbsp;", " ")
                introduction = introduction.replace("versioncode",
                                                    "\nversioncode")
                introduction = introduction.replace("updatetime",
                                                    "\nupdatetime")
            else:
                introduction = str(introduction).replace("<br />", "$##$")
                introduction = introduction.replace("</td>", "$##$</td>")
                introduction = introduction.replace("</p>", "$##$</p>")
                introduction = BeautifulSoup(introduction)
                introduction = introduction.text.replace("$##$", "\n").replace(
                    "&nbsp;", " ")
                introduction = introduction.replace("versioncode",
                                                    "\nversioncode")
                introduction = introduction.replace("updatetime",
                                                    "\nupdatetime")

        # package name and download URL
        script_infos = soup.findAll('script')
        for script in script_infos:
            pkg_infos = script.text
            if "var detail = (function () {" in pkg_infos:
                search = re.search(r"return {([\s\S]*)};", pkg_infos,
                                   re.M | re.I)
                # print search
                if search:
                    data = search.group().replace("return", "").replace(
                        ";", "").replace("'", "\"")
                    pkg = json.loads(data)
                    pname = pkg['pname']
                    download = pkg['downloadUrl']
                    version_code = pkg['vcode']

        # language, version, install count and size
        pf = soup.find("div", {"class": "pf"})
        basic_info = soup.find("div", {"class": "base-info"})

        if pf:
            pf_s3 = pf.findAll("span", {"class": "s-3"})  # 下载量
            star = pf.find("span", {"class": "s-1 js-votepanel"})  # 评分
            if len(pf_s3) == 2:
                install_num = pf_s3[0].text
                size = pf_s3[1].text
                install_num = install_num.replace(u"下载:", "").replace(u"次", "")
            if star:
                star_num = star.text

        if basic_info:
            infos = basic_info.findAll("td")
            for info in infos:
                text = info.text
                # print text
                if u"作者:" in text:
                    developers = text.replace(u"作者:", "")
                if u"语言" in text:
                    language = text.replace(u"语言:", "")
                if u"版本" in text:
                    search = text.split("versioncode")
                    if len(search) > 0:
                        version_name = search[0].replace(u"版本:", "")
                if u"系统:" in text:
                    min_sdk_version = utils.format_android_level(text)

        # tag / category info
        if app_tags:
            tags = app_tags.findAll("a")
            for tag in tags:
                if "360" in tag.text:
                    continue
                category += tag.text + "\n"

        if icon_url:
            icon_url = icon_url.find("img")
            icon_url = icon_url["src"]

        # short description
        if short_desc:
            short_desc = short_desc.find("p")
            if short_desc:
                short_desc = short_desc.text
                short_desc = short_desc.replace(u"【小编点评】", "")

        result["display_name"] = display_name
        result["introduction"] = introduction
        result["screenshot_url"] = screenshot_url
        result["developers"] = developers
        result["category"] = category
        result["icon_url"] = icon_url
        result["pkg_name"] = pname
        result["version"] = version_name
        result["url1"] = download
        result["language"] = language
        result["version_code"] = version_code
        result["install_num"] = utils.format_install_num(install_num)
        result["size"] = utils.format_file_size(size)
        result["star_num"] = utils.format_star_num(star_num)
        result["min_sdk_version"] = min_sdk_version
        result["short_desc"] = short_desc or ""

    return result
Example #17
def get_basic_info(html_info):

    """
    Fetch the game's basic information.
    :param html_info: raw HTML of the detail page
    :return: dict
    """
    result = {}

    html = html_info

    if html:
        soup = BeautifulSoup(html)
        display_name = soup.find(id="app-name")
        introduction = soup.find(id="html-brief")
        app_tags = soup.find("div", {"class": "app-tags"})
        icon_url = soup.find("dt")
        short_desc = soup.find("dl", {"class": "clearfix"})

        screenshot_url = ""
        developers = ""
        category = ""
        pname = ""
        download = ""
        install_num = ""
        language = ""
        version_name = ""
        version_code = 0
        size = 0
        star_num = 0
        min_sdk_version = 0

        # display name
        if display_name:
            display_name = display_name.find("span").text

        # if no introduction was found, try the second page layout
        if not introduction:
            introduction = soup.find("div", {"class": "infors"})

        # introduction, screenshots, developer and category
        if introduction:

            # screenshot URLs
            scroll = introduction.find(id="scrollbar")
            desc = introduction.find("div", {"class": "breif"})
            if scroll:
                imgs = scroll["data-snaps"]
                for img in imgs.split(","):
                    if img and "icon.png" not in img:
                        screenshot_url += img + "\n"
            else:
                for img in introduction.findAll("img"):
                    src = img['src']
                    if src and "icon.png" not in src:
                        screenshot_url += src + "\n"

            if desc:
                desc_str = str(desc).replace("<br />", "$##$")
                desc_str = desc_str.replace("</td>", "$##$</td>")
                desc_str = BeautifulSoup(desc_str)
                introduction = desc_str.text.replace("$##$", "\n").replace("&nbsp;", " ")
                introduction = introduction.replace("versioncode", "\nversioncode")
                introduction = introduction.replace("updatetime", "\nupdatetime")
            else:
                introduction = str(introduction).replace("<br />", "$##$")
                introduction = introduction.replace("</td>", "$##$</td>")
                introduction = introduction.replace("</p>", "$##$</p>")
                introduction = BeautifulSoup(introduction)
                introduction = introduction.text.replace("$##$", "\n").replace("&nbsp;", " ")
                introduction = introduction.replace("versioncode", "\nversioncode")
                introduction = introduction.replace("updatetime", "\nupdatetime")

        # package name and download URL
        script_infos = soup.findAll('script')
        for script in script_infos:
            pkg_infos = script.text
            if "var detail = (function () {" in pkg_infos:
                search = re.search(r"return {([\s\S]*)};", pkg_infos, re.M | re.I)
                # print search
                if search:
                    data = search.group().replace("return", "").replace(";", "").replace("'", "\"")
                    pkg = json.loads(data)
                    pname = pkg['pname']
                    download = pkg['downloadUrl']
                    version_code = pkg['vcode']

        # language, version, install count and size
        pf = soup.find("div", {"class": "pf"})
        basic_info = soup.find("div", {"class": "base-info"})

        if pf:
            pf_s3 = pf.findAll("span", {"class": "s-3"})  # 下载量
            star = pf.find("span", {"class": "s-1 js-votepanel"})  # 评分
            if len(pf_s3) == 2:
                install_num = pf_s3[0].text
                size = pf_s3[1].text
                install_num = install_num.replace(u"下载:", "").replace(u"次", "")
            if star:
                star_num = star.text

        if basic_info:
            infos = basic_info.findAll("td")
            for info in infos:
                text = info.text
                # print text
                if u"作者:" in text:
                    developers = text.replace(u"作者:", "")
                if u"语言" in text:
                    language = text.replace(u"语言:", "")
                if u"版本" in text:
                    search = text.split("versioncode")
                    if len(search) > 0:
                        version_name = search[0].replace(u"版本:", "")
                if u"系统:" in text:
                    min_sdk_version = utils.format_android_level(text)

        # tag / category info
        if app_tags:
            tags = app_tags.findAll("a")
            for tag in tags:
                if "360" in tag.text:
                    continue
                category += tag.text + "\n"

        if icon_url:
            icon_url = icon_url.find("img")
            icon_url = icon_url["src"]

        # short description
        if short_desc:
            short_desc = short_desc.find("p")
            if short_desc:
                short_desc = short_desc.text
                short_desc = short_desc.replace(u"【小编点评】", "")

        result["display_name"] = display_name
        result["introduction"] = introduction
        result["screenshot_url"] = screenshot_url
        result["developers"] = developers
        result["category"] = category
        result["icon_url"] = icon_url
        result["pkg_name"] = pname
        result["version"] = version_name
        result["url1"] = download
        result["language"] = language
        result["version_code"] = version_code
        result["install_num"] = utils.format_install_num(install_num)
        result["size"] = utils.format_file_size(size)
        result["star_num"] = utils.format_star_num(star_num)
        result["min_sdk_version"] = min_sdk_version
        result["short_desc"] = short_desc or ""

    return result
Example #18
    def _text(self, value):
        value = BeautifulSoup(value.strip()).text
        return value.replace('&nbsp;', ' ')
Example #19
def get_basic_info(html_info):
    """
    Fetch the game's basic information.
    :param html_info: raw HTML of the detail page
    :return: dict
    """
    result = {}

    html = html_info

    if html:
        screenshot_url = ""
        developers = ""
        category = ""
        pname = ""
        download = ""
        install_num = ""
        version = ""
        size = 0
        api_level = ""
        icon_url = ""

        soup = BeautifulSoup(html)
        display_name = soup.find("h1", {"class": "app-name"})
        introduction = soup.find("div", {"class": "brief-long"})
        short_desc = soup.find("span", {"class": "head-content"})
        app_tags = soup.find("div", {"class": "nav"})
        screenshot_info = soup.find("div", {"class": "section-body"})
        star_percent = soup.find("span", {"class": "star-percent"})
        params_download_num = soup.find("span", {"class": "download-num"})
        params_platform = soup.find("span", {"class": "params-platform"})
        data_info = soup.find("a", {"class": "inst-btn-big highspeed"})

        if data_info:
            download = data_info["data_url"]
            data_size = data_info["data_size"]
            data_ver_name = data_info["data_versionname"]
            # data_ver_code = data_info["data_versioncode"]
            data_pkg_name = data_info["data_package"]
            icon_url = data_info["data_icon"]

            size = data_size or 0
            version = data_ver_name
            # version_code = data_ver_code or 0
            pname = data_pkg_name

        # display name
        if display_name:
            display_name = display_name.text

        # introduction / description
        if introduction:
            # print introduction
            introduction = str(introduction).replace("<br />", "$##$")
            introduction = BeautifulSoup(introduction)
            introduction = introduction.text.replace("$##$", "\n")
            introduction = introduction.replace(u"收起", "")

        # tag / category info
        if app_tags:
            tags_text = app_tags.text
            if tags_text:
                category = tags_text.split("&gt;")[1]

        # screenshot URLs
        if screenshot_info:
            for img in screenshot_info.findAll('img'):
                src = img['src']
                if src:
                    screenshot_url += src + "\n"

        # download URL, version, install count and size
        # if download_info:
        #     download = download_info["href"]
        if params_download_num:
            install_num = params_download_num.text
        if params_platform:
            api_level = params_platform.text

        # rating
        if star_percent:
            star_percent = star_percent.get("style")

        # short description
        if short_desc:
            short_desc = short_desc.text

        result["display_name"] = display_name
        result["introduction"] = introduction
        result["screenshot_url"] = screenshot_url
        result["developers"] = developers
        result["category"] = category
        result["icon_url"] = icon_url
        result["pkg_name"] = pname
        result["version"] = version
        result["url1"] = download
        result["language"] = ""
        result["version_code"] = 0
        result["install_num"] = utils.format_install_num(install_num)
        result["size"] = size
        result["min_sdk_version"] = utils.format_android_level(api_level)
        result["star_num"] = utils.format_star_num(star_percent)
        result["short_desc"] = short_desc or ""

    return result
Example #20
    def _get_title(self, url):
        # fetch the page title, map CR/LF/tab to spaces, then collapse runs
        rv = BeautifulSoup(urllib2.urlopen(url)).title.string
        rv = reduce(lambda v, w: v.replace(w, " "), "\r\n\t", rv)
        while "  " in rv:
            rv = rv.replace("  ", " ")
        return rv
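The reduce trick isolated: it folds over the three characters of "\r\n\t", replacing each with a space, before the loop collapses runs (values invented):

rv = u'Some\r\n\ttitle  with\tnoise'
rv = reduce(lambda v, w: v.replace(w, " "), "\r\n\t", rv)
while "  " in rv:
    rv = rv.replace("  ", " ")
print rv  # u'Some title with noise'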