def whiteScrape(webaddress):
     url = ('file:///Users/user/Documents/programming/1-978-251-1362.html')
     ourUrl = opener.open(url).read()
     
     soup = BeautifulSoup(ourUrl)
     soup = soup.find("div", { "class" : "address-card" })
     #the code below strips all tags except <br/>, protecting them with a '^' placeholder
     soup = str(soup)
     soup = soup.replace('<br/>' , '^')
     soup = BeautifulSoup(soup)
     soup = (soup.get_text())
     soup = str(soup)
     soup=soup.replace('^' , '<br/>')
     
     return soup
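The helper above relies on an opener object (e.g. built with urllib) that is defined elsewhere in its module and is not shown. A self-contained Python 3 sketch of the same trick, stripping every tag except <br/> by hiding <br/> behind a placeholder, with a made-up address card as input:

from bs4 import BeautifulSoup

def strip_tags_keep_br(html):
    # protect <br/> with a placeholder, strip all other tags, then restore it
    card = BeautifulSoup(html, "html.parser").find("div", {"class": "address-card"})
    text = str(card).replace("<br/>", "^")
    text = BeautifulSoup(text, "html.parser").get_text()
    return text.replace("^", "<br/>")

sample = '<div class="address-card">Jane Doe<br/>123 Main St<br/>Boston, MA</div>'
print(strip_tags_keep_br(sample))  # Jane Doe<br/>123 Main St<br/>Boston, MA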
def cleanUpText(review):
    #this is by no means exhaustive
    punctuation = """.,?!:;(){}[]"""
    #remove html tags
    review_text = BeautifulSoup(review).get_text()
    #replace '\n' with ''
    review_text = review_text.replace('\n', '')
    #treat each punctuation mark as an individual word
    for c in punctuation:
        review_text = review_text.replace(c," %s "%c)

    return review_text.split()
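A minimal, self-contained usage sketch of the cleanUpText() pattern above (strip HTML, then pad punctuation so each mark becomes its own token); the sample review string is made up:

from bs4 import BeautifulSoup

def clean_up_text(review, punctuation=".,?!:;(){}[]"):
    # strip HTML tags, drop newlines, then pad punctuation so split()
    # returns each mark as a separate token
    text = BeautifulSoup(review, "html.parser").get_text().replace("\n", "")
    for c in punctuation:
        text = text.replace(c, " %s " % c)
    return text.split()

print(clean_up_text("<p>Great product, would buy again!</p>"))
# ['Great', 'product', ',', 'would', 'buy', 'again', '!']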
def pages():
    url = "https://kat.cr/usearch/%s%s%s" %(search, category, filetype)
    print "\n The url is: ", url, "\n"
    page = requests.get(url).content
    textblock = BeautifulSoup(page)
    textblock = textblock.find("div").h2
    textblock = textblock.find("span")
    textblock = str(textblock)
    textblock = textblock.replace("<span>  results 1-25 from ","")
    textblock = textblock.replace("</span>","")

    torrentamount = int(textblock)
    pageamount = (torrentamount/25) + 1
    return pageamount
Example #4
def extract_form_declaration(source):
    text = "PORTLET:YAML:"
    if text in source:
        soup = BeautifulSoup(source)
        soup = soup.find(text=re.compile(text))
        soup = soup.replace(text, "").strip()
        return yaml.load(soup)
    text = "PORTLET:HELPER:"
    if text in source:
        soup = BeautifulSoup(source)
        soup = soup.find(text=re.compile(text))
        soup = soup.replace(text, "").strip()
        return get_helper_declaration(soup)

    return {}
Example #5
def preprocessor(tweet):

    emo_repl_order = const.emo_repl_order
    emo_repl = const.emo_repl
    re_repl = const.re_repl

    tweet = BeautifulSoup(tweet).get_text()
    tweet = tweet.lower()
    for k in emo_repl_order:
        tweet = tweet.replace(k, emo_repl[k])
    tweet=tweet.replace('\'s ','').replace("-", " ").replace("_", " ").replace('"','').replace(".",'').\
        replace(',','').replace(';','').strip()
    for r, repl in re_repl.items():
        tweet = re.sub(r, repl, tweet)
    return tweet
def preprocessor(row_review):
    global emoticons_replaced
    data=BeautifulSoup(row_review).get_text()
    data = data.lower()
    for k in count.emo_repl_order:
        data = data.replace(k, count.emo_repl[k])
    for r, repl in count.re_repl.iteritems():
        data = re.sub(r, repl, data)
    data = data.replace('\'s ','')
    data = re.sub("[^a-z]"," ",data)
    newdata =" ".join(data.split())

    # english_stemmer = nltk.stem.SnowballStemmer('english')
    # newdata = " ".join([english_stemmer.stem(w) for w in data.split()])

    return newdata
def sanitize_detail(detail):
    replacements = [
        ('\t', ''),
        ('<br/>', '\n'),
        ('\n', '<br/>'),
        (':', '\b'),
        ('\b', ':'),
        ('\r', ''),
        ('////', '<br/>')
    ]

    reg_replacements = [
        (r'^:', ''),
        (r']$', ''),
        (r'(<br\/>)*$', ''),
        (r'^(<br\/>)*', ''),
        (r'\s{2,}', ''),
        (r'(<br\/>)*$', '')
    ]

    detail_text = detail['details'].replace('<br/>', '////')
    detail_text = BeautifulSoup(detail_text, "html.parser").text
    detail['title'] = detail['title'].replace(':', '').strip()

    for r in replacements:
        detail_text = detail_text.replace(r[0], r[1]).strip()

    for r in reg_replacements:
        detail_text = re.sub(r[0], r[1], detail_text).strip()

    detail['details'] = detail_text
    return detail
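A quick usage sketch for sanitize_detail(); the input dict is hypothetical, and the snippet assumes the function above plus its re and BeautifulSoup imports are in scope:

detail = {
    "title": "Opening hours:",
    "details": "Mon-Fri: 9am-5pm<br/><br/>\tSat: closed\r",
}
print(sanitize_detail(detail))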
    def format(self, article, subscriber):
        """
        Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        """
        try:
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)

            odbc_item = {'Sequence': pub_seq_num, 'Category': article.get('anpa_category', [{}])[0].get('qcode'),
                         'Headline': article.get('headline', '').replace('\'', '\'\''),
                         'Priority': map_priority(article.get('priority'))}

            body = self.append_body_footer(article)
            if article.get(EMBARGO):
                embargo = '{}{}'.format('Embargo Content. Timestamp: ', article.get(EMBARGO).isoformat())
                body = embargo + body

            if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                body = BeautifulSoup(body, "html.parser").text

            odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
            odbc_item['ident'] = '0'

            return [(pub_seq_num, odbc_item)]
        except Exception as ex:
            raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def get_solution(url):
	#url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40440099&rd=16747&pm=14278'
	
	#url = 'https://community.topcoder.com/stat?c=problem_solution&rm=329103&rd=16775&pm=14340&cr=23089515'

	#url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40364957&rd=16747&pm=14278'

	print url

	#tcsso = 'b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'

	#cookies = dict()
	#cookies['tcsso'] = '40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
	#'40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'

	#cookies['JSESSIONID'] = 'UYKd7Rv1-OY-6bmewBWJDw**.tomcat_tc01'

	print cookies

	page = requests.get(url, cookies=cookies)
	#print page
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url, cookies=cookies)
	html_content = page.text

	#print html_content[0:100000]

	#soup = BeautifulSoup(html_content, "html.parser")

	#text = soup.select("body > table > tbody > tr > td.bodyText > table.paddingTable > tbody > tr:nth-child(1) > td > table:nth-child(4) > tbody > tr:nth-child(13) > td")

	body = re.findall('<TD CLASS="problemText" COLSPAN="8" VALIGN="middle" class="alignMiddle" ALIGN="left">\n            (.+?)<BR>\n        </TD>', html_content, flags=re.S)

	text = body[0]

	text = text.replace("<BR>","\n")

	#print w

	#print repr(text)
	print text

	failed_to_download = None
	solution = None


	if len(text)==0:
		failed_to_download = solution_id
	else:
		body = BeautifulSoup(str(text), "html.parser").get_text()

		body = body.replace("\\","\\\\")
		solution = body.encode('utf-8').decode('string-escape')

		#print repr(solution)
		#print solution

	return solution
def html_prettify(html):
    """Prettify HTML main function."""
    log.info("Prettify HTML...")
    html = BeautifulSoup(html).prettify()
    html = html.replace("\t", "    ").strip() + "\n"
    log.info("Finished prettify HTML !.")
    return html
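A self-contained sketch of the same prettify-and-reindent step, with an explicit parser and a made-up input fragment:

from bs4 import BeautifulSoup

raw = "<div><p>hello<br/>world</p></div>"
pretty = BeautifulSoup(raw, "html.parser").prettify()
pretty = pretty.replace("\t", "    ").strip() + "\n"
print(pretty)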
    def format(self, article, subscriber, codes=None):
        """
        Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        """
        try:
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            sms_message = article.get('sms_message', article.get('abstract', '')).replace('\'', '\'\'')

            # category = 1 is used to indicate a test message
            category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
                else article.get('anpa_category', [{}])[0].get('qcode').upper()

            odbc_item = {'Sequence': pub_seq_num, 'Category': category,
                         'Headline': BeautifulSoup(sms_message, 'html.parser').text,
                         'Priority': map_priority(article.get('priority'))}

            body = self.append_body_footer(article)
            if article.get(EMBARGO):
                embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                        get_utc_schedule(article, EMBARGO).isoformat())
                body = embargo + body

            if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                body = BeautifulSoup(body, "html.parser").text

            odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
            odbc_item['ident'] = '0'

            return [(pub_seq_num, json.dumps(odbc_item))]
        except Exception as ex:
            raise FormatterError.AAPSMSFormatterError(ex, subscriber)
Example #12
    def debug(self, message):
        '''
        Utility method for debugging. Make sure
        settings.TEST_DEBUG is defined and set to
        True. When used, self.debug_buffer will contain
        concatenated debug messages.
        '''
        if (not hasattr(settings, 'TEST_DEBUG')) or (
            not settings.TEST_DEBUG
        ):
            return
        if not hasattr(self, 'debug_buffer'):
            self.debug_buffer = ''
        try:
            message = BeautifulSoup(message).body.get_text()
        except:
            pass

        while '\n\n' in message:
            message = message.replace('\n\n', '\n')
            
        self.debug_buffer += (
             message +
            '\n------------------------------\n'
        )
Example #13
 def Encode(self,text):
     text=BeautifulSoup(text).get_text()
     try:
         l=re.findall(r'&#(.*?);',text)
         for sub in l:
             try :
                 a='&#'+sub+';'
                 bc=int(sub)
                 text = text.replace(a,unichr(bc))
             except :
                 pass
     except:
         text=text
     # tag = False
     # quote = False
     # out = ""
     # for c in text:
     #     if c == '<' and not quote:
     #         tag = True
     #     elif c == '>' and not quote:
     #         tag = False
     #     elif (c == '"' or c == "'") and tag:
     #         quote = not quote
     #     elif not tag:
     #         out = out + c
     return text
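The loop above decodes leftover numeric entities such as &#8217; one at a time; on Python 3 the standard library's html.unescape covers the same case, as in this sketch:

import html
from bs4 import BeautifulSoup

def decode_entities(text):
    # strip tags, then decode any surviving numeric or named entities
    text = BeautifulSoup(text, "html.parser").get_text()
    return html.unescape(text)

print(decode_entities("It&#8217;s <b>fine</b>"))  # It's fine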
def get_solution(contest, solution_id):
	url = 'http://codeforces.com/contest/' + str(contest[0]) + '/submission/' + str(solution_id)
	
	print url

	page = requests.get(url)
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url)
	html_content = page.text

	#print html_content

	soup = BeautifulSoup(html_content, "html.parser")

	text = soup.select("body > div > div > div > div > pre")

	failed_to_download = None
	solution = None


	if len(text)==0:
		failed_to_download = solution_id
	else:
		body = BeautifulSoup(str(text[0]), "html.parser").get_text()

		body = body.replace("\\","\\\\")
		solution = body.encode('utf-8').decode('string-escape')

	return solution_id, solution, failed_to_download
def play_video(video_url = _common.args.url):
	stack_url = 'stack://'
	hbitrate = -1
	sbitrate = int(_addoncompat.get_setting('quality')) * 1024
	closedcaption = None
	video_data = _connection.getURL(video_url)
	video_tree = BeautifulSoup(video_data, 'html.parser')
	video_segments = video_tree.find_all('segment')
	for video_segment in video_segments:
		seg_url = VIDEOINFO % video_segment['id']
		seg_data = _connection.getURL(seg_url)
		seg_menu = BeautifulSoup(seg_data).find_all('file')
		hbitrate = -1
		file_url = None
		for video_index in seg_menu:
			try:
				bitrate = int(video_index['bitrate'])
				type = video_index['type']
				if bitrate > hbitrate and bitrate <= sbitrate:
					hbitrate = bitrate
					file_url = video_index.string
				elif bitrate == hbitrate and bitrate <= sbitrate and type == 'hd' :
					file_url = video_index.string
			except:
				pass
		if file_url is None:
			file_url = BeautifulSoup(seg_data).find_all('file',type = 'hd')[0].string
		stack_url += file_url.replace(',', ',,') + ' , '
	finalurl = stack_url[:-3]
	xbmcplugin.setResolvedUrl(pluginHandle, True, xbmcgui.ListItem(path = finalurl))
def get_solution(solution_id):
	#solutions = []
	#failed_to_download_s = []
	#for i in solution_ids:
	url = "https://www.codechef.com/viewplaintext/" + str(solution_id)
	
	page = requests.get(url)
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url)
	html_content = page.text

	if html_content==None:
		failed_to_download_s.append(i)

	text = BeautifulSoup(html_content, "html.parser").get_text()

	#'''figure out if escape_lt needs to go here'''

	print len(text)
	#print text


	failed_to_download = None
	solution = None

	#print text
	if len(text)==0 or re.search('var _sf_startpt = (new Date()).getTime()', text) != None:
		failed_to_download = solution_id
	else:
		text = text.replace("\\","\\\\")
		solution = text.encode('utf-8').decode('string-escape')

	return solution_id, solution, failed_to_download
Example #17
 async def translate(self, ctx, to_language, *, msg):
     """Translates words from one language to another. Do [p]help translate for more information.
     Usage:
     [p]translate <new language> <words> - Translate words from one language to another. Full language names must be used.
     The original language will be assumed automatically.
     """
     await ctx.message.delete()
     if to_language == "rot13":  # little easter egg
         embed = discord.Embed(color=discord.Color.blue())
         embed.add_field(name="Original", value=msg, inline=False)
         embed.add_field(name="ROT13", value=codecs.encode(msg, "rot_13"), inline=False)
         return await ctx.send("", embed=embed)
     async with self.bot.session.get("https://gist.githubusercontent.com/astronautlevel2/93a19379bd52b351dbc6eef269efa0bc/raw/18d55123bc85e2ef8f54e09007489ceff9b3ba51/langs.json") as resp:
         lang_codes = await resp.json(content_type='text/plain')
     real_language = False
     to_language = to_language.lower()
     for entry in lang_codes:
         if to_language in lang_codes[entry]["name"].replace(";", "").replace(",", "").lower().split():
             language = lang_codes[entry]["name"].replace(";", "").replace(",", "").split()[0]
             to_language = entry
             real_language = True
     if real_language:
         async with self.bot.session.get("https://translate.google.com/m",
                                     params={"hl": to_language, "sl": "auto", "q": msg}) as resp:
             translate = await resp.text()
         result = str(translate).split('class="t0">')[1].split("</div>")[0]
         result = BeautifulSoup(result, "lxml").text
         embed = discord.Embed(color=discord.Color.blue())
         embed.add_field(name="Original", value=msg, inline=False)
         embed.add_field(name=language, value=result.replace("&amp;", "&"), inline=False)
         if result == msg:
             embed.add_field(name="Warning", value="This language may not be supported by Google Translate.")
         await ctx.send("", embed=embed)
     else:
         await ctx.send(self.bot.bot_prefix + "That's not a real language.")
	def authorFilter(self, pageSoup):
		author = BeautifulSoup(str(pageSoup.find_all(
			'a', {'class': 'trb_ar_by_nm_au_a'}))).get_text().encode('ascii',errors='ignore')

		if author != '[]': #when author is with href
			return author.replace("\n",'')
		else: #when author is without href
			authorSoup = BeautifulSoup(str(pageSoup.find_all(
				'span', {'class': 'trb_ar_by_nm_au'}))).get_text().encode('ascii',errors='ignore')
			authorFilter = BeautifulSoup(str(pageSoup.find_all(
				'span', {'itemprop': 'author'}))).get_text().encode('ascii',errors='ignore')

			if authorFilter == '[]':
				return "NULL" #No Author Present in Article
			else:
				return authorFilter.replace('\n','')
def list_qualities(BASE, video_url = _common.args.url, media_base = VIDEOURL):
	if media_base not in video_url:
		video_url = media_base + video_url
	bitrates = []
	if 'feed' not in video_url:
		swf_url = _connection.getRedirect(video_url, header = {'Referer' : BASE})
		params = dict(item.split("=") for item in swf_url.split('?')[1].split("&"))
		uri = urllib.unquote_plus(params['uri'])
		config_url = urllib.unquote_plus(params['CONFIG_URL'])
		config_data = _connection.getURL(config_url, header = {'Referer' : video_url, 'X-Forwarded-For' : '12.13.14.15'})
		feed_url = BeautifulSoup(config_data, 'html.parser', parse_only = SoupStrainer('feed')).feed.string
		feed_url = feed_url.replace('{uri}', uri).replace('&amp;', '&').replace('{device}', DEVICE).replace('{ref}', 'None').strip()
	else:
		feed_url = video_url
	feed_data = _connection.getURL(feed_url)
	video_tree = BeautifulSoup(feed_data, 'html.parser', parse_only = SoupStrainer('media:group'))
	video_segments = video_tree.find_all('media:content')
	srates = []
	for video_segment in video_segments:
		video_url3 = video_segment['url'].replace('{device}', DEVICE)
		video_data3 = _connection.getURL(video_url3, header = {'X-Forwarded-For' : '12.13.14.15'})
		video_menu = BeautifulSoup(video_data3).findAll('rendition')
		orates = srates
		srates = []	
		for video_index in video_menu:
			bitrate = int(video_index['bitrate'])
			srates.append((bitrate, bitrate))
		if orates != []:
			srates = list(set(srates).intersection(orates))
	bitrates  =srates
	return bitrates
def OTVVideos(params):
	onlinetv_cookie = HTML('http://onlinetv.kg/Auth/Login', {'UserName':onlinetvkg_login, 'Password':onlinetvkg_password, 'RememberMe':'true'})
	if not onlinetv_cookie or onlinetv_cookie == 'false':
		Noty('Online TV', 'Ошибка авторизации / Cервер недоступен', '', 5000)
	else:
		onlinetv_token = HTML('http://onlinetv.kg/TV/GetNewTransmissionUID?')
		html = BeautifulSoup(re.sub('\s+', ' ', HTML(params['url']).replace('<br/>',' ')))

		try:    page = int(params['page'])
		except: page = 0

		current_page = page if page > 0 else 1
		try:
			pages_all = html.find('ul', attrs={'class':'pages'}).findAll('a')[-1].string
			XBMCItemAdd({'title':Colored('[ Перейти на страницу ]', 'opendialog') + ' ' + str(current_page) + ' из ' + str(pages_all) + ' страниц', 'thumb':ImagePath('findpage.png')},
				{
					'func': 'OTVSearchPage',
					'page': current_page,
					'url' : params['url']
				})

			if current_page < int(pages_all):
				current_page = (page + 1) if page > 0 else 2
				url = (params['url'] + '/' + str(current_page)) if page < 1 else params['url'][:(len(str(page)) * -1)] + str(current_page)

				XBMCItemAdd({'title':Colored('[ Следующая страница ]', 'nextpage'), 'thumb':ImagePath('next.png')},
					{
						'func': 'OTVVideos',
						'page': current_page,
						'url' : url
					})
		except: pass

		video_list = html.find('div', attrs={'class':'results'}).findAll('a')
		if len(video_list) > 0:
			for a in video_list:
				url = str(a['href'])

				time = Colored(str(a.div.find('div', {'class':'time'}).string).decode('utf-8'), 'FF268789').encode('utf-8')
				description = Colored(str(a.div.find('div', {'class':'description'}).string).decode('utf-8'), 'FF61a061').encode('utf-8').replace('-', '').strip()
				
				if url.find('GenreGroupTransmissions') >= 0 or url.find('GroupedSearch') >= 0:
					name1 = str(a.div.find('div', {'class':'name'}))
					name1 = BeautifulSoup(name1.replace('<span>', '[COLOR FF00AA00] ').replace('</span>', '[/COLOR] ').strip())
					name = Colored(name1.div.string.encode('utf-8'), 'bold')
					
					XBMCItemAdd({'title': time + '  |  ' + name + '  |  ' + description},
						{
							'func': 'OTVVideos',
							'page': current_page,
							'url' : 'http://onlinetv.kg' + url
						})
				else:
					name = Colored(a.div.find('div', {'class':'name'}).string.title(), 'bold').encode('utf-8')
					url_re = re.compile('TV/VOD/(.+[0-9])').findall(url)[0]
					XBMCItemAdd({'title': time + '  |  ' + name + '  |  ' + description}, {'url' : 'http://vod.onlinetv.kg/FileUpload/' + onlinetv_token + '/rus/' + url_re + '.ts'}, False)
			XBMCEnd()
		else:
			Noty('Online TV', 'Видео не найдено')
Example #21
def cleaning(original_text):

	text = BeautifulSoup(original_text,"lxml").get_text()

	# Remove Encodings
	text = re.sub(r'\\\\', r'\\', text)
	text = re.sub(r'\\x\w{2,2}',' ', text)
	text = re.sub(r'\\u\w{4,4}', ' ', text)
	text = re.sub(r'\\n', '.', text)

	#Whitespace Formatting
	text = text.replace('"', ' ')
	text = text.replace('\\', ' ')
	text = text.replace('_', ' ')
	text = text.replace('-', ' ')
	text = re.sub(' +',' ', text)
	
	#Remove Unicode characters
	text = codecs.decode(text, 'unicode-escape')
	text = ''.join([i if ord(i) < 128 else '' for i in text])

	#Remove email addresses
	text = re.sub(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}', ' ', text)
	
	#Remove Twitter Usernames
	text = re.sub(r"(\A|\s)@(\w+)+[a-zA-Z0-9_\.]", ' ', text)

	#Remove urls
	text = re.sub(r'\w+:\/\/\S+', ' ', text)

	# Word Standardizing (Ex. Looooolll should be Looll)
	text = ''.join(''.join(s)[:2] for _, s in itertools.groupby(text))

	#Convert words to lower case
	text = text.lower().split()

	#Remove contractions by expansion of words
	text = [contractions[word] if word in contractions else word for word in text]

	#Rejoin words
	text = " ".join(text)

	#Remove non-alphabets
	text = re.sub("[^a-z\s]", " ", text)

	return " ".join(text.split())
Example #22
    def __iter__(self):
        for path, dirs, files in os.walk(self.dirname):
            for d in dirs:
                dir_path = os.path.join(self.dirname,d)
                for fname in os.listdir(dir_path):
                    for line in open(os.path.join(dir_path, fname)):
                        # 1. Remove HTML
                        line = BeautifulSoup(line).get_text()
                        # 2. Remove non-letters
                        line = re.sub("[^a-zA-Z]"," ", line)
                        # 3. Remove numbers from text
                        for i in range(10):
                            line = line.replace(str(i), '')
                        # 4. Convert words to lower case and split them
                        words=line.lower().split()

                        yield words
Example #23
    def format(self, article, output_channel, selector_codes):
        """ Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure

        :param article:
        :param output_channel:
        :param selector_codes:
        :return: returns the sequence number of the output channel and the constructed parameter dictionary
        """
        try:

            pub_seq_num = superdesk.get_resource_service('output_channels').generate_sequence_number(output_channel)

            odbc_item = {}
            odbc_item['originator'] = article.get('originator', None)
            odbc_item['sequence'] = pub_seq_num
            odbc_item['category'] = article.get('anpa-category', {}).get('qcode')  # @category
            odbc_item['headline'] = article.get('headline', '')  # @headline
            odbc_item['author'] = article.get('byline', '')  # @author
            odbc_item['keyword'] = article.get('slugline', None)

            if article['subject'][0]:
                odbc_item['subject_reference'] = article['subject'][0].get('qcode', None)
                if odbc_item['subject_reference']:
                    odbc_item['subject'] = subject_codes[odbc_item['subject_reference'][:2] + '000000']
                    odbc_item['subject_matter'] = subject_codes[odbc_item['subject_reference'][:5] + '000']
                    odbc_item['subject_detail'] = subject_codes[odbc_item['subject_reference']]

            odbc_item['take_key'] = article.get('anpa_take_key', None)  # @take_key
            odbc_item['usn'] = article.get('unique_id', None)  # @usn
            if article['type'] == 'preformatted':
                odbc_item['article_text'] = article.get('body_html', '').replace('\'', '\'\'')  # @article_text
            elif article['type'] == 'text':
                text = BeautifulSoup(article.get('body_html', '')).text
                text = text.replace('\'', '\'\'')
                odbc_item['article_text'] = text

            if 'genre' in article:
                odbc_item['genre'] = article['genre'][0].get('name', None)
            else:
                odbc_item['genre'] = 'Current'  # @genre
            if article.get('type', 'text') == 'text':
                odbc_item['texttab'] = 'x'
            elif article.get('type', None) == 'preformatted':
                odbc_item['texttab'] = 't'
            odbc_item['wordcount'] = article.get('word_count', None)  # @wordcount
            odbc_item['news_item_type'] = 'News'
            odbc_item['priority'] = article.get('priority', None)  # @priority
            odbc_item['service_level'] = 'a'  # @service_level

            sel_codes = selector_codes[output_channel['_id']]
            odbc_item['selector_codes'] = ' '.join(sel_codes)

            odbc_item['fullStory'] = 1
            odbc_item['ident'] = '0'  # @ident

            return pub_seq_num, odbc_item
        except Exception as ex:
            raise FormatterError.AAPIpNewsFormatterError(ex, output_channel)
Example #24
def page2text(pagename):
  page = wekeypedia.WikipediaPage(pagename)
  content = page.get_revision()

  txt = BeautifulSoup(content, "html.parser")
  txt = txt.get_text()
  txt = txt.replace("[edit]","")

  return txt
Example #25
 def txt2words(self, txt, remove_stopwords=True):
   txt = BeautifulSoup(txt).get_text()
   txt = ftfy.fix_text(txt)
   txt = txt.replace("\\n", '')
   txt = re.sub("[^0-9a-zA-Z]"," ", txt)
   if remove_stopwords:
     words = [self.save_stem(w) for w in txt.lower().split() if (w not in self.stopwords) & (len(w) > 2) & (not w.isdigit())]
   else:
     words = [self.save_stem(w) for w in txt.lower().split() if (len(w) > 2) & (not w.isdigit())]
   return words
Example #26
class TVShow:
    def __init__(self, tvshow):
        self.tvshow = tvshow
        self.series = 0
        self.season = 0
        self.episode = 0
        self.title = ""

    def __str__(self):
        return "{0} - {1}x{2} - {3}".format(self.tvshow, 
                                            self.season.zfill(SEASON_PADDED_ZEROS), 
                                            self.episode.zfill(EPISODE_PADDED_ZEROS), 
                                            self.title)

    def process(self):
        release = re.findall("S?(\d+)x?E?(\d+)", self.tvshow, flags=2)
        if len(release) > 0:
            self.season = release[0][0].lstrip("0") if len(re.findall("^0+$", release[0][0])) == 0 else "0"
            self.episode = release[0][1].lstrip("0") if len(re.findall("^0+$", release[0][1])) == 0 else "0"

        # Replace any fluff from the file name
        self.tvshow = re.sub(FILTER, " ", self.tvshow, flags=2).strip()

    def fetch(self):
        r = requests.get("http://thetvdb.com/api/GetSeries.php?seriesname={0}&language={1}".format(self.tvshow, LANGUAGE))
        soup = BeautifulSoup(r.content)

        self.series = soup.find("seriesid").text
        self.tvshow = soup.find("seriesname").text

    def get_episode(self):
        r = requests.get("http://thetvdb.com/api/{0}/series/{1}/default/{2}/{3}/{4}.xml"
            .format(API_KEY, self.series, str(self.season), str(self.episode), LANGUAGE))

        if r.status_code == 404: print("Error! 404: Not found")
        else: self.title = BeautifulSoup(r.content).find("episodename").text

    def replace_illegal_characters(self):
        for illegal_character in ILLEGAL_CHARACTERS:
            self.title = self.title.replace(illegal_character, REPLACE_CHAR)

        # Replace the ellipsis with three periods to prevent UnicodeError
        self.title = self.title.replace("…",  "...")
Example #27
 def _get_model(self, dbms):
     print(dbms.extract())
     model = dbms.css('tr th:nth-child(5)')
     link = dbms.css('tr th:nth-child(5) a')
     span = dbms.css('tr th:nth-child(5) span span')
     if span:
         html = span.extract()[0]
     elif link:
         html = link.extract()[0]
     else:
         html = model.extract()[0]
     models = BeautifulSoup(html, 'html.parser').text
     return models.replace(',', '|')
Example #28
 def img_save(self, page_url):
     img_response = self.hit(page_url)
     img_url = BeautifulSoup(img_response.text, 'lxml').find('div', class_ = 'main-image').find('img')['src']
     name = img_url.replace('/', '_')
     try:
         img = self.hit(img_url)
         f = open(name + '.jpg', 'ab')
         f.write(img.content)
         f.close()
         print ">>>>>>>>>>>>>>>>写入成功~" 
     except:
         print 'could not write image data: ' + img_url
         return False
    def _render_levelings(self, html: BeautifulSoup, nvalues: int) -> List[Leveling]:
        # Do some pre-processing on the html
        if not isinstance(html, str):
            html = str(html)
        html = html.replace("</dt>", "\n</dt>")
        html = html.replace("</dd>", "\n</dd>")
        html = BeautifulSoup(html, "lxml")
        html = html.text.strip()
        while "\n\n" in html:
            html = html.replace("\n\n", "\n")
        while "  " in html:
            html = html.replace("  ", " ")
        levelings = html.replace("\xa0", " ")

        # Get ready
        results = []

        # Let's parse!
        initial_split = levelings.split("\n")
        initial_split = [
            lvling.strip()
            for lvling in initial_split
            if lvling.strip()
            not in (
                "Takedown scales with Aspect of the Cougar's rank",
                "Swipe scales with Aspect of the Cougar's rank",
                "Pounce scales with Aspect of the Cougar's rank",
                "Cougar form's abilities rank up when Aspect of the Cougar does",
            )
        ]
        initial_split = list(grouper(initial_split, 2))

        for attribute, data in initial_split:
            if attribute.endswith(":"):
                attribute = attribute[:-1]
            result = self._render_leveling(attribute, data, nvalues)
            results.append(result)

        return results
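The grouper() call above pairs each attribute line with its data line; that helper is defined elsewhere in the project, presumably along the lines of the usual itertools recipe sketched here (the sample list is made up):

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # itertools-recipe style: collect data into fixed-length chunks
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

print(list(grouper(["Cost:", "50 mana", "Cooldown:", "12s"], 2)))
# [('Cost:', '50 mana'), ('Cooldown:', '12s')]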
Example #30
def clean_text(string, remove_spaces=False):
    matches = ["\n", "<br>"]
    for m in matches:
        string = string.replace(
            m, " ").strip()
    string = ' '.join(string.split())
    string = BeautifulSoup(string, "lxml").get_text()
    SAFE_PTN = r"[|\^&+\-%*/=!>]"
    string = re.sub(SAFE_PTN, ' ',  string.strip()
                    ).strip()
    if remove_spaces:
        string = string.replace(' ', '_')
    return string
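A quick usage sketch of clean_text() on a made-up string; it assumes the function above and its re, BeautifulSoup, and lxml dependencies are available:

print(clean_text("Risk > reward?<br>Net change: -5%\n", remove_spaces=True))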
Example #31
    def format(self, comment):
        found_difficult = ""
        # digits = re.findall("\d+\.", comment)
        # for digit in set(digits):
        #     comment = comment.replace(digit, "<b>"+digit + " </b>")

        all_a_links = re.findall("(<a href.*?>(.*?)</a>)", comment)
        for a_link_and_text in all_a_links:
            a_link, text = a_link_and_text
            comment = comment.replace(a_link, text)

        if "pages/images/hard.gif" in comment:
            found_difficult += "*"
        if "pages/images/harder.gif" in comment:
            found_difficult += "*"

        # we need to keep these tags specifically: the "text" property would strip them, so we "hide" them behind nonsense placeholder characters
        tags_to_keep = ["u", "b"]
        comment = comment.replace("<u>", "$!u$").replace("</u>", "$/!u$")
        comment = comment.replace("<b>", "$!b$").replace("</b>", "$/!b$")
        text = BeautifulSoup(comment, "lxml").text

        text = text.strip()
        while "  " in text:
            text = text.replace("  ", " ")

        # following code makes sure "3.\nhello" becomes "3. hello"
        digit = re.match(u"^.{1,2}[\)|\.]", text)
        if digit:
            text = text.replace(digit.group(0), u"")
            text = text.strip()
            text = digit.group(0) + u" " + text

        # now get the tags back and remove nonsense chars
        text = text.replace("$!u$", "<u>").replace("$/!u$", "</u>")
        text = text.replace("$!b$", "<b>").replace("$/!b$", "</b>")
        text = text.replace("\n", "<br/>")

        return (found_difficult + text).strip()
Example #32
def clean_text(string, remove_spaces=False):
    matches = ["\n", "<br>"]
    for m in matches:
        string = string.replace(
            m, " ").strip()
    string = ' '.join(string.split())
    string = BeautifulSoup(string, 'lxml').get_text()
    SAFE_PTN = "[^0-9a-zA-Z-_.'()]+"
    string = re.sub(SAFE_PTN, ' ',  string.strip()
                    ).strip()
    if remove_spaces:
        string = string.replace(' ', '_')
    return string
 def cleanHTMLtext(self, raw_html):
     """
     Function to clean the Description Col in Indeed Dataset
     """
     if type(raw_html) == str:
         cleantext = BeautifulSoup(raw_html).get_text(" ")
         #BeautifulSoup(raw_html, "html.parser").text
         cleantext = cleantext.replace('\r', ' ').replace('\n', ' ')[1:-1]
         cleantext = re.sub('\W+', ' ', cleantext)
         cleantext = re.sub(',', ' ', cleantext)
         return cleantext
     else:
         return None
Example #34
def internet_search(indebug, inquery, inkey):
    debug = str(indebug)
    query = str(inquery)
    key = str(inkey)

    try:
        # create credential for authentication
        user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)'
        creds = (':%s' % key).encode('utf-8')[:-1]
        auth = 'Basic %s' % creds

        # Search query for Bing, used to obtain definitions
        url = 'http://www.bing.com/search?q=define+"%s"' %query
        url = str(url)

        request = urllib.request.Request(url)
        request.add_header('Authorization', auth)
        request.add_header('User-Agent', user_agent)

        requestor = urllib.request.build_opener()
        result = requestor.open(request)

        soup = BeautifulSoup(result, 'lxml')
        out = soup.findAll('ol', {'class': 'b_dList'}) # Obtains the information of 'ol' tag whose class name is 'b_dList'
        out = str(out) # Converting class type to string type
        if(out == '[]'):
            if (debug == '--debug'):
                print(("No results found for '%s' using Bing Search" %(query)))
            return 0
        else:
            out = BeautifulSoup(out, 'lxml')
            out = out.findAll('li')
            out = str(out)
            out = str((re.compile(r'<li>(.*)</li>').search(out)).group()) # Extracting <li>..</li> tag information
            out = out.replace('>, <', '>\n<') # Breaks the list items / definitions into one per line
            out = out.split('\n') # outputs a list
            definitions = [] # A list to store the definitions
            for i in out:
                i = BeautifulSoup(i, 'lxml')
                i = i.text # Extracts text only leaving the html tags
                definitions.append(i) # Appends definitions to the list
            if (debug == '--debug'):
                #print "------------------------------------------------------"
                print(("Definitions of '%s' from Internet search:" %(query)))
                for x in range(0,len(definitions)):
                    print(('%d : %s' %(x+1, definitions[x])))
                print("------------------------------------------------------")
            return definitions
    except:
        print("Exception raised!! in obtaining definitions using Bing Search API")
        return 0
Example #35
def pdf_to_html(fonts,jumpiness,word_rotation,width_shift,height_shift,rotace):
    #change all the html files
    for filee in os.listdir("data\\converted\\pdf"):
        filee_converted = "data\\converted\\pdf\\" + filee
        filee_dest = "data\\done\\" + filee
        #find text and change it
        with open(filee_converted,"r",encoding="utf-8") as f:
            result = f.read()
            whole_file = BeautifulSoup(result,"html.parser")
            schulubung = whole_file.find("div",attrs={"id" : "page-container"})
            #find the pages
            for data in schulubung.children:
                #page number
                if (data.name == "div"):
                    #find just the page
                    this_page = False
                    for page in data.children:
                        if (page.name == "div"): #only the actual text
                            #loop over the divs
                            for bad_div in page.children:
                                for divs in bad_div.children:
                                    if (divs.name == "div"):
                                        divs["style"] = "margin:0px 0px {1}px {0}px;transform:rotate({2}deg);".format(randrange(width_shift[0],width_shift[1]),randrange(height_shift[0],height_shift[1]),randrange(rotace[0],rotace[1]))
                                        #loop within the div and randomize the font
                                        line = divs.decode_contents()
                                        res = ""
                                        i = 0
                                        while i < len(line):
                                            if (line[i:i + 1] == " "):
                                                res += line[i:i + 1]
                                            elif (unidecode(line[i:i + 1]) == unidecode("")):
                                                res += " "
                                            elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span"):
                                                while line[i:i + 1] != ">":
                                                    res += line[i:i + 1]
                                                    i += 1
                                                res += ">"
                                            else:
                                                word = ["<span style='margin-top:10px;font-family:{0};color:#000F55;position:relative;top:{1}px;font-size:40%;transform:skewY({2}deg)'>".format(choice(fonts),randrange(jumpiness[0],jumpiness[1]),randrange(word_rotation[0],word_rotation[1])),"</span>"]
                                                res += word[0] + line[i:i + 1] + word[1]
                                            i += 1
                                        divs.string = res
                        this_page = not this_page
            

        #write new file
        with open(filee_dest,"w",encoding="utf-8") as f:
            whole_file = str(whole_file).replace("&lt;","<")
            whole_file = whole_file.replace("&gt;",">")
            f.write(str(whole_file))
            print("done")
Example #36
 def __populate_db_contexts_for_opinion(
         self,
         session: Session,
         opinion: Opinion,
         reporter_resource_dict: dict,
         context_slice=slice(-128, 128),
 ) -> None:
     unstructured_html = opinion.html_text
     if not unstructured_html:
         raise ValueError(f"No HTML for case {opinion.resource_id}")
     unstructured_text = BeautifulSoup(unstructured_html,
                                       features="lxml").text
     clean_text = unstructured_text.replace("U. S.", "U.S.")
     tokenizer = OneTimeTokenizer(self.eyecite_tokenizer)
     citations = list(eyecite.get_citations(clean_text,
                                            tokenizer=tokenizer))
     cited_resources = eyecite.resolve_citations(citations)
     for resource, citation_list in cited_resources.items():
         cited_opinion_res_id = reporter_resource_dict.get(
             format_reporter(
                 resource.citation.groups.get("volume"),
                 resource.citation.groups.get("reporter"),
                 resource.citation.groups.get("page"),
             ))
         if cited_opinion_res_id is None:
             continue
         for citation in citation_list:
             if not isinstance(citation, CaseCitation):
                 continue
             if (citation.metadata.parenthetical is not None
                     and ParentheticalProcessor.is_descriptive(
                         citation.metadata.parenthetical)):
                 session.add(
                     OpinionParenthetical(
                         citing_opinion_id=opinion.resource_id,
                         cited_opinion_id=cited_opinion_res_id,
                         text=ParentheticalProcessor.prepare_text(
                             citation.metadata.parenthetical),
                     ))
             start = max(0, citation.index + context_slice.start)
             stop = min(len(tokenizer.words),
                        citation.index + context_slice.stop)
             session.add(
                 CitationContext(
                     citing_opinion_id=opinion.resource_id,
                     cited_opinion_id=cited_opinion_res_id,
                     text=" ".join([
                         s for s in tokenizer.words[start:stop]
                         if isinstance(s, str)
                     ]),
                 ))
def tweet_cleaning_for_sentiment_analysis(tweet):    
    
    #Unescape HTML entities and strip tags
    tweet = BeautifulSoup(tweet).get_text()
    #Special case not handled previously.
    tweet = tweet.replace('\x92',"'")
    #Removal of hashtags/account mentions
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())
    #Removal of address
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    #Removal of Punctuation
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
    #Lower case
    tweet = tweet.lower()
    #CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’","'")
    words = tweet.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    tweet = " ".join(reformed)
    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    #Deal with smileys
    #source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()  
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    #Deal with emojis
    tweet = emoji.demojize(tweet)
    #Strip accents
    tweet= strip_accents(tweet)
    tweet = tweet.replace(":"," ")
    tweet = ' '.join(tweet.split())
    
    # DO NOT REMOVE STOP WORDS FOR SENTIMENT ANALYSIS - OR AT LEAST NOT NEGATIVE ONES

    return tweet
Example #38
def get_fifa_data():
    url = "https://www.fifaindex.com/teams/fifa20_358/?league=13&order=desc"
    r = requests.get(url)
    #check that the site returns a 200 status code, verifying it can be scraped
    print(r.status_code)

    #break down site html and siphon out team data.
    soup = BeautifulSoup(r.text, "html.parser")
    soup = soup.findAll('tbody')
    soup = soup[0].text
    soup = soup.replace('\n', '')
    #remove Premier league string from the results
    delete_league = "Premier League"
    soup = soup.replace(delete_league, " ")
    #separate the names and numbers from the data
    list_of_names = re.findall('\D+', soup)
    list_of_nums = re.findall("\d{8}", soup)

    #dictionary to hold our new data
    data = {"Squads": []}
    #organize names and averages into the list of data
    i = 0
    for n in list_of_names:
        _att = list_of_nums[i][:2]
        _mid = list_of_nums[i][2:4]
        _def = list_of_nums[i][4:6]
        _ovr = list_of_nums[i][6:8]
        data["Squads"].append({"Squad": n, "ATT": _att, "MID": _mid, "DEF": _def, "OVR": _ovr})
        i += 1

    #normalize our team data into a dataframe so we can write an easy-to-read csv file
    df = pd.json_normalize(data["Squads"])
    path = "../Data/team_data_fifa.csv"
    list_of_files.append(path)
    df.to_csv(path, index = False)
    #avoid being flagged as a spammer by the site
    time.sleep(1)
    return df
Example #39
def rtn_chapter_txt(chapterHtml):
    soup = BeautifulSoup(chapterHtml, 'html.parser')

    try:
        txtContent = soup.find_all(name="div", attrs={"class":
                                                      SUB_DOWN_FLAG})[0]
        txtContent = str(txtContent).replace('<br/>', "\n")
        txtContent = BeautifulSoup(txtContent, 'html.parser')
        txtContent = txtContent.find_all(name="div",
                                         attrs={"id":
                                                "ChapterContents"})[0].text
        # time.sleep(5000)

    except:
        time.sleep(2)
        print(chapterHtml)

    # txtContent = txtContent.split("最新章节!")[1]
    txtContent = txtContent.strip()
    txtContent = txtContent.replace('								      ', "")

    # txtContent = txtContent.replace("一秒记住【顶点小说网 www.23wx.so】,精彩小说无弹窗免费阅读!", "")
    # txtContent = txtContent.replace("       ", "")
    # txtContent = txtContent.replace("        ", "")
    # txtContent = txtContent.replace("    ", "")
    # txtContent = txtContent.replace(" ", "")
    # txtContent = txtContent.replace("~", "")
    # txtContent = txtContent.replace("\r\n", "")
    # txtContent = txtContent.replace("\n\n", "")

    txtContent = txtContent.replace('\xa0', '')
    # txtContent = txtContent.replace('\u016f','')
    # txtContent = txtContent.replace('\u027c','')
    # txtContent = txtContent.replace('\u025b','')
    # txtContent = txtContent.replace('\u0c4a','')
    # txtContent = txtContent.replace('\u0154','')
    # txtContent = txtContent.replace('\u0189','')
    return txtContent
def clean_texts(x):
    if x:
        x = strip_non_ascii(x)
        x = BeautifulSoup(x, "lxml")
        x = x.get_text()
        x = x.replace('\n', ' ').replace('\r', '').replace('\t', ' ')
        # remove between word dashes
        x = x.replace('- ', ' ').replace(' -', ' ').replace('-', ' ')
        #replace parentheses
        x = x.replace("(", "").replace(")", "").replace("[",
                                                        "").replace("]", "")
        #remove punctuation but keep commas, semicolons, periods, exclamation marks, question marks, intra-word dashes and apostrophes (e.g., "I'd like")
        x = x.replace(r"[^[:alnum:][:space:].'-:]",
                      " ").replace('+', ' ').replace('*', ' ').replace(
                          "' ", "").replace(" '", "").replace("'", "").replace(
                              ",", " ").replace(";", " ").replace(":", " ")
        #remove numbers (integers and floats)
        x = re.sub('\d+', '', x)
        #remove extra white space, trim and lower
        x = re.sub('\\s+', ' ', x).strip()
        return x
    else:
        return ""
Example #41
def d2rucrawl(url):
    pages = 1
    activity = []

    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')
    type(soup)
    rows = soup.find_all('div', class_='dropdown')
    str_cells = str(rows)
    username = BeautifulSoup(str_cells, "lxml").get_text()
    rows2 = soup.find_all('span', id="user-posts-count")
    str_cells2 = str(rows2)
    messagesn = BeautifulSoup(str_cells2, "lxml").get_text()
    rows3 = soup.find_all('span', class_="points")
    str_cells3 = str(rows3)
    likes = BeautifulSoup(str_cells3, "lxml").get_text()
    img = soup.find_all('img', class_='my')
    img_a = (str(img).split('/')[1:7])
    img_b = [i + '/' for i in img_a]
    img_c = [''.join(img_b)]

    while pages < 11 and pages != 0:  #block that counts the number of posts
        html = urlopen(url + 'activity/page-' + str(pages))
        soup = BeautifulSoup(html, 'lxml')
        type(soup)
        rows4 = soup.find_all('div', class_='text-medium')
        str_cells4 = str(rows4)
        activ = BeautifulSoup(str_cells4, "lxml").get_text()
        activity.append(activ)
        pages += 1
    data = activity
    text_string2 = str(data).lower()
    match_pattern2 = re.findall(r'\b[а-я]{3,15}\b', text_string2)
    frequency2 = {}
    for word in match_pattern2:
        count = frequency2.get(word, 0)
        frequency2[word] = count + 1
    frequency_list2 = frequency2.keys()
    activity_mess = []
    for words in frequency_list2:
        if frequency2[words] > 5:
            activity_mess.append(str(words) + ': ' + str(frequency2[words]))

    activity_end = ', '.join(activity_mess)
    return 'Никнейм: ' + username.replace(
        ' ', ''
    )[2:-1] + ', Сообщения: ' + messagesn[1:-1] + ', Симпатии: ' + likes[
        1:
        -1] + ' Часто используемые слова - за последние 100 сообщений ' + activity_end + ' Аватар: ' + 'https://dota2.ru/' + str(
            img_c)[2:-4]
Example #42
def GetQuote():
    url = 'http://quotesondesign.com/wp-json/posts?filter[orderby]=rand&filter[posts_per_page]=1'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }

    request = urllib.request.Request(url, headers=headers)
    connection = urllib.request.urlopen(request)
    response = connection.read()

    data = json.loads(response)

    quote = BeautifulSoup(data[0]["content"], 'html.parser').get_text()
    quote = quote.replace('\n', '')
    quote = quote.strip()
    author = BeautifulSoup(data[0]["title"], 'html.parser').get_text()
    author = author.replace('\n', '')
    author = author.strip()

    final_text = "{} ~{}".format(quote, author)

    return final_text
 def processTweet(self, tweet):
     # Cleansing and tokenizing tweet
     tweet = BeautifulSoup(
         tweet).get_text()  # Extracts text from HTML (just in case!)
     tweet = tweet.lower()  # Converts text to lower-case
     tweet = re.sub("((www\.[^\s]+)|(https?://[^\s]+))", "URL",
                    tweet)  # Replces URLs by URL constan
     tweet = re.sub("@[^\s]+", "USERTAGGING",
                    tweet)  # Replaces usernames by USERTAGGING constant
     tweet = re.sub(r"#([^\s]+)", r"\1", tweet)  # Removes the # in #hashtag
     for p in punctuation:
         tweet = tweet.replace(p, "")  # Removes punctiation
     tweet = word_tokenize(tweet)  # Creates a list of words
     return [word for word in tweet if word not in self._stopwords]
Example #44
def about_love(message):
    n = randint(13564, 13687)
    r = requests.get(f'http://ruspoeti.ru/aut/tushnova/{n}/')
    soup = BeautifulSoup(r.text, 'lxml')
    soup = str(soup)
    start = soup.find('class="pcont"')
    finish = soup.find('class="pfoot"')
    soup = soup[start:finish]
    st = 'Тушнова</em><br/><br/>'
    start = soup.find('Тушнова</em><br/><br/>')
    finish = soup.find('</p><div')
    soup = soup[start + len(st):finish]
    soup = soup.replace('<br/>', '')
    bot.send_message(message.chat.id, soup)
Example #45
def scrape(episode_id):
    response = requests.get('https://open.spotify.com/embed-podcast/episode/{}'.format(episode_id))

    page_data = BeautifulSoup(response.text, 'html.parser').find_all('script')[-1].string

    data_json = page_data.replace('window.__PRELOADED_STATE__ = ', '')
    data = json.loads(data_json)

    result = {
        'filename': data['data']['name'],
        'url': data['data']['unencryptedAudioUrl']
    }

    return result
Example #46
 def post_require(self):
     """ 爬取职位描述
     """
     for c in self.company:
         r = requests.get(
             c.get('href'), headers=self.headers).content.decode('gbk')
         bs = BeautifulSoup(r, 'lxml').find(
             'div', class_="bmsg job_msg inbox").text
         s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
         self.text += s
     # print(self.text)
     with open(os.path.join("data", "post_require.txt"),
               "w+", encoding="utf-8") as f:
         f.write(self.text)
def clean_texts(tweet):
    if tweet:
        tweet = strip_non_ascii(tweet)
        tweet= BeautifulSoup(tweet, "lxml")
        tweet= tweet.get_text()
        tweet= tweet.replace('\n', '').replace('\r', ' ').replace('\t', ' ')
        tweet= tweet.replace('!', '')###### remove !
        tweet= tweet.replace('"', '')###### remove "
        tweet = re.sub(r'^http?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        tweet = re.sub(r"http\S+", "", tweet, flags=re.MULTILINE)
        # remove between word dashes
        tweet= tweet.replace('- ', '').replace(' -','').replace('-','')
        #replace parentheses
        tweet= tweet.replace("(","").replace(")","").replace("[","").replace("]","").replace("RT","")
        #remove punctuation but keep commas, semicolons, periods, exclamation marks, question marks, intra-word dashes and apostrophes (e.g., "I'd like")
        tweet= tweet.replace(r"[^[:alnum:][:space:].'-:]", "").replace('+','').replace('*','').replace("' ","").replace(" '","").replace("'","").replace(","," ").replace(";"," ").replace(":"," ").replace("."," ")
        #remove numbers (integers and floats)
        tweet= re.sub('\d+', '', tweet)        
        #remove extra white space, trim and lower
        tweet = re.sub('\\s+',' ',tweet).strip()
        return tweet
    else:
        return ""    
Example #48
    def parse(self, project, line_list):
        string = utils.make_string(line_list)
        soup = BeautifulSoup(string, 'html.parser')
        soup = soup.find('body')
        soup = soup.find('main')
        soup = soup.find('article')
        soup = str(soup)

        string = soup.replace('\n', '!@#$')
        string = self._string_arrange(string)

        string = string.split('!@#$')
        string = utils.make_string(string, conj='\n')
        return string
Example #49
 async def translate(self, ctx, to_language, *, msg):
     """Translates words from one language to another. Do [p]help translate for more information.
     Usage:
     [p]translate <new language> <words> - Translate words from one language to another. Full language names must be used.
     The original language will be assumed automatically.
     """
     await ctx.message.delete()
     if to_language == "rot13":  # little easter egg
         embed = discord.Embed(color=discord.Color.blue())
         embed.add_field(name="Original", value=msg, inline=False)
         embed.add_field(name="ROT13",
                         value=codecs.encode(msg, "rot_13"),
                         inline=False)
         return await ctx.send("", embed=embed)
     async with self.bot.session.get(
             "https://gist.githubusercontent.com/astronautlevel2/93a19379bd52b351dbc6eef269efa0bc/raw/18d55123bc85e2ef8f54e09007489ceff9b3ba51/langs.json"
     ) as resp:
         lang_codes = await resp.json(content_type='text/plain')
     real_language = False
     to_language = to_language.lower()
     for entry in lang_codes:
         if to_language in lang_codes[entry]["name"].replace(
                 ";", "").replace(",", "").lower().split():
             language = lang_codes[entry]["name"].replace(";", "").replace(
                 ",", "").split()[0]
             to_language = entry
             real_language = True
     if real_language:
         async with self.bot.session.get("https://translate.google.com/m",
                                         params={
                                             "hl": to_language,
                                             "sl": "auto",
                                             "q": msg
                                         }) as resp:
             translate = await resp.text()
         result = str(translate).split('class="t0">')[1].split("</div>")[0]
         result = BeautifulSoup(result, "lxml").text
         embed = discord.Embed(color=discord.Color.blue())
         embed.add_field(name="Original", value=msg, inline=False)
         embed.add_field(name=language,
                         value=result.replace("&amp;", "&"),
                         inline=False)
         if result == msg:
             embed.add_field(
                 name="Warning",
                 value=
                 "This language may not be supported by Google Translate.")
         await bot.send("", embed=embed)
     else:
         await ctx.send(self.bot.bot_prefix + "That's not a real language.")
Example #50
def detweet(text):
    #remove the '#' characters (keep the hashtag words, they can be quite informative about sentiment)
    text = text.replace("#", "")
    text = text.replace('\u2044', ' or ')
    #remove html
    text = BeautifulSoup(text, features="html.parser").get_text()
    text = text.replace('\x92', "'")

    #remove mentions
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)", "", text).split(" "))
    # remove web addresses
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
    return text
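A small usage sketch of detweet() on a made-up tweet, assuming the re and BeautifulSoup imports used above:

sample = "Loving the new #python release! thanks @someone http://example.com/notes"
print(detweet(sample))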
Example #51
def cleanText(text):
    lemmatizer = WordNetLemmatizer()
    en_stop = set(nltk.corpus.stopwords.words('english'))
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\W', ' ', str(text))             # drop non-word characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)      # drop isolated single letters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)     # collapse whitespace
    text = re.sub(r'^b\s+', '', text)                # drop a stray leading 'b '
    text = text.lower()
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.replace('x', '')                     # note: removes every literal 'x' character
    text = text.replace(',', ' ')
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[nN]o\.', 'number', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
Example #52
0
def preprocessing_english(x):
    x = BeautifulSoup(x)
    x = EmailReplyParser.parse_reply(x.get_text())
    x = re.sub(r'<.*?>', '', x)
    x = x.replace("\n", " ").strip()
    x = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=x)
    x = x.strip()
    x = re.sub(r"(^|\W)\d+", "", x)
    x = x.lower()
    x = re.sub(r'[^a-zA-Z]', ' ', x)
    x = re.sub("\s\s+", " ", x)

    stopwords = {
        'forwarded', 'message', 'lz', 'logitech', 'dear', 'my', 'date', 'i',
        'recently', 'hi', 'hello', 'product', 'serial', 'number', '1', '2',
        '3', '4', '5', '6', '7', '8', '9', '0', 'purchased', 'purchase',
        'support', 'http', 'com', 'logitech', 'www', 'https', 'logi',
        'customercare', 'contact', 'terms', 'blvd', 'gateway', 'newark', 'usa',
        'logo', 'care', 'ca', 'footer', 'use', 'customer', 'owned', 'us',
        'survey', 'americas', 'copyright', 'headquarters', 'owners',
        'respective', 'the', 'rights', 'trademarks', 'reserved', 'property',
        'dear', 'regards', 'thanks', 'mail', 'email', 'lz', 'g', 'x', 'k',
        'date', 'like', 'get', 'one', 'set', 'thank', 'also', 'two', 'see',
        'able', 'n', 'could', 'since', 'last', 'know', 'still', 'got', 'pm',
        'p', 'n', 's',
        'operating', 'system', 'platform', 'ce', 's', 'hs', 'y', 'mr', 'de',
        'lfcm', 'sy', 'm', 'kh', 'w', 'ks', 'hs', 'afternoon', 'morning',
        'regards', 'thx',
        'thanks', 'fri', 'mon', 'tue', 'wed', 'thu', 'sat', 'sun', 'jan',
        'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'sep', 'oct', 'nov', 'dec'
    }

    x = x.split()
    x = [word for word in x if word.lower() not in stopwords]
    x = ' '.join(x)
    return x
Example #53
0
 def parse_details(self, response):
     items = HouzzItem()
     # make sure only un-cached / new records are saved in the spreadsheet
     if not "cached" in response.flags:
         try:
             PhoneNumber = response.xpath(
                 "//div[@compid='Profile_Phone']/span[@class='pro-contact-text']/text()"
             )[0].extract()
         except:
             PhoneNumber = "-"
         try:
             ContactPersonRAW = response.xpath(
                 "normalize-space(//div[@class='info-list-text']/b[text()='Contact']/../text())"
             )[0].extract()
             ContactPerson = ContactPersonRAW.split(": ")[1]
         except:
             ContactPerson = "-"
         try:
             LocationRAW = response.xpath(
                 "//div[@class='info-list-text']/b[text()='Location']/..")
             Street = LocationRAW.xpath(
                 "./span[@itemprop='streetAddress']/text()")[0].extract()
             AddressLocality = LocationRAW.xpath(
                 "./span[@itemprop='addressLocality']/text()")[0].extract()
             AddressRegion = LocationRAW.xpath(
                 "./span[@itemprop='addressRegion']/text()")[0].extract()
             PostalCode = LocationRAW.xpath(
                 "./span[@itemprop='postalCode']/text()")[0].extract()
             AddressCountry = LocationRAW.xpath(
                 "./span[@itemprop='addressCountry']/text()")[0].extract()
             Location = Street + ", " + AddressLocality + ", " + AddressRegion + ", " + PostalCode + ", " + AddressCountry
         except:
             Location = BeautifulSoup(
                 response.xpath(
                     "//div[@class='info-list-text']/b[text()='Location']/.."
                 )[0].extract(), 'lxml').get_text()
             Location = Location.replace("Location: ", "")
         items["category"] = response.meta['category'],
         items["posttitle"] = response.meta['posttitle'],
         items["posthref"] = response.meta['posthref'],
         items["location"] = Location,
         items["contact"] = ContactPerson,
         items["phone"] = PhoneNumber
         yield items
         self.logger.info("Item processed!")
         #yield scrapy.FormRequest(GoogleURL, formdata=DataObject, callback=self.dummy, method="POST", dont_filter=True, meta={"refresh_cache":True})
     else:
         # self.logger.info("Page is cached!")
         pass
Example #54
0
def HTMLparser(page, blog, url):
    title = None
    content = None
    author = None
    datePublished = None
    dateModified = None

    soup = BeautifulSoup(page, 'lxml')
    doc = Document(page)
    title = doc.short_title()
    content = BeautifulSoup(doc.summary(), 'lxml').get_text()
    try:
        application_json_ld = json.loads(
            soup.find('script', {
                'type': 'application/ld+json'
            }).get_text())
    except:
        application_json_ld = None
    if application_json_ld is not None:
        if 'author' in application_json_ld:
            if isinstance(application_json_ld['author'], list):
                author = application_json_ld['author'][0]['name']
            else:
                author = application_json_ld['author']['name']
        if 'datePublished' in application_json_ld:
            datestring = application_json_ld['datePublished']
            datePublished = parse(datestring)
        if 'dateModified' in application_json_ld:
            datestring = application_json_ld['dateModified']
            dateModified = parse(datestring)

    if blog == 'steemit':
        author = soup.find('a', {'class': 'ptc'}).get_text().split(" ")[0]
        datestring = soup.find('span',
                               {'class': 'updated'})['title'].split()[0]
        datePublished = parse(datestring)

    if len(content) < 500:
        return None

    content = content.replace('\n', '')
    return Post(meta={'id': url},
                title=title,
                content=content,
                rawContent=content,
                author=author,
                datePublished=datePublished,
                dateModified=dateModified,
                url=url)
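HTMLparser accepts both JSON-LD author shapes (a single object or a list of objects). A small, self-contained illustration of that branch on hand-written data:

def author_from_json_ld(data):
    # mirrors the branch in HTMLparser: 'author' may be a dict or a list of dicts
    author = data.get('author')
    if author is None:
        return None
    if isinstance(author, list):
        return author[0]['name']
    return author['name']

print(author_from_json_ld({'author': {'name': 'Ada'}}))                     # Ada
print(author_from_json_ld({'author': [{'name': 'Ada'}, {'name': 'Bob'}]}))  # Ada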
def obtenerTextoCompleto(response):

    try:
        TextoCompleto = BeautifulSoup(response, "html.parser").find(
            "div", {"class": "fullpost__cuerpo"})
        TextoCompleto = str(TextoCompleto.contents)
        TextoCompleto = re.sub('</p>.*?<p>', ' ', TextoCompleto)

        TextoCompleto = TextoCompleto.replace("]",
                                              "").replace("[", "").replace(
                                                  ", '\n',", ' ')
        #TextoCompleto = "<br />".join(TextoCompleto.split("\n"))
        return TextoCompleto
    except Exception as e:
        print("Could not retrieve the full text ", e)
Example #56
0
def csv_formater(title, abstract, pmid):
    print("Formating CSV entry from url number", count)
    title = BeautifulSoup(title).text  # remove html tags
    abstract = BeautifulSoup(abstract).text
    pmid = BeautifulSoup(pmid).text

    # handle characters that cause errors for ML-reader
    title = title.replace("[", "").replace("]", "")
    abstract = abstract.replace("=", " equals ")
    #abstract = abstract.replace("\n", " ") # new-line in the paragraph is problematic

    # This regex statement (?<=[A-Za-z0-9()[\]%])\.(?=[A-Za-z()[\]]{2})|(?<=[A-Za-z()[\]%]{2})\.(?=[A-Za-z0-9()[\]])
    # finds all periods without a space after them, as in: (---best of times.It was the worst---),
    # but ignores decimals (12.3, 0.21), abbreviations (N.Y., D.C.), and titles (Dr., Mr.)
    # the replacement value is ". "

    # build the regex as a string in order to loop over the list of punctuation marks.
    # find all punctuation without a following space, but skip numbers and abbreviations.
    punctuation = [".", "!", "?"]
    for punk in punctuation:
        repunk = re.escape(punk)
        punkregex = r"(?<=[A-Za-z0-9()[\]%])" + repunk + r"(?=[A-Za-z()[\]]{2})|(?<=[A-Za-z()[\]%]{2})" + repunk + r"(?=[A-Za-z0-9()[\]])"
        title = re.sub(punkregex, punk + " ", title)
        abstract = re.sub(punkregex, punk + " ", abstract)

    # join-split combo reduces all whitespace to a single space and strips leading/trailing whitespace (including \n and \t)
    title = ' '.join(title.split())
    abstract = ' '.join(abstract.split())

    # remove escape markers from '"Error: Failed to Scrape " + esc_url'; added in 'def parse_url(url):'
    title = title.replace("\\", "")
    abstract = abstract.replace("\\", "")
    pmid = pmid.replace("\\", "")

    entry = [[title, abstract, pmid]]
    return entry
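The period-spacing regex that csv_formater builds is easier to judge in isolation. A standalone check of the same pattern on made-up strings (only the "." case is shown):

import re

punk = "."
repunk = re.escape(punk)
punkregex = r"(?<=[A-Za-z0-9()[\]%])" + repunk + r"(?=[A-Za-z()[\]]{2})|(?<=[A-Za-z()[\]%]{2})" + repunk + r"(?=[A-Za-z0-9()[\]])"

print(re.sub(punkregex, punk + " ", "best of times.It was the worst"))  # space restored after the period
print(re.sub(punkregex, punk + " ", "a dose of 12.3 mg"))               # decimal left untouched
print(re.sub(punkregex, punk + " ", "based in N.Y. since 2001"))        # abbreviation left untouched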
 def parseJSON(self):
     """Parse JSON VUIDs into data struct"""
     # Format of JSON file is:
     # "API": { "core|EXT": [ {"vuid": "<id>", "text": "<VU txt>"}]},
     # "VK_KHX_external_memory" & "VK_KHX_device_group" - extension case (vs. "core")
     for top_level in sorted(self.json_data):
         if "validation" == top_level:
             for api in sorted(self.json_data[top_level]):
                 for ext in sorted(self.json_data[top_level][api]):
                     for vu_txt_dict in self.json_data[top_level][api][ext]:
                         print ("Looking at dict for api:ext entry %s:%s" % (api, ext))
                         vuid = vu_txt_dict['vuid']
                         vutxt = vu_txt_dict['text']
                         # strip asciidoc xref from vu text
                         vutxt = re.sub('&amp;amp;lt;&amp;amp;lt;([^&]*,\\s*|)(.*?)&amp;amp;gt;&amp;amp;gt;', '\\2', vutxt)
                         #print ("%s:%s:%s:%s" % (api, ext, vuid, vutxt))
                         #print ("VUTXT orig:%s" % (vutxt))
                         just_txt = BeautifulSoup(vutxt, 'html.parser')
                         #print ("VUTXT only:%s" % (just_txt.get_text()))
                         num_vuid = vuid_mapping.convertVUID(vuid)
                         self.json_db[vuid] = {}
                         self.json_db[vuid]['ext'] = ext
                         self.json_db[vuid]['number_vuid'] = num_vuid
                         self.json_db[vuid]['struct_func'] = api
                         just_txt = just_txt.get_text().strip()
                         unicode_map = {
                         u"\u2019" : "'",
                         u"\u201c" : "\"",
                         u"\u201d" : "\"",
                         u"\u2192" : "->",
                         }
                         for um in unicode_map:
                             just_txt = just_txt.replace(um, unicode_map[um])
                         self.json_db[vuid]['vu_txt'] = just_txt.replace("\\", "")
                         print ("Spec vu txt:%s" % (self.json_db[vuid]['vu_txt']))
Example #58
-5
def injectid(obj):
    z = LoggedMessage.objects.get(pk=obj.pk)

    p = sanitise_case(z.site, z.text)
    if not p['status']:
        soup = Soup(z.text, 'xml')
        # GET HID
        k = IssuedIdentifier.objects.filter(site=z.site)
        _all = Identifier.objects.exclude(pk__in=k.values('identifier_id'))
        hid = _all[0]
        print p
        case_ = "household_head_health_id" if p['household'] else "health_id"
        case_type = p['form_type']
        c = soup.find(case_)
        mm = "<%s>%s</%s>" % (case_, hid.identifier, case_)
        c = str(c)
        soup = str(soup)
        soup = soup.replace(c, mm)

        soup = soup.replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n", "")
        y = "<%s> %s </%s>" % (case_type, soup, case_type)

        COMMCARE_URL = COMMCARE_LINK % z.site
        print "HID: %s \n" % hid.identifier
        print "COMMCARE_URL: %s \n" % COMMCARE_URL
        print y
        print "=========================================================="
        form = {'data': y,
                'SUBMIT_TO_COMMCARE': SUBMIT_TO_COMMCARE,
                'COMMCARE_URL': COMMCARE_URL}
        if transmit_form(form):
            s = LoggedMessage()
            s.text = y
            s.direction = s.DIRECTION_OUTGOING
            s.response_to = z
            s.site = z.site
            s.save()

            z.status = s.STATUS_SUCCESS
            z.save()

            p = IssuedIdentifier()
            p.status = IssuedIdentifier.STATUS_ISSUED
            p.identifier = hid
            p.site = z.site
            p.save()
        else:
            s = LoggedMessage()
            s.text = y
            s.direction = s.DIRECTION_OUTGOING
            s.response_to = z
            s.site = z.site
            s.save()

            z.status = s.STATUS_ERROR
            z.save()
	def titleFilter(self, pageSoup):
		title = BeautifulSoup(str(pageSoup.find_all(
			'h1', {'class': 'trb_ar_hl_t'}))).get_text().encode(
			'ascii',errors='ignore')
		title = title.replace("\n", "")
		title = title.replace("      ", "")
		return title
Example #60
-25
def parsePage(url):
	r = requests.get(url)
	data = r.text
	soup = BeautifulSoup(data)


	invalid_tags = ['b', 'i', 'u', 'ul','li', 'p','em']
	soup = soup.find(id='primary')



	for tag in invalid_tags:
		for match in soup.findAll(tag):
			match.replaceWithChildren()


	for match in soup.findAll('span'):
		match.replaceWith('')

	for match in soup.findAll('div'):
		match.replaceWith('')


	soup = str(soup)
	soup = soup.replace('<strong>', "%")
	soup = soup.replace('</strong>', "%")
	finalOutput = soup.split('%')

	for n in range(0,4):
		finalOutput[n]=""

	return finalOutput
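parsePage marks <strong> boundaries with '%' and splits on them, so the returned list alternates between plain text and the former <strong> headings. A tiny standalone check of that split on an illustrative fragment (assuming bs4 is installed):

from bs4 import BeautifulSoup

html = '<div id="primary">intro <strong>Title A</strong> body a <strong>Title B</strong> body b</div>'
block = str(BeautifulSoup(html, 'html.parser').find(id='primary'))
chunks = block.replace('<strong>', '%').replace('</strong>', '%').split('%')
print(chunks)   # odd indices hold the headings: ['<div id="primary">intro ', 'Title A', ' body a ', 'Title B', ' body b</div>']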