def TimeCode(resp):
    string = BeautifulSoup(str(resp))
    string = unicode(string.find('b'))
    #print string
    Day = ''
    # 'Thứ 2' .. 'Thứ 7' are Monday..Saturday; 'Chủ nhật' is Sunday
    for i in range(2, 8):
        if u'Thứ ' + str(i) in string:
            Day = str(i)
    if u'Chủ nhật' in string:
        Day = str(8)
    string = string.split(u'tiết ')[1]
    string = string.split(' (LT)')[0]
    # string = 1,2 / 1,2,3 / 3,4,5 / 4,5 ...
    NumOfPeriod = str((len(string) + 1) / 2)
    # NumOfPeriod = 2/3/4
    Period = str(string[0])  # first period of class
    return Day + Period + NumOfPeriod
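# A hypothetical usage sketch (not part of the original source): given a timetable
# cell such as <b>Thứ 3, tiết 1,2,3 (LT)</b>, TimeCode should return '313' --
# day 3 (Tuesday), starting at period 1, spanning 3 periods.
#print TimeCode(u'<b>Thứ 3, tiết 1,2,3 (LT)</b>')   # -> '313' (assumed example input)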
def countWords(c):
    c.execute("SELECT title, content FROM listing")
    listings = c.fetchall()
    for title, content in listings:
        title = title + r' '
        all_content = BeautifulSoup(title + content).getText()
        all_content = all_content.replace('\n', ' ')
        all_content = re.sub(r'START CLTAGS.*END CLTAGS', '', all_content)
        tokens = set(token.lower() for token in all_content.split())
        tokens -= STOP_WORDS
        for word in tokens:
            c.execute("INSERT INTO words (word, counter) "
                      "VALUES (%s, 1) "
                      "ON DUPLICATE KEY UPDATE counter = counter + 1",
                      (word.lower(),))
def parse_item(self, response):
    item = CpuItem()
    item['link'] = response.url
    item['name'] = response.css('span.cpuname').xpath('text()').extract()[0]
    search = {
        'description': u'Description:',
        'other_name': u'Other names:',
        'g3d_mark': u'G3DMark/\$Price:',
        'clock': u'Clockspeed:',
        'core': u'No of Cores:'
    }
    rank = u'Samples:'
    i = 0
    for sel in response.css("table.desc tr")[1].xpath('td'):
        text = sel.extract()
        textSplit = text.split(u'<span style="font-weight: bold;">')
        for ii in textSplit:
            cleantext = BeautifulSoup(ii).text
            for si in search:
                if cleantext.find(search[si]) != -1:
                    item[si] = cleantext.replace(search[si], "")
            if cleantext.find(rank) != -1:
                item['rank'] = cleantext.split(rank)[0]
        i = i + 1
    yield item
def parse_item(self, response):
    item = GpuItem()
    item['link'] = response.url
    item['name'] = response.css('span.cpuname').xpath('text()').extract()[0]
    search = {
        'description': u'Description:',
        #'processzor_modell': u'Videocard Category:',
        'other_name': u'Other names:',
        #'memoria_merete': u'Videocard First Benchmarked:',
        'g3d_mark': u'G3DMark/\$Price:',
        #'memoria_max_seb': u'Overall Rank:',
        #'memoria_foglalat': u'Last Price Change:',
    }
    rank = u'Samples:'
    i = 0
    for sel in response.css("table.desc tr")[1].xpath('td'):
        text = sel.extract()
        textSplit = text.split(u'<span style="font-weight: bold;">')
        for ii in textSplit:
            cleantext = BeautifulSoup(ii).text
            for si in search:
                if cleantext.find(search[si]) != -1:
                    item[si] = cleantext.replace(search[si], "")
            if cleantext.find(rank) != -1:
                item['rank'] = cleantext.split(rank)[0]
        i = i + 1
    yield item
def cleanhtml(raw):
    cleanr = re.compile('<.*?>|\\n')
    raw = re.sub(cleanr, ' ', raw)
    raw = BeautifulSoup(raw).getText()
    # Normalize curly quotes to straight quotes
    raw = raw.replace(u"\u2018", "'").replace(u"\u2019", "'") \
             .replace(u"\u201c", '"').replace(u"\u201d", '"')
    raw = raw.replace(' .', '.').replace(' ,', ',')
    raw = ' '.join(raw.split())
    return raw
def convertIMGBase64(text):
    tags = re.findall("<img.*?>", text)
    for tag in tags:
        filename = BeautifulSoup(tag).findAll("img")[0]['src']
        filetype = filename.split(".")[1]
        file = open("./{}".format(filename), "rb")  # binary mode so the raw image bytes are read intact
        data = file.read()
        file.close()
        data = data.encode("base64")
        new_tag = '<img src="data:image/{};base64,{}">'.format(filetype, data)
        text = text.replace(tag, new_tag)
    return text
def format_zagaz_station_2013(block_station):
    href_string = BeautifulSoup(block_station[0][0])\
                    .find('a', {'href': re.compile('station.php\?id_s*')})['href']
    id_station = re.search('station.php\?id_s=([0-9]*)', href_string).group(1)
    highway = BeautifulSoup(block_station[0][0])\
                .find('a', {'href': re.compile('autoroute.php\?id_a*')})
    if highway:
        highway = highway['title']
    brand_and_name_station = BeautifulSoup(block_station[0][0])('strong')[0].string
    brand_station = brand_and_name_station.split(' - ')[0]
    name_station = brand_and_name_station.split(' - ')[1]
    street_station = BeautifulSoup(block_station[0][1]).findAll(text=True)
    street_station = [str_correct_html(elt).strip() for elt in street_station
                      if str_correct_html(elt).strip()]
    zip_station, city_station = None, None
    if len(block_station[0]) > 2:
        zip_station = BeautifulSoup(block_station[0][2]).p.find(text=True).strip()  # maybe fragile
        city_station = str_correct_html(BeautifulSoup(block_station[0][2])\
                         .find('a', {'href': re.compile('prix-carburant.php*')}).string)
    if block_station[1]:
        comment_station = BeautifulSoup(block_station[1][0]).find('div', {'class': 'station_comm'}).string
    else:
        comment_station = None
    latitude = re.search('Latitude: ([0-9.]*)', block_station[2][0])
    longitude = re.search('longitude: (-?[0-9.]*)', block_station[2][0])
    if latitude and longitude:
        gps_station = (latitude.group(1), longitude.group(1), block_station[2][1])
    else:
        gps_station = (None, None, None)
    ls_zagaz_station = [id_station, brand_station, name_station, comment_station,
                        street_station, zip_station, city_station, gps_station, highway]
    return ls_zagaz_station
def feed_entry_description_terms(description):
    # Get text only with BeautifulSoup
    text = BeautifulSoup(description).getText()
    # Get words by splitting text on whitespace
    words = text.split()
    # Remove words that contain only non-alphanumeric characters
    # Reference : http://stackoverflow.com/questions/1276764/stripping-everything-but-alphanumeric-chars-from-a-string-in-python
    words = [word for word in words if re.sub(r'\W+', '', word)]
    # Convert to lowercase
    words = [word.lower() for word in words]
    return words
def format_zagaz_station(block_station):
    href_string = BeautifulSoup(block_station[0][0])\
                    .find('a', {'href': re.compile('station.php\?id_s*')})['href']
    id_station = re.search('station.php\?id_s=([0-9]*)', href_string).group(1)
    highway = BeautifulSoup(block_station[0][0])\
                .find('a', {'href': re.compile('autoroute.php\?id_a*')})
    if highway:
        highway = highway['title']
    brand_and_name_station = BeautifulSoup(block_station[0][0])('strong')[0].string  # check if other ('strong') with highway?
    brand_station = brand_and_name_station.split(' - ')[0]
    name_station = brand_and_name_station.split(' - ')[1]
    street_station = str_correct_html(BeautifulSoup(block_station[0][1])('p')[0].string)
    zip_station = BeautifulSoup(block_station[0][2])('p')[0].contents[0].strip()
    city_station = str_correct_html(BeautifulSoup(block_station[0][2])\
                     .find('a', {'href': re.compile('prix-carburant.php*')}).string)
    if block_station[1]:
        comment_station = BeautifulSoup(block_station[1][0]).find('div', {'class': 'station_comm'}).string
    else:
        comment_station = None
    latitude = re.search('Latitude: ([0-9.]*)', block_station[2][0])
    longitude = re.search('longitude: (-?[0-9.]*)', block_station[2][0])
    if latitude and longitude:
        gps_station = (latitude.group(1), longitude.group(1), block_station[2][1])
    else:
        gps_station = (None, None, None)
    ls_zagaz_station = [id_station, brand_station, name_station, comment_station,
                        street_station, zip_station, city_station, gps_station, highway]
    return ls_zagaz_station
def baggify(content, c):
    content_text = BeautifulSoup(content).getText()
    content_text = re.sub(r'START CLTAGS.*END CLTAGS', '', content_text)
    tokens = [token.lower() for token in content_text.split()]
    content_bag = []
    for token in tokens:
        if token in STOP_WORDS:
            continue
        c.execute("SELECT counter FROM words where word = %s", (token,))
        count = c.fetchone()
        if count and count[0] > 10:
            content_bag.append(token)
    return r' '.join(content_bag)
def parse_pages():
    blogfile = open("blogatog_page_1.html", "r")
    soup = BeautifulSoup(blogfile.read())
    menu = soup.findAll('div', attrs={'class': 'post-content'})
    # some hackery to extract the number of pages to iterate through
    num_pages = int(soup.findAll('span', attrs={'class': 'page-number'})[0].getText().split("of")[1].strip())
    print(str(num_pages) + " pages")
    outfile = open("_maybe_data.xml", "w", 0)
    convos = etree.Element("Conversations")
    question_counter = 0
    # parse pages till end
    for i in range(1, 9000):
        blogfilename = "blogatog_page_" + str(i) + ".html"
        if not os.path.isfile(blogfilename):
            print("no existy")
            break
        blogfile = open(blogfilename, "r")
        soup = BeautifulSoup(blogfile.read())
        # print(str(i))
        blogtxt = open(blogfilename, "r").read()
        page_soup = BeautifulSoup(blogtxt)
        menu = soup.findAll('div', attrs={'class': 'post-content'})
        for subMenu in menu:
            if len(str(subMenu).split("</b></p><p>")) < 2:
                continue
            # hackery to split the question and answer
            q = str(subMenu).split("</b></p><p>")[0].split("asked: ")[1]
            a = str(subMenu).split("</b></p><p>")[1].replace("</p>\n</div>", "")
            a = html_escape(a)
            # regex to look for anything that looks like "maybe :)"
            if re.search("aybe.*:.*\)", a):
                q = BeautifulSoup(q).getText()
                a = BeautifulSoup(a.split("<div class=\"tags\">")[0]).getText()
                print q
                print a
                print("\n")
                # print("f")
                convo = etree.SubElement(convos, "Conversation")
                convo.set("id", str(question_counter))
                question_counter += 1
                etree.SubElement(convo, "Question").text = q
                etree.SubElement(convo, "Answer").text = a
    xmlout = etree.tostring(convos, pretty_print=True)
    outfile.write(xmlout)
def listProd():
    """From the Leroy Merlin website, return a list of plywood products
    with their characteristics in JSON format."""
    listProduits = []
    url = "http://www.leroymerlin.fr/v3/search/search.do?pageTemplate=Recherche&resultOffset=0&resultLimit=100&resultListShape=SEARCHENGINE_PRODUCT_LIST_PLAIN&facet=PRODUCT&keyword=contre+plaqu%C3%A9&sort=TRI_PAR_PRIX_CROISSANT_ID&intuitiontodo=newSearchAllSite"
    data = urllib.urlopen(url).read()
    soup = BeautifulSoup(data)
    soup = soup.prettify()
    lines = soup.split("\n")
    for i in range(len(lines)):
        line = lines[i]
        chaine_a_chercher = "prd-infos"
        if chaine_a_chercher in line:
            produit = {}
            prodTot = lines[i + 4]
            description = prodTot.split(",")[0]
            descriptionList = description.split(" ")
            typ = descriptionList[7]
            if len(descriptionList) > 9:
                materiau = " ".join(descriptionList[9:])
            dimension = prodTot.split(",")[1]
            dimension = dimension.replace(" ", "")
            dimension = dimension.replace("L", "")
            dimension = dimension.replace("l", "")
            dimension = dimension.replace(".", "")
            longueur = float(dimension.split("x")[0])
            largeur = float(dimension.split("x")[1])
            epaisseur = prodTot.split(",")[2].replace("epais. ", "")
            epaisseur = epaisseur.replace("mm", "")
            surface = (largeur * longueur) / 1000
        if "price-wrapper" in line:
            prix = float(lines[i + 6].replace("€", ""))
            prixSurface = round(prix / surface, 2)
            produit["typ"] = typ
            produit["materiau"] = materiau
            produit["longueur"] = longueur
            produit["largeur"] = largeur
            produit["surface"] = surface
            produit["epaisseur"] = epaisseur
            produit["prix"] = prix
            produit["prixSurface"] = prixSurface
            listProduits.append(produit)
    return listProduits
def convertIMGBase64(text):
    tags = re.findall("<img.*?>", text)
    for tag in tags:
        filename = BeautifulSoup(tag).findAll("img")[0]['src']
        filetype = filename.split(".")[1]
        img = Image.open("./" + filename)
        orig_height = img.size[1]
        orig_width = img.size[0]
        # Downscale anything wider than 550 px, preserving the aspect ratio
        if orig_width > 550:
            percent = 550 / float(orig_width)
            height = int(float(orig_height) * float(percent))
            img = img.resize((550, height), PIL.Image.ANTIALIAS)
            img = img.save(filename)
        file = open("./{}".format(filename), "rb")  # binary mode so the raw image bytes are read intact
        data = file.read()
        file.close()
        data = data.encode("base64")
        new_tag = '<img src="data:image/{};base64,{}">'.format(filetype, data)
        text = text.replace(tag, new_tag)
    return text
def get_school_and_rank(self, outfile):
    rows = self.soup.findAll('tr', {'valign': 'top'})
    for row in rows:
        rank = re.findall(self.numbers_regex,
                          row.find('span').renderContents())
        current_school = row.find('a', {'class': 'school-name'}).contents[0]
        current_school = BeautifulSoup(
            current_school, convertEntities=BeautifulSoup.HTML_ENTITIES)
        current_school = ''.join([
            i if ord(i) < 128 else ' '
            for i in current_school.contents[0]
        ])
        current_school = ' '.join(
            [word.lower() for word in current_school.split(' ')])
        print current_school + ' ' + str(rank)
        json.dump(dict(zip([current_school], rank)), outfile)
        outfile.write('\n')
def scrape(start_page=1):
    global query, max_pages, data_dir
    for n in xrange(start_page, max_pages):
        dirname = os.path.join(data_dir, str(n))
        if os.path.exists(dirname):
            continue
        print "Retrieving page set: ", n, "...",
        os.makedirs(dirname)
        url = query.replace('page=1', 'page=%d' % n)
        data = json.load(urlopen(url))
        i = 1
        for x in data['response']['results']:
            fields = x['fields']
            headline = fields['headline'].replace('\n', '')
            raw_body = fields['body']
            body = BeautifulSoup(raw_body, smartQuotesTo=None).getText('\n')
            body_cleaned = body.split('\n\n\n\n\n')[0]
            save(dirname, i, headline, body_cleaned)
            i += 1
        print 'done.'
        sleep(1)
def update_one(self, show_ccn, link, start_time, duration):
    duration = timedelta(minutes=duration)
    # Load detail show page and check if the latest data is updated
    page = urllib2.urlopen(link)
    soup = BeautifulSoup(page)
    update_time = datetime.strptime(soup.em.string.replace('-', ''), DATE_FORMAT_B)
    if not self.is_latest(show_ccn, update_time):
        # Get H1 tag that includes title data and imdb link data
        h1 = soup.h1.a
        title = h1.string
        # Convert normal imdb link to mobile version
        imdb_mobilelink = "http://m.imdb.com/title/tt"
        imdb_tt_number = h1['href'][27:]
        imdb_link = imdb_mobilelink + ('0' * (7 - len(imdb_tt_number))) + imdb_tt_number
        # Get CSV representation of episode data from TVRage
        csv_link = self.find_csv_link(soup)
        csv_page = urllib2.urlopen(csv_link)
        # Convert CSV string to list of episodes
        csv_string = BeautifulSoup(csv_page).pre.string.replace('\r\n', '')
        csv_ep_list = csv_string.split('\n')[1:]
        ep_list = []
        # Examine each episode to classify
        for csv_ep in csv_ep_list:
            ep_info_split = csv_ep.split(',')
            ep_info = []
            for ep_info_item in ep_info_split:
                if len(ep_info_item) > 0:
                    if (ep_info_item[0] == '"') and (ep_info_item[-1] == '"'):
                        ep_info_item = ep_info_item[1:-1]
                ep_info.append(ep_info_item)
            idx, season, num, pcode, airdate = ep_info[:5]
            ep_title = ','.join(ep_info[5:-2])
            if ep_info[-1] == 'n':
                is_special = False
            else:
                is_special = True
            if idx:
                if idx.isdigit():
                    idx = int(idx)
                else:
                    self.logger.error('Index "%s" must be a number - %s for %s'
                                      % (idx, str(csv_ep), title))
            else:
                if is_special:
                    idx = 0
                else:
                    self.logger.error('Only special episodes can have an empty index - %s for %s'
                                      % (str(csv_ep), title))
                    idx = -1
            if season.isdigit():
                season = int(season)
            else:
                self.logger.error('Season "%s" must be a number - %s for %s'
                                  % (season, str(csv_ep), title))
                season = -1
            if num.isdigit():
                num = int(num)
            else:
                if is_special != True:
                    self.logger.error('Episode number "%s" must be a number - %s for %s'
                                      % (num, str(csv_ep), title))
                    num = -1
                else:
                    num = 0
            airdate = str(airdate.replace('"', ''))
            date_fmt = re.compile('\d\d/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d\d')
            if airdate == 'UNKNOWN':
                air_dt_tuple = [9999, 12, 31, 23, 59]
            elif date_fmt.match(airdate):
                # Combine timeslot and date to make datetime object and convert timezone to kst
                kst_air_dt = self.dc.convert_tz('kst', (str(airdate), start_time))
                kst_end_dt = kst_air_dt + duration
                air_dt_tuple = self.dc.datetime_to_tuple(kst_air_dt)
                ep_ccn = show_ccn + ep_title + str(idx)
                if kst_air_dt > self.now:
                    status = 'yet'
                    self.next.update({'ccn': ep_ccn},
                                     {'$set': {'title': title, 'ad': air_dt_tuple}})
                elif kst_end_dt < self.now:
                    status = 'aired'
                else:
                    status = 'airing'
            elif airdate == 'UNAIRED':
                air_dt_tuple = [0000, 1, 1, 0, 0]
            else:
                air_dt_tuple = [-1, -1, -1, -1, -1]
            # Build up episode information list.
            ep = [idx, season, num, pcode, air_dt_tuple, ep_title, is_special]
            ep_list.append(ep)
        # Update detail information about the show
        self.shows.update({'ccn': show_ccn}, {'$set': {'episodes': ep_list}})
        self.update_timestamp(show_ccn, update_time)
        # Upsert show into index
        self.index.update({'ccn': show_ccn},
                          {'$set': {'title': title, 'imdb_m': imdb_link}},
                          upsert=True)
        self.temp_index.append([show_ccn, title, imdb_link])
        self.logger.info('Episodes Updated for ' + show_ccn + ' titled with ' + title)
    else:
        # Data is the latest, so no need to update
        self.logger.info('Episodes for ' + show_ccn + ' is latest.')
def decode_title(self, title):
    new_title = BeautifulSoup(title.encode('utf-8'), convertEntities="html").prettify()
    if '\n' in new_title:
        new_title = new_title.split('\n')[0]
    return new_title
def lake_union_weather():
    url_to_use = 'https://lakeunionweather.info'
    page = requests.get(url_to_use)
    soup = BeautifulSoup(page.content)
    try:
        header_data = soup.findAll("div", {"id": "Header"})[0]
        atmosphere_data = soup.findAll("table", {"id": "WeatherTable"})[0]
        water_data = soup.findAll("table", {"id": "WaterTable"})[0]
    except IndexError:
        pass

    # First let's get the date and time out of this
    info_date = None
    date_data = header_data.findAll('h4')[0]
    date_data = BeautifulSoup("{}".format(date_data)).getText()
    date_string = date_data.split('recorded on')[1].strip()
    info_date = datetime.strptime(date_string, "%d %b %Y %I:%M %p")

    # This is a gross way to get all the data from the table, but so it goes
    air_temp_f = None
    wind_chill_f = None
    avg_windspeed_dir = None
    avg_windspeed_mph = None
    for tr in BeautifulSoup("{}".format(atmosphere_data)).findAll('tr')[1:]:
        ths = BeautifulSoup("{}".format(tr.findAll('th')[0])).getText()
        tds = BeautifulSoup("{}".format(tr.findAll('td')[0])).getText()
        if ths.find('Temperature') >= 0:
            air_temp_f = float(tds.split('°F')[0])
        elif ths.find('Wind Chill') >= 0:
            wind_chill_f = float(tds.split('°F')[0])
        elif ths.find('Av. Windspeed') >= 0:
            avg_windspeed_mph = float(tds.split('MPH')[0].strip())
            avg_windspeed_dir = tds.split('from the')[1].strip()

    # This is a gross way to get all the data from the table, but so it goes
    water_temp_f = None
    for tr in BeautifulSoup("{}".format(water_data)).findAll('tr')[1:]:
        tds = tr.findAll('td')
        if float(BeautifulSoup("{}".format(tds[0])).getText()) < 5:
            water_temp_f = float(BeautifulSoup("{}".format(tds[1])).getText())

    # Now let's find the time diff when we got this
    if info_date is None:
        time_string = ""
    else:
        # Need to make this aware of the time zone
        tz = pytz.timezone('US/Pacific')
        latest_date_tz = tz.localize(info_date)
        time_diff = datetime.now(tz) - latest_date_tz
        # time_diff = datetime.now() - latest_date_water_temp
        if time_diff.days > 0:
            hours_diff = time_diff.days * 24
            hours_diff += time_diff.seconds / 60 / 60
        else:
            hours_diff = time_diff.seconds / 60 / 60
        time_string = " about {} hours ago".format(hours_diff)

    if air_temp_f is None and water_temp_f is None:
        # This means we didn't find anything
        retval = "I'm sorry, I couldn't find any recent data about the weather on lake union"
    else:
        retval = "Last known conditions on lake union include: "
        num_values = 0
        if water_temp_f is not None:
            retval += "Water temperature of {:.0f} degrees fahrenheit".format(
                round(water_temp_f))
            num_values += 1
        if air_temp_f is not None:
            if num_values > 0:
                retval += ", and "
            retval += "Air temperature of {:.0f} degrees fahrenheit, ".format(
                round(air_temp_f))
            retval += "Wind chill of {:.0f} degrees fahrenheit, ".format(
                round(wind_chill_f))
            retval += "wind speed of {:.0f} miles per hour ".format(
                round(utils.mps_to_mph(avg_windspeed_mph), 1))
            retval += "coming from the {}".format(
                utils.compass_to_words(avg_windspeed_dir))
        retval += "{}".format(time_string)
    return retval
def getNewLines(self):
    if self.newlines is None:
        rawcontent = urllib2.urlopen(self.url).read()
        newcontent = BeautifulSoup(rawcontent).prettify()
        self.newlines = newcontent.split("\n")
    return self.newlines
def get_links_list(url):
    response = requests.get(url, headers=headers)
    # response.status
    htm = response.text
    htm = htm.encode('iso-8859-1', 'ignore')
    htm = BeautifulSoup(htm)
    plot = htm.findAll('h1', attrs={'class': re.compile('sporttitle')})
    plot = parser.unescape(
        str(plot).replace('\t', '').replace('\n', '').replace('[', '').replace(
            ']', '').decode('utf-8').strip())
    plot = re.sub(r'<[^>]*>', r'', plot)
    htm = str(htm)
    if re.search(get_localized_string(T_ED_RESUL), htm):
        htm = re.split(get_localized_string(T_ED_RESUL), htm, 1)[1]
        htm = htm.split('<div id="comblockabs">', 1)[0]
        htm = re.sub(r'\t', r'', ''.join(htm))
        htm = re.sub(r'\n', r'', htm)
        htm = parser.unescape(htm.decode('utf-8').strip())
        query = """.+?<b>(.+?)</b>.+?"""
        elinks = re.compile(query, re.DOTALL).findall(htm)
        elinks[0] = '[COLOR lightskyblue]' + get_localized_string(T_ED_RESUL) + elinks[0] + '[/COLOR]'
        for el in elinks:
            image = media + '/33054.png'
            list_item = xbmcgui.ListItem(label=el)
            list_item.setArt({
                'fanart': addonID.getAddonInfo('fanart'),
                'icon': image,
                'thumb': image,
                'poster': image
            })
            list_item.setInfo('video', {'plot': plot})
            list_item.setProperty('IsPlayable', 'false')
            k_url = ''
            listing.append((k_url, list_item, False))
    elif re.search('AceStream Links', htm):
        htm = htm.split('<span class="lnkt">AceStream Links</span>', 1)[-1]
        htm = htm.split('<div id="comblockabs">', 1)[0]
        htm = re.sub(r'\t', r'', ''.join(htm))
        htm = re.sub(r'\n', r'', htm)
        htm = re.sub(r'<td width="16">', r'\n', htm)
        htm = parser.unescape(htm.decode('utf-8').strip())
        query = """<img title=".+?/linkflag/(.+?).png" />.+?class="bitrate".+?">(.+?)/td>.+?<a href="acestream:(.+?)">.+?"""
        elinks = re.compile(query, re.DOTALL).findall(htm)
        for el in elinks:
            image = media + "/flags/" + el[0] + ".gif"
            list_item = xbmcgui.ListItem(
                label='[B]Audio: ' + check_audio_lang(el[0]) +
                      ', Bitrate: AceStream ' + el[1].replace('<', '') + '[/B]')
            list_item.setArt({
                'fanart': addonID.getAddonInfo('fanart'),
                'icon': image,
                'thumb': image,
                'poster': image
            })
            list_item.setInfo('video', {'plot': plot})
            list_item.setProperty('IsPlayable', 'true')
            k_url = ('plugin://program.plexus/?mode=1&url=acestream:' + el[2] +
                     '&name=[B]Audio: ' + check_audio_lang(el[0]) +
                     ', Bitrate: AceStream ' + el[1].replace('<', '') + '[/B]')
            listing.append((k_url, list_item, False))
    else:
        image = media + '/33056.png'
        list_item = xbmcgui.ListItem(
            label='[I][B]' + get_localized_string(T_NO_LSTRM) + '[/B][/I]')
        list_item.setArt({
            'fanart': addonID.getAddonInfo('fanart'),
            'icon': image,
            'thumb': image,
            'poster': image
        })
        list_item.setInfo('video', {'plot': plot})
        list_item.setProperty('IsPlayable', 'false')
        k_url = ''
        listing.append((k_url, list_item, False))
    if len(listing) < 1:
        image = media + '/33057.png'
        list_item = xbmcgui.ListItem(label='¡¡¡GRFTJX!!! ¡¡¡GRMBLFJ!!!')
        list_item.setArt({
            'fanart': addonID.getAddonInfo('fanart'),
            'icon': image,
            'thumb': image,
            'poster': image
        })
        list_item.setInfo('video', {'plot': plot})
        list_item.setProperty('IsPlayable', 'false')
        k_url = ''
        listing.append((k_url, list_item, False))
    return listing
# Debug Variable
debug = False

# Register Firefox and Microsoft Edge as available browsers
ffpath = 'C:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe'
webbrowser.register('firefox', None, webbrowser.BackgroundBrowser(ffpath), 1)
mepath = 'edge.bat'
webbrowser.register('edge', None, webbrowser.BackgroundBrowser(mepath), 2)

# Retrieve Array of Words to use from Long Wikipedia Article
random_words = urllib2.urlopen(
    'https://en.wikipedia.org/wiki/1918_New_Year_Honours').read()
clean_words = BeautifulSoup(random_words).text
words = []
for word in clean_words.split():
    words.append(word)
print("Length of Array of Words is " + str(len(words)))

# Function to search Bing and acquire points
def search_bing():
    # randint is inclusive on both ends, so cap at len(words) - 1 to stay in range
    word1 = random.randint(0, len(words) - 1)
    word2 = random.randint(0, len(words) - 1)
    word3 = random.randint(0, len(words) - 1)
    #webbrowser.open_new('https://www.bing.com/search?q='+words[word1]+'+'+words[word2]+'+'+words[word3])
    open_curr_tab('https://www.bing.com/search?q=' + unidecode(words[word1]) +
                  '+' + unidecode(words[word2]) + '+' + unidecode(words[word3]))
    print("Done")
def parse_store_detail(self, response):
    hxs = HtmlXPathSelector(response)
    item = KoubeiStoreItem()
    # Url
    item['link_url'] = response.url
    match = self.city_pattern.match(response.url)
    if match:
        item['city'] = match.group(1)
    # Bread Crumb
    crumb_elems = hxs.select("//div[@class='crumb k2-fix-float']/*").extract()
    if crumb_elems:
        item['bread_crumb'] = u'\xbb'.join([BeautifulSoup(c).text for c in crumb_elems])
    # Name
    name_elem = hxs.select("//input[@id='store-full-name']/@value").extract()
    if name_elem:
        item['name'] = name_elem[0]
    # Address
    address_elem = hxs.select("//input[@id='store-address']/@value").extract()
    if address_elem:
        item['address'] = address_elem[0]
    # Telephone
    tel_elem = hxs.select("//input[@id='store-tel']/@value").extract()
    if tel_elem:
        item['tel'] = tel_elem[0]
    # Average Cost
    avg_elem = hxs.select("//div[@class='store-info-card']//li/text()").extract()
    for text in avg_elem:
        if text.startswith("人均".decode('utf-8')):
            item['avg_cost'] = text.split(u'\uff1a')[1]
            break
    # Rating
    rating_elem = hxs.select("//div[@class='store-free-title k2-fix-float']/p/b/text()").extract()
    if rating_elem:
        item['rating'] = rating_elem[0]
        item['n_rating'] = int(rating_elem[1])
    # Detail
    detail_elem = hxs.select("//div[@class='detail-main']/ul/li").extract()
    for elem in detail_elem:
        text = BeautifulSoup(elem).find('label').text
        if text.startswith('网站地址'.decode('utf-8')):
            item['url'] = text.split(u'\uff1a')[1].strip()
        if text.startswith('店铺标签'.decode('utf-8')):
            item['tag_list'] = [a.text for a in BeautifulSoup(elem).findAll('a')]
    # Description
    desc_elem = hxs.select("//div[@class='detail-intro']/div/text()").extract()
    if desc_elem:
        item['description'] = desc_elem[0].strip()
    # Promote
    promote_elems = hxs.select("//div[@id='promote-more']//p").extract()
    promotes = []
    for elem in promote_elems:
        name = BeautifulSoup(elem).find('a').text.strip()
        count = int(BeautifulSoup(elem).find('span').text[1:-1])
        promotes.append((name, count))
    if promotes != []:
        item['promote_list'] = promotes
    # Impress
    impress_elems = hxs.select("//div[@id='impress-more']//span/text()").extract()
    if impress_elems:
        item['impress_list'] = [imp.strip() for imp in impress_elems]
    #print "PARSING : %s | %s | %s | %s" % (item['name'], item['tel'], item['address'], item['avg_cost'])
    return item
# This is a very simple counter to check how many characters and words there are
# in the file by simply joining together all of the <plaintext /> elements.
import re
from BeautifulSoup import BeautifulSoup

file = "output_pretty.xml"
f = open(file, 'r+')
f = f.readlines()
f = ''.join(map(str.strip, f))
f = BeautifulSoup(f)
f = f.findAll('plaintext')
g = []
for item in f:
    g.append(str(item))
f = ' '.join(g)
print len(f)
f = f.split(' ')
print len(f)
import random
import urllib2
import webbrowser
import telnetlib
import time
from BeautifulSoup import BeautifulSoup

# Debug Variable
debug = False

# Registers Firefox as an available browser
ffpath = 'C:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe'
webbrowser.register('firefox', None, webbrowser.BackgroundBrowser(ffpath), 1)

# Retrieve Array of Words to use from Long Wikipedia Article
random_words = urllib2.urlopen('https://en.wikipedia.org/wiki/1918_New_Year_Honours').read()
clean_words = BeautifulSoup(random_words).text
words = []
for word in clean_words.split():
    words.append(word)
print("Length of Array of Words is " + str(len(words)))

# Function to search Bing and acquire points
def search_bing():
    # randint is inclusive on both ends, so cap at len(words) - 1 to stay in range
    word1 = random.randint(0, len(words) - 1)
    word2 = random.randint(0, len(words) - 1)
    word3 = random.randint(0, len(words) - 1)
    #webbrowser.open_new('https://www.bing.com/search?q='+words[word1]+'+'+words[word2]+'+'+words[word3])
    open_curr_tab('https://www.bing.com/search?q=' + words[word1] + '+' + words[word2] + '+' + words[word3])
    print("Done")

# Establish a Telnet session to open Firefox so all searches are contained in one tab.
HOST = 'localhost'
PORT = 4242
try:
    mode = args.get('mode', None)
except (SyntaxError, TypeError) as e:
    xbmc.log(msg='Error: %s' % str(e), level=xbmc.LOGERROR)

if mode is None:
    li = ''
    response = requests.get(url, headers=headers)
    # response.status
    htm = response.text
    htm = htm.encode('iso-8859-1', 'ignore')
    htm = BeautifulSoup(htm)
    htm = str(htm)
    htm = htm.split('<div id="aul">', 1)[-1]
    htm = htm.split('<a href="/es/majorcompetitions/">', 1)[0]
    htm = [line for line in htm.split('\n') if '<a class="main" ' in line]
    htm = re.sub(r'\t', r'', ''.join(htm))
    htm = re.sub(r'</td>', r'</td>\n', htm)
    htm = re.sub(r'</a><td background=', r'</a>\n<td background=', htm)
    htm = re.sub(r'(?m)^<td background=.*\n?', r'', htm)
    htm = parser.unescape(htm.decode('utf-8').strip())
    query = """<a class=.+?href="(.+?)".+?<b>(.+?)</b>.+?"""
    sports = re.compile(query, re.DOTALL).findall(htm)
    #print (htm)
    #print (sports)
    for s in sports:
for page in urlList:
    if docIDCounter > retrieveLimit:
        break  # quits crawling if retrieval limit is reached
    try:
        # ---------- Page Crawler (gets words and links from each page) ----------
        soup = ""
        browse.open(page)
        if page.endswith(".txt"):
            soup = browse.response().read()
        else:
            # if it can't be parsed, assumed to be a binary file or 404
            soup = BeautifulSoup(browse.response().read())
            soup = soup.getText()
        hashTest = hashlib.md5(soup.encode('utf-8')).hexdigest()
        if hashTest not in duplicateDetect:
            duplicateDetect.append(hashTest)
            wordsInPage = soup.split()
            if not page.endswith(".txt"):
                for link in browse.links():
                    tempURL = urlparse.urljoin(link.base_url, link.url)
                    # BELOW: gets rid of duplicate urls resulting from index.html/index.htm
                    if tempURL.endswith("index.html"):
                        tempURL = tempURL.replace("index.html", "")
                    elif tempURL.endswith("index.htm"):
                        tempURL = tempURL.replace("index.htm", "")
                    if tempURL not in urlList:
                        if tempURL.startswith(baseUrl):
                            # checks robots.txt, necessary because of unusual robots.txt location
                            if robots.can_fetch("*", "/" + link.url):
                                urlList.append(tempURL)
def get_event_list(url):
    li = ''
    response = requests.get(url, headers=headers)
    # response.status
    htm = response.text
    htm = htm.encode('iso-8859-1', 'ignore')
    htm = BeautifulSoup(htm)
    image = url.split('/')
    image = media + "/sports/" + image[5] + ".sport.png"
    plot = htm.findAll('span', attrs={'class': re.compile('sltitle')})
    plot = parser.unescape(
        str(plot).replace('\t', '').replace('\n', '').replace('[', '').replace(
            ']', '').decode('utf-8').strip())
    plot = re.sub(r'<[^>]*>', r'', plot)
    htm = str(htm)
    tday = strftime("%-d de %B, %A ", localtime())
    htm = htm.split('<span class="sltitle">', 1)[-1]
    htm = htm.split('<a href="/es/archive/">', 1)[0]
    htm = re.sub(r'\t', r'', ''.join(htm))
    htm = re.sub(r'\n', r'', htm)
    htm = re.sub(r'/icons/', r'\n/icons/', htm)
    htm = re.sub(r'</span>', r'</span>\n', htm)
    htm = [line for line in htm.split('\n') if '/icons/' in line]
    htm = '\n'.join(htm)
    htm = parser.unescape(htm.decode('utf-8').strip())
    query = """/icons/(.+?)".+?<a class=.+?href="(.+?)">(.+?)</a>.+?"evdesc">(.+?)<.+?>(.+?)</span>"""
    events = re.compile(query, re.DOTALL).findall(htm)
    events = list(dict.fromkeys(events))
    # print (htm2)
    # print (events)
    for e in events:
        #image = "http://cdn.livetvcdn.net/img/icons/" + e[0]
        hrefs = urlbase + e[1]
        event = e[2]
        time = change_date_format(e[3])
        mins = 1440 - hms_to_m(str(datetime.datetime.now().time())[:-7])
        tnow = datetime.datetime.now() + datetime.timedelta(minutes=30)
        tday = datetime.datetime.now() + datetime.timedelta(minutes=mins)
        desc = e[4]
        # desc_image = unicodedata.normalize('NFD', desc[1:-1]).encode('ascii', 'ignore')
        #
        # response_image = google_images_download.googleimagesdownload()
        # arguments_image = {
        #     "keywords": desc_image[1:-1],
        #     "suffix_keywords": "logo",
        #     "limit": 1,
        #     "format": "jpg",
        #     "output_directory": "storage",
        #     #"image_directory": "pictures",
        #     "no_directory": True,
        #     "no_download": True
        # }
        # absolute_image_paths = response_image.download(arguments_image)
        # image = absolute_image_paths[desc_image[1:-1]+' logo'][0]
        if time < tnow:
            time = time.strftime("%d/%m/%y %H:%M")
            url = build_url({'mode': 'folder', 'foldername': hrefs})
            li = xbmcgui.ListItem('[COLOR lightskyblue](' + time + ')[/COLOR] [B]' + event +
                                  '[/B] [COLOR lightseagreen]' + desc + '[/COLOR]')
            li.setArt({
                'fanart': addonID.getAddonInfo('fanart'),
                'icon': image,
                'thumb': image,
                'poster': image
            })
            li.setInfo('video', {'plot': plot})
            xbmcplugin.addSortMethod(handle=addon_handle, sortMethod=xbmcplugin.SORT_METHOD_LABEL)
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li, isFolder=True)
        elif time < tday:
            time = time.strftime("%d/%m/%y %H:%M")
            url = ''
            li = xbmcgui.ListItem('[I][COLOR lightskyblue](' + time + ')[/COLOR] [B]' + event +
                                  '[/B] [COLOR lightseagreen]' + desc + '[/COLOR][/I]')
            li.setArt({
                'fanart': addonID.getAddonInfo('fanart'),
                'icon': image,
                'thumb': image,
                'poster': image
            })
            li.setInfo('video', {'plot': plot})
            xbmcplugin.addSortMethod(handle=addon_handle, sortMethod=xbmcplugin.SORT_METHOD_LABEL)
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li, isFolder=False)
    if li == '':
        image = media + '/33056.png'
        url = ''
        li = xbmcgui.ListItem('[I][B]' + get_localized_string(T_NO_LSTRM) + '[/B][/I]')
        li.setArt({
            'fanart': addonID.getAddonInfo('fanart'),
            'icon': image,
            'thumb': image,
            'poster': image
        })
        li.setInfo('video', {'plot': plot})
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li, isFolder=False)
    xbmcplugin.endOfDirectory(handle=addon_handle, succeeded=True)