def parse_wikicode(input_str, outputformat='vcard', language='english'):
    # Load the requested language module, falling back to English.
    try:
        language = getattr(__import__('translation.' + language), language)
    except ImportError:
        language = translation.english
    input_str = html_decode(input_str)
    found = []
    for line in input_str.split('\n*'):
        line = '*' + line
        for cls in [Tag, Vcard, Untagged]:
            try:
                found += cls.parse(line, language)
            except ValueError:
                raise
    if not found:
        # Nothing matched the strict parsers; retry without restrictions.
        for line in input_str.split('\n*'):
            found += Untagged.parse(line, language, restrictive=False)
    if outputformat == 'raw':
        return found
    elif outputformat == 'json':
        return [json.dumps(l) for l in found]
    elif outputformat == 'tag':
        return [Tag.tostring(l, language) for l in found]
    elif outputformat == 'vcard':
        return [Vcard.tostring(l, language) for l in found]
    else:
        raise ValueError('Invalid output format: %s' % outputformat)
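# Hypothetical usage sketch for parse_wikicode. The sample listing text below
# is invented for illustration, and it assumes the Tag/Vcard/Untagged classes
# and the translation package from the surrounding module are importable;
# only parse_wikicode itself comes from the code above.
sample = '* <see name="Old Town Hall" address="Main Square 1"></see>'

raw_entries = parse_wikicode(sample, outputformat='raw')   # plain parse results
vcards = parse_wikicode(sample)                            # default vCard rendering
# parse_wikicode(sample, outputformat='xml') would raise ValueError.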
def get_from_link(input_str):
    input_str = input_str.strip()
    if (input_str.count('\n') <= 1
            and input_str.startswith('http://')
            and 'action=edit' in input_str
            and 'wikivoyage' in input_str):
        input_str = fake_agent_readurl(input_str)
        t = ElementSoup.parse(StringIO(input_str))
        if sys.version_info[:2] < (2, 7):
            # XPath too limited for the bracket syntax;
            # fortunately there seems to be only one textarea
            input_str = t.find(".//textarea").text
        else:
            input_str = t.find(".//textarea[@id='wpTextbox1']").text
        return html_decode(input_str)
    return input_str
def __init__(self, url):
    self.url = url
    html = self.get_html(url)
    # Add the category name
    self.xml_str += '<category name="%s">' % self.get_name(html)
    # Add the XML for each subcategory
    for url_and_image in self.get_subcategory_urls_and_images(html):
        prefix = 'http://www.air-gun.ru/'
        url = prefix + html_decode(url_and_image[0])
        image = prefix + url_and_image[1]
        subcategory = Subcategory(url, image)
        self.xml_str += subcategory.get_xml_str()
    self.xml_str += '</category>'
def main():
    while True:
        rss_urls = ["http://www.otakubot.org/feed/",
                    "http://www.otakubot.org/feed/?paged=2",
                    "http://www.otakubot.org/feed/?paged=3"]
        d = []
        for url in rss_urls:
            d.extend(feedparser.parse(url).entries)
        # Load the list of already-processed posts, if any.
        try:
            already_used = cPickle.load(open('used_links.pkl', 'r'))
        except:
            already_used = []
        rss_count = 0
        for a in d:
            try:
                skip = False
                summary_html = ""
                post_id = a.guid
                html = ""
                if post_id in already_used:
                    rss_count += 1
                    continue
                if DEBUG:
                    already_used.append(post_id)
                    cPickle.dump(already_used, open("used_links.pkl", 'w'))
                    continue
                try:
                    video_rez = utils.html_decode(re.findall('Video: (.*?)\<br />',
                                                             a.content[0]['value'])[0]
                                                  ).split(',')[2].split('×')[1].lstrip()
                except:
                    video_rez = "NONE"
                filename = utils.html_decode(re.findall('Release name: (.*?)\<br />',
                                                        a.content[0]['value'])[0])
                magnet_link = re.findall('(magnet:\?xt=[^\"<]*)',
                                         a.content[0]['value'])
                download_urls = re.findall('<a href="?\'?([^"\'>]*)',
                                           a.content[0]['value'])
                # re.findall() returns a list, so merge it rather than appending the list itself.
                download_urls.extend(magnet_link)
                # Drop up to two leading links that point back at the blog or its image host.
                if "otakubot" in download_urls[0] or "zupimages" in download_urls[0]:
                    download_urls.pop(0)
                if "otakubot" in download_urls[0] or "zupimages" in download_urls[0]:
                    download_urls.pop(0)
                # Strip host-name markers (e.g. "Go4UP") embedded past the URL scheme.
                count = 0
                for url in download_urls:
                    if "Go4UP" in url[20:]:
                        download_urls[count] = url.replace("Go4UP", "")
                    elif "Hugefiles" in url[20:]:
                        download_urls[count] = url.replace("Hugefiles", "")
                    elif "Uploaded" in url[20:]:
                        download_urls[count] = url.replace("Uploaded", "")
                    elif "Torrent" in url[20:]:
                        download_urls[count] = url.replace("Torrent", "")
                    count += 1
                episode_number = utils.get_episode_number(filename)
                series_name = utils.get_new_name(utils.get_series_name(filename, episode_number))
                if series_name == "SKIP":
                    continue
                episode_number = episode_number + utils.get_remove_ep(series_name)
                if episode_number == utils.get_last_ep(series_name):
                    # Is last episode
                    post_title = "{0} Episode {1} Final".format(series_name, episode_number)
                elif not episode_number:
                    # Is movie/ova
                    post_title = "{0}".format(series_name)
                else:
                    # Is normal episode
                    post_title = "{0} Episode {1}".format(series_name, episode_number)
                # CHANGE TO 1
                if episode_number <= 1 or not episode_number:
                    # New series
                    if not utils.get_if_stored(series_name):
                        utils.get_series_info(series_name)
                html = utils.html_download_div(series_name, episode_number, video_rez,
                                               filename, download_urls)
                print "New Post:"
                print post_title
                print
                print "HTML:"
                print html
                already_used.append(post_id)
                break
            except:
                print("~@~@~@~@~@~@error@~@~@~@~@~@~")
        cPickle.dump(already_used, open("used_links.pkl", 'w'))
        time.sleep(15)
def clean_key(self):
    if 'keyword' in self.cleaned_data:
        keyword = self.cleaned_data['keyword']
        return html_decode(keyword)
    else:
        return ''
def clean_key(self):
    if 'key' in self.cleaned_data:
        key = self.cleaned_data['key']
        return html_decode(key)
    else:
        return ''
def __init__(self, id, text, sound, stat=0):
    self._id = id
    self.text = html_decode(text)
    self.sound = sound
    self.stat = stat
def runTest(self):
    res = html_decode(self.test_data)
    self.assertGreater(len(res), len(self.test_data) / 2)
    # The decoded output should contain no remaining HTML entities.
    self.assertNotIn('&lt;', res)
    self.assertEqual(res.count('<see name="'), 4)
    self.assertEqual(res.count('</see>'), 4)
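# The snippets above all rely on an html_decode() helper that is not shown
# here. A minimal sketch of such a helper, assuming it simply unescapes HTML
# entities (the project's real implementation may differ):
try:
    from html import unescape            # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser    # Python 2
    unescape = HTMLParser().unescape

def html_decode(text):
    # Turn entities such as &amp;, &lt; and &#215; back into characters.
    return unescape(text)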