def retrieve_from_syobocal(self):
    """Collect anime titles and their readings from Syoboi Calendar.

    Crawls the category-1 listing, visits each title page whose name passes
    ``is_valid_word``, extracts the yomi (reading), and writes the collected
    entries in MeCab dictionary format tagged with source 'SC'.
    """
    listing_html = web.open_url(SYOBOCAL_START_URL, params={'cat': 1})
    entries = []
    for (url_part, name) in re_title_url.findall(listing_html):
        if not self.is_valid_word(name):
            continue
        page_html = web.open_url(SYOBOCAL_BASE_URL + url_part)
        yomi = ''.join(re_title_yomi.findall(page_html))
        entries.append(self.to_mecab_format(name, yomi, 'SC'))
        time.sleep(INTERVAL)  # throttle requests to be polite to the server
    if entries:
        self.write(entries)
def get_trend_search_query(self, url):
    """Yield search keywords extracted from the <item> titles of a trend feed.

    Bracketed annotations (sumi-kakko) are stripped from each title; a title
    containing a space is split and each part is yielded separately.
    """
    html = web.open_url(url)
    # Pass the parser explicitly: a bare BeautifulSoup(html) guesses one,
    # emits GuessedAtParserWarning, and can pick different parsers on
    # different machines. "html5lib" matches the sibling implementation
    # of this function elsewhere in the project.
    soup = BeautifulSoup(html, "html5lib")
    for item in soup.find_all('item'):
        title = item.find('title')
        keyword = str(title.string).strip()
        keyword = re_sumikakko.sub('', keyword)
        if ' ' in keyword:
            for kwd in keyword.split(' '):
                yield kwd
        else:
            yield keyword
def get_trend_search_query(self, url):
    """Yield individual search keywords parsed from a trend RSS feed."""
    feed = web.open_url(url)
    document = BeautifulSoup(feed, "html5lib")
    for entry in document.find_all('item'):
        raw_title = str(entry.find('title').string).strip()
        cleaned = re_sumikakko.sub('', raw_title)
        # A space-separated title represents several keywords; emit each one.
        if ' ' in cleaned:
            for part in cleaned.split(' '):
                yield part
        else:
            yield cleaned
def give_valentine_present(*arg):
    """Build a Valentine reply payload (text + media file path).

    Roughly 1 time in 4 (randint(0, 11) > 8) the sender's own icon is
    downloaded and run through the style-transfer script to become the
    "chocolate"; otherwise a random chocolate post is fetched from
    Safebooru.
    """
    if random.randint(0, 11) > 8:
        # Rare branch: turn the sender's avatar itself into chocolate.
        original_icon = arg[1]['icon'].replace('_normal', '')
        basename = original_icon.split('/')[-1]
        web.download(original_icon, '/tmp/%s' % (basename))
        command_line = ('%s evaluate.py --checkpoint ../../data/ckpt ' % (PYTHON_EXE_PATH)
                        + '--in-path /tmp/%s --out-path /tmp/%s' % (basename, basename))
        misc.command(command_line, shell=True, allow_err=True,
                     cwd=STYLE_TRANSFER_PATH)
        return {'text': '%nameをチョコにしてやろうか!(゚Д゚)',
                'media[]': '/tmp/%s' % (basename)}
    page_id = random.randint(0, 59)
    feed_xml = web.open_url(SAFEBOORU_URL % page_id)
    markup = BeautifulSoup(feed_xml, 'lxml')
    chosen_post = misc.choice(markup.find_all('post'))
    web.download('https:' + chosen_post['file_url'], '/tmp/present')
    excitement = '!' * random.randint(0, 59)
    return {'text': '%nameにチョコをヽ(´ー`)ノ' + excitement,
            'media[]': '/tmp/present'}
def _get_title(self, url):
    """Return a short, normalized title for *url*.

    Image URLs (by extension) are titled via Google reverse-image-search
    keywords, joining the non-numeric best keywords; other URLs use the
    page's HTML <title>, normalized and shortened. Extensions listed in
    ignore_extensions produce an empty title.
    """
    title = ''
    root, ext = os.path.splitext(url)
    if ext in image_extensions:
        time.sleep(3)  # for avoiding to be treated as spam by Google
        logger.info('Search by google: %s' % url)
        results = google_image.search(url, best_kwds_max_length=18)
        keywords = filter(lambda x: not x.isdigit(), results['best_keywords'])
        title = ''.join(keywords)
    elif ext not in ignore_extensions:  # `not ext in` -> idiomatic `ext not in`
        logger.info('Retrieve web resource: %s' % url)
        html = web.open_url(url)
        soup = BeautifulSoup(html, "html5lib")
        if soup.title and soup.title.string:
            title = soup.title.string
            title = normalize.normalize(title)
            title = self._shorten_title(title)
    return title
def test_open_url():
    """web.open_url should fetch the page and decode its Japanese text."""
    url = 'http://qwerty.on.arena.ne.jp/'
    got = web.open_url(url)
    # Plain assert keeps this consistent with the other open_url test in
    # this file and drops the dependency on nose's assert_true helper.
    assert u'あやしいわーるど' in got
def test_open_url():
    """The BBS page should be fetched and contain its Japanese banner text."""
    page = web.open_url('http://misao.on.arena.ne.jp/cgi-bin/bbs.cgi')
    assert 'あやしいわーるど' in page