def _get_title(_url):
    """
    Fetch the page at ``_url`` and return its cleaned-up title.

    The page is requested with an English ``Accept-Language`` header and
    parsed with ``etree.parse`` using the module-level ``hparser``.  The text
    of the first ``<title>`` element is passed through the host-specific
    cleaner looked up in ``title_cleaners`` (keyed by ``get_host_code(_url)``),
    and finally the characters ``| * [ ] ( ) ~ \\`` are removed.

    NOTE(review): a page without a ``<title>`` element presumably makes
    ``find`` return None and so raises AttributeError here, and a host code
    with no entry in ``title_cleaners`` presumably raises KeyError -- confirm
    against callers before changing either.
    """
    headers = {'Accept-Language': 'en-US,en;q=0.5'}
    request = Request(_url, headers=headers)
    response = urlopen(request)
    try:
        htree = etree.parse(response, hparser)
    finally:
        # Always release the HTTP connection, even when parsing fails.
        response.close()
    raw_title = htree.find(".//title").text
    code = get_host_code(_url)
    title = title_cleaners[code](raw_title)
    # Raw string: same character class as before, without relying on
    # invalid string escapes such as '\|' or '\*'.
    title = re.sub(r'[\|\*\[\]\(\)~\\]', '', title)
    return title
def _get_title(_url):
    """
    Fetch the page at ``_url`` and return its cleaned-up title.

    The page is requested with an English ``Accept-Language`` header and
    parsed with ``etree.parse`` using the module-level ``hparser``.  The text
    of the first ``<title>`` element is passed through the host-specific
    cleaner looked up in ``title_cleaners`` (keyed by ``get_host_code(_url)``),
    and finally the characters ``| * [ ] ( ) ~ \\`` are removed.

    NOTE(review): a page without a ``<title>`` element presumably makes
    ``find`` return None and so raises AttributeError here, and a host code
    with no entry in ``title_cleaners`` presumably raises KeyError -- confirm
    against callers before changing either.
    """
    headers = {'Accept-Language': 'en-US,en;q=0.5'}
    request = Request(_url, headers=headers)
    response = urlopen(request)
    try:
        htree = etree.parse(response, hparser)
    finally:
        # Always release the HTTP connection, even when parsing fails.
        response.close()
    raw_title = htree.find(".//title").text
    code = get_host_code(_url)
    title = title_cleaners[code](raw_title)
    # Raw string: same character class as before, without relying on
    # invalid string escapes such as '\|' or '\*'.
    title = re.sub(r'[\|\*\[\]\(\)~\\]', '', title)
    return title
def get_video_links_from_html(text):
    """
    Collect cleaned video links found in the href attributes of an HTML string.

    Every double-quoted ``href="..."`` value in ``text`` is examined.  A value
    is kept only when ``get_host_code`` returns a truthy code for it, the
    matching entry in ``link_cleaners`` is truthy, and that cleaner returns a
    truthy result for the entity-fixed link.  Returns the list of cleaned
    links in the order they appear in ``text``.
    """
    # a plain regex is enough here; BeautifulSoup would work just as well
    hrefs = re.findall('href="(.*?)"', text)
    found = []
    for href in hrefs:
        host_code = get_host_code(href)
        if not host_code:
            continue
        cleaner = link_cleaners[host_code]
        if not cleaner:
            continue
        cleaned = cleaner(fix_html_entities(href))
        if cleaned:
            found.append(cleaned)
    return found