def get_xpath_text(self, dom_item, path):
    try:
        result = dom_item.xpath(path)
        if len(result) == 1:
            result = result[0]
        etree.tostring(result)
        return etree.tostring(result)
    except Exception, e:
        print e
        self.logger.error("Wrong XPath - %s" % path)
        return None
def scrappURL(link):
    all = requests.get(link)
    treeObj = lxml.html.fromstring(all.text)
    title = treeObj.xpath("//div[@id='mh_lesson_page']")[0].xpath('.//h1')[0]
    content = treeObj.xpath("//div[@id='mh_lesson_page']")[0].xpath('.//p')[0]
    # slicing strips the surrounding <h1>...</h1> / <p>...</p> tags from the serialized markup
    title = etree.tostring(title, with_tail=False)[4:-5]
    content = etree.tostring(content, with_tail=False)[3:-4]
    print(title)
    if not Dashbord.objects.filter(title=title):
        print('%s saved' % title)
        print(link)
        row = Dashbord.objects.create(title=title, content=content, url=link)
def get_xpath_text(self, dom_item, path):
    '''
    - Try to get text from the dom model according to the path.
    '''
    try:
        result = dom_item.xpath(path)
        if len(result) == 1:
            result = result[0]
        etree.tostring(result)
        return etree.tostring(result)
    except Exception, e:
        self.logger.error("Wrong XPath - %s" % path)
        return None
def get_node_note_by_id(self, node_id=None):
    node = self.get_node_by_id(node_id)
    if node is None:
        raise self.FreeplaneNodeNotExisting
    else:
        richcontent_node = node.find(self.T_RICHCONTENT)
        if richcontent_node is None:
            self.logger.debug('get_node_note_by_id: No richcontent tag under {0}'.format(node_id))
            a = None
        else:
            if self.A_TYPE in richcontent_node.attrib:
                if richcontent_node.attrib[self.A_TYPE] == self.V_TYPE_NOTE:
                    note_elements = richcontent_node.find('html')
                    a = ETH.tostring(note_elements)
                else:
                    self.logger.debug(
                        'get_node_note_by_id: richcontent tag under {0} is not of type note'.format(node_id))
                    a = None
            else:
                self.logger.debug('get_node_note_by_id: richcontent tag exists but no type defined')
                raise self.FreeplaneRichContentTagNotProperlyDefined
        if a is not None:
            a = a.decode('ascii')
        return a
def get_apocopes(list_urls):
    apo_urls = []
    for list_url in list_urls:
        for node in parse(list_url).findall('.//div[@class="mw-category"].//li/a[@href]'):
            apo_urls.append((node.text, 'http://fr.wiktionary.org' + node.attrib['href']))
    with codecs.open('wiki.log', 'w', 'utf-8') as log:
        apos = {}
        for short, url in sorted(apo_urls):
            short = short.lower()
            if short not in apos:
                apos[short] = []
            fulls = apos[short]
            for node in parse(url).findall('.//dl/dd'):  # /i/a[@href]
                text = etree.tostring(node, encoding='unicode', method="text").lower().replace('\n', '')
                fulls_sub = []
                for match in extractor.findall(text):
                    for full in match:
                        full = cleaner.sub('\\1', full)
                        if not full:
                            continue
                        fulls_sub.append(full)
                log.write(delim.join([short, str(fulls_sub), text]) + newline)
                if not fulls_sub:
                    print short, '=>', text
                    continue
                for full in fulls_sub:
                    if full not in fulls:
                        fulls.append(full)
    return apos
def extract(self, selector, host='', with_body_html=False):
    body = selector.xpath('//body')[0]
    for node in iter_node(body):
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {'ti': density_info['ti'], 'lti': density_info['lti'],
                     'tgi': density_info['tgi'], 'ltgi': density_info['ltgi'],
                     'node': node, 'density': text_density, 'text': ti_text,
                     'images': images_list, 'text_tag_count': text_tag_count,
                     'sbdi': sbdi}
        if with_body_html:
            body_source_code = unescape(etree.tostring(node).decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    std = self.calc_standard_deviation()
    self.calc_new_score(std)
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
def all_purpose_template(parse, title):
    lines = parse.xpath('/html//text()')
    content = "".join(lines)
    title = title.encode("gbk", errors="replace").decode("gbk", errors="replace").replace("?", "")
    open("test.html", "w", encoding="utf-8").write(etree.tostring(parse).decode("utf-8"))
    # keep only the text between the title and the "评论" ("comments") marker
    content = content.split(title, 2)[2].split("评论")[0].encode("utf-8", errors="replace").decode("utf-8", errors="replace")
    return content
def extract(self, selector, host='', body_xpath='', with_body_html=False, use_visiable_info=False):
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    use_visiable_info = use_visiable_info or config.get('use_visiable_info', False)
    if body_xpath:
        body = selector.xpath(body_xpath)[0]
    else:
        body = selector.xpath('//body')[0]
    for node in iter_node(body):
        if use_visiable_info:
            if not node.attrib.get('is_visiable', True):
                continue
            coordinate_json = node.attrib.get('coordinate', '{}')
            coordinate = json.loads(coordinate_json)
            if coordinate.get('height', 0) < 150:  # the main content block should be taller than 150px
                continue
        node_hash = hash(node)
        density_info = self.calc_text_density(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        host = host or config.get('host', '')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi
        }
        if use_visiable_info:
            node_info['is_visiable'] = node.attrib['is_visiable']
            node_info['coordinate'] = node.attrib.get('coordinate', '')
        if with_body_html or config.get('with_body_html', False):
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    self.calc_new_score()
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
def FROM_HTML(cls, r, c, x):
    '''Gets the HTML source of an HTML element as string.

    Args:
        x (lxml.html.HtmlElement): The element to get the source of.

    Returns:
        str: HTML source of the element.
    '''
    return etree.tostring(x, encoding='unicode').strip()
def string(self):
    """
    return string of element
    :return:
    """
    return etree.tostring(self, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
def guess_htmlcontent(self, content_xpath):
    htmlcontent = ''
    try:
        htmlcontent = self.element.xpath(content_xpath)
    except Exception as e:
        pprint(f"error:{e.args},xpath:{content_xpath}", indent=4)
    if htmlcontent:
        htmlcontent = "".join(etree.tostring(htmlcontent[0], encoding='utf-8').decode("utf-8"))
    return htmlcontent
def visit_page(url, path="", save=False):
    content = {"title": "", "url": "", "keywords": "", "links": [], "body": ""}
    if urlparse(url).netloc == "":
        #print "partial:", url
        return content
    if urlparse(url).netloc in BANNED:
        #print "banned:", url
        return content
    try:
        resp = urllib2.urlopen(url)
    except URLError as e:
        print e
        return content
    if resp.getcode() != 200:
        print "Bad response: ", resp.getcode()
        return content
    # resolves URL
    content["url"] = resp.url
    html = resp.read()
    try:
        tree = etree.parse(StringIO.StringIO(html), parser)
    except:
        print "LXML error"
        return content
    content["title"] = tree.xpath("//title//text()")
    if len(content["title"]) > 0:
        content["title"] = content["title"][0].strip()
    content["links"] = tree.xpath("//body//@href")
    content["keywords"] = tree.xpath("//meta[@name='keywords']/@content")
    if content["keywords"] == "":
        content["keywords"] = tree.xpath("//meta[@name='Keywords']/@content")
        print "caught a case ", url
    #content["body"] = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0]))
    body = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0]))
    content["word_count"] = len(body.split(" "))
    # will save full html
    if save:
        filename = urllib.quote_plus(content["url"][0:60]) + ".txt"
        #filename.replace("http%3A%2F%2F", "")
        # if file doesn't already exist
        if not findInSub(filename, path):
            # make that day's path
            path = path + strftime("/%Y/%m/%d/", gmtime())
            if not os.path.exists(path):
                os.makedirs(path)
            f = open(path + filename, "w+")
            f.write(html)
            f.close()
            print "wrote " + path + filename
        else:
            print "already had " + filename
    return content
def process_page(sterile_page, target_url):
    """
    Process the page so that every link's text is wrapped in <em></em> and every word
    longer than 4 characters is wrapped in <strong></strong>
    :param sterile_page: A string, the target page's source stripped of all tags except <a></a>
    :param target_url: A string, the URL the user gave us
    :return: A string, the processed page ready to render in a template
    """
    # Parse the inbound page into an element tree with lxml
    root = etree.fromstring(sterile_page)
    # First, let's deal with <a></a>
    for a_tag in root.xpath(".//a"):
        # If <a></a> has some text in it
        if a_tag.text and a_tag.text.strip():
            # Create a new <em></em> element, assign the text from <a></a> to it, delete the text
            # from <a></a>, and insert the <em></em> element instead
            em = etree.Element('em')
            em.text = a_tag.text
            a_tag.text = None
            a_tag.insert(0, em)
            # While we are at it, let's fix all the broken relative links we got from the page source
            # #crutch_alert
            try:
                # If this passes, we don't need to do anything with the a_tag's href
                valid = URLValidator()
                valid(a_tag.attrib['href'])
            except ValidationError:
                # Good chances are that this malformed URL is _relative_ to the target URL's domain
                a_tag.attrib['href'] = absolutize_url(
                    schemeful_domain(target_url), a_tag.attrib['href'])
        else:
            # If <a></a> is empty (e.g., after removing an image from the anchor's text), remove it altogether, href and all.
            a_tag.getparent().remove(a_tag)
    # Take every element in the tree and traverse it, checking whether it has text in it.
    # If it does, apply reinforce_text(), which wraps words longer than 4 characters in <strong></strong>.
    for element in root.iter():
        if element.text and element.text.strip():
            element.text = reinforce_text(element.text)
        if element.tail and element.tail.strip():
            element.tail = reinforce_text(element.tail)
    # The final bit: flatten the modified tree back to a string, decode it, and then unescape
    # everything that was escaped (< and > in <strong></strong>)
    return unescape(etree.tostring(root, method='html').decode())
def sep_tag(elems, split_pattern):
    ret = []
    for e in elems:
        e_str = re.split(split_pattern, etree.tostring(e).decode('utf-8'))
        for s in e_str:
            try:
                s = "".join(etree.HTML(s).xpath("//text()")).strip()
                s = clean_text(s)
                if not re.fullmatch(r"\s*", s):
                    ret.append(s)
            except:
                pass
    return ret
def parse_count(tree):
    match = tree.xpath('//div/br/following-sibling::text()')
    rgx = re.compile('of\\s+(\\d+)\\.')
    if match:
        count = rgx.search(' '.join(match))
    else:
        count = rgx.search(etree.tostring(tree))
    if not count:
        logger.error('Failed to count links in search page, returning 0')
        return 0
    links = int(count.group(1))
    logger.info('Number of links found in search page: %d', links)
    return int(count.group(1))
def extract(self, html: Union[str, etree._Element]):
    if isinstance(html, etree._Element):
        html = etree.tostring(html, encoding='utf8').decode()
        try:
            res = re.findall(r'<html><body><p>(.*)</p></body></html>', html, flags=re.DOTALL | re.S)
            html = res[0]
        except:
            pass
    if self.many:
        matches = self._re_object.finditer(html)
        return [self._parse_match(match) for match in matches]
    else:
        match = self._re_object.search(html)
        return self._parse_match(match)
def to_string(self, element: Element, limit: int = None):
    """
    convert element to string
    :param element:
    :param limit:
    :return:
    """
    result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8')
    if limit:
        return result[:limit]
    return result
def post_filter(self, args):
    title = args[0].split('[[')[-1].split(']]')[0].split('|')[-1]
    if title.strip():
        title = title.strip()
    text = args[1]
    counts = {}
    doc = etree.fromstring(text, etree.HTMLParser())
    hids = []
    toc_html = '<div id="toc" class="table_of_contents"><h3>%s</h3>\n' % (title)
    for node in doc.xpath('//h1|//h2|//h3|//h4|//h5'):
        if node.tag.lower() == 'h1':
            this_depth = 0
        elif node.tag.lower() == 'h2':
            this_depth = 1
        elif node.tag.lower() == 'h3':
            this_depth = 2
        elif node.tag.lower() == 'h4':
            this_depth = 3
        elif node.tag.lower() == 'h5':
            this_depth = 4
        else:
            continue
        p = re.compile('[^a-zA-Z0-9\s\_]')
        this_id = re.sub(p, '-', node.text).replace(' ', '-')
        if this_id in hids:
            counts[this_id] = counts.get(this_id, 0) + 1
            this_id = '%s-%s' % (this_id, counts[this_id])
        hids.append(this_id)
        pat = '%s' % (etree.tostring(node))
        rep = '<%s id="%s" class="toc_heading">%s'\
              '<span class="toc_top"><a href="#toc">↩</a></span></%s>'\
              '<p style="clear: both;"></p>'\
              % (node.tag, this_id, node.text, node.tag)
        text = text.replace(pat, rep, 1)
        indent_px = this_depth * 20
        toc_html += '<p style="margin-left: %spx">+ '\
                    '<a href="#%s">%s</a></p>\n'\
                    % (indent_px, this_id, node.text)
    toc_html += '</div>\n'
    text = text.replace(text, toc_html + text)
    return text
def extract(self, selector, host='', body_xpath='', with_body_html=False):
    body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
    if body_xpath:
        body = selector.xpath(body_xpath)[0]
    else:
        body = selector.xpath('//body')[0]
    body = self.remove_list_relevant(body)
    for node in iter_node(body):
        density_info = self.calc_text_density(node)
        node_hash = hash(node)
        text_density = density_info['density']
        ti_text = density_info['ti_text']
        text_tag_count = self.count_text_tag(node, tag='p')
        sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
        images_list = node.xpath('.//img/@src')
        images_list = self.remove_img(images_list)
        host = host or config.get('host', '')
        if host:
            images_list = [pad_host_for_images(host, url) for url in images_list]
        node_info = {
            'ti': density_info['ti'],
            'lti': density_info['lti'],
            'tgi': density_info['tgi'],
            'ltgi': density_info['ltgi'],
            'node': node,
            'body': body,
            'density': text_density,
            'text': ti_text,
            'images': images_list,
            'text_tag_count': text_tag_count,
            'sbdi': sbdi
        }
        if with_body_html or config.get('with_body_html', False):
            body_source_code = unescape(etree.tostring(node, encoding='utf-8').decode())
            node_info['body_html'] = body_source_code
        self.node_info[node_hash] = node_info
    self.calc_new_score()
    result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
    return result
def filter(
    self,
    html: str,
    inline: bool = False,
    outgoing: bool = False,
    display_name_mentions: Optional[Dict[str, str]] = None,
) -> str:
    """Filter and return HTML."""
    mentions = display_name_mentions
    sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
    html = sanit.sanitize(html).rstrip("\n")

    if not html.strip():
        return html

    tree = etree.fromstring(
        html, parser=etree.HTMLParser(encoding="utf-8"),
    )

    for a_tag in tree.iterdescendants("a"):
        self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

        if not outgoing:
            self._matrix_to_links_add_classes(a_tag)

    html = etree.tostring(tree, encoding="utf-8", method="html").decode()
    html = sanit.sanitize(html).rstrip("\n")

    if outgoing:
        return html

    # Client-side modifications
    html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

    if not inline:
        return html

    return self.inline_quote_regex.sub(
        r'\1<span class="quote">\2</span>', html,
    )
def get_html_at_url(url, charset='UTF-8'):
    for banned in banned_list:
        if banned in url:
            return flask.render_template('error.html')
    try:
        website = urllib.request.Request(url)
    except ValueError:
        return flask.render_template('error.html')
    try:
        html = urllib.request.urlopen(website).read().decode(charset)
    except Exception as e:
        return flask.render_template('error.html')
    root = make_etree(html, url)
    head = root.find('.//head')
    if head is not None:
        base = etree.Element('base', href=url)
        head.insert(0, base)
    profile_photo = copy_profile_photo_to_static(root)
    if profile_photo is not None:
        img_info = get_image_info(profile_photo)
        add_glasses(profile_photo, img_info['faces'][0])
        new_html = etree.tostring(root)
        # Credit: Alexander J. Quinn. Used with permission. https://piazza.com/class/jkspuifikh3s9?cid=789
        mo = re.search(r"\s*<.+?>", html, flags=re.DOTALL)
        if mo is not None:
            doctype = mo.group(0)
            new_html = doctype.encode('utf8') + b"\n" + new_html
        return new_html
    else:
        return flask.render_template('noprofile.html')
def sep_tag(elems, split_pattern): """ 在html元素的字符串中进行切割 :param elems: html元素 :param split_pattern: 切割所使用的正则表达式 :return: list of str, 每个字符串都经过清洗 """ ret = [""] url_head = "https://asoiaf.huijiwiki.com" # e_str = re.split(split_pattern, etree.tostring(elems).decode('utf-8')) elems = etree.HTML(etree.tostring(elems).decode('utf-8')) for s in elems.xpath("//body/*/text()|//body/*/*"): try: if isinstance(s, str): # 访问到字符串元素 if not re.fullmatch(r"\s*", s): ret[-1] += clean_text(s) else: if s.tag == "br": ret.append("") elif s.tag == "a": if "new" in s.xpath("@class"): ret[-1] += clean_text("".join(s.xpath(".//text()"))) else: ret[-1] += get_header(url_head + s.xpath("@href")[0]) else: ret[-1] += clean_text("".join(s.xpath(".//text()"))) except BaseException as e: print(e) # 清理空字符串,分割逗号 ret_copy = [] for item in ret: if item != "": # ret_copy += [*item.split(",")] ret_copy.append(item) return ret_copy
def parse(html_str):
    html = etree.HTML(html_str)
    pics = html.xpath("//div[@class='single-content']/p")
    for pic in pics:
        global images
        images.append(etree.tostring(pic).decode('utf-8'))
def extract(self, selector, host='', with_body_html=False): """ W3C标准中HTML结构: <!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <title>网页标题</title> </head> <body> <h1>网页正文</h1> </body> </html> :param selector: :param host: :param with_body_html: :return: """ body = selector.xpath('//body')[0] # 选中body标签 for node in iter_node(body): node_hash = hash(node) # 计算节点文本密度 density_info = self.calc_text_density( node ) # 返回{'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi} # 计算文字符号密度 text_density = density_info['density'] ti_text = density_info['ti_text'] text_tag_count = self.count_text_tag(node, tag='p') # 计算文本标签(p)数量 sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti']) # 返回sbdi or 1 # 解析图片url(获取所有img的src,若用户定义了host主域名,则加上) images_list = node.xpath('.//img/@src') host = host or config.get('host', '') if host: images_list = [ pad_host_for_images(host, url) for url in images_list ] node_info = { 'ti': density_info['ti'], 'lti': density_info['lti'], 'tgi': density_info['tgi'], 'ltgi': density_info['ltgi'], 'node': node, 'density': text_density, 'text': ti_text, 'images': images_list, 'text_tag_count': text_tag_count, 'sbdi': sbdi } # 生成新闻正文所在标签的 HTML 源代码 if with_body_html or config.get('with_body_html', False): body_source_code = unescape( etree.tostring(node, encoding='utf-8').decode()) node_info['body_html'] = body_source_code self.node_info[node_hash] = node_info std = self.calc_standard_deviation() # 计算标准差 self.calc_new_score(std) # 评分核心函数 # sorted(key)参数含义: 按照第几维的元素进行排序。 # 此处按照第二维中的score对应的值排序 result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True) return result
data = list()
for detail_url in detail_list:
    url = root + detail_url
    rsp = requests.get(url=url, headers=headers)
    html = etree.HTML(rsp.text)
    row = dict()
    # Get the title
    row['title'] = html.xpath('//h1[@class="page-title"]/text()')[0].replace(' ', '').replace('\n', '')
    # Get the ingredients
    row['ins'] = dict()
    for r in html.xpath('//div[@class="ings"]//tr[@itemprop]'):
        k = r.find('td[@class="name"]')
        k = etree.tostring(k, encoding='utf-8').decode('utf-8')
        k = re.sub('<.*?>', '', k).replace(' ', '').replace('\n', '')
        v = r.find('td[@class="unit"]').text.replace(' ', '').replace('\n', '')
        row['ins'][k] = v
    # Get the cooking steps
    row['steps'] = list()
    for r in html.xpath('//div[@class="steps"]/ol/li'):
        li = dict()
        p = r.find('p[@class="text"]')
        img = r.find('img').get('src') if r.find('img') is not None else ''
        text = etree.tostring(p, encoding='utf-8').decode('utf-8')
        text = re.sub('<.*?>', '', text).replace(' ', '').replace('\n', '')
        li['text'] = text
        li['img'] = img
        row['steps'].append(li)
step_ = '<![CDATA[<p>\n' + \
    step_.replace('\n', '</p>\n<p>\n') + '</p>\n]]>'
expect_ = '<![CDATA[<p>\n' + \
    expect_.replace('\n', '</p>\n<p>\n') + '</p>\n]]>'
test_case = etree.SubElement(testsuite_, 'testcase', name=case_)
preconditions = etree.SubElement(test_case, 'preconditions')
preconditions.text = u'{0}'.format(pre_)
steps = etree.SubElement(test_case, 'steps')
step = etree.SubElement(steps, 'step')
step_number = etree.SubElement(step, 'step_number')
step_number.text = u'1'
actions = etree.SubElement(step, 'actions')
actions.text = u'{0}'.format(step_)
expectedresults = etree.SubElement(step, 'expectedresults')
expectedresults.text = u'{0}'.format(expect_)
keywords = etree.SubElement(test_case, 'keywords')
keyword_list = keywords_.split('\n')
for kw in keyword_list:
    keyword = etree.SubElement(keywords, 'keyword', name=kw)
except Exception as e:
    print("line:", seq)
    print(str(e))
    for item in sys.exc_info():
        print(item)

s = etree.tostring(xmlroot, pretty_print=True, encoding='utf-8').decode()
s = s.replace('&lt;', '<')  # Temporary hard-coded fix; content that really contains < or > may later break the XML and make the import fail.
s = s.replace('&gt;', '>')
with open('output.xml', mode='w+', encoding='utf-8') as target:
    target.write(s)
from lxml.html import etree

"""
Parse HTML with lxml
"""
text = '''
<div>
    <ul>
        <li class="item_1"><a href="0.html">item</a><li>
        <li class="item_2"><a href="0.html">item</a><li>
        <li class="item_3"><a href="0.html">item</a><li>
    </ul>
</div>
'''
# Use etree.HTML to parse the string into an HTML document
# (the parser repairs the unclosed <li> tags and adds the missing <html>/<body> wrapper)
html = etree.HTML(text)
s = etree.tostring(html)
print(s)
try:
    tree = parse(arg['url'])
    #parser = etree.HTMLParser()
    #tree = etree.parse(arg['url'], parser)
except:
    # should not be raised if only some minor HTML errors occur
    err_exit('Unable to parse the given HTML document.')

if arg['b']:
    """handle --beautiful-output argument"""
    # nasty solution, but we need the "cleaned" version (i.e. the
    # lxml-specific one) of HTML tree
    printw(BeautifulSoup(
        etree.tostring(tree.getroot(), method='html')
    ).prettify())
    sys.exit(0)

# precompile regexps
eres = {
    'url': re.compile('^(http://)?www[.]', re.IGNORECASE),
    'trail_space': re.compile('\\s*$'),
    'lead_space': re.compile('^\\s*'),
    'whole_blank': re.compile('^\\s*$'),
    'blank': re.compile('[\\s\u00a0]+'),  # \u00a0 == nbsp
    'url_hex': re.compile('(%[A-F0-9]{2}|\\+)+'),
    'year4': re.compile('[(]\\s*([12][0-9][0-9][0-9])\\s*[)]'),
    'year2': re.compile('[(]\\s*([01][0-9])\\s*[)]'),
    'year4r': re.compile('[(]\\s*([12][0-9][0-9][0-9])(?!.*?[12][0-9][0-9][0-9])\\s*[)]'),
    'year2r': re.compile('[(]\\s*([01][0-9])(?!.*?[01][0-9])\\s*[)]'),
    'year4r_no_paren': re.compile('\\s*([12][0-9][0-9][0-9])(?!.*?[12][0-9][0-9][0-9])\\s*'),