def test_named_address(self):
    addr = 'localhost:8000'
    option = config_options.IpAddress()
    value = option.validate(addr)
    self.assertEqual(utils.text_type(value), addr)
    self.assertEqual(value.host, 'localhost')
    self.assertEqual(value.port, 8000)

def test_default_address(self):
    addr = '127.0.0.1:8000'
    option = config_options.IpAddress(default=addr)
    value = option.validate(None)
    self.assertEqual(utils.text_type(value), addr)
    self.assertEqual(value.host, '127.0.0.1')
    self.assertEqual(value.port, 8000)

def test_valid_IPv6_address(self):
    addr = '[::1]:8000'
    option = config_options.IpAddress()
    value = option.validate(addr)
    self.assertEqual(utils.text_type(value), addr)
    self.assertEqual(value.host, '[::1]')
    self.assertEqual(value.port, 8000)
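# A hypothetical companion test (a sketch, not from the original suite): it
# assumes a malformed port makes config_options.IpAddress.validate() raise
# config_options.ValidationError, as MkDocs config options do for bad input.
def test_invalid_address_port(self):
    option = config_options.IpAddress()
    self.assertRaises(
        config_options.ValidationError,
        option.validate, 'localhost:foo')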
def _add_entry(self, title, text, loc):
    """
    A simple wrapper to add an entry and ensure its contents are UTF-8 encoded.
    """
    self._entries.append({
        'title': title,
        'text': utils.text_type(text.strip().encode('utf-8'), encoding='utf-8'),
        'location': loc
    })
def _add_entry(self, title, text, loc):
    """
    A simple wrapper to add an entry and ensure its contents are UTF-8 encoded.
    """
    # Normalize non-breaking spaces and collapse runs of whitespace.
    text = text.replace('\u00a0', ' ')
    text = re.sub(r'[ \t\n\r\f\v]+', ' ', text.strip())
    self._entries.append({
        'title': title,
        'text': utils.text_type(text.encode('utf-8'), encoding='utf-8'),
        'location': loc
    })
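# A standalone illustration (hypothetical input string) of the normalization
# performed above: non-breaking spaces become ordinary spaces, and any run of
# ASCII whitespace collapses to a single space.
import re

sample = 'foo\u00a0bar\n\tbaz  qux'
sample = sample.replace('\u00a0', ' ')
sample = re.sub(r'[ \t\n\r\f\v]+', ' ', sample.strip())
print(sample)  # prints: foo bar baz qux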
def _add_entry(self, title, text, loc):
    """
    A simple wrapper to add an entry and ensure its contents are UTF-8 encoded.
    """
    text = text.replace('\u00a0', ' ')
    text = re.sub(r'[ \t\n\r\f\v]+', ' ', text.strip())
    text = utils.text_type(text.encode('utf-8'), encoding='utf-8')
    # Segment title and text with jieba (full mode) so CJK content becomes
    # space-separated tokens that the search index can match.
    self._entries.append(dict(
        title=' '.join(
            token.strip()
            for token in jieba.cut(title.replace('\n', ''), True)
        ),
        text=' '.join(
            token.strip()
            for token in jieba.cut(text.replace('\n', ''), True)
        ),
        location=loc,
    ))
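# A minimal sketch of the segmentation used above, separate from the index
# code: jieba.cut(sentence, True) runs full-mode segmentation and returns a
# generator of tokens, which is why the entry is re-joined on single spaces.
import jieba

tokens = [token.strip() for token in jieba.cut('全文搜索引擎', True)]
print(' '.join(tokens))  # prints the space-joined full-mode segmentation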
found[ext].append(location)

with io.open(os.path.join(dirpath, name), 'r', encoding='utf8') as f:
    logbody = f.read()

# Parse the HTML.
soup = BeautifulSoup(logbody, 'lxml')

# Get the title of the page.
title = soup.title

page_details = soup.find_all(['p', 'pre', 'h1', 'h2', 'h3', 'h4'])
for detail in page_details:
    text = detail.get_text()
    text = text.replace('\u00a0', ' ')
    text = re.sub(r'[ \t\n\r\f\v]+', ' ', text.strip())
    text = utils.text_type(text.encode('utf-8'), encoding='utf-8')
    if text and str(os.path.relpath(location)):
        # Hash (location, text, title) to deduplicate entries across pages.
        hashVal = abs(hash((str(os.path.relpath(location)), text, str(title.get_text()))))
        if hashVal not in addedHashes:
            currentPage = {
                'location': str(os.path.relpath(location)),
                'text': text,
                'title': str(title.get_text()),
                'index': hashVal,
            }
            addedHashes.append(hashVal)
            searchData['docs'].append(currentPage)

# Write results to the JSON file.
with open(outputjson, 'w') as logfile:
    json.dump(searchData, logfile)
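# A small consumer-side sketch (assuming searchData was written as above):
# each record in searchData['docs'] carries the page's relative location,
# its extracted text, its title, and the dedup hash stored under 'index'.
import json

with open(outputjson) as logfile:  # outputjson as defined earlier
    search_data = json.load(logfile)

for doc in search_data['docs']:
    print(doc['location'], doc['title'], doc['index'])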