def string(html, cssSelector='*') -> str:
    # Accept either a raw HTML string or a Selector-like object.
    if isinstance(html, str):
        return strip_markup(html).strip()
    if not isinstance(html, Selector):
        html = Selector(html)
    data = html.css(cssSelector).extract_first()
    if data is None:
        data = ''
    data = strip_markup(data)
    return data.strip()
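# A minimal usage sketch for the helper above (hypothetical markup; assumes
# strip_markup comes from htmllaundry and Selector is the parsel/Scrapy Selector).
# Note that a plain string argument takes the early-return path, so the CSS
# selector is only used when a Selector object is passed in:
from htmllaundry import strip_markup
from parsel import Selector

print(string('<p>Hello <b>world</b></p>'))        # -> 'Hello world' (string path)
sel = Selector('<div><p id="x">Hi</p></div>')
print(string(sel, 'p#x'))                         # -> 'Hi' (selector path)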
def getRepos(self, name=None):
    yield self.header()
    if not name:
        yield '<p>Please actually enter a username.</p>'
    else:
        name = strip_markup(name)
        try:
            url = "https://api.github.com/users/%s/repos" % name
            req = urllib2.Request(url)
            opener = urllib2.build_opener()
            f = opener.open(req)
            data = json.load(f)
            gotten = True
        except urllib2.URLError as e:
            if '404' in str(e):
                yield "<p>Something went wrong, you likely didn't type in the username correctly.</p>"
            elif '403' in str(e):
                yield "<p>Something went wrong, you've likely gone beyond the GitHub API request limits.</p>"
            else:
                yield "<p>Something went wrong: " + str(e) + "</p>"
            gotten = False
        if gotten:
            if len(data) != 0:
                final_data = "<p> The user %s has these public repos: <br /> <ul>" % name
                for i in data:
                    final_data += "<li>" + i['name'] + "</li>"
                final_data += "</ul></p>"
                yield final_data
            else:
                yield "<p>This user doesn't currently have any repos.</p>"
def post(self):
    title = strip_markup(self.get_argument("title"))
    tag = strip_markup(self.get_argument("tag"))
    length = strip_markup(self.get_argument("length"))
    dmy = strip_markup(self.get_argument("start_date"))
    hour = strip_markup(self.get_argument("hour"))
    ssub = strip_markup(self.get_argument("short_description"))
    lsub = strip_markup(self.get_argument("long_description"))
    user = self.current_user
    ch_id = uuid.uuid4()
    url = uuid_to_url(ch_id)
    length = int(length)
    hour = ''.join([dmy, " ", hour])
    dmy = ''.join([dmy, " ", "00:00"])
    datetime_dmy = datetime.datetime.strptime(dmy, '%Y-%m-%d %H:%M')
    datetime_start = datetime.datetime.strptime(hour, '%Y-%m-%d %H:%M')
    utc = pytz.timezone('UTC')
    datetime_dmy = utc.localize(datetime_dmy)
    datetime_start = utc.localize(datetime_start)
    # timestamp will just use cassandra getdate(now())
    # Need to insert into all channel column families, see: db_sechma for columns
    yield gen.Task(create_channel.apply_async,
                   args=[title, tag, length, datetime_dmy, datetime_start,
                         user, ch_id, url, ssub, lsub])
    self.redirect("/ch/%s" % url)
def phone_validator(node, value):
    """ checks to make sure that the value looks like a phone number """
    value = htmllaundry.strip_markup(value)
    allowed = set(string.ascii_lowercase + string.digits + ' ' + '.' + '+' + '(' + ')' + '-')
    tval = set(value) <= allowed
    if value == u'':  # use equality, not identity, for the empty-string check
        raise colander.Invalid(node, 'Please provide a valid phone number')
    if not tval:
        raise colander.Invalid(
            node, '%s is not a valid telephone number format' % value)
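# Hedged usage sketch for the validator above (assumes colander and the snippet's
# imports; the ContactSchema and field names are hypothetical):
import colander

class ContactSchema(colander.MappingSchema):
    phone = colander.SchemaNode(colander.String(), validator=phone_validator)

# ContactSchema().deserialize({'phone': '+1 (555) 123-4567'})  # passes
# ContactSchema().deserialize({'phone': 'not@phone!'})         # raises colander.Invalid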
def html_cleaner(html):
    soup = BeautifulSoup('\n'.join(html))
    # remove 'script', 'style', 'option' tags
    [s.extract() for s in soup('script')]
    [s.extract() for s in soup('style')]
    [s.extract() for s in soup('option')]
    cleaned_sents = htmllaundry.strip_markup(str(soup))  # leave only text
    # remove continuous empty lines
    cleaned_sents = re.sub(r'\n\s*\n+', '\n\n', cleaned_sents).strip()
    # remove continuous spaces (flags must be passed by keyword, otherwise
    # re.M would be interpreted as the `count` argument)
    cleaned_sents = re.sub(r'\s+', ' ', cleaned_sents, flags=re.M).strip()
    return cleaned_sents
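# Usage sketch for html_cleaner above (illustrative input; assumes BeautifulSoup,
# htmllaundry and re are imported as in the snippet). The function joins its
# argument with newlines, so a document is passed as a list of lines:
raw_lines = [
    '<html><head><style>p { color: red }</style></head>',
    '<body><p>First    paragraph.</p><script>alert(1)</script></body></html>',
]
print(html_cleaner(raw_lines))  # -> roughly 'First paragraph.' with script/style removed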
def cleanup(self):
    a = self.transformable
    a['title'] = html.unescape(a['title']).replace('\'\'', '\'')
    try:
        a['intro'] = strip_markup(html.unescape(a['intro']))
    except Exception:
        pass
    # cast the comma-separated author ids to ints
    # (the original loop only rebound the loop variable, which had no effect)
    a['authors'] = [int(idx) for idx in a['authors'].split(',')]
    try:
        a['dossier'] = int(float(a['dossier']))
    except Exception:
        a['dossier'] = None
    return a
def preProcessingData(file_name):
    data_frame = dp.read_csv(file_name)
    # Data Tokenization
    list_tokenization = [
        strip_markup(clean_text).split(" ") for clean_text in data_frame['Body']
    ]
    # Lower case conversion and Removal of Stop words
    data_tokenization_lower = []
    for tokenization in list_tokenization:
        tokenization_lower = []
        for x in tokenization:
            x_lower = x.lower().replace('\n', ' ')
            if x_lower not in stopwords.words('english'):
                tokenization_lower.append(x_lower)
        data_tokenization_lower.append(tokenization_lower)
    data_frame['Body'] = data_tokenization_lower
    return data_frame
async def mal_cmd(self, ctx, type, *, name):
    if type.lower() == "anime":
        query = '''
        query ($id: Int, $page: Int, $perPage: Int, $search: String) {
            Page (page: $page, perPage: $perPage) {
                pageInfo { total currentPage lastPage hasNextPage perPage }
                media (id: $id, search: $search, type: ANIME) {
                    id
                    title { romaji english native }
                    status
                    startDate { year month day }
                    episodes
                    format
                    coverImage { large }
                    bannerImage
                    siteUrl
                    source
                    type
                    averageScore
                    meanScore
                    description
                }
            }
        }
        '''
        variables = {'search': name, 'page': 1, 'perPage': 3}
        url = 'https://graphql.anilist.co'
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json={'query': query, 'variables': variables}) as response:
                responsed = await response.json()
        page = responsed['data']['Page']
        media = page['media']
        result = media[0]
        embed_obj = discord.Embed(colour=discord.Colour.red(), url=result['siteUrl'])
        embed_obj.set_thumbnail(url=result['coverImage']['large'])
        embed_obj.set_image(url=result['bannerImage'])
        # Use the English title when available; the original compared against the
        # string "None" and had identical branches.
        if result['title']['english'] is None:
            embed_obj.title = f"{result['title']['romaji']}({result['title']['native']})"
        else:
            embed_obj.title = f"{result['title']['english']}({result['title']['native']})"
        embed_obj.add_field(name="Status:", value=result['status'])
        embed_obj.add_field(name='Started at:', value=f"{result['startDate']['year']}/{result['startDate']['month']}/{result['startDate']['day']}")
        embed_obj.add_field(name="Episodes:", value=result['episodes'])
        embed_obj.add_field(name="Type:", value=result['type'])
        embed_obj.add_field(name="Average score:", value=result['averageScore'])
        embed_obj.add_field(name="Mean score:", value=result['meanScore'])
        if len(result['description']) > 1024:
            # Synopsis does not fit into an embed field: send the embed, then ask
            # whether to send the synopsis as a separate message.
            await ctx.send(embed=embed_obj)
            await ctx.send("Synopsis is more than 1024 characters. Want me to send the synopsis here?\n*Yes/No*\n*Timeout set to 20 seconds.*")

            def check(message):
                # accept either answer so the wait completes in both cases
                return message.content.lower() in ("yes", "y", "no", "n")

            try:
                reply = await self.client.wait_for('message', timeout=20, check=check)
            except asyncio.TimeoutError:  # asyncio import assumed
                return
            if reply.content.lower() in ("yes", "y"):
                await ctx.send(strip_markup(result['description']))
        else:
            embed_obj.add_field(name='Synopsis:', value=strip_markup(result['description']), inline=True)
            await ctx.send(embed=embed_obj)

    if type.lower() == "manga":
        query = '''
        query ($id: Int, $page: Int, $perPage: Int, $search: String) {
            Page (page: $page, perPage: $perPage) {
                pageInfo { total currentPage lastPage hasNextPage perPage }
                media (id: $id, search: $search, type: MANGA) {
                    id
                    title { romaji english native }
                    status
                    startDate { year month day }
                    episodes
                    format
                    coverImage { large }
                    bannerImage
                    siteUrl
                    source
                    type
                    averageScore
                    meanScore
                    description
                }
            }
        }
        '''
        variables = {'search': name, 'page': 1, 'perPage': 3}
        url = 'https://graphql.anilist.co'
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json={'query': query, 'variables': variables}) as response:
                responsed = await response.json()
        page = responsed['data']['Page']
        media = page['media']
        result = media[0]
        embed_obj = discord.Embed(colour=discord.Colour.red(), url=result['siteUrl'])
        embed_obj.set_thumbnail(url=result['coverImage']['large'])
        # Use the English title when available; the original compared against the
        # string "None" and had identical branches.
        if result['title']['english'] is None:
            embed_obj.title = f"{result['title']['romaji']}({result['title']['native']})"
        else:
            embed_obj.title = f"{result['title']['english']}({result['title']['native']})"
        embed_obj.add_field(name="Status:", value=result['status'])
        embed_obj.add_field(name='Started at:', value=f"{result['startDate']['year']}/{result['startDate']['month']}/{result['startDate']['day']}")
        embed_obj.add_field(name="Episodes:", value=result['episodes'])
        embed_obj.add_field(name="Type:", value=result['type'])
        embed_obj.add_field(name="Average score:", value=result['averageScore'])
        embed_obj.add_field(name="Mean score:", value=result['meanScore'])
        if len(result['description']) > 1024:
            await ctx.send(embed=embed_obj)
            await ctx.send("Synopsis is more than 1024 characters. Want me to send the synopsis here?\n*Yes/No*\n*Timeout set to 20 seconds.*")

            def check(message):
                return message.content.lower() in ("yes", "y", "no", "n")

            try:
                reply = await self.client.wait_for('message', timeout=20, check=check)
            except asyncio.TimeoutError:  # asyncio import assumed
                return
            if reply.content.lower() in ("yes", "y"):
                await ctx.send(strip_markup(result['description']))
        else:
            embed_obj.add_field(name='Synopsis:', value=strip_markup(result['description']), inline=True)
            await ctx.send(embed=embed_obj)
def clean_my_file(self, x):
    # preprocess the text
    # print x
    # get rid of newlines, tabs and carriage returns.
    x = re.sub('\r', '', x)
    x = re.sub('\t', '', x)
    x = re.sub('\n', '', x)
    # some of the blog posts have various html code elements in their undecoded form,
    # some don't; we want to make sure that we get rid of all html code. That is why
    # we decode the most common html characters.
    # replace all linked content with [URL]
    # we will use the linked content in one of our features.
    x = re.sub('<[aA] (href|HREF)=.*?</[aA]>;?', ' URL ', x)  # replace links
    x = re.sub('<img.*?>;?', ' URL ', x)  # replace images
    x = re.sub('(http|https|ftp)://?[0-9a-zA-Z\.\/\-\_\?\:\=]*', ' URL ', x)
    x = re.sub('(http|https|ftp)://?[0-9a-zA-Z\.\/\-\_\?\:\=]*', ' URL ', x)
    x = re.sub('(^|\s)www\..+?(\s|$)', ' URL ', x)
    x = re.sub('(^|\s)(http|https|ftp)\:\/\/t\.co\/.+?(\s|$)', ' URL ', x)
    x = re.sub('(^|\s)(http|https|ftp)\:\/\/.+?(\s|$)', ' URL ', x)
    x = re.sub('(^|\s)pic.twitter.com/.+?(\s|$)', ' URL ', x)
    # clean all the HTML markups, this function is a part of htmllaundry
    x = strip_markup(x)
    # get rid of bbcode formatting and remaining html markups
    x = re.sub('[\[\<]\/?b[\]\>];?', '', x)
    x = re.sub('[\[\<]\/?i[\]\>];?', '', x)
    x = re.sub('[\[\<]br [\]\>];?', '', x)
    x = re.sub('/>', '', x)
    x = re.sub('[\<\[]\/?h[1-4][\>\]]\;?', '', x)
    x = re.sub('\[\/?img\]', '', x)
    x = re.sub('\[\/?url\=?\]?', '', x)
    x = re.sub('\[/?nickname\]', '', x)
    # x = re.sub(';{1,}',' ', x)
    # get rid of whitespaces
    x = re.sub(' {1,}', ' ', x)
    x = self.h.unescape(x)
    # delete everything else that strip_markup doesn't
    x = re.sub('height=".*?"', '', x)
    x = re.sub('width=".*?"', '', x)
    x = re.sub('alt=".*?"', '', x)
    x = re.sub('title=".*?"', '', x)
    x = re.sub('border=".*?"', '', x)
    x = re.sub('align=".*?', '', x)
    x = re.sub('style=".*?"', '', x)
    x = re.sub(' otted border-color:.*?"', '', x)
    x = re.sub(' ashed border-color:.*?"', '', x)
    x = re.sub('target="_blank">', '', x)
    x = re.sub('<a target=" _new" href=" ]', '', x)
    x = re.sub('<a target="_new" rel="nofollow" href=" ]', '', x)
    # users for twitter
    x = re.sub('(^|\s)@(?!\s).+?(?=(\s|$))', ' USER ', x)
    x = x.strip()
    # print x
    return x
def remove_html_tags(word):
    if word is None:
        return ""
    return strip_markup(word)
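# Quick sanity check for the wrapper above (illustrative input only; assumes
# strip_markup is imported from htmllaundry as in the snippet):
from htmllaundry import strip_markup

print(remove_html_tags(None))                         # -> ''
print(remove_html_tags('<p>Hello <b>world</b></p>'))  # -> roughly 'Hello world'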
def pre_validate(self, form):
    self.data = htmllaundry.strip_markup(self.data)
def Getterms(content, lang, prods, returnJSON):
    global __debug_on__, bad_stemmer_1, bad_stemmer_3, bad_stemmer_4
    bad_stemmer_1 = bad_stemmer_3 = bad_stemmer_4 = 0
    Service.logger.debug("Started processing for %d segments for language %s" % (len(content), lang))

    new_content = set()
    new_content_orig = ""
    new_content_orig_tok = set()
    for seg in set(content):
        # Mask { and } as they clash with the inner workings of the chunker
        seg = seg.replace(u"{", u"﹛").replace(u"}", u"﹜")
        # Treating UI strings containing \r escapes
        # Treating UI strings containing \n escapes
        # Collapsing new lines
        # Clean-up the line endings - not sure if useful at all
        seg = seg.replace('\\r', '\r').replace('\\n', '\n').replace('\r\n', '\n').replace('\n ', '\n')
        new_content_orig += " " + seg
        for word in word_tok.tokenize(seg):
            new_content_orig_tok.add(word)
        # Decode the most common HTML/XML entities and masked parentheses
        # (entity names restored; the extracted text showed them already decoded)
        seg = (seg.replace('%', '')
                  .replace(".. .", " ...")
                  .replace('<openparen>', '(')
                  .replace('<closeparen>', ')')
                  .replace('&apos;', "'")
                  .replace('&quot;', '"')
                  .replace('&amp;', '&')
                  .replace('&lt;', '<')
                  .replace('&gt;', '>'))
        # Do the following even occur in our data?
        # new_content = new_content.replace('&circ;', '^')
        # new_content = new_content.replace('&tilde;', '~')
        # new_content = new_content.replace('&ndash;', '–')
        # new_content = new_content.replace('&mdash;', '—')
        # new_content = new_content.replace('&lsquo;', '‘')
        # new_content = new_content.replace('&rsquo;', '’')
        # new_content = new_content.replace('&sbquo;', ',')
        # new_content = new_content.replace('&ldquo;', '“')
        # new_content = new_content.replace('&rdquo;', '”')
        # new_content = new_content.replace('&bdquo;', '"')
        # new_content = new_content.replace('&permil;', '‰')
        # new_content = new_content.replace('&euro;', '€')
        # Strip HTML/XML markup
        seg = strip_markup(seg).replace('\\t', '\n')
        # Some very crude pre-tokenisation
        seg = seg.replace(':', ' :').replace('\t', '\n').replace("\\", '\n').replace('|', '\n')
        # It's not quite clear what this is supposed to do
        seg = seg.replace("&", "").replace("[\\w]+_[\\w]+", "\n").replace("[\\w]+_", "\n").replace("_[\\w]+", "\n")
        # Service.logger.debug("Segment: " + seg)
        for seg in sent_tokenizer.tokenize(seg):
            # Unmask { and }
            seg = seg.replace(u"﹛", u"{").replace(u"﹜", u"}")
            new_content.add(seg)
    if __debug_on__:
        # Service.logger.debug("Finished sentence segmentation.")
        Service.logger.debug("Finished character-level pre-processing.")

    # word tokenize the sentences
    new_content_tokenized = [word_tok.tokenize(line) for line in new_content]
    if __debug_on__:
        Service.logger.debug("Finished main tokenisation.")

    # Remove empty lines.
    # Remove one-word lines which are all caps. Typically: commands.
    # (Rebuilt as a filtering pass: the original removed items from the list
    # while iterating over it, which skips elements.)
    filtered_tokenized = []
    for l in new_content_tokenized:
        if len(l) == 0 or (len(l) == 1 and str(l).isupper()):
            continue
        # Remove placeholders (note: any modification can be made to the text here,
        # as the final output will be verified in the original content)
        # (this doesn't really make any sense to me -- V.)
        filtered_tokenized.append([t for t in l if '%' not in t])
    new_content_tokenized = filtered_tokenized

    # POS tag sentences
    tagged_sent = pos_tagger.tag_sents(new_content_tokenized)
    if __debug_on__:
        Service.logger.debug("Finished main POS tagging.")

    # [Issue not repro since default tagger is added] I leave it in, in any case.
    # None tags are extremely rare, deleting these segments results in minimal loss.
    tagged_sent = [r for r in tagged_sent if ("', None), ('" not in str(r)) and ("', None" not in str(r)) and ("', ''" not in str(r))]

    # Define chunkers (left in 'Unk'/'UNK' as POS-tags for unknown words in the
    # chunker definition, but the tagger uses 'NN'.)
    # Chunker for multi-word noun-phrase-like units
    def GetSurfaceChunksByStem(sentences):
        global bad_stemmer_1
        grammar = (r'''CHUNK: {(<Unk|UNK|NN.*|VBN>*)(<JJ.*|VBN>*)(<Unk|UNK|NN.*|VBN>)(<Unk|UNK|NN.*>+)}''')
        cp = nltk.RegexpParser(grammar)
        chunks = set()
        for sent in sentences:
            try:
                tree = cp.parse(sent)
                for subtree in tree.subtrees():
                    if subtree.label() == 'CHUNK':
                        chunks.add(' '.join([l[0] for l in subtree.leaves()]))
            except:
                bad_stemmer_1 += 1
                Service.logger.debug("Issues with chunker 1!".encode('utf-8'))
                Service.logger.debug(Service.traceback.format_exc())
                # Service.logger.debug("1+")
                # Service.logger.debug("Bad stemmer 1: "+str(sent))
        return chunks

    # This chunker extracts units like "Elements limiting slenderness"
    def GetSurfaceChunksByStem3(sentences):
        global bad_stemmer_3
        grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN> <VBG> <Unk|UNK|NN.*>}''')
        cp = nltk.RegexpParser(grammar)
        chunks = set()
        for sent in sentences:
            try:
                tree = cp.parse(sent)
                for subtree in tree.subtrees():
                    if subtree.label() == 'CHUNK':
                        chunks.add(' '.join([l[0] for l in subtree.leaves()]))
            except:
                bad_stemmer_3 += 1
                Service.logger.debug("Issues with chunker 3!".encode('utf-8'))
                Service.logger.debug(Service.traceback.format_exc())
                # Service.logger.debug("3+")
        return chunks

    # Chunker for single word noun-like units
    def GetNouns(sentences):
        global bad_stemmer_4
        grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN|JJ.*>}''')
        cp = nltk.RegexpParser(grammar)
        chunks = set()
        for sent in sentences:
            try:
                tree = cp.parse(sent)
                for subtree in tree.subtrees():
                    if subtree.label() == 'CHUNK':
                        chunks.add(' '.join([l[0] for l in subtree.leaves()]))
            except:
                bad_stemmer_4 += 1
                Service.logger.debug("Issues with chunker 4!".encode('utf-8'))
                Service.logger.debug(Service.traceback.format_exc())
                # Service.logger.debug("4+")
        return chunks

    # Get compound chunks extracted
    # AND Remove duplicate chunks
    new_chunks = GetSurfaceChunksByStem(tagged_sent).union(GetSurfaceChunksByStem3(tagged_sent))
    if __debug_on__:
        Service.logger.debug("Finished main chunking.")
        Service.logger.debug((u"Skipped bad parses as follows: 1=" + str(bad_stemmer_1) + u" 3=" + str(bad_stemmer_3) + u" 4=" + str(bad_stemmer_4)).encode('utf-8'))

    # Correct chunks (Some corrections aren't repro, because they were added for a
    # different tokenizer. They don't hurt to have - I leave them in.)
    # Maybe these characters should be removed from the beginning...?
    not_needed = ['.', '^', "'", "\\", "/", "!", '_', '%', "=", '*', '>', '<', '\\', ":", "|"]
    new_compounds = set()
    for w in new_chunks:
        # [Issue not repro.] Remove '@' from multi-word units.
        w = w.replace('@', '')
        # [Issue not repro.] Remove '*' from the multi-word units.
        w = w.replace('*', '')
        # [Issue not repro.] Remove '.' from the end of multi-word units.
        # [Issue not repro.] Remove ',' from the end of multi-word units.
        w = w.rstrip(".,")  # assign the result; the original call discarded it
        # Correct issue deriving from tokenization
        w = w.replace(" 's", "'s")
        # Get rid of words containing '+' in chunks (for sw strings).
        # Eg: 'Ctrl+A key combination' will become 'key combination'
        if '+' in w:
            tok = word_tok.tokenize(w)
            for i in tok:
                if "+" in i:
                    w = w.replace(i, '')
        # [Issue not repro.] Remove '=' from multi-word units.
        w = w.replace('=', '')
        # [Issue not repro.] Remove double spaces from multi-word units.
        w = w.replace('  ', ' ')
        # [Issue not repro.] Remove space from the end of multi-word units.
        # [Issue not repro.] Remove space from the beginning of multi-word units.
        w = w.strip()  # assign the result; the original call discarded it
        # remove one letter words from the chunk units (eg. remains of placeholders)
        # switched to removing the whole chunk if a one-letter word was found
        noWordFound = False
        for word in word_tok.tokenize(w):
            if word in nowords or len(word) == 1:
                noWordFound = True
                break
        if not noWordFound:
            for mark in not_needed:
                if mark in w:
                    noWordFound = True
                    break
        if not noWordFound:
            new_compounds.add(w)
    if __debug_on__:
        Service.logger.debug("Finished first chunk cleanup.")

    # extract noun(like) units
    nouns = [w for w in GetNouns(tagged_sent) if w.isdigit() == False]

    # clean results up from (untranslatable) characters, nowords content and
    # check if they are in the original content as-is
    new_nouns = set()
    for n in nouns:
        not_needed_found = False
        for mark in not_needed:
            if mark in n:
                not_needed_found = True
                break
        if not not_needed_found:
            new_nouns.add(n)
    new_nouns = set([w.lower() for w in new_nouns if (w.lower() not in nowords) and (w in new_content_orig_tok)])
    if __debug_on__:
        Service.logger.debug("Finished noun selection.")

    # Compounds: new_compounds
    # Single words: new_nouns
    # Create one group of all chunks
    # check back if extracted term candidates are in the original text as well
    new_words_and_compounds = [w for w in new_compounds.union(new_nouns) if w in new_content_orig]
    if __debug_on__:
        Service.logger.debug("Starting substring cleanup for " + str(len(new_words_and_compounds)) + " chunks")

    # remove multi-word chunks that are compounds of smaller multi-word chunks.
    # For example, 'calculation configuration' and 'dialog box' remain, but
    # 'calculation configuration dialog box' will be removed
    tempSet = set()
    new_chunks_set = set([_.lower() for _ in new_words_and_compounds])
    new_chunks_temp = sorted(new_chunks_set, key=cmp_to_key(locale.strcoll))
    counter = 0
    for i in range(0, len(new_chunks_temp)):
        for j in range(i, len(new_chunks_temp)):
            if __debug_on__:
                counter += 1
                if not counter % 10000:
                    Service.logger.debug(".")
                if not counter % 500000:
                    Service.logger.debug(str(counter))
            nc = new_chunks_temp[i] + ' ' + new_chunks_temp[j]
            if nc in new_chunks_set:
                # Service.logger.debug("found superstring " + nc)
                tempSet.add(nc)
            nc = new_chunks_temp[j] + ' ' + new_chunks_temp[i]
            if nc in new_chunks_set:
                # Service.logger.debug("found superstring " + nc)
                tempSet.add(nc)

    # Word tokenize filtered multi-word units.
    new_words_and_compounds = [w for w in new_chunks_temp if w not in tempSet]
    if __debug_on__:
        Service.logger.debug("Finished chunk substring cleanup.")

    # Query NeXLT for existing translation
    if __debug_on__:
        Service.logger.debug("Running NeXLT queries for " + str(len(new_words_and_compounds)) + " chunks...")

    def QueryNeXLT(term, language, prod_name):
        r = None
        if Service.isStaging:
            r = requests.get("http://aws.stg.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + prod_name + "%20AND%20" + language + ":['' TO *]")
        else:
            r = requests.get("http://aws.prd.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + prod_name + "%20AND%20" + language + ":['' TO *]")
        r.encoding = "utf-8"
        try:
            response = r.json()['response']['numFound']
        except:
            response = 0
        return response

    def QueryNeXLTAllProds(term, language):
        r = None
        if Service.isStaging:
            r = requests.get("http://aws.stg.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + '*' + "%20AND%20" + language + ":['' TO *]")
        else:
            r = requests.get("http://aws.prd.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:" + '*' + "%20AND%20" + language + ":['' TO *]")
        r.encoding = "utf-8"
        try:
            response = r.json()['response']['numFound']
        except:
            response = 0
        return response

    new_words_and_compounds_in_product = []
    for t in new_words_and_compounds:
        newTerm = True
        for prod_name in prods:
            newTerm = newTerm and QueryNeXLT(t.lower(), lang, prod_name) == 0
        if newTerm:
            new_words_and_compounds_in_product.append(t)

    # product independent query to NeXLT
    # append context and product/corpus information + number of occurrences
    terms = []
    for term in new_words_and_compounds_in_product:
        # Unmask { and }
        term = term.replace(u"﹛", u"{").replace(u"﹜", u"}")
        contexts = [con for con in new_content if term.lower() in con.lower()]
        # Skip any terms that cannot be found in the original source. We cannot
        # provide terms with no context to translators. (Log before skipping.)
        if len(contexts) == 0:
            Service.logger.error(u"Could not find original context for term %s!" % term)
            continue
        if QueryNeXLTAllProds(term.lower(), lang) == 0:
            terms.append([term, "Corpus", contexts, len(contexts), len(term)])
        else:
            terms.append([term, "Product", contexts, len(contexts), len(term)])
    if __debug_on__:
        Service.logger.debug("Finished NeXLT calls with %s new terms remaining." % len(terms))

    if returnJSON:
        # Sort final term list and create json format
        terms = sorted(terms, key=itemgetter(1, 0), reverse=True)
        terms_for_json = {}
        for listitem in terms:
            if prods[0] == "NEW_PRODUCT":
                k = {listitem[0]: {'newto': "New product, search in corpus only", 'context': listitem[2], 'numContextSents': listitem[3]}}
            else:
                k = {listitem[0]: {'newto': listitem[1], 'context': listitem[2], 'numContextSents': listitem[3]}}
            terms_for_json.update(k)
        if __debug_on__:
            Service.logger.debug("Finished final processing.")
        return terms_for_json
    else:
        if __debug_on__:
            Service.logger.debug("Finished final processing.")
            Service.logger.debug("Extracted terms:")
            # import pprint
            # for term in terms:
            #     Service.logger.debug("\tterm: %s" % pprint.saferepr(term))
        return terms
def pre_validate(self, form):
    if self.data and self.data != '':
        self.data = htmllaundry.strip_markup(self.data)
    else:
        self.data = None
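# A sketch of how a pre_validate hook like the one above is typically attached to
# a WTForms field (the SanitizedField and CommentForm names are hypothetical):
import htmllaundry
from wtforms import Form, StringField

class SanitizedField(StringField):
    def pre_validate(self, form):
        # strip markup before the field's validators run; empty input becomes None
        if self.data and self.data != '':
            self.data = htmllaundry.strip_markup(self.data)
        else:
            self.data = None

class CommentForm(Form):
    body = SanitizedField('Body')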