def regex_to_big_query(re):
    # Take care of encoding
    re = re.replace("\\", "\\\\").replace("'", "\\'")
    # No need for grouping
    re = re.replace("(?:", "(")
    expr = "(" + "LENGTH(REGEXP_REPLACE(lower(message)," + "'%s', '@'))" % re + "-" \
        + "LENGTH(REGEXP_REPLACE(lower(message)," + "'%s', ''))" % re + ")"
    return expr
def parse_xml():
    """Parse xml and print only content enclosed in <post> tags"""
    # assumes: from sys import stdin; from re import search as is_match, sub as replace
    post = False
    for line in stdin:
        line = line.strip()
        if is_match(r'</post|<quote', line):
            post = False
        elif post and line:
            print replace(r'<img(.*?)/>|<a(.*?)</a>', '', line)
        elif is_match(r'<post|</quote', line):
            post = True
def _update_imported_attachement_titles(self):
    """ Build a pretty title from the filename of each attachment and save it
        - Removes the file extension
        - Removes extra spaces
    """
    for record in self.imported_attachment_ids:
        name = record.name
        name = replace(r'^(.+)(\.[^.]*)$', r'\1', name, flags=UNICODE)
        name = replace(r'[ \-_]+', r' ', name, flags=UNICODE)
        record.write({'name': name.title()})
def handleURL(line):
    link, url, text = getURL(line)
    if link is None:
        # no link found: strip any stray [url] placeholders (literal replacement)
        return line.replace("[url]", "")
    v["%u"] = url
    v["%v"] = text
    text = subTags("URLs", "url")
    line = line.replace(link, text)
    url = subTags("URLs", "url-attr")
    line = line.replace("[url]", url)
    return line
def eigthDayB():
    f = open("day8.txt", 'r')
    totalStringCharacters = 0
    totalCharactersEncoded = 0
    for line in f:
        line = line.rstrip()
        totalStringCharacters += len(line)
        print totalStringCharacters
        print re.findall('\\\\"', line)
        print re.findall('\\\\x', line)
        totalCharactersEncoded += len(line) + 4 + len(re.findall('\\\\"', line)) * 2 + len(re.findall('\\\\.', line))
        print totalCharactersEncoded
        print "\n"
    print totalCharactersEncoded - totalStringCharacters
def call_google(params, payload):
    gcr = requests.post('https://vision.googleapis.com/v1/images:annotate',
                        params=params, json=payload)
    try:
        gc_response = json.loads(gcr.content)['responses'][0]
        if gc_response['textAnnotations'][0]['locale'] == 'en':
            text = gc_response['textAnnotations'][0]['description']
            text = replace(r"(\r\n|\n|\r)", " ", text)
            if text == "":
                return text
            words = text.split(' ')
            valid = 0
            invalid = 0
            for word in words:
                if word in english:
                    valid += 1
                else:
                    invalid += 1
            if invalid >= valid:
                text = ("We are not confident in the text transcription of this image. "
                        + "This image contains the text: " + text + ". ")
            else:
                text = "This image contains the text: " + text + ". "
        else:
            text = "This image contains text in a language our extension cannot understand. "
    except KeyError as e:
        text = ""
    return text
def save_image_result(driver):
    # type = {'type': 'login'}
    image_loc = ('css selector', 'div#captcha>img#codeimg')
    # url = 'https://passport.douguo.com'
    get_src = driver.find_element(*image_loc)
    size = get_src.size
    print(size)
    # the x/y region below was located with a WeChat screenshot
    geti = ImageGrab.grab((1250, 600, 1350, 631))
    base_dir = os.path.dirname(os.path.dirname(__file__))
    base_dir = str(base_dir)
    base_dir = base_dir.replace('\\', '/')
    base = base_dir.split('/test_case')[0]
    path = base + '/report/image/' + 'image.png'
    ocr_text = pytesseract.image_to_string(geti)
    print(ocr_text)
    # geti.save(path)
    # f = Image.open(path)
    # getimage = f.crop(ra)
    geti.save(path)
    # verify = Image.open(path)
    # result = pytesseract.image_to_string(verify)
    result = ocr_text.replace(' ', '').replace('\n', '').replace('\u3000', '')
    print(result)
    return result
def slice2(s):
    # assumption: non-letter characters are turned into spaces so the
    # bigram filter below can skip across word boundaries
    s = re.sub(r'[^a-z]', ' ', s.lower())
    res = []
    for i in range(len(s) - 1):
        tmp = s[i:i + 2]
        if ' ' not in tmp:
            res.append(tmp)
    return res
def power():
    InputLatex = "x ^ 2"
    string = ""
    str = ""
    dictstr = ""
    Outcome = " "
    InputLatex.split(' ')
    print(InputLatex)
    match = re.match(r'sum_{.*?=.*?}\^{.*?}|prod_{.*?=.*?}\^{.*?}|int_{.*?}\^{.*?}', InputLatex)
    powerfuncmatch = "^"
    # powerfuncmatch = re.match('\\\\^', InputLatex)
    # basefuncmatch = re.match('\\\\_', InputLatex)
    # print(match)
    if match:
        print("found")
    if powerfuncmatch:
        # re.compile(powerfuncmatch)
        queue = []  # assumed to be a fresh local list
        for equ in InputLatex:
            queue.append(equ)
        print(queue)
    # escape the caret so it is matched literally rather than as an anchor
    if re.search(re.escape(powerfuncmatch), InputLatex):
        InputLatex = re.sub(re.escape(powerfuncmatch), 'to the power of', InputLatex)
        print("Now here")
        print(InputLatex)
def make_formater(form):
    '''Replaces the words in the form with the appropriate logging syntax
    and returns the formatter to be added'''
    for word in replace(r'[^\w]', ' ', form).split():
        if word in FORMATS.keys():
            form = form.replace(word, FORMATS[word])
    return logging.Formatter(form)
def clean_list(self, pagename):
    a = input("1-follower/2-following: ")

    def openfile(pagename):
        if a == "1":
            open_file = open(
                "/root/Desktop/py/followerslist/" + pagename + "followers.txt", "r+")
            read = open_file.read()
            open_file.close()
            return read
        if a == "2":
            open_file = open(
                "/root/Desktop/py/following list/" + pagename + "following.txt", "r+")
            read = open_file.read()
            open_file.close()
            return read

    read = openfile(pagename)
    cleaned = read.replace("'https://www.instagram.com/", "")
    me = cleaned.replace("/'", "")
    de = me.replace(" ", "\n")
    if a == "1":
        he = open(
            "/root/Desktop/py/clean/" + pagename + "followersclean.txt", "w+")
        he.write(de)
        he.close()
    elif a == "2":
        he = open(
            "/root/Desktop/py/clean/" + pagename + "followingclean.txt", "w+")
        he.write(de)
        he.close()
def set_file_postfix(self, file, extension):
    ''' Set the filename postfix '''
    postfix = self.options.name[0]
    # escape the dot so only a literal ".extension" suffix is matched
    return postfix and replace(r'\.%s$' % extension, '%s.%s' % (postfix, extension), file)
def test_regex_match():
    datafile = replace(r'\.py$', '.json', __file__)
    with open(datafile, 'r') as f:
        data = json_load(f)
    for sample in data:
        assert RE_COLOR.match(sample) is not None
def import_asset(self, id):
    out = []
    queue = [id]
    while queue:
        id = queue[0]
        if id in self._assets and self._assets[id][4] and not self._assets[id][5]:
            # enqueued but not loaded yet
            asset_def = self._assets[id]
            type = asset_def[0]
            id = asset_def[1]
            asset = asset_def[2]
            deps = asset_def[3]
            if deps:
                needs_deps = False
                numdeps = len(deps)
                for i in range(numdeps - 1, -1, -1):
                    dep = deps[i]
                    if dep in self._assets and not self._assets[dep][5]:
                        self._assets[dep][4] = True  # enqueued
                        needs_deps = True
                        queue.insert(0, dep)
                if needs_deps:
                    continue
                else:
                    queue.pop(0)
            asset_def[5] = True  # loaded
            # hook here
            ret = {}
            self.trigger("import-asset", [
                # importer, id, type, asset
                self, id, type, asset, ret
            ]).trigger("import-asset-" + id, [
                # importer, id, type, asset
                self, id, type, asset, ret
            ])
            if 'return' in ret:
                out.append(ret['return'])
            else:
                is_style = 'styles' == type
                is_script = 'scripts' == type
                is_tpl = 'templates' == type
                is_inlined = isinstance(asset, (list, tuple))
                asset_id = re.sub(r'[\-.\/\\:]+', '_', id)
                if is_style:
                    out.append(
                        ("<style id=\"importer-inline-style-%s\" type=\"text/css\" media=\"all\">%s</style>" % (asset_id, asset[0]))
                        if is_inlined else
                        ("<link id=\"importer-style-%s\" type=\"text/css\" rel=\"stylesheet\" href=\"%s\" media=\"all\" />" % (asset_id, self.path_url(asset)))
                    )
                elif is_script:
                    out.append(
                        ("<script id=\"importer-inline-script-%s\" type=\"text/javascript\">/*<![CDATA[*/ %s /*]]>*/</script>" % (asset_id, asset[0]))
                        if is_inlined else
                        ("<script id=\"importer-script-%s\" type=\"text/javascript\" src=\"%s\"></script>" % (asset_id, self.path_url(asset)))
                    )
                elif is_tpl:
                    out.append(
                        ("<script id=\"importer-inline-tpl-%s\" type=\"text/x-tpl\">%s</script>" % (asset_id, asset[0]))
                        if is_inlined else
                        ("<script id=\"importer-inline-tpl-%s\" type=\"text/x-tpl\">%s</script>" % (asset_id, self.get(asset)))
                    )
                else:
                    out.append(asset[0] if is_inlined else self.get(asset))
        else:
            queue.pop(0)
    return out  # assumption: the collected markup is returned to the caller
def _get_cleared_text(self):
    """ Perform some operations to obtain a cleaned-up text
        - Replace tabs with spaces and remove extra spaces
        - Replace extra symbols after list elements
    """
    content = (self.content or '').strip()
    flags = MULTILINE | UNICODE
    # STEP 1: Remove tabs and extra spaces
    content = replace(r'[ \t]+', r' ', content, flags=flags)
    # STEP 2: Remove spaces from both line bounds
    content = replace(r'[ \t]+$', r'', content, flags=flags)
    content = replace(r'^[ \t]+', r'', content, flags=flags)
    # STEP 3: Replace CRLF by LF and remove duplicates
    content = replace(r'[\r\n]', r'\n', content, flags=flags)
    content = replace(r'[\n]{2,}', r'\n\n', content, flags=flags)
    # STEP 4: Update questions and answers numbering formats
    pattern = r'^([0-9]{1,10})[.\-)]+[ \t]+'
    content = replace(pattern, r'\1. ', content, flags=flags)
    pattern = r'^([a-zñA-ZÑ])[.\-)]+[ \t]+'
    content = replace(pattern, r'\1) ', content, flags=flags)
    return content
def escape_re(re):
    re = re.replace(".", "\\.")
    re = re.replace("(", "\\(")
    re = re.replace(")", "\\)")
    re = re.replace("|", "\\|")
    re = re.replace("^", "\\^")
    re = re.replace("*", "\\*")
    re = re.replace("+", "\\+")
    re = re.replace("?", "\\?")
    return re
def repl(re) :
    """ Converts PROSITE regular expressions into Python re module syntax """
    # note: '>]' must be handled before '>' so the longer token is not clobbered
    mal = ['.', '-', '{', '}', '(', ')', '<', '>]', '>', 'x']  # PROSITE syntax
    bien = ['', '', '[^', ']', '{', '}', '^', ']?$', '$', '[GAVLIMPFYWSCTNQDEKRH]']  # Python re syntax
    for i in range(len(mal)) :
        re = re.replace(mal[i], bien[i])
    return re
def getTemplates(args):
    if(not args.tokenFeatures):
        return [TokenString(allowOOV = True)]
    else:
        templates = []
        for name in args.featureTemplates.split(","):
            if(name == "tokenString"):
                templates.append(TokenString(allowOOV = True))
            elif(name == "isCap"):
                templates.append(Capitalized(allowOOV = False))
            elif(name == "isNumeric"):
                templates.append(IsNumeric(allowOOV = False))
            elif(re.match(r"Suffix-\d+",name)):
                num = re.sub(r"Suffix-","",name)
                templates.append(Suffix(int(num),allowOOV = True))
            elif(re.match(r"Prefix-\d+",name)):
                num = re.sub(r"Prefix-","",name)
                templates.append(Prefix(int(num),allowOOV = True))
        return templates
def test_regex():
    datafile = replace(r'\.py$', '.json', __file__)
    with open(datafile, 'r') as f:
        data = json_load(f)
    for expectation in data:
        match = RE_CONFLICTED_PACKAGES.match(expectation['output'])
        assert match is not None
        assert match.group('package1') == expectation['result'][0]
        assert match.group('package2') == expectation['result'][1]
def getTemplates(args):
    if (not args.tokenFeatures):
        return [TokenString(allowOOV=True)]
    else:
        templates = []
        for name in args.featureTemplates.split(","):
            if (name == "tokenString"):
                templates.append(TokenString(allowOOV=True))
            elif (name == "isCap"):
                templates.append(Capitalized(allowOOV=False))
            elif (name == "isNumeric"):
                templates.append(IsNumeric(allowOOV=False))
            elif (re.match(r"Suffix-\d+", name)):
                num = re.sub(r"Suffix-", "", name)
                templates.append(Suffix(int(num), allowOOV=True))
            elif (re.match(r"Prefix-\d+", name)):
                num = re.sub(r"Prefix-", "", name)
                templates.append(Prefix(int(num), allowOOV=True))
        return templates
def add_new_accounts():
    global CG_USERS
    CG_USERS.clear()
    with open(files["users"], "r") as f:
        CG_USERS = f.readlines()
    # strip a leading BOM, if any, before splitting the credentials
    filter = lambda s: replace("^\ufeff", "", s)
    user = filter(CG_USERS[0]).split(":")
    return user[:2]
def isMasterFromConfigFile(self, chat_id):
    if not hasattr(self, "master") or not self.master:
        return False
    if unicode(self.master).isnumeric():
        return unicode(chat_id) == unicode(self.master)
    else:
        with self.bot.database as conn:
            cur = conn.cursor()
            cur.execute("select username from telegram_uids where uid = ?", [chat_id])
            res = cur.fetchone()
            return res is not None and unicode(res[0]) == unicode(re.sub(r'^@', '', self.master))
def tokenize(text, stop_list): a = text.lower().translate(string.maketrans("",""), string.punctuation + string.digits) b = re.replace(r'(\s+)', ' ', a) c = b.split(' ') ### This might be too simple (simply deletes punctuation, then elims ### double spaces) ### Maybe eliminate some words here? ### It seems like code blocks get grabbed sometimes, so elim those ### To keep or not to keep numbers? return c
def tokenize(text, stop_list): a = text.lower().translate(string.maketrans("", ""), string.punctuation + string.digits) b = re.replace(r'(\s+)', ' ', a) c = b.split(' ') ### This might be too simple (simply deletes punctuation, then elims ### double spaces) ### Maybe eliminate some words here? ### It seems like code blocks get grabbed sometimes, so elim those ### To keep or not to keep numbers? return c
def test_assert_setting_reddit_api_scope_format(self):
    """
    For the "scope" value, Reddit's API uses a slight deviation from the
    OAuth 2.0 specification, which states scopes should be space-separated.
    Reddit uses a comma-separated value. Here, verify that the setting
    actually uses a comma-separated list and NOT a standard OAuth
    space-separated list.
    """
    self.assertEqual(
        settings.OAUTH_REDDIT_SCOPE,
        re.sub(r"[, ]+", ",", settings.OAUTH_REDDIT_SCOPE),
    )
def _get_table_condition(self, table, config):
    conds = []
    if table.extcondition:
        # re.sub takes (pattern, replacement, string): strip a leading "where"
        conds.append(re.sub(r"(?i)^\s*where\s+", "", table.extcondition))
    if config.filter:
        conds.append(config.filter)
    if conds:
        return " where " + " and ".join("(%s)" % c for c in conds)
    else:
        return ""
def test_assert_setting_reddit_api_scope_format(self):
    """
    For the "scope" value, Reddit's API uses a slight deviation from the
    OAuth 2.0 specification, which states scopes should be space-separated.
    Reddit uses a comma-separated value. Here, verify that the setting
    actually uses a comma-separated list and NOT a standard OAuth
    space-separated list.
    """
    self.assertEqual(
        settings.OAUTH_REDDIT_SCOPE,
        re.sub(r'[, ]+', ',', settings.OAUTH_REDDIT_SCOPE)
    )
def get_username():
    # strip a leading BOM, if any, before splitting the credentials
    filter = lambda s: replace("^\ufeff", "", s)
    try:
        assert len(CG_USERS), "No accounts to CG!"
        assert ":" in CG_USERS[0], "No accounts to CG!"
        user = filter(CG_USERS[0])
        user = user.split(":")
        return user[:2]
    except IndexError:
        raise SystemExit("No accounts to CG!")
def isMasterFromConfigFile(self, chat_id):
    if not hasattr(self, "master") or not self.master:
        return False
    if unicode(self.master).isnumeric():
        return unicode(chat_id) == unicode(self.master)
    else:
        with self.bot.database as conn:
            cur = conn.cursor()
            cur.execute("select username from telegram_uids where uid = ?", [chat_id])
            res = cur.fetchone()
            return res is not None and unicode(res[0]) == unicode(
                re.sub(r'^@', '', self.master))
def shoesize(size):
    if 'EU' in size:
        result = size.replace('EU', '')
    elif 'UK' in size:
        result = size.replace('UK', '')
    elif 'US' in size:
        result = size.replace('US', '')
    else:
        result = size
    if '.' in result:
        return result.replace('.', '')
    else:
        return result
def convert(self, input, target=False):
    convert_from = convert_to = multiplier_from = multiplier_to = False
    unittext = string.strip(re.sub("[^a-zA-Z ]", "", input))
    amount = string.strip(re.sub("[^0-9.,]", "", input)).replace(",", ".")
    if amount == "":
        return False
    # the extracted amount is still a string; convert it for the arithmetic below
    amount = float(amount)
    # first determine unit of input value
    unit = self.get_unit(unittext)
    if not unit:
        return False
    convert_from = unit[1]
    if not target:
        if convert_from.has_key("derivations"):
            convert_to = convert_from.derivations[0]
        else:
            convert_to = unit[2]
    else:
        convert_to = self.get_unit(target)
    if convert_to.has_key("derivations"):
        if inspect.isfunction(convert_from.ratio_inverse):
            conversion = convert_from.ratio_inverse(amount)
        else:
            conversion = amount / convert_from.ratio
    else:
        if inspect.isfunction(convert_to.ratio):
            conversion = convert_to.ratio(amount)
        else:
            conversion = amount * convert_to.ratio
    if convert_to.has_key("factor"):
        prefix = convert_to.factor
        amount = amount / units.multipliers[convert_to.factor]["factor"]
    else:
        prefix = ""
    return (amount, convert_from.symbol, conversion, convert_to.symbol)
def action(self, state, turn, playerNum):
    print "Current State"
    print self.state
    userInput = raw_input('It\'s turn {}. Your Move, Player {}: '.format(
        turn, playerNum))
    # Cleaning input to standardized form: turn space-separated coordinates
    # into comma-separated ones (assumed intent of the broken call)
    userInput = re.sub(r"([0-9]+) +([0-9]+)", r"\1,\2", userInput)
    # fix for higher dimension
    userInput = re.sub(r"\[\] ", "", userInput)
    position = userInput.split(',')
    for i in range(len(position)):
        position[i] = int(position[i])
    return position
def segment(all_the_text):
    re = ""
    relist = ""
    words = segmenter.seg(all_the_text)
    count = 0
    for w in words:
        # keep only multi-character words made of CJK characters
        if len(w) > 1 and w >= u'\u4e00' and w <= u'\u9fa5':
            re = re + " " + w
            count = count + 1
            if count % 100 == 0:
                re = re.replace("\n", " ")
                relist = relist + "\n" + re
                re = ""
                count = count + 1
    re = re.replace("\n", " ").replace("\r", " ")
    if len(relist) > 1 and len(re) > 40:
        relist = relist + "\n" + re
    elif len(re) > 40:
        relist = re
    relist = relist + "\n"
    relist = relist.replace("\r\n", "\n").replace("\n\n", "\n")
    return relist
def clean_list(self, filename):
    def openfile(filename):
        open_file = open(
            "/root/Desktop/py/hemin/" + filename + "following.txt", "r+")
        read = open_file.read()
        open_file.close()
        return read

    read = openfile(filename)
    cleaned = read.replace("'https://www.instagram.com/", "")
    me = cleaned.replace("/'", "")
    de = me.replace(" ", "\n")
    he = open("/root/Desktop/py/clean/" + filename + "clean.txt", "w+")
    he.write(de)
    he.close()
def todo_complete(caldav_conn, args):
    if args.nocaldav:
        raise ValueError("No caldav connection, aborting")
    tasks = todo_select(caldav_conn, args)
    for task in tasks:
        if hasattr(task.instance.vtodo, 'rrule'):
            rrule = rrulestr(task.instance.vtodo.rrule.value)
            try:
                next = rrule.after(datetime.now())
            except TypeError:
                ## pesky problem with comparison of timestamps with and without tzinfo
                next = rrule.after(datetime.now(tz=tzlocal.get_localzone()))
            if next:
                ## new_task is to be completed and we keep the original task open
                completed_task = task.copy()
                remaining_task = task
                ## the remaining task should have recurrence id set to next start time, and range THISANDFUTURE
                if hasattr(remaining_task.instance.vtodo, 'recurrence_id'):
                    del remaining_task.instance.vtodo.recurrence_id
                remaining_task.instance.vtodo.add('recurrence-id')
                remaining_task.instance.vtodo.recurrence_id.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.dtstart.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.recurrence_id.params['RANGE'] = ['THISANDFUTURE']
                remaining_task.instance.vtodo.rrule
                remaining_task.save()
                ## the completed task should have recurrence id set to current time
                ## count in rrule should decrease
                if hasattr(completed_task.instance.vtodo, 'recurrence_id'):
                    del completed_task.instance.vtodo.recurrence_id
                completed_task.instance.vtodo.add('recurrence-id')
                completed_task.instance.vtodo.recurrence_id.value = datetime.now()
                completed_task.instance.vtodo.dtstart.value = datetime.now()
                count_search = re.search(
                    r'COUNT=(\d+)', completed_task.instance.vtodo.rrule.value)
                if count_search:
                    completed_task.instance.vtodo.rrule.value = re.sub(
                        r'COUNT=(\d+)',
                        'COUNT=%d' % (int(count_search.group(1)) - 1),
                        completed_task.instance.vtodo.rrule.value)
                completed_task.complete()
                continue
        task.complete()
def longest_word(string):
    """Finds the longest word in a given string"""
    if len(string) == 1:
        return string
    # split string and remove any non-alphanum chars
    words = [replace(r'\W+', '', word) for word in string.split(' ')]
    if len(words) == 1:
        return string
    longest = ''
    for word in words:
        if len(word) > len(longest):
            longest = word
    return longest
def removeCommonWords(self):
    if self.channel == "Instagram":
        self.keywordList = ["인스타그램", "인스타", "팔로우", "맞팔", "인친", "셀스타그램", "그램", "스타"]
    elif self.channel == "Naver Blog":
        self.keywordList = ["포스팅", "블로그", "댓글", "이웃추가"]
    elif self.channel == "Twitter":
        self.keywordList = ["트윗", "RT", "트위터"]
    elif self.channel == "Naver News":
        self.keywordList = ["없음", "헤럴드", "역필", "투데이", "머니", "코리아", "기자", "오마이",
                            "구독", "연합", "채널", "네이버", "뉴시스", "금지", "저작", "무단", "뉴스", "재배포"]
    else:
        self.keywordList = []
    # self.keywordList.append(self.keyword)
    for keyword in self.keywordList:
        # keywords are literal strings, so plain string replacement is enough
        self.text = self.text.replace(keyword, " ")
def defragment(tokenized):
    """Converts token list to string

    Arguments:
        tokenized {list} -- Tokenized string

    Return:
        str -- Rebuilt string
    """
    title = ""
    for token in tokenized:
        if token in "([{":
            title += token
        elif token in "}])":
            title = title[:-1] + token + " "
        else:
            title += token + " "
    return replace(r"(?<=\d)( \/ )(?=\d)", "/", title[:-1])
def todo_complete(caldav_conn, args):
    if args.nocaldav:
        raise ValueError("No caldav connection, aborting")
    tasks = todo_select(caldav_conn, args)
    for task in tasks:
        if hasattr(task.instance.vtodo, 'rrule'):
            rrule = rrulestr(task.instance.vtodo.rrule.value)
            try:
                next = rrule.after(datetime.now())
            except TypeError:
                ## pesky problem with comparison of timestamps with and without tzinfo
                next = rrule.after(datetime.now(tz=tzlocal.get_localzone()))
            if next:
                ## new_task is to be completed and we keep the original task open
                completed_task = task.copy()
                remaining_task = task
                ## the remaining task should have recurrence id set to next start time, and range THISANDFUTURE
                if hasattr(remaining_task.instance.vtodo, 'recurrence_id'):
                    del remaining_task.instance.vtodo.recurrence_id
                remaining_task.instance.vtodo.add('recurrence-id')
                remaining_task.instance.vtodo.recurrence_id.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.dtstart.value = next  ## TODO: should be same type as dtstart (date or datetime)
                remaining_task.instance.vtodo.recurrence_id.params['RANGE'] = ['THISANDFUTURE']
                remaining_task.instance.vtodo.rrule
                remaining_task.save()
                ## the completed task should have recurrence id set to current time
                ## count in rrule should decrease
                if hasattr(completed_task.instance.vtodo, 'recurrence_id'):
                    del completed_task.instance.vtodo.recurrence_id
                completed_task.instance.vtodo.add('recurrence-id')
                completed_task.instance.vtodo.recurrence_id.value = datetime.now()
                completed_task.instance.vtodo.dtstart.value = datetime.now()
                count_search = re.search(r'COUNT=(\d+)',
                                         completed_task.instance.vtodo.rrule.value)
                if count_search:
                    completed_task.instance.vtodo.rrule.value = re.sub(
                        r'COUNT=(\d+)',
                        'COUNT=%d' % (int(count_search.group(1)) - 1),
                        completed_task.instance.vtodo.rrule.value)
                completed_task.complete()
                continue
        task.complete()
def getMain(self, sentence):
    result = ''
    words, rely_id, relation = self.getLTPAnalysis(sentence)
    # hed = self.getHED(array)
    if 0 in rely_id:
        hed = rely_id.index(0)
        sbv = self.getWord(words, rely_id, relation, hed, 'SBV')  # subject
        vob = self.getWord(words, rely_id, relation, hed, 'VOB')  # object
        fob = self.getWord(words, rely_id, relation, hed, 'FOB')  # postposed object
        adv = self.getWord(words, rely_id, relation, hed, 'ADV')  # adverbial modifier
        pob = self.getWord(words, rely_id, relation, hed, 'POB')  # prepositional object, usable as subject if none is found
        zhuWord = self.getFirstNotNone([sbv, pob])  # final subject
        weiWord = words[hed]  # final predicate
        binWord = self.getFirstNotNone([vob, fob, pob])  # final object
        result = '{} {} {}'.format(zhuWord, weiWord, binWord)
    return result.replace('None', ' ')
def _insert_value(line, value, type):
    """.. Insert value into line.

    Parameters
    ----------
    line : str
        Line of document to insert value.
    value : str
        Value to insert.
    type : str
        Formatting for value.

    Returns
    -------
    line : str
        Line of document with inserted value.
    """
    if (type == 'no change'):
        line = re.sub('\\\\?#\\\\?#\\\\?#', value, line)
    elif (type == 'round'):
        try:
            value = float(value)
        except:
            raise_from(CritError(messages.crit_error_not_float % value), None)
        digits = re.findall('\\\\?#([0-9]+)\\\\?#', line)[0]
        rounded_value = format(value, '.%sf' % digits)
        line = re.sub('(.*?)\\\\?#[0-9]+\\\\?#', r'\g<1>' + rounded_value, line)
    elif (type == 'comma + round'):
        try:
            value = float(value)
        except:
            raise_from(CritError(messages.crit_error_not_float % value), None)
        digits = re.findall('\\\\?#([0-9]+),\\\\?#', line)[0]
        rounded_value = format(value, ',.%sf' % digits)
        line = re.sub('(.*?)\\\\?#[0-9]+,\\\\?#', r'\g<1>' + rounded_value, line)
    return(line)
def query_naivebayes(model={}, text='', min_word_size=4, k=1):
    from decimal import Decimal, Context
    from math import log
    from re import sub as replace

    # create a context for the built-in Decimal
    context = Context()

    # process the text so that only meaningful words remain
    if type(text).__name__ == 'str':
        print('Processing text: stemming, removing stopwords ....\n\n')
        words = text_processing(replace(r'([^a-zA-Z ]+)', r'', text), min_word_size=min_word_size)
    else:
        words = text
    # return model

    # build the vector of probabilities
    prob_topic = list()
    for topic in model.keys():
        # print("Computing topic: %s" % topic)
        prob = context.ln(model[topic]['P_topic'])
        words_of_topic = model[topic]['P_wk'].keys()
        # print("BEFORE %s" % words)
        for wk in words:
            # print("PROBLEM %s %f %f" % (wk, model[topic]['P_wk'][wk], log(model[topic]['P_wk'][wk])))
            if not wk in words_of_topic:
                prob += context.ln(model[topic]['P_wk']['espuria'])
            else:
                prob += context.ln(model[topic]['P_wk'][wk])
        prob_topic.append({topic: prob})
    sorted_probs = sorted(prob_topic, key=lambda k: tuple(k.values())[0], reverse=True)
    return sorted_probs[0:k]
def query_naivebayes(model={}, text='', min_word_size=4, k=1):
    from decimal import Decimal, Context
    from math import log
    from re import sub as replace

    # create a context for the built-in Decimal
    context = Context()

    # process the text so that only meaningful words remain
    if type(text).__name__ == 'str':
        print('Processing text: stemming, removing stopwords ....\n\n')
        words = text_processing(
            replace(r'([^a-zA-Z ]+)', r'', text), min_word_size=min_word_size)
    else:
        words = text
    # return model

    # build the vector of probabilities
    prob_topic = list()
    for topic in model.keys():
        # print("Computing topic: %s" % topic)
        prob = context.ln(model[topic]['P_topic'])
        words_of_topic = model[topic]['P_wk'].keys()
        # print("BEFORE %s" % words)
        for wk in words:
            # print("PROBLEM %s %f %f" % (wk, model[topic]['P_wk'][wk], log(model[topic]['P_wk'][wk])))
            if not wk in words_of_topic:
                prob += context.ln( model[topic]['P_wk']['espuria'] )
            else:
                prob += context.ln( model[topic]['P_wk'][wk] )
        prob_topic.append( {topic: prob} )
    sorted_probs = sorted(prob_topic, key=lambda k: tuple(k.values())[0], reverse=True)
    return sorted_probs[0:k]
def validate_query(query):
    query = re.sub(r'" *~', '"~', query)
    query = re.sub(r'~ *', '~', query)
    parts = query.split('"')
    if parts[0] == '':
        parts = parts[1:]
    in_phrase = False
    for part in parts:
        in_phrase = not in_phrase
        if '*' in part and in_phrase:
            raise ValueError(
                '* is not allowed in a phrase ({})'.format(part))
        elif part[0] == '~' and re.sub(r'^~[0-9]+', '', part) == part:
            raise ValueError(
                '~ must be followed by a number after a phrase ({})'.format(part))
    return query
def _get_cleared_text(self):
    """ Perform some operations to obtain a cleaned-up text
        - Replace tabs with spaces and remove extra spaces
        - Replace extra symbols after list elements
    """
    content = (self.content or '').strip()
    flags = MULTILINE | UNICODE
    # STEP 1: Remove tabs and extra spaces
    content = replace(r'[ \t]+', r' ', content, flags=flags)
    # STEP 2: Remove spaces from both line bounds
    content = replace(r'[ \t]+$', r'', content, flags=flags)
    content = replace(r'^[ \t]+', r'', content, flags=flags)
    # STEP 3: Replace CRLF by LF and remove duplicates
    content = replace(r'[\r\n]', r'\n', content, flags=flags)
    content = replace(r'[\n]{2,}', r'\n\n', content, flags=flags)
    # STEP 4: Update questions and answers numbering formats
    content = replace(r'^([0-9]{1,10})[.\-)]+[ \t]+', r'\1. ', content, flags=flags)
    content = replace(r'^([a-zñA-ZÑ])[.\-)]+[ \t]+', r'\1) ', content, flags=flags)
    return content
dest = os.getcwd()
if len(parse.arguments) > 1:
    dest = os.path.join(dest, parse.arguments[2])
if os.path.isdir(dest):
    dest = os.path.join(dest, parse.arguments[1].replace(tpl_suffix, '.php'))
if dest[-4:] != '.php':
    dest += '.php'

# check if destination doesn't exist or --force is on
if os.path.isfile(dest) and not parse.options['force']:
    safe.quit('Destination file already exists. Use -f/--force to overwrite.', 1)

print 'Copying {0} to {1}.'.format(src, dest)

# copy file
safe.catch(shutil.copy, (src, dest), 'Can\'t copy template ({0}) to {1}.')
print 'Ok.'

# replace title with that from command line
if parse.options['title'] != default_title:
    from contextlib import nested
    import re
    with nested(safe.fopen(dest), safe.fopen(dest + '.tmp', 'w')) as (s, d):
        for i in s:
            # rewrite the argument of $APPLICATION->SetTitle("...") with the title
            # given on the command line (pattern reconstructed from a broken call)
            d.write(re.sub(r'(\$APPLICATION->SetTitle\(\s*")[^"]*("\))',
                           r'\g<1>{0}\g<2>'.format(parse.options['title']), i))
def goto(self, axis_letter, offset=False):
    coordinate = float(getattr(
        self, 'lcdWorkNumber' + axis_letter.upper()).value())
    format = "{0:." + str(conf.get('ui.lcd_precision')) + "f}"
    formatted_coord = format.format(coordinate)
    coordinate, ok = QtGui.QInputDialog.getText(
        self, 'Goto', u'''
Enter a new coordinate for the %s axis.
\u2022 -20 or +20 absolute
\u2022 --20 or ++20 relative
\u2022 50%% percent of current position
\u2022 ((here + 20) / 3.2) arithmetic
''' % axis_letter.upper(),
        QtGui.QLineEdit.Normal, formatted_coord)
    coordinate = str(coordinate).strip()
    if ok:
        # relative
        match = re.search(r'^(([\-\+])\2)[\d\.]+$', coordinate)
        if match is not None:
            self.logger.debug(coordinate)
            coordinate = re.sub(r'^([\-\+])\1', r'\1', coordinate)
            if coordinate == 0:
                coordinate = abs(coordinate)
            self.logger.debug(coordinate)
            self.controller.move_axis(axis_letter, coordinate)
            return None
        # absolute
        match = re.search(r'^([\-\+])?[\d\.]+$', coordinate)
        if match is not None:
            self.controller.move_axis(axis_letter, coordinate, True)
            return None
        # percentage
        match = re.search(r'^([\d\.]+)\%$', coordinate)
        if match is not None:
            group = match.groups()
            if offset:
                current_coordinate = getattr(
                    self, 'lcdWorkNumber' + axis_letter.upper()).value()
            else:
                current_coordinate = getattr(
                    self, 'lcdMachNumber' + axis_letter.upper()).value()
            current_coordinate = (
                current_coordinate * (float(group[0]) / 100))
            if current_coordinate == 0:
                current_coordinate = abs(current_coordinate)
            self.controller.move_axis(
                axis_letter, current_coordinate, True)
            return None
        # expression
        match = re.search(r'^\(.+\)$', coordinate)
        if match is not None:
            if offset:
                current_coordinate = getattr(
                    self, 'lcdWorkNumber' + axis_letter.upper()).value()
            else:
                current_coordinate = getattr(
                    self, 'lcdMachNumber' + axis_letter.upper()).value()
            # substitute the literal word "here" with the current coordinate
            coordinate = re.sub(r'\bhere\b', str(current_coordinate), coordinate)
            ns = {'__builtins__': None}
            new_coordinate = eval(coordinate, ns)
            if new_coordinate == 0:
                new_coordinate = abs(new_coordinate)
            self.controller.move_axis(axis_letter, new_coordinate, True)
            return None
"wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you had / you would", "you'd've": "you would have", "you'll": "you shall / you will", "you'll've": "you shall have / you will have", "you're": "you are", "you've": "you have" } # cria lista com contracoes sem os apostrofos from re import sub as replace contractions_without_punc = replace(r'([^a-z ]+)', r'', ' '.join(contractions.keys())).split(' ') ## # Funcao que cria o "modelo" naivebayes, uma estrutura # com as probabilidades calculadas ## # texts => Lista com o nome dos arquivos a serem # aprendidos, ou a estrutura de texto ja # processada (estrutura retornada pela # funcao prepare_reuters_data) # ## # Retorna uma dicionario cujas chaves sao os topicos # encontrados, para cada chave ha outro dicionario associado # contendo duas chaves, a probabilidade do topico em questao,
for i in range(len(items)):
    list_item = driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli'][" + str(i + 1) + "]")
    item = driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli'][" + str(i + 1) + "]/div[1]/div[2]/h3/a")
    item.click()
    driver.implicitly_wait(10)
    # get product reviews
    driver.find_element_by_xpath("//*[@id='rso']/li[@class='psli'][" + str(i + 1) + "]/following-sibling::li/div/div/div/div[2]/div[1]/a").click()
    try:
        # try clicking "get all product reviews"
        driver.find_element_by_xpath("//*[@id='reviews-by-others']/div[6]/a").click()
    except:
        pass
    # //*[@id="reviews-by-others"]/div[3]/div[2]/div[2]/div[3]/span
    _product_name = driver.find_element_by_xpath("//*[@id='product-name']").text
    _product_name = _product_name.replace(" ", "_")
    print "_product_name " + _product_name
    product_data = {"product_name": _product_name, "reviews": []}
    print "product_data " + json.dumps(product_data)
    review_cnt = 0
    while True:
        try:
            fn = product_data["product_name"] + ".txt"
            fdir = dataFolder + "/" + fn
            reviews_cnt = len(driver.find_elements_by_xpath("//*[@id='reviews-by-others']/div[3]/div"))
            # //*[@id="reviews-by-others"]/div[3]/div[1]/div[2]/div[3]/span
            print 'reviews_cnt is ' + str(reviews_cnt)
            for j in range(reviews_cnt):
                # //*[@id="reviews-by-others"]/div[3]/div[6]/div[2]/div[3]/span
def removeStopwords(text):
    return re.sub(stopwordPattern, "", text)
def prepare_reuters_data(file_name_list=list(), min_word_size=4, process_text=True,
                         text_processing_function=nltk_tokenizer):
    from bs4 import BeautifulSoup
    from re import sub as replace
    from datetime import datetime as date

    # tags used to structure each news item
    record_tag = 'reuters'
    topics_tag = 'topics'
    topic_tag = 'd'
    text_tag = 'text'

    result = dict({'texts': list(), 'topics': dict(), 'text_to_topics': []})
    index = 0
    for file_name in file_name_list:
        print(date.now().strftime('[%Y-%m-%d %H:%M:%S]') + " - Processing file: %s" % file_name)
        file = open(file_name, 'r', encoding='utf-8', errors='ignore')
        xml = BeautifulSoup(file.read()).findAll(record_tag)
        print("\tNumber of records to process: %d" % len(xml))
        for record in xml:
            topics = record.find(topics_tag).findAll(topic_tag)
            # discard news items without a topic
            if len(topics) == 0:
                continue
            # add the index of the current text to the list kept for each
            # topic present in this news item
            current_topics = []
            for topic in topics:
                if not topic.text in result['topics'].keys():
                    result['topics'][topic.text] = list()
                result['topics'][topic.text].append(index)
                current_topics.append(topic.text)
            # for a given text (the i-th element of result['texts']),
            # build a list with the topics related to that text
            result['text_to_topics'].append(current_topics)
            text = record.find(text_tag).text
            if process_text:
                # remove characters that are not letters or spaces, lowercase
                # the text, and collapse any whitespace run into a single space
                text = replace(r'([\s]+)', r' ', text.lower())
                text = replace(r'([^a-zA-Z ]+)', r'', text)
                # apply text_processing_function
                text = text_processing_function(text=text, min_size=min_word_size)
            result['texts'].append(text)
            # print(index)
            index += 1
    return result
import re

print "Hello, World!"
s = "s#$%^&plunk"
print re.sub(r'\W', '', s)
def format_image(self, image):
    name = image["name"]
    # If inlining is turned on then we need to embed the image
    # into the generated output HTML file.
    if(self.m_inline == True):
        handle = open(name, "rb")
        data = base64.encodestring(handle.read())
        data = data.replace("\n", "")
        name = "data:image/jpeg;base64," + data
        handle.close()
    style = ""
    caption = ""
    href_start = ""
    href_end = ""
    if(image.has_key("width")):
        style += "width:%s;" % image["width"]
    if(image.has_key("height")):
        style += "height:%s;" % image["height"]
    if(image.has_key("caption")):
        caption = image["caption"]
    if(image.has_key("href")):
        href_start = "<a style='text-decoration:none;' href='%s'>" % image["href"]
        href_end = "</a>"
    if(image.has_key("align") and (image["align"] == "center" or image["align"] == "right")):
        if(image["align"] == "center"):
            return """
%s
<center>
<table style='text-align:center;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
</center>
%s
""" % (href_start, name, style, caption, href_end)
        elif(image["align"] == "right"):
            return """
%s
<table style='text-align:center;float:right;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
%s
""" % (href_start, name, style, caption, href_end)
    else:
        return """
%s
<span style='display:inline;'>
<table style='display:inline;text-align:center;'>
    <tr><td><img src='%s' style=\"%s\"/></td></tr>
    <tr><td><b>%s</b></td></tr>
</table>
</span>
%s
""" % (href_start, name, style, caption, href_end)
def split_nasdaq(symbol):
    sym = re.sub(_symbol_delimiter_regex, '', symbol)
    return sym[:4], sym[4:]
def no_groups(re):
    # turn every capturing "(" into a non-capturing "(?:", then undo the
    # change for groups that already started with "(?"
    return re.replace('(', '(?:').replace('(?:?', '(?')
def whitelist(value):
    return replace(r"[^\da-zA-Z\-_]", "", value)