def replace_links_with_text(html):
    """Replace any absolute links with their URL in plain text; do the same
    for any img tags."""
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = link.text or ''
        if text == '':
            # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')
def process_line(line, extra_tags):
    line = re.sub(' *#.*$', '', line)  # remove comments
    line = re.sub('-$', '', line)

    if not ' ' in line or re.match('.*[а-яіїєґ]/.*', line):
        out_line = line
    elif re.match('^[^ ]+ [^ ]+ [^:]?[a-z].*$', line):
        out_line = line
    elif re.match('^[^ ]+ [:^<a-z0-9_].*$', line):
        out_line = re.sub('^([^ ]+) ([^<a-z].*)$', '\\1 \\1 \\2', line)
    else:
        print('hit-', line, file=sys.stderr)
        base = re.findall('^[^ ]+', line)[0]
        out_line = re.sub('([^ ]+) ?', '\\1 ' + base + ' unknown' + extra_tags + '\n', line)
        return out_line[:-1]

    # if extra_tags != '' and not re.match('.* [a-z].*$', out_line):
    if extra_tags != '' and (not ' ' in out_line or ' ^' in out_line):
        extra_tags = ' ' + extra_tags
        if '|' in out_line:
            out_line = out_line.replace('|', extra_tags + '|')

    # if not "/" in out_line and not re.match("^[^ ]+ [^ ]+ [^ ]+$", out_line + extra_tags):
    #     print("bad line:", out_line + extra_tags, file=sys.stderr)
    # if len(out_line) > 100:
    #     print(out_line, file=sys.stderr)
    #     sys.exit(1)

    return out_line + extra_tags
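
# Hedged usage sketch for process_line (not from the original source): the tag
# grammar is project-specific (a LanguageTool-style Ukrainian dictionary is
# assumed here), so the sample line is illustrative only. A line already shaped
# as "word lemma tags" should pass through unchanged when extra_tags is empty.
assert process_line('бути бути verb:imperf:inf', '') == 'бути бути verb:imperf:inf'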
def importAuto(cls, string, path=None, activeFit=None, callback=None, encoding=None):
    # Get the first line and strip whitespace from it to avoid detection errors
    firstLine = re.split("[\n\r]+", string.strip(), maxsplit=1)[0]
    firstLine = firstLine.strip()

    # If an XML-style opening tag is encountered, detect as XML
    if re.match("<", firstLine):
        if encoding:
            return "XML", cls.importXml(string, callback, encoding)
        else:
            return "XML", cls.importXml(string, callback)

    # If JSON-style start, parse as CREST/JSON
    if firstLine[0] == '{':
        return "JSON", (cls.importCrest(string),)

    # If we have a source file name (used for the ship name) and the first
    # line contains something like [setup name], detect as an EFT config file
    if re.match(r"\[.*\]", firstLine) and path is not None:
        filename = os.path.split(path)[1]
        shipName = filename.rsplit('.')[0]
        return "EFT Config", cls.importEftCfg(shipName, string, callback)

    # If no file is specified and there's a comma between the brackets,
    # assume [ship, setup name] and detect as EFT export format
    if re.match(r"\[.*,.*\]", firstLine):
        return "EFT", (cls.importEft(string),)

    # Use DNA format for all other cases
    return "DNA", (cls.importDna(string),)
def __init__(self, host, debugfunc=None):
    if isinstance(host, types.TupleType):
        host, self.weight = host
    else:
        self.weight = 1

    # parse the connection string
    m = re.match(r'^(?P<proto>unix):(?P<path>.*)$', host)
    if not m:
        m = re.match(r'^(?P<proto>inet):'
                     r'(?P<host>[^:]+)(:(?P<port>[0-9]+))?$', host)
    if not m:
        m = re.match(r'^(?P<host>[^:]+):(?P<port>[0-9]+)$', host)
    if not m:
        raise ValueError('Unable to parse connection string: "%s"' % host)

    hostData = m.groupdict()
    if hostData.get('proto') == 'unix':
        self.family = socket.AF_UNIX
        self.address = hostData['path']
    else:
        self.family = socket.AF_INET
        self.ip = hostData['host']
        # groupdict() maps an unmatched optional group to None, so the
        # fallback must use "or", not dict.get()'s default argument
        self.port = int(hostData.get('port') or 11211)
        self.address = (self.ip, self.port)

    if not debugfunc:
        debugfunc = lambda x: x
    self.debuglog = debugfunc

    self.deaduntil = 0
    self.socket = None
    self.buffer = ''
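
# Sketch of the three connection-string shapes the parser above accepts,
# independent of the class: it re-applies the same regexes to sample strings.
# The hosts/paths are invented; 11211 is the default port hard-coded above.
import re

def _demo_parse_host(host):
    m = (re.match(r'^(?P<proto>unix):(?P<path>.*)$', host)
         or re.match(r'^(?P<proto>inet):(?P<host>[^:]+)(:(?P<port>[0-9]+))?$', host)
         or re.match(r'^(?P<host>[^:]+):(?P<port>[0-9]+)$', host))
    return m.groupdict() if m else None

assert _demo_parse_host('unix:/tmp/memcached.sock')['path'] == '/tmp/memcached.sock'
assert _demo_parse_host('inet:127.0.0.1:11212')['port'] == '11212'
assert _demo_parse_host('127.0.0.1:11211')['host'] == '127.0.0.1'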
def _apache_index(self, url):
    r = requests.get(url)
    if r.status_code != 200:
        raise ValueError(url + " status:" + str(r.status_code))
    r.dirs = []
    r.files = []
    for l in r.content.split("\n"):
        # '<img src="/icons/folder.png" alt="[DIR]" /> <a href="7.0/">7.0/</a>  03-Dec-2014 19:57    -   '
        # '<img src="/icons/tgz.png" alt="[ ]" /> <a href="owncloud_7.0.4-2.diff.gz">owncloud_7.0.4-2.diff.gz</a> 09-Dec-2014 16:53  9.7K <a href="owncloud_7.0.4-2.diff.gz.mirrorlist">Details</a>'
        m = re.search(r"<a\s+href=[\"']?([^>]+?)[\"']?>([^<]+?)[\"']?</a>\s*([^<]*)", l, re.I)
        if m:
            # ('owncloud_7.0.4-2.diff.gz', 'owncloud_7.0.4-2.diff.gz', '09-Dec-2014 16:53  9.7K ')
            m1, m2, m3 = m.groups()
            if re.match(r"(/|\?|\w+://)", m1):
                # skip absolute urls, query strings and foreign urls
                continue
            if re.match(r"\.?\./?$", m1):
                # skip . and ..
                continue
            m3 = re.sub(r"[\s-]+$", "", m3)
            if re.search(r"/$", m1):
                r.dirs.append([m1, m3])
            else:
                r.files.append([m1, m3])
    return r
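
# Hedged demo of the anchor-matching regex above, applied to the first sample
# Apache index line quoted in the comments. The trailing "/" on the href is
# what the function uses to tell directories from files.
import re

_line = '<img src="/icons/folder.png" alt="[DIR]" /> <a href="7.0/">7.0/</a>  03-Dec-2014 19:57    -   '
_m = re.search(r"<a\s+href=[\"']?([^>]+?)[\"']?>([^<]+?)[\"']?</a>\s*([^<]*)", _line, re.I)
assert _m.group(1) == '7.0/'
assert re.search(r"/$", _m.group(1))  # trailing slash => directory entry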
def __init__(self, filename):
    self.name = "YNAB"
    self.transactions = []
    with open(filename) as register:
        dr = csv.DictReader(register)
        for row in dr:
            trans = self._process_row(row)
            while True:
                # Merge split transactions into a single transaction
                regex = r'\(Split ([0-9]+)/([0-9]+)\)'
                match = re.match(regex, row["Memo"])
                if not match:
                    break
                for split_row in dr:
                    match = re.match(regex, split_row["Memo"])
                    t = self._process_row(split_row)
                    trans.amount += t.amount
                    current_split = match.group(1)
                    max_splits = match.group(2)
                    if current_split == max_splits:
                        break
                break
            # Rounding fixes the error from summing binary floats (from Mint)
            # that are expected to equal an exact two-decimal amount.
            trans.amount = round(trans.amount, 2)
            self.transactions.append(trans)
    self.transactions.sort()
def checkInCNAME(node_text, nodes):
    try:
        InCNAME = re.search("IN CNAME (.*)", node_text)
        alias = InCNAME.group(0).split("IN CNAME ")[1]
        # IP address found
        if re.match(r"(\d{1,3}\.)", alias):
            return alias
        # cname is a subdomain
        elif re.match(r".*[a-z]\.", alias):
            return ("subdomain found (" + alias + ")")
        # cname is another cname
        else:
            try:
                alias_name = dns.name.Name([alias])
                alias_IP = nodes[alias_name].to_text(alias_name)
                checkCname = checkInA(alias_IP)
                if checkCname is None:
                    return checkInCNAME(alias_IP, nodes)
                else:
                    return checkCname
            except Exception:
                return (Fore.RED + "unknown host (" + alias + ")" + Fore.RESET)
    # node has no IN CNAME
    except Exception:
        return None
def area_code_lookup(request, area_id, format):
    from mapit.models import Area, CodeType
    area_code = None
    if re.match(r'\d\d([A-Z]{2}|[A-Z]{4}|[A-Z]{2}\d\d\d|[A-Z]|[A-Z]\d\d)$', area_id):
        area_code = CodeType.objects.get(code='ons')
    elif re.match(r'[EW]0[12]\d{6}$', area_id):
        # LSOA/MSOA have ONS code type
        area_code = CodeType.objects.get(code='ons')
    elif re.match(r'[ENSW]\d{8}$', area_id):
        area_code = CodeType.objects.get(code='gss')
    if not area_code:
        return None

    args = {
        'format': format,
        'codes__type': area_code,
        'codes__code': area_id,
    }
    if re.match('[EW]01', area_id):
        args['type__code'] = 'OLF'
    elif re.match('[EW]02', area_id):
        args['type__code'] = 'OMF'

    area = get_object_or_404(Area, **args)
    path = '/area/%d%s' % (area.id, '.%s' % format if format else '')
    # If there was a query string, make sure it's passed on in the
    # redirect:
    if request.META['QUERY_STRING']:
        path += "?" + request.META['QUERY_STRING']
    return HttpResponseRedirect(path)
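
# Illustrative check of which area_id shapes route to which code type above.
# The IDs are just examples; the patterns are copied from the function.
import re

assert re.match(r'[EW]0[12]\d{6}$', 'E01000001')   # LSOA/MSOA -> 'ons'
assert re.match(r'[ENSW]\d{8}$', 'E14000530')      # GSS       -> 'gss'
assert not re.match(r'[EW]0[12]\d{6}$', 'E14000530')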
def parse_report(path):
    """ Return the volume information contained in the SIENAX report. This
        is a dictionary with keys "grey", "white", and "brain". The value for
        each tissue is a dictionary with the normalized and raw volumes, in
        cubic millimeters.

        adapted from: http://code.google.com/p/medipy/source/browse/plugins/fsl/sienax.py
        see licence: http://code.google.com/p/medipy/source/browse/LICENSE
    """
    report = {}
    with open(path) as fd:
        for line in fd.readlines():
            for tissue in ["GREY", "WHITE", "BRAIN"]:
                pattern = tissue + r"\s+([\d+\.]+)\s+([\d+\.]+)"
                measure = re.match(pattern, line)
                if measure:
                    normalized = float(measure.group(1))
                    raw = float(measure.group(2))
                    report[tissue.lower()] = {"normalized": normalized, "raw": raw}
                    continue
            vscale = re.match(r"VSCALING ([\d\.]+)", line)
            if vscale:
                report["vscale"] = float(vscale.group(1))
    return report
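
# A minimal sketch of the line format parse_report expects, based only on the
# regexes above: "<TISSUE> <normalized> <raw>" plus a VSCALING line. The
# sample numbers are invented.
import re

_m = re.match(r"GREY\s+([\d+\.]+)\s+([\d+\.]+)", "GREY  684143.11  622362.50")
assert float(_m.group(1)) == 684143.11 and float(_m.group(2)) == 622362.50
assert float(re.match(r"VSCALING ([\d\.]+)", "VSCALING 1.0992").group(1)) == 1.0992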
def parse_requirements(requirements_file='requirements.txt'):
    requirements = []
    with open(requirements_file, 'r') as f:
        for line in f:
            # drop the trailing newline so the comparisons below can match
            line = line.strip()
            # For the requirements list, we need to inject only the portion
            # after egg= so that distutils knows the package it's looking for
            # such as:
            # -e git://github.com/openstack/nova/master#egg=nova
            if re.match(r'\s*-e\s+', line):
                requirements.append(re.sub(r'\s*-e\s+.*#egg=(.*)$', r'\1', line))
            # such as:
            # http://github.com/openstack/nova/zipball/master#egg=nova
            elif re.match(r'\s*https?:', line):
                requirements.append(re.sub(r'\s*https?:.*#egg=(.*)$', r'\1', line))
            # -f lines are for index locations, and don't get used here
            elif re.match(r'\s*-f\s+', line):
                pass
            # -r lines are for including other files, and don't get used here
            elif re.match(r'\s*-r\s+', line):
                pass
            # argparse is part of the standard library starting with 2.7
            # adding it to the requirements list screws distro installs
            elif line == 'argparse' and sys.version_info >= (2, 7):
                pass
            else:
                requirements.append(line)
    return requirements
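
# Hedged round-trip example for parse_requirements (assumes the usual imports,
# re and sys, are in scope for the function). The file contents are invented;
# the expected list follows from the branches above: egg fragments extracted,
# -f/-r/index lines dropped, plain names kept.
import tempfile

_req = "-e git://github.com/openstack/nova.git#egg=nova\n-f http://example.com/index\nrequests\n"
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as _f:
    _f.write(_req)
assert parse_requirements(_f.name) == ['nova', 'requests']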
def parse_template(template_name):
    """Given a template name, attempt to extract its group name and upload date.

    Returns:
        * None if no group matched
        * group_name, datestamp of the first matching group. group_name will be
          a string, datestamp a :py:class:`datetime.date <python:datetime.date>`,
          or None if a date can't be derived from the template name
    """
    for group_name, regex in stream_matchers:
        matches = re.match(regex, template_name)
        if matches:
            groups = matches.groupdict()
            # hilarity may ensue if this code is run right before the new year
            today = date.today()
            year = int(groups.get('year', today.year))
            month, day = int(groups['month']), int(groups['day'])
            # validate the template date by turning it into a date obj
            template_date = futurecheck(date(year, month, day))
            return TemplateInfo(group_name, template_date, True)
    for group_name, regex in generic_matchers:
        matches = re.match(regex, template_name)
        if matches:
            return TemplateInfo(group_name, None, False)
    # If no match, unknown
    return TemplateInfo('unknown', None, False)
def check_api_version_decorator(logical_line, previous_logical, blank_before, filename):
    msg = ("N332: the api_version decorator must be the first decorator"
           " on a method.")
    if blank_before == 0 and re.match(api_version_re, logical_line) \
            and re.match(decorator_re, previous_logical):
        yield (0, msg)
def parse(fh):
    stats = []
    for line in fh:
        m = re.match(r'TRANSLATION\s+(?P<content>.*)\n', line)
        if not m:
            continue
        line = m.group('content')
        # NOTE: the character class below was garbled in the source (an
        # email-obfuscation artifact); a permissive class is assumed here.
        m = re.match(r'(?P<group>[\w@.-]+):', line)
        if not m:
            sys.stderr.write('Malformed TRANSLATION line: %s\n' % line)
            continue
        stat = {'group': m.group('group')}
        if stat['group'] == 'total':
            continue
        total = 0
        for x in stat_types:
            m = re.search(r'\b(?P<count>\d+) %s (message|translation)' % x, line)
            if m:
                stat[x] = int(m.group('count'))
                total += stat[x]
        stat['total'] = total
        stats.append(stat)
    return stats
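
# Small demo of the per-line regexes in parse(). The TRANSLATION line is
# invented, msgfmt-style, and 'translated' stands in for one of the (undefined
# here) stat_types; both are assumptions, as is the repaired group regex above.
import re

_line = 'TRANSLATION uk: 120 translated messages, 5 fuzzy translations\n'
_m = re.match(r'TRANSLATION\s+(?P<content>.*)\n', _line)
assert _m.group('content') == 'uk: 120 translated messages, 5 fuzzy translations'
assert re.search(r'\b(?P<count>\d+) %s (message|translation)' % 'translated',
                 _m.group('content')).group('count') == '120'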
def readFile(fileV4, fileV6, trie):
    # open ipv4 file
    pattern = r'(\d+)\,(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/(\d{1,2}).*'
    with open(fileV4, "r") as infile:
        for line in infile:
            result = re.match(pattern, line)
            if result:
                address = result.group(2)
                length = result.group(3)
                asn = result.group(1)
                update = True
                withdrawal = False
                count = 0
                insertTrie(trie, address, length, asn, update, withdrawal, count)

    # open ipv6 file
    pattern = r'(\d+)\,(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))/(\d{1,3}),.*'
    with open(fileV6, "r") as infile:
        for line in infile:
            result = re.match(pattern, line)
            if result:
                address = result.group(2)
                # group 32 is the prefix length (the last group of the pattern)
                length = result.group(32)
                asn = result.group(1)
                update = True
                withdrawal = False
                count = 0
                insertTrie(trie, address, length, asn, update, withdrawal, count)
    return trie
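
# Sketch of the IPv4 CSV row format readFile expects, inferred from the
# pattern above: "<asn>,<prefix>/<len>,...". The sample row is invented.
import re

_pat = r'(\d+)\,(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/(\d{1,2}).*'
_m = re.match(_pat, '15169,8.8.8.0/24,...')
assert (_m.group(1), _m.group(2), _m.group(3)) == ('15169', '8.8.8.0', '24')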
def history(self, page):
    GIT_COMMIT_FIELDS = ["commit", "author", "date", "date_relative", "message"]
    GIT_LOG_FORMAT = "%x1f".join(["%h", "%an", "%ad", "%ar", "%s"]) + "%x1e"
    output = git.log("--format=%s" % GIT_LOG_FORMAT, "--follow", "-z", "--shortstat", page.abspath)
    output = output.split("\n")
    history = []
    for line in output:
        if "\x1f" in line:
            log = line.strip("\x1e\x00").split("\x1f")
            history.append(dict(zip(GIT_COMMIT_FIELDS, log)))
        else:
            insertion = re.match(r".* (\d+) insertion", line)
            deletion = re.match(r".* (\d+) deletion", line)
            history[-1]["insertion"] = int(insertion.group(1)) if insertion else 0
            history[-1]["deletion"] = int(deletion.group(1)) if deletion else 0
    max_changes = float(max([(v["insertion"] + v["deletion"]) for v in history])) or 1.0
    for v in history:
        v.update({
            "insertion_relative": str((v["insertion"] / max_changes) * 100),
            "deletion_relative": str((v["deletion"] / max_changes) * 100),
        })
    return history
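
# Demo of the delimiter scheme used above: git log fields are joined with
# 0x1f (unit separator) and each record ends with 0x1e (record separator).
# The sample record is fabricated.
GIT_COMMIT_FIELDS = ["commit", "author", "date", "date_relative", "message"]
_record = "\x1f".join(["abc1234", "Ada", "Mon Jan 1", "2 days ago", "fix typo"]) + "\x1e"
_log = _record.strip("\x1e\x00").split("\x1f")
assert dict(zip(GIT_COMMIT_FIELDS, _log))["message"] == "fix typo"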
def filter_services(svcs):
    filtered = []
    # filter includes
    if _args['--has']:
        for sv in svcs:
            for inc in _args['--has']:
                if inc in sv["tags"] and sv not in filtered:
                    filtered.append(sv)
    if _args['--match']:
        for sv in svcs:
            for regex in _args['--match']:
                for tag in sv["tags"]:
                    if re.match(regex, tag) and sv not in filtered:
                        filtered.append(sv)
    if not filtered and not _args['--has'] and not _args['--match']:
        filtered = svcs

    # filter excludes
    if _args['--has-not']:
        for sv in list(filtered):  # operate on a copy, otherwise .remove would change the list under our feet
            for exc in _args['--has-not']:
                if exc in sv["tags"]:
                    filtered.remove(sv)
    if _args['--no-match']:
        for sv in list(filtered):
            for regex in _args['--no-match']:
                for tag in sv["tags"]:
                    if re.match(regex, tag) and sv in filtered:
                        filtered.remove(sv)
    return filtered
def process_isolation_file(self, sql_file, output_file):
    """
    Processes the given sql file and writes the output to output file
    """
    try:
        command = ""
        for line in sql_file:
            tinctest.logger.info("re.match: %s" % re.match(r"^\d+[q\\<]:$", line))
            print >> output_file, line.strip(),
            (command_part, dummy, comment) = line.partition("--")
            if command_part == "" or command_part == "\n":
                print >> output_file
            elif command_part.endswith(";\n") or re.match(r"^\d+[q\\<]:$", line):
                command += command_part
                tinctest.logger.info("Processing command: %s" % command)
                self.process_command(command, output_file)
                command = ""
            else:
                command += command_part

        for process in self.processes.values():
            process.stop()
    except:
        for process in self.processes.values():
            process.terminate()
        raise
    finally:
        for process in self.processes.values():
            process.terminate()
def main():
    f = open("makefile2wrappers.txt", "r")
    lins = f.readlines()
    f.close()
    for l in lins:
        l = l.strip()
        if len(l) == 0:
            continue
        print('Line: ' + l)
        # $(C) -DDINT -c ../Source/umf_analyze.c -o umf_i_analyze.o
        defs = re.match(r".*\)(.*)-c", l).group(1).strip()
        # If the line has a "-o" flag, emit a wrapper named after the output;
        # otherwise just wrap the source file as is:
        if re.search('.*-o.*', l) is not None:
            src = re.match(".*-c(.*)-o", l).group(1).strip()
            out = re.match(".*-o(.*)", l).group(1).strip()
            f = 'SourceWrappers/' + out + ".c"
            print(' => Creating ' + f + '\n')
            o = open(f, "w")
            DEFs = defs.strip().split("-D")
            DEFs = [x for x in DEFs if x]  # Remove empty entries
            for d in DEFs:
                o.write('#define ' + d + '\n')
            o.write('#include <' + src + '>' + '\n')
            o.close()
        else:
            src = re.match(".*-c(.*)", l).group(1).strip()
            f = "SourceWrappers/" + os.path.basename(src)
            print(' => Creating ' + f + '\n')
            o = open(f, "w")
            o.write('#include <' + src + '>' + '\n')
            o.close()
    return 0
def main():
    f = open('4_dataset.txt', 'r')
    x = f.readlines()
    f.close()
    for line in x:
        if re.match('a={(.*)}', line):
            a = re.match('a={(.*)}', line).group(1).split(',')
        elif re.match('b={(.*)}', line):
            b = re.match('b={(.*)}', line).group(1).split(',')

    f00 = f01 = f10 = f11 = 0
    print 'a =', [int(i) for i in a]
    print 'b =', [int(i) for i in b]
    for i in zip(a, b):
        if i == ('0', '0'):
            f00 += 1
        if i == ('0', '1'):
            f01 += 1
        if i == ('1', '0'):
            f10 += 1
        if i == ('1', '1'):
            f11 += 1
    print 'Similarity Coeff =', float(f00 + f11) / (f00 + f01 + f10 + f11)
    print 'Jaccard Coeff =', f11 / float(f01 + f10 + f11)
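
# Worked example of the two coefficients computed above, on invented vectors:
# a = [1,0,1,1], b = [0,0,1,1] gives f00=1, f01=0, f10=1, f11=2, so the
# simple matching coefficient is (1+2)/4 = 0.75 and Jaccard is 2/3.
a = ['1', '0', '1', '1']
b = ['0', '0', '1', '1']
f00 = sum(1 for p in zip(a, b) if p == ('0', '0'))
f01 = sum(1 for p in zip(a, b) if p == ('0', '1'))
f10 = sum(1 for p in zip(a, b) if p == ('1', '0'))
f11 = sum(1 for p in zip(a, b) if p == ('1', '1'))
assert float(f00 + f11) / (f00 + f01 + f10 + f11) == 0.75
assert f11 / float(f01 + f10 + f11) == 2 / 3.0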
def test_various_ops(self):
    # This takes about n/3 seconds to run (about n/3 clumps of tasks,
    # times about 1 second per clump).
    NUMTASKS = 10

    # no more than 3 of the 10 can run at once
    sema = threading.BoundedSemaphore(value=3)
    mutex = threading.RLock()
    numrunning = Counter()

    threads = []

    for i in range(NUMTASKS):
        t = TestThread("<thread %d>" % i, self, sema, mutex, numrunning)
        threads.append(t)
        self.assertEqual(t.ident, None)
        self.assertTrue(re.match(r"<TestThread\(.*, initial\)>", repr(t)))
        t.start()

    if verbose:
        print("waiting for all tasks to complete")
    for t in threads:
        t.join(NUMTASKS)
        self.assertTrue(not t.is_alive())
        self.assertNotEqual(t.ident, 0)
        self.assertFalse(t.ident is None)
        self.assertTrue(re.match(r"<TestThread\(.*, stopped -?\d+\)>", repr(t)))
    if verbose:
        print("all tasks done")
    self.assertEqual(numrunning.get(), 0)
def processAux(self, dFrag):
    self.depth = self.depth + 1
    if not self.files.has_key(self.depth):
        self.files[self.depth] = []
    thisDir = self.compoundDir(self.topDir, dFrag)
    os.chdir(thisDir)
    self.theDict[thisDir] = {'xml': [], 'bin': [], 'dir': []}
    # print "Processing", thisDir, " Depth", self.depth
    thisDirContents = os.listdir(thisDir)
    for fname in thisDirContents:
        if stat.S_ISDIR(os.stat(fname)[stat.ST_MODE]):
            if not re.match(r"^(CVS|images|search|photos|htdig|\.)", fname) and self.depth < 4:
                self.processAux(self.compoundDir(dFrag, fname))
                self.handleDir(thisDir, fname)
                os.chdir(thisDir)
        else:
            # print "File", fname
            if re.match(r".*\.xml$", fname):
                self.handleXML(thisDir, dFrag, fname)
            elif re.match(r".*\.(jpe?g|JPG|gif|png|html)$", fname):
                self.handleBinary(thisDir, fname)
    self.writeIndex(dFrag)
    self.depth = self.depth - 1
def tourAllFiles(dirpath):
    global a
    global alen
    global domain
    global person
    # names = list of files in current path
    names = os.listdir(dirpath)
    # find 'si' and 'sx' prefix and 'phn' suffix,
    # filtering out the 'sa' prefix
    pat1 = r'.*si.*\.phn'
    pat2 = r'.*sx.*\.phn'
    drpat = r'dr\d'
    for name in names:
        if re.match(pat1, name) is not None or re.match(pat2, name) is not None:
            phn2label(name)
        curpath = dirpath + '/' + name
        if os.path.isdir(curpath):
            # only descend the drX/person/xxx.phn layout
            if re.match(drpat, name):
                domain = name
            else:
                person = name
            # recurse into the subdirectory
            os.chdir(curpath)
            tourAllFiles(curpath)
            os.chdir(dirpath)
def register(request):
    '''
    Handle a POST request with the following information:
    login, password, email
    '''
    print 'receiving a request'

    # parameter retrieval
    try:
        login = request.GET['registerLogin']
        password = request.GET['registerPassword']
        email = request.GET['registerEmail']
    except MultiValueDictKeyError:
        response = HttpResponse('400 - BAD URI')
        response.status_code = 400
        return response

    # parameter validation (the login regex is anchored and non-empty so a
    # partial match can't slip through)
    loginIsValid = re.match(r'^\w+$', login) and 3 < len(login) < 16
    passwordIsValid = len(password) >= 6  # TODO check with number
    emailIsValid = re.match(r'[\w.]*@\w*\.[\w.]*', email)
    logger.info(login + ' ' + password + ' ' + email)

    if loginIsValid and passwordIsValid and emailIsValid:
        return processFormInformation(login, password, email, request)
    else:
        response = HttpResponse("400")
        response['message'] = 'invalid information'
        response.status_code = 400
        return response
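
# Hedged demo of the (tightened) validation rules above, with invented inputs.
import re

assert re.match(r'^\w+$', 'alice_01') and 3 < len('alice_01') < 16
assert re.match(r'[\w.]*@\w*\.[\w.]*', 'alice@example.com')
assert not re.match(r'^\w+$', 'bad login!')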
def parse(self, response):
    sel = Selector(response)
    result = []
    ad = DatesItem()
    ad['name'] = ""
    for p in sel.xpath("//div[@class='poziomd']//text()").extract():
        if re.match("^.*,", p):
            if p.startswith(","):
                ad['desc'] = p[2:]
            else:
                ad['desc'] = p[6:]
            ad['name'] = ad['name'].lstrip('1234567890() ').strip()
            if re.match(r'^.\s', ad['name']):
                ad['name'] = ad['name'][2:]
            ad['url'] = response.url
            if re.match(".*urodzeni.*", response.url):
                ad['isBirth'] = True
            else:
                ad['isBirth'] = False
            result.append(ad)
            ad = DatesItem()
            ad['name'] = ""
        elif re.match(r"^\s*[0-9]{1,4}", p) and not ad.has_key('date'):
            ad['date'] = re.match(r"^\s*[0-9]{1,4}", p).group()
        else:
            ad['name'] = ad['name'] + p
    return result
def _sanitize(self, badKey, badVal):
    valid = True

    # Used for debugging
    if 'csv_line' not in self:
        self['csv_line'] = "-1"

    # Catch bad formatting
    if badKey in self:
        logging.debug(badKey, ''.join(self[badKey]))
        logging.debug("Bad Key")
        valid = False

    if 'last_pymnt_d' in self and re.match(r"^\s*$", self['last_pymnt_d']):
        if 'issue_d' in self:
            # If no payment received, last payment date = issue date
            self['last_pymnt_d'] = self['issue_d']

    for k, v in self.items():
        if badVal == v:
            logging.debug(badVal)
            valid = False
            break
        # Replace empties with 0s
        if re.match(r'^\s*$', str(v)):
            self[k] = 0

    if not valid:
        logging.debug(self.items())
        # Can't safely access specific keys, other than id, when incorrectly formatted
        logging.warning("Fix Loan {}".format(self['id']))
        logging.warning("Line {}".format(self['csv_line']))
    return valid
def _get_type_of_macro(self, macros, clss):
    for macro in macros:
        # ARGN macros
        if re.match(r'ARG\d', macro):
            macros[macro]['type'] = 'ARGN'
            continue
        # USERN macros are managed in the Config class,
        # so no need to look for them here
        elif re.match(r'_HOST\w', macro):
            macros[macro]['type'] = 'CUSTOM'
            macros[macro]['class'] = 'HOST'
            continue
        elif re.match(r'_SERVICE\w', macro):
            macros[macro]['type'] = 'CUSTOM'
            macros[macro]['class'] = 'SERVICE'
            # value of macro: re.split('_HOST', '_HOSTMAC_ADDRESS')[1]
            continue
        elif re.match(r'_CONTACT\w', macro):
            macros[macro]['type'] = 'CUSTOM'
            macros[macro]['class'] = 'CONTACT'
            continue
        # On demand macro
        elif len(macro.split(':')) > 1:
            macros[macro]['type'] = 'ONDEMAND'
            continue
        # OK, classical macro...
        for cls in clss:
            if macro in cls.macros:
                macros[macro]['type'] = 'class'
                macros[macro]['class'] = cls
                continue
def _strip_and_unquote(keys, value):
    if value[:3] == "'''":
        m = re.match(_MULTI_LINE_SINGLE, value)
        if m:
            value = m.groups()[0]
        else:
            raise IllegalValueError("string", keys, value)
    elif value[:3] == '"""':
        m = re.match(_MULTI_LINE_DOUBLE, value)
        if m:
            value = m.groups()[0]
        else:
            raise IllegalValueError("string", keys, value)
    elif value[0] == '"':
        m = re.match(_DQ_VALUE, value)
        if m:
            value = m.groups()[0]
        else:
            raise IllegalValueError("string", keys, value)
    elif value[0] == "'":
        m = re.match(_SQ_VALUE, value)
        if m:
            value = m.groups()[0]
        else:
            raise IllegalValueError("string", keys, value)
    else:
        # unquoted: strip trailing comments
        value = re.sub(r'\s*#.*$', '', value)

    # Note strip() removes leading and trailing whitespace, including
    # initial newlines on a multiline string:
    return value.strip()
def __load_book_menu(self, lines):
    r1 = re.compile(u'^\s*目\s*录\s*$')  # "Table of Contents" heading
    r2 = re.compile(u'^\s*([^·…]+)\s*[·.…]{2,}\s*([l\d]+)\s*$')
    menus = {}
    start = False
    not_match = 0
    for line in lines:
        words = line.decode(self.default_coding)
        # strip() returns a new string; the original discarded the result
        words = words.strip('\n')
        if re.match(r1, words):
            start = True
            continue
        elif start:
            m = re.match(r2, words)
            if m:
                title = m.group(1)
                page = m.group(2)
                page = page.replace('l', '1')  # common OCR error: 'l' for '1'
                page = int(page.encode(self.default_coding))
                menus[page] = self.__get_simple_string(title)
                not_match = 0
            else:
                not_match += 1
                if not_match > 10:
                    break
    return menus
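
# Demo of the TOC entry regex above: a title, a dotted leader, then a page
# number in which OCR may have produced 'l' for '1' (hence the replacement).
# The sample entry ("Chapter 1: Overview ... page 12") is invented.
import re

_r2 = re.compile(u'^\s*([^·…]+)\s*[·.…]{2,}\s*([l\d]+)\s*$')
_m = _r2.match(u'第一章 概述 ………… l2')
assert _m.group(2).replace('l', '1') == '12'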
def create_filetree(path=None, depth=0, max_depth=0):
    tree = None
    if max_depth == 0 or depth < max_depth:
        if path is None:
            path = os.getcwd()
        tree = dict(name=os.path.basename(path), children=[])
        try:
            lst = os.listdir(path)
        except OSError:
            pass  # ignore errors
        else:
            for name in lst:
                fn = os.path.join(path, name)
                if (os.path.isdir(fn) and
                        re.match('^.*(Compiled)$', fn) is None):
                    child = create_filetree(fn, depth + 1, max_depth)
                    if child is not None:
                        tree['children'].append(child)
                elif re.match(r'^.*\.(m|def|txt|csv)$', fn) is not None:
                    tree['children'].append(dict(
                        name=fn.replace(os.getcwd() + os.path.sep, "")))
    return tree
def process_line_exceptions(line, extra_tags):
    global except_base_tag

    if not ' ' in line or re.match('.*[а-яіїєґ]/.*', line):
        return line
    if re.match('^[^ ]+ [^ ]+ [^:]?[a-z].*$', line):
        return line

    if line.startswith('# !'):
        except_base_tag = re.findall('![a-z:-]+', line)[0][1:] + ':'
        return ''

    base = re.findall('^[^ ]+', line)[0]

    except_base_tag2 = except_base_tag
    if base.endswith('ся'):
        except_base_tag2 = except_base_tag.replace('verb:', 'verb:rev:')

    out_line = re.sub('([^ ]+) ?', '\\1 ' + base + ' ' + except_base_tag2 + 'unknown' + extra_tags + '\n', line)

    if except_base_tag in ('verb:imperf:', 'verb:perf:'):
        base_add = 'inf:'
        # if base.endswith('ся'):
        #     base_add = 'rev:' + base_add
        out_line = re.sub("(verb:(?:rev:)?)((im)?perf:)", "\\1inf:\\2", out_line, 1)
        out_lines = out_line.split('\n')
        out_lines[0] = out_lines[0].replace(':unknown', '')
        out_line = '\n'.join(out_lines)

    return out_line[:-1]