class Check:
    """A single comparison of a process variable against an expected value.

    A check consists of the variable name, an operator symbol and a value.
    An optional divider (only "min" is supported) makes the check compare
    the value scaled over one minute instead of the raw value.
    """

    # Each entry maps an operator symbol to a factory.  The factory receives
    # the expected value (b) and returns a one-argument callable that is later
    # applied to the actual value (a).
    operators = {
        '~=': lambda b: regexp_compile(b).match,
        '==': lambda b: lambda a: a == b,
        '!=': lambda b: lambda a: a != b,
        '<=': lambda b: lambda a: a <= b,
        '>=': lambda b: lambda a: a >= b,
        '<': lambda b: lambda a: a < b,
        '>': lambda b: lambda a: a > b,
    }

    def __init__(self, var, symbol, value, divider=None):
        """Build the check.

        :param var: name of the variable to look up on the process
        :param symbol: one of the keys of ``operators``
        :param value: expected value (a pattern string for ``~=``)
        :param divider: ``None`` or ``'min'``
        :raises KeyError: if ``symbol`` is not a known operator
        :raises NotImplementedError: if ``divider`` is anything but "min"
        """
        self.var = var
        self.symbol = symbol
        self.value = value
        self.executor = self.operators[symbol](value)
        if divider:
            if divider != 'min':
                # NotImplemented is a comparison sentinel and is not
                # callable; the exception class is NotImplementedError.
                raise NotImplementedError('Only "/min" is supported')
            self.divider = timedelta(minutes=1)
        else:
            self.divider = None

    def __str__(self):
        """Return the check in "<var>[ / <divider>] <symbol> <value>" form."""
        key = self.var
        if self.divider:
            key += ' / {}'.format(self.divider)
        return '{} {} {}'.format(key, self.symbol, self.value)

    def __call__(self, process):
        """Evaluate the check against a process.

        ``process`` must support ``process[var]`` and, for scaled checks,
        ``process.get_scaled_value(var, divider)``.  A falsy scaled value
        makes the check fail without consulting the operator.
        """
        if self.divider:
            value = process.get_scaled_value(self.var, self.divider)
            if not value:
                return False
        else:
            value = process[self.var]
        return self.executor(value)

    @classmethod
    def parse(cls, pair):
        """Create a check from a string such as "var / min >= 10".

        :raises ValueError: if no known operator occurs in ``pair``
        """
        # Longer symbols are tried first so that "<=" wins over "<".
        for symbol in sorted(cls.operators.keys(), key=len, reverse=True):
            if symbol in pair:
                index = pair.index(symbol)
                # Left-hand side is "<var>" or "<var> / <divider>".
                left_split = pair[:index].split('/', 1)
                var = left_split[0].strip()
                if len(left_split) > 1:
                    divider = left_split[1].strip()
                else:
                    divider = None
                value = cast(pair[(index + len(symbol)):].strip())
                return cls(var, symbol, value, divider)
        raise ValueError('Cannot parse {}'.format(pair))
import shlex

from HTMLParser import HTMLParser
from tempfile import NamedTemporaryFile
from os import unlink
from subprocess import Popen, PIPE
from mimetypes import guess_type
from re import compile as regexp_compile, DOTALL, escape

import cld
import magic

from pypln.backend.celery_task import PyPLNTask


# A whole "<...>" tag (group 1) and the tag-name characters that follow the
# optional leading spaces/tabs (group 2).
regexp_tags = regexp_compile(r'(<[ \t]*([a-zA-Z0-9!"./_-]*)[^>]*>)',
                             flags=DOTALL)
# An HTML comment, matched non-greedily and across lines.
regexp_comment = regexp_compile(r'<!--.*?-->', flags=DOTALL)
# One or more newlines (captured) followed by any spaces/tabs.
regexp_spaces_start = regexp_compile('([\n]+)[ \t]*', flags=DOTALL)
# Any spaces/tabs directly before a newline.
regexp_spaces_end = regexp_compile('[ \t]*\n', flags=DOTALL)
# Three or more consecutive newlines.
regexp_newlines = regexp_compile('[\n]{3,}', flags=DOTALL)
# Two or more consecutive spaces/tabs.
regexp_spaces = regexp_compile('[ \t]{2,}', flags=DOTALL)
# Any spaces/tabs directly before one of "!,.:;?" (the mark is captured).
regexp_punctuation = regexp_compile('[ \t]*([' + escape('!,.:;?') + '])',
                                    flags=DOTALL)
# Tag names as captured by group 2 of regexp_tags.
# NOTE(review): presumably the tags that produce a line break in the
# extracted text -- confirm against the code that consumes this list.
breakline_tags = ['table', '/table', 'tr', 'div', '/div', 'h1', '/h1', 'h2',
                  '/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6',
                  'br', 'br/']
# NOTE(review): presumably the subset of tags that gets a double line break.
double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def clean(text):
    # Keep the newlines but drop the spaces/tabs that follow them.
    text = regexp_spaces_start.sub(r'\1', text)
# coding: utf-8 from __future__ import unicode_literals import argparse import datetime from collections import OrderedDict from decimal import Decimal from re import compile as regexp_compile import rows from lxml.etree import HTML REGEXP_PAGE = regexp_compile(r'^[0-9]+ de [0-9]+$') MONTHS = 'JAN FEV MAR ABR MAI JUN JUL AGO SET OUT NOV DEZ' FIELDS = OrderedDict([('category', rows.fields.TextField), ('description', rows.fields.TextField), ('value', rows.fields.DecimalField), ('date', rows.fields.DateField)]) def partition(data, number): for index in range(0, len(data), number): yield data[index:index + number] def convert_text(text): return text.replace('\xa0', ' ')
#!/usr/bin/env python
# coding: utf-8

import argparse
import sys
import time
from datetime import timedelta
from re import compile as regexp_compile
from tempfile import TemporaryFile

import pymongo
from pypln.api import PyPLN


# Four groups: text before the first ":", then three "/"-separated parts
# (the last one may itself contain "/").
# NOTE(review): presumably host:port/database/collection -- confirm.
regexp_mongodb = regexp_compile(r'([^:]+):([^/]+)/([^/]+)/(.+)')


def partition(iterator, n):
    """Yield lists of up to `n` consecutive items taken from `iterator`.

    Every yielded list has exactly `n` items, except possibly the last one,
    which holds the remainder.  Empty lists are never yielded.

    :param iterator: any iterable
    :param n: chunk size, must be at least 1
    :raises ValueError: if `n` is smaller than 1 (raised on first iteration;
        such a value used to make the generator spin forever)
    """
    if n < 1:
        raise ValueError('n must be at least 1')
    iterator = iter(iterator)
    while True:
        values = []
        for _ in range(n):
            try:
                # Built-in next() works on Python 2.6+ and Python 3, unlike
                # the Python 2 only ``iterator.next()`` method.
                values.append(next(iterator))
            except StopIteration:
                # Exhausted: stop polling the iterator for this chunk.
                break
        if values:
            yield values
        if len(values) < n:
            # A short (or empty) chunk means the input is exhausted.
            return
class Check:
    """A limit on the number of processes that match a specification.

    The specification is parsed from a string of the form
    "[<count>[%]] [in transaction [for <time>] [at <state>]]
    [on <command>] [for <time>] [at <state>]".
    """

    # Raw strings keep "\s", "\A" and "\Z" as regular expression escapes
    # instead of invalid string escapes; the resulting pattern is unchanged.
    pattern = regexp_compile(r'\s*'.join([    # Allow spaces between everything
        r'\A',
        '(?:',                                  # Count clause
        '(?P<count_number>[0-9]+)',
        '(?P<count_unit>%?)',
        ')?',
        '(?:in',                                # Transaction after separator
        '(?P<txn>transaction)',
        '(?:for',                               # Time clause after separator
        '(?P<txn_time_number>[0-9]+)',
        '(?P<txn_time_unit>{time_units})?'
        ')?',
        '(?:at',                                # State after separator
        '(?P<txn_state>[a-z ]+?)',
        ')?',
        ')?',
        '(?:on',                                # Command after separator
        '(?P<command>[a-z ]+?)',
        ')?',
        '(?:for',                               # Time clause after separator
        '(?P<command_time_number>[0-9]+)',
        '(?P<command_time_unit>{time_units})?'
        ')?',
        '(?:at',                                # State after separator
        '(?P<command_state>[a-z ]+?)',
        ')?',
        r'\Z',
    ]).format(
        time_units='|'.join(k for k, v in Interval.units)
    ))

    def __init__(self, arg):
        """Parse the specification string.

        :raises ArgumentTypeError: if `arg` does not match `pattern`
        """
        matches = self.pattern.match(arg)
        if not matches:
            raise ArgumentTypeError('"{}" cannot be parsed'.format(arg))
        self.count_number = int(matches.group('count_number') or 1)
        # The group is None when the whole count clause is absent; normalise
        # it to '' so that __str__ can concatenate it.
        self.count_unit = matches.group('count_unit') or ''
        self.txn = matches.group('txn')
        # Missing numbers default to 0, missing units to the first unit.
        self.txn_time = Interval(
            int(matches.group('txn_time_number') or 0),
            matches.group('txn_time_unit') or Interval.units[0][0],
        )
        self.txn_state = matches.group('txn_state')
        self.command = matches.group('command')
        self.command_state = matches.group('command_state')
        self.command_time = Interval(
            int(matches.group('command_time_number') or 0),
            matches.group('command_time_unit') or Interval.units[0][0],
        )

    def __repr__(self):
        return "'{}'".format(self.__str__())

    def __str__(self):
        return str(self.count_number) + self.count_unit + self.get_spec_str()

    def get_spec_str(self):
        """Return the specification without the count clause."""
        spec = ''
        if self.txn:
            spec += ' in {}'.format(self.txn)
        if self.txn_time:
            spec += ' for {}'.format(self.txn_time)
        if self.txn_state:
            spec += ' at {}'.format(self.txn_state)
        if self.command:
            spec += ' on {}'.format(self.command)
        if self.command_time:
            spec += ' for {}'.format(self.command_time)
        if self.command_state:
            spec += ' at {}'.format(self.command_state)
        return spec

    def relative(self):
        """Return True if the count was given as a percentage."""
        return bool(self.count_unit)

    def get_problem(self, db):
        """Return a problem description, or None if the limit is not reached.

        Counts the processes of `db` that pass the command checks and, when
        a transaction clause was given, the transaction checks.
        """
        count = 0
        for process in db.get_processes():
            if process['time'] < int(self.command_time):
                # NOTE(review): the early exit presumably relies on
                # db.get_processes() being ordered by descending time --
                # confirm against the db implementation.
                if not self.txn_time:
                    break
                continue
            if self.fail_command(process):
                continue
            if self.txn and self.fail_txn(process, db):
                continue
            count += 1

        if count >= self.get_count_limit(db):
            return self.format_problem(count)
        return None

    def fail_command(self, process):
        """Return True if the process fails the command or state check."""
        # Command time is checked by the caller.
        if self.command and process['command'].lower() != self.command:
            return True
        if self.command_state:
            if not process['state'].lower().startswith(self.command_state):
                return True
        return False

    def fail_txn(self, process, db):
        """Return True if the process has no transaction matching the spec."""
        txn_info = db.get_txn(process['id'])
        if not txn_info:
            return True
        if txn_info['seconds'] < int(self.txn_time):
            return True
        if self.txn_state:
            if not txn_info['state'].lower().startswith(self.txn_state):
                return True
        return False

    def get_count_limit(self, db):
        """Return the absolute number of processes that triggers a problem.

        A relative count is taken as a percentage of
        ``db.get_max_connections()``.
        """
        if not self.relative():
            return self.count_number
        return self.count_number * db.get_max_connections() / 100.0

    def format_problem(self, count):
        """Return the message reported for `count` matching processes."""
        problem = '{} processes{}'.format(count, self.get_spec_str())
        if self.count_number > 1 or self.count_unit:
            problem += ' exceeds ' + str(self.count_number) + self.count_unit
        return problem
import os
from collections import namedtuple, OrderedDict
from io import BytesIO
from pathlib import Path
from re import compile as regexp_compile

import requests
import requests_cache
import rows
import rows.utils


# sports-reference.com pages for the summer games and for the countries.
URL_YEARS = 'http://www.sports-reference.com/olympics/summer/'
URL_COUNTRIES = 'http://www.sports-reference.com/olympics/countries/'
# Template that takes the `country_code` and `year` format fields.
URL_DATA = 'http://www.sports-reference.com/olympics/countries/{country_code}/summer/{year}/'
# Captures a three-uppercase-letter country code from a link target and the
# link text that follows it.
REGEXP_COUNTRY = regexp_compile(r'/olympics/countries/([A-Z]{3})/">([^<]+)<')
# Column name -> rows field type, in column order.
FIELDS = OrderedDict([
    ('rk', rows.fields.IntegerField),
    ('athlete', rows.fields.TextField),
    ('gender', rows.fields.TextField),
    ('age', rows.fields.IntegerField),
    ('sport', rows.fields.TextField),
    ('gold', rows.fields.IntegerField),
    ('silver', rows.fields.IntegerField),
    ('bronze', rows.fields.IntegerField),
    ('total', rows.fields.IntegerField),
])
# Column name -> rows field type, starting with the year and country columns.
FULL_FIELDS = OrderedDict([
    ('year', rows.fields.IntegerField),
    ('country_code', rows.fields.TextField),
    ('country_name', rows.fields.TextField),
# coding: utf-8 import HTMLParser import zipfile from re import compile as regexp_compile, DOTALL from unicodedata import normalize html_parser = HTMLParser.HTMLParser() regexp_tags = regexp_compile(r'<[ \t]*[a-zA-Z0-9!"./_-]*[^>]*>', flags=DOTALL) regexp_comment = regexp_compile(r'<!--.*?-->', flags=DOTALL) regexp_ods_table = regexp_compile(r'(<table:table [^>]*>)(.*?)' r'(</table:table>)', flags=DOTALL) regexp_ods_table_row = regexp_compile(r'(<table:table-row[^>]*>)(.*?)' r'(</table:table-row>)', flags=DOTALL) regexp_ods_table_cell = regexp_compile(r'(<table:table-cell[^>]*>)(.*?)' r'(</table:table-cell>)', flags=DOTALL) # TODO: encoding? # TODO: replace &...; # TODO: name/id of tables # TODO: re.MULTILINE # TODO: identify types # TODO: clear empty rows? # TODO: clear non-table rows? def tables_ods(filename, headers=False, strip_xml=True):
# THE SOFTWARE. from argparse import ArgumentParser, RawTextHelpFormatter from collections import defaultdict from datetime import datetime, timedelta from operator import itemgetter from os.path import isfile from re import compile as regexp_compile from subprocess import Popen, PIPE from sys import exit # The option arguments which accept a check CHECK_ARGS = ['match', 'parent', 'exclude', 'warning', 'critical'] TIMEDELTA_PATTERN = regexp_compile('\A(((?P<days>[0-9]+)(\-| *days?,? *))?' '((?P<hours>[0-9]+):))?' '(?P<minutes>[0-9]+):' '(?P<seconds>[0-9]+(\.[0-9]+)?)\Z') def main(): """The main program This function puts together everything. It parses the arguments, runs the tests, prints the results and exits with a Nagios compatible exit code. """ args = parse_args() columns = ['pid', 'command'] for arg_name in CHECK_ARGS: for check in getattr(args, arg_name): if check.var not in columns:
from argparse import ArgumentParser, RawTextHelpFormatter from collections import defaultdict from datetime import datetime, timedelta from operator import itemgetter from os.path import isfile from re import compile as regexp_compile from subprocess import Popen, PIPE from sys import exit # The option arguments which accept a check CHECK_ARGS = ['match', 'parent', 'exclude', 'warning', 'critical'] TIMEDELTA_PATTERN = regexp_compile( '\A(((?P<days>[0-9]+)(\-| *days?,? *))?' '((?P<hours>[0-9]+):))?' '(?P<minutes>[0-9]+):' '(?P<seconds>[0-9]+(\.[0-9]+)?)\Z' ) def main(): """The main program This function puts together everything. It parses the arguments, runs the tests, prints the results and exits with a Nagios compatible exit code. """ args = parse_args() columns = ['pid', 'command'] for arg_name in CHECK_ARGS: for check in getattr(args, arg_name):
from __future__ import unicode_literals import argparse import datetime from collections import OrderedDict from decimal import Decimal from re import compile as regexp_compile import rows from lxml.etree import HTML REGEXP_PAGE = regexp_compile(r'^[0-9]+ de [0-9]+$') MONTHS = 'JAN FEV MAR ABR MAI JUN JUL AGO SET OUT NOV DEZ' FIELDS = OrderedDict([('category', rows.fields.TextField), ('description', rows.fields.TextField), ('value', rows.fields.DecimalField), ('date', rows.fields.DateField)]) def partition(data, number): for index in range(0, len(data), number): yield data[index:index + number] def convert_text(text): return text.replace('\xa0', ' ')
BSON_DATE = ord(bson.BSONDAT) # WTF, pymongo? MONTHS = {'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4, 'mai': 5, 'jun': 6, 'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12, 'feb': 2, 'apr': 4, 'may': 5, 'aug': 8, 'sep': 9, 'oct': 10, 'dec': 12} FULL_MONTHS = {'janeiro': 1, 'fevereiro': 2, u'março': 3, 'abril': 4, 'maio': 5, 'junho': 6, 'julho': 7, 'agosto': 8, 'setembro': 9, 'outubro': 10, 'novembro': 11, 'dezembro': 12, 'january': 1, 'februrary': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12,} regexp_almost_iso_date = \ regexp_compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2})t([0-9]{2}:[0-9]{2}:[0-9]{2})([+-]+[0-9:]*)') def get_offset_datetime(offset): if offset.lower() == 'gmt': offset = '+0000' offset_signal = int(offset[0] + '1') offset_hours = int(offset[1:3]) offset_minutes = int(offset[3:5]) total_offset_seconds = offset_signal * (offset_hours * 3600 + offset_minutes * 60) offset_in_days = total_offset_seconds / (3600.0 * 24) return datetime.timedelta(offset_in_days) def parse_pt_date(date_string):
import sys
import time
from os import path, walk
from re import compile as regexp_compile, DOTALL

import pymongo


# Four groups: the id and url attributes, the first line after the opening
# tag, and everything up to the closing tag.
doc_format = ''' <doc id="([^"]+)" url="([^"]+)" title="[^"]+"> ([^\n]+) (.*) </doc>'''.strip()
doc_regexp = regexp_compile(doc_format, flags=DOTALL)
# Names given to the four groups of doc_regexp, in order.
doc_fields = ('id', 'url', 'title', 'text')
regexp_mongodb = regexp_compile(r'([^:]+):([^/]+)/([^/]+)/(.+)')


def parse_doc(text):
    '''Build a dict from the first "<doc>...</doc>" entry found in `text`

    The keys are the names in `doc_fields`; the "text" value has its
    "</ref>" and "</math>" remnants removed and is stripped.'''
    first_match = doc_regexp.findall(text)[0]
    page = {}
    for field, value in zip(doc_fields, first_match):
        page[field] = value
    without_ref = page['text'].replace('</ref>', '')
    without_math = without_ref.replace('</math>', '')
    page['text'] = without_math.strip()
    return page


def parse_docs(raw_text):
    '''Given a string with "<doc>...</doc>"s, return a list of dicts'''
"""

from json import loads, dumps
from os.path import join, isfile, isdir, join
from tempfile import gettempdir
from os import listdir, makedirs, rename
from re import compile as regexp_compile
from time import time
from functools import wraps

from flask import Flask, request, abort, Response, jsonify


APP = Flask(__name__)
# ROOT defaults to a "wikidata" directory under the system temp directory.
APP.config.update(dict(ROOT=join(gettempdir(), "wikidata")))
# Settings may also come from the file named by the WIKI_SETTINGS
# environment variable; silent=True is passed so a missing variable is
# tolerated.
APP.config.from_envvar('WIKI_SETTINGS', silent=True)

# 1 to 50 ASCII letters/digits, anchored at the end only.
# NOTE(review): presumably applied with .match() so the start is anchored
# as well -- confirm at the call sites.
DOCUMENT_TITLE_REGEXP = regexp_compile("[A-Za-z0-9]{1,50}$")
# Digits with an optional fractional part, anchored at the end only.
TIMESTAMP_REGEXP = regexp_compile(r"\d+(\.\d+)?$")


# library functions

def get_version_directories(title):
    """Return a list of version strings for a page:

    Args:
        title (str): page title, assumed to be verified

    Returns:
        List[str]: list of timestamps in string form, sorted in floating
            point numeric order
    """
    # Each page lives in a directory named after its title under ROOT.
    page_directory = join(APP.config['ROOT'], title)
#!/usr/bin/env python # coding: utf-8 import glob import os import sys from collections import defaultdict from re import compile as regexp_compile regexp_finished = regexp_compile(r'Job finished: id=([a-f0-9]+), ' 'worker=([a-zA-Z0-9]+)') regexp_job_duration = regexp_compile(r'\[API\] Request to router: ({[^\n]+})') def parse_log(filename): with open(filename) as fobj: contents = fobj.read() ids_and_worker_names = regexp_finished.findall(contents) job_ids = defaultdict(list) map(lambda x: job_ids[x[1]].append(x[0]), ids_and_worker_names) job_durations = {} for raw_message in regexp_job_duration.findall(contents): if 'job finished' in raw_message: instruction = 'data = {}'.format(raw_message) namespace = {} exec instruction in namespace data = namespace['data']
import shlex

from HTMLParser import HTMLParser
from tempfile import NamedTemporaryFile
from os import unlink
from subprocess import Popen, PIPE
from mimetypes import guess_type
from re import compile as regexp_compile, DOTALL, escape

import cld
import magic

from pypln.backend.celery_task import PyPLNTask


# A whole "<...>" tag (group 1) and the tag-name characters that follow the
# optional leading spaces/tabs (group 2).
regexp_tags = regexp_compile(r'(<[ \t]*([a-zA-Z0-9!"./_-]*)[^>]*>)',
                             flags=DOTALL)
# An HTML comment, matched non-greedily and across lines.
regexp_comment = regexp_compile(r'<!--.*?-->', flags=DOTALL)
# One or more newlines (captured) followed by any spaces/tabs.
regexp_spaces_start = regexp_compile('([\n]+)[ \t]*', flags=DOTALL)
# Any spaces/tabs directly before a newline.
regexp_spaces_end = regexp_compile('[ \t]*\n', flags=DOTALL)
# Three or more consecutive newlines.
regexp_newlines = regexp_compile('[\n]{3,}', flags=DOTALL)
# Two or more consecutive spaces/tabs.
regexp_spaces = regexp_compile('[ \t]{2,}', flags=DOTALL)
# Any spaces/tabs directly before one of "!,.:;?" (the mark is captured).
regexp_punctuation = regexp_compile('[ \t]*([' + escape('!,.:;?') + '])',
                                    flags=DOTALL)
# Tag names as captured by group 2 of regexp_tags.
# NOTE(review): presumably the tags that produce a line break in the
# extracted text -- confirm against the code that consumes this list.
breakline_tags = [
    'table', '/table', 'tr', 'div', '/div', 'h1', '/h1', 'h2', '/h2', 'h3',
    '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6', 'br', 'br/'
]
# NOTE(review): presumably the subset of tags that gets a double line break.
double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def clean(text):