class ParseError(Exception):
    """An exception which is used to signal a parse failure.

    Attributes:

    filename - name of the file
    lineno - line number in the file
    msg - error message

    """

    def __init__(self, filename, lineno, msg):
        """Store the location and description of the parse failure.

        lineno must be an integer; this is enforced with an assertion.
        """
        # isinstance() instead of an exact type comparison, so that int
        # subclasses are accepted as well.
        assert isinstance(lineno, int)
        self.filename = filename
        self.lineno = lineno
        self.msg = msg

    def __str__(self):
        """Return only the error message, without the location."""
        return self.msg

    def __repr__(self):
        """Return a constructor-like representation of the error."""
        # %r is the portable spelling of the old backtick repr syntax.
        return "ParseError(%r, %d, %r)" % (self.filename,
                                           self.lineno,
                                           self.msg)

    def print_out(self, file):
        """Writes a machine-parsable error message to file.

        The message has the form "filename:lineno: msg" followed by a
        newline; file is flushed afterwards.
        """
        file.write("%s:%d: %s\n" % (self.filename, self.lineno, self.msg))
        file.flush()

    # Old camelCase name, kept as an alias for existing callers.
    printOut = function_deprecated_by(print_out)
def __cmp__(self, other): return cmp(self._order, other._order) def __hash__(self): return hash(self._order) class Release(PseudoEnum): pass def list_releases(): releases = {} rels = ("potato", "woody", "sarge", "etch", "lenny", "sid") for r in range(len(rels)): releases[rels[r]] = Release(rels[r], r) Release.releases = releases return releases listReleases = function_deprecated_by(list_releases) def intern_release(name, releases=list_releases()): if releases.has_key(name): return releases[name] else: return None internRelease = function_deprecated_by(intern_release) del listReleases del list_releases def read_lines_sha1(lines): m = hashlib.sha1() for l in lines:
class DB:
    """
    In-memory database mapping packages to tags and tags to packages.

    Attributes:
        db  - dictionary mapping a package to its set of tags
        rdb - dictionary mapping a tag to its set of packages

    Methods whose documentation says "sharing" return a collection whose
    sets are the very same objects held by this one; methods whose name
    ends in "_copy" (and copy() itself) return independent sets.
    """

    def __init__(self):
        self.db = {}
        self.rdb = {}

    def read(self, input, tag_filter=None):
        """
        Read the database from a file.

        Example::
            # Read the system Debtags database
            db.read(open("/var/lib/debtags/package-tags", "r"))
        """
        self.db, self.rdb = read_tag_database_both_ways(input, tag_filter)

    def qwrite(self, file):
        "Quickly write the data to a pickled file"
        cPickle.dump(self.db, file)
        cPickle.dump(self.rdb, file)

    def qread(self, file):
        "Quickly read the data from a pickled file"
        # NOTE(review): unpickling executes arbitrary code embedded in the
        # data; only use this on files written by qwrite() from a trusted
        # source.
        self.db = cPickle.load(file)
        self.rdb = cPickle.load(file)

    def insert(self, pkg, tags):
        """
        Add a package with its set of tags to the collection.

        A copy of tags is stored, so later changes to the caller's set
        do not affect the collection.
        """
        self.db[pkg] = tags.copy()
        for tag in tags:
            if tag in self.rdb:
                self.rdb[tag].add(pkg)
            else:
                # A one-element tuple: set((pkg)) would iterate over the
                # characters of the package name instead.
                self.rdb[tag] = set((pkg,))

    def dump(self):
        "Write the package -> tags mapping using the module's output()"
        output(self.db)

    def dump_reverse(self):
        "Write the tag -> packages mapping using the module's output()"
        output(self.rdb)

    dumpReverse = function_deprecated_by(dump_reverse)

    def reverse(self):
        "Return the reverse collection, sharing tagsets with this one"
        res = DB()
        res.db = self.rdb
        res.rdb = self.db
        return res

    def facet_collection(self):
        """
        Return a copy of this collection, but replaces the tag names
        with only their facets.
        """
        fcoll = DB()
        tofacet = re.compile(r"^([^:]+).+")
        for pkg, tags in self.iter_packages_tags():
            ftags = set([tofacet.sub(r"\1", t) for t in tags])
            fcoll.insert(pkg, ftags)
        return fcoll

    facetCollection = function_deprecated_by(facet_collection)

    def copy(self):
        """
        Return a copy of this collection, with the tagsets copied as
        well.
        """
        res = DB()
        # dict.copy() alone would leave both collections pointing at the
        # same set objects, so every set is copied individually.
        res.db = dict((pkg, tags.copy())
                      for pkg, tags in self.db.iteritems())
        res.rdb = dict((tag, pkgs.copy())
                       for tag, pkgs in self.rdb.iteritems())
        return res

    def reverse_copy(self):
        """
        Return the reverse collection, with a copy of the tagsets of
        this one.
        """
        res = DB()
        res.db = dict((tag, pkgs.copy())
                      for tag, pkgs in self.rdb.iteritems())
        res.rdb = dict((pkg, tags.copy())
                       for pkg, tags in self.db.iteritems())
        return res

    reverseCopy = function_deprecated_by(reverse_copy)

    # NOTE: inside the methods below, reverse(...) is the module-level
    # function that inverts a dictionary of sets, not the reverse() method
    # defined above.

    def choose_packages(self, package_iter):
        """
        Return a collection with only the packages in package_iter,
        sharing tagsets with this one.

        Packages that are not in this collection are ignored.
        """
        res = DB()
        db = {}
        for pkg in package_iter:
            if pkg in self.db:
                db[pkg] = self.db[pkg]
        res.db = db
        res.rdb = reverse(db)
        return res

    choosePackages = function_deprecated_by(choose_packages)

    def choose_packages_copy(self, package_iter):
        """
        Return a collection with only the packages in package_iter,
        with a copy of the tagsets of this one.

        Raises KeyError if a package is not in this collection.
        """
        res = DB()
        db = {}
        for pkg in package_iter:
            db[pkg] = self.db[pkg].copy()
        res.db = db
        res.rdb = reverse(db)
        return res

    choosePackagesCopy = function_deprecated_by(choose_packages_copy)

    def filter_packages(self, package_filter):
        """
        Return a collection with only those packages that match a
        filter, sharing tagsets with this one.  The filter will match
        on the package.
        """
        res = DB()
        db = {}
        for pkg in filter(package_filter, self.db.iterkeys()):
            db[pkg] = self.db[pkg]
        res.db = db
        res.rdb = reverse(db)
        return res

    filterPackages = function_deprecated_by(filter_packages)

    def filter_packages_copy(self, filter):
        """
        Return a collection with only those packages that match a
        filter, with a copy of the tagsets of this one.  The filter
        will match on the package.
        """
        res = DB()
        db = {}
        # The parameter name shadows the builtin filter(), so the
        # predicate is applied directly to each package.
        for pkg in self.db.iterkeys():
            if filter(pkg):
                db[pkg] = self.db[pkg].copy()
        res.db = db
        res.rdb = reverse(db)
        return res

    filterPackagesCopy = function_deprecated_by(filter_packages_copy)

    def filter_packages_tags(self, package_tag_filter):
        """
        Return a collection with only those packages that match a
        filter, sharing tagsets with this one.  The filter will match
        on (package, tags).
        """
        res = DB()
        db = {}
        for pkg, tags in filter(package_tag_filter, self.db.iteritems()):
            db[pkg] = tags
        res.db = db
        res.rdb = reverse(db)
        return res

    filterPackagesTags = function_deprecated_by(filter_packages_tags)

    def filter_packages_tags_copy(self, package_tag_filter):
        """
        Return a collection with only those packages that match a
        filter, with a copy of the tagsets of this one.  The filter
        will match on (package, tags).
        """
        res = DB()
        db = {}
        for pkg, tags in filter(package_tag_filter, self.db.iteritems()):
            db[pkg] = tags.copy()
        res.db = db
        res.rdb = reverse(db)
        return res

    filterPackagesTagsCopy = function_deprecated_by(filter_packages_tags_copy)

    def filter_tags(self, tag_filter):
        """
        Return a collection with only those tags that match a
        filter, sharing package sets with this one.  The filter will match
        on the tag.
        """
        res = DB()
        rdb = {}
        for tag in filter(tag_filter, self.rdb.iterkeys()):
            rdb[tag] = self.rdb[tag]
        res.rdb = rdb
        res.db = reverse(rdb)
        return res

    filterTags = function_deprecated_by(filter_tags)

    def filter_tags_copy(self, tag_filter):
        """
        Return a collection with only those tags that match a
        filter, with a copy of the package sets of this one.  The
        filter will match on the tag.
        """
        res = DB()
        rdb = {}
        for tag in filter(tag_filter, self.rdb.iterkeys()):
            rdb[tag] = self.rdb[tag].copy()
        res.rdb = rdb
        res.db = reverse(rdb)
        return res

    filterTagsCopy = function_deprecated_by(filter_tags_copy)

    def has_package(self, pkg):
        """Check if the collection contains the given package"""
        return pkg in self.db

    hasPackage = function_deprecated_by(has_package)

    def has_tag(self, tag):
        """Check if the collection contains packages tagged with tag"""
        return tag in self.rdb

    hasTag = function_deprecated_by(has_tag)

    def tags_of_package(self, pkg):
        """Return the tag set of a package

        An empty set is returned for unknown packages.
        """
        return self.db.get(pkg) or set()

    tagsOfPackage = function_deprecated_by(tags_of_package)

    def packages_of_tag(self, tag):
        """Return the package set of a tag

        An empty set is returned for unknown tags.
        """
        return self.rdb.get(tag) or set()

    packagesOfTag = function_deprecated_by(packages_of_tag)

    def tags_of_packages(self, pkgs):
        """Return the set of tags that have all the packages in pkgs

        Returns None if pkgs is empty.
        """
        res = None
        for p in pkgs:
            if res is None:
                # Start from a copy so that the stored set is not modified
                # by the in-place intersections that follow.
                res = set(self.tags_of_package(p))
            else:
                res &= self.tags_of_package(p)
        return res

    tagsOfPackages = function_deprecated_by(tags_of_packages)

    def packages_of_tags(self, tags):
        """Return the set of packages that have all the tags in tags

        Returns None if tags is empty.
        """
        res = None
        for t in tags:
            if res is None:
                res = set(self.packages_of_tag(t))
            else:
                res &= self.packages_of_tag(t)
        return res

    packagesOfTags = function_deprecated_by(packages_of_tags)

    def card(self, tag):
        """
        Return the cardinality of a tag, that is the number of packages
        that have it (0 for unknown tags).
        """
        return len(self.rdb.get(tag, ()))

    def discriminance(self, tag):
        """
        Return the discriminance index of the tag.

        The discriminance index of the tag is defined as the minimum
        number of packages that would be eliminated by selecting only
        those tagged with this tag or only those not tagged with this
        tag.
        """
        n = self.card(tag)
        tot = self.package_count()
        return min(n, tot - n)

    def iter_packages(self):
        """Iterate over the packages"""
        return self.db.iterkeys()

    iterPackages = function_deprecated_by(iter_packages)

    def iter_tags(self):
        """Iterate over the tags"""
        return self.rdb.iterkeys()

    iterTags = function_deprecated_by(iter_tags)

    def iter_packages_tags(self):
        """Iterate over 2-tuples of (pkg, tags)"""
        return self.db.iteritems()

    iterPackagesTags = function_deprecated_by(iter_packages_tags)

    def iter_tags_packages(self):
        """Iterate over 2-tuples of (tag, pkgs)"""
        return self.rdb.iteritems()

    iterTagsPackages = function_deprecated_by(iter_tags_packages)

    def package_count(self):
        """Return the number of packages"""
        return len(self.db)

    packageCount = function_deprecated_by(package_count)

    def tag_count(self):
        """Return the number of tags"""
        return len(self.rdb)

    tagCount = function_deprecated_by(tag_count)

    def ideal_tagset(self, tags):
        """
        Return an ideal selection of the top tags in a list of tags.

        Return the tagset made of the highest number of tags taken in
        consecutive sequence from the beginning of the given vector,
        that would intersect with the tagset of a comfortable amount
        of packages.

        Comfortable is defined in terms of how far it is from 7.
        """

        # TODO: the scoring function is quite ok, but may need more
        # tuning.  I also center it on 15 instead of 7 since we're
        # setting a starting point for the search, not a target point
        def score_fun(x):
            return float((x - 15) * (x - 15)) / x

        tagset = set()
        min_score = 3
        for end in range(1, len(tags) + 1):
            prefix = tags[:end]
            card = len(self.packages_of_tags(prefix))
            # Longer prefixes can only match fewer packages, so stop at the
            # first empty intersection (this also protects score_fun from
            # dividing by zero).
            if card == 0:
                break
            score = score_fun(card)
            if score < min_score:
                min_score = score
                tagset = set(prefix)

        # Return always at least the first tag
        if len(tagset) == 0:
            return set(tags[:1])
        else:
            return tagset

    idealTagset = function_deprecated_by(ideal_tagset)

    def correlations(self):
        """
        Generate the list of correlation as a tuple (hastag, hasalsotag, score).

        Every tuple will indicate that the tag 'hastag' tends to also
        have 'hasalsotag' with a score of 'score'.

        The score is the fraction of packages with 'hastag' that also have
        'hasalsotag', minus the same fraction among the packages without
        'hastag'.  When every package has 'hastag' the second fraction is
        taken to be 0.
        """
        for pivot in self.iter_tags():
            with_ = self.filter_packages_tags(lambda pt: pivot in pt[1])
            without = self.filter_packages_tags(lambda pt: pivot not in pt[1])
            with_total = float(with_.package_count())
            without_total = float(without.package_count())
            for tag in with_.iter_tags():
                if tag == pivot:
                    continue
                has = float(with_.card(tag)) / with_total
                if without_total:
                    hasnt = float(without.card(tag)) / without_total
                else:
                    # No package lacks the pivot tag: avoid dividing by zero.
                    hasnt = 0.0
                yield pivot, tag, has - hasnt
def parse_tags(input):
    """
    Parse the lines of a tag database, generating (pkgs, tags) tuples.

    Each line has the form "pkg1, pkg2: tag1, tag2"; the tag part may be
    missing.  Both elements of the generated tuples are sets of strings.
    Lines that do not match this format, such as blank lines, are skipped.
    """
    lre = re.compile(r"^(.+?)(?::?\s*|:\s+(.+?)\s*)$")
    for line in input:
        # Is there a way to remove the last character of a line that does not
        # make a copy of the entire line?
        m = lre.match(line)
        if m is None:
            # The pattern needs at least one character before the line
            # terminator, so blank lines do not match.
            continue
        pkgs = set(m.group(1).split(', '))
        if m.group(2):
            tags = set(m.group(2).split(', '))
        else:
            tags = set()
        yield pkgs, tags

parseTags = function_deprecated_by(parse_tags)

def read_tag_database(input):
    "Read the tag database, returning a pkg->tags dictionary"
    db = {}
    for pkgs, tags in parse_tags(input):
        # Give every package its own copy of the tag set, so that changing
        # the tags of one package does not affect the others
        for p in pkgs:
            db[p] = tags.copy()
    return db

readTagDatabase = function_deprecated_by(read_tag_database)
class Deb822(Deb822Dict):
    """A single paragraph of "Field: value" lines, accessed like a dictionary.

    Parsing is done by _internal_parser(); storage is inherited from
    Deb822Dict.
    """

    def __init__(self, sequence=None, fields=None, _parsed=None,
                 encoding="utf-8"):
        """Create a new Deb822 instance.

        :param sequence: a string, or any object that returns a line of
            input each time, normally a file().  Alternately, sequence can
            be a dict that contains the initial key-value pairs.

        :param fields: if given, it is interpreted as a list of fields that
            should be parsed (the rest will be discarded).

        :param _parsed: internal parameter.

        :param encoding: When parsing strings, interpret them in this encoding.
            (All values are given back as unicode objects, so an encoding is
            necessary in order to properly interpret the strings.)
        """

        # Anything with an items() method is taken as initial key-value
        # pairs rather than as text to parse.
        if hasattr(sequence, 'items'):
            _dict = sequence
            sequence = None
        else:
            _dict = None
        Deb822Dict.__init__(self, _dict=_dict, _parsed=_parsed, _fields=fields,
                            encoding=encoding)

        if sequence is not None:
            try:
                self._internal_parser(sequence, fields)
            except EOFError:
                # Raised by split_gpg_and_payload when only blank lines are
                # left; the object is simply left empty.
                pass

        self.gpg_info = None

    @classmethod
    def iter_paragraphs(cls, sequence, fields=None, use_apt_pkg=True,
                        shared_storage=False, encoding="utf-8"):
        """Generator that yields a Deb822 object for each paragraph in sequence.

        :param sequence: same as in __init__.

        :param fields: likewise.

        :param use_apt_pkg: if sequence is a file(), apt_pkg will be used
            if available to parse the file, since it's much much faster.  Set
            this parameter to False to disable using apt_pkg.
        :param shared_storage: not used, here for historical reasons.  Deb822
            objects never use shared storage anymore.
        :param encoding: Interpret the paragraphs in this encoding.
            (All values are given back as unicode objects, so an encoding is
            necessary in order to properly interpret the strings.)
        """

        if _have_apt_pkg and use_apt_pkg and isinstance(sequence, file):
            parser = apt_pkg.TagFile(sequence)
            for section in parser:
                paragraph = cls(fields=fields,
                                _parsed=TagSectionWrapper(section),
                                encoding=encoding)
                if paragraph:
                    yield paragraph

        else:
            # Each constructor call consumes one paragraph from the shared
            # iterator; an empty object means the input is exhausted.
            iterable = iter(sequence)
            x = cls(iterable, fields, encoding=encoding)
            while len(x) != 0:
                yield x
                x = cls(iterable, fields, encoding=encoding)

    ###

    @staticmethod
    def _skip_useless_lines(sequence):
        """Yields only lines that do not begin with '#'.

        Also skips any blank lines at the beginning of the input.
        """
        at_beginning = True
        for line in sequence:
            if line.startswith('#'):
                continue
            if at_beginning:
                if not line.rstrip('\r\n'):
                    continue
                at_beginning = False
            yield line

    def _internal_parser(self, sequence, fields=None):
        """Parse one paragraph from sequence and store its fields in self.

        sequence is a string or an iterable of lines; if fields is not
        None, only the field names it contains are stored.  EOFError from
        split_gpg_and_payload propagates when the input holds only blank
        lines.
        """
        # The key is non-whitespace, non-colon characters before any colon.
        key_part = r"^(?P<key>[^: \t\n\r\f\v]+)\s*:\s*"
        single = re.compile(key_part + r"(?P<data>\S.*?)\s*$")
        multi = re.compile(key_part + r"$")
        multidata = re.compile(r"^\s(?P<data>.+?)\s*$")

        def wanted_field(f):
            return fields is None or f in fields

        if isinstance(sequence, basestring):
            sequence = sequence.splitlines()

        # curkey/content hold the field being accumulated; it is stored
        # when the next field starts or the paragraph ends.
        curkey = None
        content = ""
        for line in self.gpg_stripped_paragraph(
                self._skip_useless_lines(sequence)):
            # "Key: value" on a single line
            m = single.match(line)
            if m:
                if curkey:
                    self[curkey] = content

                if not wanted_field(m.group('key')):
                    curkey = None
                    continue

                curkey = m.group('key')
                content = m.group('data')
                continue

            # "Key:" alone, with the value on the following lines
            m = multi.match(line)
            if m:
                if curkey:
                    self[curkey] = content

                if not wanted_field(m.group('key')):
                    curkey = None
                    continue

                curkey = m.group('key')
                content = ""
                continue

            # Continuation line of the current field
            m = multidata.match(line)
            if m:
                content += '\n' + line  # XXX not m.group('data')?
                continue

        if curkey:
            self[curkey] = content

    def __str__(self):
        return self.dump()

    def __unicode__(self):
        return self.dump()

    # __repr__ is handled by Deb822Dict

    def get_as_string(self, key):
        """Return the self[key] as a string (or unicode)

        The default implementation just returns unicode(self[key]); however,
        this can be overridden in subclasses (e.g. _multivalued) that can take
        special values.
        """
        return unicode(self[key])

    def dump(self, fd=None, encoding=None):
        """Dump the contents in the original format

        If fd is None, return a unicode object.

        If fd is not None, attempt to encode the output to the encoding the
        object was initialized with, or the value of the encoding argument if
        it is not None.  This will raise UnicodeEncodeError if the encoding
        can't support all the characters in the Deb822Dict values.
        """

        if fd is None:
            fd = StringIO.StringIO()
            return_string = True
        else:
            return_string = False

        if encoding is None:
            # Use the encoding we've been using to decode strings with if none
            # was explicitly specified
            encoding = self.encoding

        for key in self.iterkeys():
            value = self.get_as_string(key)
            if not value or value[0] == '\n':
                # Avoid trailing whitespace after "Field:" if it's on its own
                # line or the value is empty.  We don't have to worry about the
                # case where value == '\n', since we ensure that is not the
                # case in __setitem__.
                entry = '%s:%s\n' % (key, value)
            else:
                entry = '%s: %s\n' % (key, value)
            if not return_string:
                fd.write(entry.encode(encoding))
            else:
                fd.write(entry)
        if return_string:
            return fd.getvalue()

    ###

    def is_single_line(self, s):
        """Return True if s contains no newline character."""
        return "\n" not in s

    isSingleLine = function_deprecated_by(is_single_line)

    def is_multi_line(self, s):
        """Return True if s contains at least one newline character."""
        return not self.is_single_line(s)

    isMultiLine = function_deprecated_by(is_multi_line)

    def _merge_fields(self, s1, s2):
        """Return the merge of the field values s1 and s2.

        If either value is empty the other one is returned.  Two
        single-line values are split into items, sorted, and joined again
        without duplicates; two multi-line values are merged by appending
        to s1 the lines of s2 it does not already contain.  ValueError is
        raised when one value is single-line and the other multi-line.
        """
        if not s2:
            return s1
        if not s1:
            return s2

        if self.is_single_line(s1) and self.is_single_line(s2):
            ## some fields are delimited by a single space, others
            ## a comma followed by a space.  this heuristic assumes
            ## that there are multiple items in one of the string fields
            ## so that we can pick up on the delimiter being used
            delim = ' '
            if (s1 + s2).count(', '):
                delim = ', '

            L = (s1 + delim + s2).split(delim)
            L.sort()

            prev = merged = L[0]

            for item in L[1:]:
                ## skip duplicate entries
                if item == prev:
                    continue
                merged = merged + delim + item
                prev = item
            return merged

        if self.is_multi_line(s1) and self.is_multi_line(s2):
            for item in s2.splitlines(True):
                # s1 grows inside the loop, so it is split again on every
                # iteration to catch duplicates within s2 as well.
                if item not in s1.splitlines(True):
                    s1 = s1 + "\n" + item
            return s1

        raise ValueError

    _mergeFields = function_deprecated_by(_merge_fields)

    def merge_fields(self, key, d1, d2=None):
        """Merge the values of the field key from two objects.

        Called as merge_fields(key, d1), the field of d1 is merged into
        this object in place and None is returned.  Called as
        merge_fields(key, d1, d2), neither object is modified and the
        merged value is returned.

        Raises KeyError if neither object contains key.
        """
        ## this method can work in two ways - abstract that away
        # "is None" rather than "== None": d2 may be a mapping with its own
        # notion of equality.
        in_place = d2 is None
        if in_place:
            x1 = self
            x2 = d1
        else:
            x1 = d1
            x2 = d2

        ## we only have to do work if both objects contain our key
        ## otherwise, we just take the one that does, or raise an
        ## exception if neither does
        if key in x1 and key in x2:
            merged = self._merge_fields(x1[key], x2[key])
        elif key in x1:
            merged = x1[key]
        elif key in x2:
            merged = x2[key]
        else:
            raise KeyError

        ## back to the two different ways - if this method was called
        ## upon an object, update that object in place.
        ## return nothing in this case, to make the author notice a
        ## problem if she assumes the object itself will not be modified
        if in_place:
            self[key] = merged
            return None

        return merged

    mergeFields = function_deprecated_by(merge_fields)

    @staticmethod
    def split_gpg_and_payload(sequence):
        """Return a (gpg_pre, payload, gpg_post) tuple

        Each element of the returned tuple is a list of lines (with trailing
        whitespace stripped).

        Raises EOFError if no payload line is found.
        """

        gpg_pre_lines = []
        lines = []
        gpg_post_lines = []
        # state is 'SAFE' while payload lines are expected; a BEGIN marker
        # sets it to the text following "PGP" in that marker.
        state = 'SAFE'
        gpgre = re.compile(
            r'^-----(?P<action>BEGIN|END) PGP (?P<what>[^-]+)-----$')
        blank_line = re.compile('^$')
        first_line = True

        for line in sequence:
            line = line.strip('\r\n')

            # skip initial blank lines, if any
            if first_line:
                if blank_line.match(line):
                    continue
                else:
                    first_line = False

            m = gpgre.match(line)

            if not m:
                if state == 'SAFE':
                    if not blank_line.match(line):
                        lines.append(line)
                    else:
                        if not gpg_pre_lines:
                            # There's no gpg signature, so we should stop at
                            # this blank line
                            break
                elif state == 'SIGNED MESSAGE':
                    # Header lines of the signed message run up to the
                    # first blank line; the payload follows.
                    if blank_line.match(line):
                        state = 'SAFE'
                    else:
                        gpg_pre_lines.append(line)
                elif state == 'SIGNATURE':
                    gpg_post_lines.append(line)
            else:
                if m.group('action') == 'BEGIN':
                    state = m.group('what')
                elif m.group('action') == 'END':
                    gpg_post_lines.append(line)
                    break
                # A BEGIN marker goes before or after the payload depending
                # on whether any payload line has been seen yet.
                if not blank_line.match(line):
                    if not lines:
                        gpg_pre_lines.append(line)
                    else:
                        gpg_post_lines.append(line)

        if len(lines):
            return (gpg_pre_lines, lines, gpg_post_lines)
        else:
            raise EOFError('only blank lines found in input')

    @classmethod
    def gpg_stripped_paragraph(cls, sequence):
        """Return only the payload lines of sequence (see
        split_gpg_and_payload)."""
        return cls.split_gpg_and_payload(sequence)[1]

    def get_gpg_info(self, keyrings=None):
        """Return a GpgInfo object with GPG signature information

        This method will raise ValueError if the signature is not available
        (e.g. the original text cannot be found).

        :param keyrings: list of keyrings to use (see GpgInfo.from_sequence)
        """

        # raw_text is saved (as a string) only for Changes and Dsc (see
        # _gpg_multivalued.__init__) which is small compared to Packages or
        # Sources which contain no signature
        if not hasattr(self, 'raw_text'):
            raise ValueError("original text cannot be found")

        # The result is cached, so keyrings only matters on the first call.
        if self.gpg_info is None:
            self.gpg_info = GpgInfo.from_sequence(self.raw_text,
                                                  keyrings=keyrings)

        return self.gpg_info

    def validate_input(self, key, value):
        """Raise ValueError if value is not a valid value for key

        Subclasses that do interesting things for different keys may wish to
        override this method.
        """

        # The value cannot end in a newline (if it did, dumping the object
        # would result in multiple stanzas)
        if value.endswith('\n'):
            raise ValueError("value must not end in '\\n'")

        # Make sure there are no blank lines (actually, the first one is
        # allowed to be blank, but no others), and each subsequent line starts
        # with whitespace
        for line in value.splitlines()[1:]:
            if not line:
                raise ValueError("value must not have blank lines")
            if not line[0].isspace():
                raise ValueError("each line must start with whitespace")

    def __setitem__(self, key, value):
        """Validate value with validate_input() and then store it."""
        self.validate_input(key, value)
        Deb822Dict.__setitem__(self, key, value)
from deprecation import function_deprecated_by def parse_tags(input): lre = re.compile(r"^(.+?)(?::?\s*|:\s+(.+?)\s*)$") for line in input: # Is there a way to remove the last character of a line that does not # make a copy of the entire line? m = lre.match(line) pkgs = set(m.group(1).split(', ')) if m.group(2): tags = set(m.group(2).split(', ')) else: tags = set() yield pkgs, tags parseTags = function_deprecated_by(parse_tags) def read_tag_database(input): "Read the tag database, returning a pkg->tags dictionary" db = {} for pkgs, tags in parse_tags(input): # Create the tag set using the native set for p in pkgs: db[p] = tags.copy() return db; readTagDatabase = function_deprecated_by(read_tag_database) def read_tag_database_reversed(input): "Read the tag database, returning a tag->pkgs dictionary" db = {}
class PackageFile:
    """A Debian package file.

    Objects of this class can be used to read Debian's Source and
    Packages files.

    Iterating over the object yields one list per package record; each
    list contains (name, contents) tuples, one per field."""

    # "Name: value" at the start of a line; the value may be missing.
    re_field = re.compile(r'^([A-Za-z][A-Za-z0-9-]+):(?:\s*(.*?))?\s*$')
    # A continuation line starts with whitespace; a lone "." after it
    # stands for an empty line and leaves the group unset.
    re_continuation = re.compile(r'^\s+(?:\.|(\S.*?)\s*)$')

    def __init__(self, name, file_obj=None):
        """Creates a new package file object.

        name - the name of the file the data comes from
        file_obj - an alternate data source; the default is to open the
                  file with the indicated name.
        """
        if file_obj is None:
            # open() is the documented way to obtain a file object; the
            # file() constructor is not meant to be called directly.
            file_obj = open(name)
        self.name = name
        self.file = file_obj
        self.lineno = 0

    def __iter__(self):
        """Generate the package records of the file.

        Raises ParseError (through raise_syntax_error) if a blank line is
        found where a record is expected, or if a line is not a valid
        field.
        """
        line = self.file.readline()
        self.lineno += 1
        pkg = []
        while line:
            # A line holding only spaces and tabs ends the current record.
            if line.strip(' \t') == '\n':
                if not pkg:
                    self.raise_syntax_error('expected package record')
                yield pkg
                pkg = []
                line = self.file.readline()
                self.lineno += 1
                continue

            match = self.re_field.match(line)
            if not match:
                self.raise_syntax_error("expected package field")
            (name, contents) = match.groups()
            contents = contents or ''

            # Collect continuation lines.  The first line that is not a
            # continuation stays in "line" for the next round of the outer
            # loop; at end of file readline() returns '', which ends both
            # loops.
            while True:
                line = self.file.readline()
                self.lineno += 1
                match = self.re_continuation.match(line)
                if not match:
                    break
                (ncontents,) = match.groups()
                if ncontents is None:
                    ncontents = ""
                contents = "%s\n%s" % (contents, ncontents)
            pkg.append((name, contents))
        # The last record need not be followed by a blank line.
        if pkg:
            yield pkg

    def raise_syntax_error(self, msg, lineno=None):
        """Raise a ParseError for this file.

        msg - the error message
        lineno - the line number to report; the default is the number of
                 the last line read.
        """
        if lineno is None:
            lineno = self.lineno
        raise ParseError(self.name, lineno, msg)

    raiseSyntaxError = function_deprecated_by(raise_syntax_error)
class Release(PseudoEnum):
    """A Debian release.

    Instances are created by list_releases() with a name and a position;
    presumably PseudoEnum uses the position for ordering -- verify against
    its definition.
    """
    pass

def list_releases():
    """Return a dictionary mapping release names to Release objects.

    The releases are numbered in the order they are listed here, starting
    from 0.  The dictionary is also stored as Release.releases.
    """
    releases = {}
    rels = ("potato", "woody", "sarge", "etch", "lenny", "sid")
    for order, name in enumerate(rels):
        releases[name] = Release(name, order)
    Release.releases = releases
    return releases
listReleases = function_deprecated_by(list_releases)

# The default value of releases is computed once, when the function is
# defined; list_releases itself is deleted from the module further down.
def intern_release(name, releases=list_releases()):
    """Return the Release object called name, or None if there is none."""
    return releases.get(name)
internRelease = function_deprecated_by(intern_release)

del listReleases
del list_releases