def __init__(self, linktrees): self.linkstree = RecursiveDict() for t, c in [(Links.Type.ANCHOR, AbstractAnchor), (Links.Type.FORM, AbstractForm), (Links.Type.REDIRECT, AbstractRedirect)]: if any(t in lt for lt in linktrees): self.buildtree(self.linkstree, t, [lt[t] for lt in linktrees], c)
def main(): logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) logger = logging.getLogger() words = resources.load_dict_files() logger.info(f'Loaded %d words', len(words)) if len(words) == 0: raise Exception('No words found in dictionaries') matrix = INPUT_MATRIX logger.info('Building word index...') word_index = RecursiveDict.build_word_index(words) logger.info('Searching...') paths = word_index.search_words(matrix) logger.info(f'Found %d words', len(paths)) sys.stdout.flush() print() for word in sorted(paths, key=lambda word: (-len(word), word)): word_paths = paths.get(word) print(f'==== {word} ({len(word_paths)}) ====') for path in word_paths: print_word_matrix(matrix, path) print() logger.info('Found words: %d', len(paths))
def __init__(self, anchors=[], forms=[], redirects=[]): self.logger = logging.getLogger(self.__class__.__name__) # leaves in linkstree are counter of how many times that url occurred # therefore use that counter when compuing number of urls with "nleaves" linkstree = RecursiveDict(lambda x: len(x)) for ltype, links in [(Links.Type.ANCHOR, anchors), (Links.Type.FORM, forms), (Links.Type.REDIRECT, redirects)]: for l in links: urlv = [ltype] urlv += [l.dompath] if l.dompath else [] urlv += list(l.linkvector) linkstree.applypath(urlv, lambda x: self.addlink(x, l)) if not linkstree: # all pages with no links will end up in the same special bin linkstree.setapplypathvalue(("<EMPTY>", ), [None], lambda x: x+[None]) self.linkstree = linkstree
def __init__(self, anchors=[], forms=[], redirects=[]): self.logger = logging.getLogger(self.__class__.__name__) # leaves in linkstree are counter of how many times that url occurred # therefore use that counter when compuing number of urls with "nleaves" linkstree = RecursiveDict(lambda x: len(x)) for ltype, links in [(Links.Type.ANCHOR, anchors), (Links.Type.FORM, forms), (Links.Type.REDIRECT, redirects)]: for l in links: urlv = [ltype] urlv += [l.dompath] if l.dompath else [] urlv += list(l.linkvector) linkstree.applypath(urlv, lambda x: self.addlink(x, l)) if not linkstree: # all pages with no links will end up in the same special bin linkstree.setapplypathvalue(("<EMPTY>", ), [None], lambda x: x + [None]) self.linkstree = linkstree
class AbstractLinks(object): def __init__(self, linktrees): self.linkstree = RecursiveDict() for t, c in [(Links.Type.ANCHOR, AbstractAnchor), (Links.Type.FORM, AbstractForm), (Links.Type.REDIRECT, AbstractRedirect)]: if any(t in lt for lt in linktrees): self.buildtree(self.linkstree, t, [lt[t] for lt in linktrees], c) def buildtree(self, level, key, ltval, c): assert all(isinstance(i, list) for i in ltval) or \ all(not isinstance(i, list) for i in ltval) if isinstance(ltval[0], list): assert False # we have reached the leaves without encountering a cluster # create an abstract object with all the objects in all the leaves # ltval is a list of leaves, ie a list of lists containing abstractlinks level[key] = c(i for j in ltval for i in j) if not ltval[0]: # we have reached the leaves without encountering a cluster # create an abstract object with all the objects in all the leaves # ltval is a list of leaves, ie a list of lists containing abstractlinks assert all(j.value for j in ltval) level[key].value = c(i for j in ltval for i in j.value) else: # we have descendants assert ltval[0].value is None keys = sorted(ltval[0].keys()) if all(sorted(i.keys()) == keys for i in ltval): # the linkstree for all the pages in the current subtree match, # lets go deeper in the tree for k in keys: self.buildtree(level[key], k, [v[k] for v in ltval], c) else: # different links have been clustered together # stop here and make a node containing all descending # abstractlinks # leaves are lists, so iterate teie to get links level[key].value = c(lll for l in ltval for ll in l.iterleaves() for lll in ll) def tryMergeLinkstree(self, pagelinkstree): # check if the linkstree pagelinkstree matches the current linkstree for # the current AbstractPage. If not, raise an exception and go back to # reclustering for t, c in [(Links.Type.ANCHOR, AbstractAnchor), (Links.Type.FORM, AbstractForm), (Links.Type.REDIRECT, AbstractRedirect)]: if t in pagelinkstree or t in self.linkstree: self.tryMergeLinkstreeRec(pagelinkstree[t], self.linkstree[t]) def tryMergeLinkstreeRec(self, pagelinkstree, baselinkstree): if isinstance(baselinkstree, RecursiveDict) and \ isinstance(pagelinkstree, RecursiveDict): # make sure the trees have the same keys pagekeys = set(pagelinkstree.keys()) basekeys = set(baselinkstree.keys()) if pagekeys != basekeys: # there is difference, abort and go back reclustering raise MergeLinksTreeException() for k in pagekeys: # descend into tree self.tryMergeLinkstreeRec(pagelinkstree[k], baselinkstree[k]) elif isinstance(baselinkstree, AbstractLink) and \ isinstance(pagelinkstree, list): pass else: pdb.set_trace() raise MergeLinksTreeException() def __getitem__(self, linkidx): idx = [linkidx.type] + list(linkidx.path) i = self.linkstree for p in idx: if p in i: i = i[p] else: break assert i.value and not i return i.value def __iter__(self): return self.linkstree.iterleaves() def itervalues(self): return iter(self) def iteritems(self): for p, l in self.linkstree.iteridxleaves(): if isinstance(l, AbstractForm): # return a form multiple times, iterating over all form parameters we have used so far params = frozenset(b for a in l.targets.itervalues() for b in a.target.iterkeys()) if params: for pr in params: yield (Link.LinkIdx(p[0], p[1:], pr), l) else: yield (Link.LinkIdx(p[0], p[1:], None), l) else: yield (Link.LinkIdx(p[0], p[1:], None), l) def getUnvisited(self, state): # unvisited if we never did the request for that state # third element of the tuple are the form parameters return [(i, l) for i, l in self.iteritems() if not l.skip \ and (state not in l.targets or not state in l.targets[state].target.targets)] def equals(self, l): return self.linkstree.equals(l.linkstree)
def __init__(self, featuresextractor): self.featuresextractor = featuresextractor # leaves should return the number of elements in the list for nleaves RecursiveDict.__init__(self, lambda x: 1)