def __init__(self, *patterns):
    self.patterns = patterns
    self.exprs = [rcompile(pat, re.IGNORECASE) for pat in self.patterns]
    self.pattern = ("(?P<month>"
                    + "|".join("(%s)" % pat for pat in self.patterns)
                    + ")")
    self.expr = rcompile(self.pattern, re.IGNORECASE)
def __init__(self, next, last, daynames):
    self.next_pattern = next
    self.last_pattern = last
    self._dayname_exprs = tuple(rcompile(pat, re.IGNORECASE)
                                for pat in daynames)
    dn_pattern = "|".join(daynames)
    self.pattern = ("(?P<dir>%s|%s) +(?P<day>%s)(?=(\\W|$))"
                    % (next, last, dn_pattern))
    self.expr = rcompile(self.pattern, re.IGNORECASE)
def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", onceper=True,
             requireall=False, allof=None, anyof=None, name=None):
    """
    :param elements: the sub-elements to parse.
    :param sep: a separator regular expression to match between elements,
        or None to not have separators.
    :param onceper: only allow each element to match once.
    :param requireall: if True, the sub-elements can match in any order,
        but they must all match.
    :param allof: a list of indexes into the list of elements. When this
        argument is not None, this element matches only if all the
        indicated sub-elements match.
    :param anyof: a list of indexes into the list of elements. When this
        argument is not None, this element matches only if any of the
        indicated sub-elements match.
    :param name: a name for this element (for debugging purposes only).
    """

    super(Bag, self).__init__(elements, name)
    self.sep_expr = rcompile(sep, re.IGNORECASE)
    self.onceper = onceper
    self.requireall = requireall
    self.allof = allof
    self.anyof = anyof
class RangePlugin(Plugin):
    """Adds the ability to specify term ranges.
    """

    expr = rcompile(r"""
    (?P<open>\{|\[)               # Open paren
    (?P<start>
        ('[^']*?'\s+)             # single-quoted
        |                         # or
        ([^\]}]+?(?=[Tt][Oo]))    # everything until "to"
    )?
    [Tt][Oo]                      # "to"
    (?P<end>
        (\s+'[^']*?')             # single-quoted
        |                         # or
        ([^\]}]+?)                # everything until "]" or "}"
    )?
    (?P<close>}|])                # Close paren
    """, verbose=True)

    class RangeTagger(RegexTagger):
        def __init__(self, expr, excl_start, excl_end):
            self.expr = expr
            self.excl_start = excl_start
            self.excl_end = excl_end

        def create(self, parser, match):
            start = match.group("start")
            end = match.group("end")
            if start:
                # Strip the space before the "to"
                start = start.rstrip()
                # Strip single quotes
                if start.startswith("'") and start.endswith("'"):
                    start = start[1:-1]
            if end:
                # Strip the space after the "to"
                end = end.lstrip()
                # Strip single quotes
                if end.startswith("'") and end.endswith("'"):
                    end = end[1:-1]

            # What kind of open and close brackets were used?
            startexcl = match.group("open") == self.excl_start
            endexcl = match.group("close") == self.excl_end

            rn = syntax.RangeNode(start, end, startexcl, endexcl)
            return rn

    def __init__(self, expr=None, excl_start="{", excl_end="}"):
        self.expr = expr or self.expr
        self.excl_start = excl_start
        self.excl_end = excl_end

    def taggers(self, parser):
        tagger = self.RangeTagger(self.expr, self.excl_start, self.excl_end)
        return [(tagger, 1)]
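
# A brief, hedged sketch of the range syntax the plugin above recognizes.
# RangePlugin is part of the query parser's default plugin set, so a stock
# QueryParser already understands it; the schema and field name here are
# illustrative only.
from whoosh.fields import Schema, ID
from whoosh.qparser import QueryParser

qp = QueryParser("path", Schema(path=ID))
print(qp.parse(u"path:[apple TO bear]"))       # inclusive ends with "[...]"
print(qp.parse(u"path:{'apple' TO 'bear'}"))   # exclusive ends with "{...}"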
def __init__(self, pattern, replacement):
    """
    :param pattern: a pattern string or compiled regular expression object
        describing the text to replace.
    :param replacement: the substitution text.
    """

    self.pattern = rcompile(pattern)
    self.replacement = replacement
def __init__(self, expression=default_pattern, gaps=False):
    """
    :param expression: a regular expression object or string, defaulting to
        rcompile(r"\w+(\.?\w+)*"). Each match of the expression equals a
        token. Group 0 (the entire matched text) is used as the text of the
        token. If you require more complicated handling of the expression
        match, simply write your own tokenizer.
    :param gaps: if True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    self.expression = rcompile(expression)
    self.gaps = gaps
def __init__(self, expression=default_pattern, gaps=False):
    """
    :param expression: A regular expression object or string. Each match
        of the expression equals a token. Group 0 (the entire matched text)
        is used as the text of the token. If you require more complicated
        handling of the expression match, simply write your own tokenizer.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    self.expression = rcompile(expression)
    self.gaps = gaps
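
# A minimal usage sketch for the constructor above (assuming
# whoosh.analysis.RegexTokenizer): with gaps=False the expression matches the
# tokens themselves; with gaps=True the tokenizer splits on the expression.
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

match_tok = RegexTokenizer()  # default pattern matches word-like runs
print([t.text for t in match_tok(u"hi there 3.141 big-time under_score")])

split_tok = RegexTokenizer(expression=rcompile(r",\s*"), gaps=True)  # split on commas
print([t.text for t in split_tok(u"alfa, bravo,charlie")])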
def __init__(self, years, months, weeks, days, hours, minutes, seconds):
    rel_years = "((?P<years>[0-9]+) *(%s))?" % years
    rel_months = "((?P<months>[0-9]+) *(%s))?" % months
    rel_weeks = "((?P<weeks>[0-9]+) *(%s))?" % weeks
    rel_days = "((?P<days>[0-9]+) *(%s))?" % days
    rel_hours = "((?P<hours>[0-9]+) *(%s))?" % hours
    rel_mins = "((?P<mins>[0-9]+) *(%s))?" % minutes
    rel_secs = "((?P<secs>[0-9]+) *(%s))?" % seconds

    self.pattern = ("(?P<dir>[+-]) *%s *%s *%s *%s *%s *%s *%s(?=(\\W|$))"
                    % (rel_years, rel_months, rel_weeks, rel_days, rel_hours,
                       rel_mins, rel_secs))
    self.expr = rcompile(self.pattern, re.IGNORECASE)
def create(self, in_memory=False):
    tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*")  # Includes l·l
    analyzer = StandardAnalyzer(minsize=1, stoplist=None,
                                expression=tokenizer_pattern)
    schema = Schema(verb_form=TEXT(stored=True, sortable=True,
                                   analyzer=analyzer),
                    index_letter=TEXT(stored=True, analyzer=analyzer),
                    file_path=TEXT(stored=True, sortable=True))
    if os.path.exists(self.dir_name):
        shutil.rmtree(self.dir_name)
    os.makedirs(self.dir_name)
    ix = create_in(self.dir_name, schema)
    self.writer = ix.writer()
    return ix
def __init__(self, elements, sep="(\\s+|\\s*,\\s*)", name=None,
             progressive=False):
    """
    :param elements: the sequence of sub-elements to parse.
    :param sep: a separator regular expression to match between elements,
        or None to not have separators.
    :param name: a name for this element (for debugging purposes only).
    :param progressive: if True, elements after the first do not need to
        match. That is, for elements (a, b, c) and progressive=True, the
        sequence matches like ``a[b[c]]``.
    """

    super(Sequence, self).__init__(elements, name)
    self.sep_pattern = sep
    if sep:
        self.sep_expr = rcompile(sep, re.IGNORECASE)
    else:
        self.sep_expr = None
    self.progressive = progressive
def __init__(self, expression="[^/]+"):
    self.expr = rcompile(expression)
# an optional list to override this one.
STOP_WORDS = frozenset(('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can',
                        'for', 'from', 'have', 'if', 'in', 'is', 'it', 'may',
                        'not', 'of', 'on', 'or', 'tbd', 'that', 'the', 'this',
                        'to', 'us', 'we', 'when', 'will', 'with', 'yet',
                        'you', 'your'))

# Simple pattern for filtering URLs, may be useful
url_pattern = rcompile("""
(
    [A-Za-z+]+://          # URL protocol
    \\S+?                  # URL body
    (?=\\s|[.]\\s|$|[.]$)  # Stop at space/end, or a dot followed by space/end
) | (                      # or...
    \w+([:.]?\w+)*         # word characters, with opt. internal colons/dots
)
""", verbose=True)


# Filters

class Filter(Composable):
    """Base class for Filter objects. A Filter subclass must implement a
    filter() method that takes a single argument, which is an iterator of
    Token objects, and yields a series of Token objects in return.

    Filters that do morphological transformation of tokens (e.g. stemming)
    should set their ``is_morph`` attribute to True.
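
# A minimal sketch of using url_pattern above as a tokenizer expression
# (assuming whoosh.analysis.RegexTokenizer): URLs are kept as single tokens
# instead of being broken apart at "://" and "/".
from whoosh.analysis import RegexTokenizer

url_tok = RegexTokenizer(expression=url_pattern)
print([t.text for t in url_tok(u"see https://example.com/docs for details")])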
        self.seen.add(candidate[self.field])
        return (candidate["user"], candidate[self.field])


def deduper(it, dedupe=True, field="content"):
    seen = set([])
    for item in it:
        if dedupe and item[field] in seen:
            continue
        if dedupe:
            seen.add(item[field])
        yield (item["user"], item[field])
    return None


tok_pat = rcompile(r"[+£€]?\w+(\.?\w+)*")

STOP_WORDS = frozenset(
    ('a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'can', 'for', 'from',
     'have', 'if', 'in', 'is', 'it', 'may', 'not', 'of', 'on', 'or', 'tbd',
     'that', 'the', 'this', 'to', 'when', 'will', 'with', 'yet'))


def Analyzer(expression=tok_pat, stoplist=None, minsize=1, maxsize=None,
             gaps=False):
    if stoplist is None:
        stoplist = STOP_WORDS
    return whoosh.analysis.StandardAnalyzer(expression=expression,
                                            stoplist=stoplist,
                                            minsize=minsize, maxsize=maxsize,
                                            gaps=gaps)
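
# A small usage sketch for the Analyzer wrapper above (it assumes the module
# imports whoosh.analysis): the custom token pattern keeps currency-prefixed
# amounts such as "£25" as single tokens, while the standard lowercase and
# stop-word filters still apply.
ana = Analyzer()
print([t.text for t in ana(u"Paid £25 for the upgrade")])
# roughly: ['paid', '£25', 'upgrade']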
def __init__(self, expr):
    self.expr = rcompile(expr)
def __init__(self):
    self.tokenizer_pattern = rcompile(r"(\w|·)+(\.?(\w|·)+)*")  # Includes l·l
    self.analyzer = StandardAnalyzer(minsize=1, stoplist=None,
                                     expression=self.tokenizer_pattern)
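
# A minimal sketch of what the pattern above buys for Catalan text (assuming
# whoosh.analysis.RegexTokenizer): the interpunct "·" is included in the
# character class, so geminated-l words such as "anul·lar" stay in one token.
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

tok = RegexTokenizer(expression=rcompile(r"(\w|·)+(\.?(\w|·)+)*"))
print([t.text for t in tok(u"vull anul·lar la reserva")])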
def __init__(self):
    self.pattern = ("(?P<hour>[1-9]|10|11|12)(:(?P<mins>[0-5][0-9])"
                    "(:(?P<secs>[0-5][0-9])(\\.(?P<usecs>[0-9]{1,5}))?)?)?"
                    "\\s*(?P<ampm>am|pm)(?=(\\W|$))")
    self.expr = rcompile(self.pattern, re.IGNORECASE)
class SimpleVersion(BaseVersion):
    """An object that parses version numbers such as::

        12.2.5b

    The filter supports a limited subset of PEP 386 versions including::

        1
        1.2
        1.2c
        1.2c3
        1.2.3
        1.2.3a
        1.2.3b4
        10.7.5rc1
        999.999.999c999
    """

    _version_exp = rcompile(r"""
    ^
    (?P<major>\d{1,4})
    (
        [.](?P<minor>\d{1,4})
        (
            [.](?P<release>\d{1,4})
        )?
        (
            (?P<ex>[abc]|rc)
            (?P<exnum>\d{1,4})?
        )?
    )?
    $
    """, verbose=True)

    # (groupid, method, skippable, default)
    _parts = [("major", int),
              ("minor", int),
              ("release", int),
              ("ex", str),
              ("exnum", int)]

    _ex_bits = {"a": 0, "b": 1, "c": 2, "rc": 10, "z": 15}
    _bits_ex = dict((v, k) for k, v in _ex_bits.items())

    __slots__ = ("major", "minor", "release", "ex", "exnum")

    def __init__(self, major=1, minor=0, release=0, ex="z", exnum=0):
        self.major = major
        self.minor = minor
        self.release = release
        self.ex = ex
        self.exnum = exnum

    def to_int(self):
        assert self.major < 1024
        n = self.major << 34

        assert self.minor < 1024
        n |= self.minor << 24

        assert self.release < 1024
        n |= self.release << 14

        exbits = self._ex_bits.get(self.ex, 15)
        n |= exbits << 10

        assert self.exnum < 1024
        n |= self.exnum

        return n

    @classmethod
    def from_int(cls, n):
        major = (n & (1023 << 34)) >> 34
        minor = (n & (1023 << 24)) >> 24
        release = (n & (1023 << 14)) >> 14

        # The "ex" marker occupies the 4 bits between the release and exnum
        # fields (values up to 15), so mask with 15 rather than 7.
        exbits = (n & (15 << 10)) >> 10
        ex = cls._bits_ex.get(exbits, "z")

        exnum = n & 1023

        return cls(major, minor, release, ex, exnum)
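
# A small sketch of the bit-packing scheme above, assuming the SimpleVersion
# class defined here is in scope. The version "1.2.3b4" packs into a single
# integer and unpacks back to the same fields.
v = SimpleVersion(major=1, minor=2, release=3, ex="b", exnum=4)  # "1.2.3b4"
n = v.to_int()
w = SimpleVersion.from_int(n)
print(n, (w.major, w.minor, w.release, w.ex, w.exnum))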
class FunctionPlugin(TaggingPlugin):
    """Adds an arbitrary "function call" syntax to the query parser to allow
    advanced and extensible query functionality.

    This is unfinished and experimental.
    """

    expr = rcompile("""
    [#](?P<name>[A-Za-z_][A-Za-z0-9._]*)  # function name
    (                                     # optional args
        \\[                               # inside square brackets
        (?P<args>.*?)
        \\]
    )?
    """, verbose=True)

    class FunctionNode(syntax.SyntaxNode):
        has_fieldname = False
        has_boost = True
        merging = False

        def __init__(self, name, fn, args, kwargs):
            self.name = name
            self.fn = fn
            self.args = args
            self.kwargs = kwargs
            self.nodes = []
            self.boost = None

        def __repr__(self):
            return "#%s<%r>(%r)" % (self.name, self.args, self.nodes)

        def query(self, parser):
            qs = [n.query(parser) for n in self.nodes]
            kwargs = self.kwargs
            if "boost" not in kwargs and self.boost is not None:
                kwargs["boost"] = self.boost
            # TODO: If this call raises an exception, return an error query
            return self.fn(qs, *self.args, **self.kwargs)

    def __init__(self, fns):
        """
        :param fns: a dictionary mapping names to functions that return a
            query.
        """

        self.fns = fns

    def create(self, parser, match):
        name = match.group("name")
        if name in self.fns:
            fn = self.fns[name]
            argstring = match.group("args")
            if argstring:
                args, kwargs = self._parse_args(argstring)
            else:
                args = ()
                kwargs = {}
            return self.FunctionNode(name, fn, args, kwargs)

    def _parse_args(self, argstring):
        args = []
        kwargs = {}

        parts = argstring.split(",")
        for part in parts:
            if "=" in part:
                name, value = part.split("=", 1)
                # Wrap with str() because Python 2.5 can't handle unicode kws
                name = str(name.strip())
            else:
                name = None
                value = part

            value = value.strip()
            if value.startswith("'") and value.endswith("'"):
                value = value[1:-1]

            if name:
                kwargs[name] = value
            else:
                args.append(value)

        return args, kwargs

    def filters(self, parser):
        return [(self.do_functions, 600)]

    def do_functions(self, parser, group):
        newgroup = group.empty_copy()
        i = 0
        while i < len(group):
            node = group[i]
            if (isinstance(node, self.FunctionNode)
                    and i < len(group) - 1
                    and isinstance(group[i + 1], syntax.GroupNode)):
                nextnode = group[i + 1]
                node.nodes = list(self.do_functions(parser, nextnode))
                if nextnode.boost != 1:
                    node.set_boost(nextnode.boost)
                i += 1
            elif isinstance(node, syntax.GroupNode):
                node = self.do_functions(parser, node)
            newgroup.append(node)
            i += 1
        return newgroup
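
# A minimal, hedged sketch of wiring up the experimental plugin above. The
# "boosted_or" helper, the "boostor" name, and the "content" field are
# illustrative only; the registered function receives the list of sub-queries
# parsed from the following group plus any bracketed string arguments.
from whoosh import query
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.qparser.plugins import FunctionPlugin

def boosted_or(qs, factor="2"):
    # Combine the sub-queries and apply a boost taken from the argument string.
    return query.Or(qs, boost=float(factor))

qp = QueryParser("content", Schema(content=TEXT))
qp.add_plugin(FunctionPlugin({"boostor": boosted_or}))
print(qp.parse(u"#boostor[3] (apple banana)"))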
def __init__(self, pattern, fn=None, modify=None):
    self.pattern = pattern
    self.expr = rcompile(pattern, re.IGNORECASE)
    self.fn = fn
    self.modify = modify
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import u, text_type
from whoosh.analysis.acore import Composable, Token
from whoosh.util.text import rcompile


default_pattern = rcompile(r"\w+(\.?\w+)*")


# Tokenizers

class Tokenizer(Composable):
    """Base class for Tokenizers.
    """

    def __eq__(self, other):
        return other and self.__class__ is other.__class__


class IDTokenizer(Tokenizer):
    """Yields the entire input string as a single token. For use in indexed
    but untokenized fields, such as a document's path.
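
# A short usage sketch for IDTokenizer (assuming whoosh.analysis.IDTokenizer):
# the whole input comes back as one token, which suits path-like ID fields.
from whoosh.analysis import IDTokenizer

idt = IDTokenizer()
print([t.text for t in idt(u"/a/b 123 alpha")])  # a single token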
def __init__(self, plugin, expr):
    self.plugin = plugin
    self.expr = rcompile(expr, re.IGNORECASE)
def __init__(self, expr=None):
    self.expr = rcompile(expr or self.expr)
class PhrasePlugin(Plugin):
    """Adds the ability to specify phrase queries inside double quotes.
    """

    # Didn't use TaggingPlugin because I need to add slop parsing at some
    # point

    # Expression used to find words if a schema isn't available
    wordexpr = rcompile(r'\S+')

    class PhraseNode(syntax.TextNode):
        def __init__(self, text, textstartchar, slop=1):
            syntax.TextNode.__init__(self, text)
            self.textstartchar = textstartchar
            self.slop = slop

        def r(self):
            return "%s %r~%s" % (self.__class__.__name__, self.text,
                                 self.slop)

        def apply(self, fn):
            return self.__class__(self.type, [fn(node) for node in self.nodes],
                                  slop=self.slop, boost=self.boost)

        def query(self, parser):
            text = self.text
            fieldname = self.fieldname or parser.fieldname

            # We want to process the text of the phrase into "words" (tokens),
            # and also record the startchar and endchar of each word
            sc = self.textstartchar
            if parser.schema and fieldname in parser.schema:
                field = parser.schema[fieldname]
                if field.analyzer:
                    # We have a field with an analyzer, so use it to parse
                    # the phrase into tokens
                    tokens = field.tokenize(text, mode="query", chars=True)
                    words = []
                    char_ranges = []
                    for t in tokens:
                        words.append(t.text)
                        char_ranges.append((sc + t.startchar, sc + t.endchar))
                else:
                    # We have a field but it doesn't have a format object,
                    # for some reason (it's self-parsing?), so use process_text
                    # to get the texts (we won't know the start/end chars)
                    words = list(field.process_text(text, mode="query"))
                    char_ranges = [(None, None)] * len(words)
            else:
                # We're parsing without a schema, so just use the default
                # regular expression to break the text into words
                words = []
                char_ranges = []
                for match in PhrasePlugin.wordexpr.finditer(text):
                    words.append(match.group(0))
                    char_ranges.append((sc + match.start(), sc + match.end()))

            qclass = parser.phraseclass
            q = qclass(fieldname, words, slop=self.slop, boost=self.boost,
                       char_ranges=char_ranges)
            return attach(q, self)

    class PhraseTagger(RegexTagger):
        def create(self, parser, match):
            text = match.group("text")
            textstartchar = match.start("text")
            slopstr = match.group("slop")
            slop = int(slopstr) if slopstr else 1
            return PhrasePlugin.PhraseNode(text, textstartchar, slop)

    def __init__(self, expr='"(?P<text>.*?)"(~(?P<slop>[1-9][0-9]*))?'):
        self.expr = expr

    def taggers(self, parser):
        return [(self.PhraseTagger(self.expr), 0)]
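
# A brief, hedged sketch of the syntax the plugin above enables. PhrasePlugin
# is part of the default plugin set, so a stock QueryParser already accepts
# quoted phrases with an optional ~slop suffix; the schema here is illustrative.
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser

qp = QueryParser("content", Schema(content=TEXT))
print(qp.parse(u'"render the whole page"~2'))  # a Phrase query with slop=2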
from tempfile import TemporaryDirectory

from whoosh import qparser
from whoosh.analysis import RegexTokenizer
from whoosh.util.text import rcompile

tokenizer = RegexTokenizer(expression=rcompile(r"[\w/.]+"))
for token in tokenizer(u"Hello there templates/app1/test.html!"):
    print(repr(token.text))

from whoosh.fields import Schema, TEXT, ID
from whoosh.index import create_in

tmp_dir = TemporaryDirectory()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
ix = create_in(tmp_dir.name, schema)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a",
                    content=u"this/is/a/test.html")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this/is/a/hello.html hello a yup")
writer.add_document(title=u"Second document", path=u"/b",
                    content=u"this is a hello.html hello a yup")
writer.commit()

from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema)
# -*- coding: utf-8 -*-
from whoosh.fields import *
from whoosh.formats import Positions, Characters
from whoosh.analysis import StandardAnalyzer, Tokenizer, RegexTokenizer, NgramFilter
from whoosh.analysis.filters import Filter, PassFilter
from whoosh.analysis.filters import LowercaseFilter
from whoosh.analysis.filters import StopFilter, STOP_WORDS
from whoosh.analysis.acore import Token
from whoosh.util.text import rcompile
from whoosh.query import And, Or, Term, FuzzyTerm

INDEXDIR = 'var/indexdir'

ENGLISH = rcompile(r"[a-zA-Z0-9_]+(\.?[a-zA-Z0-9_]+)*")
tokenizer = RegexTokenizer(r"[a-zA-Z0-9_]+(\.?[a-zA-Z0-9_]+)*|\w+")
ngram = NgramFilter(minsize=2, maxsize=2)
lower = LowercaseFilter()


class StreamSplitter(object):
    def __init__(self, tokens):
        self.tokens = tokens
        try:
            self.current = self.tokens.next()
            self.end = False
        except StopIteration:
"when", "will", "with", "yet", "you", "your", )) # Simple pattern for filtering URLs, may be useful url_pattern = rcompile( """ ( [A-Za-z+]+:// # URL protocol \\S+? # URL body (?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end ) | ( # or... \w+([:.]?\w+)* # word characters, with opt. internal colons/dots ) """, verbose=True, ) # Filters class Filter(Composable): """Base class for Filter objects. A Filter subclass must implement a filter() method that takes a single argument, which is an iterator of Token objects, and yield a series of Token objects in return. Filters that do morphological transformation of tokens (e.g. stemming)
import logging

from dataprocessing.utils import PoorDoc
import numpy as np
from whoosh.index import open_dir
from whoosh.query import DateRange
from config import indexdir, vectordir, dociddir
from dateutil import rrule
from datetime import datetime
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from whoosh.analysis import StandardAnalyzer
from whoosh.util.text import rcompile

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger("generate month-vectors")

default_pattern = rcompile(r"\w+(\.?\w+)*")


def RegexTokenizer(text):
    for match in default_pattern.finditer(text):
        term = match.group(0)
        yield term.lower().encode("utf-8")


def getdocvector(date, didentifier):
    doc = PoorDoc(docidentifier=didentifier, date=date)
    tokens = RegexTokenizer(doc.getcontent())
    return Counter(tokens)


def get_months(batchnumber, n_batches):
    '''returns a list of monthranges as part of the total monthranges'''
    month_range = []
try:
    from find_stuff.chmfile import SimpleChmFile
except:
    logger.warn("failed to import chm packages")

try:
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
except:
    logger.warn("failed to import rtf packages")

stem_ana = StemmingAnalyzer() | CJKFilter()
stem_ana.cachesize = -1
pattern2 = rcompile(r"[A-Za-z0-9]+(\.?[A-Za-z0-9]+)*")
stem_ana2 = StemmingAnalyzer(expression=pattern2) | CJKFilter()
stem_ana2.cachesize = -1

schema = Schema(title=TEXT(analyzer=stem_ana2, stored=True),
                content=TEXT(analyzer=stem_ana),
                time=STORED,
                path=ID(stored=True),
                real_path=STORED,
                filetype=ID)

handlers = {}


class TxtHandler(object):

    def extract_content(self, filepath):
def __init__(self):
    self.expression = rcompile(r'(\W|_)')
try:
    from find_stuff.chmfile import SimpleChmFile
except:
    logger.warn("failed to import chm packages")

try:
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
except:
    logger.warn("failed to import rtf packages")

stem_ana = StemmingAnalyzer() | CJKFilter()
stem_ana.cachesize = -1
pattern2 = rcompile(r"[A-Za-z0-9]+(\.?[A-Za-z0-9]+)*")
stem_ana2 = StemmingAnalyzer(expression=pattern2) | CJKFilter()
stem_ana2.cachesize = -1

schema = Schema(title=TEXT(analyzer=stem_ana2, stored=True),
                content=TEXT(analyzer=stem_ana),
                time=STORED,
                path=ID(stored=True),
                real_path=STORED,
                filetype=ID)

handlers = {}


class TxtHandler(object):

    def extract_content(self, filepath):
        with codecs.open(filepath, encoding='utf-8') as fh:
            return fh.read()
class FuzzyTermPlugin(TaggingPlugin):
    """Adds syntax to the query parser to create "fuzzy" term queries, which
    match any term within a certain "edit distance" (number of inserted,
    deleted, or transposed characters) by appending a tilde (``~``) and an
    optional maximum edit distance to a term. If you don't specify an explicit
    maximum edit distance, the default is 1.

    >>> qp = qparser.QueryParser("content", myschema)
    >>> qp.add_plugin(qparser.FuzzyTermPlugin())
    >>> q = qp.parse("Stephen~2 Colbert")

    For example, the following query creates a
    :class:`whoosh.query.FuzzyTerm` query with a maximum edit distance of 1::

        bob~

    The following creates a fuzzy term query with a maximum edit distance of
    2::

        bob~2

    The maximum edit distance can only be a single digit. Note that edit
    distances greater than 2 can take an extremely long time and are generally
    not useful.

    You can specify a prefix length using ``~n/m``. For example, to allow a
    maximum edit distance of 2 and require a prefix match of 3 characters::

        johannson~2/3

    To specify a prefix with the default edit distance::

        johannson~/3
    """

    expr = rcompile("""
    (?<=\\S)                      # Only match right after non-space
    ~                             # Initial tilde
    (?P<maxdist>[0-9])?           # Optional maxdist
    (/                            # Optional prefix slash
        (?P<prefix>[1-9][0-9]*)   # prefix
    )?                            # (end prefix group)
    """, verbose=True)

    class FuzzinessNode(syntax.SyntaxNode):
        def __init__(self, maxdist, prefixlength, original):
            self.maxdist = maxdist
            self.prefixlength = prefixlength
            self.original = original

        def __repr__(self):
            return "<~%d/%d>" % (self.maxdist, self.prefixlength)

    class FuzzyTermNode(syntax.TextNode):
        qclass = query.FuzzyTerm

        def __init__(self, wordnode, maxdist, prefixlength):
            self.fieldname = wordnode.fieldname
            self.text = wordnode.text
            self.boost = wordnode.boost
            self.startchar = wordnode.startchar
            self.endchar = wordnode.endchar
            self.maxdist = maxdist
            self.prefixlength = prefixlength

        def r(self):
            return "%r ~%d/%d" % (self.text, self.maxdist, self.prefixlength)

        def query(self, parser):
            # Use the superclass's query() method to create a FuzzyTerm query
            # (it looks at self.qclass), just because it takes care of some
            # extra checks and attributes
            q = syntax.TextNode.query(self, parser)
            # Set FuzzyTerm-specific attributes
            q.maxdist = self.maxdist
            q.prefixlength = self.prefixlength
            return q

    def create(self, parser, match):
        mdstr = match.group("maxdist")
        maxdist = int(mdstr) if mdstr else 1

        pstr = match.group("prefix")
        prefixlength = int(pstr) if pstr else 0

        return self.FuzzinessNode(maxdist, prefixlength, match.group(0))

    def filters(self, parser):
        return [(self.do_fuzzyterms, 0)]

    def do_fuzzyterms(self, parser, group):
        newgroup = group.empty_copy()
        i = 0
        while i < len(group):
            node = group[i]
            if i < len(group) - 1 and isinstance(node, syntax.WordNode):
                nextnode = group[i + 1]
                if isinstance(nextnode, self.FuzzinessNode):
                    node = self.FuzzyTermNode(node, nextnode.maxdist,
                                              nextnode.prefixlength)
                    i += 1
            if isinstance(node, self.FuzzinessNode):
                node = syntax.to_word(node)
            if isinstance(node, syntax.GroupNode):
                node = self.do_fuzzyterms(parser, node)
            newgroup.append(node)
            i += 1
        return newgroup
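
# A short, hedged sketch of the prefix syntax described above; the schema and
# field name are illustrative. Parsing "johannson~2/3" should yield a
# whoosh.query.FuzzyTerm with maxdist=2 and prefixlength=3.
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, FuzzyTermPlugin

qp = QueryParser("content", Schema(content=TEXT))
qp.add_plugin(FuzzyTermPlugin())
q = qp.parse(u"johannson~2/3")
print(type(q).__name__, q.maxdist, q.prefixlength)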
def find_unused_templates():
    start = time.perf_counter()
    print('Finding all unused templates...')
    print(' Getting global templates...')
    global_templates_files, global_templates = find_global_templates()
    print(' Done.\n Getting app templates...')
    app_templates_files, app_templates = find_app_templates()
    print(' Done.')
    templates = global_templates + app_templates
    template_files = global_templates_files + app_templates_files
    # templates.sort()
    template_files.sort()
    print(' Getting python files...')
    py_files, pys = find_py_files()
    print(' Done.')
    all_files = py_files + template_files
    tl_count = [0 for t in templates]
    unused_templates = []

    print(' Creating Index', end='')
    tmp_dir = TemporaryDirectory()
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    content=TEXT(analyzer=RegexTokenizer(expression=rcompile(r"[\w/.]+"))))
    ix = create_in(tmp_dir.name, schema)
    writer = ix.writer()
    for filename in all_files:
        print('.', end='')  # , flush=True)
        with open(filename, 'r') as f:
            # print('WHOOSH', filename, filename, f)
            # content = '/n'.join(f.readlines())
            # if content:
            #     print('HAS CONTENT')
            #     print(content)
            u_filename = filename
            try:
                # Python2
                u_filename = unicode(filename)
            except NameError:
                pass
            writer.add_document(title=u_filename, path=u_filename,
                                content=six.u('/n'.join(f.readlines())))
            # content=content)
    print('')  # , flush=True)
    writer.commit()
    print(' Done.')

    print(' Searching through templates for references', end='')  # , flush=True)
    with ix.searcher() as searcher:
        for count, template in enumerate(templates):
            print('.', end="")  # , flush=True)
            query = QueryParser("content", ix.schema).parse(template)
            results = searcher.search(query)
            if len(results) < 1:
                unused_templates.append(template)
    print('')  # , flush=True)
    print(' Done.')

    if not unused_templates:
        print('No unused templates found.')
    else:
        print('\nUnused templates:')
        for template in unused_templates:
            print(template)

    end = time.perf_counter()
    print('Finished in ' + str(end - start) + ' seconds.')
    return unused_templates