from collections import defaultdict, deque
from threading import Lock


def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.
    """

    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        if not line.startswith("s("):
            continue

        line = line[2:]
        num = int(line[:line.find(",")])
        qt = line.find("'")
        line = line[qt + 1:]
        qt = line.find("'")
        word = line[:qt].lower()

        if not word.isalpha():
            continue

        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
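# A minimal usage sketch, assuming a local copy of WordNet's prolog
# distribution; entries in wn_s.pl look roughly like
#   s(100001740,1,'entity',n,1,11).
# so the code above reads the synset number up to the first comma and the
# word between the first pair of single quotes.
f = open("wn_s.pl")
word2nums, num2words = parse_file(f)
f.close()
print word2nums["entity"]  # -> list of synset numbers containing "entity"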
def __init__(self, ruletable):
    """
    :param ruletable: a string containing the rule data, separated
        by newlines.
    """

    self.rules = defaultdict(list)
    self.read_rules(ruletable)
def __init__(self, ixreader, fieldname, model=Bo1Model):
    """
    :param ixreader: A :class:`whoosh.reading.IndexReader` object.
    :param fieldname: The name of the field in which to search.
    :param model: (classify.ExpansionModel) The model to use for expanding
        the query terms. If you omit this parameter, the expander uses
        classify.Bo1Model by default.
    """

    self.fieldname = fieldname

    if type(model) is type:
        model = model(ixreader, fieldname)
    self.model = model

    # Cache the collection frequency of every term in this field. This
    # turns out to be much faster than reading each individual weight
    # from the term index as we add words.
    self.collection_freq = dict((word, freq) for word, _, freq
                                in ixreader.iter_field(fieldname))

    # Maps words to their weight in the top N documents.
    self.topN_weight = defaultdict(float)

    # Total weight of all terms in the top N documents.
    self.top_total = 0
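# Usage sketch ("ixreader" stands in for an open
# whoosh.reading.IndexReader). Because of the type check above, passing
# the Bo1Model class and passing a pre-built instance are equivalent:
#
#   Expander(ixreader, "content", model=Bo1Model)
#   Expander(ixreader, "content", model=Bo1Model(ixreader, "content"))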
def word_values(self, value, doc_boost=1.0, **kwargs):
    # Count the frequency of each unstopped token in the value, then
    # encode each (frequency, boost) pair for storage.
    seen = defaultdict(int)
    for t in unstopped(self.analyzer(value, **kwargs)):
        seen[t.text] += 1

    encode = self.encode
    return ((w, freq, encode((freq, doc_boost)))
            for w, freq in seen.iteritems())
def word_values(self, value, start_pos=0, **kwargs):
    seen = defaultdict(list)
    for t in unstopped(self.analyzer(value, positions=True,
                                     start_pos=start_pos, **kwargs)):
        seen[t.text].append(start_pos + t.pos)

    encode = self.encode
    return ((w, len(poslist), encode(poslist))
            for w, poslist in seen.iteritems())
def __init__(self, schema):
    self.schema = schema
    self.maxdoc = 0
    self._sync_lock = Lock()

    self.termlists = defaultdict(list)

    # One term -> postings dictionary per field number
    self.invertedindex = {}
    for fieldnum in xrange(len(schema)):
        self.invertedindex[fieldnum] = defaultdict(list)

    self.indexfreqs = defaultdict(int)
    self.storedfields = {}
    self.fieldlengths = defaultdict(int)
    self.fieldlength_totals = defaultdict(int)
    self.vectors = {}
    self.deleted = set()

    # Maps field numbers to their position in the list of stored fields
    self._stored_to_pos = dict((fnum, i) for i, fnum
                               in enumerate(self.schema.stored_fields()))
def word_values(self, value, **kwargs):
    seen = defaultdict(int)

    if self.boost_as_freq:
        # Use each token's boost as its frequency contribution
        for t in unstopped(self.analyzer(value, boosts=True, **kwargs)):
            seen[t.text] += int(t.boost)
    else:
        for t in unstopped(self.analyzer(value, **kwargs)):
            seen[t.text] += 1

    encode = self.encode
    return ((w, freq, encode(freq)) for w, freq in seen.iteritems())
def word_values(self, value, start_pos=0, start_char=0, **kwargs):
    seen = defaultdict(list)
    for t in unstopped(self.analyzer(value, positions=True, chars=True,
                                     start_pos=start_pos,
                                     start_char=start_char, **kwargs)):
        seen[t.text].append((t.pos,
                             start_char + t.startchar,
                             start_char + t.endchar))

    encode = self.encode
    return ((w, len(ls), encode(ls)) for w, ls in seen.iteritems())
def __init__(self, ix, postlimit, blocklimit, name=None):
    """
    :param ix: the Index object in which to write the new segment.
    :param postlimit: the maximum size for a run in the posting pool.
    :param blocklimit: the maximum number of postings in a posting block.
    :param name: the name of the segment.
    """

    self.index = ix
    self.schema = ix.schema
    self.storage = storage = ix.storage
    self.name = name or ix._next_segment_name()

    self.max_doc = 0

    self.pool = postpool.PostingPool(postlimit)

    # Create mappings of field numbers to the position of that field in
    # the lists of scorable and stored fields. For example, consider a
    # schema with fields (A, B, C, D, E, F). If B, D, and E are scorable,
    # then the list of scorable fields is (B, D, E). The _scorable_to_pos
    # dictionary would then map B -> 0, D -> 1, and E -> 2.
    self._scorable_to_pos = dict((fnum, i) for i, fnum
                                 in enumerate(self.schema.scorable_fields()))
    self._stored_to_pos = dict((fnum, i) for i, fnum
                               in enumerate(self.schema.stored_fields()))

    # Create a temporary segment object just so we can access its
    # *_filename attributes (so if we want to change the naming
    # convention, we only have to do it in one place).
    tempseg = Segment(self.name, 0, 0, None)

    self.termtable = create_terms(storage, tempseg)
    self.docslist = create_storedfields(storage, tempseg)

    self.doclengths = None
    if self.schema.scorable_fields():
        self.doclengths = create_doclengths(storage, tempseg,
                                            len(self._scorable_to_pos))

    postfile = storage.create_file(tempseg.posts_filename)
    self.postwriter = FilePostingWriter(postfile, blocklimit=blocklimit)

    self.vectortable = None
    if self.schema.has_vectored_fields():
        # Table associating document fields with (postoffset, postcount)
        self.vectortable = create_vectors(storage, tempseg)
        vpostfile = storage.create_file(tempseg.vectorposts_filename)
        self.vpostwriter = FilePostingWriter(vpostfile, stringids=True)

    # Keep track of the total number of tokens (across all docs)
    # in each field
    self.field_length_totals = defaultdict(int)
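# Self-contained illustration of the mapping described in the comment
# above: with schema fields (A, B, C, D, E, F) numbered 0-5 and B, D, E
# scorable, scorable_fields() yields the field numbers (1, 3, 4), and
# the dict built from it maps each number to its position in that list.
_scorable = (1, 3, 4)  # field numbers of B, D, E
assert dict((fnum, i)
            for i, fnum in enumerate(_scorable)) == {1: 0, 3: 1, 4: 2}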
def suggestions_and_scores(self, text, weighting=None):
    """Returns a list of possible alternative spellings of 'text', as
    ('word', score, weight) triples, where 'word' is the suggested word,
    'score' is the score that was assigned to the word using
    :meth:`SpellChecker.add_field` or :meth:`SpellChecker.add_scored_words`,
    and 'weight' is the score the word received in the search for the
    original word's ngrams.

    You must add words to the dictionary (using add_field, add_words,
    and/or add_scored_words) before you can use this.

    This is a lower-level method, in case an expert user needs access to
    the raw scores, for example to implement a custom suggestion ranking
    algorithm. Most people will want to call :meth:`~SpellChecker.suggest`
    instead, which simply returns the top N valued words.

    :param text: The word to check.
    :rtype: list
    """

    if weighting is None:
        weighting = TF_IDF()

    grams = defaultdict(list)
    for size in xrange(self.mingram, self.maxgram + 1):
        key = "gram%s" % size
        nga = analysis.NgramAnalyzer(size)
        for t in nga(text):
            grams[key].append(t.text)

    queries = []
    for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        key = "gram%s" % size
        gramlist = grams[key]
        queries.append(query.Term("start%s" % size, gramlist[0],
                                  boost=self.booststart))
        queries.append(query.Term("end%s" % size, gramlist[-1],
                                  boost=self.boostend))
        for gram in gramlist:
            queries.append(query.Term(key, gram))

    q = query.Or(queries)
    ix = self.index()
    s = ix.searcher(weighting=weighting)
    try:
        result = s.search(q)
        return [(fs["word"], fs["score"], result.score(i))
                for i, fs in enumerate(result)
                if fs["word"] != text]
    finally:
        s.close()
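# Hedged usage sketch (commented out; assumes a SpellChecker wired to a
# storage object and populated via add_words(), per the docstring above):
#
#   sc = SpellChecker(storage)
#   sc.add_words([u"render", u"renders", u"rendering"])
#   for word, score, weight in sc.suggestions_and_scores(u"rendr"):
#       print word, score, weight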
def wrapper(self, *args):
    if not hasattr(self, prefix + "cache"):
        cache = {}
        queue = deque()
        refcount = defaultdict(int)
        setattr(self, prefix + "cache", cache)
        setattr(self, prefix + "queue", queue)
        setattr(self, prefix + "refcount", refcount)
    else:
        cache = getattr(self, prefix + "cache")
        queue = getattr(self, prefix + "queue")
        refcount = getattr(self, prefix + "refcount")

    qpend = queue.append
    qpop = queue.popleft

    # Get cache entry or compute if not found
    try:
        result = cache[args]
    except KeyError:
        result = cache[args] = func(self, *args)

    # Record that this key was recently accessed
    qpend(args)
    refcount[args] += 1

    # Purge least recently accessed cache contents
    while len(cache) > size:
        k = qpop()
        refcount[k] -= 1
        if not refcount[k]:
            del cache[k]
            del refcount[k]

    # Periodically compact the queue by removing duplicate keys
    if len(queue) > size * 4:
        for _ in xrange(len(queue)):
            k = qpop()
            if refcount[k] == 1:
                qpend(k)
            else:
                refcount[k] -= 1
        #assert len(queue) == len(cache) == len(refcount) == sum(refcount.itervalues())

    return result
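# Self-contained sketch of the caching strategy the wrapper above uses:
# a FIFO access queue plus a per-key refcount, so that popping a stale
# queue entry never evicts a key that was re-queued more recently. This
# illustrates the technique only; the enclosing decorator factory that
# supplies func, prefix and size is not shown above.
from collections import defaultdict, deque

def _lru_demo(size=2):
    cache, queue, refcount = {}, deque(), defaultdict(int)

    def get(key):
        try:
            result = cache[key]
        except KeyError:
            result = cache[key] = key * 10  # stand-in computation
        queue.append(key)
        refcount[key] += 1
        # Evict the least recently queued keys with no newer accesses
        while len(cache) > size:
            k = queue.popleft()
            refcount[k] -= 1
            if not refcount[k]:
                del cache[k]
                del refcount[k]
        return result

    for key in (1, 2, 1, 3):
        get(key)
    assert sorted(cache) == [1, 3]  # 2 evicted; 1 survived its re-access

_lru_demo()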
def __init__(self, dbfile):
    self.dbfile = dbfile
    # Skip past the start of the file, presumably leaving room for a
    # fixed-size header/directory that is written out later.
    dbfile.seek(2048)
    self.hashes = defaultdict(list)
("erid", "eris"), ("pand", "pans"), ("end", "ens", "s"), ("ond", "ons"), ("lud", "lus"), ("rud", "rus"), ("her", "hes", "pt"), ("mit", "mis"), ("ent", "ens", "m"), ("ert", "ers"), ("et", "es", "n"), ("yt", "ys"), ("yz", "ys")) # Hash the ending rules by the last letter of the target ending _endingrules = defaultdict(list) for rule in _endings: _endingrules[rule[0][-1]].append(rule) _doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt")) def fix_ending(word): if word[-2:] in _doubles: word = word[:-1] for endingrule in _endingrules[word[-1]]: target, newend = endingrule[:2] if word.endswith(target): if len(endingrule) > 2: exceptafter = endingrule[2]