Example #1
0
            # Build inverted-index postings (token -> [(sha1, positions)]) for
            # one group of documents.  NOTE(review): fragment — enclosing def
            # is outside this view.
            postings = defaultdict(lambda: [])
            bdata = index_pb.BuilderData()
            docs = []

            for doc in docgroup:
                # A falsy entry marks the end of the group.
                if not doc: break

                (title, ns, sha1, text) = doc

                # Index only main-namespace ('0') articles that have
                # non-empty, non-redirect wikitext.
                if ns != '0': continue
                if not text: continue  # empty text shouldn't occur — skip defensively
                if text[:9].lower() == ('#redirect'): continue

                # Strip wiki markup, then tokenise body and title into
                # (position, span) index lists.
                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                # Title tokens get negated positions — presumably to keep
                # them distinct from body positions; TODO confirm against
                # negate_tokens.
                tokens_title = negate_tokens(
                    normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                # Per-article token frequency counts.
                article_tokens = Counter()

                # token -> list of positions within this one document
                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                # NOTE(review): iteritems() is Python 2 only; loop body is
                # cut off at this view boundary.
                for w, l in thisdoc_postings.iteritems():
Example #2
0
def tokens(string, ilist=None):
    """Return the substrings of *string* covered by the (from, to) spans in *ilist*.

    When *ilist* is omitted, the spans are computed with itokenise(string).
    """
    spans = itokenise(string) if ilist is None else ilist
    return [string[start:stop] for start, stop in spans]
Example #3
0
def tokens(string, ilist=None):
    """Slice *string* into token substrings using half-open (from, to) pairs.

    Falls back to itokenise(string) when no span list is supplied.
    """
    if ilist is None:
        ilist = itokenise(string)
    pieces = []
    for lo, hi in ilist:
        pieces.append(string[lo:hi])
    return pieces
Example #4
0
            # Build inverted-index postings (token -> [(sha1, positions)]) for
            # one group of documents.  NOTE(review): fragment — enclosing def
            # is outside this view.
            postings = defaultdict(lambda: [])
            bdata = index_pb.BuilderData()
            docs = []

            for doc in docgroup:
                # A falsy entry marks the end of the group.
                if not doc: break

                (title, ns, sha1, text) = doc

                # Index only main-namespace ('0') articles that have
                # non-empty, non-redirect wikitext.
                if ns != '0': continue
                if not text: continue # empty text shouldn't occur — skip defensively
                if text[:9].lower() == ('#redirect'): continue


                # Strip wiki markup, then tokenise body and title into
                # (position, span) index lists.
                text = unwiki(text)
                itokens = list(itokenise(text))
                itokens_title = list(itokenise(title))

                tokens = normalise(utils.tokens(text, itokens))
                # Title tokens get negated positions — presumably to keep
                # them distinct from body positions; TODO confirm against
                # negate_tokens.
                tokens_title = negate_tokens(normalise(utils.tokens(title, itokens_title)))
                tokens_all = tokens_title + tokens
                if not tokens_all: continue

                # Per-article token frequency counts.
                article_tokens = Counter()

                # token -> list of positions within this one document
                thisdoc_postings = defaultdict(lambda: [])
                for i, w in tokens_all:
                    article_tokens[w] += 1
                    thisdoc_postings[w].append(i)
                # NOTE(review): iteritems() is Python 2 only.
                # Fold this document's per-token position lists into the
                # group-wide postings, keyed by the document's sha1.
                for w, l in thisdoc_postings.iteritems():
                    postings[w].append((sha1, l))