Example #1
            processed = 0

            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue  # only main-namespace articles
                if not text: continue  # skip documents with empty text
                if text[:9].lower() == '#redirect': continue  # skip redirect pages

                processed += 1


                text = unwiki(text)

                tokens = normalise_gently(filter(good, utils.tokens(text)))
                tokens_title = normalise_gently(filter(good, utils.tokens(title)))
                round_tokens |= set(tokens_title) | set(tokens)


            for w in round_tokens:
                record = bdata.records.add()
                record.key = w
                # Append a placeholder part and clear it, so record.value is
                # initialised with an empty parts list.
                record.value.parts.append('')
                del record.value.parts[:]

            t2 = time()

            # Index
            iserver.feedData(bdata, deadline_ms=10)
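Note: the snippet above relies on helpers defined elsewhere in the project (unwiki, utils.tokens, good, normalise_gently). A minimal, self-contained sketch of the same title/text tokenisation step, with hypothetical stand-in implementations for those helpers:

import re

def tokens(text):
    # Stand-in for utils.tokens(): split on non-word characters.
    return re.findall(r"\w+", text, re.UNICODE)

def good(token):
    # Stand-in filter: drop one-character tokens and pure numbers.
    return len(token) > 1 and not token.isdigit()

def normalise_gently(toks):
    # Stand-in normalisation: lower-case only, no stemming.
    return [t.lower() for t in toks]

round_tokens = set()
title = "Example article"
text = "Plain text as it would look after unwiki()"
tokens_text = normalise_gently(filter(good, tokens(text)))
tokens_title = normalise_gently(filter(good, tokens(title)))
round_tokens |= set(tokens_title) | set(tokens_text)
print(sorted(round_tokens))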
Example #2
            processed = 0

            for doc in docgroup:
                if not doc: break

                (title, ns, sha1, text) = doc

                if ns != '0': continue  # only main-namespace articles
                if not text: continue  # skip documents with empty text
                if text[:9].lower() == '#redirect': continue  # skip redirect pages

                processed += 1

                text = unwiki(text)

                tokens = normalise_gently(filter(good, utils.tokens(text)))
                tokens_title = normalise_gently(
                    filter(good, utils.tokens(title)))
                round_tokens |= set(tokens_title) | set(tokens)

            for w in round_tokens:
                record = bdata.records.add()
                record.key = w
                # Append a placeholder part and clear it, so record.value is
                # initialised with an empty parts list.
                record.value.parts.append('')
                del record.value.parts[:]

            t2 = time()

            # Index
            iserver.feedData(bdata, deadline_ms=10)
Example #3
    def __init__(self, query, mongo_cred, server='tcp://*****:*****',
                 store_path=None):
        # NOTE: parts of the signature and of the connection string are
        # redacted ('*****') in this listing; store_path is assumed to be
        # a parameter since it is used below.
        MONGO_ADDRESS = 'mongodb://{user}:{password}@{host}/{db}'.format(
            user=mongo_cred['user'], password=mongo_cred['password'],
            host=mongo_cred['host'], db=mongo_cred['db'])
        self.mongo = MongoClient(MONGO_ADDRESS)
        self.db = self.mongo[mongo_cred['db']]

        index = self.index = IndexServer(server, store_path)


        self._TIME()
        query_tokens = map(self.correct_token, tokens(query))

        querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
        querysets = filter(lambda s: s, querysets)
        if not querysets: raise NotEnoughEntropy()
        self._TIME('proc')

        kw_docsets = defaultdict(lambda: frozenset())
        doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
        self.freq = freq = defaultdict(lambda: Counter())
        docs = None

        for queryset in querysets:
            matched_docs = set()

            for kw in queryset:
                self._TIME()
                try:
                    res = index.query(kw, max_mistakes=0, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    try:
                        res = index.query(kw, max_mistakes=0, timeout=4)
                    except rpcz.RpcDeadlineExceeded:
                        res = index.query(kw, max_mistakes=0, timeout=5)

                if res.exact_total == 0:
                    try:
                        res = index.query(kw, max_mistakes=1, timeout=3)
                    except rpcz.RpcDeadlineExceeded:
                        self.extraquery_deadline = True
                self._TIME('index')

                for record in res.values:
                    key = record.key
                    if key in kw_docsets:
                        matched_docs |= kw_docsets[key]
                        continue
                    data = record.value.parts

                    docpostings = map(cPickle.loads, data)

                    key_set = set()
                    for (sha1, positions) in docpostings:
                        key_set.add(sha1)
                        matched_docs.add(sha1)
                        doc_poslists[sha1][key].append(positions)
                        freq[key][sha1] += len(positions)
                    kw_docsets[key] = frozenset(key_set)
                self._TIME('proc')
            if docs is None:
                docs = matched_docs
            else:
                docs &= matched_docs
            if not docs:
                break
            self._TIME('proc')


        doc_count = Counter()
        doc_count.update({kw: len(freq[kw]) for kw in freq})

        N = self.N = self.db.articles.count()
        idf = {kw: max(0.4, log((N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5))) for kw in freq}

        self.poslists = {sha1: merge_sorted([l for klists in doc_poslists[sha1].values() for l in klists]) for sha1 in docs}
        self._TIME('proc')

        # Here comes BM25 to save the world!
        scores = []
        avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
        doc_headers = self.db.articles.find({'_id': {'$in': list(docs)}, 'size': {'$gt': 0}}, {'size':1, 'title':1})
        query_tokens = set([t for qs in query_tokens for t in qs])
        for d in doc_headers:
            score = 0

            sha1 = d['_id']
            size = d['size']
            title = d['title']

            for kw in freq:
                m = (freq[kw][sha1] / size  * (k1 + 1)) / (freq[kw][sha1] / size + k1 * (1 - b + b * size / avg_size))
                score += idf[kw] * m

            # Prioritise title matches (our own heuristic)
            keywords_bag = Counter(query_tokens)
            title_tokens = normalise_gently(tokens(title))
            title_bag = Counter(title_tokens)
            both = keywords_bag & title_bag
            both_c = sum(both.values())
            ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
            score += 10 * ratio

            tokens_title = normalise_drop(title_tokens)
            title_set = set(tokens_title)
            both = set(freq.keys()) & title_set
            ratio = len(both) / len(freq)
            score += 10 * ratio

            scores.append((sha1, score))

        self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
        self._TIME('ranking')
        self.results = map(lambda p: p[0], self.scores)
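A note on the ranking in Example #3: the inner loop computes a BM25-style score with the IDF floored at 0.4 and the term frequency taken relative to document size. A small self-contained sketch of that per-keyword contribution, with made-up values for k1, b and the document statistics (in the original these come from module-level constants and MongoDB):

from math import log

k1, b = 1.5, 0.75  # hypothetical values; the originals are module-level constants

def bm25_term(tf, size, avg_size, df, N):
    # tf: keyword occurrences in the document, size: document length,
    # avg_size: average document length, df: number of documents containing
    # the keyword, N: total number of indexed documents.
    idf = max(0.4, log((N - df + 0.5) / (df + 0.5)))
    rel_tf = float(tf) / size
    return idf * rel_tf * (k1 + 1) / (rel_tf + k1 * (1 - b + b * float(size) / avg_size))

print(bm25_term(tf=3, size=500, avg_size=800.0, df=120, N=1000000))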
Example #4
    def __init__(self,
                 query,
                 mongo_cred,
                 server='tcp://*****:*****',
                 store_path=None):
        # NOTE: parts of the signature and of the connection string are
        # redacted ('*****') in this listing; store_path is assumed to be
        # a parameter since it is used below.
        MONGO_ADDRESS = 'mongodb://{user}:{password}@{host}/{db}'.format(
            user=mongo_cred['user'],
            password=mongo_cred['password'],
            host=mongo_cred['host'],
            db=mongo_cred['db'])
        self.mongo = MongoClient(MONGO_ADDRESS)
        self.db = self.mongo[mongo_cred['db']]

        index = self.index = IndexServer(server, store_path)

        self._TIME()
        query_tokens = map(self.correct_token, tokens(query))

        querysets = set([frozenset(normalise_drop(ts)) for ts in query_tokens])
        querysets = filter(lambda s: s, querysets)
        if not querysets: raise NotEnoughEntropy()
        self._TIME('proc')

        kw_docsets = defaultdict(lambda: frozenset())
        doc_poslists = defaultdict(lambda: defaultdict(lambda: []))
        self.freq = freq = defaultdict(lambda: Counter())
        docs = None

        for queryset in querysets:
            matched_docs = set()

            for kw in queryset:
                self._TIME()
                try:
                    res = index.query(kw, max_mistakes=0, timeout=3)
                except rpcz.RpcDeadlineExceeded:
                    try:
                        res = index.query(kw, max_mistakes=0, timeout=4)
                    except rpcz.RpcDeadlineExceeded:
                        res = index.query(kw, max_mistakes=0, timeout=5)

                if res.exact_total == 0:
                    try:
                        res = index.query(kw, max_mistakes=1, timeout=3)
                    except rpcz.RpcDeadlineExceeded:
                        self.extraquery_deadline = True
                self._TIME('index')

                for record in res.values:
                    key = record.key
                    if key in kw_docsets:
                        matched_docs |= kw_docsets[key]
                        continue
                    data = record.value.parts

                    docpostings = map(cPickle.loads, data)

                    key_set = set()
                    for (sha1, positions) in docpostings:
                        key_set.add(sha1)
                        matched_docs.add(sha1)
                        doc_poslists[sha1][key].append(positions)
                        freq[key][sha1] += len(positions)
                    kw_docsets[key] = frozenset(key_set)
                self._TIME('proc')
            if docs is None:
                docs = matched_docs
            else:
                docs &= matched_docs
            if not docs:
                break
            self._TIME('proc')

        doc_count = Counter()
        doc_count.update({kw: len(freq[kw]) for kw in freq})

        N = self.N = self.db.articles.count()
        idf = {
            kw: max(0.4, log(
                (N - doc_count[kw] + 0.5) / (doc_count[kw] + 0.5)))
            for kw in freq
        }

        self.poslists = {
            sha1: merge_sorted(
                [l for klists in doc_poslists[sha1].values() for l in klists])
            for sha1 in docs
        }
        self._TIME('proc')

        # Here comes BM25 to save the world!
        scores = []
        avg_size = self.db.service.find_one({'_id': 'avg_len'})['val']
        doc_headers = self.db.articles.find(
            {
                '_id': {
                    '$in': list(docs)
                },
                'size': {
                    '$gt': 0
                }
            }, {
                'size': 1,
                'title': 1
            })
        query_tokens = set([t for qs in query_tokens for t in qs])
        for d in doc_headers:
            score = 0

            sha1 = d['_id']
            size = d['size']
            title = d['title']

            for kw in freq:
                m = (freq[kw][sha1] / size *
                     (k1 + 1)) / (freq[kw][sha1] / size + k1 *
                                  (1 - b + b * size / avg_size))
                score += idf[kw] * m

            # Prioritise title matches (our own heuristic)
            keywords_bag = Counter(query_tokens)
            title_tokens = normalise_gently(tokens(title))
            title_bag = Counter(title_tokens)
            both = keywords_bag & title_bag
            both_c = sum(both.values())
            ratio = both_c / (len(query_tokens) + len(title_tokens) - both_c)
            score += 10 * ratio

            tokens_title = normalise_drop(title_tokens)
            title_set = set(tokens_title)
            both = set(freq.keys()) & title_set
            ratio = len(both) / len(freq)
            score += 10 * ratio

            scores.append((sha1, score))

        self.scores = sorted(scores, key=lambda p: p[1], reverse=True)
        self._TIME('ranking')
        self.results = map(lambda p: p[0], self.scores)
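The two `score += 10 * ratio` additions implement the title heuristic mentioned in the comment: the first ratio is a Jaccard-style overlap between the query bag and the title bag, the second the fraction of matched index keywords that also occur in the drop-normalised title. A minimal sketch of the first ratio, assuming plain whitespace tokenisation (the helper name is hypothetical):

from collections import Counter

def title_overlap_ratio(query_tokens, title_tokens):
    # Jaccard-style overlap on multisets, mirroring the
    # keywords_bag & title_bag intersection used above.
    both_c = sum((Counter(query_tokens) & Counter(title_tokens)).values())
    return float(both_c) / (len(query_tokens) + len(title_tokens) - both_c)

print(title_overlap_ratio("free online encyclopedia".split(),
                          "wikipedia the free encyclopedia".split()))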