Code example #1
File: searcher.py  Project: ticcky/kwesa
    def build_index(self, db_file, outfile):
        db = sqlite3.connect(db_file)
        
        self.index = index = defaultdict(dict)  # word -> {doc_id: 1}

        start_id = 0

        # Page through the document table in batches of ITER_STEP rows,
        # resuming after the last doc id seen so far.
        while True:
            q = self.DB_ITER_QUERY.format(id=start_id, limit=self.ITER_STEP)
            print(q)
            res = db.execute(q)
            res_rows = res.fetchall()
            if not res_rows:
                break

            for doc_id, doc_body in res_rows:
                # Advance the paging cursor to the last doc id seen.
                start_id = doc_id = int(doc_id)
                doc_body = doc_body.lower()
                words = kwutils.tokenize(doc_body)
                w_stemmed = kwutils.stem(words, self.p)
                w_stopped = kwutils.filter_stopwords(w_stemmed, self.sw)

                for w in w_stopped:
                    w = w.strip()
                    if len(w) > 0:
                        index[w][doc_id] = 1
        
        db.close()

        with open(outfile, 'w') as f_out:
            f_out.write(json.dumps(index))
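One thing to note when reusing the persisted index: json.dumps turns the integer doc ids used as inner keys into strings, so a loader has to convert them back. A minimal sketch of reading the file (the path and the sample key are illustrative):

import json

with open("index.json") as f:
    index = json.load(f)

# Inner keys come back as strings after the JSON round trip.
# Index keys are stemmed words, e.g. "exampl" for "example".
docs = {int(doc_id) for doc_id in index.get("exampl", {})}
print(docs)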
Code example #2
File: esa_analyze.py  Project: ticcky/kwesa
    def analyze(self, sentence):
        words = kwutils.tokenize(sentence)
        w_stemmed = kwutils.stem(words, self.p)
        w_stopped = kwutils.filter_stopwords(w_stemmed, self.sw)

        # ESA-style concept vector: the set of documents that contain at
        # least one of the sentence's stemmed, stop-filtered words.
        vector = set()
        for word in w_stopped:
            for doc_id in self.ndx.get(word, []):
                vector.add(doc_id)
        return vector
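The returned set lends itself to a simple overlap score between two sentences, e.g. Jaccard similarity over their concept sets. A sketch, assuming an already-constructed instance of this class (the analyzer variable is hypothetical):

def jaccard(a, b):
    # Jaccard similarity of two sets; defined as 0.0 when both are empty.
    union = a | b
    return len(a & b) / len(union) if union else 0.0

v1 = analyzer.analyze("cats chase mice")
v2 = analyzer.analyze("a mouse chased by a cat")
print(jaccard(v1, v2))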
Code example #3
File: searcher.py  Project: ticcky/kwesa
    def search(self, phrase):
        phrase = phrase.lower()
        words = kwutils.tokenize(phrase)
        w_stemmed = kwutils.stem(words, self.p)
        w_stopped = kwutils.filter_stopwords(w_stemmed, self.sw)

        # doc_id -> {word: 1} for every query word found in the document.
        rank = defaultdict(lambda: defaultdict(int))

        for word in w_stopped:
            if not word:
                continue

            # Look the word up in the prebuilt inverted index.
            res = self.indexer.find(word)
            for doc_id in res:
                rank[doc_id][word] = 1

        return self.rank_docs(rank, w_stopped)
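rank_docs itself is not part of the listing; a plausible, purely illustrative version would order documents by how many distinct query words they matched (the real kwesa scoring may differ):

    def rank_docs(self, rank, w_stopped):
        # Hypothetical scorer: rank documents by the fraction of distinct
        # query words they matched, best matches first.
        n_words = max(len(set(w_stopped)), 1)
        scored = sorted(((len(words), doc_id) for doc_id, words in rank.items()),
                        reverse=True)
        return [(doc_id, matched / n_words) for matched, doc_id in scored]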
Code example #4
    def build(self, docpath, outfile):
        p = PorterStemmer()
        sw = stopwords.StopWords(self.stopword_file)

        ndx = defaultdict(list)  # inverted index: word -> list of doc ids

        for filename in os.listdir(docpath):
            if not filename.endswith(".txt"):
                continue

            # NOTE: in Python 3, hash() of a str is randomized per interpreter
            # run, so ids persisted here could not be recomputed later; a
            # stable digest (e.g. zlib.crc32) would be safer.
            doc_id = hash(filename[:-len(".txt")])
            with open(os.path.join(docpath, filename)) as f:
                f_content = kwutils.normalize(f.read().lower())

            words = kwutils.tokenize(f_content)
            w_stemmed = kwutils.stem(words, p)
            w_stopped = kwutils.filter_stopwords(w_stemmed, sw)

            for word in w_stopped:
                if len(word) > 0:
                    if doc_id not in ndx[word]:
                        ndx[word].append(doc_id)

        with open(outfile, 'w') as f:
            f.write(json.dumps(ndx))
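All of the snippets above lean on three kwutils helpers that the page does not show. A minimal sketch of what they plausibly do; the real kwesa implementations may differ, and the stemmer is assumed to expose a .stem(word) method:

import re

def tokenize(text):
    # Split lowercased text into alphanumeric tokens.
    return re.findall(r"[a-z0-9]+", text.lower())

def stem(words, stemmer):
    # Apply a Porter-style stemmer to every token.
    return [stemmer.stem(w) for w in words]

def filter_stopwords(words, stopwords):
    # Keep tokens that are not in the stopword container.
    return [w for w in words if w not in stopwords]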