Example #1
    def delete(self, tid, title, field):
        '''Delete the index entries for a document'''

        if tid is None or tid == '' \
                or title is None or title == '' \
                or field is None or field == '':
            return False

        # Original data
        self.tid = tid
        self.title = title
        self.field = field

        # Remove the entry from the hash that stores the original data
        # self.r.hdel(self.field, self.tid)
        self.pipeline.hdel(self.field, self.tid)
        # Remove the copy that would let the data be fetched via the GET parameter of the SORT command
        # self.pipeline.delete('%s:%s' % (self.field, self.tid))

        # Word segmentation
        algor = mmseg.Algorithm(self.title)

        words = []
        for tok in algor:
            # Case-insensitive
            word = tok.text.decode('utf-8').lower()
            words.append(word)
            # Remove the inverted index entry, keyed by the segmented word
            # self.r.srem('%s:%s' % (self.field, word), self.tid)
            self.pipeline.srem('%s:%s' % (self.field, word), self.tid)
        # Remove the score used for ranking search results
        # self.r.del('%s:score:%s' % (self.field, self.tid))
        self.pipeline.delete('%s:score:%s' % (self.field, self.tid))

        # Prefix index
        if self.config.prefix_index_enable is True:
            # Case-insensitive
            word = self.title.decode('utf-8').lower()
            # The prefix index does not include the segmented words
            del words[:]
            words.append(word)
            # Remove the inverted index entry, keyed by the full title
            # self.r.srem('%s:%s' % (self.field, word), self.tid)
            self.pipeline.srem('%s:%s' % (self.field, word), self.tid)

            dic = []
            for word in words:
                for i in range(len(word)):
                    prefix = word[:i+1]
                    dic.append(prefix)
                    # print prefix.encode('utf-8')
                # Add an extra entry for the complete word, marked with '*'
                prefix = '%s%s' % (word, '*')
                dic.append(prefix)
            # self.r.zrem('compl:%s' % (self.field), *dic)
            self.pipeline.zrem('compl:%s' % (self.field), *dic)

        self.pipeline.execute()
        return True
Example #2
    def GenWordSegString(self, ss, FID):
        rawText = ss
        segRes = mmseg.Algorithm(rawText)
        self.segFID = FID
        self.segLst = list()
        for w in segRes:
            # str.isalnum() is False for multi-byte UTF-8 text, so this keeps
            # the Chinese tokens and drops purely alphanumeric ASCII tokens
            if not w.text.isalnum():
                self.segLst.append(w.text)
        self.segInMemo = True
        return self
Example #3
    def segment(text):
        '''
        text should be either utf8 or unicode
        return a list of words in unicode
        '''

        if isinstance(text, unicode):
            text = text.encode('utf8')
        alg = mmseg.Algorithm(text)
        # print '%s [%d..%d]' % (tok.text, tok.start, tok.end) for tok in alg
        return [tok.text.decode('utf8') for tok in alg]
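A minimal usage sketch for segment(); in the original project it appears to be defined inside a class (note the indentation and the missing self), so it is assumed here to be callable as a plain function or staticmethod, with the default dictionaries loaded. The exact word boundaries depend on those dictionaries.

# -*- coding: utf-8 -*-
# Hypothetical usage of segment(); assumes mmseg's default dictionaries.
from pymmseg import mmseg

mmseg.dict_load_defaults()

words = segment(u'今天天气不错')            # unicode input is encoded to UTF-8 internally
print ' '.join(words).encode('utf-8')
print segment('今天天气不错') == words      # a UTF-8 byte string is accepted as-is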
Example #4
    def GenWordSeg(self, inputFile):
        with open(inputFile) as f:
            rawText = f.read()
        segRes = mmseg.Algorithm(rawText)
        self.segFID = inputFile
        self.segLst = list()
        for w in segRes:
            if not w.text.isalnum():
                self.segLst.append(w.text)
        self.segInMemo = True
        return self
Example #5
    def post(self):
        keyword = self.get_argument('keyword')
        algor = mmseg.Algorithm(keyword.encode('utf-8'))
        ret = []
        for tok in algor:
            ret.append({'start' : tok.start})
            ret.append({'end' : tok.end})
            ret.append({'length' : tok.length})
            ret.append({'text' : tok.text.decode('utf-8')})
        self.write(tornado.escape.json_encode(ret))
        self.finish()
Example #6
def build_index_database(key, fic, pos):
    """build index
    
    Args:
        key: the content which gona to be segmented
        fic: fiction object which contains the key
        pos: position of the key word in fiction

    Return:
        None
    """
    try:
        words = mmseg.Algorithm(key)
    except Exception, e:
        print e
        return
Example #7
def word_split(sentence, shall_print=0):
    # mmseg splits floating point numbers incorrectly: "16.5 中午吃饭" would be
    # split as "16 5 中午吃饭" instead of "16.5 中午 吃饭", so '.' (and the
    # full-width '。') is temporarily replaced with a placeholder character.
    SPECIAL_CHARACTER_FOR_FLOAT = 'a'
    sentence = sentence.replace('。', SPECIAL_CHARACTER_FOR_FLOAT)
    sentence = sentence.replace('.', SPECIAL_CHARACTER_FOR_FLOAT)
    algor = mmseg.Algorithm(sentence)
    token_list = []
    for tok in algor:
        if tok.text.replace(SPECIAL_CHARACTER_FOR_FLOAT, '0').isdigit():
            token_list.append((tok.text.replace(SPECIAL_CHARACTER_FOR_FLOAT, '.'),
                               tok.start, tok.end))
        else:
            token_list.append((tok.text, tok.start, tok.end))

    # temporary debug output
    for text, start, end in token_list:
        if shall_print == 1:
            log.info("%s, %d, %d" % (text, start, end))
        elif shall_print == 2:
            print "%s, %d, %d" % (text, start, end)

    return token_list
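A short, hypothetical call to word_split(); mmseg's dictionaries must be loaded first and the exact token boundaries depend on them. Each result item is a (text, start, end) tuple, with the placeholder mapped back to '.' for numeric tokens such as 16.5.

# -*- coding: utf-8 -*-
# Hypothetical usage of word_split(); output depends on the loaded dictionaries.
from pymmseg import mmseg

mmseg.dict_load_defaults()

for text, start, end in word_split('16.5 中午吃饭'):
    print '%s [%d..%d]' % (text, start, end)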
Example #8
# -*- coding: utf8 -*-

from pymmseg import mmseg
 
mmseg.dict_load_defaults()
text = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
algor = mmseg.Algorithm(text)
for tok in algor:
    print '%s [%d..%d]' % (tok.text.decode('utf8'), tok.start, tok.end)
Example #9
def test(text):
    mmseg.dict_load_defaults()
    algor = mmseg.Algorithm(text)
    for tok in algor:
        print '%s [%d..%d]' % (tok.text, tok.start, tok.end)
Example #10
    def index(self, path, field):
        with open(path, 'r') as fp:
            i = 0
            data = {}
            inverted_index = defaultdict(list)
            scores = {}
            prefix_index = []
            for line in fp:
                i = i + 1
                if i % 20000 == 0:
                    print i
                    # Save the original data to a hash
                    self.r.hmset(field, data)
                    #self.pipeline.hmset(field, data)
                    # Build the inverted index, keyed by segmented word
                    for w in inverted_index:
                        self.pipeline.sadd('%s:%s' % (field, w), *inverted_index[w])
                    self.pipeline.execute()
                    # Scores used for ranking search results
                    self.r.mset(scores)
                    #self.pipeline.mset(scores)
                    # Prefix index
                    self.r.zadd('compl:%s' % (field), *prefix_index)
                    #self.pipeline.zadd('compl:%s' % (field), *prefix_index)
                    #self.pipeline.execute()

                    data.clear()
                    inverted_index.clear()
                    scores.clear()
                    del prefix_index[:]

                tid, uid, title, attachments = line.strip().split('\t')
                score = 0
                data[tid] = sj.dumps({'tid' : tid, 'title' : title, 'field' : field})
                # Word segmentation
                algor = mmseg.Algorithm(title)
                words = []
                for tok in algor:
                    # Case-insensitive
                    word = tok.text.decode('utf-8').lower()
                    words.append(word)
                    inverted_index[word].append(tid)
                scores['%s:score:%s' % (field, tid)] = score

                # Prefix index
                if self.config.prefix_index_enable is True:
                    # Case-insensitive
                    word = title.decode('utf-8').lower()
                    # The prefix index does not include the segmented words
                    del words[:]
                    words.append(word)
                    inverted_index[word].append(tid)

                    for w in words:
                        for j in range(len(w)):
                            prefix = w[:j+1]
                            prefix_index.append(prefix)
                            prefix_index.append(0.0)
                        # Add an extra entry for the complete word, marked with '*'
                        prefix = '%s%s' % (w, '*')
                        prefix_index.append(prefix)
                        prefix_index.append(0.0)

            # Save the remaining original data to a hash
            self.r.hmset(field, data)
            #self.pipeline.hmset(field, data)
            # Build the inverted index, keyed by segmented word
            for w in inverted_index:
                self.pipeline.sadd('%s:%s' % (field, w), *inverted_index[w])
            self.pipeline.execute()
            # Scores used for ranking search results
            self.r.mset(scores)
            #self.pipeline.mset(scores)
            # Prefix index
            self.r.zadd('compl:%s' % (field), *prefix_index)
Example #11
    def run(self):
        """thread method"""
        
        #get all the fresh information
        _headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",  
                     "Accept": "text/plain"} 
        request = urllib2.Request(self.newest_url, headers = _headers)
        html_page = urllib2.urlopen(request).read() 
        try: 
            import gzip, StringIO
            data = html_page
            data = StringIO.StringIO(data)
            gzipper = gzip.GzipFile(fileobj=data)
            html = gzipper.read()
            html_page = html
        except:
            pass
        print '抓取%s网站更新线程启动...' % self.thread_name
        print '获取更新部分....', 
        html_page = BeautifulSoup(html_page)
        content = html_page.findAll(self.content_tag, self.content_dict)
        contents = ''.join([str(item) for item in content])
        chapter_infor = BeautifulSoup(contents)
        content = chapter_infor.findAll(self.chapter_tag, self.chapter_dict)
        print ' Done.'
        indexs = 1
        for item in content:
            print '获取第%d个基本信息' % indexs,
            indexs += 1
            contents = str(item)
            types = ''.join(re.findall(self.types_pattern, contents))
            title = ''.join(re.findall(self.title_pattern, contents))
            chapter = ''.join(re.findall(self.chapter_pattern, contents))
            author = ''.join(re.findall(self.author_pattern, contents))
            fiction_url = ''.join(re.findall(self.fiction_url_pattern, contents))
            chapter_url = ''.join(re.findall(self.chapter_url_pattern, contents))
            if not types or not title or \
                not chapter or not author or not fiction_url or not chapter_url:
                print 'Failed.'
                continue
            print chapter_url
            newest_chapter_url = chapter_url
            print '标题:%s, 作者:%s, 小说主页%s' %(title, author, fiction_url),
            print 'Done.'
            host = self.host
            if self.host[len(self.host) - 1] == '/':
                host = self.host[:len(self.host) - 1]
            if chapter_url[0] == '/':
                chapter_url = host + chapter_url
            if fiction_url[0] == '/':
                fiction_url = host + fiction_url
            try:
                web_site = FictionWebSite.objects.get(url = self.host)
            except:
                web_site = FictionWebSite(title = self.thread_name, url = self.host)
                web_site.save()

            try:
                hash_url = HashUrl.objects.get(urls = fiction_url)
                is_exit = True
                fic = Fiction.objects.get(fiction_title = title, author = author)
            except:
                is_exit = False

            if not is_exit:
                try:
                    hash_url = HashUrl(urls = fiction_url)
                    hash_url.save()
                except:
                    continue
                #if the fiction got by crawler is the newest one
                #get the book infor
                print '获取小说%s详细信息' % title,
                book_infor = get_book_infor(self.host, self.thread_name, fiction_url, True)
                print 'Done.'
                ids = re.findall(ALL_PATTERN[web_site.title]['ids_pattern'], fiction_url)
                types = '4' if not STYLE[self.thread_name].has_key(book_infor['types']) else \
                     STYLE[self.thread_name][(book_infor['types'])]
                try:
                    fic = Fiction(fiction_title = title, 
                        fiction_avatar_url = book_infor['avatar'],
                        fiction_intro = book_infor['intro'],
                        fiction_id = ids[0],
                        fiction_style = types,
                        total_word = book_infor['total_word'],
                        com_word = "",
                        source_site = web_site,
                        click_time = book_infor['click_time'],
                        rec_time = book_infor['rec_time'],
                        author = author,
                        stock_time = 0,
                        author_url = "",
                    )
                    fic.save() 
                    fic.fiction_nid = create_nid(fic.id)
                    fic.save()
                    member = MemberShip(fiction = fic, website = web_site, fiction_url = fiction_url)
                    member.save()
                    del member
                except:
                    continue
                
                #search only by fiction title
                for item in mmseg.Algorithm(title):
                    try:
                        index = Index.objects.get(key = item.text)
                    except:
                        index = Index(key = item.text)
                        index.save()
                    IndexFictionRelationship.objects.create(key = index,
                        fiction = fic,
                        position = ','.join([str(item.start), str(item.end)]),
                        bit = '2',#chapter
                    )
                #get all chapters
                if book_infor.has_key('read_url'):
                    chapter_url = book_infor['read_url']
                else:
                    chapter_url = build_url_fiction(ids[0], web_site.title)
                print '获取所有章节.' ,
                get_chapters_thread = threading.Thread(target = chapter_func[web_site.title], 
                    args = (chapter_url, fic, web_site))
                get_chapters_thread.start()
                get_chapters_thread.join()
                print 'done.'
            #if the fiction has been inserted into the database before
            else:
                #get the max index of chapters
                try:
                    chapter_index = ChapterIndex.objects.get(fiction = fic.id, web_site = web_site.title)
                except:
                    continue
                chapter_index.id += 1
                chapter_index.save()
                #get the chapter
                try:
                    chap = Chapter.objects.get(fiction = fic, index = chapter_index.id)
                except:
                    chap = Chapter(chapter_title = chapter,
                        charpter_url = newest_chapter_url,
                        fiction = fic,
                        source = web_site,
                        index = chapter_index.id,
                        through = '0',  # from update thread
                        )
                    chap.save()
                try:
                    chapter_url = ChapterUrl.objects.get(url = chapter_url)
                except:
                    chapter_url = ChapterUrl(url = chapter_url,
                        chapter = chap,
                        fiction = fic,
                        index = chapter_index.id,
                        name = web_site.title)
                    chapter_url.save()
            #save into newest chapter
            try:
                NewestChapter.objects.create(chapter_title = chapter,
                    charpter_url = newest_chapter_url,
                    fiction = fic,
                    source = web_site,
                    index = 0,
                    )
            except:
                continue
Example #12
    def add(self, tid, title, field, score = 0):
        '''Build the index'''

        # Data check
        if tid is None or tid == '' \
                or title is None or title == '' \
                or field is None or field == '':
            return False

        # Original data
        self.tid = tid
        self.title = title
        self.field = field
        self.data = {'tid' : self.tid, 'title' : self.title, 'field' : self.field}
        self.score = score

        # Save the original data to a hash
        # self.r.hset(self.field, self.tid, sj.dumps(self.data))
        self.pipeline.hset(self.field, self.tid, sj.dumps(self.data))
        # With the line below, the data could be fetched directly via the GET parameter of the SORT command
        # self.pipeline.set('%s:%s' % (self.field, self.tid), sj.dumps(self.data))

        # Word segmentation
        algor = mmseg.Algorithm(self.title)

        words = []
        for tok in algor:
            # Case-insensitive
            word = tok.text.decode('utf-8').lower()
            words.append(word)
            # Build the inverted index, keyed by segmented word
            #self.r.sadd('%s:%s' % (self.field, word), self.tid)
            self.pipeline.sadd('%s:%s' % (self.field, word), self.tid)
        # Score used for ranking search results
        # self.r.set('%s:score:%s' % (self.field, self.tid), self.score)
        self.pipeline.set('%s:score:%s' % (self.field, self.tid), self.score)

        # Prefix index
        if self.config.prefix_index_enable is True:
            # Case-insensitive
            word = self.title.decode('utf-8').lower()
            # The prefix index does not include the segmented words
            del words[:]
            words.append(word)
            # Build the inverted index, keyed by the full title
            # self.r.sadd('%s:%s' % (self.field, word), self.tid)
            self.pipeline.sadd('%s:%s' % (self.field, word), self.tid)

            dic = []
            for w in words:
                for i in range(len(w)):
                    prefix = w[:i+1]
                    dic.append(prefix)
                    dic.append(0.0)
                    #print prefix.encode('utf-8')
                # Add an extra entry for the complete word, marked with '*'
                prefix = '%s%s' % (w, '*')
                dic.append(prefix)
                dic.append(0.0)
            # self.r.zadd('compl:%s' % (self.field), *dic)
            self.pipeline.zadd('compl:%s' % (self.field), *dic)

        self.pipeline.execute()
        return True
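The prefix entries written by add() (and by index() in Example #10) follow the classic Redis autocomplete layout: every prefix of the lowercased title is stored at score 0 in compl:<field>, and the complete word gets an extra entry terminated with '*'. The original snippets do not show the matching lookup, so the following is only a sketch of how such a completion query could look, assuming that key layout and an existing redis client r; the function name complete() is hypothetical.

# Hypothetical completion lookup for the 'compl:<field>' sorted set built by
# add()/index(); `r` is assumed to be a redis.Redis client and `prefix` a
# UTF-8 byte string typed by the user.
def complete(r, field, prefix, count=10):
    prefix = prefix.decode('utf-8').lower()
    key = 'compl:%s' % field
    results = []
    start = r.zrank(key, prefix)          # position of the prefix itself
    if start is None:                     # prefix not in the index at all
        return results
    while len(results) < count:
        entries = r.zrange(key, start, start + 49)
        if not entries:
            break
        start += len(entries)
        for entry in entries:
            entry = entry.decode('utf-8')
            if not entry.startswith(prefix):
                return results            # past the range of matching entries
            if entry.endswith('*'):       # complete words are marked with '*'
                results.append(entry[:-1])
                if len(results) >= count:
                    return results
    return results

As in the add() code, lookups are case-insensitive because everything was lowercased before indexing.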
Example #13
class Search(object):
    '''Search class'''
    def __init__(self, *args, **kwargs):
        # Argument check
        if args:
            if len(args) % 2 != 0:
                raise ParameterError(
                    "Config requires an equal number of values and scores")
        # Dynamically initialize instance attributes
        for i in range(len(args) / 2):
            setattr(self, args[i * 2], args[i * 2 + 1])
        for key in kwargs:
            setattr(self, key, kwargs[key])
        # redis
        pool = redis.ConnectionPool(host=self.config.redis['host'],
                                    port=self.config.redis['port'],
                                    db=self.config.redis['db'])
        self.r = redis.Redis(connection_pool=pool)
        # self.r = redis.StrictRedis(host=self.config.redis['host'], port=self.config.redis['port'], db=self.config.redis['db'])
        self.pipeline = self.r.pipeline()
        # Load the segmentation dictionaries
        mmseg.dict_load_defaults()

    def query(self, field, keyword, count):
        '''Keyword search'''

        results = []
        if keyword is None or keyword == '':
            return results

        # Convert to UTF-8
        try:
            keyword = keyword.encode('utf-8')
        except UnicodeDecodeError, e:
            pass

        max_length = max(50, count)
        keyword = keyword.lower()

        # Word segmentation
        algor = mmseg.Algorithm(keyword)
        for tok in algor:
            # Case-insensitive
            word = tok.text.decode('utf-8').lower()
            results.append(word)

        # Key for the intersection of all keyword results
        temp_store_key = 'tmpinter:%s:%s' % (field, '+'.join(results))
        # The cached intersection of the keyword results does not exist yet
        if self.r.exists(temp_store_key) is False:
            # Cache the intersection of the keyword results
            cnt = self.r.sinterstore(
                temp_store_key,
                ['%s:%s' % (field, result) for result in results])
            if cnt == 0:
                return []
            # Cache TTL
            self.r.expire(temp_store_key, 60 * 5)

        # If the matching keys were created, the data could be fetched directly via sort
        # return self.r.sort(temp_store_key, by='%s:score:%s' % (field, '*'), get='%s:%s' % (field, '*'), start=0, num=count, desc=True)

        # Get the ids
        ids = self.r.sort(temp_store_key,
                          by='%s:score:%s' % (field, '*'),
                          start=0,
                          num=count,
                          desc=True)
        # Get the data
        return self.r.hmget(field, ids)
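A hypothetical way to exercise the Search class above, assuming its module-level imports (redis, pymmseg, simplejson) are in place and a Redis server is reachable; the Config object and the 'topic' field name are illustrative, not part of the original code.

# -*- coding: utf-8 -*-
# Hypothetical usage of Search; the config attributes mirror what the class
# reads (config.redis and config.prefix_index_enable), but the values are assumptions.
class Config(object):
    redis = {'host': 'localhost', 'port': 6379, 'db': 0}
    prefix_index_enable = True

searcher = Search(config=Config())
for row in searcher.query('topic', '交换机', 10):
    print row    # each row is the JSON blob stored by add()/index(), or None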
Example #14
def search(request):
    hot_keys = get_hot_keys(40)
    r.incr('visit:search:count')
    visit_counts()
    suggestion = search_suggestion(db['product'])
    mmseg_keys = []
    collection = db['product']
    access_token = request.session.get('access_token', None)
    expires_in = request.session.get('expires_in', None)
    uid = request.session.get('uid', None)

    profile = db['user'].find({"_id": uid})
    if profile.count() == 0:
        profile = None
    else:
        profile = profile[0]
    #chinese word segmentation engine
    mmseg.Dictionary.load_dictionaries()
    db_size = collection.count()
    template = loader.get_template('search.html')
    keywords = request.GET['keys'].encode('utf-8')
    current_url = request.path + '?keys=' + keywords
    seg_keys = []
    patterns = []
    search_str = ''

    mmseg_keys_temp = mmseg.Algorithm(keywords)
    for tok in mmseg_keys_temp:
        mmseg_keys.append(tok.text)

    if len(keywords) > 30:
        algor = mmseg.Algorithm(keywords)
        for tok in algor:
            seg_keys.append(tok.text)
            patterns.append(re.compile('.*%s.*' % tok.text))
            search_str = search_str + '.*' + tok.text + '.*|'
    else:
        algor = keywords.split(' ')
        for tok in algor:
            #add to redis search statistics
            seg_keys.append(tok.strip())
            patterns.append(re.compile('.*%s.*' % tok.strip()))
            search_str = search_str + '.*' + tok + '.*|'

    #restricted search
    result_list = collection.find({"ProductName": {"$all": patterns}})
    if result_list.count() == 0:
        #the restricted search returned nothing, so fall back to the loose search method
        search_str = search_str.rstrip('|')
        pattern = re.compile(search_str)
        result_list = collection.find({"ProductName": pattern})
    if keywords.strip() == '':
        result_list = None

    if result_list and result_list.count() >= 1:
        algor = keywords.split(' ')
        for tok in algor:
            try:
                if tok.strip() != '':
                    r.zincrby('search_keywords', tok)
            except:
                print 'error redis search static'
        after_range_num = 3
        befor_range_num = 4
        try:
            page = int(request.GET.get("page", 1))
            if page < 1:
                page = 1
        except ValueError:
            page = 1
        paginator = Paginator(result_list, 10)
        try:
            search_result = paginator.page(page)
        except (EmptyPage, InvalidPage, PageNotAnInteger):
            search_result = paginator.page(paginator.num_pages)
            if page >= after_range_num:
                page_range = paginator.page_range[page - after_range_num:page +
                                                  befor_range_num]
        else:
            page_range = paginator.page_range[0:int(page) + befor_range_num]
    else:
        algor = keywords.split(' ')
        for tok in algor:
            r.zincrby('search_keywords_not_exist', tok)
        search_result = None
        page_range = None

    most_like_item = get_most_like_items()
    MostLikeList = []
    for mll in most_like_item:
        try:
            rresult = db['product'].find({"ProductID": mll})[0]
        except:
            rresult = None
        if rresult:
            recommend = {
                "pid": rresult['ProductID'],
                "cover": rresult['MorePhotos'],
                "title": rresult['ProductName'],
                "price": rresult['ProductPrice']
            }
            MostLikeList.append(recommend)
    if len(MostLikeList) == 0:
        MostLikeList = None

    params = Context({
        "MostLikeList": MostLikeList,
        "mmseg_keys": mmseg_keys,
        "hotkeys": hot_keys,
        "current_url": current_url,
        'page_range': page_range,
        'userProfile': profile,
        'result_list': search_result,
        'instant_search': suggestion,
        'search_key_words': seg_keys,
        'system_version': version,
        'database_size': db_size
    })
    return HttpResponse(template.render(params))