コード例 #1
0
def zsite2keyword(z):
    """Build a keyword -> weight mapping for a zsite object.

    Weights: words from the name and the URL count 2; words from the
    site motto or the user's profile text count 1.  For users, the mail
    address and its local part count 2, and each career entry's unit,
    title and text count 2 for the first entry and 1 for later ones.

    Fixes vs. original: removed the unused ``r = []`` and
    ``man_txt_len`` locals, and stopped shadowing the builtin ``id``.

    :param z: zsite-like object exposing ``id``, ``name`` and ``cid``.
    :return: ``defaultdict(int)`` mapping keyword -> accumulated weight.
    """
    t = defaultdict(int)
    zsite_id = z.id  # renamed: don't shadow the builtin ``id``

    name = z.name
    if name:
        for word in seg_title_search(name):
            t[word] += 2

    url = url_by_id(zsite_id)
    if url:
        t[url] += 2

    if z.cid == CID_SITE:
        for word in seg_title_search(motto_get(zsite_id)):
            t[word] += 1

    elif z.cid == CID_USER:
        mail = mail_by_user_id(zsite_id)
        if mail:
            t[mail] += 2
            # local part before the '@' is also a useful keyword
            t[mail.split('@', 1)[0]] += 2

        txt = txt_get(zsite_id)
        if txt:
            for word in seg_txt_search(txt):
                t[word] += 1

        for seq, career in enumerate(career_list_all(zsite_id)):
            # first (presumably most recent) career entry weighs more
            add = 2 if seq == 0 else 1

            # unit/title are short -> title segmenter; txt -> text segmenter
            for value, segment in (
                (career.unit, seg_title_search),
                (career.title, seg_title_search),
                (career.txt, seg_txt_search),
            ):
                if value:
                    for word in segment(value):
                        t[word] += add

    return t
コード例 #2
0
ファイル: zsite_iter.py プロジェクト: xqk/42qu_github_mirror
def zsite2keyword(z):
    """Build a keyword -> weight mapping for a zsite object.

    Weights: words from the name and the URL count 2; words from the
    site motto or the user's profile text count 1.  For users, the mail
    address and its local part count 2, and each career entry's unit,
    title and text count 2 for the first entry and 1 for later ones.

    Fixes vs. original: removed the unused ``r = []`` and
    ``man_txt_len`` locals, and stopped shadowing the builtin ``id``.

    :param z: zsite-like object exposing ``id``, ``name`` and ``cid``.
    :return: ``defaultdict(int)`` mapping keyword -> accumulated weight.
    """
    t = defaultdict(int)
    zsite_id = z.id  # renamed: don't shadow the builtin ``id``

    name = z.name
    if name:
        for word in seg_title_search(name):
            t[word] += 2

    url = url_by_id(zsite_id)
    if url:
        t[url] += 2

    if z.cid == CID_SITE:
        for word in seg_title_search(motto_get(zsite_id)):
            t[word] += 1

    elif z.cid == CID_USER:
        mail = mail_by_user_id(zsite_id)
        if mail:
            t[mail] += 2
            # local part before the '@' is also a useful keyword
            t[mail.split('@', 1)[0]] += 2

        txt = txt_get(zsite_id)
        if txt:
            for word in seg_txt_search(txt):
                t[word] += 1

        for seq, career in enumerate(career_list_all(zsite_id)):
            # first (presumably most recent) career entry weighs more
            add = 2 if seq == 0 else 1

            # unit/title are short -> title segmenter; txt -> text segmenter
            for value, segment in (
                (career.unit, seg_title_search),
                (career.title, seg_title_search),
                (career.txt, seg_txt_search),
            ):
                if value:
                    for word in segment(value):
                        t[word] += add

    return t
コード例 #3
0
def split_words(text):
    """Return the search-segmentation tokens of *text* as a list.

    :param text: string to segment.
    :return: list of tokens produced by ``seg_txt_search``.
    """
    # list() over a manual append loop: same result, idiomatic and faster.
    return list(seg_txt_search(text))
コード例 #4
0
def split_words(text):
    """Return the search-segmentation tokens of *text* as a list.

    :param text: string to segment.
    :return: list of tokens produced by ``seg_txt_search``.
    """
    # list() over a manual append loop: same result, idiomatic and faster.
    # (Also drops the trailing-whitespace line from the original.)
    return list(seg_txt_search(text))
コード例 #5
0
ファイル: seg.py プロジェクト: huanghao/bookworm
def guess_keywords(querystring, known_prefix=None):
    """Turn *querystring* into a set of xapian search terms.

    Whitespace-separated pieces of the form ``qprefix:rest`` whose
    ``qprefix`` appears in *known_prefix* contribute terms tagged with
    the mapped xapian prefix.  Words starting with an ASCII character
    are stemmed as English; any other word flips the chinese flag.

    :param querystring: raw user query string.
    :param known_prefix: optional mapping of query prefix -> xapian prefix.
    :return: ``(contains_chinese, terms)`` where ``contains_chinese`` is
        0/1 and ``terms`` is a list of unique terms.
    """
    stemmer = xapian.Stem('en')
    terms = set()
    has_cjk = 0

    for token in querystring.split():
        mapped = None
        if known_prefix and ':' in token:
            head, tail = token.split(':', 1)
            if head in known_prefix:
                mapped = known_prefix[head]
                token = tail

        for word in seg_txt_search(token):
            if ord(word[0]) <= 127:
                word = stemmer(word)  # english: stem it
            else:
                has_cjk = 1
            terms.add(mapped + word if mapped else word)

    return has_cjk, list(terms)
コード例 #6
0
def guess_keywords(querystring, known_prefix=None):
    """Turn *querystring* into a set of xapian search terms.

    Whitespace-separated pieces of the form ``qprefix:rest`` whose
    ``qprefix`` appears in *known_prefix* contribute terms tagged with
    the mapped xapian prefix.  Words starting with an ASCII character
    are stemmed as English; any other word flips the chinese flag.

    :param querystring: raw user query string.
    :param known_prefix: optional mapping of query prefix -> xapian prefix.
    :return: ``(contains_chinese, terms)`` where ``contains_chinese`` is
        0/1 and ``terms`` is a list of unique terms.
    """
    stemmer = xapian.Stem('en')
    terms = set()
    has_cjk = 0

    for token in querystring.split():
        mapped = None
        if known_prefix and ':' in token:
            head, tail = token.split(':', 1)
            if head in known_prefix:
                mapped = known_prefix[head]
                token = tail

        for word in seg_txt_search(token):
            if ord(word[0]) <= 127:
                word = stemmer(word)  # english: stem it
            else:
                has_cjk = 1
            terms.add(mapped + word if mapped else word)

    return has_cjk, list(terms)
コード例 #7
0
def split_words(text):
    """Return the search-segmentation tokens of *text* as a list.

    :param text: string to segment.
    :return: list of tokens produced by ``seg_txt_search``.
    """
    # list() instead of the redundant identity comprehension
    # ``[i for i in ...]`` (flake8-comprehensions C416).
    return list(seg_txt_search(text))