Ejemplo n.º 1
0
def seg_txt_search(txt):
    """Return the search tokens that ``seg_txt`` produces for *txt*.

    Filtering rules, applied to each token in order:

    * a token for which ``isalnum()`` is true is kept, lower-cased;
    * any other token is decoded from UTF-8 (undecodable bytes dropped);
      - a decoded token of exactly one character is kept only when it lies
        between U+4E00 and U+9FA5 inclusive and is not in ``SMALLCHAR``;
      - a decoded token of any other length is kept unchanged.

    Every ``unicode`` entry is re-encoded to UTF-8 before returning.

    NOTE(review): relies on ``unicode`` and ``str.decode``, so this looks
    like Python 2 code; ``seg_txt`` and ``SMALLCHAR`` are defined elsewhere.
    """
    tokens = []
    for piece in seg_txt(txt):
        if piece.isalnum():
            tokens.append(piece.lower())
            continue

        decoded = piece.decode("utf-8", "ignore")
        if len(decoded) != 1:
            # Multi-character (or empty) tokens pass through unfiltered.
            tokens.append(decoded)
        elif u"一" <= decoded <= u"龥" and decoded not in SMALLCHAR:
            tokens.append(decoded)

    # Exact-type check on purpose: only plain ``unicode`` objects are encoded.
    return [
        tok.encode("utf-8", "ignore") if type(tok) is unicode else tok
        for tok in tokens
    ]
Ejemplo n.º 2
0
def seg_txt_search(txt):
    """Segment *txt* via ``seg_txt`` and filter the pieces for searching.

    Alphanumeric pieces are lower-cased and kept. Every other piece is
    decoded from UTF-8 with errors ignored; a one-character result is
    dropped unless it falls in the range U+4E00..U+9FA5 and is absent from
    ``SMALLCHAR``, while results of any other length are always kept.

    Returns a list in which each ``unicode`` item has been encoded back to
    UTF-8; items of any other type are returned untouched.

    NOTE(review): ``seg_txt`` and ``SMALLCHAR`` come from outside this
    block, and ``unicode`` implies a Python 2 runtime - confirm.
    """
    kept = []
    keep = kept.append
    for raw in seg_txt(txt):
        if raw.isalnum():
            keep(raw.lower())
        else:
            text = raw.decode("utf-8", "ignore")
            is_single = len(text) == 1
            # Only single characters are subject to the range/SMALLCHAR test.
            if not is_single or (
                u"一" <= text <= u"龥" and text not in SMALLCHAR
            ):
                keep(text)

    encoded = []
    for item in kept:
        if type(item) is unicode:
            item = item.encode("utf-8", "ignore")
        encoded.append(item)
    return encoded
Ejemplo n.º 3
0
def seg_title_search(txt):
    """Return the search tokens that ``seg_txt`` produces for a title.

    Per token:

    * alphanumeric tokens are kept, lower-cased;
    * other tokens are decoded from UTF-8 (undecodable bytes dropped);
      - a single decoded character is kept only if it lies between U+4E00
        and U+9FA5 inclusive;
      - a decoded token of length 0 or 2 is kept as is, and a longer one is
        replaced by whatever ``word_len2`` returns for it;
      - unless the decoded token re-encodes to an alphanumeric byte string,
        each of its characters not already in the result is appended too.

    All ``unicode`` entries are encoded to UTF-8 before returning.

    NOTE(review): ``seg_txt`` and ``word_len2`` are defined elsewhere;
    ``word_len2`` presumably yields two-character pieces - confirm.
    """
    tokens = []
    for piece in seg_txt(txt):
        if piece.isalnum():
            tokens.append(piece.lower())
            continue

        text = piece.decode("utf-8", "ignore")
        size = len(text)

        if size == 1:
            if u"一" <= text <= u"龥":
                tokens.append(text)
            continue

        if size > 2:
            tokens.extend(word_len2(text))
        else:
            tokens.append(text)

        if text.encode("utf-8").isalnum():
            continue

        # Membership is tested against the growing list, so a character
        # repeated inside ``text`` is added at most once.
        for char in text:
            if char not in tokens:
                tokens.append(char)

    return [
        tok.encode("utf-8", "ignore") if type(tok) is unicode else tok
        for tok in tokens
    ]
Ejemplo n.º 4
0
def seg_title_search(txt):
    """Segment the title *txt* via ``seg_txt`` and expand it into tokens.

    Alphanumeric pieces are lower-cased and kept. Every other piece is
    decoded from UTF-8 with errors ignored, then:

    * length 1: kept only when within U+4E00..U+9FA5;
    * any other length: pieces longer than two characters are replaced by
      the output of ``word_len2``, shorter ones are kept whole; if the
      piece does not re-encode to an alphanumeric byte string, its
      individual characters are appended as well, skipping any character
      already present in the result.

    Returns a list whose ``unicode`` items have been encoded to UTF-8.

    NOTE(review): ``seg_txt`` and ``word_len2`` live outside this block;
    the use of ``unicode`` suggests Python 2 - confirm.
    """
    collected = []
    for segment in seg_txt(txt):
        if segment.isalnum():
            collected.append(segment.lower())
        else:
            uni = segment.decode("utf-8", "ignore")
            if len(uni) == 1:
                if u"一" <= uni <= u"龥":
                    collected.append(uni)
            else:
                collected.extend(
                    [uni] if len(uni) <= 2 else word_len2(uni)
                )
                if not uni.encode("utf-8").isalnum():
                    # Must stay a loop: ``collected`` grows while iterating,
                    # which de-duplicates repeated characters.
                    for ch in uni:
                        if ch not in collected:
                            collected.append(ch)

    final = []
    for entry in collected:
        if type(entry) is unicode:
            final.append(entry.encode("utf-8", "ignore"))
        else:
            final.append(entry)
    return final