def remap(multisegs):
    """Merge several segmentations into one re-segmented sentence.

    Each element of ``multisegs`` is a UTF-8 encoded byte string. A bit
    vector is built for every element with ``construct_bv`` (a 0 bit marks
    a boundary), all vectors are AND-ed together, and the characters of the
    FIRST element are regrouped at the surviving 0 bits.

    Returns the regrouped words joined by single spaces, UTF-8 encoded.

    NOTE(review): presumably every element is a different segmentation of
    the same sentence -- confirm with callers.
    """
    # Only the first segmentation supplies the characters to regroup.
    sentence = unicode(multisegs[0], 'utf-8')
    char_list = chinese_utils.segment_into_chars_rs(sentence)

    # One boundary bit vector per segmentation (0 == boundary).
    vectors = [construct_bv(segmentation) for segmentation in multisegs]

    # AND everything together: a position stays 0 if any input had a
    # boundary there.
    merged = vectors[0]
    for other in vectors[1:]:
        merged = merged & other

    # Walk the merged vector and cut a word at every boundary that lies
    # past the start of the word currently being collected.
    pieces = []
    word_start = 0
    for pos in range(len(merged)):
        if merged[pos] != 0:
            continue
        if pos > word_start:
            pieces.append(''.join(char_list[word_start:pos]))
            word_start = pos

    return u' '.join(pieces).encode('utf-8')
Example #2
0
def remap(multisegs):
    """Re-segment a sentence using the combined boundaries of ``multisegs``.

    ``multisegs`` is a sequence of UTF-8 encoded strings. Every entry is
    turned into a boundary bit vector by ``construct_bv`` (0 bits are
    boundaries); the vectors are AND-ed, and the characters taken from
    ``multisegs[0]`` are split wherever the combined vector holds a 0.

    Returns a UTF-8 encoded string of the resulting words separated by
    single spaces.
    """
    decoded_first = unicode(multisegs[0], 'utf-8')
    chars = chinese_utils.segment_into_chars_rs(decoded_first)

    # Build the per-segmentation bit vectors, then fold them with &.
    bit_vectors = []
    for segmentation in multisegs:
        bit_vectors.append(construct_bv(segmentation))

    combined = bit_vectors[0]
    for extra in bit_vectors[1:]:
        combined = combined & extra

    # Positions whose bit is 0 are the cut points of the new segmentation.
    cut_points = [p for p in range(len(combined)) if combined[p] == 0]

    words = []
    begin = 0
    for cut in cut_points:
        # A cut at (or before) the current start would yield an empty word.
        if cut > begin:
            words.append(''.join(chars[begin:cut]))
            begin = cut

    joined = u' '.join(words)
    return joined.encode('utf-8')
Example #3
0
def seg(line,dict):
    """Segment a UTF-8 encoded ``line`` with ``word_segment``.

    The line is decoded, split into characters, and the character list is
    reversed in place before being handed to ``word_segment`` together with
    ``dict``. Every item returned is UTF-8 encoded and followed by a single
    space, so a non-empty result ends with a trailing space.

    NOTE(review): the reversal suggests ``word_segment`` scans from the end
    of the sentence -- confirm against its implementation.
    """
    decoded = unicode(line, "utf-8")
    reversed_chars = chinese_utils.segment_into_chars_rs(decoded)
    reversed_chars.reverse()

    words = word_segment(reversed_chars, dict)

    # Build each "<word> " chunk and concatenate them in one pass.
    chunks = ["%s " % words[k].encode("utf-8") for k in range(len(words))]
    return "".join(chunks)
def construct_bv(seg):
    """Build the boundary bit vector for one UTF-8 encoded segmentation.

    The vector has one more bit than the number of characters returned by
    ``chinese_utils.segment_into_chars_rs``. All bits start at 1; the start
    and end index of every word reported by ``compile_word_boundaries`` is
    then cleared to 0.
    """
    char_count = len(chinese_utils.segment_into_chars_rs(unicode(seg, 'utf-8')))

    bits = BitVector.BitVector(size=char_count + 1)
    # Start from "no boundary anywhere".
    for pos in range(len(bits)):
        bits[pos] = 1

    # Clear both edges of every word.
    for boundary in compile_word_boundaries(seg):
        word_begin, word_end = boundary[0], boundary[1]
        bits[word_begin] = 0
        bits[word_end] = 0

    return bits
Example #5
0
def construct_bv(seg):
    """Return a BitVector marking the word boundaries of ``seg`` with 0 bits.

    ``seg`` is a UTF-8 encoded segmentation. The vector size is the
    character count (per ``chinese_utils.segment_into_chars_rs``) plus one.
    Every bit is first set to 1, then the two indices of each
    ``[start, end]`` pair from ``compile_word_boundaries`` are set to 0.
    """
    decoded = unicode(seg, 'utf-8')
    n_chars = len(chinese_utils.segment_into_chars_rs(decoded))
    vector = BitVector.BitVector(size=n_chars + 1)

    # Fill the whole vector with ones.
    index = 0
    while index < len(vector):
        vector[index] = 1
        index += 1

    # Zero out the start and end position of each word.
    for span in compile_word_boundaries(seg):
        for edge in (span[0], span[1]):
            vector[edge] = 0

    return vector
Example #6
0
def compute_char_belong_to(f):
    """Map every character of ``f`` to the index of the word containing it.

    ``f`` is a UTF-8 encoded, whitespace-separated segmentation. The result
    is a flat list with one entry per character (as produced by
    ``chinese_utils.segment_into_chars_rs`` for each word); each entry is
    the 0-based index of that character's word.
    """
    words = unicode(f, "utf-8").split()
    return [
        word_no
        for word_no, word in enumerate(words)
        for _ in chinese_utils.segment_into_chars_rs(word)
    ]
def compute_char_belong_to(f):
    """Return, for each character in ``f``, the index of its word.

    ``f`` is decoded from UTF-8 and split on whitespace. For every word the
    0-based word index is repeated once per character that
    ``chinese_utils.segment_into_chars_rs`` yields for that word.
    """
    owners = []
    for position, token in enumerate(unicode(f, "utf-8").split()):
        token_chars = chinese_utils.segment_into_chars_rs(token)
        # One copy of the word index for each character of the word.
        owners.extend([position] * len(token_chars))

    return owners
Example #8
0
def compile_word_boundaries(f):
    """List the ``[start, end]`` character offsets of every word in ``f``.

    ``f`` is a UTF-8 encoded, whitespace-separated segmentation. Offsets
    count the characters produced by
    ``chinese_utils.segment_into_chars_rs`` for each word; ``start`` is
    inclusive and ``end`` is the offset just past the word, so consecutive
    words share a boundary value.
    """
    boundaries = []
    cursor = 0
    for word in unicode(f, "utf-8").split():
        begin = cursor
        cursor += len(chinese_utils.segment_into_chars_rs(word))
        boundaries.append([begin, cursor])

    return boundaries
def compile_word_boundaries(f):
    """Compute character-offset pairs for the words of a segmentation.

    The UTF-8 encoded input ``f`` is decoded and split on whitespace. For
    each word a two-element list ``[start, end]`` is produced, where the
    difference is the number of characters that
    ``chinese_utils.segment_into_chars_rs`` returns for the word and
    ``start`` equals the previous word's ``end`` (0 for the first word).
    """
    tokens = unicode(f, "utf-8").split()

    spans = []
    offset = 0
    for token in tokens:
        # Count the characters of this word to find where it ends.
        width = sum(1 for _ in chinese_utils.segment_into_chars_rs(token))
        spans.append([offset, offset + width])
        offset = offset + width

    return spans