Example #1
0
def _multicut(text: str, custom_dict: Trie = None):
    """
    Yield ``text`` piece by piece as LatticeString objects.

    The text is scanned from left to right.  Every dictionary word starting
    at a reachable position is recorded.  Whenever exactly one candidate
    position is left, the span since the previous yield is emitted together
    with a lazy generator of its "/"-joined segmentations.  When no
    dictionary word starts at the current position, a fallback chunk is
    emitted with ``in_dict=False``.

    :param text: text to segment
    :param custom_dict: trie offering ``prefixes()``; when falsy,
        ``DEFAULT_DICT_TRIE`` is used instead
    :return: generator of LatticeString
    """
    if not custom_dict:
        custom_dict = DEFAULT_DICT_TRIE

    text_len = len(text)
    words_from = defaultdict(list)  # start position -> words recorded there

    def serialize(start, stop):
        """Yield each "/"-joined word sequence covering ``start`` to ``stop``."""
        for word in words_from[start]:
            reached = start + len(word)
            if reached == stop:
                yield word
            elif reached < stop:
                yield from (
                    word + "/" + rest for rest in serialize(reached, stop)
                )

    frontier = {0}  # positions still waiting to be expanded
    flushed = 0  # everything before this index has already been yielded
    while min(frontier) < text_len:
        pos = min(frontier)
        frontier.discard(pos)

        for word in custom_dict.prefixes(text[pos:]):
            words_from[pos].append(word)
            frontier.add(pos + len(word))

        if len(frontier) == 1:
            # a single candidate position remains: emit the span up to it
            (meeting_point,) = frontier
            yield LatticeString(
                text[flushed:meeting_point], serialize(flushed, meeting_point)
            )
            flushed = meeting_point
        elif not frontier:
            # empty frontier: no dictionary word starts at ``pos``
            tail_match = _PAT_ENG.match(text[pos:])
            if tail_match:
                # original note: English, digits, whitespace
                end = pos + tail_match.end()
            else:
                # skip as little as possible
                end = pos
                while end < text_len:
                    remainder = text[end:]
                    dict_hits = custom_dict.prefixes(remainder)
                    pattern_hit = _PAT_ENG.match(remainder)
                    if dict_hits or pattern_hit:
                        break
                    end += 1
            chunk = text[pos:end]
            words_from[pos].append(chunk)
            yield LatticeString(chunk, in_dict=False)
            flushed = end
            frontier.add(end)
Example #2
0
def _multicut(text: str, custom_dict: Trie = None):
    """
    Yield ``text`` piece by piece as LatticeString objects.

    The text is scanned from left to right.  Every dictionary word starting
    at a reachable position is recorded.  Whenever exactly one candidate
    position is left, the span since the previous yield is emitted together
    with a lazy generator of its "/"-joined segmentations.  When no
    dictionary word starts at the current position, a fallback chunk is
    emitted with ``in_dict=False``.

    :param text: text to segment
    :param custom_dict: trie offering ``prefixes()``; when falsy,
        ``DEFAULT_DICT_TRIE`` is used instead
    :return: generator of LatticeString
    """
    if not custom_dict:
        custom_dict = DEFAULT_DICT_TRIE

    text_len = len(text)
    words_from = defaultdict(list)  # start position -> words recorded there

    def serialize(start, stop):
        """Yield each "/"-joined word sequence covering ``start`` to ``stop``."""
        for word in words_from[start]:
            reached = start + len(word)
            if reached == stop:
                yield word
            elif reached < stop:
                yield from (
                    word + "/" + rest for rest in serialize(reached, stop)
                )

    frontier = {0}  # positions still waiting to be expanded
    flushed = 0  # everything before this index has already been yielded
    while min(frontier) < text_len:
        pos = min(frontier)
        frontier.discard(pos)

        for word in custom_dict.prefixes(text[pos:]):
            words_from[pos].append(word)
            frontier.add(pos + len(word))

        if len(frontier) == 1:
            # a single candidate position remains: emit the span up to it
            (meeting_point,) = frontier
            yield LatticeString(
                text[flushed:meeting_point], serialize(flushed, meeting_point)
            )
            flushed = meeting_point
        elif not frontier:
            # empty frontier: no dictionary word starts at ``pos``
            tail_match = _PAT_ENG.match(text[pos:])
            if tail_match:
                # original note: English, digits, whitespace
                end = pos + tail_match.end()
            else:
                # skip as little as possible
                end = pos
                while end < text_len:
                    remainder = text[end:]
                    dict_hits = custom_dict.prefixes(remainder)
                    pattern_hit = _PAT_ENG.match(remainder)
                    if dict_hits or pattern_hit:
                        break
                    end += 1
            chunk = text[pos:end]
            words_from[pos].append(chunk)
            yield LatticeString(chunk, in_dict=False)
            flushed = end
            frontier.add(end)
Example #3
0
def onecut(text, data=None):
    """
    Yield the words of ``text`` one by one.

    Candidate words come from a trie; a word is only accepted when it ends
    on a position allowed by ``tcc_pos(text)``.  Whenever a single candidate
    position remains, the first path returned by ``bfs_paths_graph`` for the
    pending span is emitted.  Where no dictionary word fits, a fallback chunk
    is emitted.

    :param text: text to segment
    :param data: iterable of words used to build a custom ``Trie``.  ``None``
        (the default) or the legacy value ``['']`` selects ``THAI_WORDS``.
    :return: generator of substrings of ``text``
    """
    # ``['']`` used to be the (mutable) default value; it is still accepted
    # so existing callers that pass it explicitly keep the same behaviour.
    if data is None or data == ['']:
        trie = THAI_WORDS
    else:
        trie = Trie(data)

    text_len = len(text)
    graph = defaultdict(list)  # main data structure: start -> [end, ...]
    allow_pos = tcc_pos(text)  # cut positions must agree with TCC

    q = [0]  # min-heap queue
    last_p = 0  # last position for yield
    while q[0] < text_len:
        p = heappop(q)

        for w in trie.prefixes(text[p:]):
            p_ = p + len(w)
            if p_ in allow_pos:  # keep only ends consistent with TCC
                graph[p].append(p_)
                if p_ not in q:
                    heappush(q, p_)

        if len(q) == 1:
            # length 1: no ambiguity left, the pending span can be emitted
            pp = next(bfs_paths_graph(graph, last_p, q[0]))
            # pp[0] is last_p itself, so start from the second node
            for cut in pp[1:]:
                yield text[last_p:cut]
                last_p = cut
            # here last_p == q[0]
        elif not q:
            # length 0: nothing in the dictionary starts at p
            m = pat_eng.match(text[p:])
            if m:  # original note: English, digits, whitespace
                i = p + m.end()
            else:  # skip as little as possible
                for i in range(p + 1, text_len):
                    if i in allow_pos:  # respect TCC here as well
                        ww = [
                            w for w in trie.prefixes(text[i:])
                            if (i + len(w) in allow_pos)
                        ]
                        m = pat_eng.match(text[i:])
                        if ww or m:
                            break
                else:
                    i = text_len
            w = text[p:i]
            graph[p].append(i)
            yield w
            last_p = i
            heappush(q, i)
Example #4
0
def _onecut(text: str, custom_dict: Trie):
    """
    Yield the words of ``text`` one by one using ``custom_dict``.

    A dictionary word is accepted only when it ends on a position allowed by
    ``tcc_pos(text)``.  Whenever a single candidate position remains, the
    first path returned by ``_bfs_paths_graph`` for the pending span is
    emitted.  Where no dictionary word fits, a fallback chunk is emitted.

    :param text: text to segment
    :param custom_dict: trie offering ``prefixes()``
    :return: generator of substrings of ``text``
    """
    text_len = len(text)
    edges = defaultdict(list)  # start position -> reachable end positions
    valid_cuts = tcc_pos(text)  # cut positions must be aligned with TCC

    heap = [0]  # min-heap of positions to expand
    emitted = 0  # everything before this index has already been yielded
    while heap[0] < text_len:
        begin = heappop(heap)

        for word in custom_dict.prefixes(text[begin:]):
            stop = begin + len(word)
            if stop not in valid_cuts:  # keep only ends consistent with TCC
                continue
            edges[begin].append(stop)
            if stop not in heap:
                heappush(heap, stop)

        if len(heap) == 1:
            # one candidate left: no ambiguity, emit the pending span
            route = next(_bfs_paths_graph(edges, emitted, heap[0]))
            # route[0] is ``emitted`` itself, so start from the second node
            for cut in route[1:]:
                yield text[emitted:cut]
                emitted = cut
            # here emitted == heap[0]
        elif not heap:
            # nothing left: no dictionary word starts at ``begin``
            head_match = _PAT_ENG.match(text[begin:])
            if head_match:
                # original note: English, digits, whitespace
                end = begin + head_match.end()
            else:
                # skip as little as possible
                end = begin + 1
                while end < text_len:
                    if end in valid_cuts:  # respect TCC here as well
                        remainder = text[end:]
                        has_word = any(
                            end + len(cand) in valid_cuts
                            and not _PAT_TWOCHARS.match(cand)
                            for cand in custom_dict.prefixes(remainder)
                        )
                        if has_word or _PAT_ENG.match(remainder):
                            break
                    end += 1
            chunk = text[begin:end]
            edges[begin].append(end)
            yield chunk
            emitted = end
            heappush(heap, end)
Example #5
0
def tcut(text):
    """
    Yield ``text`` piece by piece as LatticeString objects.

    A trie is built from ``get_data()`` on every call.  Whenever exactly one
    candidate position is left, the span since the previous yield is emitted
    together with a lazy generator of its '/'-joined segmentations.  A
    stretch where no dictionary word starts is emitted with
    ``in_dict=False`` after its spaces have been removed.
    """
    dictionary = Trie(get_data())
    words_from = defaultdict(list)  # start position -> words recorded there

    def serialize(start, stop):
        # yield each '/'-joined word sequence covering start..stop
        for word in words_from[start]:
            reached = start + len(word)
            if reached == stop:
                yield word
            elif reached < stop:
                for rest in serialize(reached, stop):
                    yield word + '/' + rest

    frontier = {0}  # positions still waiting to be expanded
    flushed = 0  # everything before this index has already been yielded
    while min(frontier) < len(text):
        pos = min(frontier)
        frontier.discard(pos)

        for word in dictionary.prefixes(text[pos:]):
            words_from[pos].append(word)
            frontier.add(pos + len(word))

        if len(frontier) == 1:
            (meeting_point,) = frontier
            yield LatticeString(
                text[flushed:meeting_point], serialize(flushed, meeting_point)
            )
            flushed = meeting_point
        elif not frontier:
            # empty frontier: no dictionary word starts at ``pos``;
            # skip as little as possible
            end = pos
            while end < len(text):
                if dictionary.prefixes(text[end:]):
                    break
                end += 1
            chunk = text[pos:end].replace(' ', '')  # drop spaces
            words_from[pos].append(chunk)
            yield LatticeString(chunk, in_dict=False)
            flushed = end
            frontier.add(end)
Example #6
0
def test():
    """
    Exercise a Trie of English number words against the module-level TEXT.

    Raises RuntimeError when the sum collected from sentence 2 is not 142 or
    when 'nine' is missing from the trie's keys.
    """
    # 1. build a trie
    # NOTE(review): 'fourteen' maps to 10 and 'fourty' is spelled this way in
    # the original; the expected total below may depend on both - confirm
    # before changing either.
    values = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 10,
        'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
        'nineteen': 19, 'twenty': 20, 'thirty': 30, 'fourty': 40,
        'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
        'ninety': 90, 'hundred': 100,
    }
    t = Trie(list(values))

    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # sentence 1: follow only the last prefix returned at each position
        pos = S1[0]
        while pos < S1[1]:
            found = list(t.prefixes(TEXT[pos:S1[1]]))
            pos += len(found[-1]) if found else 1

        # sentence 2: add up the value of every match at every position
        total = 0
        for start in range(S2[0], S2[1]):
            for key in t.prefixes(TEXT[start:S2[1]]):
                total += values[key]
        if total != 142:
            raise RuntimeError(str(total))

    # 3. make a real list of all keys in the trie
    if 'nine' not in list(t.iterkeys()):
        raise RuntimeError(str(list(t.iterkeys())))
Example #7
0
class GSuiteTrackCache(object):
    """
    Shelve-backed mapping from GSuite track URIs to Galaxy URIs.

    Only tracks whose URI starts with one of the prefixes listed in
    ``URI_PREFIXES_FN`` (one prefix per line) are reported as cacheable by
    ``shouldBeCached``.
    """

    CACHE_DIRECTORY = os.path.join(DATA_FILES_PATH, 'GSuiteTrackCache')
    URI_PREFIXES_FN = CACHE_DIRECTORY + os.sep + 'UriPrefixesToCache.txt'
    # A leading path separator is stripped from URL_PREFIX so that
    # os.path.join does not treat it as an absolute path.
    CACHE_SHELVE_FN = os.path.join(
        CACHE_DIRECTORY,
        (URL_PREFIX[1:] if URL_PREFIX.startswith(os.path.sep) else URL_PREFIX),
        'GSuiteTrackCache.shelve')

    # protocol value passed to safeshelve.open
    PROTOCOL = 0

    def __init__(self):
        """Create the prefix file and the shelve if missing, then load the prefixes."""
        if not os.path.exists(self.URI_PREFIXES_FN):
            ensurePathExists(self.URI_PREFIXES_FN)
            # Create an empty prefix file; the handle is closed right away.
            with open(self.URI_PREFIXES_FN, 'w'):
                pass

        if not os.path.exists(self.CACHE_SHELVE_FN):
            ensurePathExists(self.CACHE_SHELVE_FN)

        # Opening in 'c' mode and closing again makes sure the shelve exists
        # before it is later opened read-only.
        cache = self._openShelve('c')
        cache.close()

        with open(self.URI_PREFIXES_FN, 'r') as prefixFile:
            prefixList = [line.strip() for line in prefixFile]
        self._uriPrefixes = Trie(prefixList)

    def _openShelve(self, mode):
        """Open the cache shelve with the given mode flag ('r', 'c', ...)."""
        return safeshelve.open(self.CACHE_SHELVE_FN,
                               mode,
                               protocol=self.PROTOCOL)

    def isCached(self, gSuiteTrack):
        """Return True if the URI of ``gSuiteTrack`` is a key of the cache."""
        cache = self._openShelve('r')
        try:
            return gSuiteTrack.uri in cache
        finally:
            cache.close()

    def getCachedGalaxyUri(self, gSuiteTrack):
        """
        Return the value stored for the URI of ``gSuiteTrack``.

        Whatever the shelve raises for a missing key propagates to the
        caller; the shelve is closed in either case.
        """
        cache = self._openShelve('r')
        try:
            return cache[gSuiteTrack.uri]
        finally:
            cache.close()

    def shouldBeCached(self, gSuiteTrack):
        """Return True if the track URI starts with a configured prefix."""
        uri = unicode(gSuiteTrack.uri)
        return len(self._uriPrefixes.prefixes(uri)) > 0

    def cache(self, gSuiteTrack, galaxyUri):
        """Store ``galaxyUri`` under the URI of ``gSuiteTrack``."""
        cache = self._openShelve('c')
        try:
            cache[gSuiteTrack.uri] = galaxyUri
        finally:
            cache.close()
Example #8
0
def test():
    """
    Exercise a Trie of English number words against the module-level TEXT.

    Raises RuntimeError when the sum collected from sentence 2 is not 142 or
    when 'nine' is missing from the trie's keys.
    """
    # 1. build a trie
    # NOTE(review): 'fourteen' maps to 10 and 'fourty' is spelled this way in
    # the original; the expected total below may depend on both - confirm
    # before changing either.
    word_values = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
        'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
        'fourteen': 10, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
        'eighteen': 18, 'nineteen': 19, 'twenty': 20, 'thirty': 30,
        'fourty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70,
        'eighty': 80, 'ninety': 90, 'hundred': 100,
    }
    t = Trie(list(word_values))

    # 2. scan 2000 "sentences" with it
    for _ in range(1000):
        # sentence 1: follow only the last prefix returned at each position
        cursor = S1[0]
        while cursor < S1[1]:
            hits = list(t.prefixes(TEXT[cursor:S1[1]]))
            if hits:
                cursor += len(hits[-1])
            else:
                cursor += 1

        # sentence 2: add up the value of every match at every position
        checksum = 0
        for offset in range(S2[0], S2[1]):
            for key in t.prefixes(TEXT[offset:S2[1]]):
                checksum += word_values[key]
        if checksum != 142:
            raise RuntimeError(str(checksum))

    # 3. make a real list of all keys in the trie
    if 'nine' not in list(t.iterkeys()):
        raise RuntimeError(str(list(t.iterkeys())))