def segmenteCorpus(corpus, storage):
    result = []
    seg = Segmenter(storage)
    for page_id, pad in corpus:
        for line in pad.split("\n"):
            result.extend(["".join(w) for w in seg.segment(list(line))])
    return result
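
A minimal way to drive this helper, assuming a storage trained on the same character stream; the training loop and variable names below are illustrative, based on the API used in the other examples on this page:

from eleve import CMemoryStorage, Segmenter

storage = CMemoryStorage(10)  # max n-gram order, as in the examples below
corpus = [("page-1", "le chat noir\nle chien blanc")]  # hypothetical (page_id, text) pairs
for _page_id, pad in corpus:
    for line in pad.split("\n"):
        storage.add_sentence(list(line))  # train character by character
print(segmenteCorpus(corpus, storage))   # flat list of segmented words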
Example no. 3
def test_segmentation_2grams(storage):
    storage.add_sentence('je vous parle de hot dog'.split())
    storage.add_sentence('j ador les hot dog'.split())
    storage.add_sentence('hot dog ou pas'.split())
    storage.add_sentence('hot dog ou sandwich'.split())

    segmenter = Segmenter(storage)
    assert segmenter.segment('je deteste les hot dog'.split()) == [['je'], ['deteste'], ['les'], ['hot'], ['dog']]
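
These tests receive `storage` as a pytest fixture that is not shown on this page. A plausible sketch, assuming the in-memory backend used elsewhere in these examples (the order argument is a guess suggested by the test name; the real fixture may differ per test module):

import pytest
from eleve import CMemoryStorage

@pytest.fixture
def storage():
    return CMemoryStorage(2)  # hypothetical: a 2-gram-limited storage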
Example no. 4
def test_segmentation_basic(storage):
    storage.add_sentence('je vous parle de hot dog'.split())
    storage.add_sentence('j ador les hot dog'.split())
    storage.add_sentence('hot dog ou pas'.split())
    storage.add_sentence('hot dog ou sandwich'.split())

    segmenter = Segmenter(storage)
    assert segmenter.segment('je deteste les hot dog'.split()) == [['je'], ['deteste'], ['les'], ['hot', 'dog']]
    assert segmenter.segment('un chat noir et blanc'.split()) == [['un'], ['chat'], ['noir'], ['et'], ['blanc']]
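
Note the contrast with `test_segmentation_2grams`: here the segmenter is expected to keep the frequent collocation "hot dog" together as one word, while the 2-gram variant expects the two tokens to stay separate; the difference presumably comes from how the `storage` fixture is configured for each test.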
Example no. 5
def test_segmentation_2grams(storage):
    storage.add_sentence("je vous parle de hot dog".split())
    storage.add_sentence("j ador les hot dog".split())
    storage.add_sentence("hot dog ou pas".split())
    storage.add_sentence("hot dog ou sandwich".split())

    segmenter = Segmenter(storage)
    assert segmenter.segment("je deteste les hot dog".split()) == [
        ["je"],
        ["deteste"],
        ["les"],
        ["hot"],
        ["dog"],
    ]
Example no. 6
def benchmark(storage_class, create=True):
    m = storage_class(4)
    s = Segmenter(m, 3)
    if create:
        m.clear()

    corpus = reuters.raw()

    tokens = list(filter(lambda t: t.category == '', tokeniser_fr(corpus)))[:10000]
    
    if create:
        m.add_sentence(tokens)

    for i in range(1,5000,30):
        print(s.segment(tokens[i:i+30]))
Example no. 7
def benchmark(storage_class, create=True):
    m = storage_class(4)
    s = Segmenter(m, 3)
    if create:
        m.clear()

    corpus = reuters.raw()

    tokens = list(filter(lambda t: t.category == "",
                         tokeniser_fr(corpus)))[:10000]

    if create:
        m.add_sentence(tokens)

    for i in range(1, 5000, 30):
        print(s.segment(tokens[i:i + 30]))
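
A sketch of how this benchmark might be invoked, assuming NLTK's Reuters corpus is available and `tokeniser_fr` is importable from the surrounding project; note that `create=False` only makes sense with a persistent storage backend, since a fresh in-memory storage starts empty:

# import nltk; nltk.download("reuters")  # one-time corpus download
from eleve import CMemoryStorage

benchmark(CMemoryStorage)  # build the model, then segment 30-token windows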
Example no. 8
def segment_with_preprocessing_pool(seg: Segmenter,
                                    sentences: List[str],
                                    bies: bool = True) -> Iterator[str]:
    # Tokenise every sentence in parallel, then release the workers:
    # Pool.map is eager, so the pool is no longer needed afterwards.
    pool = Pool()
    tokenized_sentences = pool.map(tokenize_by_unicode_category, sentences)
    pool.close()
    pool.join()
    for sent in tokenized_sentences:
        tokens = []
        for group in sent:
            try:
                if isCJK(group[0]):
                    # Segment CJK runs character by character.
                    words = ["".join(w) for w in seg.segment(list(group))]
                    for w in words:
                        if bies:
                            tokens.extend(add_bies(w))
                        else:
                            tokens.append(w)
                else:
                    if bies:
                        tokens.extend(add_bies(group))
                    else:
                        tokens.append(group)
            except Exception:
                # Fall back to the unsegmented group if segmentation fails.
                if bies:
                    tokens.extend(add_bies(group))
                else:
                    tokens.append(group)
        yield " ".join(tokens)
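
Since the function yields, it returns a generator. A hypothetical driver that streams the BIES-tagged output, assuming `segmenter` is a Segmenter built over a trained storage (the input sentences are placeholders):

raw_sentences = ["今天天气很好", "hello 世界"]
for tagged in segment_with_preprocessing_pool(segmenter, raw_sentences):
    print(tagged)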
Example no. 9
def segment_file(storage, input_file: Path, output_file: Path):
    segmenter = Segmenter(storage)
    with open(input_file) as f, open(output_file, "w") as out:
        for line in f:
            # rstrip instead of line[:-1]: a final line without a trailing
            # newline would otherwise lose its last character.
            line = line.rstrip("\n")
            if line.strip() != "":
                out.write(
                    chinese.segment_with_preprocessing(segmenter, line) + "\n")
Example no. 10
def segment_batch(storage: Storage, batch: List[str]) -> List[str]:
    segmenter = Segmenter(storage)
    result = []
    # A Pool-based variant is left disabled: multiprocessing pickles the mapped
    # callable, and a lambda closing over `segmenter` cannot be pickled.
    # pool = Pool()
    # for line in pool.map(lambda x: chinese.segment_with_preprocessing(segmenter, x), batch):
    for line in batch:
        line = line.rstrip("\n")  # tolerate lines with or without a trailing newline
        if line.strip() != "":
            result.append(chinese.segment_with_preprocessing(segmenter, line))
    return result
Example no. 11
def addSegToBioUnsup(input: str,
                     model: str,
                     output: str,
                     char_ind: int = 0,
                     tag_ind: int = 1):
    from eleve import Segmenter
    from eleve.memory import CSVStorage

    lm = CSVStorage(model)

    s = Segmenter(lm)

    sentences = []  # one list of characters per sentence
    tags = []       # the matching list of tags

    sub = []
    tag = []
    with open(input, "r", encoding="utf-8") as fp:
        for line in fp:
            if line.strip() == "":
                # Blank line: sentence boundary.
                sentences.append(sub)
                tags.append(tag)
                sub = []
                tag = []
            else:
                line_split = line.replace("\n", "").split()
                sub.append(line_split[char_ind])
                tag.append(line_split[tag_ind])
    if sub:
        # Flush the last sentence when the file does not end with a blank line.
        sentences.append(sub)
        tags.append(tag)

    with open(output, "w", encoding="utf-8") as out:
        for i in range(len(sentences)):
            if len(sentences[i]) > 0:
                sentence = " ".join(sentences[i])
                segmentation = s.segmentSentenceTIWBIES(sentence).split(" ")

                for j in range(len(segmentation)):
                    out.write(segmentation[j] + " " + tags[i][j] + "\n")
                out.write("\n")
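
A hypothetical invocation, assuming `train.bio` is a two-column file (character, tag) and `model.csv` was previously saved by eleve's CSVStorage; note the code assumes `segmentSentenceTIWBIES` returns exactly one tagged token per input character, so that `segmentation[j]` and `tags[i][j]` stay aligned:

addSegToBioUnsup("train.bio", "model.csv", "train.seg.bio")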
Example no. 12
def test_segmentation_basic(storage):
    storage.add_sentence("je vous parle de hot dog".split())
    storage.add_sentence("j ador les hot dog".split())
    storage.add_sentence("hot dog ou pas".split())
    storage.add_sentence("hot dog ou sandwich".split())

    segmenter = Segmenter(storage)
    assert segmenter.segment("je deteste les hot dog".split()) == [
        ["je"],
        ["deteste"],
        ["les"],
        ["hot", "dog"],
    ]
    assert segmenter.segment("un chat noir et blanc".split()) == [
        ["un"],
        ["chat"],
        ["noir"],
        ["et"],
        ["blanc"],
    ]
Example no. 13
def segment_with_preprocessing(seg: Segmenter,
                               sent: str,
                               bies: bool = True) -> str:
    tokens = []
    for group in tokenize_by_unicode_category(sent):
        try:
            if isCJK(group[0]):
                # Segment CJK runs character by character.
                words = ["".join(w) for w in seg.segment(list(group))]
                for w in words:
                    if bies:
                        tokens.extend(add_bies(w))
                    else:
                        tokens.append(w)
            else:
                if bies:
                    tokens.extend(add_bies(group))
                else:
                    tokens.append(group)
        except Exception:
            # Fall back to the unsegmented group if segmentation fails.
            if bies:
                tokens.extend(add_bies(group))
            else:
                tokens.append(group)
    return " ".join(tokens)
Example no. 14
from eleve import CMemoryStorage as Storage
from eleve import Segmenter

import sys

corpus_file = sys.argv[1]

with open(corpus_file) as f:
    storage = Storage(10)
    corpus = []
    for line in f:
        tokens = list(line.strip().replace(" ", ""))
        storage.add_sentence(tokens)
        corpus.append(tokens)
    seg = Segmenter(storage)
    for tokens in corpus:
        result = seg.segment_nbest(tokens, 5)
        for ibest in result:
            print("  ".join(["".join(w) for w in ibest]))
Example no. 15
print('\t {:,} one-grams (non-unique) found in {:,} sentences'.format(nMots, nPosts))

print('> ELeVE: Segment & count')
dicoFr = lmdjxtools.getDicoFr()
blacklist_afterEleve = lmdjxtools.blacklist_afterEleve()

# Create table occurences
cursor.execute('''DROP TABLE IF EXISTS occurences''')
cursor.execute('''CREATE TABLE occurences
             (date text, ngram text, source text, postid integer)''')  #postid?
print('\t reset table DB.occurences')

from eleve import Segmenter
s = Segmenter(storage)

rejected_ngrams = set()
i = 0
cursor.execute('SELECT date, source, title, summary, rowid FROM posts')
for line in cursor.fetchall():
    date = line[0]
    source = line[1]
    postid = line[4]

    segmentedPhrase = []
    for phrase in line[2:3]:
        formattedtext = lmdjxtools.format(phrase)
        segmentedPhrase += s.segment(formattedtext.split(' '))
Example no. 16
print('\t {:,} one-grams (non-unique) found in {:,} sentences'.format(nMots, nPosts))

print('> ELeVE: Segment & count')
dicoFr = lmdjxtools.getDicoFr()
blacklist_afterEleve = lmdjxtools.blacklist_afterEleve()

# Create table occurences
cursor.execute('''DROP TABLE IF EXISTS occurences''')
cursor.execute('''CREATE TABLE occurences
             (date text, ngram text, source text, postid integer)''')  #postid?
print('\t reset table DB.occurences')

from eleve import Segmenter

s = Segmenter(storage)

rejected_ngrams = set()
i = 0
cursor.execute('SELECT date, source, title, summary, rowid FROM posts')
for line in cursor.fetchall():
    date = line[0]
    source = line[1]
    postid = line[4]

    segmentedPhrase = []
    for phrase in line[2:3]:
        formattedtext = lmdjxtools.format(phrase)
        segmentedPhrase += s.segment(formattedtext.split(' '))

    ngramAdded = set()