def segmenteCorpus(corpus, storage):
    """Segment every line of every page in `corpus` and return the resulting words."""
    result = []
    seg = Segmenter(storage)
    for page_id, pad in corpus:
        for line in pad.split("\n"):
            result.extend(["".join(w) for w in seg.segment(list(line))])
    return result
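# Illustrative only: a hedged sketch of feeding segmenteCorpus a list of
# (page_id, text) pairs. MemoryStorage's argument is assumed to be the n-gram
# order, as in the other snippets here (storage_class(4), Storage(10)); the
# training sentence and page data are placeholders.
from eleve import MemoryStorage

_storage = MemoryStorage(5)
_storage.add_sentence(list("热狗还是三明治"))  # train on a raw character stream
pages = [("page-1", "热狗\n三明治"), ("page-2", "热狗还是三明治")]
print(segmenteCorpus(pages, _storage))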
def test_segmentation_2grams(storage): storage.add_sentence("je vous parle de hot dog".split()) storage.add_sentence("j ador les hot dog".split()) storage.add_sentence("hot dog ou pas".split()) storage.add_sentence("hot dog ou sandwich".split()) segmenter = Segmenter(storage) assert segmenter.segment("je deteste les hot dog".split()) == [ ["je"], ["deteste"], ["les"], ["hot"], ["dog"], ]
def benchmark(storage_class, create=True): m = storage_class(4) s = Segmenter(m, 3) if create: m.clear() corpus = reuters.raw() tokens = list(filter(lambda t: t.category == "", tokeniser_fr(corpus)))[:10000] if create: m.add_sentence(tokens) for i in range(1, 5000, 30): print(s.segment(tokens[i:i + 30]))
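# Hedged way to run the benchmark above. `reuters` is assumed to be
# nltk.corpus.reuters and `tokeniser_fr` a project-local tokenizer whose tokens
# expose a `.category` attribute; with a purely in-memory backend,
# create=False is only meaningful if the storage persists data between runs.
from eleve import MemoryStorage

benchmark(MemoryStorage)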
def segment_with_preprocessing_pool(
    seg: Segmenter, sentences: List[str], bies: bool = True
) -> Iterator[str]:
    # Tokenize all sentences in parallel, then segment each CJK run sequentially.
    pool = Pool()
    tokenized_sentences = pool.map(tokenize_by_unicode_category, sentences)
    pool.close()
    pool.join()
    for sent in tokenized_sentences:
        tokens = []
        for group in sent:
            try:
                if isCJK(group[0]):
                    # CJK run: let the segmenter split it into words.
                    words = ["".join(w) for w in seg.segment(list(group))]
                    for w in words:
                        if bies:
                            tokens.extend(add_bies(w))
                        else:
                            tokens.append(w)
                else:
                    if bies:
                        tokens.extend(add_bies(group))
                    else:
                        tokens.append(group)
            except Exception:
                # On any segmentation error, fall back to the untouched group.
                if bies:
                    tokens.extend(add_bies(group))
                else:
                    tokens.append(group)
        yield " ".join(tokens)
def segment_file(storage, input_file: Path, output_file: Path):
    segmenter = Segmenter(storage)
    with open(input_file) as f, open(output_file, "w") as out:
        for line in f:
            line = line.rstrip("\n")
            if line.strip() != "":
                out.write(chinese.segment_with_preprocessing(segmenter, line) + "\n")
def segment_batch(storage: Storage, batch: List[str]) -> List[str]:
    segmenter = Segmenter(storage)
    result = []
    # pool = Pool()
    # for line in pool.map(lambda x: chinese.segment_with_preprocessing(segmenter, x), batch):
    for line in batch:
        line = line.rstrip("\n")
        if line.strip() != "":
            result.append(chinese.segment_with_preprocessing(segmenter, line))
    return result
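# Hedged sketch of calling segment_batch directly: the storage must already be
# trained on character streams, and the `chinese` helper module imported by
# this file must be available. The training line below is a placeholder;
# MemoryStorage's argument is assumed to be the n-gram order.
from eleve import MemoryStorage

_storage = MemoryStorage(5)
_storage.add_sentence(list("热狗还是三明治"))
print(segment_batch(_storage, ["热狗还是三明治\n"]))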
def addSegToBioUnsup(input: str, model: str, output: str, char_ind: int = 0, tag_ind: int = 1):
    """Add unsupervised BIES segmentation labels to a BIO-style column file."""
    from eleve import Segmenter
    from eleve.memory import CSVStorage

    lm = CSVStorage(model)
    s = Segmenter(lm)
    # Sentences (character columns) and their tag columns, read from the input file.
    sentences = []
    tags = []
    sub = []
    tag = []
    with open(input, "r", encoding="utf-8") as fp:
        for line in fp:
            if line == "\n" or line == "":
                # A blank line marks the end of the current sentence.
                sentences.append(sub)
                tags.append(tag)
                sub = []
                tag = []
            else:
                line_split = line.replace("\n", "").split()
                sub.append(line_split[char_ind])
                tag.append(line_split[tag_ind])
    with open(output, "w", encoding="utf-8") as out:
        for i in range(len(sentences)):
            if len(sentences[i]) > 0:
                sentence = " ".join(sentences[i])
                segmentation = s.segmentSentenceTIWBIES(sentence).split(" ")
                for j in range(len(segmentation)):
                    out.write(segmentation[j] + " " + tags[i][j] + "\n")
                out.write("\n")
def test_segmentation_basic(storage): storage.add_sentence("je vous parle de hot dog".split()) storage.add_sentence("j ador les hot dog".split()) storage.add_sentence("hot dog ou pas".split()) storage.add_sentence("hot dog ou sandwich".split()) segmenter = Segmenter(storage) assert segmenter.segment("je deteste les hot dog".split()) == [ ["je"], ["deteste"], ["les"], ["hot", "dog"], ] assert segmenter.segment("un chat noir et blanc".split()) == [ ["un"], ["chat"], ["noir"], ["et"], ["blanc"], ]
def segment_with_preprocessing(seg: Segmenter, sent: str, bies: bool = True) -> str:
    tokens = []
    for group in tokenize_by_unicode_category(sent):
        try:
            if isCJK(group[0]):
                # CJK run: let the segmenter split it into words.
                words = ["".join(w) for w in seg.segment(list(group))]
                for w in words:
                    if bies:
                        tokens.extend(add_bies(w))
                    else:
                        tokens.append(w)
            else:
                if bies:
                    tokens.extend(add_bies(group))
                else:
                    tokens.append(group)
        except Exception:
            # On any segmentation error, fall back to the untouched group.
            if bies:
                tokens.extend(add_bies(group))
            else:
                tokens.append(group)
    return " ".join(tokens)
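# Hedged end-to-end sketch for segment_with_preprocessing: train an eleve model
# on raw character streams, then segment a mixed CJK / non-CJK string, once
# without and once with BIES labels. The training sentences are placeholders.
from eleve import MemoryStorage, Segmenter

_storage = MemoryStorage(5)  # assumed: argument is the n-gram order
_storage.add_sentence(list("我喜欢热狗"))
_storage.add_sentence(list("热狗还是三明治"))
_seg = Segmenter(_storage)
print(segment_with_preprocessing(_seg, "我喜欢 hot dog", bies=False))
print(segment_with_preprocessing(_seg, "我喜欢 hot dog", bies=True))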
from eleve import CMemoryStorage as Storage
from eleve import Segmenter
import sys

corpus_file = sys.argv[1]

with open(corpus_file) as f:
    storage = Storage(10)
    corpus = []
    for line in f:
        # Treat each line as a raw character stream (spaces removed).
        tokens = list(line.strip().replace(" ", ""))
        storage.add_sentence(tokens)
        corpus.append(tokens)

seg = Segmenter(storage)
for tokens in corpus:
    result = seg.segment_nbest(tokens, 5)
    for ibest in result:
        print(" ".join(["".join(w) for w in ibest]))
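# The script above reads one corpus file from the command line, trains a model
# on it, and prints the five best segmentations of every sentence. A hedged
# invocation (the script name and corpus file are placeholders):
#
#   python segment_nbest.py corpus_zh.txt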
print('\t %s one-grams (non-unique) found in %s sentences' %
      ('{:,}'.format(nMots), '{:,}'.format(nPosts)))
print('> ELeVE: Segment & count')

dicoFr = lmdjxtools.getDicoFr()
blacklist_afterEleve = lmdjxtools.blacklist_afterEleve()

# Create table occurences
cursor.execute('''DROP TABLE IF EXISTS occurences''')
cursor.execute('''CREATE TABLE occurences
                  (date text, ngram text, source text, postid integer)''')  # postid?
print('\t reset table DB.occurences')

from eleve import Segmenter

s = Segmenter(storage)
rejected_ngrams = set()
i = 0

cursor.execute('SELECT date, source, title, summary, rowid FROM posts')
for line in cursor.fetchall():
    date = line[0]
    source = line[1]
    postid = line[4]
    segmentedPhrase = []
    for phrase in line[2:3]:
        formattedtext = lmdjxtools.format(phrase)
        segmentedPhrase += s.segment(formattedtext.split(' '))
    ngramAdded = set()