Example 1
import random
import re

import nlp2
from phraseg import Phraseg
from tqdm import tqdm


def toMiddleFormat(path):
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    # MiddleFormat and DATASETINFO are provided by the surrounding module.
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)

    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()

        if len(nlp2.split_sentence_to_array(line)) > 1:
            # Extract candidate phrases so multi-character phrases can be
            # matched (and masked) as single units.
            phrases = list((phraseg.extract(sent=line,
                                            merge_overlap=False)).keys())
            reg = r"[0-9]+|[a-zA-Z]+'*[a-z]*|[\w]" + "|" + punctuations
            if phrases:  # an empty alternation would match the empty string
                reg = "|".join(phrases) + "|" + reg
            # Tokenize twice: input_sent gets masked, target_sent stays intact.
            input_sent = re.findall(reg, line, re.UNICODE)
            target_sent = re.findall(reg, line, re.UNICODE)
            # Mask roughly 15% of the tokens at random.
            for ind, word in enumerate(input_sent):
                prob = random.random()
                if prob <= 0.15 and len(word) > 0:
                    input_sent[ind] = MASKTOKEN
            # Skip pairs that are too short to be useful training examples.
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                        "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))

    return dataset
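
The heart of this preprocessor is the random masking loop. Below is a minimal, self-contained sketch of just that step, using a plain whitespace split in place of the nlp2/phraseg tokenization; mask_tokens is a hypothetical helper, not part of the original module:

import random

MASKTOKEN = "[MASK]"

def mask_tokens(tokens, mask_prob=0.15, seed=None):
    # Independently replace each token with MASKTOKEN with probability
    # mask_prob, mirroring the masking loop in toMiddleFormat above.
    rng = random.Random(seed)
    return [MASKTOKEN if rng.random() <= mask_prob else tok for tok in tokens]

print(mask_tokens("the quick brown fox jumps over the lazy dog".split(), seed=0))

The original function keeps the unmasked token list as target_sent, so each emitted pair is (masked sentence, original sentence), the usual setup for masked-language-model training data.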
Example 2
from collections import OrderedDict

import nlp2
from tqdm import tqdm


def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    # Count how often each corpus token maps to the tokenizer's UNK token.
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    # Keep only the tokens that hit UNK at least freqK times.
    return [key for key, value in unk_count_dict.items() if value >= freqK]
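
A hedged usage sketch, assuming a Hugging Face tokenizer; the model name, corpus path, and threshold below are placeholders, not from the original source:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Collect tokens that the tokenizer turns into UNK at least 5 times,
# then register them so they are no longer lost during tokenization.
rare_tokens = get_freqK_unk_token(tokenizer, ["corpus.txt"], freqK=5)
tokenizer.add_tokens(rare_tokens)

If the tokenizer backs a model, the usual follow-up is model.resize_token_embeddings(len(tokenizer)) so the embedding matrix covers the newly added tokens.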
Example 3
from collections import OrderedDict

import nlp2
from tqdm import tqdm


def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    # Count how often each corpus token maps to the tokenizer's UNK token.
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    # Sort by count so the slice keeps the most frequent UNK tokens
    # (topP is a percentage, e.g. topP=5 keeps the top 5%).
    sorted_tokens = sorted(unk_count_dict, key=unk_count_dict.get, reverse=True)
    top_range = int(len(sorted_tokens) * (topP / 100))
    return sorted_tokens[:top_range]
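
get_topP_unk_token differs from the previous helper only in its selection rule: an absolute count threshold (freqK) versus a relative share of the frequency-ranked UNK tokens (topP, as a percentage). A usage sketch under the same placeholder assumptions as above:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Keep the top 5% most frequent UNK-producing tokens from the corpus.
rare_tokens = get_topP_unk_token(tokenizer, ["corpus.txt"], topP=5)
tokenizer.add_tokens(rare_tokens)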