def toMiddleFormat(path):
    """Build a masked-language-model MiddleFormat dataset from a text file.

    Each line of ``path`` is cleaned, tokenized into phrases (via Phraseg) plus
    numbers / latin words / single chars / punctuation, then ~15% of the tokens
    are replaced with ``[MASK]`` to form the input side; the untouched token
    sequence is the target side.

    :param path: path to a plain-text corpus file, one sentence per line.
    :return: a populated ``MiddleFormat`` dataset.
    """
    from phraseg import Phraseg
    punctuations = r"[.﹑︰〈〉─《﹖﹣﹂﹁﹔!?。。"#$%&'()*+,﹐-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏..!\"#$%&()*+,\-.\:;<=>?@\[\]\\\/^_`{|}~]+"
    MASKTOKEN = "[MASK]"
    dataset = MiddleFormat(DATASETINFO, [MASKTOKEN])
    phraseg = Phraseg(path)

    for line in tqdm(nlp2.read_files_yield_lines(path)):
        line = nlp2.clean_all(line).strip()
        if len(nlp2.split_sentence_to_array(line)) > 1:
            phrases = list((phraseg.extract(sent=line, merge_overlap=False)).keys())
            reg = "[0-9]+|[a-zA-Z]+\'*[a-z]*|[\w]" + "|" + punctuations
            # BUG FIX 1: phrases are raw corpus text — escape them so regex
            # metacharacters ("+", "(", "?", ...) cannot corrupt the pattern.
            # BUG FIX 2: only prepend the phrase alternation when it is
            # non-empty; "|".join([]) would yield a leading "|" (an empty
            # alternative) that matches the empty string at every position.
            if phrases:
                reg = "|".join(re.escape(phrase) for phrase in phrases) + "|" + reg
            input_sent = re.findall(reg, line, re.UNICODE)
            # Copy instead of re-running the identical findall: the target is
            # the unmasked token sequence.
            target_sent = list(input_sent)
            for ind, word in enumerate(input_sent):
                # Mask each token independently with probability 0.15
                # (findall tokens are always non-empty, so no length guard).
                if random.random() <= 0.15:
                    input_sent[ind] = MASKTOKEN
            if len(input_sent) > 2 and len(target_sent) > 2 and len(
                    "".join(input_sent).strip()) > 2 and len(
                    "".join(target_sent).strip()) > 2:
                dataset.add_data(nlp2.join_words_to_sentence(input_sent),
                                 nlp2.join_words_to_sentence(target_sent))
    return dataset
def get_freqK_unk_token(tokenizer, file_paths: list, freqK: int):
    """Collect tokens that the tokenizer cannot represent, filtered by frequency.

    Scans every line of every file, splits it into tokens, and counts each
    token whose tokenization contains the UNK symbol. Tokens seen at least
    ``freqK`` times are returned, in first-seen order.

    :param tokenizer: a tokenizer exposing ``tokenize`` and ``_unk_token``.
    :param file_paths: list of text-file paths to scan.
    :param freqK: minimum occurrence count for a token to be kept.
    :return: list of unknown tokens occurring >= ``freqK`` times.
    """
    counts = OrderedDict()
    unk = tokenizer._unk_token
    for file_path in file_paths:
        for sentence in tqdm(nlp2.read_files_yield_lines(file_path)):
            for token in nlp2.split_sentence_to_array(sentence):
                if unk not in tokenizer.tokenize(token):
                    continue
                try:
                    counts[token] += 1
                except KeyError:
                    counts[token] = 1
    return [token for token, freq in counts.items() if freq >= freqK]
def get_topP_unk_token(tokenizer, file_paths: list, topP: float):
    """Return the top ``topP`` percent most frequent tokens the tokenizer maps to UNK.

    Scans every line of every file, counts each token whose tokenization
    contains the UNK symbol, then returns the most frequent ``topP``% of
    those tokens.

    :param tokenizer: a tokenizer exposing ``tokenize`` and ``_unk_token``.
    :param file_paths: list of text-file paths to scan.
    :param topP: percentage (0-100) of distinct unknown tokens to return.
    :return: list of the most frequent unknown tokens.
    """
    unk_count_dict = OrderedDict()
    for path in file_paths:
        for input_sent in tqdm(nlp2.read_files_yield_lines(path)):
            for tok in nlp2.split_sentence_to_array(input_sent):
                if tokenizer._unk_token in tokenizer.tokenize(tok):
                    unk_count_dict[tok] = unk_count_dict.get(tok, 0) + 1
    top_range = int(len(unk_count_dict) * (topP / 100))
    # BUG FIX: the counts were accumulated but never used — the old code
    # sliced the dict in insertion order, so "top-P" did not return the most
    # frequent tokens. Rank by count, descending, before slicing.
    ranked = sorted(unk_count_dict, key=unk_count_dict.get, reverse=True)
    return ranked[:top_range]