Example 1
import re

import jieba
import numpy as np

import text_regularization

# NOTE: is_chinese_words() is assumed to be defined elsewhere in this module.


def cut(sentence):
    '''Regularize, segment, and normalize a sentence into a list of clean tokens.'''
    old_numeric_chars = ["壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    simple_numeric_chars = ["零", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
    old_numeric_char_set = set(old_numeric_chars)
    simple_numeric_char_set = set(simple_numeric_chars)
    r_symbols = '[`~!@#$%^&+*()=|{}\':;,\t\n\\[\\]『』「」<>/?《》~!@#¥%……&*()|{}【】‘;:”“’。,、?]'
    r_float = r"-?(\d+)?\.\d+"  # decimal numbers such as "3.5" or "-.5"
    r_alnum = r"^[a-z]+[0-9]+$"  # lowercase letters followed by digits, e.g. "qq12345678"

    ## basic replacement
    sentence = text_regularization.extractWords(sentence)
    ## domain replacement
    sentence = sentence.replace('+', '加')
    ## symbol replacement
    sentence = re.sub(r_symbols, ' ', sentence.strip())
    ## word segmentation
    words = jieba.lcut(sentence, cut_all=False)
    ## word filter
    clean_words = []
    for w in words:
        if w == '' or w == ' ':
            continue
        if w.isnumeric():  # pure digits or Chinese numerals
            # Fraction of characters written with financial ("old-style") numerals
            old_numeric_ratio = np.sum(
                [1 for c in w if c in old_numeric_char_set]) / len(w)
            # Fraction of characters written with simple Chinese numerals
            simple_numeric_ratio = np.sum(
                [1 for c in w if c in simple_numeric_char_set]) / len(w)
            if old_numeric_ratio == 1.0 or simple_numeric_ratio == 1.0:
                clean_words.append('INTEGER_CN_%s' % len(w))
            else:
                clean_words.append('INTEGER_%s' % len(w))
        elif re.match(r_float, w) is not None:  # decimal number
            clean_words.append('FLOAT')
        elif w.isalpha() and not is_chinese_words(w):  # Latin-alphabet word
            clean_words.append(w.lower())
        elif is_chinese_words(w):  # Chinese word
            clean_words.append(w)
        elif re.match(r_alnum, w) is not None:  # letters followed by digits
            if w.lower().startswith('qq'):  # QQ account mention
                clean_words.append('qq')
                clean_words.append('INTEGER_%s' % (len(w) - 2))
            elif w.lower().startswith("tel"):  # phone number mention
                clean_words.append('tel')
                clean_words.append('INTEGER_%s' % (len(w) - 3))
            else:
                clean_words.append('ALNUM_%s' % len(w))
        elif w == '-' or w == '_':
            clean_words.append(w)
    return clean_words
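
A minimal usage sketch, not part of the original example (the input sentence is hypothetical, jieba's actual segmentation may differ, and text_regularization / is_chinese_words come from the surrounding project):

tokens = cut("我的qq12345678,价格3.5元")
# Chinese words are kept, digit runs become INTEGER_*/FLOAT placeholders, and the
# "qq" prefix is split off, e.g. something like
# ['我', '的', 'qq', 'INTEGER_8', '价格', 'FLOAT', '元']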
Example 2
#encoding:UTF-8
import jieba

import text_regularization as tr

# Build a fastText training file: one segmented message per line,
# labelled with the name of the source file ("ad" / "not_ad").
filenames = ["ad.txt", "not_ad.txt"]
output = open("fastText_test.txt", "a", encoding="utf-8")

for filename in filenames:
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Normalize the raw text, then segment it with jieba
            text = tr.extractWords(line)
            word_list = " ".join(jieba.cut(text))
            output.write(
                word_list.replace("\n", " ") + "\t__label__" + filename[:-4] +
                "\n")

output.flush()
output.close()
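
Not part of the original snippet: a minimal sketch of how the generated file could be used to train a supervised fastText classifier (assumes the fasttext Python package is installed; the __label__ prefix matches the format written above, and the sample message is hypothetical):

import fasttext

# Train a simple text classifier on the labelled, segmented messages
model = fasttext.train_supervised(input="fastText_test.txt")
print(model.predict("免费 领取 优惠券"))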
Example 3
"""

import numpy as np
import tensorlayer as tl
import sys
sys.path.append("../serving/packages")
from text_regularization import extractWords

wv = tl.files.load_npy_to_any(name='./output/model_word2vec_200.npy')
for label in ["pass", "spam"]:
    embeddings = []
    inp = "data/msglog/msg" + label + ".log.seg"
    outp = "output/sample_" + label
    with open(inp, encoding='utf-8') as f:
        for line in f:
            line = extractWords(line)
            words = line.strip().split(' ')
            # Sum the 200-d word vectors of the message; OOV words fall back to UNK
            text_embedding = np.zeros(200)
            for word in words:
                try:
                    text_embedding += wv[word]
                except KeyError:
                    text_embedding += wv['UNK']
            embeddings.append(text_embedding)

    embeddings = np.asarray(embeddings, dtype=np.float32)
    if label == "spam":
        labels = np.zeros(embeddings.shape[0])
    elif label == "pass":
        labels = np.ones(embeddings.shape[0])
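
The original example stops here; a minimal sketch of how the arrays might be persisted under the otherwise unused outp prefix (an assumption, using plain NumPy; the original may have saved them differently):

    # Hypothetical continuation: save the embedding matrix and labels per class
    np.save(outp + "_x.npy", embeddings)
    np.save(outp + "_y.npy", labels)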
Example 4
import jieba

import text_regularization


def cut_1(sentence):
    '''Regularize a sentence and segment it with jieba (precise mode), with no further filtering.'''
    sentence = text_regularization.extractWords(sentence)
    return jieba.lcut(sentence, cut_all=False)
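
A one-line usage sketch, not part of the original example (the input sentence is hypothetical):

words = cut_1("这是一个测试句子")
# e.g. something like ['这是', '一个', '测试', '句子'], depending on jieba's dictionary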