Example #1
def _update_vendor(step=10):
    """
    Create vendor
    :param step: The number of products per pass
    """
    vendor_list = {}
    for vendor in Item.objects.values_list('vendor_name', flat=True).filter(vendor_name__isnull=False).\
            annotate(count=Count('vendor_name')).order_by('-count'):
        clean_vendor = clean_text(vendor)
        if len(clean_vendor) > 2:
            vendors = ItemVendor.objects.get_or_create(name=clean_text(clean_vendor))[0]
            vendor_list[vendor] = int(vendors.pk)

    round_for = int(math.ceil(float(len(vendor_list)) / step))
    for a in xrange(round_for):
        start = a * step
        stop = (a + 1) * step
        with transaction.atomic():
            for k, v in vendor_list.items()[start:stop]:
                Item.objects.filter(vendor__isnull=True, vendor_name=u"%s" % k).\
                    update(vendor_id=v, vendor_name=None)
Example #2
def _update_vendor(step=10):
    """
    Create vendor
    :param step: The number of products per pass
    """
    vendor_list = {}
    for vendor in Item.objects.values_list('vendor_name', flat=True).filter(vendor_name__isnull=False).\
            annotate(count=Count('vendor_name')).order_by('-count'):
        clean_vendor = clean_text(vendor)
        if len(clean_vendor) > 2:
            vendors = ItemVendor.objects.get_or_create(
                name=clean_text(clean_vendor))[0]
            vendor_list[vendor] = int(vendors.pk)

    round_for = int(math.ceil(float(len(vendor_list)) / step))
    for a in xrange(round_for):
        start = a * step
        stop = (a + 1) * step
        with transaction.atomic():
            for k, v in vendor_list.items()[start:stop]:
                Item.objects.filter(vendor__isnull=True, vendor_name=u"%s" % k).\
                    update(vendor_id=v, vendor_name=None)
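Examples #1 and #2 are Python 2 code: xrange is gone in Python 3, and dict.items() returns a view there, so it can no longer be sliced directly. Below is a minimal Python 3 sketch of the same batching logic, reusing the Item, ItemVendor, clean_text, transaction and Count names from the examples above (a sketch under those assumptions, not a drop-in replacement):

from django.db import transaction
from django.db.models import Count


def _update_vendor(step=10):
    """
    Create vendors from distinct vendor names and link matching items to them.
    :param step: The number of products per pass
    """
    vendor_list = {}
    for vendor in (Item.objects.values_list('vendor_name', flat=True)
                   .filter(vendor_name__isnull=False)
                   .annotate(count=Count('vendor_name'))
                   .order_by('-count')):
        clean_vendor = clean_text(vendor)
        if len(clean_vendor) > 2:
            vendor_obj = ItemVendor.objects.get_or_create(name=clean_vendor)[0]
            vendor_list[vendor] = int(vendor_obj.pk)

    # dict.items() is a view in Python 3; materialize it so it can be sliced
    pairs = list(vendor_list.items())
    for start in range(0, len(pairs), step):
        with transaction.atomic():
            for name, vendor_id in pairs[start:start + step]:
                Item.objects.filter(vendor__isnull=True, vendor_name=name).\
                    update(vendor_id=vendor_id, vendor_name=None)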
Example #3
def main():
    """
        主函数
    """
    # Step 1: 处理数据集
    print('===Step1: 处理数据集===')

    if not os.path.exists(constant.cln_text_csv_file):
        print('Cleaning the data...')
        # Read the raw csv file
        raw_text_df = pd.read_csv(constant.raw_text_csv_file)

        # Clean the raw data
        cln_text_df = clean_text(raw_text_df)

        # Save the cleaned text data
        cln_text_df.to_csv(constant.cln_text_csv_file, index=None)
        print('Done, results saved to', constant.cln_text_csv_file)

    print('================\n')

    # Step 2. Inspect the cleaned dataset and select part of it for model training
    print('===Step2. Inspect the dataset===')
    text_data = pd.read_csv(constant.cln_text_csv_file)
    text_data['date'] = pd.to_datetime(text_data['date'])
    text_data.set_index('date', inplace=True)
    print('Number of samples per class:')
    print(text_data.groupby('label').size())

    # Step 3. Split into training and test sets
    print('===Step3. Split training and test sets===')
    train_text_df, test_text_df = split_train_test(text_data)
    # Inspect basic information about the training and test sets
    print('Number of samples per class in the training set:')
    print(train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:')
    print(test_text_df.groupby('label').size())
    print('================\n')

    # Step 4. Feature extraction
    print('===Step4. Text feature extraction===')
    # Count word frequencies
    n_common_words = 200

    # Collect the words from the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')
    print('================\n')

    # Feature processing
    # Scale features to a common range
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(train_X)
    te_feat_scaled = scaler.transform(test_X)

    # Feature selection
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # PCA dimensionality reduction
    pca = PCA(n_components=0.95)  # keep components explaining 95% of the variance
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('Feature processing finished')
    print('Feature dimension per sample after processing:', tr_feat_scaled_sel_pca.shape[1])

    # Step 5. Train the models
    models = []
    print('===Step5. Train the models===')
    print('1. Naive Bayes model:')
    gnb_model = GaussianNB()
    gnb_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Naive Bayes', gnb_model])
    print('Done')
    print()

    print('2. Logistic regression:')
    lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model,
                                   tr_feat_scaled_sel_pca,
                                   train_y,
                                   lr_param_grid,
                                   cv=3)
    models.append(['Logistic regression', best_lr_model])
    print('Done')
    print()

    print('3. Support vector machine:')
    svm_param_grid = [
        {
            'C': [1e-2, 1e-1, 1, 10, 100],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        },
    ]
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model,
                                    tr_feat_scaled_sel_pca,
                                    train_y,
                                    svm_param_grid,
                                    cv=3)
    models.append(['Support vector machine', best_svm_model])
    print('Done')
    print()

    print('4. Random forest:')
    rf_param_grid = [{'n_estimators': [10, 50, 100, 150, 200]}]

    rf_model = RandomForestClassifier()
    best_rf_model = get_best_model(rf_model,
                                   tr_feat_scaled_sel_pca,
                                   train_y,
                                   rf_param_grid,
                                   cv=3)
    models.append(['Random forest', best_rf_model])
    print('Done')
    print()

    # Step 6. Evaluate the models
    print('===Step6. Evaluate the models===')
    for i, model in enumerate(models):
        print('{}-{}'.format(i + 1, model[0]))
        # Print the accuracy
        print('Accuracy:',
              accuracy_score(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        # Use the positive-class probability (column 1) for the AUC score
        print('AUC:',
              roc_auc_score(test_y,
                            model[1].predict_proba(te_feat_scaled_sel_pca)[:, 1]))
        # Print the confusion matrix
        print('Confusion matrix:')
        print(
            confusion_matrix(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        print()
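Example #3 calls a get_best_model helper that is not shown in the snippet. As a rough sketch of what such a helper might do (an assumption, not the author's implementation), it could wrap scikit-learn's GridSearchCV and return the best estimator refit on the full training data:

from sklearn.model_selection import GridSearchCV


def get_best_model(model, X_train, y_train, param_grid, cv=3):
    """Hypothetical helper: grid-search param_grid with cv-fold
    cross-validation and return the refit best estimator."""
    grid = GridSearchCV(model, param_grid, cv=cv)
    grid.fit(X_train, y_train)
    print('Best parameters:', grid.best_params_)
    return grid.best_estimator_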
Example #4
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        try:
            # Original text file
            text = f.read().strip('\n')
        except UnicodeDecodeError:
            # Fall back to an explicit utf-8 read if the default codec fails
            with open(txt, encoding="utf8") as f_utf8:
                text = f_utf8.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        # print(sentences)
        '''
        Example output:
        ['DATE OF ADMISSION : MM/DD/YYYY', 'DATE OF DISCHARGE : MM/DD/YYYY',
         'DISCHARGE DIAGNOSES :', '1 . Vasovagal syncope , status post fall .',
         '2 . Traumatic arthritis , right knee .', '3 . Hypertension .', '4 ...]
        '''
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print(w)
            #print()

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safeguard against the concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % con
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    # print(tok_concepts)  # e.g. [..., ('treatment', 48, 2, 2), ('treatment', 49, 5, 5)]

    return tokenized_sents, tok_concepts
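For reference, the concept_regex in Example #4 (and the read_i2b2 variants below) expects i2b2 concept lines of the form c="<text>" <line>:<token> <line>:<token>||t="<label>", with 1-based line numbers and 0-based token offsets, as the commented error-check implies. A small self-contained check against an illustrative, made-up line:

import re

concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'

sample = 'c="vasovagal syncope" 4:2 4:3||t="problem"'  # illustrative line only
print(re.search(concept_regex, sample).groups())
# -> ('vasovagal syncope', '4', '2', '4', '3', 'problem')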
Example #5
import json
import sys

from subprocess import check_output
from datetime import datetime as dt
from pymongo import MongoClient
from bson.json_util import dumps

import tools

# Connect to the local MongoDB instance
client = MongoClient()
db = client.wemood

# Record sound levels for the requested number of seconds and clean the output
program_out = check_output(
    ["soundmeter", "--collect", "--seconds", sys.argv[1]])
clean_program_data = tools.clean_text(program_out)

# Tag the reading with a timestamp and the sensor name
clean_program_data["date"] = unicode(dt.now())
clean_program_data["sensor"] = "volume"

# Write a JSON copy to disk and store the document in MongoDB
with open('output/output.json', 'w') as outfile:
    json.dump(clean_program_data, outfile)

db.sensors.insert_one(clean_program_data)
Example #6
def readDocs(txt, concept):
    tokenizedSentences = []
    sentTokenize = lambda text: text.split('\n')
    wordTokenize = lambda text: text.split(' ')

    with open(txt) as foo:
        text = foo.read().strip('\n')
        sentences = sentTokenize(text)
        for s in sentences:
            sent = clean_text(s.rstrip())
            sent = sent.lower()
            tokens = wordTokenize(sent)
            normedTokens = normalize_tokens(tokens)
            tokenizedSentences.append(normedTokens)

    tokenizedConcepts = []
    if concept:
        with open(concept) as foo:
            for l in foo.readlines():
                if not l.strip():
                    continue

                conceptRegex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(conceptRegex, l.strip())
                groups = match.groups()

                concept_text = groups[0]
                beginLineNum = int(groups[1])
                beginTokenIndex = int(groups[2])
                lastLineNum = int(groups[3])
                lastTokenIndex = int(groups[4])
                conceptLabel = groups[5]

                assert beginLineNum == lastLineNum, 'concept must span single line'

                tup = (conceptLabel, beginLineNum, beginTokenIndex,
                       lastTokenIndex)
                tokenizedConcepts.append(tup)

        tokenizedConcepts = list(set(tokenizedConcepts))
        tokenizedConcepts = sorted(tokenizedConcepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tokenizedConcepts) - 1):
            c1 = tokenizedConcepts[i]
            c2 = tokenizedConcepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(concept)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % concept
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(
                            tokenizedSentences[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(
                            tokenizedSentences[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    return tokenizedSentences, tokenizedConcepts
Example #7
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print w
            #print

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safeguard against the concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname,
                                                                         c1[1])
                    error2 = "It can't be processed until you remove one"
                    error_msg = '%s\n%s' % (error1, error2)
                    raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
Example #8
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

        # tokenize
        sentences = sent_tokenize(text)
        for sentence in sentences:
            sent = clean_text(sentence.rstrip())

            # lowercase (like word2vec preprocessing)
            sent = sent.lower()

            toks = word_tokenize(sent)

            # normalize tokens
            normed_toks = normalize_tokens(toks)

            #for w in normed_toks:
            #    print w
            #print

            tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text  =     groups[0]
                start_lineno  = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno    = int(groups[3])
                end_tok_ind   = int(groups[4])
                concept_label =     groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text  = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno==end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safeguard against the concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts)-1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i+1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                    error2 = "It can't be processed until you remove one"
                    error_msg = '%s\n%s' % (error1, error2)
                    raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
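Examples #7 and #8 sort with sorted(tok_concepts, cmp=classification_cmp), which only works on Python 2 (the cmp argument was removed in Python 3) and relies on a classification_cmp helper that is not shown. A plausible comparator, consistent with the key=lambda t: t[1:] ordering used in Examples #4 and #6 (an assumption, not the original helper):

def classification_cmp(a, b):
    """Hypothetical Python 2 comparator: order (label, lineno, start_tok,
    end_tok) concept tuples by line number, then token span."""
    return cmp(a[1:], b[1:])  # Python 2 built-in cmp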