def _update_vendor(step=10):
    """
    Create vendor

    :param step: The number of products per pass
    """
    vendor_list = {}
    for vendor in Item.objects.values_list('vendor_name', flat=True).\
            filter(vendor_name__isnull=False).\
            annotate(count=Count('vendor_name')).order_by('-count'):
        clean_vendor = clean_text(vendor)
        if len(clean_vendor) > 2:
            vendors = ItemVendor.objects.get_or_create(
                name=clean_text(clean_vendor))[0]
            vendor_list[vendor] = int(vendors.pk)

    round_for = int(math.ceil(float(len(vendor_list)) / step))
    for a in xrange(round_for):
        start = a * step
        stop = (a + 1) * step
        with transaction.atomic():
            for k, v in vendor_list.items()[start:stop]:
                Item.objects.filter(vendor__isnull=True,
                                    vendor_name=u"%s" % k).\
                    update(vendor_id=v, vendor_name=None)
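# A hedged sketch of how _update_vendor might be exposed as a Django management command
# so the backfill can be run in batches from the CLI. Nothing in the snippet above shows
# how the helper is actually invoked; the wiring below is an assumption and presumes
# _update_vendor is importable from the module it lives in.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Backfill Item.vendor from the raw vendor_name field'

    def add_arguments(self, parser):
        # Batch size forwarded to _update_vendor's `step` parameter
        parser.add_argument('--step', type=int, default=10)

    def handle(self, *args, **options):
        _update_vendor(step=options['step'])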
def main():
    """
    Main function
    """
    # Step 1: Process the dataset
    print('===Step 1: Process the dataset===')
    if not os.path.exists(constant.cln_text_csv_file):
        print('Cleaning data...')
        # Read the raw csv file
        raw_text_df = pd.read_csv(constant.raw_text_csv_file)
        # Clean the raw data
        cln_text_df = clean_text(raw_text_df)
        # Save the cleaned text data
        cln_text_df.to_csv(constant.cln_text_csv_file, index=None)
        print('Done. Results saved to', constant.cln_text_csv_file)
    print('================\n')

    # Step 2. Inspect the cleaned dataset and select part of the data for model training
    print('===Step 2: Inspect the dataset===')
    text_data = pd.read_csv(constant.cln_text_csv_file)
    text_data['date'] = pd.to_datetime(text_data['date'])
    text_data.set_index('date', inplace=True)
    print('Number of samples per class:')
    print(text_data.groupby('label').size())

    # Step 3. Split into training and test sets
    print('===Step 3: Split training and test sets===')
    train_text_df, test_text_df = split_train_test(text_data)
    # Inspect basic information about the training and test sets
    print('Number of samples per class in the training set:')
    print(train_text_df.groupby('label').size())
    print('Number of samples per class in the test set:')
    print(test_text_df.groupby('label').size())
    print('================\n')

    # Step 4. Feature extraction
    print('===Step 4: Text feature extraction===')
    # Compute word frequencies
    n_common_words = 200
    # Count word frequencies over the words in the training set
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdisk = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdisk.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features for the training samples...')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection,
                                              common_words_freqs)
    print('Done')
    print()

    print('Extracting features for the test samples...')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection,
                                            common_words_freqs)
    print('Done')
    print('================\n')

    # Feature processing
    # Normalize feature ranges
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(train_X)
    te_feat_scaled = scaler.transform(test_X)

    # Feature selection
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # PCA dimensionality reduction
    pca = PCA(n_components=0.95)  # keep components explaining 95% of the variance
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('Feature processing finished')
    print('Feature dimension per sample after processing:',
          tr_feat_scaled_sel_pca.shape[1])

    # Step 5. Train models
    models = []
    print('===Step 5: Train models===')
    print('1. Naive Bayes model:')
    gnb_model = GaussianNB()
    gnb_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Naive Bayes', gnb_model])
    print('Done')
    print()

    print('2. Logistic regression:')
    lr_param_grid = [{'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model, tr_feat_scaled_sel_pca, train_y,
                                   lr_param_grid, cv=3)
    models.append(['Logistic Regression', best_lr_model])
    print('Done')
    print()

    print('3. Support vector machine:')
    svm_param_grid = [
        {
            'C': [1e-2, 1e-1, 1, 10, 100],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf']
        },
    ]
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model, tr_feat_scaled_sel_pca, train_y,
                                    svm_param_grid, cv=3)
    models.append(['SVM', best_svm_model])
    print('Done')
    print()

    print('4. Random forest:')
    rf_param_grid = [{'n_estimators': [10, 50, 100, 150, 200]}]
    rf_model = RandomForestClassifier()
    best_rf_model = get_best_model(rf_model, tr_feat_scaled_sel_pca, train_y,
                                   rf_param_grid, cv=3)
    rf_model.fit(tr_feat_scaled_sel_pca, train_y)
    models.append(['Random Forest', best_rf_model])
    print('Done')
    print()

    # Step 6. Evaluate the models
    print('===Step 6: Evaluate the models===')
    for i, model in enumerate(models):
        print('{}-{}'.format(i + 1, model[0]))
        # Print accuracy
        print('Accuracy:',
              accuracy_score(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        print('AUC:',
              roc_auc_score(test_y,
                            model[1].predict_proba(te_feat_scaled_sel_pca)[:, 0]))
        # Print the confusion matrix
        print('Confusion matrix')
        print(confusion_matrix(test_y, model[1].predict(te_feat_scaled_sel_pca)))
        print()
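# The snippet does not show how main() is invoked; a standard entry-point guard is
# assumed here so the pipeline above can be run as a script.
if __name__ == '__main__':
    main()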
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        try:
            # Original text file
            text = f.read().strip('\n')
            # text = f.read()
        except:
            f = open(txt, encoding="utf8")
            text = f.read().strip('\n')
            f.close()

    # tokenize
    sentences = sent_tokenize(text)
    # print(sentences)
    '''
    ['DATE OF ADMISSION : MM/DD/YYYY',
     'DATE OF DISCHARGE : MM/DD/YYYY',
     'DISCHARGE DIAGNOSES :',
     '1 . Vasovagal syncope , status post fall .',
     '2 . Traumatic arthritis , right knee .',
     '3 . Hypertension .',
     '4 ]
    '''
    for sentence in sentences:
        sent = clean_text(sentence.rstrip())

        # lowercase
        sent = sent.lower()
        toks = word_tokenize(sent)

        # normalize tokens
        normed_toks = normalize_tokens(toks)
        #for w in normed_toks:
        #    print(w)
        #print()

        tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safe guard against concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        tok_concepts = sorted(tok_concepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % con
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenized_sents[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    # print(tok_concepts)
    # ('treatment', 48, 2, 2), ('treatment', 49, 5, 5)]

    return tokenized_sents, tok_concepts
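# A hedged usage sketch for read_i2b2 above. The file names are hypothetical; the .con
# file is expected to hold i2b2 concept lines of the form matched by concept_regex,
# e.g. c="vasovagal syncope" 4:1 4:2||t="problem".
def _demo_read_i2b2():
    sents, concepts = read_i2b2('patient-13.txt', 'patient-13.con')
    print('%d tokenized sentences' % len(sents))
    for label, lineno, start_tok, end_tok in concepts:
        # line numbers in the concept tuples are 1-based
        span = ' '.join(sents[lineno - 1][start_tok:end_tok + 1])
        print('line %d, tokens %d-%d: %s (%s)' % (lineno, start_tok, end_tok, span, label))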
import json
import sys
from subprocess import check_output
from datetime import datetime as dt

from pymongo import MongoClient
from bson.json_util import dumps

import tools

client = MongoClient()
db = client.wemood

# Record volume for the requested number of seconds via the soundmeter CLI
program_out = check_output(
    ["soundmeter", "--collect", "--seconds", sys.argv[1]])

clean_program_data = tools.clean_text(program_out)
clean_program_data["date"] = unicode(dt.now())
clean_program_data["sensor"] = "volume"

with open('output/output.json', 'w') as outfile:
    json.dump(clean_program_data, outfile)

db.sensors.insert_one(clean_program_data)
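# `tools.clean_text` is not shown in this snippet; the script only assumes it returns a
# dict built from soundmeter's stdout, since extra keys are attached before the insert.
# A hypothetical stand-in illustrating that contract (the "raw" field name is made up):
def _clean_text_sketch(program_out):
    """Illustrative only: wrap raw soundmeter output in a dict."""
    if isinstance(program_out, bytes):
        program_out = program_out.decode('utf-8', 'replace')
    return {"raw": program_out.strip()}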
def readDocs(txt, concept):
    tokenizedSentences = []

    sentTokenize = lambda text: text.split('\n')
    wordTokenize = lambda text: text.split(' ')

    with open(txt) as foo:
        text = foo.read().strip('\n')

    sentences = sentTokenize(text)
    for s in sentences:
        sent = clean_text(s.rstrip())
        sent = sent.lower()
        tokens = wordTokenize(sent)
        normedTokens = normalize_tokens(tokens)
        tokenizedSentences.append(normedTokens)

    tokenizedConcepts = []
    if concept:
        with open(concept) as foo:
            for l in foo.readlines():
                if not l.strip():
                    continue

                conceptRegex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(conceptRegex, l.strip())
                groups = match.groups()

                concept_text = groups[0]
                beginLineNum = int(groups[1])
                beginTokenIndex = int(groups[2])
                lastLineNum = int(groups[3])
                lastTokenIndex = int(groups[4])
                conceptLabel = groups[5]

                assert beginLineNum == lastLineNum, 'concept must span single line'

                tup = (conceptLabel, beginLineNum, beginTokenIndex, lastTokenIndex)
                tokenizedConcepts.append(tup)

        # Remove duplicate entries and sort by (line, token) position
        tokenizedConcepts = list(set(tokenizedConcepts))
        tokenizedConcepts = sorted(tokenizedConcepts, key=lambda t: t[1:])

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tokenizedConcepts) - 1):
            c1 = tokenizedConcepts[i]
            c2 = tokenizedConcepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(concept)
                    error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                    error2 = "It can't be processed until you remove one"
                    error3 = 'Please modify this file: %s' % concept
                    error4 = '\tentity 1: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenizedSentences[c1[1] - 1][c1[2]:c1[3] + 1]),
                        c1[1], c1[2], c1[1], c1[3], c1[0])
                    error5 = '\tentity 2: c="%s" %d:%d %d:%d||t="%s"' % (
                        ' '.join(tokenizedSentences[c2[1] - 1][c2[2]:c2[3] + 1]),
                        c2[1], c2[2], c2[1], c2[3], c2[0])
                    error_msg = '\n\n%s\n%s\n\n%s\n\n%s\n%s\n' % (
                        error1, error2, error3, error4, error5)
                    raise DocumentException(error_msg)

    return tokenizedSentences, tokenizedConcepts
def read_i2b2(txt, con):
    """
    read_i2b2()

    @param txt. A file path for the tokenized medical record
    @param con. A file path for the i2b2 annotated concepts for txt
    """
    tokenized_sents = []

    sent_tokenize = lambda text: text.split('\n')
    word_tokenize = lambda text: text.split(' ')

    # Read in the medical text
    with open(txt) as f:
        # Original text file
        text = f.read().strip('\n')

    # tokenize
    sentences = sent_tokenize(text)
    for sentence in sentences:
        sent = clean_text(sentence.rstrip())

        # lowercase (like word2vec preprocessing)
        sent = sent.lower()
        toks = word_tokenize(sent)

        # normalize tokens
        normed_toks = normalize_tokens(toks)
        #for w in normed_toks:
        #    print w
        #print

        tokenized_sents.append(normed_toks)

    # If an accompanying concept file was specified, read it
    tok_concepts = []
    if con:
        with open(con) as f:
            for line in f.readlines():
                # Empty line
                if not line.strip():
                    continue

                # parse concept line
                concept_regex = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'
                match = re.search(concept_regex, line.strip())
                groups = match.groups()

                # retrieve regex info
                concept_text = groups[0]
                start_lineno = int(groups[1])
                start_tok_ind = int(groups[2])
                end_lineno = int(groups[3])
                end_tok_ind = int(groups[4])
                concept_label = groups[5]

                # pre-process text for error-check
                #matching_line = tokenized_sents[start_lineno-1]
                #matching_toks = matching_line[start_tok_ind:end_tok_ind+1]
                #matching_text = ' '.join(matching_toks).lower()
                #concept_text = ' '.join(word_tokenize(concept_text))

                # error-check info
                assert start_lineno == end_lineno, 'concept must span single line'
                #assert concept_text==matching_text, 'something wrong with inds'

                # add the concept info
                tup = (concept_label, start_lineno, start_tok_ind, end_tok_ind)
                tok_concepts.append(tup)

        # Safe guard against concept file having duplicate entries
        tok_concepts = list(set(tok_concepts))

        # Concept file does not guarantee ordering by line number
        # (Python 2 comparator-based sort)
        tok_concepts = sorted(tok_concepts, cmp=classification_cmp)

        # Ensure no overlapping concepts (that would be bad)
        for i in range(len(tok_concepts) - 1):
            c1 = tok_concepts[i]
            c2 = tok_concepts[i + 1]
            if c1[1] == c2[1]:
                if c1[2] <= c2[2] and c2[2] <= c1[3]:
                    fname = os.path.basename(con)
                    error1 = '%s has overlapping entities on line %d' % (fname, c1[1])
                    error2 = "It can't be processed until you remove one"
                    error_msg = '%s\n%s' % (error1, error2)
                    raise DocumentException(error_msg)

    return tokenized_sents, tok_concepts
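# Standalone sketch showing how the concept regex used by the readers above decomposes
# an i2b2 annotation line. The sample line is illustrative, not taken from real data.
import re

CONCEPT_REGEX = r'^c="(.*)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.*)"$'

sample = 'c="vasovagal syncope" 4:1 4:2||t="problem"'
m = re.search(CONCEPT_REGEX, sample)
concept_text, start_line, start_tok, end_line, end_tok, label = m.groups()
print(concept_text, label, int(start_line), int(start_tok), int(end_tok))
# -> vasovagal syncope problem 4 1 2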