def exc():  # 5. Test
    """Run the term-classification test.

    Loads the labelled term data, classifies every term with the stored
    classifier, records one result row per term (prediction, class
    probabilities, error flag, preprocessed term, feature vector) and
    saves the report together with the overall accuracy.

    Returns:
        float: fraction of terms whose predicted label matches the gold label.
    """
    tmp = store.loadTermData()
    termList = {'X': [i[0] for i in tmp], 'y': [int(i[1]) for i in tmp]}
    print('=======================================================')
    print('=> Term Classifying...')
    if (file_model):
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()
    results = []
    # Hoist the loop-invariant feature-name lookup out of the loop.
    ft = getFeatureNames()
    for i in range(len(termList['X'])):
        preprocessd_term = preprocess(termList['X'][i])
        X = np.asarray([extractFeatureText(termList['X'][i], ft)])
        # Predict once and reuse; the original ran clf.predict twice per term.
        pred = clf.predict(X)[0].tolist()
        y_hat = pred[0]
        results.append(pred + clf.predict_proba(X)[0].tolist() +
                       [1 if (y_hat != termList['y'][i]) else 0, preprocessd_term] +
                       X[0].tolist())
    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', 'Error', 'Preprocessed_Term'] + ft
    tacc = sum([1 for (y1, y2) in zip(termList['y'],
                                      [result[0] for result in results])
                if (y1 == y2)]) / len(termList['y'])
    if (file_model):
        store.saveTermTestResults(tacc, titles, termList, results,
                                  file=file_model + '_' + file_term_classify_result)
    else:
        store.saveTermTestResults(tacc, titles, termList, results,
                                  file=timeManage.getTime() + '_' + file_term_classify_result)
    return tacc
def exc(file='testdata.txt', label_file=folder_datasource + '/' + 'fa_labels'):  # 5. Test & Labels
    """Run the labelled address-segmentation test.

    Parses every test text with the segmenter, scores the templates against
    the gold labels and saves a per-case report including the accuracy.

    Args:
        file: name of the test-data file to load.
        label_file: path of the CSV file holding the gold labels.

    Returns:
        segmentation accuracy (0 when the segmenter produced no templates).
    """
    textList = store.loadTextData(file=file)
    labels = store.loadCSV(label_file)
    if (file_model):
        _time, templateList = sg.parseAddress(textList, file_model)
    else:
        _time, templateList = sg.parseAddress(textList)
    # Guard the scorer against an all-empty segmentation result; the sibling
    # implementation of this test does the same and reports zero accuracy.
    if sum(len(tpl) for tpl in templateList) > 0:
        acc, dist = checkAddressSegmentAccuracy(labels, templateList)
    else:
        acc = 0
    titles = [['#', 'Text', 'Runtime'],
              ['TestCase', 'Top', 'Name', 'Address', 'Phone', 'NameScore',
               'AddressScore', 'PhoneScore', 'Score', 'PrepName'] +
              ['Name_' + ft for ft in getFeatureNames()] +
              ['PrepAddress'] + ['Address_' + ft for ft in getFeatureNames()] +
              ['PrepPhone'] + ['Phone_' + ft for ft in getFeatureNames()]]
    if (file_model):
        store.saveResults(titles, (textList, _time), templateList,
                          file=file_model + '_' + file_segment_address_result, acc=acc)
    else:
        store.saveResults(titles, (textList, _time), templateList,
                          file=timeManage.getTime() + '_' + file_segment_address_result, acc=acc)
    return acc
def exc(file='testdata.txt', label_file=folder_datasource + '/' + 'fa_labels'):  # 5. Test & Labels
    """Segment the test texts, score them against the gold labels and save
    a per-case report.

    Args:
        file: test-data file name to load.
        label_file: CSV file containing the gold labels.

    Returns:
        segmentation accuracy (0 when no templates were produced).
    """
    texts = store.loadTextData(file=file)
    gold = store.loadCSV(label_file)
    if (file_model):
        _time, found = sg.parseAddress(texts, file_model)
    else:
        _time, found = sg.parseAddress(texts)
    # Score only when the segmenter produced at least one template.
    acc = 0
    if sum([len(entry) for entry in found]) > 0:
        acc, dist = checkAddressSegmentAccuracy(gold, found)
    per_case = (['TestCase', 'Top', 'Name', 'Address', 'Phone', 'NameScore',
                 'AddressScore', 'PhoneScore', 'Score', 'PrepName']
                + ['Name_' + ft for ft in getFeatureNames()]
                + ['PrepAddress'] + ['Address_' + ft for ft in getFeatureNames()]
                + ['PrepPhone'] + ['Phone_' + ft for ft in getFeatureNames()])
    titles = [['#', 'Text', 'Runtime'], per_case]
    prefix = file_model if file_model else timeManage.getTime()
    store.saveResults(titles, (texts, _time), found,
                      file=prefix + '_' + file_segment_address_result, acc=acc)
    return acc
def templateFiler(clf, ptemplates):
    """Classify candidate templates and keep only the consistent ones.

    Each candidate in *ptemplates* is a sequence of terms.  The terms are
    preprocessed, featurised and classified; a candidate survives only when
    the predicted classes form a permutation of 0..len(terms)-1 (every role
    -- name/address/phone -- appears exactly once) and it passes
    checkTemplate.  Survivors are ranked by the sum of log-probabilities of
    their term classifications, best first.

    Args:
        clf: fitted classifier exposing predict / predict_proba.
        ptemplates: iterable of candidate term sequences.

    Returns:
        list[dict]: accepted templates, highest score first.
    """
    templates = []
    m = {0: 'name', 1: 'address', 2: 'phone'}
    ft = fe.getFeatureNames()
    dft = {key: value for (key, value) in zip(ft, range(len(ft)))}
    for terms in ptemplates:
        preprocessedTerms = []
        for term in terms:
            if (bpreprocessing):
                preprocessedTerms.append(fe.preprocess(term))
        btmp = False
        _X = []
        for term in preprocessedTerms:
            if (len(term) > 0):
                # Reuse the feature-name list hoisted above instead of
                # re-calling fe.getFeatureNames() for every term.
                _X.append(fe.feature(term, ft))
            else:
                # An empty preprocessed term invalidates the whole candidate.
                btmp = True
        if (btmp):
            continue
        X = np.asarray(_X)
        cls = clf.predict(X)
        tmp = copy.deepcopy(cls.reshape((1, cls.shape[0])).tolist()[0])
        tmp.sort()
        # Keep the candidate only when each class 0..n-1 occurs exactly once.
        if (tmp == list(range(len(terms)))):
            dct = {}
            probs = clf.predict_proba(X)
            for (term, cl, prob, prepTerm, _x) in zip(terms, cls, probs,
                                                      preprocessedTerms, _X):
                # NOTE(review): the except branch still calls int(cl) in the
                # key, so a ValueError from int() would re-raise; presumably
                # the fallback targets prob indexing -- confirm intent.
                try:
                    dct[m[int(cl)]] = {'term': term, 'score': prob[int(cl)],
                                       'preprocessed': prepTerm, 'features': _x}
                except ValueError:
                    dct[m[int(cl)]] = {'term': term, 'score': prob[cl],
                                       'preprocessed': prepTerm, 'features': _x}
            # Combined template score: sum of per-role log-probabilities.
            dct['score'] = sum([log(dct[key]['score']) for key in dct])
            if (checkTemplate(dct, dft)):
                templates.append(dct)
    if (len(templates) > 0):
        templates = sorted(templates, key=lambda k: k['score'], reverse=True)
    return templates
def exc():  # 5. Test
    """Run the term-classification test.

    Loads the labelled term data, classifies every term with the stored
    classifier, records one result row per term (prediction, class
    probabilities, error flag, preprocessed term, feature vector) and
    saves the report together with the overall accuracy.

    Returns:
        float: fraction of correctly classified terms.
    """
    tmp = store.loadTermData()
    termList = {'X': [i[0] for i in tmp], 'y': [int(i[1]) for i in tmp]}
    print('=======================================================')
    print('=> Term Classifying...')
    if (file_model):
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()
    results = []
    ft = getFeatureNames()
    for i in range(len(termList['X'])):
        preprocessd_term = preprocess(termList['X'][i])
        X = np.asarray([extractFeatureText(termList['X'][i], ft)])
        # Classify once per term; the original called clf.predict twice.
        pred = clf.predict(X)[0].tolist()
        y_hat = pred[0]
        results.append(pred + clf.predict_proba(X)[0].tolist() +
                       [1 if (y_hat != termList['y'][i]) else 0, preprocessd_term] +
                       X[0].tolist())
    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', 'Error', 'Preprocessed_Term'] + ft
    tacc = sum([1 for (y1, y2) in zip(termList['y'],
                                      [result[0] for result in results])
                if (y1 == y2)]) / len(termList['y'])
    if (file_model):
        store.saveTermTestResults(tacc, titles, termList, results,
                                  file=file_model + '_' + file_term_classify_result)
    else:
        store.saveTermTestResults(tacc, titles, termList, results,
                                  file=timeManage.getTime() + '_' + file_term_classify_result)
    return tacc
def getModelConfig(testAcc=0, valAcc=0):
    """Assemble the configuration/metadata snapshot for the current model run.

    Args:
        testAcc: test-set accuracy to record (default 0).
        valAcc: validation-set accuracy to record (default 0).

    Returns:
        dict: run name, data sources, preprocessing flags, feature names,
        model settings and the two accuracies.
    """
    return {
        'name': timeManage.getTime(),
        'dictionary': {'folder': folder_dictionary, 'file': [files_dictionary]},
        'database': {
            'folder': folder_data,
            'name': files_data[0],
            'address': files_data[1],
            'phone': files_data[2],
        },
        'preprocessing': {
            'flag': bpreprocessing,
            'type': list(preprocessing_name.keys()),
        },
        'features': getFeatureNames(),
        'model': {
            'class': model_type,
            'target': model_target,
            'config': model_config,
        },
        'results': {'test-accuracy': testAcc, 'validate-accuracy': valAcc},
    }
def templateFiler(clf, ptemplates):
    """Filter candidate templates down to internally consistent ones.

    Preprocesses and featurises the terms of each candidate, classifies
    them, and keeps a candidate only when the predicted classes cover the
    roles name/address/phone exactly once and checkTemplate accepts it.
    Accepted templates are sorted by summed log-probability, best first.

    Args:
        clf: fitted classifier exposing predict / predict_proba.
        ptemplates: iterable of candidate term sequences.

    Returns:
        list[dict]: accepted templates in descending score order.
    """
    templates = []
    m = {0: 'name', 1: 'address', 2: 'phone'}
    ft = fe.getFeatureNames()
    dft = {key: value for (key, value) in zip(ft, range(len(ft)))}
    for terms in ptemplates:
        preprocessedTerms = []
        for term in terms:
            if (bpreprocessing):
                preprocessedTerms.append(fe.preprocess(term))
        btmp = False
        _X = []
        for term in preprocessedTerms:
            if (len(term) > 0):
                # Use the feature-name list already fetched above; calling
                # fe.getFeatureNames() per term was loop-invariant waste.
                _X.append(fe.feature(term, ft))
            else:
                # Any empty preprocessed term disqualifies this candidate.
                btmp = True
        if (btmp):
            continue
        X = np.asarray(_X)
        cls = clf.predict(X)
        tmp = copy.deepcopy(cls.reshape((1, cls.shape[0])).tolist()[0])
        tmp.sort()
        # The sorted predictions must equal 0..n-1: one term per role.
        if (tmp == list(range(len(terms)))):
            dct = {}
            probs = clf.predict_proba(X)
            for (term, cl, prob, prepTerm, _x) in zip(terms, cls, probs,
                                                      preprocessedTerms, _X):
                try:
                    dct[m[int(cl)]] = {'term': term, 'score': prob[int(cl)],
                                       'preprocessed': prepTerm, 'features': _x}
                except ValueError:
                    dct[m[int(cl)]] = {'term': term, 'score': prob[cl],
                                       'preprocessed': prepTerm, 'features': _x}
            dct['score'] = sum([log(dct[key]['score']) for key in dct])
            if (checkTemplate(dct, dft)):
                templates.append(dct)
    if (len(templates) > 0):
        templates = sorted(templates, key=lambda k: k['score'], reverse=True)
    return templates
def exc(file='testdata.txt'):  # 5. Test
    """Segment the test texts and save a per-case report.

    Unlabelled variant: no gold labels are loaded and no accuracy is
    computed or returned.

    Args:
        file: test-data file name to load.
    """
    texts = store.loadTextData(file=file)
    if (file_model):
        _time, found = sg.parseAddress(texts, file_model)
    else:
        _time, found = sg.parseAddress(texts)
    per_case = (['TestCase', 'Top', 'Name', 'Address', 'Phone', 'NameScore',
                 'AddressScore', 'PhoneScore', 'Score', 'PrepName']
                + ['Name_' + ft for ft in getFeatureNames()]
                + ['PrepAddress'] + ['Address_' + ft for ft in getFeatureNames()]
                + ['PrepPhone'] + ['Phone_' + ft for ft in getFeatureNames()])
    titles = [['#', 'Text', 'Runtime'], per_case]
    prefix = file_model if file_model else timeManage.getTime()
    store.saveResults(titles, (texts, _time), found,
                      file=prefix + '_' + file_segment_address_result)
def exc():
    """Load the raw txt data, preprocess it when enabled, and save as CSV."""
    # 1. Read txt data
    raw = store.loadTxtData()
    # 2. Preprocessing (pass-through when the preprocessing flag is off)
    prepared = dataPreprocess(raw) if bpreprocessing else raw
    store.savePreprocessedDataCSV(prepared, getFeatureNames())
def getModelConfig(testAcc=0, valAcc=0):
    """Build the metadata dictionary describing the current model run.

    Args:
        testAcc: test-set accuracy to record (default 0).
        valAcc: validation-set accuracy to record (default 0).

    Returns:
        dict: run name, data-source locations, preprocessing setup,
        feature names, model settings and the recorded accuracies.
    """
    config = {'name': timeManage.getTime()}
    config['dictionary'] = {
        'folder': folder_datasource,
        'file': [files_dictionary],
    }
    config['database'] = {
        'folder': folder_data,
        'name': files_data[0],
        'address': files_data[1],
        'phone': files_data[2],
    }
    config['preprocessing'] = {
        'flag': bpreprocessing,
        'type': list(preprocessing_name.keys()),
    }
    config['features'] = getFeatureNames()
    config['model'] = {
        'class': model_type,
        'target': model_target,
        'config': model_config,
    }
    config['results'] = {'test-accuracy': testAcc, 'validate-accuracy': valAcc}
    return config
# 2. Extract Features
extract_feature.exc()
# 3. Random Data
random_data.exc()
# 4. Train Data
tacc = train.exc()
# 5. Test
# 5.1. Test Term Classification
ttacc = tt.exc()
# 5.2. Test Address Segmentation
tas.exc()
millis_E = int(round(time.time() * 1000))
# Hoist the feature-name lookup: it was called twice to build one log cell
# (the sibling run scripts in this file already use this pattern).
features = getFeatureNames()
logs.append([timeManage.getTime(),
             (millis_E - millis_S) / 1000,
             ', '.join([key for key in preprocessing_name
                        if preprocessing_name[key]]) if bpreprocessing else '',
             str(len(features)) + ' features: ' + ', '.join(features),
             model_type,
             modelDetails(),
             tacc,
             ttacc])
# Persist the accumulated log rows into an xlsx workbook.
workbook = xlsxwriter.Workbook(folder_running_logs + '/' + file_log)
writeSheet(workbook.add_worksheet('logs'), logs)
workbook.close()
def exc():
    """Draw the random train/test split (step 4.1) and persist it as CSV."""
    split = randomSample()
    store.saveTrainingTestingDataCSV(split, getFeatureNames())
def exc():
    """Extract features (step 3) from the preprocessed data and save them."""
    rows = store.loadPreprocessedDataCSV()
    features = extractFeature(rows)
    store.saveFeatureCSV(features, getFeatureNames())
# 3. Random Data
random_data.exc()
# 4. Train Data
tacc = train.exc()
# 5. Test
# 5.1. Test Term Classification
ttacc = tt.exc()
# 5.2. Test Address Segmentation
acc = tas.exc()
millis_E = int(round(time.time() * 1000))
features = getFeatureNames()
tfilters = getTemplateRemoveFilters()
# One log row: timestamp, runtime (s), active preprocessing steps, feature
# list, model description, and the recorded accuracies.
prep_desc = ''
if bpreprocessing:
    prep_desc = ', '.join([key for key in preprocessing_name
                           if preprocessing_name[key]])
row = [
    timeManage.getTime(),
    (millis_E - millis_S) / 1000,
    prep_desc,
    str(len(features)) + ' features: ' + ', '.join(features),
    model_type,
    modelDetails(),
    tacc,
    ttacc,
    str(len(tfilters)) + ' filters: ' + ', '.join(tfilters),
    acc,
]
logs.append(row)
def printData(text):
    """Print *text* followed by its feature names and extracted values."""
    print('Text : ', text)
    values = feature(preprocess(text))
    for name, value in zip(getFeatureNames(), values):
        print(name, ' : ', value)
    print('---------------------------------')
extract_feature.exc()
# 3. Random Data
random_data.exc()
# 4. Train Data
tacc = train.exc()
# 5. Test
# 5.1. Test Term Classification
ttacc = tt.exc()
# 5.2. Test Address Segmentation
acc = tas.exc()
millis_E = int(round(time.time() * 1000))
features = getFeatureNames()
tfilters = getTemplateRemoveFilters()
# Describe the active preprocessing steps (empty string when disabled).
prep_desc = (', '.join([k for k in preprocessing_name if preprocessing_name[k]])
             if bpreprocessing else '')
logs.append([
    timeManage.getTime(),
    (millis_E - millis_S) / 1000,
    prep_desc,
    str(len(features)) + ' features: ' + ', '.join(features),
    model_type,
    modelDetails(),
    tacc,
    ttacc,
    str(len(tfilters)) + ' filters: ' + ', '.join(tfilters),
    acc,
])
# Write the collected log rows into a fresh xlsx workbook.
workbook = xlsxwriter.Workbook(folder_running_logs + '/' + file_log)
writeSheet(workbook.add_worksheet('logs'), logs)
workbook.close()