def checkTemplate(dct, dft):
    """Tell whether a classified template survives the removal filters.

    Every filter is switched on or off through the module-level
    ``template_rm_filters`` mapping. The template is rejected as soon as one
    enabled filter matches.

    Args:
        dct: Template mapping. ``dct['phone']['term']`` and
            ``dct['name']['term']`` are read, but only by the filters that
            are enabled.
        dft: Feature-name lookup. Not consulted any more (the ``in dft``
            guards are disabled); kept so existing callers keep working.

    Returns:
        ``False`` if an enabled filter rejects the template, ``True``
        otherwise.
    """

    def _count(text, alphabet):
        # Number of characters of ``text`` that belong to ``alphabet``.
        return sum(1 for c in text if c in alphabet)

    # ---- phone filters ----
    if template_rm_filters['Phone: #digit < 8']:
        if _count(dct['phone']['term'], string.digits) < 8:
            return False

    if template_rm_filters['Phone: 2 * _%ascii < _%digit']:
        phone = dct['phone']['term']
        nascii = _count(phone, string.ascii_letters)
        ndigit = _count(phone, string.digits)
        # Both percentages share the denominator
        # (#ascii + #digit + #punctuation), so comparing the numerators gives
        # the same answer and cannot raise ZeroDivisionError when the term
        # contains none of those characters (such a term is rejected: 0 >= 0).
        if 2 * nascii >= ndigit:
            return False

    if template_rm_filters['Phone: _%ascii > 0 & %kwPhone = 0']:
        phone = dct['phone']['term']
        preprocessText = fe.preprocess4GetTerm(phone)
        nascii = _count(phone, string.ascii_letters)
        # Pad with spaces so keywords only match on whole-word boundaries.
        padded = ' ' + preprocessText + ' '
        phoneTerms = fe.removeDuplicate(
            [term for term in phoneTermSet if (' ' + term + ' ') in padded])
        pctKwPhone = (len(phoneTerms) / len(phoneTermSet)
                      if len(phoneTermSet) else 0)
        if nascii > 0 and pctKwPhone == 0:
            return False

    if template_rm_filters['Phone: first_character_type != digit']:
        preproTerm = fe.preprocess(dct['phone']['term'])
        # An empty preprocessed term has no first character, hence does not
        # start with a digit: reject it instead of raising IndexError.
        if not preproTerm or preproTerm[0] not in string.digits:
            return False

    # ---- name filters ----
    if template_rm_filters['Name: #ascii < 5']:
        if _count(dct['name']['term'], string.ascii_letters) < 5:
            return False

    if template_rm_filters['Name: _%digit > 0']:
        if _count(dct['name']['term'], string.digits) > 0:
            return False

    if template_rm_filters['Name: first_character_type != ascii']:
        preproTerm = fe.preprocess(dct['name']['term'])
        # Same reasoning as for the phone term: empty means "no ascii start".
        if not preproTerm or preproTerm[0] not in string.ascii_letters:
            return False

    return True
def saveFileForReview(termData, models):
    """Assemble a per-term review sheet and pass it to ``saveTermTestResult``.

    The sheet starts with a header row, followed by one row per
    ``(termData['X'], termData['y'])`` pair holding: the raw text, its label,
    the predicted label, an error flag (1 when label and prediction differ),
    the preprocessed text, the ``predict_proba`` output of the name / address
    / phone models, the probabilities returned by ``clasify`` and finally the
    name / address / phone feature values.

    Args:
        termData: Mapping with parallel sequences ``'X'`` (the text is taken
            from ``x[0]`` of each item) and ``'y'`` (labels).
        models: Mapping with the ``'name'``, ``'address'`` and ``'phone'``
            models; also forwarded to ``clasify``.
    """
    kinds = ('name', 'address', 'phone')

    def prefixedFeatureNames(prefix, kind):
        # First field of every configured feature entry, with a column prefix.
        return [prefix + entry[0]
                for entry in getFeatureList(featureConfig[kind])]

    nameColumns = prefixedFeatureNames('Name_', 'name')
    addressColumns = prefixedFeatureNames('Address_', 'address')
    phoneColumns = prefixedFeatureNames('Phone_', 'phone')

    header = ['Text', 'Label', 'Predicted', 'Error', 'Preprocessed']
    header += ['%ProbName', '%ProbNotName']
    header += ['%ProbAddress', '%ProbNotAddress']
    header += ['%ProbPhone', '%ProbNotPhone']
    header += ['%PName', '%PAddress', '%PPhone', '%PNothing']
    header += nameColumns + addressColumns + phoneColumns

    sheet = [header]
    for sample, truth in zip(termData['X'], termData['y']):
        rawText = sample[0]
        cleaned = preprocess(rawText)
        predicted, overall = clasify(models, cleaned)

        modelProbabilities = []
        featureVectors = []
        for kind in kinds:
            vector = getFeature([cleaned], kind)
            modelProbabilities += models[kind].predict_proba(vector)[0].tolist()
            featureVectors.append(vector)

        row = [rawText, truth, predicted, 1 if truth != predicted else 0,
               cleaned]
        row += modelProbabilities
        row += overall.tolist()
        for vector in featureVectors:
            row += vector[0].tolist()
        sheet.append(row)

    saveTermTestResult({'sheet_1': sheet})
def exc():
    """Classify the stored test terms, save a report and return the accuracy.

    Terms and labels come from ``store.loadTermData()``; the classifier is
    loaded from ``file_model`` when that setting is truthy, otherwise the
    store's default classifier is used. One result row is built per term:
    predicted values, class probabilities, an empty placeholder column, the
    preprocessed term and the feature values.

    Returns:
        Fraction of terms whose first predicted value equals the label.
    """
    # 5. Test
    loaded = store.loadTermData()
    terms = [entry[0] for entry in loaded]
    labels = [int(entry[1]) for entry in loaded]
    termList = {'X': terms, 'y': labels}

    print('=======================================================')
    print('=> Term Classifying...')

    clf = (store.loadClassifier(file=file_model) if file_model
           else store.loadClassifier())

    results = []
    for term in terms:
        cleaned = preprocess(term)
        X = np.asarray([extractFeatureText(term)])
        row = clf.predict(X)[0].tolist()
        row += clf.predict_proba(X)[0].tolist()
        row += ['', cleaned]
        row += X[0].tolist()
        results.append(row)

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', '', 'Preprocessed_Term']
    titles = titles + feature_names

    # The first entry of every result row is the predicted label.
    hits = sum(1 for truth, row in zip(labels, results) if truth == row[0])
    tacc = hits / len(labels)

    # Reports are named after the model file, or after the current time when
    # no model file is configured.
    prefix = file_model if file_model else timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)
    return tacc
def exc():
    """Classify the stored test terms, save a report and return the accuracy.

    Terms and labels come from ``store.loadTermData()``; the classifier is
    loaded from ``file_model`` when that setting is truthy, otherwise the
    store's default classifier is used. One result row is built per term:
    predicted values, class probabilities, an error flag (1 when the
    prediction differs from the label), the preprocessed term and the
    feature values.

    Returns:
        Fraction of terms whose first predicted value equals the label.

    Raises:
        ZeroDivisionError: If the loaded term data is empty.
    """
    # 5. Test
    loaded = store.loadTermData()
    terms = [entry[0] for entry in loaded]
    labels = [int(entry[1]) for entry in loaded]
    termList = {'X': terms, 'y': labels}

    print('=======================================================')
    print('=> Term Classifying...')

    if file_model:
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()

    # Loop-invariant: fetch the feature names once instead of per term.
    featureNames = getFeatureNames()

    results = []
    for term, truth in zip(terms, labels):
        cleaned = preprocess(term)
        X = np.asarray([extractFeatureText(term, featureNames)])
        # Run the classifier once and reuse the prediction for both the
        # error flag and the report row.
        predicted = clf.predict(X)[0].tolist()
        y_hat = predicted[0]
        results.append(predicted
                       + clf.predict_proba(X)[0].tolist()
                       + [1 if (y_hat != truth) else 0, cleaned]
                       + X[0].tolist())

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', 'Error',
              'Preprocessed_Term'] + featureNames

    # The first entry of every result row is the predicted label.
    hits = sum(1 for truth, row in zip(labels, results) if truth == row[0])
    tacc = hits / len(labels)

    # Reports are named after the model file, or after the current time when
    # no model file is configured.
    prefix = file_model if file_model else timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)
    return tacc
def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the accepted ones, best first.

    A candidate is kept when the sorted predicted classes of its terms equal
    ``range(len(terms))`` (every class predicted exactly once) and
    ``checkTemplate`` accepts the resulting mapping.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidates, each a sequence of raw terms.

    Returns:
        List of dicts keyed by ``'name'`` / ``'address'`` / ``'phone'`` (each
        holding ``'term'``, ``'score'``, ``'preprocessed'`` and
        ``'features'``) plus a total ``'score'`` (sum of the log scores),
        sorted by that total in descending order.
    """
    templates = []
    m = {0: 'name', 1: 'address', 2: 'phone'}
    # Fetched once and reused for every term below.
    ft = fe.getFeatureNames()
    dft = {name: index for index, name in enumerate(ft)}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used unchanged;
        # previously the list stayed empty in that case and nothing reached
        # the classifier.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # A term that is empty after preprocessing cannot be classified, so
        # the whole candidate is skipped.
        if any(len(term) == 0 for term in preprocessedTerms):
            continue

        _X = [fe.feature(term, ft) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # tolist() already yields a fresh list, so sorting it is safe.
        predicted = sorted(cls.reshape((1, cls.shape[0])).tolist()[0])
        if predicted != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        dct = {}
        for term, cl, prob, prepTerm, features in zip(
                terms, cls, probs, preprocessedTerms, _X):
            # The former ``except ValueError`` fallback repeated the very
            # int(cl) conversion that had failed, so it could never succeed;
            # a bad class value now simply propagates its ValueError.
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': features,
            }
        # Computed before the 'score' key is added, so only the per-class
        # entries take part in the sum.
        dct['score'] = sum(log(entry['score']) for entry in dct.values())

        if checkTemplate(dct, dft):
            templates.append(dct)

    templates.sort(key=lambda k: k['score'], reverse=True)
    return templates
def _exec():
    """Evaluate address segmentation on the full-address set and save reports.

    Every input text is segmented into candidate templates; the first three
    terms of each candidate are classified and turned into a report row. Rows
    whose labels, sorted, equal ``[0, 1, 2]`` are "good", all others "bad".
    Good rows of one input are ordered by total log score, best first.

    Returns:
        The accuracy reported by ``checkSegmentResults``.
    """
    kinds = ('name', 'address', 'phone')
    # Row layout: [i, j] + 3 terms + 3 labels + 3 max scores + [totalScore].
    totalScoreColumn = 11

    def describe(batch, ms):
        # Review columns for a one-element batch: the text itself, each
        # model's predict_proba output, then each feature vector. Features
        # are extracted once per kind and reused for both parts.
        vectors = [getFeature(batch, kind) for kind in kinds]
        columns = list(batch)
        for kind, vector in zip(kinds, vectors):
            columns += ms[kind].predict_proba(vector)[0].tolist()
        for vector in vectors:
            columns += vector[0].tolist()
        return columns

    X, yName, yAddress, yPhone = loadFullAddress()
    models = loadModel()
    goodresults = []
    badresults = []

    for i, x in enumerate(X):
        good = []
        bad = []
        for j, tm in enumerate(sgm.segmentText(x)):
            label_0, score_0 = clasify(models, tm[0])
            label_1, score_1 = clasify(models, tm[1])
            label_2, score_2 = clasify(models, tm[2])
            totalScore = (log(score_0.max()) + log(score_1.max())
                          + log(score_2.max()))

            # (term, label, scores) triples ordered by predicted label.
            typleInfo = sorted(
                zip(tm, [label_0, label_1, label_2],
                    [score_0, score_1, score_2]),
                key=lambda triple: triple[1])
            labels = [triple[1] for triple in typleInfo]

            row = [i, j]
            row += [triple[0] for triple in typleInfo]
            row += labels
            row += [triple[2].max() for triple in typleInfo]
            row += [totalScore]
            for term, _, scores in typleInfo:
                row += describe([preprocess(term)], models)
                row += scores.tolist()

            if labels == list(range(3)):
                good.append(row)
            else:
                bad.append(row)

        goodresults.append(
            sorted(good, key=lambda r: r[totalScoreColumn], reverse=True))
        badresults.append(bad)

    acc, ranks, goodresults = checkSegmentResults(
        (yName, yAddress, yPhone), goodresults)
    saveTemplateResults(
        {'name': yName, 'address': yAddress, 'phone': yPhone},
        acc, goodresults, badresults)
    saveHistogramResult(ranks)
    return acc
def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the plausible ones, best first.

    A candidate is kept when the sorted predicted classes of its terms equal
    ``range(len(terms))`` (every class predicted exactly once). Features are
    computed from the raw terms.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidates, each a sequence of raw terms.

    Returns:
        List of dicts keyed by ``'name'`` / ``'address'`` / ``'phone'`` (each
        holding ``'term'``, ``'score'``, ``'preprocessed'`` and
        ``'features'``) plus a total ``'score'`` (sum of the log scores),
        sorted by that total in descending order.
    """
    templates = []
    m = {0: 'name', 1: 'address', 2: 'phone'}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are recorded as the
        # "preprocessed" form; previously the list stayed empty in that case,
        # the zip below produced nothing and a template holding only
        # {'score': 0} was emitted.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        _X = [fe.feature(term) for term in terms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # tolist() already yields a fresh list, so sorting it is safe.
        predicted = sorted(cls.reshape((1, cls.shape[0])).tolist()[0])
        if predicted != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        dct = {}
        for term, cl, prob, prepTerm, features in zip(
                terms, cls, probs, preprocessedTerms, _X):
            # The former ``except ValueError`` fallback repeated the very
            # int(cl) conversion that had failed, so it could never succeed;
            # a bad class value now simply propagates its ValueError.
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': features,
            }
        # Computed before the 'score' key is added, so only the per-class
        # entries take part in the sum.
        dct['score'] = sum(log(entry['score']) for entry in dct.values())
        templates.append(dct)

    templates.sort(key=lambda k: k['score'], reverse=True)
    return templates
def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the plausible ones, best first.

    A candidate is kept when none of its terms is empty after preprocessing
    and the sorted predicted classes of its terms equal
    ``range(len(terms))`` (every class predicted exactly once).

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidates, each a sequence of raw terms.

    Returns:
        List of dicts keyed by ``'name'`` / ``'address'`` / ``'phone'`` (each
        holding ``'term'``, ``'score'``, ``'preprocessed'`` and
        ``'features'``) plus a total ``'score'`` (sum of the log scores),
        sorted by that total in descending order.
    """
    templates = []
    m = {0: 'name', 1: 'address', 2: 'phone'}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used unchanged;
        # previously the list stayed empty in that case and nothing reached
        # the classifier.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # A term that is empty after preprocessing cannot be classified, so
        # the whole candidate is skipped.
        if any(len(term) == 0 for term in preprocessedTerms):
            continue

        _X = [fe.feature(term) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # tolist() already yields a fresh list, so sorting it is safe.
        predicted = sorted(cls.reshape((1, cls.shape[0])).tolist()[0])
        if predicted != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        dct = {}
        for term, cl, prob, prepTerm, features in zip(
                terms, cls, probs, preprocessedTerms, _X):
            # The former ``except ValueError`` fallback repeated the very
            # int(cl) conversion that had failed, so it could never succeed;
            # a bad class value now simply propagates its ValueError.
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': features,
            }
        # Computed before the 'score' key is added, so only the per-class
        # entries take part in the sum.
        dct['score'] = sum(log(entry['score']) for entry in dct.values())
        templates.append(dct)

    templates.sort(key=lambda k: k['score'], reverse=True)
    return templates
def _exec():
    """Score the term classifier on the term test set and save a review file.

    For every test item the preprocessed text (``x[0]``) is classified and
    the probability of the predicted label is accumulated; the total is then
    divided by ``termData['X'].shape[0]``.

    Returns:
        The mean probability the classifier assigned to its own prediction.
    """
    termData = loadTermTest()
    models = loadModel()

    # NOTE(review): the true label from termData['y'] is unpacked but never
    # compared with the prediction, so this is a mean confidence rather than
    # a hit rate -- confirm that this is the intended metric.
    confidenceSum = 0
    for sample, _truth in zip(termData['X'], termData['y']):
        cleaned = preprocess(sample[0])
        predicted, probabilities = clasify(models, cleaned)
        confidenceSum += probabilities[predicted]

    sampleCount = termData['X'].shape[0]
    acc = confidenceSum / sampleCount

    saveFileForReview(termData, models)
    return acc
# Scratch script for trying the segmentation / feature helpers by hand.
#
# Earlier experiments, left disabled:
#
#   from libs.segment import templateSegment
#   X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)
#   X = templateSegment('a,cb b', 3)
#
#   from libs.features import feature
#   X = feature('nguyen thi thanh thuy')
#
#   from execute.test_address_segment import exc
#   exc()

from libs.features import preprocess, feature

# Output of the feature extractor for one preprocessed sample string.
X = feature(
    preprocess('(+84 )342-1+ Du.ong 16')
)
def printData(text):
    """Print ``text`` and then one ``name : value`` line per feature.

    Feature names come from ``getFeatureNames()`` and the values from
    ``feature(preprocess(text))``; the two are paired positionally. A dashed
    separator line closes the output.
    """
    print('Text : ', text)

    names = getFeatureNames()
    values = feature(preprocess(text))
    for name, value in zip(names, values):
        print(name, ' : ', value)

    print('---------------------------------')
def checkTemplate(dct, dft):
    """Tell whether a classified template survives the removal filters.

    Every filter is switched on or off through the module-level
    ``template_rm_filters`` mapping. The template is rejected as soon as one
    enabled filter matches.

    Args:
        dct: Template mapping. ``dct['phone']['term']`` and
            ``dct['name']['term']`` are read, but only by the filters that
            are enabled.
        dft: Feature-name lookup. Not consulted any more (the ``in dft``
            guards are disabled); kept so existing callers keep working.

    Returns:
        ``False`` if an enabled filter rejects the template, ``True``
        otherwise.
    """

    def _count(text, alphabet):
        # Number of characters of ``text`` that belong to ``alphabet``.
        return sum(1 for c in text if c in alphabet)

    # ---- phone filters ----
    if template_rm_filters['Phone: #digit < 8']:
        if _count(dct['phone']['term'], string.digits) < 8:
            return False

    if template_rm_filters['Phone: 2 * _%ascii < _%digit']:
        phone = dct['phone']['term']
        nascii = _count(phone, string.ascii_letters)
        ndigit = _count(phone, string.digits)
        # Both percentages share the denominator
        # (#ascii + #digit + #punctuation), so comparing the numerators gives
        # the same answer and cannot raise ZeroDivisionError when the term
        # contains none of those characters (such a term is rejected: 0 >= 0).
        if 2 * nascii >= ndigit:
            return False

    if template_rm_filters['Phone: _%ascii > 0 & %kwPhone = 0']:
        phone = dct['phone']['term']
        preprocessText = fe.preprocess4GetTerm(phone)
        nascii = _count(phone, string.ascii_letters)
        # Pad with spaces so keywords only match on whole-word boundaries.
        padded = ' ' + preprocessText + ' '
        phoneTerms = fe.removeDuplicate(
            [term for term in phoneTermSet if (' ' + term + ' ') in padded])
        pctKwPhone = (len(phoneTerms) / len(phoneTermSet)
                      if len(phoneTermSet) else 0)
        if nascii > 0 and pctKwPhone == 0:
            return False

    if template_rm_filters['Phone: first_character_type != digit']:
        preproTerm = fe.preprocess(dct['phone']['term'])
        # An empty preprocessed term has no first character, hence does not
        # start with a digit: reject it instead of raising IndexError.
        if not preproTerm or preproTerm[0] not in string.digits:
            return False

    # ---- name filters ----
    if template_rm_filters['Name: #ascii < 5']:
        if _count(dct['name']['term'], string.ascii_letters) < 5:
            return False

    if template_rm_filters['Name: _%digit > 0']:
        if _count(dct['name']['term'], string.digits) > 0:
            return False

    if template_rm_filters['Name: first_character_type != ascii']:
        preproTerm = fe.preprocess(dct['name']['term'])
        # Same reasoning as for the phone term: empty means "no ascii start".
        if not preproTerm or preproTerm[0] not in string.ascii_letters:
            return False

    return True
# Scratch script for trying the segmentation / feature helpers by hand.
#
# Earlier experiments, left disabled:
#
#   from libs.segment import templateSegment
#   X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)
#   X = templateSegment('a,cb b', 3)
#
#   from libs.features import feature
#   X = feature('nguyen thi thanh thuy')
#
#   from execute.test_address_segment import exc
#   exc()

from libs.features import preprocess, feature

# Output of the feature extractor for one preprocessed sample string.
X = feature(
    preprocess('(+84 )342-1+ Du.ong 16')
)