def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the accepted ones, best first.

    Every candidate in ``ptemplates`` is a sequence of raw terms. Each term is
    preprocessed (when the externally defined ``bpreprocessing`` flag is
    truthy), converted to a feature vector with ``fe.feature`` and classified
    with ``clf``. A candidate is kept only when

    * none of its (preprocessed) terms is empty,
    * the predicted classes are exactly ``0 .. len(terms) - 1``, each once, and
    * ``checkTemplate`` accepts the assembled template.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
            NOTE(review): ``prob[label]`` indexes the probability row by the
            class label itself, which assumes the classifier's classes are
            ``0, 1, 2`` in that column order -- confirm against the trained
            model.
        ptemplates: Iterable of candidate templates, each a sized sequence of
            terms.

    Returns:
        A list of dicts sorted by descending ``'score'``. Each dict maps
        ``'name'`` / ``'address'`` / ``'phone'`` to
        ``{'term', 'score', 'preprocessed', 'features'}`` and has a top-level
        ``'score'`` equal to the sum of ``log`` of the per-term scores.
    """
    label_names = {0: 'name', 1: 'address', 2: 'phone'}

    # Looked up once: the names do not depend on the candidate being scored.
    feature_names = fe.getFeatureNames()
    feature_index = {name: idx for idx, name in enumerate(feature_names)}

    templates = []
    for terms in ptemplates:
        # An empty candidate cannot be classified; skip it rather than hand
        # an empty matrix to the classifier.
        if len(terms) == 0:
            continue

        # With preprocessing disabled, fall back to the raw terms so the
        # classifier still receives one row per term (previously the list
        # stayed empty and nothing usable reached ``clf.predict``).
        if bpreprocessing:
            prepared = [fe.preprocess(term) for term in terms]
        else:
            prepared = list(terms)

        # A term that is empty after preprocessing invalidates the candidate.
        if any(len(term) == 0 for term in prepared):
            continue

        vectors = [fe.feature(term, feature_names) for term in prepared]
        X = np.asarray(vectors)
        classes = clf.predict(X)

        # Keep the candidate only if every class 0..n-1 was predicted once.
        if sorted(classes.reshape(-1).tolist()) != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        template = {}
        for term, cl, prob, prep, vec in zip(terms, classes, probs,
                                             prepared, vectors):
            label = int(cl)
            template[label_names[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prep,
                'features': vec,
            }

        # Sum of log-probabilities; computed before the 'score' key is added.
        template['score'] = sum(
            log(entry['score']) for entry in template.values()
        )

        if checkTemplate(template, feature_index):
            templates.append(template)

    templates.sort(key=lambda t: t['score'], reverse=True)
    return templates
def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the plausible ones, best first.

    Every candidate in ``ptemplates`` is a sequence of raw terms. Each term is
    converted to a feature vector with ``fe.feature`` and classified with
    ``clf``. A candidate is kept only when the predicted classes are exactly
    ``0 .. len(terms) - 1``, each occurring once.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
            NOTE(review): ``prob[label]`` indexes the probability row by the
            class label itself, which assumes the classifier's classes are
            ``0, 1, 2`` in that column order -- confirm against the trained
            model.
        ptemplates: Iterable of candidate templates, each a sized sequence of
            terms.

    Returns:
        A list of dicts sorted by descending ``'score'``. Each dict maps
        ``'name'`` / ``'address'`` / ``'phone'`` to
        ``{'term', 'score', 'preprocessed', 'features'}`` and has a top-level
        ``'score'`` equal to the sum of ``log`` of the per-term scores.
    """
    label_names = {0: 'name', 1: 'address', 2: 'phone'}

    templates = []
    for terms in ptemplates:
        # An empty candidate cannot be classified; skip it rather than hand
        # an empty matrix to the classifier.
        if len(terms) == 0:
            continue

        # With preprocessing disabled, fall back to the raw terms. Previously
        # the list stayed empty, the zip below produced nothing, and a bare
        # ``{'score': 0}`` template was emitted.
        if bpreprocessing:
            prepared = [fe.preprocess(term) for term in terms]
        else:
            prepared = list(terms)

        # NOTE(review): features are built from the raw terms here, not from
        # the preprocessed ones; the preprocessed text is only stored in the
        # result. Kept as-is -- confirm this is intended.
        vectors = [fe.feature(term) for term in terms]
        X = np.asarray(vectors)
        classes = clf.predict(X)

        # Keep the candidate only if every class 0..n-1 was predicted once.
        if sorted(classes.reshape(-1).tolist()) != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        template = {}
        for term, cl, prob, prep, vec in zip(terms, classes, probs,
                                             prepared, vectors):
            label = int(cl)
            template[label_names[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prep,
                'features': vec,
            }

        # Sum of log-probabilities; computed before the 'score' key is added.
        template['score'] = sum(
            log(entry['score']) for entry in template.values()
        )
        templates.append(template)

    templates.sort(key=lambda t: t['score'], reverse=True)
    return templates
def templateFiler(clf, ptemplates):
    """Classify candidate templates and return the plausible ones, best first.

    Every candidate in ``ptemplates`` is a sequence of raw terms. Each term is
    preprocessed (when the externally defined ``bpreprocessing`` flag is
    truthy), converted to a feature vector with ``fe.feature`` and classified
    with ``clf``. A candidate is kept only when none of its (preprocessed)
    terms is empty and the predicted classes are exactly
    ``0 .. len(terms) - 1``, each occurring once.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
            NOTE(review): ``prob[label]`` indexes the probability row by the
            class label itself, which assumes the classifier's classes are
            ``0, 1, 2`` in that column order -- confirm against the trained
            model.
        ptemplates: Iterable of candidate templates, each a sized sequence of
            terms.

    Returns:
        A list of dicts sorted by descending ``'score'``. Each dict maps
        ``'name'`` / ``'address'`` / ``'phone'`` to
        ``{'term', 'score', 'preprocessed', 'features'}`` and has a top-level
        ``'score'`` equal to the sum of ``log`` of the per-term scores.
    """
    label_names = {0: 'name', 1: 'address', 2: 'phone'}

    templates = []
    for terms in ptemplates:
        # An empty candidate cannot be classified; skip it rather than hand
        # an empty matrix to the classifier.
        if len(terms) == 0:
            continue

        # With preprocessing disabled, fall back to the raw terms so the
        # classifier still receives one row per term (previously the list
        # stayed empty and nothing usable reached ``clf.predict``).
        if bpreprocessing:
            prepared = [fe.preprocess(term) for term in terms]
        else:
            prepared = list(terms)

        # A term that is empty after preprocessing invalidates the candidate.
        if any(len(term) == 0 for term in prepared):
            continue

        vectors = [fe.feature(term) for term in prepared]
        X = np.asarray(vectors)
        classes = clf.predict(X)

        # Keep the candidate only if every class 0..n-1 was predicted once.
        if sorted(classes.reshape(-1).tolist()) != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        template = {}
        for term, cl, prob, prep, vec in zip(terms, classes, probs,
                                             prepared, vectors):
            label = int(cl)
            template[label_names[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prep,
                'features': vec,
            }

        # Sum of log-probabilities; computed before the 'score' key is added.
        template['score'] = sum(
            log(entry['score']) for entry in template.values()
        )
        templates.append(template)

    templates.sort(key=lambda t: t['score'], reverse=True)
    return templates
# Scratch script: runs preprocess() and then feature() from libs.features on
# one sample string and binds the result to the module-level name X.
#
# Earlier experiments, left disabled:
#
#   from libs.segment import templateSegment
#   X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)
#   X = templateSegment('a,cb b', 3)
#
#   from libs.features import feature
#   X = feature('nguyen thi thanh thuy')
#
#   from execute.test_address_segment import exc
#   exc()

from libs.features import feature, preprocess

X = feature(preprocess('(+84 )342-1+ Du.ong 16'))
__author__ = 'Thong_Le'

import libs.features as fe

# Run fe.feature on two sample phrases, each padded with one leading and one
# trailing space, and bind the results to test_1 and test_2 (in that order).
test_1, test_2 = (
    fe.feature(' ' + phrase + ' ')
    for phrase in (
        'so le may le nhanh line lines',
        'duong so khu cong nghiep lo',
    )
)
def printData(text):
    """Print ``text`` followed by its features, one ``name : value`` per line.

    The feature values come from ``feature(preprocess(text))`` and are paired
    positionally with ``getFeatureNames()``. A dashed separator line is
    printed at the end.
    """
    print('Text : ', text)

    names = getFeatureNames()
    values = feature(preprocess(text))
    for name, value in zip(names, values):
        print(name, ' : ', value)

    print('---------------------------------')
# Scratch script: runs preprocess() and then feature() from libs.features on
# one sample string and binds the result to the module-level name X.
#
# Earlier experiments, left disabled:
#
#   from libs.segment import templateSegment
#   X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)
#   X = templateSegment('a,cb b', 3)
#
#   from libs.features import feature
#   X = feature('nguyen thi thanh thuy')
#
#   from execute.test_address_segment import exc
#   exc()

from libs.features import feature, preprocess

X = feature(preprocess('(+84 )342-1+ Du.ong 16'))