Python preprocess Examples, libs.features.preprocess Python Examples

Example #1

0

Show file

File: segment.py Project: emithongle/AddressSegmentation

def checkTemplate(dct, dft):
    """Tell whether a segmented template survives the enabled removal filters.

    Every filter is switched on or off through the module-level
    ``template_rm_filters`` mapping. The first enabled filter whose rejection
    condition holds makes the template invalid.

    Args:
        dct: Mapping with ``'phone'`` and ``'name'`` entries; the raw text of
            each field is read from its ``'term'`` key. A field is only
            accessed when one of its filters is enabled.
        dft: Feature-name lookup. Currently unused, because the
            ``... in dft`` guard of every filter is disabled; kept so the
            signature stays compatible with existing callers.

    Returns:
        ``False`` if an enabled filter rejects the template, else ``True``.
    """
    # Phone must contain at least 8 digits.
    if template_rm_filters['Phone: #digit < 8']:
        phone = dct['phone']['term']
        if sum(1 for c in phone if c in string.digits) < 8:
            return False

    # Phone must have more than twice as many digits as ASCII letters.
    if template_rm_filters['Phone: 2 * _%ascii < _%digit']:
        phone = dct['phone']['term']
        nascii = sum(1 for c in phone if c in string.ascii_letters)
        ndigit = sum(1 for c in phone if c in string.digits)
        # Both ratios named by the filter share the same denominator
        # (#ascii + #digit + #punctuation), so the numerators are compared
        # directly. This avoids a ZeroDivisionError when the term contains
        # none of those characters; such a term is rejected (0 >= 0).
        if 2 * nascii >= ndigit:
            return False

    # Phone containing letters must mention at least one known phone keyword.
    if template_rm_filters['Phone: _%ascii > 0 & %kwPhone = 0']:
        phone = dct['phone']['term']
        preprocessText = fe.preprocess4GetTerm(phone)
        nascii = sum(1 for c in phone if c in string.ascii_letters)
        # Pad with spaces so keywords only match on whole-word boundaries.
        padded = ' ' + preprocessText + ' '
        phoneTerms = fe.removeDuplicate(
            [term for term in phoneTermSet if (' ' + term + ' ') in padded])
        pctKwPhone = (len(phoneTerms) / len(phoneTermSet)
                      if len(phoneTermSet) else 0)

        if nascii > 0 and pctKwPhone == 0:
            return False

    # Preprocessed phone must start with a digit.
    if template_rm_filters['Phone: first_character_type != digit']:
        preproTerm = fe.preprocess(dct['phone']['term'])
        # An empty term has no first character, so it cannot start with a
        # digit; reject it instead of failing on preproTerm[0].
        if len(preproTerm) == 0 or preproTerm[0] not in string.digits:
            return False

    # Name must contain at least 5 ASCII letters.
    if template_rm_filters['Name: #ascii < 5']:
        name = dct['name']['term']
        if sum(1 for c in name if c in string.ascii_letters) < 5:
            return False

    # Name must not contain any digit.
    if template_rm_filters['Name: _%digit > 0']:
        name = dct['name']['term']
        if sum(1 for c in name if c in string.digits) > 0:
            return False

    # Preprocessed name must start with an ASCII letter.
    if template_rm_filters['Name: first_character_type != ascii']:
        preproTerm = fe.preprocess(dct['name']['term'])
        # Same empty-term guard as for the phone field.
        if len(preproTerm) == 0 or preproTerm[0] not in string.ascii_letters:
            return False

    return True

Example #2

0

Show file

File: classify_data.py Project: emithongle/AD---20160427

def saveFileForReview(termData, models):
    """Assemble a per-term review sheet and pass it to ``saveTermTestResult``.

    Args:
        termData: Mapping with ``'X'`` (items whose first element is the raw
            text) and ``'y'`` (the matching labels).
        models: Mapping with ``'name'``, ``'address'`` and ``'phone'`` entries,
            each exposing ``predict_proba``; also forwarded to ``clasify``.

    The sheet starts with a header row and then holds one row per term: the
    text, its label, the predicted label, an error flag, the preprocessed
    text, the per-model probabilities, the combined probabilities returned by
    ``clasify`` and finally the three feature vectors.
    """
    kinds = ('name', 'address', 'phone')

    def featureColumns(prefix, kind):
        # Only the first element of each feature-list entry is used as the
        # column name.
        return [prefix + entry[0]
                for entry in getFeatureList(featureConfig[kind])]

    ftName = featureColumns('Name_', 'name')
    ftAddress = featureColumns('Address_', 'address')
    ftPhone = featureColumns('Phone_', 'phone')

    header = ['Text', 'Label', 'Predicted', 'Error', 'Preprocessed',
              '%ProbName', '%ProbNotName',
              '%ProbAddress', '%ProbNotAddress',
              '%ProbPhone', '%ProbNotPhone',
              '%PName', '%PAddress', '%PPhone', '%PNothing']
    data = [header + ftName + ftAddress + ftPhone]

    for sample, label in zip(termData['X'], termData['y']):
        text = sample[0]
        cleaned = preprocess(text)
        predicted, combined = clasify(models, cleaned)

        # Features and per-model probabilities, in name/address/phone order.
        feats = {}
        probs = {}
        for kind in kinds:
            feats[kind] = getFeature([cleaned], kind)
            probs[kind] = models[kind].predict_proba(feats[kind])[0].tolist()

        if label != predicted:
            errorFlag = 1
        else:
            errorFlag = 0

        row = [text, label, predicted, errorFlag, cleaned]
        row = row + probs['name'] + probs['address'] + probs['phone']
        row = row + combined.tolist()
        for kind in kinds:
            row = row + feats[kind][0].tolist()
        data.append(row)

    saveTermTestResult({'sheet_1': data})

Example #3

0

Show file

File: test_term_classifier_model.py Project: emithongle/AddressSegmentation

def exc():
    """Classify every stored term, save the result sheet, return the accuracy.

    Terms come from ``store.loadTermData()``. The classifier is loaded from
    the module-level ``file_model`` when that is truthy, otherwise with the
    store's default. The output file name is prefixed with ``file_model`` or,
    without one, with ``timeManage.getTime()``.

    Returns:
        Fraction of terms whose first predicted value equals the label.
    """
    # 5. Test
    rows = store.loadTermData()
    termList = {
        'X': [row[0] for row in rows],
        'y': [int(row[1]) for row in rows],
    }

    print('=======================================================')
    print('=> Term Classifying...')

    loaderArgs = {'file': file_model} if file_model else {}
    clf = store.loadClassifier(**loaderArgs)

    results = []
    for term in termList['X']:
        cleaned = preprocess(term)
        # Features are extracted from the raw term; the preprocessed text is
        # only reported in the output row.
        X = np.asarray([extractFeatureText(term)])
        predicted = clf.predict(X)[0].tolist()
        scores = clf.predict_proba(X)[0].tolist()
        results.append(predicted + scores + ['', cleaned] + X[0].tolist())

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', '', 'Preprocessed_Term']
    titles = titles + feature_names

    hits = 0
    for expected, row in zip(termList['y'], results):
        if expected == row[0]:
            hits += 1
    tacc = hits / len(termList['y'])

    prefix = file_model if file_model else timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)

    return tacc

Example #4

0

Show file

def exc():
    """Classify every stored term, save the result sheet, return the accuracy.

    Terms come from ``store.loadTermData()``. The classifier is loaded from
    the module-level ``file_model`` when that is truthy, otherwise with the
    store's default. The output file name is prefixed with ``file_model`` or,
    without one, with ``timeManage.getTime()``.

    Returns:
        Fraction of terms whose first predicted value equals the label, or
        ``0.0`` when there are no terms to classify.
    """
    # 5. Test
    tmp = store.loadTermData()
    termList = {'X': [i[0] for i in tmp], 'y': [int(i[1]) for i in tmp]}

    print('=======================================================')
    print('=> Term Classifying...')

    if file_model:
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()

    # Fetched once: the same feature names drive extraction and the header.
    featureNames = getFeatureNames()

    results = []
    for term, label in zip(termList['X'], termList['y']):
        preprocessd_term = preprocess(term)
        # Features are extracted from the raw term; the preprocessed text is
        # only reported in the output row.
        X = np.asarray([extractFeatureText(term, featureNames)])
        # Predict once and reuse the result for both the row and the error
        # flag instead of running the model twice on the same input.
        predicted = clf.predict(X)[0].tolist()
        y_hat = predicted[0]
        results.append(predicted + clf.predict_proba(X)[0].tolist() +
                       [1 if (y_hat != label) else 0, preprocessd_term] +
                       X[0].tolist())

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score',
              'Address Score', 'Phone Score', 'Error',
              'Preprocessed_Term'] + featureNames

    total = len(termList['y'])
    hits = sum(1 for y, row in zip(termList['y'], results) if y == row[0])
    # Guard the empty data set so it reports 0.0 instead of dividing by zero.
    tacc = hits / total if total else 0.0

    if file_model:
        prefix = file_model
    else:
        prefix = timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)

    return tacc

Example #5

0

Show file

File: segment.py Project: emithongle/AD---20160408

def templateFiler(clf, ptemplates):
    """Keep the candidate templates whose terms map to distinct fields.

    A candidate is kept when the classes predicted for its terms are exactly
    ``0 .. len(terms) - 1`` (each once) and ``checkTemplate`` accepts it.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidate templates, each a sequence of terms.

    Returns:
        List of dicts sorted by descending ``'score'``. Each dict maps the
        field names ``'name'`` / ``'address'`` / ``'phone'`` to a dict with
        ``'term'``, ``'score'``, ``'preprocessed'`` and ``'features'``, and
        holds under ``'score'`` the sum of the logs of the field scores.
    """
    templates = []

    m = {0: 'name', 1: 'address', 2: 'phone'}
    ft = fe.getFeatureNames()
    dft = {name: index for index, name in enumerate(ft)}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used as-is;
        # otherwise no features would be built and nothing could be
        # classified.
        # NOTE(review): assumes raw terms are the intended fallback - confirm.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # A term that is empty after preprocessing disqualifies the template.
        if any(len(term) == 0 for term in preprocessedTerms):
            continue

        _X = [fe.feature(term, ft) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # Flatten the predictions to one class per term.
        predicted = cls.reshape((1, cls.shape[0])).tolist()[0]
        if sorted(predicted) != list(range(len(terms))):
            continue

        dct = {}
        probs = clf.predict_proba(X)
        for term, cl, prob, prepTerm, _x in zip(terms, cls, probs,
                                                preprocessedTerms, _X):
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': _x
            }
        # Computed before the 'score' key is added, so only fields are summed.
        dct['score'] = sum(log(field['score']) for field in dct.values())
        if checkTemplate(dct, dft):
            templates.append(dct)

    return sorted(templates, key=lambda k: k['score'], reverse=True)

Example #6

0

Show file

File: segment_data.py Project: emithongle/AD---20160427

def _exec():
    """Segment every full address, rank the templates, save them, return acc.

    For each text from ``loadFullAddress()`` the templates produced by
    ``sgm.segmentText`` are classified term by term. A template whose three
    labels, sorted ascending, are exactly ``[0, 1, 2]`` goes to the good
    results; every other template goes to the bad results. Good results are
    ordered by descending total log-score before being checked and saved.

    Returns:
        The ``acc`` value produced by ``checkSegmentResults``.
    """
    kinds = ('name', 'address', 'phone')

    def describe(x, ms):
        # x, then the three per-model probability lists, then the three
        # feature vectors, always in name/address/phone order.
        out = x
        for kind in kinds:
            out = out + ms[kind].predict_proba(getFeature(x, kind))[0].tolist()
        for kind in kinds:
            out = out + getFeature(x, kind)[0].tolist()
        return out

    X, yName, yAddress, yPhone = loadFullAddress()
    models = loadModel()

    goodresults = []
    badresults = []
    for i, x in enumerate(X):
        accepted = []
        rejected = []
        for j, tm in enumerate(sgm.segmentText(x)):
            classified = [clasify(models, tm[k]) for k in range(3)]
            labels = [pair[0] for pair in classified]
            scores = [pair[1] for pair in classified]
            totalScore = (log(scores[0].max()) + log(scores[1].max()) +
                          log(scores[2].max()))

            # (term, label, score) triples ordered by label.
            ranked = sorted(zip(tm, labels, scores),
                            key=lambda entry: entry[1])
            rankedLabels = [entry[1] for entry in ranked]

            row = [i, j] + [entry[0] for entry in ranked] + rankedLabels
            row = row + [ranked[0][2].max(), ranked[1][2].max(),
                         ranked[2][2].max()]
            row = row + [totalScore]
            for term, _label, score in ranked:
                row = row + describe([preprocess(term)], models)
                row = row + score.tolist()

            if rankedLabels == list(range(3)):
                accepted.append(row)
            else:
                rejected.append(row)

        # Column 11 of a row is totalScore.
        goodresults.append(
            sorted(accepted, key=lambda row: row[11], reverse=True))
        badresults.append(rejected)

    acc, ranks, goodresults = checkSegmentResults((yName, yAddress, yPhone),
                                                  goodresults)
    saveTemplateResults({'name': yName, 'address': yAddress, 'phone': yPhone},
                        acc, goodresults, badresults)
    saveHistogramResult(ranks)

    return acc

Example #7

0

Show file

def templateFiler(clf, ptemplates):
    """Keep the candidate templates whose terms map to distinct fields.

    A candidate is kept when the classes predicted for its terms are exactly
    ``0 .. len(terms) - 1``, each occurring once.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidate templates, each a sequence of terms.

    Returns:
        List of dicts sorted by descending ``'score'``. Each dict maps the
        field names ``'name'`` / ``'address'`` / ``'phone'`` to a dict with
        ``'term'``, ``'score'``, ``'preprocessed'`` and ``'features'``, and
        holds under ``'score'`` the sum of the logs of the field scores.
    """
    templates = []

    m = {0: 'name', 1: 'address', 2: 'phone'}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms stand in for the
        # preprocessed ones; an empty list here would silently drop every
        # field from the template when the sequences are zipped below.
        # NOTE(review): assumes raw terms are the intended fallback - confirm.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # NOTE(review): features are built from the raw terms, not from the
        # preprocessed ones - confirm this is intended.
        _X = [fe.feature(term) for term in terms]

        X = np.asarray(_X)
        cls = clf.predict(X)

        # Flatten the predictions to one class per term.
        predicted = cls.reshape((1, cls.shape[0])).tolist()[0]
        if sorted(predicted) != list(range(len(terms))):
            continue

        dct = {}
        probs = clf.predict_proba(X)
        for term, cl, prob, prepTerm, _x in zip(terms, cls, probs,
                                                preprocessedTerms, _X):
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': _x
            }
        # Computed before the 'score' key is added, so only fields are summed.
        dct['score'] = sum(log(field['score']) for field in dct.values())
        templates.append(dct)

    return sorted(templates, key=lambda k: k['score'], reverse=True)

Example #8

0

Show file

File: segment.py Project: emithongle/AddressSegmentation

def templateFiler(clf, ptemplates):
    """Keep the candidate templates whose terms map to distinct fields.

    A candidate is skipped when any of its terms is empty after
    preprocessing, and kept when the classes predicted for its terms are
    exactly ``0 .. len(terms) - 1``, each occurring once.

    Args:
        clf: Classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: Iterable of candidate templates, each a sequence of terms.

    Returns:
        List of dicts sorted by descending ``'score'``. Each dict maps the
        field names ``'name'`` / ``'address'`` / ``'phone'`` to a dict with
        ``'term'``, ``'score'``, ``'preprocessed'`` and ``'features'``, and
        holds under ``'score'`` the sum of the logs of the field scores.
    """
    m = {0: 'name', 1: 'address', 2: 'phone'}
    templates = []

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used as-is;
        # otherwise no features would be built and nothing could be
        # classified.
        # NOTE(review): assumes raw terms are the intended fallback - confirm.
        preprocessedTerms = [
            fe.preprocess(term) if bpreprocessing else term
            for term in terms
        ]

        # A term that is empty after preprocessing disqualifies the template.
        if any(len(term) == 0 for term in preprocessedTerms):
            continue

        _X = [fe.feature(term) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # Flatten the predictions to one class per term.
        predicted = cls.reshape((1, cls.shape[0])).tolist()[0]
        if sorted(predicted) != list(range(len(terms))):
            continue

        probs = clf.predict_proba(X)
        dct = {}
        for term, cl, prob, prepTerm, _x in zip(terms, cls, probs,
                                                preprocessedTerms, _X):
            label = int(cl)
            dct[m[label]] = {'term': term, 'score': prob[label],
                             'preprocessed': prepTerm, 'features': _x}
        # Computed before the 'score' key is added, so only fields are summed.
        dct['score'] = sum(log(field['score']) for field in dct.values())
        templates.append(dct)

    return sorted(templates, key=lambda k: k['score'], reverse=True)

Example #9

0

Show file

File: classify_data.py Project: emithongle/AD---20160427

def _exec():
    """Score the term test set, write the review file, return the score.

    Every term from ``loadTermTest()`` is preprocessed and classified; the
    probability that ``clasify`` reports for its predicted label is
    accumulated and divided by the number of rows in ``termData['X']``.

    NOTE(review): the true label is never compared with the prediction, so
    the returned value is the mean predicted-class probability rather than a
    hit rate - confirm this is intended.

    Returns:
        The accumulated probability divided by ``termData['X'].shape[0]``.
    """
    termData = loadTermTest()
    models = loadModel()

    confidences = []
    for sample, _label in zip(termData['X'], termData['y']):
        predicted, prob = clasify(models, preprocess(sample[0]))
        confidences.append(prob[predicted])

    acc = sum(confidences) / termData['X'].shape[0]

    saveFileForReview(termData, models)

    return acc

Example #10

0

Show file

File: test.py Project: emithongle/AddressSegmentation

# from libs.segment import templateSegment
#
# X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)

# X = templateSegment('a,cb         b', 3)

# from libs.features import feature
#
# X = feature('nguyen thi thanh thuy')
#
# None

# from execute.test_address_segment import exc
#
# exc()
# None

from libs.features import preprocess, feature

# Manual check: preprocess one mixed phone/address sample string and bind its
# feature vector to the module-level name X.
X = feature(preprocess('(+84 )342-1+ Du.ong 16'))
# Bare expression statement with no runtime effect.
# NOTE(review): presumably left as a debugger breakpoint anchor - confirm.
None

Example #11

0

Show file

File: test.py Project: emithongle/AD---20160411

def printData(text):
    """Print ``text``, then one ``name : value`` line per extracted feature.

    Feature names come from ``getFeatureNames()`` and the values from
    ``feature(preprocess(text))``; the two are paired positionally and the
    output ends with a dashed separator line.
    """
    print('Text : ', text)
    names = getFeatureNames()
    values = feature(preprocess(text))
    for pair in zip(names, values):
        print(pair[0], ' : ', pair[1])
    print('---------------------------------')

Example #12

0

Show file

File: segment.py Project: emithongle/AddressSegmentation

def checkTemplate(dct, dft):
    """Tell whether a segmented template survives the enabled removal filters.

    Every filter is switched on or off through the module-level
    ``template_rm_filters`` mapping. The first enabled filter whose rejection
    condition holds makes the template invalid.

    Args:
        dct: Mapping with ``'phone'`` and ``'name'`` entries; the raw text of
            each field is read from its ``'term'`` key. A field is only
            accessed when one of its filters is enabled.
        dft: Feature-name lookup. Currently unused, because the
            ``... in dft`` guard of every filter is disabled; kept so the
            signature stays compatible with existing callers.

    Returns:
        ``False`` if an enabled filter rejects the template, else ``True``.
    """

    def countIn(text, alphabet):
        # Number of characters of ``text`` that belong to ``alphabet``.
        return sum(1 for c in text if c in alphabet)

    def startsWith(text, alphabet):
        # False for an empty ``text`` instead of failing on ``text[0]``.
        return len(text) > 0 and text[0] in alphabet

    if template_rm_filters['Phone: #digit < 8']:
        if countIn(dct['phone']['term'], string.digits) < 8:
            return False

    if template_rm_filters['Phone: 2 * _%ascii < _%digit']:
        nascii = countIn(dct['phone']['term'], string.ascii_letters)
        ndigit = countIn(dct['phone']['term'], string.digits)
        # Both ratios named by the filter share the same denominator
        # (#ascii + #digit + #punctuation), so the numerators are compared
        # directly. This avoids a ZeroDivisionError when the term contains
        # none of those characters; such a term is rejected (0 >= 0).
        if 2 * nascii >= ndigit:
            return False

    if template_rm_filters['Phone: _%ascii > 0 & %kwPhone = 0']:
        preprocessText = fe.preprocess4GetTerm(dct['phone']['term'])
        nascii = countIn(dct['phone']['term'], string.ascii_letters)
        # Pad with spaces so keywords only match on whole-word boundaries.
        haystack = ' ' + preprocessText + ' '
        phoneTerms = fe.removeDuplicate([
            term for term in phoneTermSet
            if (' ' + term + ' ') in haystack
        ])
        pctKwPhone = len(phoneTerms) / len(phoneTermSet) if (
            len(phoneTermSet)) else 0

        if nascii > 0 and pctKwPhone == 0:
            return False

    if template_rm_filters['Phone: first_character_type != digit']:
        preproTerm = fe.preprocess(dct['phone']['term'])
        if not startsWith(preproTerm, string.digits):
            return False

    if template_rm_filters['Name: #ascii < 5']:
        if countIn(dct['name']['term'], string.ascii_letters) < 5:
            return False

    if template_rm_filters['Name: _%digit > 0']:
        if countIn(dct['name']['term'], string.digits) > 0:
            return False

    if template_rm_filters['Name: first_character_type != ascii']:
        preproTerm = fe.preprocess(dct['name']['term'])
        if not startsWith(preproTerm, string.ascii_letters):
            return False

    return True

Example #13

0

Show file

File: test.py Project: emithongle/AddressSegmentation

# from libs.segment import templateSegment
#
# X = templateSegment('81 Duong 16, P. Binh Tri Dong B, Q.Binh Tan, 0909218877, Dinh Thi Bich Phuong', 3)

# X = templateSegment('a,cb         b', 3)

# from libs.features import feature
#
# X = feature('nguyen thi thanh thuy')
#
# None

# from execute.test_address_segment import exc
#
# exc()
# None


from libs.features import preprocess, feature

# Manual check: preprocess one mixed phone/address sample string and bind its
# feature vector to the module-level name X.
X = feature(preprocess('(+84 )342-1+ Du.ong 16'))
# Bare expression statement with no runtime effect.
# NOTE(review): presumably left as a debugger breakpoint anchor - confirm.
None