コード例 #1
0
def exc():
    """Run the term-classification test and save a per-term report.

    Loads the labelled terms through ``store.loadTermData()``, classifies each
    one with the stored classifier (the ``file_model`` model when that setting
    is truthy, otherwise the default one) and passes the per-term rows and the
    overall accuracy to ``store.saveTermTestResults``.

    Returns:
        The fraction of terms whose first predicted value equals the label,
        or 0 when no terms were loaded.
    """
    # 5. Test
    rows = store.loadTermData()
    termList = {'X': [row[0] for row in rows], 'y': [int(row[1]) for row in rows]}

    print('=======================================================')
    print('=> Term Classifying...')

    if file_model:
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()

    # The feature names do not depend on the term, so fetch them once.
    feature_names = getFeatureNames()

    results = []
    for term, label in zip(termList['X'], termList['y']):
        preprocessed_term = preprocess(term)
        X = np.asarray([extractFeatureText(term, feature_names)])
        # Predict once per term and reuse the result for both the row and the
        # error flag instead of running the classifier twice.
        predicted = clf.predict(X)[0].tolist()
        y_hat = predicted[0]
        results.append(predicted + clf.predict_proba(X)[0].tolist() +
                       [1 if y_hat != label else 0, preprocessed_term] + X[0].tolist())

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score', 'Address Score',
              'Phone Score', 'Error', 'Preprocessed_Term'] + feature_names

    correct = sum(1 for label, row in zip(termList['y'], results) if label == row[0])
    # Guard the division so an empty test set reports 0 instead of raising.
    tacc = correct / len(termList['y']) if termList['y'] else 0

    # Result files are prefixed with the model name, or a timestamp without one.
    prefix = file_model if file_model else timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)

    return tacc
コード例 #2
0
def exc(file='testdata.txt', label_file=folder_datasource + '/' + 'fa_labels'):
    """Run the address-segmentation test against labelled data.

    Args:
        file: name of the text file handed to ``store.loadTextData``.
        label_file: path handed to ``store.loadCSV`` to load the labels.

    Returns:
        The accuracy value reported by ``checkAddressSegmentAccuracy``.
    """
    # 5. Test & Labels
    texts = store.loadTextData(file=file)
    expected = store.loadCSV(label_file)

    # Pass the model file only when one is configured.
    parse_args = (texts, file_model) if file_model else (texts,)
    runtime, parsed = sg.parseAddress(*parse_args)

    acc, _ = checkAddressSegmentAccuracy(expected, parsed)

    detail_titles = ['TestCase', 'Top', 'Name', 'Address', 'Phone',
                     'NameScore', 'AddressScore', 'PhoneScore', 'Score']
    # One "Prep*" column followed by the prefixed feature columns per part.
    for prep_title, column_prefix in (('PrepName', 'Name_'),
                                      ('PrepAddress', 'Address_'),
                                      ('PrepPhone', 'Phone_')):
        detail_titles.append(prep_title)
        detail_titles.extend(column_prefix + ft for ft in getFeatureNames())
    titles = [['#', 'Text', 'Runtime'], detail_titles]

    if file_model:
        file_prefix = file_model
    else:
        file_prefix = timeManage.getTime()
    store.saveResults(titles, (texts, runtime), parsed,
                      file=file_prefix + '_' + file_segment_address_result,
                      acc=acc)

    return acc
コード例 #3
0
def exc(file='testdata.txt', label_file=folder_datasource + '/' + 'fa_labels'):
    """Run the address-segmentation test against labelled data.

    Args:
        file: name of the text file handed to ``store.loadTextData``.
        label_file: path handed to ``store.loadCSV`` to load the labels.

    Returns:
        The accuracy value reported by ``checkAddressSegmentAccuracy``, or 0
        when the parser produced no templates at all.
    """
    # 5. Test & Labels
    texts = store.loadTextData(file=file)
    expected = store.loadCSV(label_file)

    # Pass the model file only when one is configured.
    parse_args = (texts, file_model) if file_model else (texts,)
    runtime, parsed = sg.parseAddress(*parse_args)

    # Accuracy is only computed when at least one template was produced.
    if sum(len(candidates) for candidates in parsed) > 0:
        acc, _ = checkAddressSegmentAccuracy(expected, parsed)
    else:
        acc = 0

    detail_titles = ['TestCase', 'Top', 'Name', 'Address', 'Phone',
                     'NameScore', 'AddressScore', 'PhoneScore', 'Score']
    # One "Prep*" column followed by the prefixed feature columns per part.
    for prep_title, column_prefix in (('PrepName', 'Name_'),
                                      ('PrepAddress', 'Address_'),
                                      ('PrepPhone', 'Phone_')):
        detail_titles.append(prep_title)
        detail_titles.extend(column_prefix + ft for ft in getFeatureNames())
    titles = [['#', 'Text', 'Runtime'], detail_titles]

    if file_model:
        file_prefix = file_model
    else:
        file_prefix = timeManage.getTime()
    store.saveResults(titles, (texts, runtime), parsed,
                      file=file_prefix + '_' + file_segment_address_result,
                      acc=acc)

    return acc
コード例 #4
0
ファイル: segment.py プロジェクト: emithongle/AD---20160408
def templateFiler(clf, ptemplates):
    """Classify candidate templates and keep the ones that pass the filters.

    Args:
        clf: classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: iterable of candidate templates, each a sequence of terms.

    Returns:
        A list of dicts ordered by descending ``'score'``. Each dict maps a
        class name ('name', 'address' or 'phone') to a dict with the keys
        'term', 'score', 'preprocessed' and 'features', and carries a
        top-level 'score' equal to the sum of the logs of the per-term scores.
        Only templates accepted by ``checkTemplate`` are included.
    """
    templates = []

    m = {0: 'name', 1: 'address', 2: 'phone'}
    ft = fe.getFeatureNames()
    dft = {name: index for index, name in enumerate(ft)}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used unchanged;
        # leaving the list empty would send an empty array to the classifier.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # Skip templates without terms or with a term that is empty after
        # preprocessing: they cannot be classified.
        if not preprocessedTerms or any(len(term) == 0 for term in preprocessedTerms):
            continue

        # Reuse the feature names fetched above instead of asking per term.
        _X = [fe.feature(term, ft) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # Keep the template only when the predicted classes are exactly
        # 0 .. len(terms) - 1, each occurring once.
        predicted = sorted(cls.reshape((1, cls.shape[0])).tolist()[0])
        if predicted != list(range(len(terms))):
            continue

        dct = {}
        probs = clf.predict_proba(X)
        for term, cl, prob, prepTerm, _x in zip(terms, cls, probs,
                                                preprocessedTerms, _X):
            # A ValueError from int(cl) simply propagates: the former fallback
            # branch re-evaluated the same int(cl) and could never succeed.
            label = int(cl)
            dct[m[label]] = {
                'term': term,
                'score': prob[label],
                'preprocessed': prepTerm,
                'features': _x
            }
        # The sum is evaluated before the 'score' key is added to dct.
        dct['score'] = sum(log(entry['score']) for entry in dct.values())
        if checkTemplate(dct, dft):
            templates.append(dct)

    templates.sort(key=lambda k: k['score'], reverse=True)

    return templates
コード例 #5
0
def exc():
    """Run the term-classification test and save a per-term report.

    Loads the labelled terms through ``store.loadTermData()``, classifies each
    one with the stored classifier (the ``file_model`` model when that setting
    is truthy, otherwise the default one) and passes the per-term rows and the
    overall accuracy to ``store.saveTermTestResults``.

    Returns:
        The fraction of terms whose first predicted value equals the label,
        or 0 when no terms were loaded.
    """
    # 5. Test
    rows = store.loadTermData()
    termList = {'X': [row[0] for row in rows], 'y': [int(row[1]) for row in rows]}

    print('=======================================================')
    print('=> Term Classifying...')

    if file_model:
        clf = store.loadClassifier(file=file_model)
    else:
        clf = store.loadClassifier()

    feature_names = getFeatureNames()

    results = []
    for term, label in zip(termList['X'], termList['y']):
        preprocessed_term = preprocess(term)
        X = np.asarray([extractFeatureText(term, feature_names)])
        # Predict once per term and reuse the result for both the row and the
        # error flag instead of running the classifier twice.
        predicted = clf.predict(X)[0].tolist()
        y_hat = predicted[0]
        results.append(predicted + clf.predict_proba(X)[0].tolist() +
                       [1 if y_hat != label else 0, preprocessed_term] + X[0].tolist())

    titles = ['TestCase', 'Term', 'Label', 'Predicted Label', 'Name Score', 'Address Score',
              'Phone Score', 'Error', 'Preprocessed_Term'] + feature_names

    correct = sum(1 for label, row in zip(termList['y'], results) if label == row[0])
    # Guard the division so an empty test set reports 0 instead of raising.
    tacc = correct / len(termList['y']) if termList['y'] else 0

    # Result files are prefixed with the model name, or a timestamp without one.
    prefix = file_model if file_model else timeManage.getTime()
    store.saveTermTestResults(tacc, titles, termList, results,
                              file=prefix + '_' + file_term_classify_result)

    return tacc
コード例 #6
0
def getModelConfig(testAcc=0, valAcc=0):
    """Collect the current run settings into one configuration dict.

    Args:
        testAcc: value stored under results / 'test-accuracy'.
        valAcc: value stored under results / 'validate-accuracy'.

    Returns:
        A dict with the keys 'name', 'dictionary', 'database',
        'preprocessing', 'features', 'model' and 'results'.
    """
    return {
        # The current time string doubles as the configuration name.
        'name': timeManage.getTime(),
        'dictionary': {'folder': folder_dictionary, 'file': [files_dictionary]},
        'database': {
            'folder': folder_data,
            'name': files_data[0],
            'address': files_data[1],
            'phone': files_data[2],
        },
        'preprocessing': {
            'flag': bpreprocessing,
            'type': list(preprocessing_name.keys()),
        },
        'features': getFeatureNames(),
        'model': {
            'class': model_type,
            'target': model_target,
            'config': model_config,
        },
        'results': {'test-accuracy': testAcc, 'validate-accuracy': valAcc},
    }
コード例 #7
0
ファイル: segment.py プロジェクト: emithongle/AD---20160411
def templateFiler(clf, ptemplates):
    """Classify candidate templates and keep the ones that pass the filters.

    Args:
        clf: classifier exposing ``predict`` and ``predict_proba``.
        ptemplates: iterable of candidate templates, each a sequence of terms.

    Returns:
        A list of dicts ordered by descending ``'score'``. Each dict maps a
        class name ('name', 'address' or 'phone') to a dict with the keys
        'term', 'score', 'preprocessed' and 'features', and carries a
        top-level 'score' equal to the sum of the logs of the per-term scores.
        Only templates accepted by ``checkTemplate`` are included.
    """
    templates = []

    m = {0: 'name', 1: 'address', 2: 'phone'}
    ft = fe.getFeatureNames()
    dft = {name: index for index, name in enumerate(ft)}

    for terms in ptemplates:
        # With preprocessing switched off the raw terms are used unchanged;
        # leaving the list empty would send an empty array to the classifier.
        if bpreprocessing:
            preprocessedTerms = [fe.preprocess(term) for term in terms]
        else:
            preprocessedTerms = list(terms)

        # Skip templates without terms or with a term that is empty after
        # preprocessing: they cannot be classified.
        if not preprocessedTerms or any(len(term) == 0 for term in preprocessedTerms):
            continue

        # Reuse the feature names fetched above instead of asking per term.
        _X = [fe.feature(term, ft) for term in preprocessedTerms]
        X = np.asarray(_X)
        cls = clf.predict(X)

        # Keep the template only when the predicted classes are exactly
        # 0 .. len(terms) - 1, each occurring once.
        predicted = sorted(cls.reshape((1, cls.shape[0])).tolist()[0])
        if predicted != list(range(len(terms))):
            continue

        dct = {}
        probs = clf.predict_proba(X)
        for term, cl, prob, prepTerm, _x in zip(terms, cls, probs, preprocessedTerms, _X):
            # A ValueError from int(cl) simply propagates: the former fallback
            # branch re-evaluated the same int(cl) and could never succeed.
            label = int(cl)
            dct[m[label]] = {'term': term, 'score': prob[label], 'preprocessed': prepTerm, 'features': _x}
        # The sum is evaluated before the 'score' key is added to dct.
        dct['score'] = sum(log(entry['score']) for entry in dct.values())
        if checkTemplate(dct, dft):
            templates.append(dct)

    templates.sort(key=lambda k: k['score'], reverse=True)

    return templates
コード例 #8
0
def exc(file='testdata.txt'):
    """Parse the addresses in a text file and save the segmentation report.

    Args:
        file: name of the text file handed to ``store.loadTextData``.
    """
    # 5. Test
    texts = store.loadTextData(file=file)

    # Pass the model file only when one is configured.
    parse_args = (texts, file_model) if file_model else (texts,)
    runtime, parsed = sg.parseAddress(*parse_args)

    detail_titles = ['TestCase', 'Top', 'Name', 'Address', 'Phone',
                     'NameScore', 'AddressScore', 'PhoneScore', 'Score']
    # One "Prep*" column followed by the prefixed feature columns per part.
    for prep_title, column_prefix in (('PrepName', 'Name_'),
                                      ('PrepAddress', 'Address_'),
                                      ('PrepPhone', 'Phone_')):
        detail_titles.append(prep_title)
        detail_titles.extend(column_prefix + ft for ft in getFeatureNames())
    titles = [['#', 'Text', 'Runtime'], detail_titles]

    if file_model:
        file_prefix = file_model
    else:
        file_prefix = timeManage.getTime()
    store.saveResults(titles, (texts, runtime), parsed,
                      file=file_prefix + '_' + file_segment_address_result)
コード例 #9
0
def exc():
    """Load the raw text data, optionally preprocess it, and save it as CSV."""
    # 1. Read txt data
    raw_data = store.loadTxtData()

    # 2. Preprocessing (the raw data is saved as-is when the flag is off)
    prepared = dataPreprocess(raw_data) if bpreprocessing else raw_data
    store.savePreprocessedDataCSV(prepared, getFeatureNames())
コード例 #10
0
def exc():
    """Load the raw text data, optionally preprocess it, and save it as CSV."""
    # 1. Read txt data
    raw_data = store.loadTxtData()

    # 2. Preprocessing (the raw data is saved as-is when the flag is off)
    prepared = dataPreprocess(raw_data) if bpreprocessing else raw_data
    store.savePreprocessedDataCSV(prepared, getFeatureNames())
コード例 #11
0
def exc(file='testdata.txt'):
    """Parse the addresses in a text file and save the segmentation report.

    Args:
        file: name of the text file handed to ``store.loadTextData``.
    """
    # 5. Test
    texts = store.loadTextData(file=file)

    # Pass the model file only when one is configured.
    parse_args = (texts, file_model) if file_model else (texts,)
    runtime, parsed = sg.parseAddress(*parse_args)

    detail_titles = ['TestCase', 'Top', 'Name', 'Address', 'Phone',
                     'NameScore', 'AddressScore', 'PhoneScore', 'Score']
    # One "Prep*" column followed by the prefixed feature columns per part.
    for prep_title, column_prefix in (('PrepName', 'Name_'),
                                      ('PrepAddress', 'Address_'),
                                      ('PrepPhone', 'Phone_')):
        detail_titles.append(prep_title)
        detail_titles.extend(column_prefix + ft for ft in getFeatureNames())
    titles = [['#', 'Text', 'Runtime'], detail_titles]

    if file_model:
        file_prefix = file_model
    else:
        file_prefix = timeManage.getTime()
    store.saveResults(titles, (texts, runtime), parsed,
                      file=file_prefix + '_' + file_segment_address_result)
コード例 #12
0
def getModelConfig(testAcc=0, valAcc=0):
    """Collect the current run settings into one configuration dict.

    Args:
        testAcc: value stored under results / 'test-accuracy'.
        valAcc: value stored under results / 'validate-accuracy'.

    Returns:
        A dict with the keys 'name', 'dictionary', 'database',
        'preprocessing', 'features', 'model' and 'results'.
    """
    return {
        # The current time string doubles as the configuration name.
        'name': timeManage.getTime(),
        'dictionary': {'folder': folder_datasource, 'file': [files_dictionary]},
        'database': {
            'folder': folder_data,
            'name': files_data[0],
            'address': files_data[1],
            'phone': files_data[2],
        },
        'preprocessing': {
            'flag': bpreprocessing,
            'type': list(preprocessing_name.keys()),
        },
        'features': getFeatureNames(),
        'model': {
            'class': model_type,
            'target': model_target,
            'config': model_config,
        },
        'results': {'test-accuracy': testAcc, 'validate-accuracy': valAcc},
    }
コード例 #13
0
ファイル: run.py プロジェクト: emithongle/AddressSegmentation
    # 2. Extract Features
    extract_feature.exc()

    # 3. Random Data
    random_data.exc()

    # 4. Train Data
    tacc = train.exc()

    # 5. Test
    # 5.1. Test Term Classification
    ttacc = tt.exc()
    # 5.2. Test Address Segmentation
    # (the return value of this step is not used in this version)
    tas.exc()

    # End timestamp in milliseconds since the epoch.
    millis_E = int(round(time.time() * 1000))

    # Append one log row: time string, elapsed seconds (millis_S is set
    # earlier, outside this excerpt), the preprocessing steps whose flag is
    # truthy (empty string when bpreprocessing is falsy), a feature summary,
    # the model type, the model details, and the two values returned by the
    # train and term-test steps.
    logs.append([timeManage.getTime(),
                 (millis_E - millis_S)/1000,
                 ', '.join([key for key in preprocessing_name if preprocessing_name[key]])
                    if bpreprocessing else '',
                 str(len(getFeatureNames())) + ' features: ' + ', '.join(getFeatureNames()),
                 model_type,
                 modelDetails(),
                 tacc,
                 ttacc]
            )

# Write every collected row to a 'logs' worksheet and close the workbook.
workbook = xlsxwriter.Workbook(folder_running_logs + '/' + file_log)
writeSheet(workbook.add_worksheet('logs'), logs)
workbook.close()
コード例 #14
0
def exc():
    """Draw a random training/testing sample and save it as CSV."""
    # 4.1 Random Training and Testing Data
    sample = randomSample()
    store.saveTrainingTestingDataCSV(sample, getFeatureNames())
コード例 #15
0
def exc():
    """Extract features from the stored preprocessed data and save them as CSV."""
    loaded = store.loadPreprocessedDataCSV()

    # 3. Extract features
    extracted = extractFeature(loaded)
    store.saveFeatureCSV(extracted, getFeatureNames())
コード例 #16
0
def exc():
    """Extract features from the stored preprocessed data and save them as CSV."""
    loaded = store.loadPreprocessedDataCSV()

    # 3. Extract features
    extracted = extractFeature(loaded)
    store.saveFeatureCSV(extracted, getFeatureNames())
コード例 #17
0
ファイル: run.py プロジェクト: emithongle/AD---20160331
    # 3. Random Data
    random_data.exc()

    # 4. Train Data
    tacc = train.exc()

    # 5. Test
    # 5.1. Test Term Classification
    ttacc = tt.exc()
    # 5.2. Test Address Segmentation
    acc = tas.exc()

    # End timestamp in milliseconds since the epoch.
    millis_E = int(round(time.time() * 1000))

    # Fetched once so the log row below can report both count and names.
    features = getFeatureNames()
    tfilters = getTemplateRemoveFilters()

    # Append one log row: time string, elapsed seconds (millis_S is set
    # earlier, outside this excerpt), the preprocessing steps whose flag is
    # truthy (empty string when bpreprocessing is falsy), a feature summary,
    # the model type, the model details, the values returned by the train and
    # term-test steps, a filter summary, and the segmentation-test result.
    logs.append([timeManage.getTime(),
                 (millis_E - millis_S)/1000,
                 ', '.join([key for key in preprocessing_name if preprocessing_name[key]])
                    if bpreprocessing else '',
                 str(len(features)) + ' features: ' + ', '.join(features),
                 model_type,
                 modelDetails(),
                 tacc,
                 ttacc,
                 str(len(tfilters)) + ' filters: ' + ', '.join(tfilters),
                 acc]
            )
コード例 #18
0
ファイル: test.py プロジェクト: emithongle/AD---20160411
def printData(text):
    """Print the text, then each feature name next to its value for that text.

    The values come from ``feature(preprocess(text))`` and are paired with the
    names returned by ``getFeatureNames()``.
    """
    print('Text : ', text)
    names = getFeatureNames()
    values = feature(preprocess(text))
    for name, value in zip(names, values):
        print(name, ' : ', value)
    print('---------------------------------')
コード例 #19
0
ファイル: run.py プロジェクト: emithongle/AD---20160408
    extract_feature.exc()

    # 3. Random Data
    random_data.exc()

    # 4. Train Data
    tacc = train.exc()

    # 5. Test
    # 5.1. Test Term Classification
    ttacc = tt.exc()
    # 5.2. Test Address Segmentation
    acc = tas.exc()

    # End timestamp in milliseconds since the epoch.
    millis_E = int(round(time.time() * 1000))

    # Fetched once so the log row below can report both count and names.
    features = getFeatureNames()
    tfilters = getTemplateRemoveFilters()

    # Append one log row: time string, elapsed seconds (millis_S is set
    # earlier, outside this excerpt), the preprocessing steps whose flag is
    # truthy (empty string when bpreprocessing is falsy), a feature summary,
    # the model type, the model details, the values returned by the train and
    # term-test steps, a filter summary, and the segmentation-test result.
    logs.append([
        timeManage.getTime(), (millis_E - millis_S) / 1000, ', '.join(
            [key for key in preprocessing_name
             if preprocessing_name[key]]) if bpreprocessing else '',
        str(len(features)) + ' features: ' + ', '.join(features), model_type,
        modelDetails(), tacc, ttacc,
        str(len(tfilters)) + ' filters: ' + ', '.join(tfilters), acc
    ])

# Write every collected row to a 'logs' worksheet and close the workbook.
workbook = xlsxwriter.Workbook(folder_running_logs + '/' + file_log)
writeSheet(workbook.add_worksheet('logs'), logs)
workbook.close()