from Data.DataManager import DataManager
from ScriptToolkit import ScriptToolkit
from Preprocessing.DataReader import DataReader
from Preprocessing import ProcessorFactory
from Model.ConditionalRandomField import CRF

if __name__ == '__main__':
    # Script entry point: configure the data manager, read the source corpus,
    # then run the CRF preprocessing step `cycle_times` times.

    # create data manager
    DM = DataManager()
    DM.change_pwd()
    DM.source_data_file = 'CorpusLabelData_SalesModule.txt'
    # presumably deletes a log file left by an earlier run -- TODO confirm
    DM.remove(DM.log_wrong_sentences)

    # create datums
    DR = DataReader(source_data_file=DM.source_data_file)
    # presumably populates DR.Datums, which the loop below consumes -- TODO confirm
    DR.standard_read()

    # create toolkits
    ST = ScriptToolkit(DM)
    # get_demo_features is called on the class, not on the ST instance.
    features = ScriptToolkit.get_demo_features()

    # analysis
    # Per-iteration result accumulators; nothing is appended to them in the
    # portion of the loop visible here.
    sent_accuracys, train_times, test_times = [], [], []
    cycle_times = 30
    for i in range(cycle_times):
        # data preprocessing
        # A new CRF preprocessor is built through the factory on every
        # iteration, using the file names held by the data manager.
        crf_processor = ProcessorFactory.CRFProcessorFactory().produce(
            source_data_file=DM.source_data_file,
            train_file=DM.train_file,
            test_file=DM.test_file)
        crf_processor.get_train_data(DR.Datums)
Exemple #2
0
# -*- coding:utf-8 -*-
from sys import argv
from Data.DataManager import DataManager
from Scripts.ScriptToolkit import ScriptToolkit
from Preprocessing.DataReader import DataReader
from Preprocessing import ProcessorFactory
from Model.ConditionalRandomField import CRF

if __name__ == '__main__':
    # Script entry point. Two optional command-line flags are recognised; in
    # both cases the value is the argument immediately following the flag:
    #   -source <file>  source corpus file name
    #                   (default: 'CorpusLabelData_MergedFilter.txt')
    #   -iter <n>       number of experiment rounds, parsed with int()
    #                   (default: 10)
    # A flag given as the very last argument raises IndexError, and a
    # non-integer -iter value raises ValueError.

    # create data manager
    DM = DataManager()
    DM.change_pwd()
    if "-source" in argv:
        DM.source_data_file = argv[argv.index("-source") + 1]
    else:
        DM.source_data_file = 'CorpusLabelData_MergedFilter.txt'
    # presumably deletes a log file left by an earlier run -- TODO confirm
    DM.remove(DM.log_wrong_sentences)

    # create datums
    DR = DataReader(source_data_file=DM.source_data_file)
    DR.standard_read()

    # create toolkits
    ST = ScriptToolkit(DM)
    # get_demo_features is called on the class, not on the ST instance.
    features = ScriptToolkit.get_demo_features()

    # analysis
    sent_accuracys, train_times, test_times = [], [], []
    if "-iter" in argv:
        cycle_times = int(argv[argv.index("-iter") + 1])
    else:
        cycle_times = 10
    for i in range(cycle_times):
        # A parenthesised single-argument print emits the same text under
        # Python 2's print statement and Python 3's print function, so the
        # script no longer fails to parse on Python 3.
        print("This is the %d-th experiments." % (i + 1))
from Preprocessing import Preprocessor
from Data.DataManager import DataManager


class Provider(object):
    """Base class for the processor factories defined in this module.

    Subclasses override ``produce``; the base implementation is a no-op.
    """

    def produce(self, **kw):
        """Accept arbitrary keyword arguments, ignore them and return None."""
        return None


class CRFProcessorFactory(Provider):
    """Factory whose ``produce`` builds a ``Preprocessor.CRFPreprocessor``."""

    def produce(self, **kw):
        """Return a ``Preprocessor.CRFPreprocessor`` constructed with ``**kw``.

        All keyword arguments are forwarded unchanged to the constructor.
        """
        processor = Preprocessor.CRFPreprocessor(**kw)
        return processor


class LSTMProcessorFactory(Provider):
    """Factory whose ``produce`` builds a ``Preprocessor.LSTMPreprocessor``."""

    def produce(self, **kw):
        """Return a ``Preprocessor.LSTMPreprocessor`` constructed with ``**kw``.

        All keyword arguments are forwarded unchanged to the constructor.
        """
        processor = Preprocessor.LSTMPreprocessor(**kw)
        return processor


if __name__ == '__main__':
    # Manual demo, executed only when this module is run directly: build a CRF
    # preprocessor through its factory and call its preprocessing steps.
    DM = DataManager()
    DM.change_pwd()
    DM.source_data_file = 'CorpusLabelData_MergedFilter_Update.txt'
    crf_factory = CRFProcessorFactory()
    # produce() forwards these keyword arguments unchanged to
    # Preprocessor.CRFPreprocessor.
    crf_processor = crf_factory.produce(source_data_file=DM.source_data_file,
                                        train_file=DM.train_file,
                                        test_file=DM.test_file)
    crf_processor.preprocess()
    # NOTE(review): get_train_data is called with no arguments here; confirm
    # that its data parameter is optional.
    crf_processor.get_train_data()
if __name__ == '__main__':
    # create data manager
    DM = DataManager()
    DM.change_pwd()
    DM.remove(DM.log_wrong_sentences)

    # create toolkits
    ST = ScriptToolkit(DM)
    features = ScriptToolkit.get_demo_features()

    # analysis
    cycle_times = 1
    sent_accuracys, sent_accuracys_f = [], []
    for i in range(cycle_times):
        DM.source_data_file = 'CorpusLabelData_MergedFilter.txt'  # change source data as old corpus
        # create datums
        DR = DataReader(source_data_file=DM.source_data_file)
        DR.standard_read()

        # data preprocessing
        crf_processor = ProcessorFactory.CRFProcessorFactory().produce(
            source_data_file=DM.source_data_file,
            train_file=DM.train_file,
            test_file=DM.test_file)
        crf_processor.get_train_data(DR.Datums)

        # training and testing
        crf_test = CRF(path_to_jar=DM.path_to_jar,
                       prop_file=DM.prop_file,
                       model_file=DM.model_file,