# Experiment script: configure the DataManager for the
# 'CorpusLabelData_SalesModule.txt' corpus, read it once with DataReader,
# then run `cycle_times` CRF preprocessing cycles over the same datums.
from Data.DataManager import DataManager
from ScriptToolkit import ScriptToolkit
from Preprocessing.DataReader import DataReader
from Preprocessing import ProcessorFactory
from Model.ConditionalRandomField import CRF

if __name__ == '__main__':
    # Data manager: holds the file paths used throughout the script.
    DM = DataManager()
    DM.change_pwd()
    DM.source_data_file = 'CorpusLabelData_SalesModule.txt'
    DM.remove(DM.log_wrong_sentences)

    # Datums: the corpus is read a single time, outside the cycle loop.
    # NOTE(review): presumably standard_read() fills DR.Datums -- confirm.
    DR = DataReader(source_data_file=DM.source_data_file)
    DR.standard_read()

    # Toolkit instance plus the demo feature set.
    ST = ScriptToolkit(DM)
    features = ScriptToolkit.get_demo_features()

    # Per-cycle result accumulators (three independent lists).
    sent_accuracys = []
    train_times = []
    test_times = []
    cycle_times = 30

    for i in range(cycle_times):
        # Data preprocessing: a fresh CRF preprocessor is built every cycle.
        crf_factory = ProcessorFactory.CRFProcessorFactory()
        crf_processor = crf_factory.produce(
            source_data_file=DM.source_data_file,
            train_file=DM.train_file,
            test_file=DM.test_file)
        crf_processor.get_train_data(DR.Datums)
# -*- coding:utf-8 -*-
# Experiment script with two optional command-line options:
#   -source <file>   corpus file name (default 'CorpusLabelData_MergedFilter.txt')
#   -iter <n>        number of experiment cycles (default 10)
from sys import argv

from Data.DataManager import DataManager
from Scripts.ScriptToolkit import ScriptToolkit
from Preprocessing.DataReader import DataReader
from Preprocessing import ProcessorFactory
from Model.ConditionalRandomField import CRF

if __name__ == '__main__':
    # Data manager: holds the file paths used throughout the script.
    DM = DataManager()
    DM.change_pwd()
    # The value of an option is the argv entry immediately after its flag.
    if "-source" in argv:
        DM.source_data_file = argv[argv.index("-source") + 1]
    else:
        DM.source_data_file = 'CorpusLabelData_MergedFilter.txt'
    DM.remove(DM.log_wrong_sentences)

    # Datums: the corpus is read a single time, before the cycle loop.
    DR = DataReader(source_data_file=DM.source_data_file)
    DR.standard_read()

    # Toolkit instance plus the demo feature set.
    ST = ScriptToolkit(DM)
    features = ScriptToolkit.get_demo_features()

    # Per-cycle result accumulators (three independent lists).
    sent_accuracys = []
    train_times = []
    test_times = []

    if "-iter" in argv:
        cycle_times = int(argv[argv.index("-iter") + 1])
    else:
        cycle_times = 10

    for i in range(cycle_times):
        # Parenthesised single argument: prints the same text under the
        # Python 2 print statement and also parses as a Python 3 call.
        print("This is the %d-th experiments." % (i + 1))
# Factory classes that build preprocessor objects from keyword arguments.
from Preprocessing import Preprocessor
from Data.DataManager import DataManager


class Provider(object):
    """Base interface for preprocessor factories.

    Subclasses override :meth:`produce` to build a concrete preprocessor.
    """

    def produce(self, **kw):
        """Build a preprocessor from keyword arguments.

        Raises:
            NotImplementedError: always; the base class builds nothing.
                Failing here makes a missing override visible at the call
                site instead of silently handing back ``None``.
        """
        raise NotImplementedError(
            "%s must override produce()" % type(self).__name__)


class CRFProcessorFactory(Provider):
    """Factory for ``Preprocessor.CRFPreprocessor``."""

    def produce(self, **kw):
        """Return ``Preprocessor.CRFPreprocessor(**kw)``."""
        return Preprocessor.CRFPreprocessor(**kw)


class LSTMProcessorFactory(Provider):
    """Factory for ``Preprocessor.LSTMPreprocessor``."""

    def produce(self, **kw):
        """Return ``Preprocessor.LSTMPreprocessor(**kw)``."""
        return Preprocessor.LSTMPreprocessor(**kw)


if __name__ == '__main__':
    # Manual demo: build a CRF preprocessor for one corpus file and run it.
    DM = DataManager()
    DM.change_pwd()
    DM.source_data_file = 'CorpusLabelData_MergedFilter_Update.txt'

    crf_factory = CRFProcessorFactory()
    crf_processor = crf_factory.produce(source_data_file=DM.source_data_file,
                                        train_file=DM.train_file,
                                        test_file=DM.test_file)
    crf_processor.preprocess()
    crf_processor.get_train_data()
if __name__ == '__main__': # create data manager DM = DataManager() DM.change_pwd() DM.remove(DM.log_wrong_sentences) # create toolkits ST = ScriptToolkit(DM) features = ScriptToolkit.get_demo_features() # analysis cycle_times = 1 sent_accuracys, sent_accuracys_f = [], [] for i in range(cycle_times): DM.source_data_file = 'CorpusLabelData_MergedFilter.txt' # change source data as old corpus # create datums DR = DataReader(source_data_file=DM.source_data_file) DR.standard_read() # data preprocessing crf_processor = ProcessorFactory.CRFProcessorFactory().produce( source_data_file=DM.source_data_file, train_file=DM.train_file, test_file=DM.test_file) crf_processor.get_train_data(DR.Datums) # training and testing crf_test = CRF(path_to_jar=DM.path_to_jar, prop_file=DM.prop_file, model_file=DM.model_file,