class Module:
    def __init__(self):
        self._pipeline = Pipeline()
        self._factory = Factory()
        self._execution_params = {}

    def set_execution_params(self, execution_params):
        self._execution_params = execution_params

    def add(self, name, attr):
        item = self._factory.produce(name, attr)
        self._pipeline.add(item)

    def execute(self, data):
        return self._pipeline.execute(data, self._execution_params)
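A minimal usage sketch of the Module facade above, assuming Factory.produce(name, attr) builds a pipeline stage and Pipeline.execute(data, params) runs the stages in order; the stage names and parameter values below are hypothetical, not taken from the original code:

# Hypothetical usage of Module (stage names and params are illustrative only).
module = Module()
module.set_execution_params({"verbose": True})
module.add("tokenizer", {"lowercase": True})    # Factory.produce builds the stage
module.add("stemmer", {"language": "en"})
result = module.execute("some raw input text")  # delegates to Pipeline.execute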
def makeAnalyzer(env):
    pipeline = Pipeline()
    # XXX make a scanner that also validates the character range accepted by Ada?
    # XXX create the phases lazily, since not all of them may be needed.
    phases = [("scanner", ByteScanner()),
              ("lexer", LongestMatchLexer(syntax.definition())),
              ("token-filter", Filter(grammar.ignoredTokens())),
              ("parser", slr.Parser(slr.makeGrammar(grammar.definition()))),
              ("type-constructor", TypeConstructor()),
              ("symbol-collector", SymbolCollector()),
              ]
    for phaseName, phase in phases:
        pipeline.append(phase)
        if env.get(environment.LAST_PHASE) == phaseName:
            return (pipeline, False)
    return (pipeline, True)
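A hedged sketch of how a caller might stop the analyzer early. The contents of env are an assumption (the snippet only shows that env.get(environment.LAST_PHASE) is compared against a phase name), and how the returned pipeline is executed is not shown above, so it is omitted here:

# Hypothetical caller: stop after the "parser" phase.
# env is assumed to behave like a dict keyed by environment.LAST_PHASE.
env = {environment.LAST_PHASE: "parser"}
pipeline, complete = makeAnalyzer(env)
# complete is False here: type-constructor and symbol-collector were never appended.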
if args.r:
    from adapter.repeatingLettersAdapter import RepeatingLettersAdapter
    rpt = RepeatingLettersAdapter(preprocess)
    prefilters.append(rpt)
    print " RepeatingLetters filter."
if args.e:
    from adapter.tagRemover import TagRemover
    tagrm = TagRemover()
    postfilters.append(tagrm)
    print " TagRemover."
print "]"
print ""
print ""

pipeline = Pipeline(tokenizer, tagger, prefilters, postfilters)

file = ["tweeti-b", "tweeti-b.dev"]
if not args.n:
    # Load the standard tweet files.
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    labeled_featuresets = read_tweets_file(trainingfile, pipeline).values()
else:
    # If the not-adapter filter is used, the program has to load the *.conll files instead;
    # the conll files must be in the same dataset path specified by the user.
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    conllfile = map(lambda path: args.datasetpath + path + ".conll", file)
    labeled_featuresets = read_conll_file(trainingfile, conllfile, pipeline).values()

if not args.predict:
def __init__(self):
    Pipeline.__init__(self)
# adjust all paths
originalFile = map(lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv", file)
conllFile = map(lambda path: get_project_dir() + "resources/conll/" + path + ".conll", file)

# timer used to measure elapsed time
timer = Timer()

# classifiers to test
classifiers = {"ShortTextClassifier": ShortTextClassifier(),
               "SVMClassifier": SVMClassifier(),
               "Bayes": BayesianClassifier()}
# classifiers = {"LinearClassifier": LinearClassifier()}
# classifiers = {"Bayes": BayesianClassifier()}

# loading and processing the data set
timer.start()
labeled_featuresets = read_conll_file(originalFile, conllFile, Pipeline()).values()
validator = CrossValidator(labeled_featuresets)
print "Elapsed time for data set processing: %.0fs\n" % (timer.stop() / 1000)

# test the classifiers
for classifierName in classifiers:
    timer.start()
    print "- %s " % classifierName
    (acc, conf_matr, prec, recall, f_measure) = validator.validate(classifiers[classifierName], numOfBins)
    print "Accuracy: %f" % acc
    print "Confusion Matrix:"
    for prec_label in conf_matr:
        for real_label in conf_matr[prec_label]:
            print "\tPredicted: " + prec_label + "\tReal: " + real_label + "\t" + str(conf_matr[prec_label][real_label])
    print "Precision:"
for k in keys:
    steps.append(Imputer(k))
    # steps.append(RobustScaler(k))
    steps.append(Discretizer(k))
    steps.append(RemoveKey(k))

util.string.remove(columns, keys)
print(len(columns), 'remaining attrs')  # TODO update this list
# print(columns)

###############################################################################
# Apply pipeline
###############################################################################
print_primary('\n ----- \n Fit estimator models on training data \n ---- \n')
pipeline = Pipeline(steps, data)

# save data to disk
# with open('data/pipeline.pkl', 'wb') as f:
#     pickle.dump(pipeline, f, pickle.HIGHEST_PROTOCOL)
data.to_csv('data/training_set_VU_DM_clean.csv', sep=';', index=False)
data = None  # clear memory
gc.collect()

# Transform test data
print_primary('\n\n ----- \n Transform test data \n ---- \n\n')
data_test = pd.read_csv('data/test_set_VU_DM.csv', sep=',')
# data_test = pd.read_csv('data/test_set_VU_DM.csv', sep=',', nrows=1000 * 1000)
pipeline.transform(data_test)
# data_test.to_csv('data/test_set_VU_DM_clean.csv', sep=';', index=False)
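The step objects above (Imputer, Discretizer, RemoveKey) are only shown being constructed with a column key and then applied via Pipeline(steps, data) and pipeline.transform(data_test). A minimal sketch of a step compatible with that fit-on-train / transform-on-test pattern follows; the class name, fit/transform methods, and median strategy are assumptions for illustration, not the project's actual Imputer:

import pandas as pd

class MedianImputer:
    """Hypothetical step: fill missing values of one column with the training median."""
    def __init__(self, key):
        self.key = key
        self.median_ = None

    def fit(self, df):
        # learn the statistic on training data only
        self.median_ = df[self.key].median()

    def transform(self, df):
        # reuse the learned statistic on any later frame (e.g. the test set)
        df[self.key] = df[self.key].fillna(self.median_)
        return df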
def __init__(self):
    self._pipeline = Pipeline()
    self._factory = Factory()
    self._execution_params = {}
# classifier
from classifier.shortTextClassifier import ShortTextClassifier
classifier = ShortTextClassifier()

# file paths
originalFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["tweeti-b", "tweeti-b.dev"])
testingFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["twitter-test-input-B"])

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline(tokenizer, tagger, [], [])

# read the training file
labeled_featuresets = read_tweets_file(originalFile, pipeline).values()

# training
classifier.train(labeled_featuresets)

# read the test file
labeled_featuresets_test = read_tweets_file(testingFile, pipeline)
for key in labeled_featuresets_test:
    labeled_featuresets_test[key] = labeled_featuresets_test[key][0]

# classification
labeled_featuresets_test = classifier.classify_dict(labeled_featuresets_test)
print " Elapsed time: %.0fs\n" % (timer.stop() / 1000) """ TEST """ print " ********** TEST 1 *********** " print " stopwords: no" print " url: no" print " punctuation: no" print " repeating letters: no" print " tag removing: no" print " not adapter: no\n" test_pipeline(Pipeline(tokenizer, tagger)) print " ********** TEST 2 *********** " print " stopwords: yes" print " url: no" print " punctuation: no" print " repeating letters: no" print " tag removing: no" print " not adapter: no\n" test_pipeline(Pipeline(tokenizer, tagger, [], [stopwrd])) print " ********** TEST 3 *********** " print " stopwords: yes" print " url: yes" print " punctuation: no"