Example #1
class Module:
    def __init__(self):
        self._pipeline = Pipeline()
        self._factory = Factory()
        self._execution_params = {}

    def set_execution_params(self, execution_params):
        self._execution_params = execution_params

    def add(self, name, attr):
        item = self._factory.produce(name, attr)
        self._pipeline.add(item)

    def execute(self, data):
        return self._pipeline.execute(data, self._execution_params)
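
The class above is a small facade: Factory.produce builds a pipeline item from a name and its attributes, Pipeline stores the items, and execute runs them with shared execution parameters. A minimal usage sketch, assuming the Module class above is importable; the step name, its attributes, the parameter dict and the input data are hypothetical placeholders:

module = Module()
module.set_execution_params({"verbose": True})        # hypothetical parameters
module.add("normalizer", {"lowercase": True})         # Factory builds the item, Pipeline stores it
result = module.execute(["some", "raw", "input"])     # delegates to Pipeline.execute(data, params)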
Example #2
def makeAnalyzer(env):
   pipeline = Pipeline()

   # XXX make a scanner that also validates the character range accepted by Ada?
   # XXX create the phases lazily, in case not all of them are needed.
   phases = [("scanner", ByteScanner()),
             ("lexer", LongestMatchLexer(syntax.definition())),
             ("token-filter", Filter(grammar.ignoredTokens())),
             ("parser", slr.Parser(slr.makeGrammar(grammar.definition()))),
             ("type-constructor", TypeConstructor()),
             ("symbol-collector", SymbolCollector()),
   ]

   for phaseName, phase in phases:
      pipeline.append(phase)
      if env.get(environment.LAST_PHASE) == phaseName:
         return (pipeline, False)

   return (pipeline, True)
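
makeAnalyzer appends the phases in order and stops early when the environment names a last phase, returning a flag that tells the caller whether the whole pipeline was built. A short usage sketch, assuming makeAnalyzer and the environment module above are importable; the contents of env are hypothetical:

env = {environment.LAST_PHASE: "parser"}   # hypothetical: stop after the parser phase
pipeline, completed = makeAnalyzer(env)
if not completed:
    print("pipeline was cut off at the requested last phase")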
Example #3
if (args.r):
    from adapter.repeatingLettersAdapter import RepeatingLettersAdapter
    rpt = RepeatingLettersAdapter(preprocess)
    prefilters.append(rpt)
    print "		RepeatingLetters filter."
if (args.e):
    from adapter.tagRemover import TagRemover
    tagrm = TagRemover()
    postfilters.append(tagrm)
    print "		TagRemover."

print "]"
print ""
print ""

pipeline = Pipeline(tokenizer, tagger, prefilters, postfilters)

file = ["tweeti-b", "tweeti-b.dev"]
if not args.n:
    # Load standard tweet file
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    labeled_featuresets = read_tweets_file(trainingfile, pipeline).values()
else:
    # If the not-adapter filter has to be used, the program has to load the *.conll files instead;
    # the .conll files must be in the same dataset path specified by the user.
    trainingfile = map(lambda path: args.datasetpath + path + ".tsv", file)
    conllfile = map(lambda path: args.datasetpath + path + ".conll", file)
    labeled_featuresets = read_conll_file(trainingfile, conllfile,
                                          pipeline).values()

if not args.predict:
Example #4
def __init__(self):
    Pipeline.__init__(self)
Example #5
# adjust all paths
originalFile = map(lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv", file)
conllFile = map(lambda path: get_project_dir() + "resources/conll/" + path + ".conll", file)

# timer used for timing
timer = Timer()

# classifiers to test
classifiers = {"ShortTextClassifier": ShortTextClassifier(), "SVMClassifier": SVMClassifier(), "Bayes": BayesianClassifier()}
#classifiers = {"LinearClassifier": LinearClassifier()}

#classifiers = {"Bayes": BayesianClassifier()}
# loading and processing data set
timer.start()
labeled_featuresets = read_conll_file(originalFile, conllFile, Pipeline()).values()

validator = CrossValidator(labeled_featuresets)
print "Elapsed time for data set processing: %.0fs\n" % (timer.stop()/1000)

# test the classifiers
for classifierName in classifiers:
	timer.start()
	print "- %s " % classifierName
	(acc, conf_matr, prec, recall, f_measure) = validator.validate(classifiers[classifierName], numOfBins)
	print "Accuracy:		%f" % acc
	print "Confusion Matrix:"
	for prec_label in conf_matr:
		for real_label in conf_matr[prec_label]:
			print "\tPredicted: "+prec_label + "\tReal: "+ real_label +"\t"+ str(conf_matr[prec_label][real_label])
	print "Precision:"
Example #6
for k in keys:
    steps.append(Imputer(k))
    # steps.append(RobustScaler(k))
    steps.append(Discretizer(k))
    steps.append(RemoveKey(k))
util.string.remove(columns, keys)

print(len(columns), 'remaining attrs')  # TODO update this list
# print(columns)

###############################################################################
# Apply pipeline
###############################################################################

print_primary('\n ----- \n Fit estimator models on training data \n ---- \n')
pipeline = Pipeline(steps, data)
# save data to disk
# with open('data/pipeline.pkl', 'wb') as f:
# pickle.dump(pipeline, f, pickle.HIGHEST_PROTOCOL)
data.to_csv('data/training_set_VU_DM_clean.csv', sep=';', index=False)
data = None
# clear memory
gc.collect()

# Transform test data
print_primary('\n\n ----- \n Transform test data \n ---- \n\n')
data_test = pd.read_csv('data/test_set_VU_DM.csv', sep=',')
# data_test = pd.read_csv('data/test_set_VU_DM.csv', sep=',', nrows=1000 * 1000)

pipeline.transform(data_test)
# data_test.to_csv('data/test_set_VU_DM_clean.csv', sep=';', index=False)
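
The block above builds the pipeline over the training frame, writes the cleaned frame to disk, frees memory, and then reuses the same pipeline to transform the test set. A small sketch for reading the cleaned training file back, assuming the ';' separator used when it was written:

import pandas as pd
clean = pd.read_csv('data/training_set_VU_DM_clean.csv', sep=';')
print(clean.shape)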
Example #7
def __init__(self):
    self._pipeline = Pipeline()
    self._factory = Factory()
    self._execution_params = {}
Example #8
# classifier
from classifier.shortTextClassifier import ShortTextClassifier
classifier = ShortTextClassifier()

# file paths
originalFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["tweeti-b", "tweeti-b.dev"])
testingFile = map(
    lambda path: get_project_dir() + "resources/tweeti/" + path + ".tsv",
    ["twitter-test-input-B"])

# initialize the pipeline used to transform the tweets
tokenizer = POSTokenizer()
tagger = POSTagger()
pipeline = Pipeline(tokenizer, tagger, [], [])

# read the training file
labeled_featuresets = read_tweets_file(originalFile, pipeline).values()

# training
classifier.train(labeled_featuresets)

# read the test file
labeled_featuresets_test = read_tweets_file(testingFile, pipeline)
for key in labeled_featuresets_test:
    labeled_featuresets_test[key] = labeled_featuresets_test[key][0]

# classification
labeled_featuresets_test = classifier.classify_dict(labeled_featuresets_test)
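
Example #8 shows the full flow: the training and test tweets are read through the same pipeline, the classifier is trained on the labeled featuresets, and classify_dict labels the test featuresets. A hypothetical follow-up sketch that writes one prediction per key to a file; the output path and the assumption that classify_dict returns a key-to-label mapping are not confirmed by the original code:

with open("predictions.tsv", "w") as out:
    for tweet_id, label in labeled_featuresets_test.items():
        out.write("%s\t%s\n" % (tweet_id, label))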
Example #9
        print "  Elapsed time: %.0fs\n" % (timer.stop() / 1000)


"""
			TEST
"""

print "	********** TEST 1 ***********   "
print "	stopwords: no"
print "	url: no"
print "	punctuation: no"
print "	repeating letters: no"
print "	tag removing: no"
print "	not adapter: no\n"

test_pipeline(Pipeline(tokenizer, tagger))

print "	********** TEST 2 ***********   "
print "	stopwords: yes"
print "	url: no"
print "	punctuation: no"
print "	repeating letters: no"
print "	tag removing: no"
print "	not adapter: no\n"

test_pipeline(Pipeline(tokenizer, tagger, [], [stopwrd]))

print "	********** TEST 3 ***********	"
print "	stopwords: yes"
print "	url: yes"
print "	punctuation: no"