コード例 #1
0
def run():
    """Score every new item against every current item, per property.

    Loads the "current" and "new" files named in the module-level
    ``parameters`` through ``addFile``, builds one similarity block per
    configured property, and merges the per-property scores into one record
    per new item through ``setSimilarity``.

    The preprocessing and the similarity matrix depend only on the property,
    not on the new item being processed, so they are computed once per
    property instead of once per (item, property) pair.

    Returns:
        list: one ``{"key": <new index entry>, "similars": [...]}`` dict per
        entry of the new index, in index order.
    """
    properties = parameters["properties"]

    # addFile yields (index, data, count); the count of the "current" file is
    # used below as the row/column boundary inside the similarity matrix.
    current_index, current_data, current_count = addFile(
        parameters["current_path"], properties, parameters["index"])
    new_index, new_data, _ = addFile(
        parameters["new_path"], properties, parameters["index"])

    # Nothing to score: skip preprocessing entirely.
    if len(new_index) == 0:
        return []

    # One (property, block) pair per property. Rows of the block are current
    # items, columns are new items (the corpus is current followed by new).
    blocks = []
    for prop in properties:
        preprocessor.parameters['corpus'] = [
            *current_data[prop], *new_data[prop]
        ]
        X, _ = preprocessor.run()
        similarity.parameters['matrix'] = X
        S = similarity.run()[0:current_count, current_count:]
        blocks.append((prop, S))

    computed_similarity = []
    for i, new_key in enumerate(new_index):
        output = {"key": new_key, "similars": []}
        for prop, S in blocks:
            for j in range(S.shape[0]):
                item = {"key": current_index[j], prop: S[j, i]}
                output['similars'] = setSimilarity(item, prop,
                                                   output['similars'])
        computed_similarity.append(output)
    return computed_similarity
コード例 #2
0
def Assembly(event=0):
    """Preprocess and assemble the source text, then load it into the simulator.

    The text is read from ``list_v[5]`` (presumably a Tk text widget; confirm
    against the GUI setup), passed through ``preprocessor.run`` and
    ``assembler.run``, and the result is loaded into a freshly initialised
    simulator before the display is refreshed with ``update_data``.

    Args:
        event: Unused; accepted so the function can serve as an event
            callback as well as being called directly.
    """
    # list_v is only read here, so no ``global`` declaration is required.
    editor_text = list_v[5].get('1.0', END)
    assembled = assembler.run(preprocessor.run(editor_text))

    simulator.init_8051()
    simulator.load_hex_to_pm(assembled)
    update_data()
コード例 #3
0
setup.load()

import manageFiles

# Read the input file.
manageFiles.parameters['path'] = './support/vagas.json'
inputList = manageFiles.read()

# Collect the description of every document.
corpus = [doc["desc"] for doc in inputList]

import preprocessor

preprocessor.parameters['corpus'] = corpus

X, Z = preprocessor.run()

print(X)
print(Z)

# Build one record per row of X: a title, the original text, and every
# column entry with a positive value paired with the matching entry of Z.
output_file = []
for i in range(X.shape[0]):
    titulo = 'Código: {} - Título: {}'.format(inputList[i]["codigo"],
                                              inputList[i]["titulo"])
    texto = corpus[i]
    tokens = [{"item": Z[j], "tfidf": X[i, j]}
              for j in range(X.shape[1])
              if X[i, j] > 0]
    output_file.append({"titulo": titulo, "texto": texto, "tokens": tokens})
コード例 #4
0
ファイル: build.py プロジェクト: ly774508966/medea.js
def run(input_folder, config):
    """Collate, preprocess and copy the JS sources into the output folder.

    Steps:
      1. Recreate the output folders.
      2. Walk the configured modules (plus 'node' and 'viewport'), following
         the dependencies reported by ``parse_module_dependencies``.
      3. Write all collated modules, in topological order, into the primary
         compiled file, followed by the embedded resources.
      4. Preprocess every remaining ``.js`` file of ``input_folder`` into its
         own output file and copy the remaining 3rdparty ``.js`` files.

    Args:
        input_folder: Folder containing the source files and a ``3rdparty``
            subfolder.
        config: Mapping with the keys ``'output'``, ``'modules'``,
            ``'resources'`` and ``'symbols'``. Note that the ``'modules'``
            list is extended in place with every module discovered.
    """
    output_folder = config['output']
    files_to_compact = config['modules']
    resources_to_include = config['resources']
    symbols = config['symbols']

    input_folder_3rdparty = os.path.join(input_folder, '3rdparty')
    output_folder_3rdparty = os.path.join(output_folder, '3rdparty')

    make_clean_folders(output_folder, output_folder_3rdparty)

    mods_by_deps = {}
    all_deps = set()

    # add implicit dependencies dependent on core (core itself is handled separately)
    files_to_compact.append('node')
    files_to_compact.append('viewport')

    # scan input files for dependencies; the list grows while it is being
    # walked, hence the explicit cursor instead of a for loop
    cursor = 0
    while cursor < len(files_to_compact):
        file = files_to_compact[cursor]
        cursor = cursor + 1

        full_file_name = get_full_file_name(file)
        path = os.path.join(input_folder, full_file_name)
        print('processing: ' + path)

        # Support wildcards in modules and resolve them using GLOB on
        # a filesystem level.
        for matched_path in glob.iglob(path):
            with open(matched_path, 'rt') as inp:
                contents = inp.read()

            deps = parse_module_dependencies(contents, all_deps)
            for dep in deps:
                if dep not in mods_by_deps and dep not in files_to_compact:
                    files_to_compact.append(dep)
                    # Single-argument print() call: valid on Python 2 and 3
                    # (the former print statement is a Python 3 syntax error).
                    print(full_file_name + ' depends on ' + dep)

            # Extract the original module name from the file name
            # For non-wildcard modules, this simply reproduces the
            # original name (i.e. |file|).
            mods_by_deps[get_module_name_from_full_file_name(
                matched_path)] = deps

    print('deriving topological order of collated modules')

    # pre-define the core module and glMatrix as they do not follow the
    # usual module dependency system.
    topo_order = derive_topological_order(['core', 'glMatrix.js'],
                                          mods_by_deps)
    print(topo_order)
    print('writing medea.core-compiled.js')

    # generate medea.core-compiled.js output file
    with open(os.path.join(output_folder, primary_compiled_file),
              'wt') as outp:
        outp.write(get_google_closure_params())
        outp.write(get_license(input_folder))
        outp.write('medea_is_compiled = true;')
        for dep in topo_order:
            name = get_full_file_name(dep)
            path = os.path.join(input_folder, name)
            print('collating: ' + path)

            for matched_path in glob.iglob(path):
                with open(matched_path, 'rt') as inp:
                    outp.write(
                        preprocessor.run(inp.read(), input_folder, name,
                                         symbols))
                    if '.js' in dep:
                        outp.write('medealib._MarkScriptAsLoaded("' + dep +
                                   '");')
                    outp.write('\n')

        # embed resource files
        if resources_to_include:
            outp.write('medealib._bakedResources = {}; \n')
            for k, v in resources_to_include.items():
                print('embedding: ' + v + ' as ' + k)
                outp.write(include_resource(k, v, input_folder))

        outp.write('delete window.medea_is_compiled;')

    # file names already written into the compiled file
    collated_files = set(get_full_file_name(e) for e in topo_order)

    # all other files are preprocessed, but kept in separate JS files
    for file in os.listdir(input_folder):
        if file not in collated_files and ".js" in file:
            print('writing ' + file + ' to output folder')

            with open(os.path.join(output_folder, file), 'wt') as outp:
                with open(os.path.join(input_folder, file), 'rt') as inp:
                    outp.write(
                        preprocessor.run(inp.read(), input_folder, file,
                                         symbols))

    for file in os.listdir(input_folder_3rdparty):
        if (os.path.join('3rdparty', file) not in collated_files
                and ".js" in file):
            print('copying ' + file + ' to output folder')
            shutil.copy2(os.path.join(input_folder_3rdparty, file),
                         os.path.join(output_folder_3rdparty, file))

    print('** done - ' + output_folder)
コード例 #5
0
        if(word["pos"] in nonstop):
            if(word["tok"] in local_word_score[c_id]):
                local_word_score[c_id][word["tok"]] +=1
            else:
                local_word_score[c_id][word["tok"]] = 1
    return local_word_score

def coherence_filter(forest, lld):
    """Remove every sentence whose coherence factor is below 0.6.

    Args:
        forest: List of clusters; each cluster is a list of sentence records
            carrying an ``"s_id"`` key. The clusters are filtered in place.
        lld: Data passed through unchanged to ``coherence_factor``.

    Returns:
        The same ``forest`` object, after filtering.
    """
    for cluster in forest:
        # Rebuild the cluster in place instead of calling remove() during
        # iteration: removing while iterating shifts the remaining items and
        # silently skips the element that follows each removed one.
        cluster[:] = [
            sent for sent in cluster
            if not coherence_factor(sent["s_id"], lld) < 0.6
        ]
    return forest

# Driver: load the preprocessed input and start grouping its entries into
# clusters ("forest").
# NOTE(review): presumably each entry of lld is one sentence -- confirm
# against preprocessor.run.
lld = preprocessor.run('pos_output.txt')
s_count = len(lld)-1
word_score = calculate_word_score(lld)
local_word_score = []  # one dict per cluster, maintained by update_lwscr
forest = []  # list of clusters; each cluster is a list of sentence records
cluster_count = 0
s_id =0
for sent in lld:

    # The very first entry seeds the first cluster with a score of 1.
    # NOTE(review): only the s_id == 0 case is visible in this excerpt; the
    # handling of later entries appears to be cut off.
    if(s_id==0):
        forest.append([])
        local_word_score.append({})
        forest[cluster_count].append({"s_id":s_id, "score":1, "sent":sent})
        local_word_score = update_lwscr(cluster_count, s_id, local_word_score, lld)
        cluster_count+=1
        s_id+=1
コード例 #6
0
ファイル: build.py プロジェクト: acgessler/medea.js
def run(input_folder, config):
	"""Collate, preprocess and copy the JS sources into the output folder.

	Steps:
	  1. Recreate the output folders.
	  2. Walk the configured modules (plus 'node' and 'viewport'), following
	     the dependencies reported by ``parse_module_dependencies``.
	  3. Write all collated modules, in topological order, into the primary
	     compiled file, followed by the embedded resources.
	  4. Preprocess every remaining ``.js`` file of ``input_folder`` into its
	     own output file and copy the remaining 3rdparty ``.js`` files.

	Args:
		input_folder: Folder containing the source files and a ``3rdparty``
			subfolder.
		config: Mapping with the keys ``'output'``, ``'modules'``,
			``'resources'`` and ``'symbols'``. Note that the ``'modules'``
			list is extended in place with every module discovered.
	"""
	output_folder = config['output']
	files_to_compact = config['modules']
	resources_to_include = config['resources']
	symbols = config['symbols']

	input_folder_3rdparty = os.path.join(input_folder, '3rdparty')
	output_folder_3rdparty = os.path.join(output_folder, '3rdparty')

	make_clean_folders(output_folder, output_folder_3rdparty)

	mods_by_deps = {}
	all_deps = set()

	# add implicit dependencies dependent on core (core itself is handled separately)
	files_to_compact.append('node')
	files_to_compact.append('viewport')

	# scan input files for dependencies; the list grows while it is being
	# walked, hence the explicit cursor instead of a for loop
	cursor = 0
	while cursor < len(files_to_compact):
		file = files_to_compact[cursor]
		cursor = cursor + 1

		full_file_name = get_full_file_name(file)
		path = os.path.join(input_folder, full_file_name)
		print('processing: ' + path)

		# Support wildcards in modules and resolve them using GLOB on
		# a filesystem level.
		for matched_path in glob.iglob(path):
			with open(matched_path, 'rt') as inp:
				contents = inp.read()

			deps = parse_module_dependencies(contents, all_deps)
			for dep in deps:
				if dep not in mods_by_deps and dep not in files_to_compact:
					files_to_compact.append(dep)
					# Single-argument print() call: valid on Python 2 and 3
					# (the former print statement is a Python 3 syntax error).
					print(full_file_name + ' depends on ' + dep)

			# Extract the original module name from the file name
			# For non-wildcard modules, this simply reproduces the
			# original name (i.e. |file|).
			mods_by_deps[get_module_name_from_full_file_name(matched_path)] = deps

	print('deriving topological order of collated modules')

	# pre-define the core module and glMatrix as they do not follow the
	# usual module dependency system.
	topo_order = derive_topological_order(['core', 'glMatrix.js'], mods_by_deps)
	print(topo_order)
	print('writing medea.core-compiled.js')

	# generate medea.core-compiled.js output file
	with open(os.path.join(output_folder, primary_compiled_file), 'wt') as outp:
		outp.write(get_google_closure_params())
		outp.write(get_license(input_folder))
		outp.write('medea_is_compiled = true;')
		for dep in topo_order:
			name = get_full_file_name(dep)
			path = os.path.join(input_folder, name)
			print('collating: ' + path)

			for matched_path in glob.iglob(path):
				with open(matched_path, 'rt') as inp:
					outp.write(preprocessor.run(inp.read(), input_folder, name, symbols))
					if '.js' in dep:
						outp.write('medealib._MarkScriptAsLoaded("'+ dep +'");')
					outp.write('\n')

		# embed resource files
		if resources_to_include:
			outp.write('medealib._bakedResources = {}; \n')
			for k, v in resources_to_include.items():
				print('embedding: ' + v + ' as ' + k)
				outp.write(include_resource(k, v, input_folder))

		outp.write('delete window.medea_is_compiled;')

	# file names already written into the compiled file
	collated_files = set(get_full_file_name(e) for e in topo_order)

	# all other files are preprocessed, but kept in separate JS files
	for file in os.listdir(input_folder):
		if file not in collated_files and ".js" in file:
			print('writing ' + file + ' to output folder')

			with open(os.path.join(output_folder, file), 'wt') as outp:
				with open(os.path.join(input_folder, file), 'rt') as inp:
					outp.write(preprocessor.run(inp.read(), input_folder, file, symbols))

	for file in os.listdir(input_folder_3rdparty):
		if os.path.join('3rdparty', file) not in collated_files and ".js" in file:
			print('copying ' + file + ' to output folder')
			shutil.copy2(os.path.join(input_folder_3rdparty, file), os.path.join(output_folder_3rdparty, file))

	print('** done - ' + output_folder)
コード例 #7
0
import analytics
import preprocessor
import project


if __name__ == '__main__':
	# Run the stages in a fixed order: project, then preprocessor, then
	# analytics.
	for stage in (project, preprocessor, analytics):
		stage.run()
コード例 #8
0
def run(unpredictableSeed = False):
	"""Train and evaluate a small convolutional classifier.

	Loads the data through ``inp.run``, builds a two-convolution network
	followed by a dense softmax classifier, fits it, and evaluates it on the
	test set.

	Args:
		unpredictableSeed: When False (default) NumPy's RNG is seeded with
			1337 for reproducibility; when True it is seeded without an
			explicit value.

	Returns:
		tuple: ``(score[0], score[1], timeTaken)`` -- the two values returned
		by ``model.evaluate`` (reported as test score and test accuracy) and
		the time spent in fit + evaluate, in seconds.
	"""
	# for reproducibility
	if not unpredictableSeed:
		np.random.seed(1337)
	else:
		np.random.seed()

	# Cull samples which have <= this ratio of data points as non-zero values
	percentageThreshold = 0.7
	# in ms
	frameSize = 30
	# number of convolutional filters to use
	nb_filters = 32
	# size of pooling area for max pooling
	nb_pool = 2
	# convolution kernel size
	filter_len = 3

	# number of samples before weight update
	batch_size = 128
	# number of possible classes. In this case, just 2 (TODO: should be 3 after adding noise)
	nb_classes = 2
	# how many iterations to run
	nb_epoch = 12

	((X_train, y_train), (X_test, y_test)) = inp.run(frameSize, percentageThreshold)

	# Single-argument print() calls behave identically on Python 2 and 3
	# (the former print statements are a syntax error on Python 3).
	print('{} {} {} {}'.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
	print('{} {} {} {}'.format(X_train.dtype, y_train.dtype, X_test.dtype, y_test.dtype))

	# X_train:	input for the training set
	# X_test:	input for the test set
	# y_train:	result for the training set
	# y_test:	result for the test set

	X_train = X_train.astype('float32')
	X_test = X_test.astype('float32')

	# convert class vectors to binary class matrices
	Y_train = np_utils.to_categorical(y_train, nb_classes)
	Y_test = np_utils.to_categorical(y_test, nb_classes)

	model = Sequential()

	model.add(Convolution2D(
		nb_filters, filter_len, 1,
		border_mode='valid',
		input_shape=(1, X_train.shape[2], 1)))
	model.add(Activation('relu'))
	model.add(Convolution2D(nb_filters, filter_len, 1))
	model.add(Activation('relu'))
	model.add(MaxPooling2D(pool_size=(nb_pool, 1)))

	model.add(Flatten())
	model.add(Dense(256))
	model.add(Activation('relu'))
	model.add(Dropout(0.5))
	model.add(Dense(nb_classes))
	model.add(Activation('softmax'))

	model.compile(
		loss='categorical_crossentropy',
		optimizer='adadelta',
		metrics=['accuracy'])

	# time.clock() was removed in Python 3.8; use perf_counter() where it
	# exists and fall back to clock() on interpreters that lack it.
	timer = time.perf_counter if hasattr(time, 'perf_counter') else time.clock

	start = timer()
	model.fit(
		X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
		verbose=1, validation_data=(X_test, Y_test))
	score = model.evaluate(X_test, Y_test, verbose=0)
	timeTaken = timer() - start
	print('Time taken: {}'.format(timeTaken))
	print('Test score: {}'.format(score[0]))
	print('Test accuracy: {}'.format(score[1]))
	return (score[0], score[1], timeTaken)

	# 1) import wav file. DONE
	# 2) carve up wav file into (overlapping?) fragments by a certain timestep.
	# 		This involves figuring out how long a single frame lasts (use framerate?)
	# 		Kinnunen & Li suggest 20~30ms is short and yields good results over longer (100s of ms) analyses
	# 		Dynamic Time Warping is an alternative
	# 		Velocity and acceleration (of the feature value) are possible additional feature data
	# 3) design DNN to use said overlapping fragments with a true/false value for each timestep.
	# 		note this is not speech recognition; this is speaker recognition
	# 		there is no need for memory, and this is a classification issue
	# 		i.e. more of a MNIST job than a LSTM job
	# 		Kinnunen & Li suggests negative samples be provided and classified to "no one" to prevent false positives
	# 		They also claim GMM can't handle high dimensional data, but Rouvier showed DNN is fine with 60
	# 			Jain claims the feature vector should have less than 1/10 the number of speakers as its dimension. That's 0.2 features for our case.
	# Other: MFC continues to dominate despite being from the 80's and despite various modern attempts to create a better feature vector
	# 			PLP is one of the more successful feature filters used alongside MFC, despite being from 1990. Cepstral and spectral are more recent (both 2001)
	# 			Different filters can be used to create different feature vectors of the same sample, tripling the samples available and increasing accuracy
	# """