def run():
    computed_similarity = []
    dataset = {
        "current": 0,
        "current_data": {},
        "current_index": [],
        "new": 0,
        "new_data": {},
        "new_index": []
    }
    # Load the existing ("current") and incoming ("new") datasets.
    dataset["current_index"], dataset["current_data"], dataset["current"] = addFile(
        parameters["current_path"], parameters["properties"], parameters["index"])
    dataset["new_index"], dataset["new_data"], dataset["new"] = addFile(
        parameters["new_path"], parameters["properties"], parameters["index"])

    for i in range(len(dataset["new_index"])):
        output = {"key": dataset["new_index"][i], "similars": []}
        for prop in parameters["properties"]:
            # Vectorize the combined corpus (current + new documents) for this property.
            preprocessor.parameters['corpus'] = [
                *dataset["current_data"][prop],
                *dataset["new_data"][prop]
            ]
            X, Z = preprocessor.run()
            similarity.parameters['matrix'] = X
            # Keep only the block of the similarity matrix that compares
            # current documents (rows) against new documents (columns).
            S = similarity.run()[0:dataset["current"], dataset["current"]:]
            for j in range(S.shape[0]):
                item = {"key": dataset["current_index"][j], prop: S[j, i]}
                output['similars'] = setSimilarity(item, prop, output['similars'])
        computed_similarity.append(output)
    return computed_similarity
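# setSimilarity() is called above but not defined in this snippet. The sketch
# below shows one plausible implementation, assuming it merges each property's
# score into the entry for the matching document key (the helper's behaviour
# is an assumption, not the original code):
def setSimilarity(item, prop, similars):
    for existing in similars:
        if existing["key"] == item["key"]:
            existing[prop] = item[prop]   # add this property's score to the existing entry
            return similars
    similars.append(item)                 # first score recorded for this document
    return similars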
def Assembly(event=0):
    global list_v
    # Read the assembly source from the editor widget, preprocess and
    # assemble it, then load the resulting hex into the 8051 simulator.
    source = list_v[5].get('1.0', END)
    source = preprocessor.run(source)
    source = assembler.run(source)
    simulator.init_8051()
    simulator.load_hex_to_pm(source)
    update_data()
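# Assembly() takes an optional event argument so the same function can serve
# both as a button command and as a key binding in the Tkinter GUI. A
# hypothetical wiring (widget and variable names below are assumptions, not
# taken from the original program):
assemble_button = Button(root, text='Assemble', command=Assembly)
root.bind('<F5>', Assembly)   # pressing F5 also triggers assembly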
setup.load()

import manageFiles

manageFiles.parameters['path'] = './support/vagas.json'
inputList = manageFiles.read()  # read the file

corpus = []
for doc in inputList:  # collect each posting's description
    corpus.append(doc["desc"])

import preprocessor

preprocessor.parameters['corpus'] = corpus
X, Z = preprocessor.run()
print(X)
print(Z)

# For every document, keep the tokens with a non-zero TF-IDF weight.
output_file = []
for i in range(X.shape[0]):
    output = {"titulo": "", "texto": "", "tokens": []}
    output["titulo"] = 'Código: {} - Título: {}'.format(
        inputList[i]["codigo"], inputList[i]["titulo"])
    output["texto"] = corpus[i]
    for j in range(X.shape[1]):
        if X[i, j] > 0:
            token = {"item": Z[j], "tfidf": X[i, j]}
            output['tokens'].append(token)
    output_file.append(output)
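# The script builds output_file but this excerpt does not show where it goes.
# A minimal way to persist it with the standard library (the output path
# './support/vagas_tfidf.json' is an assumption for illustration; default=float
# converts numpy scalar weights into plain JSON numbers):
import json

with open('./support/vagas_tfidf.json', 'w', encoding='utf-8') as f:
    json.dump(output_file, f, ensure_ascii=False, indent=2, default=float)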
def run(input_folder, config):
    output_folder = config['output']
    files_to_compact = config['modules']
    resources_to_include = config['resources']
    symbols = config['symbols']

    input_folder_3rdparty = os.path.join(input_folder, '3rdparty')
    output_folder_3rdparty = os.path.join(output_folder, '3rdparty')
    make_clean_folders(output_folder, output_folder_3rdparty)

    mods_by_deps = {}
    all_deps = set()

    # add implicit dependencies dependent on core (core itself is handled separately)
    files_to_compact.append('node')
    files_to_compact.append('viewport')

    # scan input files for dependencies
    cursor = 0
    while cursor < len(files_to_compact):
        file = files_to_compact[cursor]
        cursor = cursor + 1

        full_file_name = get_full_file_name(file)
        path = os.path.join(input_folder, full_file_name)
        print('processing: ' + path)

        # Support wildcards in modules and resolve them using GLOB on
        # a filesystem level.
        for matched_path in glob.iglob(path):
            with open(matched_path, 'rt') as inp:
                contents = inp.read()
                l = parse_module_dependencies(contents, all_deps)
                for dep in (dep for dep in l
                            if dep not in mods_by_deps and dep not in files_to_compact):
                    files_to_compact.append(dep)
                    print(full_file_name + ' depends on ' + dep)

                # Extract the original module name from the file name.
                # For non-wildcard modules, this simply reproduces the
                # original name (i.e. |file|).
                mods_by_deps[get_module_name_from_full_file_name(matched_path)] = l

    print('deriving topological order of collated modules')

    # pre-define sprintf, matrix and the core module as they do not follow the
    # usual module dependency system.
    topo_order = derive_topological_order(['core', 'glMatrix.js'], mods_by_deps)
    print(topo_order)

    print('writing medea.core-compiled.js')

    # generate medea.core-compiled.js output file
    with open(os.path.join(output_folder, primary_compiled_file), 'wt') as outp:
        outp.write(get_google_closure_params())
        outp.write(get_license(input_folder))
        outp.write('medea_is_compiled = true;')

        for n, dep in enumerate(topo_order):
            name = get_full_file_name(dep)
            path = os.path.join(input_folder, name)
            print('collating: ' + path)
            for matched_path in glob.iglob(path):
                with open(matched_path, 'rt') as inp:
                    outp.write(
                        preprocessor.run(inp.read(), input_folder, name, symbols))
            if '.js' in dep:
                outp.write('medealib._MarkScriptAsLoaded("' + dep + '");')
            outp.write('\n')

        # embed resource files
        if resources_to_include:
            outp.write('medealib._bakedResources = {}; \n')
            for k, v in resources_to_include.items():
                print('embedding: ' + v + ' as ' + k)
                outp.write(include_resource(k, v, input_folder))

        outp.write('delete window.medea_is_compiled;')

    topo_order = [get_full_file_name(e) for e in topo_order]

    # all other files are preprocessed, but kept in separate JS files
    for file in os.listdir(input_folder):
        if file not in topo_order and ".js" in file:
            print('writing ' + file + ' to output folder')
            with open(os.path.join(output_folder, file), 'wt') as outp:
                with open(os.path.join(input_folder, file), 'rt') as inp:
                    outp.write(
                        preprocessor.run(inp.read(), input_folder, file, symbols))

    for file in os.listdir(input_folder_3rdparty):
        if os.path.join('3rdparty', file) not in topo_order and ".js" in file:
            print('copying ' + file + ' to output folder')
            shutil.copy2(os.path.join(input_folder_3rdparty, file),
                         os.path.join(output_folder_3rdparty, file))

    print('** done - ' + output_folder)
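# derive_topological_order() is not part of this snippet. A minimal sketch of
# the idea, assuming mods_by_deps maps each module to the modules it depends
# on and that the pre-seeded entries ('core', 'glMatrix.js') simply come first.
# This is an illustration of a depth-first topological sort, not the original
# implementation:
def derive_topological_order(preseeded, mods_by_deps):
    order = list(preseeded)
    visited = set(preseeded)

    def visit(mod):
        if mod in visited:
            return
        visited.add(mod)
        for dep in mods_by_deps.get(mod, []):
            visit(dep)          # emit dependencies before the module itself
        order.append(mod)

    for mod in mods_by_deps:
        visit(mod)
    return order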
            if word["pos"] in nonstop:
                if word["tok"] in local_word_score[c_id]:
                    local_word_score[c_id][word["tok"]] += 1
                else:
                    local_word_score[c_id][word["tok"]] = 1
    return local_word_score


def coherence_filter(forest, lld):
    # Drop sentences whose coherence factor falls below the 0.6 threshold.
    # Iterate over a copy of the cluster so removals do not skip elements.
    for cluster in forest:
        for sent in list(cluster):
            cf = coherence_factor(sent["s_id"], lld)
            if cf < 0.6:
                cluster.remove(sent)
    return forest


lld = preprocessor.run('pos_output.txt')
s_count = len(lld) - 1
word_score = calculate_word_score(lld)
local_word_score = []
forest = []

cluster_count = 0
s_id = 0
for sent in lld:
    if s_id == 0:
        # the first sentence opens the first cluster
        forest.append([])
        local_word_score.append({})
        forest[cluster_count].append({"s_id": s_id, "score": 1, "sent": sent})
        local_word_score = update_lwscr(cluster_count, s_id, local_word_score, lld)
        cluster_count += 1
    s_id += 1
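# coherence_factor() is referenced above but not defined in this excerpt. One
# plausible reading, given the surrounding code, is the share of a sentence's
# words that also appear in the global word_score table, compared against the
# 0.6 threshold. This is an illustrative assumption, not the original
# definition:
def coherence_factor(s_id, lld):
    words = lld[s_id]
    if not words:
        return 0.0
    scored = sum(1 for word in words if word["tok"] in word_score)
    return scored / len(words)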
import analytics
import preprocessor
import project

if __name__ == '__main__':
    project.run()
    preprocessor.run()
    analytics.run()
import time

import numpy as np
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils

import inp


def run(unpredictableSeed=False):
    # for reproducibility
    if not unpredictableSeed:
        np.random.seed(1337)
    else:
        np.random.seed()

    # cull samples whose fraction of non-zero data points is <= this threshold
    percentageThreshold = 0.7
    # frame length in ms
    frameSize = 30
    # number of convolutional filters to use
    nb_filters = 32
    # size of pooling area for max pooling
    nb_pool = 2
    # convolution kernel size
    filter_len = 3
    # number of samples before weight update
    batch_size = 128
    # number of possible classes. In this case, just 2 (TODO: should be 3 after adding noise)
    nb_classes = 2
    # how many iterations to run
    nb_epoch = 12

    (X_train, y_train), (X_test, y_test) = inp.run(frameSize, percentageThreshold)

    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    print(X_train.dtype, y_train.dtype, X_test.dtype, y_test.dtype)

    # X_train: input for the training set
    # X_test: input for the test set
    # y_train: result for the training set
    # y_test: result for the test set
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')

    # convert class vectors to binary class matrices
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    model = Sequential()
    model.add(Convolution2D(nb_filters, filter_len, 1,
                            border_mode='valid',
                            input_shape=(1, X_train.shape[2], 1)))
    model.add(Activation('relu'))
    model.add(Convolution2D(nb_filters, filter_len, 1))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 1)))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adadelta',
                  metrics=['accuracy'])

    start = time.clock()
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              verbose=1, validation_data=(X_test, Y_test))
    score = model.evaluate(X_test, Y_test, verbose=0)
    timeTaken = time.clock() - start

    print('Time taken:', timeTaken)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
    return (score[0], score[1], timeTaken)

# 1) import wav file. DONE
# 2) carve up the wav file into (overlapping?) fragments by a certain timestep.
#    This involves figuring out how long a single frame lasts (use framerate?).
#    Kinnunen & Li suggest 20~30 ms frames are short and yield good results compared to longer (100s of ms) analyses.
#    Dynamic Time Warping is an alternative.
#    Velocity and acceleration (of the feature value) are possible additional feature data.
# 3) design a DNN to use said overlapping fragments with a true/false value for each timestep.
#    Note this is not speech recognition; this is speaker recognition.
#    There is no need for memory, and this is a classification problem,
#    i.e. more of an MNIST job than an LSTM job.
#    Kinnunen & Li suggest negative samples be provided and classified as "no one" to prevent false positives.
#    They also claim GMMs can't handle high-dimensional data, but Rouvier showed a DNN is fine with 60 dimensions.
#    Jain claims the feature vector should have less than 1/10 the number of speakers as its dimension. That's 0.2 features for our case.
# Other: MFC continues to dominate despite being from the 80s, alongside various modern attempts to create a better feature vector.
#    PLP is one of the more successful feature filters used alongside MFC, despite being from 1990. Cepstral and spectral features are more recent (both 2001).
#    Different filters can be used to create different feature vectors of the same sample, tripling the samples available and increasing accuracy.
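# The notes above describe carving the wav into short (20~30 ms), possibly
# overlapping frames before classification. A minimal sketch of that framing
# step using only numpy and the standard-library wave module; the frame/hop
# lengths and the 16-bit mono assumption are illustrative, not taken from the
# original inp.run implementation:
import wave


def frame_wav(path, frame_ms=30, hop_ms=15):
    with wave.open(path, 'rb') as w:
        rate = w.getframerate()
        samples = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)
    frame_len = int(rate * frame_ms / 1000)
    hop_len = int(rate * hop_ms / 1000)
    starts = range(0, len(samples) - frame_len + 1, hop_len)
    # one row per (possibly overlapping) frame
    return np.stack([samples[s:s + frame_len] for s in starts])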