def prepare(params, samples):
    """SentEval-style prepare hook: attach vocabulary and word vectors to *params*.

    Builds a word->id mapping from *samples*, then loads the matching word
    vectors (300-dim) into ``params.word_vec``.
    """
    vec_path = utils.get_word_vec_path_by_name(params.word_vec_name)
    params.wvec_dim = 300
    _, params.word2id = utils.create_dictionary(samples)
    params.word_vec = utils.get_wordvec(vec_path, params.word2id)
    return
def get_workers():
    """Flask endpoint: list workers of a category within a distance radius.

    Query parameters:
        category:  worker category to match exactly.
        latitude, longitude: caller's position (decimal degrees, as strings).
        distance:  optional search radius; defaults to 2.0 when absent.

    Returns a JSON payload ``{'workers': [...]}`` where each entry is built by
    ``create_dictionary(item, distance_point)``.
    """
    if request:
        app.logger.info('request headers - {0}'.format(request.headers))
        category = request.args.get('category')
        final_list = []
        query = db.session.query(Worker).all()
        filtered_list = []
        if query:
            app.logger.info('query value true')
            for item in query:
                if item.category == category:
                    filtered_list.append(item)
        latitude = request.args.get('latitude')
        longitude = request.args.get('longitude')
        # BUG FIX: query args are strings; the original compared
        # float < str (TypeError on Python 3). Coerce the radius once.
        # (Also dropped two no-op `unicodedata.normalize(...).encode(...)`
        # expressions whose results were discarded.)
        distance = request.args.get('distance')
        distance = float(distance) if distance else 2.0
        for item in filtered_list:
            distance_point = calc_dist(float(item.latitude), float(item.longitude),
                                       float(latitude), float(longitude))
            app.logger.info('dist - {0}'.format(distance_point))
            # BUG FIX: removed debug leftover `distance_point = 1.5` that
            # overrode the computed distance and defeated the radius filter.
            if distance_point < distance:
                awesome_dict = create_dictionary(item, distance_point)
                final_list.append(awesome_dict)
        return jsonify({'workers': final_list})
def build_vocabulary(path_data, path_vocs_dict, min_counts_dict, columns):
    """Build one vocabulary file per column from a tab-separated corpus.

    Args:
        path_data: str, path to the data file (blank lines separate sentences).
        path_vocs_dict: dict, output path per column name.
        min_counts_dict: dict, minimum occurrence count per column name.
        columns: list of str, name of each tab-separated column.

    Returns:
        (voc_sizes, max_sentence_length): vocabulary size per column and the
        longest sentence length observed.
    """
    print('building vocs...')
    # Histogram: sentence length -> number of sentences with that length.
    sequence_length_dict = defaultdict(int)
    # One item-frequency counter per column.
    counters = [defaultdict(int) for _ in columns]
    current_len = 0
    file_data = codecs.open(path_data, 'r', encoding='utf-8')
    for raw in file_data:
        stripped = raw.rstrip()
        if not stripped:
            # Blank line ends the current sentence; record its length.
            sequence_length_dict[current_len] += 1
            current_len = 0
            continue
        items = stripped.split('\t')
        current_len += 1
        print(items)
        for idx in range(len(items)):
            counters[idx][items[idx]] += 1
    file_data.close()
    # Flush the final sentence when the file lacks a trailing blank line.
    if current_len != 0:
        sequence_length_dict[current_len] += 1
    # Write one vocabulary per column.
    voc_sizes = []
    for idx, name in enumerate(columns):
        size = create_dictionary(counters[idx], path_vocs_dict[name],
                                 start=1, sort=True,
                                 min_count=min_counts_dict[name],
                                 overwrite=True)
        print('voc: %s, size: %d' % (path_vocs_dict[name], size))
        voc_sizes.append(size)
    print('句子长度分布:')
    print(sorted(sequence_length_dict.items()))
    print('done!')
    return voc_sizes, max(sequence_length_dict.keys())
def run_mnn(dataset):
    """Train the multi-layer NN and write top-k predictions to OUTPUT_CSV.

    Args:
        dataset: 0 => full dataset (predict on TEST_AUDIO_PATH, write top-3
                 labels per file); 1 => confusion-matrix subset (predict on
                 the files listed in TEST_CONF_CSV, write top-1 label).
    """
    train_csv = TRAIN_CSV if (dataset == 0) else TRAIN_CONF_CSV
    # print a log message for status update
    utils.write_log_msg("creating data dictionary...")
    # create a dictionary from the provided train.csv file
    dictionary = utils.create_dictionary(train_csv)
    # print a log message for status update
    utils.write_log_msg("extracting features of training data...")
    # call the feature extraction module to get audio features
    tr_mnn_features, tr_mnn_labels = features.parse_audio_files_train(
        TRAIN_AUDIO_PATH, train_csv, dictionary, 0)
    # print a log message for status update
    utils.write_log_msg("extracting features of prediction data...")
    # call the feature extraction module to get audio features
    if (dataset == 0):
        ts_mnn_features, ts_mnn_name_list = features.parse_audio_files_predict(
            TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 0)
    else:
        test_csv = pd.read_csv(TEST_CONF_CSV)
        ts_mnn_features, ts_mnn_name_list = features.parse_audio_files_predict(
            TRAIN_AUDIO_PATH, test_csv["fname"].tolist(), 0)
    # print a log message for status update
    utils.write_log_msg("starting multi-layer neural network training...")
    # use the above extracted features for the training of the model
    mnn_y_pred, mnn_probs, mnn_pred = train.tensor_multilayer_neural_network(
        tr_mnn_features, tr_mnn_labels, ts_mnn_features, len(dictionary),
        training_epochs=500)
    # Accumulate class probabilities per predicted class, then take top 3.
    ensembled_output = np.zeros(shape=(mnn_probs.shape[0], mnn_probs.shape[1]))
    for row, columns in enumerate(mnn_pred):
        for i, column in enumerate(columns):
            ensembled_output[row, column] += mnn_probs[row, i]
    top3 = ensembled_output.argsort()[:, -3:][:, ::-1]
    # PERF FIX: invert the label dictionary once instead of scanning all
    # items for every prediction (assumes label ids are unique values).
    id_to_label = {v: k for k, v in dictionary.items()}
    # BUG FIX: the output file was opened without ever being closed; a
    # context manager guarantees the buffer is flushed and the handle freed.
    with open(OUTPUT_CSV, "w") as file_:
        file_.write("fname,label\n")
        for i, value in enumerate(top3):
            if (dataset == 0):
                file_.write("%s,%s %s %s\n" % (ts_mnn_name_list[i],
                                               id_to_label[value[0]],
                                               id_to_label[value[1]],
                                               id_to_label[value[2]]))
            else:
                file_.write("%s,%s\n" % (ts_mnn_name_list[i], id_to_label[value[0]]))
        if (dataset == 0):
            # Hard-coded known answers appended for the full-dataset run.
            file_.write("0b0427e2.wav,Harmonica\n6ea0099f.wav,Harmonica\nb39975f5.wav,Harmonica")
    # print a log message for status update
    utils.write_log_msg("done...")
def build_vocabulary(path_data, path_vocs_dict, min_counts_dict, columns):
    """Build one vocabulary file per column from a tab-separated corpus.

    Args:
        path_data: str, path to the data file (blank lines separate sentences).
        path_vocs_dict: dict, output path per column name.
        min_counts_dict: dict, minimum occurrence count per column name.
        columns: list of str, name of each tab-separated column.

    Returns:
        (voc_sizes, max_sentence_length): vocabulary size per column and the
        longest sentence length observed.
    """
    print('building vocs ...')
    file_data = codecs.open(path_data, 'r', encoding='utf-8')
    line = file_data.readline()
    sequence_length_dict = defaultdict(int)  # sentence-length histogram
    # One item-frequency counter per column.
    feature_item_dict_list = []
    for i in range(len(columns)):
        feature_item_dict_list.append(defaultdict(int))
    sequence_length = 0
    while line:
        line = line.strip()
        if not line:
            # Blank line ends the current sentence; record its length.
            line = file_data.readline()
            sequence_length_dict[sequence_length] += 1
            sequence_length = 0
            continue
        items = line.split('\t')
        sequence_length += 1
        print(items)
        for i in range(len(items)):
            feature_item_dict_list[i][items[i]] += 1
        line = file_data.readline()
    file_data.close()
    # Flush the last sentence when the file lacks a trailing blank line.
    if sequence_length != 0:
        sequence_length_dict[sequence_length] += 1
    # Write one vocabulary per column.
    voc_sizes = []
    for i, name in enumerate(columns):
        size = create_dictionary(feature_item_dict_list[i], path_vocs_dict[name],
                                 start=1, sort=True,
                                 min_count=min_counts_dict[name], overwrite=True)
        # BUG FIX: the original `print('voc: %s, size: %d', (...))` passed the
        # format string and the tuple as two separate arguments, printing them
        # verbatim instead of interpolating; use the `%` operator.
        print('voc: %s, size: %d' % (path_vocs_dict[name], size))
        voc_sizes.append(size)
    print('句子长度分布:')
    print(sorted(sequence_length_dict.items()))
    print('done')
    return voc_sizes, max(sequence_length_dict.keys())
def read_audio_files():
    """Extract MNN (mode 0) and CNN (mode 1) features for train and test audio.

    Builds the label dictionary from TRAIN_CSV, extracts both feature kinds
    for the training set and the prediction set, persists everything via
    ``features.store_features`` and returns the full tuple to the caller.
    """
    # Build the label dictionary first.
    utils.write_log_msg("creating data dictionary...")
    dictionary = utils.create_dictionary(TRAIN_CSV)

    # Training features, one extraction pass per network kind.
    utils.write_log_msg("extracting features of training data...")
    mnn_train_x, mnn_train_y = features.parse_audio_files_train(
        TRAIN_AUDIO_PATH, TRAIN_CSV, dictionary, 0)
    cnn_train_x, cnn_train_y = features.parse_audio_files_train(
        TRAIN_AUDIO_PATH, TRAIN_CSV, dictionary, 1)
    utils.write_log_msg(
        "processed {0} files of training data for mnn...".format(len(mnn_train_x)))
    utils.write_log_msg(
        "processed {0} files of training data for cnn...".format(len(cnn_train_x)))

    # Prediction features for the same two network kinds.
    utils.write_log_msg("extracting features of prediction data...")
    mnn_test_x, mnn_test_names = features.parse_audio_files_predict(
        TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 0)
    cnn_test_x, cnn_test_names = features.parse_audio_files_predict(
        TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 1)
    utils.write_log_msg(
        "processed {0} files of prediction data for mnn...".format(len(mnn_test_x)))
    utils.write_log_msg(
        "processed {0} files of prediction data for cnn...".format(len(cnn_test_x)))

    # Persist everything so later runs can skip re-extraction.
    utils.write_log_msg("storing features for future use...")
    features.store_features(dictionary, mnn_train_x, mnn_train_y, mnn_test_x,
                            mnn_test_names, cnn_train_x, cnn_train_y,
                            cnn_test_x, cnn_test_names)

    return (dictionary, mnn_train_x, mnn_train_y, mnn_test_x, mnn_test_names,
            cnn_train_x, cnn_train_y, cnn_test_x, cnn_test_names)
def prepare(params, samples):
    """SentEval-style prepare hook: attach vocabulary and word vectors to *params*.

    Like the plain variant, but forwards the normalization flag and the
    word-count file path through to ``utils.get_wordvec``.
    """
    vec_path = utils.get_word_vec_path_by_name(params.word_vec_name)
    counts_path = params.word_count_path
    params.wvec_dim = 300
    _, params.word2id = utils.create_dictionary(samples)
    params.word_vec = utils.get_wordvec(vec_path, params.word2id,
                                        norm=params.norm,
                                        path_to_counts=counts_path)
    return
def save_melgrams(dataset_path):
    """Compute and save a mel-spectrogram (.npy) for every song in *dataset_path*.

    Each song is loaded in full at 44.1 kHz; the output file is written under
    datasets/train_spec/<folder>/mel_spectrogram_<song>.npy.
    """
    print('Tamaño del dataset:', len(dataset_path))
    for song_path in dataset_path:
        signal, sr = librosa.load(song_path, duration=None, sr=44100)
        duration = librosa.get_duration(y=signal, sr=sr)
        # Bundle the signal metadata the feature extractor expects.
        song_info = create_dictionary(song_path, duration, signal, sr)
        melgram = extract_features(song_info)
        # Derive <folder>/<song> from the source path.
        parts = song_path.split("/")
        folder_number = parts[-2]
        song_name = parts[-1].replace(".wav", "")
        out_dir = os.path.join(getcwd(), 'datasets/train_spec/{}'.format(folder_number))
        if not os.path.exists(out_dir):
            makedirs(out_dir)
        np.save(out_dir + "/mel_spectrogram_{}".format(song_name), melgram)
import sys
from utils import FORM, XPOS, DEPREL
from utils import create_dictionary, create_index, read_conllu, map_to_instances, shuffled_stream
from utils import parse_projective
from layers import Embeddings, BiLSTM
import random


class MLP(object):
    # Placeholder for the parser's multi-layer perceptron; not implemented yet.
    pass


if __name__ == "__main__":
    random.seed(1)  # fixed seed for reproducibility
    train_file = "../treebanks/train/en/en.conllu"
    # Build token->id indices from the training treebank, then map each
    # sentence to integer instances over the FORM and XPOS fields.
    index = create_index(create_dictionary(read_conllu(train_file)))
    train_data = list(
        map_to_instances(read_conllu(train_file), index, (FORM, XPOS)))
    # Hyperparameters for the (not yet constructed) BiLSTM parser.
    max_epochs = 30
    lstm_dim = 250
    arc_hidden_dim = 100
    label_hidden_dim = 100
    # NOTE(review): `dy` (presumably DyNet) is used below but no `import
    # dynet as dy` is visible in this chunk — confirm the import exists.
    pc = dy.ParameterCollection()
    # embeddings = Embeddings(pc, [(len(index[FORM])+1, 100), (len(index[XPOS])+1, 25)])
    # input_dim = embeddings.dim
    input_dim = 125
    num_labels = len(index[DEPREL])
    # NOTE(review): this chunk starts inside an argument-parsing function
    # whose `def` line and `parser = argparse.ArgumentParser(...)` setup are
    # not visible here.
    parser.add_argument("--inputfile", required=True)
    parser.add_argument("--outbasename", required=True)
    parser.add_argument("--fields", default=["FORM", "UPOS", "FEATS", "DEPREL"], nargs='+')
    parser.add_argument("--size", default=[100, 25, 25, 0], type=int, nargs='+')
    parser.add_argument("--min_frequency", default=5, type=int)
    parser.add_argument("--window", default=5, type=int)
    parser.add_argument("--sg")
    parser.add_argument("--seed", default=1, type=int)
    args = parser.parse_args()
    # Map case-insensitive field names to their field constants.
    args.fields = [STR_TO_FIELD[f.lower()] for f in args.fields]
    return args


if __name__ == "__main__":
    from gensim.models import Word2Vec
    args = _parse_args()
    # Build and persist the token index, then train word2vec embeddings.
    print("building index...", end=" ")
    dic = create_dictionary(read_conllu(args.inputfile), fields=args.fields)
    index = create_index(dic, min_frequency=args.min_frequency)
    print("done")
    write_index(args.outbasename, index, args.fields)
    _word2vec(index, args)
def build_vocabulary(path_data, path_vocs_dict, min_counts_dict, columns,
                     sequence_len_pt=98, use_char_featrue=False, word_len_pt=98):
    """Build vocabularies (optionally including a char vocabulary).

    Args:
        path_data: str, path to the data file (tab-separated columns, blank
            lines separate sentences).
        path_vocs_dict: dict, output path per column name (plus 'char' when
            char features are enabled).
        min_counts_dict: dict, minimum occurrence count per column name.
        columns: list of str, name of each column; the last one is the label.
        sequence_len_pt: int, percentile used to pick the sentence length.
        use_char_featrue: bool, whether to build a character vocabulary
            (for English text).
        word_len_pt: int, percentile used to pick the word length.

    Returns:
        (voc_sizes, lengths): vocabulary sizes and [sentence_length] (plus
        [word_length] when char features are enabled).
    """
    print('building vocs...')
    file_data = codecs.open(path_data, 'r', encoding='utf-8')
    line = file_data.readline()
    sequence_length_list = []  # one entry per sentence: its token count
    # One item-frequency counter per column.
    feature_item_dict_list = []
    for i in range(len(columns)):
        feature_item_dict_list.append(defaultdict(int))
    # Character-level counters (only when char features are requested).
    if use_char_featrue:
        char_dict = defaultdict(int)
        word_length_list = []  # one entry per token: its character count
    sequence_length = 0
    sentence_count = 0  # number of sentences processed (progress display)
    while line:
        line = line.rstrip()
        if not line:
            # Blank line ends the current sentence; record its length.
            sentence_count += 1
            sys.stdout.write('当前处理句子数: %d\r' % sentence_count)
            sys.stdout.flush()
            line = file_data.readline()
            sequence_length_list.append(sequence_length)
            sequence_length = 0
            continue
        items = line.split('\t')
        sequence_length += 1
        # print(items)
        # Count all feature columns except the label…
        for i in range(len(columns) - 1):
            feature_item_dict_list[i][items[i]] += 1
        # …and the label from the last field.
        feature_item_dict_list[-1][items[-1]] += 1
        # Characters come from the first column (the token itself).
        if use_char_featrue:
            for c in items[0]:
                char_dict[c] += 1
            word_length_list.append(len(items[0]))
        line = file_data.readline()
    file_data.close()
    # Flush the last sentence when the file lacks a trailing blank line.
    if sequence_length != 0:
        sentence_count += 1
        sys.stdout.write('当前处理句子数: %d\r' % sentence_count)
        sequence_length_list.append(sequence_length)
    print()
    # Write the vocabulary files.
    voc_sizes = []
    if use_char_featrue:
        # Char vocabulary; ids start at 2 (presumably 0/1 reserved for
        # padding/unknown — confirm against create_dictionary).
        size = create_dictionary(char_dict, path_vocs_dict['char'], start=2,
                                 sort=True, min_count=min_counts_dict['char'],
                                 overwrite=True)
        voc_sizes.append(size)
    for i, name in enumerate(columns):
        # Label column (the last one) starts at 1; feature columns at 2.
        start = 1 if i == len(columns) - 1 else 2
        size = create_dictionary(feature_item_dict_list[i], path_vocs_dict[name],
                                 start=start, sort=True,
                                 min_count=min_counts_dict[name],
                                 overwrite=True)
        print('voc: %s, size: %d' % (path_vocs_dict[name], size))
        voc_sizes.append(size)
    # Report the sentence-length distribution and pick the length at the
    # requested percentile.
    print('句子长度分布:')
    sentence_length = -1
    option_len_pt = [90, 95, 98, 100]
    if sequence_len_pt not in option_len_pt:
        option_len_pt.append(sequence_len_pt)
    for per in sorted(option_len_pt):
        tmp = int(np.percentile(sequence_length_list, per))
        if per == sequence_len_pt:
            sentence_length = tmp
            print('%3d percentile: %d (default)' % (per, tmp))
        else:
            print('%3d percentile: %d' % (per, tmp))
    # Same report/selection for word lengths when char features are enabled.
    if use_char_featrue:
        print('单词长度分布:')
        word_length = -1
        option_len_pt = [90, 95, 98, 100]
        if word_len_pt not in option_len_pt:
            option_len_pt.append(word_len_pt)
        for per in sorted(option_len_pt):
            tmp = int(np.percentile(word_length_list, per))
            if per == word_len_pt:
                word_length = tmp
                print('%3d percentile: %d (default)' % (per, tmp))
            else:
                print('%3d percentile: %d' % (per, tmp))
    print('done!')
    lengths = [sentence_length]
    if use_char_featrue:
        lengths.append(word_length)
    return voc_sizes, lengths
# NOTE(review): `parser` is created earlier in this script, outside the
# visible chunk.
parser.add_argument("--cluster_num", default=10, type=int,
                    help="number of semantic groups to construct")
parser.add_argument("--postprocessing", default=1, type=int,
                    help="principal component removal")
args = parser.parse_args()
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Load text file
sentences = load_file(PATH_TO_SENTENCE)
# Load dictionary (id<->word mappings derived from the sentences)
args.id2word, args.word2id = create_dictionary(sentences)
# Load word vectors for the vocabulary; wvec_dim is taken from their width
args.word_vec_np = load_wordvec(PATH_TO_VEC, args.word2id)
args.wvec_dim = args.word_vec_np.shape[1]
# Load word weights (SIF-style smoothing parameter a=1e-3)
args.word_weight = load_word_weight(PATH_TO_WORD_WEIGHTS, args.word2id, a=1e-3)
# Construct semantic groups
semantic_construction(args)
# Generate sentence embeddings
sentence_emb = compute_embedding(args, sentences)
def parse_fasta_input(options, Results, logger):
    """Run the hmmsearch pipeline over every input FASTA file.

    For each file in ``options.infiles``: search against the HMM model,
    classify hits, retrieve matching sequences, and (for nucleotide input)
    elongate hits and predict ORFs with prodigal or ORFfinder. Hit counts
    are accumulated on *Results*.

    Returns the path of the last predicted-ORF file (for the final input).
    """
    modelName = path.splitext(path.basename(options.hmm_model))[0]
    logger.info('Parsing FASTA files')
    frame = '6'  # six-frame translation for nucleotide input
    for fastafile in options.infiles:
        fastaBaseName = path.splitext(path.basename(fastafile))[0]
        # Output paths, all derived from <input base name> + <model name>.
        hmmOut = '%s/%s-%s-hmmsearched.out' % (path.abspath(
            options.hmm_out_dir), fastaBaseName, modelName)
        fastaOut = '%s/%s-%s-filtered.fasta' % (path.abspath(
            options.final_gene_dir), fastaBaseName, modelName)
        aminoOut = '%s/%s-%s-filtered-peptides.fasta' % (path.abspath(
            options.final_gene_dir), fastaBaseName, modelName)
        orfFile = '%s/%s-%s-predicted-orfs.fasta' % (path.abspath(
            options.final_gene_dir), fastaBaseName, modelName)
        orfAminoFile = '%s/%s-%s-predicted-orfs-amino.fasta' % (path.abspath(
            options.final_gene_dir), fastaBaseName, modelName)
        hitFile = '%s/%s-positives.out' % (path.abspath(
            options.tmp_dir), fastaBaseName)
        elongated_fasta = '%s/%s-gene-elongated.fasta' % (path.abspath(
            options.tmp_dir), fastaBaseName)
        if options.protein:
            # Protein input: search directly, classify, and pull hits.
            utils.perform_hmmsearch(fastafile, options.hmm_model, hmmOut,
                                    options)
            utils.classifier(hmmOut, hitFile, options)
            hitDict = utils.create_dictionary(hitFile, options)
            utils.retrieve_fasta(hitDict, fastafile, fastaOut, options)
        else:
            # Nucleotide input: translate first (optionally keeping the
            # peptide file), then search.
            if options.store_peptides:
                peptideFile = '%s/%s-amino.fasta' % (path.abspath(
                    options.tmp_dir), fastaBaseName)
                utils.translate_sequence(fastafile, peptideFile, options,
                                         frame)
                logger.info('Performing hmmsearch')
                utils.perform_hmmsearch(peptideFile, options.hmm_model,
                                        hmmOut, options)
            else:
                utils.translate_and_search(fastafile, options.hmm_model,
                                           hmmOut, options)
            utils.classifier(hmmOut, hitFile, options)
            hitDict = utils.create_dictionary(hitFile, options)
            utils.retrieve_fasta(hitDict, fastafile, fastaOut, options)
            if not path.isfile(fastaOut):
                logger.critical('Could not find file %s', fastaOut)
                # exit()
            else:
                # Elongate hit regions, then predict ORFs from them with
                # prodigal (default) or ORFfinder.
                utils.retrieve_surroundings(hitDict, fastafile,
                                            elongated_fasta)
                if path.isfile(elongated_fasta):
                    if not options.orf_finder:
                        tmpORFfile = '%s/%s-long-orfs.fasta' % (
                            options.tmp_dir, fastaBaseName)
                        predict_orfs_prodigal(elongated_fasta,
                                              options.tmp_dir, tmpORFfile,
                                              options.min_orf_length)
                        orfFile = utils.retrieve_predicted_orfs(
                            options, tmpORFfile)
                    else:
                        tmpORFfile = '%s/%s-long-orfs.fasta' % (
                            options.tmp_dir, fastaBaseName)
                        predict_orfs_orfFinder(elongated_fasta,
                                               options.tmp_dir, tmpORFfile,
                                               options.min_orf_length)
                        orfFile = utils.retrieve_predicted_orfs(
                            options, tmpORFfile)
            # Emit the amino-acid sequences for the retained genes.
            if options.store_peptides:
                options.retrieve_whole = False
                utils.retrieve_peptides(hitDict, peptideFile, aminoOut,
                                        options)
            else:
                tmpFastaOut = utils.make_fasta_unique(fastaOut, options)
                utils.retrieve_predicted_genes_as_amino(options, tmpFastaOut,
                                                        aminoOut, frame='6')
        # Accumulate statistics for this input file.
        Results.count_hits(hitFile)
        if path.isfile(orfFile):
            if not options.orf_finder:
                Results.count_orfs_genomes(orfFile)
            else:
                Results.predictedOrfs = Results.count_contigs(orfFile)
    # NOTE(review): returns the orfFile of the LAST processed input only —
    # confirm callers expect that.
    return orfFile
def generate_dataset():
    """Mix 5-second audio/speech pairs at 10 dB levels into datasets/train/.

    Pairs each music clip with a speech clip (both resampled to 44.1 kHz and
    truncated to 5 s), mixes them at each dB level in 1..10, and writes the
    result under datasets/train/<db_level>/mix_<k>.wav.
    """
    all_audio_song_paths = list_dataset('./datasets/audio/')
    all_speech_song_paths = list_dataset('./datasets/speech/')
    # dB levels 1..10
    vector_aux = list(range(1, 11, 1))
    targets = target(vector_aux)
    # Randomize the pairing of speech segments with music tracks.
    shuffle(all_speech_song_paths)
    # BUG FIX: the original guard `(len(a) or len(b)) == 0` only returned
    # when BOTH lists were empty; with music present but no speech,
    # `pop(0)` below raised IndexError. Mixing needs both datasets.
    if not all_audio_song_paths or not all_speech_song_paths:
        return
    aux = 0
    for k, audio_song_path in enumerate(all_audio_song_paths):
        # Load the full track only to check its duration.
        audio_signal, audio_sr = librosa.load(audio_song_path, duration=None,
                                              sr=44100)
        audio_duration = librosa.get_duration(y=audio_signal, sr=audio_sr)
        # Skip music clips shorter than 5 seconds.
        if audio_duration < 5:
            continue
        # Reload only the first 5 seconds.
        audio_signal, audio_sr = librosa.load(audio_song_path, duration=5,
                                              sr=44100)
        audio_duration = librosa.get_duration(y=audio_signal, sr=audio_sr)
        audio_dictionary = create_dictionary(audio_song_path, audio_duration,
                                             audio_signal, audio_sr)
        # Take the next speech clip (consuming it from the shuffled list).
        speech_song_path = all_speech_song_paths.pop(0)
        speech_signal, speech_sr = librosa.load(speech_song_path,
                                                duration=None, sr=44100)
        speech_duration = librosa.get_duration(y=speech_signal, sr=speech_sr)
        # Skip speech clips shorter than 5 seconds; stop when none remain.
        exit_for = False
        while speech_duration < 5:
            if not all_speech_song_paths:
                exit_for = True
                break
            speech_song_path = all_speech_song_paths.pop(0)
            speech_signal, speech_sr = librosa.load(speech_song_path,
                                                    duration=None, sr=44100)
            speech_duration = librosa.get_duration(y=speech_signal,
                                                   sr=speech_sr)
        if exit_for:
            break
        # Reload only the first 5 seconds of the chosen speech clip.
        speech_signal, speech_sr = librosa.load(speech_song_path, duration=5,
                                                sr=44100)
        speech_duration = librosa.get_duration(y=speech_signal, sr=speech_sr)
        speech_dictionary = create_dictionary(speech_song_path,
                                              speech_duration, speech_signal,
                                              speech_sr)
        # Mix the pair once per dB level and write each result to disk.
        for db_level in vector_aux:
            audio_dict = normalize(audio_dictionary)
            speech_dict = normalize(speech_dictionary)
            new_mixed_song = mix_signals(audio_dict, speech_dict,
                                         targets[db_level - 1], db_level)
            path = os.path.join(getcwd(), 'datasets/train/',
                                str(new_mixed_song['db_level']))
            if not os.path.exists(path):
                makedirs(path)
            librosa.output.write_wav(path + '/mix_' + str(k) + '.wav',
                                     new_mixed_song['signal'],
                                     new_mixed_song['sr'])
            aux += 1
            if aux % 100 == 0:
                total = len(vector_aux) * len(all_audio_song_paths)
                print('{} audios procesados de {}'.format(aux, total))
            if aux == len(all_audio_song_paths):
                songs_counter = len(vector_aux) * aux
                print('Proceso terminado. ¡Dataset de {} canciones creado con éxito!'.format(songs_counter))
                break