def convert_bakeoff2005_dataset(dataset):
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/bakeoff2005/{}_training.utf8'.format(dataset),
                 'data/{}/raw/train-all.txt'.format(dataset), True)
    convert_file('data/bakeoff2005/{}_test_gold.utf8'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    split_train_dev(dataset)

def make_bmes(dataset='pku'):  # + tag
    path = 'data/' + dataset + '/'
    make_sure_path_exists(path + 'bmes')
    bmes_tag(path + 'raw/train.txt', path + 'bmes/train.txt')
    bmes_tag(path + 'raw/train-all.txt', path + 'bmes/train-all.txt')
    bmes_tag(path + 'raw/dev.txt', path + 'bmes/dev.txt')
    bmes_tag(path + 'raw/test.txt', path + 'bmes/test.txt')

def convert_sighan2005_dataset(dataset):
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/sighan2005/{}_training.utf8'.format(dataset),
                 'data/{}/raw/train-all.txt'.format(dataset), True)
    convert_file('data/sighan2005/{}_test_gold.utf8'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    split_train_dev(dataset)

def make_bmes(dataset='pku', encode="utf-16"):
    path = 'data/' + dataset + '/'
    make_sure_path_exists(path + 'bmes')
    bmes_tag(path + 'raw/train.txt', path + 'bmes/train.txt', encode)
    bmes_tag(path + 'raw/train-all.txt', path + 'bmes/train-all.txt', encode)
    bmes_tag(path + 'raw/dev.txt', path + 'bmes/dev.txt', encode)
    bmes_tag(path + 'raw/test.txt', path + 'bmes/test.txt', encode)

def make_bmes(dataset='pku'):
    path = 'data/' + dataset + '/'
    make_sure_path_exists(path + 'bmes')
    bmes_tag(path + 'raw/train.txt', path + 'bmes/train.txt')
    bmes_tag(path + 'raw/train-all.txt', path + 'bmes/train-all.txt')
    bmes_tag(path + 'raw/dev.txt', path + 'bmes/dev.txt')
    bmes_tag(path + 'raw/test.txt', path + 'bmes/test.txt')

def converter(filepath, src, dst):
    """Convert a MIDI file to a multi-track piano-roll and save the
    resulting multi-track piano-roll to the destination directory. Return a
    tuple of `midi_md5` and useful information extracted from the MIDI file.
    """
    midi_md5 = os.path.splitext(os.path.basename(filepath))[0]
    multitrack = Multitrack(beat_resolution=CONFIG["beat_resolution"],
                            name=midi_md5)
    pm = pretty_midi.PrettyMIDI(filepath)

    # Merge tracks
    assert pm.instruments[0].name == "MELODY"
    assert pm.instruments[1].name == "BRIDGE"
    assert pm.instruments[2].name == "PIANO"
    pm.instruments[0].name = "MAIN"
    pm.instruments[0].notes = (
        pm.instruments[0].notes
        + pm.instruments[1].notes
        + pm.instruments[2].notes
    )
    del pm.instruments[2]
    del pm.instruments[1]

    multitrack.parse_pretty_midi(pm)
    midi_info = get_midi_info(pm)

    result_dir = change_prefix(os.path.dirname(filepath), src, dst)
    make_sure_path_exists(result_dir)
    multitrack.save(os.path.join(result_dir, midi_md5 + ".npz"))

    return (midi_md5, midi_info)

def main(): """Main function.""" result_dir, src, subset_ids_path = parse_args() id_lists = {tag: [] for tag in TAGS} # Load the IDs of the songs in the subset with open(subset_ids_path) as f: subset_ids = [line.rstrip('\n').split()[1] for line in f] # Loop over all the songs in the subsets for msd_id in subset_ids: for dataset in ('lastfm_train', 'lastfm_test'): filepath = os.path.join( src, dataset, msd_id_to_dirs(msd_id) + '.json') if os.path.exists(filepath): with open(filepath) as f: data = json.load(f) # Loop over all the tags annotated to the song for tag_freq_pair in data['tags']: if tag_freq_pair[0] in TAGS: # Add the ID to the corresponding tag id_lists[tag_freq_pair[0]].append(msd_id) # Save the ID lists to files make_sure_path_exists(result_dir) for tag in TAGS: filename = 'id_list_{}.txt'.format(tag.lower()) with open(os.path.join(result_dir, filename), 'w') as f: for msd_id in id_lists[tag]: f.write(msd_id + '\n') print("ID lists for Last.fm Dataset successfully saved.")
def main(): """Main function.""" result_dir, src, subset_ids_path = parse_args() id_lists = {tag: [] for tag in TAGS} # Load the IDs of the songs in the subset with open(subset_ids_path) as f: subset_ids = [line.rstrip('\n').split()[1] for line in f] # Loop over all the songs in the subsets for msd_id in subset_ids: for dataset in ('lastfm_train', 'lastfm_test'): filepath = os.path.join(src, dataset, msd_id_to_dirs(msd_id) + '.json') if os.path.exists(filepath): with open(filepath) as f: data = json.load(f) # Loop over all the tags annotated to the song for tag_freq_pair in data['tags']: if tag_freq_pair[0] in TAGS: # Add the ID to the corresponding tag id_lists[tag_freq_pair[0]].append(msd_id) # Save the ID lists to files make_sure_path_exists(result_dir) for tag in TAGS: filename = 'id_list_{}.txt'.format(tag.lower()) with open(os.path.join(result_dir, filename), 'w') as f: for msd_id in id_lists[tag]: f.write(msd_id + '\n') print("ID lists for Last.fm Dataset successfully saved.")
def galSaveflux(fList, fid, savedir):
    fileDir = os.path.join(savedir, "doublet_ML")
    make_sure_path_exists(fileDir)
    fileDir = os.path.join(fileDir, str(fid) + ".pkl")
    with open(fileDir, "wb") as f:
        pickle.dump(fList, f)

def make_bmes(dataset="pku"): path = data_path + "/" + dataset + "/" make_sure_path_exists(path + "bmes") bmes_tag(path + "raw/train.txt", path + "bmes/train.txt") bmes_tag(path + "raw/train-all.txt", path + "bmes/train-all.txt") bmes_tag(path + "raw/dev.txt", path + "bmes/dev.txt") bmes_tag(path + "raw/test.txt", path + "bmes/test.txt")
def plotGalaxyLens(doublet, obj, savedir, peak_candidates, preProd, nxtProd,
                   doublet_index, fit):
    if not doublet:
        ax = plt.subplot(1, 1, 1)
        plt.title('RA=' + str(obj.RA) + ', Dec=' + str(obj.DEC) + ', Plate=' +
                  str(obj.plate) + ', Fiber=' + str(obj.fiberid) + ', MJD=' +
                  str(obj.mjd) + '\n$z=' + str(obj.z) + ' \pm' +
                  str(obj.z_err) + '$, Class=' + str(obj.obj_class))
        ax.plot(obj.wave, obj.reduced_flux, 'k')
        plt.xlabel('$Wavelength\, (Angstroms)$')
        plt.ylabel('$f_{\lambda}\, (10^{-17} erg\, s^{-1} cm^{-2} Ang^{-1}$')
        ax.plot(obj.wave, fit, 'r')
        make_sure_path_exists(savedir + '/plots/')
        plt.savefig(savedir + '/plots/' + str(obj.plate) + '-' + str(obj.mjd) +
                    '-' + str(obj.fiberid) + '.png')
        plt.close()
    # If doublet, plot in two different windows
    else:
        # Plot currently inspecting spectra
        plt.figure(figsize=(14, 4))
        plt.suptitle('RA=' + str(obj.RA) + ', Dec=' + str(obj.DEC) +
                     ', Plate=' + str(obj.plate) + ', Fiber=' +
                     str(obj.fiberid) + ', MJD=' + str(obj.mjd) + '\n$z=' +
                     str(obj.z) + ' \pm' + str(obj.z_err) + '$, Class=' +
                     str(obj.obj_class))
        # Reduced flux overall
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.plot(obj.wave[10:-10], obj.reduced_flux[10:-10], 'k')
        ax1.plot(obj.wave, fit, 'r')
        ax1.set_xlabel('$\lambda \, [\AA]$ ')
        ax1.set_ylabel(
            '$f_{\lambda}\, (10^{-17} erg\, s^{-1} cm^{-2} Ang^{-1}$')
        ax1.set_xlim([np.min(obj.wave), np.max(obj.wave)])
        # Reduced flux detail
        ax2 = plt.subplot2grid((1, 3), (0, 2))
        ax2.set_xlabel('$\lambda \, [\AA]$ ')
        ax2.locator_params(tight=True)
        ax2.set_xlim([peak_candidates[doublet_index].wavelength - 30.0,
                      peak_candidates[doublet_index].wavelength + 30.0])
        ax2.plot(obj.wave, obj.reduced_flux, 'k')
        ax2.plot(obj.wave, fit, 'r')
        ax2.set_ylim([-5, 10])
        ax2.vlines(x=obj.zline['linewave'] * (1.0 + obj.z), ymin=-10, ymax=10,
                   colors='g', linestyles='dashed')
        # Plot previous one
        if obj.fiberid != 1:
            objPre = SDSSObject(obj.plate, obj.mjd, obj.fiberid - 1,
                                obj.dataVersion, obj.baseDir)
            ax2.plot(objPre.wave, objPre.reduced_flux, 'b')
        # Plot next one
        if obj.fiberid != 1000:
            objNxt = SDSSObject(obj.plate, obj.mjd, obj.fiberid + 1,
                                obj.dataVersion, obj.baseDir)
            ax2.plot(objNxt.wave, objNxt.reduced_flux, 'g')
        # Save to file
        make_sure_path_exists(os.path.join(savedir, 'plots'))
        plt.savefig(os.path.join(savedir, 'plots', str(obj.plate) + '-' +
                                 str(obj.mjd) + '-' + str(obj.fiberid) +
                                 '.png'))
        plt.close()

def main(): """Main function.""" dst, src, fs, tempo = parse_args() make_sure_path_exists(os.path.dirname(dst)) multitrack = pypianoroll.Multitrack(src) pm = multitrack.to_pretty_midi(tempo) waveform = pm.fluidsynth() scipy.io.wavfile.write(dst, fs, waveform)
def convert_bakeoff2005_dataset(dataset):
    print('Converting {}...'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/bakeoff2005/{}_training.utf8'.format(dataset),
                 'data/{}/raw/train-all.txt'.format(dataset), True)
    convert_file('data/bakeoff2005/{}_test_gold.utf8'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    split_train_dev(dataset)

def make_radical(dataset='pku'):
    print('Making radical tags for {}...'.format(dataset))
    path = 'data/' + dataset
    make_sure_path_exists(path + '/radical')
    to_radical(path + '/bmes/train.txt', path + '/radical/train.txt')
    to_radical(path + '/bmes/train-all.txt', path + '/radical/train-all.txt')
    to_radical(path + '/bmes/dev.txt', path + '/radical/dev.txt')
    to_radical(path + '/bmes/test.txt', path + '/radical/test.txt')

def make_bmes(dataset='pku'):
    print('Making bmes tags for {}...'.format(dataset))
    path = 'data/' + dataset + '/'
    make_sure_path_exists(path + 'bmes')
    bmes_tag(path + 'raw/train.txt', path + 'bmes/train.txt')
    bmes_tag(path + 'raw/train-all.txt', path + 'bmes/train-all.txt')
    bmes_tag(path + 'raw/dev.txt', path + 'bmes/dev.txt')
    bmes_tag(path + 'raw/test.txt', path + 'bmes/test.txt')

def convert_sighan2008_dataset(dataset, utf=16):
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file(
        'data/sighan2008/{}_train_seg/{}_train_utf{}.seg'.format(
            dataset, dataset, utf),
        'data/{}/raw/train-all.txt'.format(dataset), True,
        'utf-{}'.format(utf))
    convert_file(
        'data/sighan2008/{}_seg_truth&resource/{}_truth_utf{}.seg'.format(
            dataset, dataset, utf),
        'data/{}/raw/test.txt'.format(dataset), False,
        'utf-{}'.format(utf))
    split_train_dev(dataset)

def plotGalaxyLens(doublet, obj, savedir, peak_candidates, preProd, nxtProd,
                   doublet_index, fit):
    if not doublet:
        ax = plt.subplot(1, 1, 1)
        plt.title('RA=' + str(obj.RA) + ', Dec=' + str(obj.DEC) + ', Plate=' +
                  str(obj.plate) + ', Fiber=' + str(obj.fiberid) + ', MJD=' +
                  str(obj.mjd) + '\n$z=' + str(obj.z) + ' \pm' +
                  str(obj.z_err) + '$, Class=' + str(obj.obj_class))
        ax.plot(obj.wave, obj.reduced_flux, 'k')
        plt.xlabel('$Wavelength\, (Angstroms)$')
        plt.ylabel('$f_{\lambda}\, (10^{-17} erg\, s^{-1} cm^{-2} Ang^{-1}$')
        ax.plot(obj.wave, fit, 'r')
        make_sure_path_exists(savedir + '/plots/')
        plt.savefig(savedir + '/plots/' + str(obj.plate) + '-' + str(obj.mjd) +
                    '-' + str(obj.fiberid) + '.png')
        plt.close()
    # If doublet, plot in two different windows
    else:
        # Plot currently inspecting spectra
        plt.figure(figsize=(14, 6))
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        plt.suptitle('RA=' + str(obj.RA) + ', Dec=' + str(obj.DEC) +
                     ', Plate=' + str(obj.plate) + ', Fiber=' +
                     str(obj.fiberid) + ', MJD=' + str(obj.mjd) + '\n$z=' +
                     str(obj.z) + ' \pm' + str(obj.z_err) + '$, Class=' +
                     str(obj.obj_class))
        ax2 = plt.subplot2grid((1, 3), (0, 2))
        ax1.plot(obj.wave[10:-10], obj.reduced_flux[10:-10], 'k')
        ax1.plot(obj.wave, fit, 'r')
        ax1.set_xlabel('$\lambda \, [\AA]$ ')
        ax1.set_ylabel(
            '$f_{\lambda}\, (10^{-17} erg\, s^{-1} cm^{-2} Ang^{-1}$')
        ax2.set_xlabel('$\lambda \, [\AA]$ ')
        ax2.locator_params(tight=True)
        ax2.set_xlim([peak_candidates[doublet_index].wavelength - 30.0,
                      peak_candidates[doublet_index].wavelength + 30.0])
        ax2.plot(obj.wave, obj.reduced_flux, 'k')
        ax2.plot(obj.wave, fit, 'r')
        ax2.set_ylim([-5, 10])
        ax2.vlines(x=obj.zline['linewave'] * (1.0 + obj.z), ymin=-10, ymax=10,
                   colors='g', linestyles='dashed')
        ax1.set_xlim([np.min(obj.wave), np.max(obj.wave)])
        # Plot previous one
        if obj.fiberid != 1:
            objPre = SDSSObject(obj.plate, obj.mjd, obj.fiberid - 1,
                                obj.dataVersion, obj.baseDir)
            ax2.plot(objPre.wave, objPre.reduced_flux, 'b')
        # Plot next one
        if obj.fiberid != 1000:
            objNxt = SDSSObject(obj.plate, obj.mjd, obj.fiberid + 1,
                                obj.dataVersion, obj.baseDir)
            ax2.plot(objNxt.wave, objNxt.reduced_flux, 'g')
        # Save to file
        make_sure_path_exists(os.path.join(savedir, 'plots'))
        plt.savefig(os.path.join(savedir, 'plots', str(obj.plate) + '-' +
                                 str(obj.mjd) + '-' + str(obj.fiberid) +
                                 '.png'))
        plt.close()

def convert_sxu():
    dataset = 'sxu'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/bakeoff2008/{}/train.txt'.format(dataset),
                 'data/{}/raw/train-all.txt'.format(dataset), True)
    convert_file('data/bakeoff2008/{}/test.txt'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    split_train_dev(dataset)
    make_bmes(dataset)

def convert_sxu():
    dataset = 'sxu'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/other/{}/train.txt'.format(dataset),
                 'data/{}/raw/train-all.txt'.format(dataset), True)
    convert_file('data/other/{}/test.txt'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    split_train_dev(dataset)
    make_bmes(dataset)

def merger(filepath, src, dst):
    """Load and merge a multitrack pianoroll and save to the given path."""
    # Load and merge the multitrack pianoroll
    multitrack = Multitrack(filepath)
    merged = get_merged(multitrack)

    # Save the merged multitrack pianoroll
    result_path = change_prefix(filepath, src, dst)
    make_sure_path_exists(os.path.dirname(result_path))
    merged.save(result_path)

def lensFinder(plate, mjd, fiberid, datav, datadir, savedir, lya, qso, jpt,
               bwidth, bsig, maxchi2):
    sd = os.path.join(savedir, str(plate) + "-" + str(mjd))
    make_sure_path_exists(sd)
    try:
        eBOSSLens(plate, mjd, fiberid, datav, lya, qso, jpt, sd, datadir,
                  max_chi2=maxchi2, bwidth=bwidth, bsig=bsig)
    except Exception as reason:
        text = str(plate) + " " + str(mjd) + " " + str(fiberid) + " " + \
            str(reason)
        print(text)

def binarizer(filepath, src, dst):
    """Load and binarize a multitrack pianoroll and save the resulting
    multitrack pianoroll to the destination directory."""
    # Load and binarize the multitrack pianoroll
    multitrack = Multitrack(filepath)
    multitrack.binarize()

    # Save the binarized multitrack pianoroll
    result_path = change_prefix(filepath, src, dst)
    make_sure_path_exists(os.path.dirname(result_path))
    multitrack.save(result_path)

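# Note: the converter, merger and binarizer snippets above all call a
# change_prefix helper that is not included in this collection. Judging only
# from how it is used, it presumably swaps the leading `src` prefix of a path
# for `dst`; a minimal sketch under that assumption (not the projects' actual
# implementation):
import os


def change_prefix(path, src, dst):
    """Return `path` with its leading `src` prefix replaced by `dst`."""
    return os.path.join(dst, os.path.relpath(path, src))
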
def lensFinder(plate, mjd, fiberid, datav, datadir, savedir, lya, qso, jpt,
               bwidth, bsig, maxchi2, doplot):
    sd = os.path.join(savedir, str(plate) + "-" + str(mjd))
    make_sure_path_exists(sd)
    try:
        eBOSSLens(plate, mjd, fiberid, datav, lya, qso, jpt, sd, datadir,
                  max_chi2=maxchi2, bwidth=bwidth, bsig=bsig, doPlot=doplot)
    except Exception as reason:
        text = str(plate) + " " + str(mjd) + " " + str(fiberid) + " " + \
            str(reason)
        print(text)

def convert_wiki():
    dataset = 'wiki'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/wiki/generated.train.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/wiki/generated.dev.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/wiki/generated.test.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def make_training_examples():
    # Note: originally written for Python 2 (print statement, xrange);
    # updated here to Python 3 syntax for consistency with the other snippets.
    for i in range(NUM_TRAINING_EXAMPLES):
        print("Training: {0}/{1}".format(i + 1, NUM_TRAINING_EXAMPLES))
        category = random.choice(categories)
        audio_segments_dir = '{0}/{1}/train'.format(AUDIO_SEGMENTS_DIR,
                                                    category)
        segment_wav_file = '{0}/{1}'.format(
            audio_segments_dir, random.choice(audio_segments[category]))
        example_dir = '{0}/{1}'.format(TRAINING_EXAMPLES_DIR, category)
        make_sure_path_exists(example_dir)
        example_file_prefix = '{0}/{1}'.format(example_dir, i)
        extract_random_augmented_spectrogram(segment_wav_file,
                                             example_file_prefix)

def convert_ctb():
    dataset = 'ctb'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/ctb/ctb6.train.seg',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/ctb/ctb6.dev.seg',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/ctb/ctb6.test.seg',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def main(): """Main function.""" src, dst = parse_args() make_sure_path_exists(dst) if CONFIG['multicore'] > 1: joblib.Parallel(n_jobs=CONFIG['multicore'], verbose=5)( joblib.delayed(merger)(npz_path, src, dst) for npz_path in findall_endswith('.npz', src)) else: for npz_path in findall_endswith('.npz', src): merger(npz_path, src, dst)
def convert_ctb():
    dataset = 'ctb'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/other/ctb/ctb6.train.seg',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/other/ctb/ctb6.dev.seg',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/other/ctb/ctb6.test.seg',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_weibo():
    dataset = 'weibo'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    convert_file('data/weibo/nlpcc2016-word-seg-train.dat',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/weibo/nlpcc2016-wordseg-dev.dat',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    # TODO the weibo test answer is missing
    convert_file('data/weibo/nlpcc2016-wordseg-dev.dat',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_synthetic_corpus():
    dataset = 'syn'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    split_train_dev_test('syn')
    convert_file('data/syn/train.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/syn/dev.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/syn/test.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def main(): """Main function.""" src, dst = parse_args() make_sure_path_exists(dst) if CONFIG['multicore'] > 1: joblib.Parallel(n_jobs=CONFIG['multicore'], verbose=5)( joblib.delayed(binarizer)(npz_path, src, dst) for npz_path in findall_endswith('.npz', src)) else: for npz_path in findall_endswith('.npz', src): binarizer(npz_path, src, dst) print("Dataset successfully binarized.")
def process(sources, output, force):
    """Download sources and process the file to the output directory.

    \b
    SOURCES: Source JSON file or directory of files. Required.
    OUTPUT: Destination directory for generated data. Required.
    """
    for path in utils.get_files(sources):
        pathparts = utils.get_path_parts(path)
        pathparts[0] = output.strip(os.sep)
        pathparts[-1] = pathparts[-1].replace('.json', '.geojson')

        outdir = os.sep.join(pathparts[:-1])
        outfile = os.sep.join(pathparts)

        source = utils.read_json(path)
        urlfile = urlparse(source['url']).path.split('/')[-1]

        if not hasattr(adapters, source['filetype']):
            utils.error('Unknown filetype', source['filetype'], '\n')
            continue

        if os.path.isfile(outfile) and not force:
            utils.error('Skipping', path, 'since generated file exists.',
                        'Use --force to regenerate.', '\n')
            continue

        utils.info('Downloading', source['url'])
        try:
            fp = utils.download(source['url'])
        except IOError:
            utils.error('Failed to download', source['url'], '\n')
            continue

        utils.info('Reading', urlfile)
        try:
            geojson = getattr(adapters, source['filetype']).read(
                fp, source['properties'])
        except IOError:
            utils.error('Failed to read', urlfile)
            continue
        finally:
            os.remove(fp.name)

        utils.make_sure_path_exists(outdir)
        utils.write_json(outfile, geojson)
        utils.success('Done. Processed to', outfile, '\n')

def init_logger():
    log_formatter = logging.Formatter("%(message)s")
    logger = logging.getLogger()
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_formatter)
    logger.addHandler(console_handler)
    logger.setLevel(logging.INFO)

    root_dir = FLAGS.model_dir if FLAGS.model_dir != '' else FLAGS.load_dir
    make_sure_path_exists(root_dir)
    if not os.path.exists(root_dir):
        os.mkdir(root_dir)
    file_handler = logging.FileHandler("{0}/info.log".format(root_dir),
                                       mode='a')
    file_handler.setFormatter(log_formatter)
    logger.addHandler(file_handler)
    return logger

def convert_sighan2005_dataset(dataset):
    global sighan05_root
    root = os.path.join(data_path, dataset)
    make_sure_path_exists(root)
    make_sure_path_exists(root + "/raw")
    file_path = "{}/{}_training.utf8".format(sighan05_root, dataset)
    convert_file(file_path, "{}/raw/train-all.txt".format(root),
                 is_traditional(dataset), True)
    if dataset == "as":
        file_path = "{}/{}_testing_gold.utf8".format(sighan05_root, dataset)
    else:
        file_path = "{}/{}_test_gold.utf8".format(sighan05_root, dataset)
    convert_file(file_path, "{}/raw/test.txt".format(root),
                 is_traditional(dataset), False)
    split_train_dev(dataset)

def convert_conll(dataset):
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    extract_conll('data/other/{}/dev.conll'.format(dataset),
                  'data/{}/dev.txt'.format(dataset))
    extract_conll('data/other/{}/test.conll'.format(dataset),
                  'data/{}/test.txt'.format(dataset))
    extract_conll('data/other/{}/train.conll'.format(dataset),
                  'data/{}/train.txt'.format(dataset))
    convert_file('data/{}/train.txt'.format(dataset),
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/{}/dev.txt'.format(dataset),
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/{}/test.txt'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_cncorpus():
    dataset = 'cnc'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    remove_pos('data/other/cnc/train.txt', 'data/cnc/train-no-pos.txt')
    remove_pos('data/other/cnc/dev.txt', 'data/cnc/dev-no-pos.txt')
    remove_pos('data/other/cnc/test.txt', 'data/cnc/test-no-pos.txt')
    convert_file('data/cnc/train-no-pos.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/cnc/dev-no-pos.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/cnc/test-no-pos.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_zhuxian():
    dataset = 'zx'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    remove_pos('data/other/zx/dev.zhuxian.wordpos', 'data/zx/dev.txt', '_')
    remove_pos('data/other/zx/train.zhuxian.wordpos', 'data/zx/train.txt', '_')
    remove_pos('data/other/zx/test.zhuxian.wordpos', 'data/zx/test.txt', '_')
    convert_file('data/zx/train.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/zx/dev.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/zx/test.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_conll(dataset):
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    extract_conll('data/{}/dev.conll'.format(dataset),
                  'data/{}/dev.txt'.format(dataset))
    extract_conll('data/{}/test.conll'.format(dataset),
                  'data/{}/test.txt'.format(dataset))
    extract_conll('data/{}/train.conll'.format(dataset),
                  'data/{}/train.txt'.format(dataset))
    convert_file('data/{}/train.txt'.format(dataset),
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/{}/dev.txt'.format(dataset),
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/{}/test.txt'.format(dataset),
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_cncorpus():
    dataset = 'cnc'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    remove_pos('data/cnc/train.txt', 'data/cnc/train-no-pos.txt')
    remove_pos('data/cnc/dev.txt', 'data/cnc/dev-no-pos.txt')
    remove_pos('data/cnc/test.txt', 'data/cnc/test-no-pos.txt')
    convert_file('data/cnc/train-no-pos.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/cnc/dev-no-pos.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/cnc/test-no-pos.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def convert_zhuxian():
    dataset = 'zx'
    print('Converting corpus {}'.format(dataset))
    root = 'data/' + dataset
    make_sure_path_exists(root)
    make_sure_path_exists(root + '/raw')
    remove_pos('data/zx/dev.zhuxian.wordpos', 'data/zx/dev.txt', '_')
    remove_pos('data/zx/train.zhuxian.wordpos', 'data/zx/train.txt', '_')
    remove_pos('data/zx/test.zhuxian.wordpos', 'data/zx/test.txt', '_')
    convert_file('data/zx/train.txt',
                 'data/{}/raw/train.txt'.format(dataset), True)
    convert_file('data/zx/dev.txt',
                 'data/{}/raw/dev.txt'.format(dataset), True)
    convert_file('data/zx/test.txt',
                 'data/{}/raw/test.txt'.format(dataset), False)
    combine_files('data/{}/raw/train.txt'.format(dataset),
                  'data/{}/raw/dev.txt'.format(dataset),
                  'data/{}/raw/train-all.txt'.format(dataset))
    make_bmes(dataset)

def main(): """Main function.""" src, dst, id_list_path = parse_args() make_sure_path_exists(dst) with open(id_list_path) as f: id_list = [line.split() for line in f] if CONFIG['multicore'] > 1: joblib.Parallel(n_jobs=CONFIG['multicore'], verbose=5)( joblib.delayed(collector)(midi_md5, msd_id, src, dst) for midi_md5, msd_id in id_list) else: for midi_md5, msd_id in id_list: collector(midi_md5, msd_id, src, dst) print("Subset successfully collected for: {}".format(id_list_path))
def export_gen_graph(tf, sess, variables_filter, variables_bias,
                     variables_scalars, path, name="gen_export.pb",
                     width=224, ratio=1.0):
    var_gen_filter_new = []
    for i in range(len(variables_filter)):
        var_gen_filter_new.append(sess.run(variables_filter[i]))
    var_gen_bias_new = []
    for i in range(len(variables_bias)):
        var_gen_bias_new.append(sess.run(variables_bias[i]))
    var_gen_scalars_new = []
    for i in range(len(variables_scalars)):
        var_gen_scalars_new.append(sess.run(variables_scalars[i]))

    to_graph = tf.Graph()
    with to_graph.as_default() as g:
        gn.build_gen_graph_deep(tf, trainable=False,
                                variables_gen_filter=var_gen_filter_new,
                                variables_gen_bias=var_gen_bias_new,
                                variables_scalars=var_gen_scalars_new,
                                width_res=width, ratio=ratio)
        # saver = tf.train.Saver(tf.all_variables())
        utils.make_sure_path_exists(
            conf.project_path + conf.output_generator + path)
        with tf.Session() as new_sess:
            init = tf.global_variables_initializer()
            new_sess.run(init)
            # summary_writer = tf.train.SummaryWriter(
            #     project_path + log_generator, graph_def=new_sess.graph_def)
            # saver.save(new_sess, project_path + "\\android_exports" + path + name)
            tf.train.write_graph(tf.get_default_graph(),
                                 conf.project_path + conf.output_generator
                                 + path,
                                 name, as_text=False)

def plot_Jackpot(obj, peak, em_lines, savedir, counter):
    '''
    jptSave.plot_Jackpot(obj, peak, em_lines, savedir, counter)
    =========================================================
    Plots Jackpot lens candidates

    Parameters:
        obj: The SDSS object/spectra on which applied the subtraction
        peak_candidates: The inquired peaks
        savedir: Directory to save the plots/data
        em_lines: Rest frame ELG emission lines
        counter: To keep track of each candidates per spectra
    Returns:
        - Nothing. Create and save the plot.
    '''
    fontP = FontProperties()
    fontP.set_size('medium')
    plt.suptitle(SDSSname(obj.RA, obj.DEC) + '\n' + 'RA=' + str(obj.RA) +
                 ', Dec=' + str(obj.DEC) + ', $z_{QSO}=' +
                 '{:03.3}'.format(obj.z) + '$')
    gs = gridspec.GridSpec(1, 4)
    p1 = plt.subplot(gs[0, :4])
    smoothed_flux = np.array([np.mean(obj.flux[ii - 2:ii + 3])
                              for ii in range(len(obj.flux))
                              if (ii > 4 and ii < len(obj.flux) - 4)])
    p1.plot(obj.wave[5:-4], smoothed_flux, 'k', label='BOSS Flux',
            drawstyle='steps-mid')
    # p1.plot(wave, flux, 'k', label='BOSS Flux')
    p1.plot(obj.wave, obj.synflux, 'r', label='PCA fit')
    box = p1.get_position()
    p1.set_position([box.x0, box.y0 + 0.02, box.width * 0.9, box.height])
    p1.set_ylim(np.min(obj.synflux) - 3, np.max(obj.synflux) + 3)
    p1.vlines(x=em_lines * (1 + peak.z_1), ymin=-100, ymax=100, colors='g',
              linestyles='dashed')
    p1.vlines(x=em_lines * (1 + peak.z_2), ymin=-100, ymax=100, colors='b',
              linestyles='dashed')
    p1.legend(loc='upper right', bbox_to_anchor=(1.2, 1), ncol=1, prop=fontP)
    p1.set_xlim(3500, 10500)
    plt.ylabel('Flux [$10^{-17} erg\, s^{-1} cm^{-2} \AA^{-1}]$')
    make_sure_path_exists(savedir + '/plots/')
    plt.savefig(savedir + '/plots/' + SDSSname(obj.RA, obj.DEC) + '-' +
                str(obj.plate) + '-' + str(obj.mjd) + '-' + str(obj.fiberid) +
                '-' + str(counter) + '.png')
    plt.close()

def split(videos_dict):
    """For each video in each category, split its audio file into minute
    segments. For each minute segment, extract the first 50 seconds into an
    audio file to be used for training data, and the last 10 seconds into an
    audio file to be used for validation data.
    """
    for category in videos_dict:
        for url in videos_dict[category]:
            video_id = url
            video_wav_file = '{0}/{1}/{2}/audio.wav'.format(
                AUDIO_DIR, category, video_id)
            train_segments_dir = '{0}/{1}/train'.format(AUDIO_SEGMENTS_DIR,
                                                        category)
            val_segments_dir = '{0}/{1}/val'.format(AUDIO_SEGMENTS_DIR,
                                                    category)
            make_sure_path_exists(train_segments_dir)
            make_sure_path_exists(val_segments_dir)
            split_segments(video_wav_file, video_id, train_segments_dir,
                           val_segments_dir)

def main(): """Main function.""" result_dir, src, subset_ids_path = parse_args() # Parse the label of each song id_label_masd = {} with open(src) as f: for line in f: if line.startswith('#'): continue id_label_masd[line.split()[0]] = LABEL_NUM_MAP[line.split()[1]] # Load the IDs of the songs in the subset with open(subset_ids_path) as f: subset_ids = [line.rstrip('\n').split()[1] for line in f] # Loop over all the songs in the subset collected = {} for msd_id in subset_ids: label = id_label_masd.get(msd_id) if label is None: continue collected[msd_id] = label # Save the ID label pairs to a file make_sure_path_exists(result_dir) filepath = os.path.join(result_dir, 'masd_labels.txt') with open(filepath, 'w') as f: f.write("# msd_id, label_num\n") for msd_id in collected: f.write("{} {}\n".format(msd_id, collected[msd_id])) print("Labels successfully saved.") # Save the cleansed ID label pairs to a file cleansed = {} for msd_id in collected: if collected[msd_id] in CLEANSED_LABELS: cleansed[msd_id] = CLEANSED_LABELS.index(collected[msd_id]) filepath = os.path.join(result_dir, 'masd_labels_cleansed.txt') with open(filepath, 'w') as f: f.write("# msd_id, label_num\n") for msd_id in cleansed: f.write("{} {}\n".format(msd_id, cleansed[msd_id])) print("Cleansed labels successfully saved.")
def converter(filepath, src, dst): """Convert a MIDI file to a multi-track piano-roll and save the resulting multi-track piano-roll to the destination directory. Return a tuple of `midi_md5` and useful information extracted from the MIDI file. """ try: midi_md5 = os.path.splitext(os.path.basename(filepath))[0] multitrack = Multitrack(beat_resolution=CONFIG['beat_resolution'], name=midi_md5) pm = pretty_midi.PrettyMIDI(filepath) multitrack.parse_pretty_midi(pm) midi_info = get_midi_info(pm) result_dir = change_prefix(os.path.dirname(filepath), src, dst) make_sure_path_exists(result_dir) multitrack.save(os.path.join(result_dir, midi_md5 + '.npz')) return (midi_md5, midi_info) except: return None
def main(): """Main function.""" result_dir, src, subset_ids_path = parse_args() # Parse the label of each song tag_dict = {} with open(src) as f: for line in f: if line.startswith('#'): continue elif len(line.split()) == 2: tag_dict[line.split()[0]] = line.split()[1] elif len(line.split()) > 2: tag_dict[line.split()[0]] = '-'.join(line.split()[1:]) tags = set(tag_dict.values()) id_lists = {tag: [] for tag in tags} # Load the IDs of the songs in the subset with open(subset_ids_path) as f: subset_ids = [line.rstrip('\n').split()[1] for line in f] # Loop over all the songs in the subset for msd_id in subset_ids: tag = tag_dict.get(msd_id) if tag is None: continue # Add the ID to the corresponding tag id_lists[tag].append(msd_id) # Save the ID lists to files make_sure_path_exists(result_dir) for tag in tags: filename = 'id_list_{}.txt'.format(tag) with open(os.path.join(result_dir, filename), 'w') as f: for msd_id in id_lists[tag]: f.write(msd_id + '\n') print("ID lists for Million Song Dataset Benchmarks successfully saved.")
def _create_directories(output_path):
    distribution_path = os.path.join(output_path, 'distributions')
    two_d_interaction_path = os.path.join(output_path, '2d_interactions')
    three_d_interaction_path = os.path.join(output_path, '3d_interactions')
    make_sure_path_exists(distribution_path)
    make_sure_path_exists(two_d_interaction_path)
    make_sure_path_exists(three_d_interaction_path)
    return distribution_path, two_d_interaction_path, three_d_interaction_path

def main(): """Main function.""" src, dst, midi_info_path = parse_args() make_sure_path_exists(dst) midi_info = {} if CONFIG['multicore'] > 1: kv_pairs = joblib.Parallel(n_jobs=CONFIG['multicore'], verbose=5)( joblib.delayed(converter)(midi_path, src, dst) for midi_path in findall_endswith('.mid', src)) for kv_pair in kv_pairs: if kv_pair is not None: midi_info[kv_pair[0]] = kv_pair[1] else: for midi_path in findall_endswith('.mid', src): kv_pair = converter(midi_path, src, dst) if kv_pair is not None: midi_info[kv_pair[0]] = kv_pair[1] if midi_info_path is not None: with open(midi_info_path, 'w') as f: json.dump(midi_info, f) print("{} files have been successfully converted".format(len(midi_info)))
def plotQSOGal(obj, peak, savedir, em_lines, n):
    '''
    qsoSave.plotQSOGal(obj, peak_candidates, savedir)
    ====================================
    Parameters:
        obj: inspected spectra
        peak: Inquired peak on the spectra
        savedir: Directory to save the plots and info
        em_lines: rest frame ELG emission lines
        n: numbering of the plots
    Returns:
        - Nothing. Prints peak info and save plots
    '''
    make_sure_path_exists(savedir + '/plots/')
    z_backgal = peak.redshift
    fontP = FontProperties()
    fontP.set_size('medium')
    plt.suptitle(SDSSname(obj.RA, obj.DEC) + '\n' + 'RA=' + str(obj.RA) +
                 ', Dec=' + str(obj.DEC) + ', $z_{QSO}=' +
                 '{:03.3}'.format(obj.z) + '$')

    gs = gridspec.GridSpec(2, 4)
    p1 = plt.subplot(gs[0, :4])
    smoothed_flux = np.array([np.mean(obj.flux[ii - 2:ii + 3])
                              for ii in range(len(obj.flux))
                              if (ii > 4 and ii < len(obj.flux) - 4)])
    p1.plot(obj.wave[5:-4], smoothed_flux, 'k', label='BOSS Flux',
            drawstyle='steps-mid')
    p1.plot(obj.wave, obj.synflux, 'r', label='PCA fit')
    # if z < 1 and show == True:
    #     p1.plot(HB_wave, lorentz(HB_wave, params_beta[0], params_beta[1],
    #             params_beta[2]) + HB_wave*line_coeff[0] + line_coeff[1], '--g')
    box = p1.get_position()
    p1.set_position([box.x0, box.y0 + 0.02, box.width * 0.9, box.height])
    p1.set_ylim(np.min(obj.synflux) - 3, np.max(obj.synflux) + 3)
    p1.vlines(x=em_lines * (1 + z_backgal), ymin=-100, ymax=100, colors='g',
              linestyles='dashed')
    p1.legend(loc='upper right', bbox_to_anchor=(1.2, 1), ncol=1, prop=fontP)
    p1.set_xlim(3500, 10500)
    plt.ylabel('Flux [$10^{-17} erg\, s^{-1} cm^{-2} \AA^{-1}]$')

    # [OII] 3727 detail panel
    p2 = plt.subplot(gs[1, :1])
    p2.vlines(x=em_lines * (1 + z_backgal), ymin=-100, ymax=100, colors='g',
              linestyles='dashed')
    loc_flux = obj.flux[obj.wave2bin((1 + z_backgal) * (3727 - 10)):
                        obj.wave2bin((1 + z_backgal) * (3727 + 10))]
    p2.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (3727 - 10)):
                     obj.wave2bin((1 + z_backgal) * (3727 + 10))],
            loc_flux, 'k', label='OII', drawstyle='steps-mid')
    p2.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (3727 - 10)):
                     obj.wave2bin((1 + z_backgal) * (3727 + 10))],
            obj.synflux[obj.wave2bin((1 + z_backgal) * (3727 - 10)):
                        obj.wave2bin((1 + z_backgal) * (3727 + 10))],
            'r', label='OII', drawstyle='steps-mid')
    if len(loc_flux) > 0:
        p2.set_ylim(np.min(loc_flux) - 1, np.max(loc_flux) + 1)
    plt.title('[OII] 3727')
    p2.set_xlim((1 + z_backgal) * (3727 - 10), (1 + z_backgal) * (3727 + 10))
    x1 = int((1 + z_backgal) * 3727)
    plt.xticks([x1 - 15, x1, x1 + 15])
    plt.ylabel('Flux [$10^{-17} erg\, s^{-1} cm^{-2} \AA^{-1}]$')

    # Hbeta / [OIII] detail panel
    # If Ha is below 9500 A, show it
    if obj.z > 0.44:
        p3 = plt.subplot(gs[1, 1:4])
    else:
        p3 = plt.subplot(gs[1, 1:3])
    p3.vlines(x=em_lines * (1 + z_backgal), ymin=-100, ymax=100, colors='g',
              linestyles='dashed')
    loc_flux = obj.flux[obj.wave2bin((1 + z_backgal) * (4861 - 10)):
                        obj.wave2bin((1 + z_backgal) * (5007 + 10))]
    p3.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (4861 - 10)):
                     obj.wave2bin((1 + z_backgal) * (5007 + 10))],
            loc_flux, 'k', label='OIII, Hb', drawstyle='steps-mid')
    p3.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (4861 - 10)):
                     obj.wave2bin((1 + z_backgal) * (5007 + 10))],
            obj.synflux[obj.wave2bin((1 + z_backgal) * (4861 - 10)):
                        obj.wave2bin((1 + z_backgal) * (5007 + 10))],
            'r', label='OIII, Hb', drawstyle='steps-mid')
    if len(loc_flux) > 0:
        p3.set_ylim(np.min(loc_flux) - 1, np.max(loc_flux) + 1)
    plt.title(r'H$\beta$,[OIII] 4959, [OIII] 5007')
    plt.xlabel(r'Observed wavelength [$\AA$]')
    p3.set_xlim((1 + z_backgal) * (4861 - 10), (1 + z_backgal) * (5007 + 10))
    x1 = int((1 + z_backgal) * 4862 / 10.) * 10
    if x1 < 7600:
        plt.xticks([x1, x1 + 50, x1 + 100, x1 + 150, x1 + 200])
    else:
        plt.xticks([x1, x1 + 50, x1 + 100, x1 + 150, x1 + 200, x1 + 250])
    box = p3.get_position()
    p3.set_position([box.x0 + 0.02, box.y0, box.width * 0.9, box.height])

    # Halpha detail panel (only when it falls on the detector)
    if obj.z < 0.44:
        p4 = plt.subplot(gs[1, 3:4])
        p4.vlines(x=em_lines * (1 + z_backgal), ymin=-100, ymax=100,
                  colors='g', linestyles='dashed')
        loc_flux = obj.flux[obj.wave2bin((1 + z_backgal) * (6562 - 10)):
                            obj.wave2bin((1 + z_backgal) * (6562 + 10))]
        p4.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (6562 - 10)):
                         obj.wave2bin((1 + z_backgal) * (6562 + 10))],
                loc_flux, 'k', label='Ha', drawstyle='steps-mid')
        p4.plot(obj.wave[obj.wave2bin((1 + z_backgal) * (6562 - 10)):
                         obj.wave2bin((1 + z_backgal) * (6562 + 10))],
                obj.synflux[obj.wave2bin((1 + z_backgal) * (6562 - 10)):
                            obj.wave2bin((1 + z_backgal) * (6562 + 10))],
                'r', label='Ha', drawstyle='steps-mid')
        if len(loc_flux) > 0:
            p4.set_ylim(np.min(loc_flux) - 1, np.max(loc_flux) + 1)
        plt.title(r'H$\alpha$')
        p4.set_xlim((1 + z_backgal) * (6562 - 10),
                    (1 + z_backgal) * (6562 + 10))
        x1 = int((1 + z_backgal) * 6562)
        if x1 < 9900:
            plt.xticks([x1 - 10, x1, x1 + 10],
                       [str(x1 - 10), str(x1), str(x1 + 10)])
        else:
            plt.xticks([x1 - 10, x1, x1 + 10],
                       [str(x1 - 10), '', str(x1 + 10)])

    plt.savefig(savedir + '/plots/' + SDSSname(obj.RA, obj.DEC) + '-' +
                str(obj.plate) + '-' + str(obj.mjd) + '-' + str(obj.fiberid) +
                '-' + str(n) + '.png')
    plt.close()

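# Note: every snippet in this collection calls make_sure_path_exists before
# writing output, but the helper itself is never shown. A minimal sketch,
# assuming it follows the common os.makedirs idiom of ignoring an
# "already exists" error (the individual projects' implementations may differ):
import errno
import os


def make_sure_path_exists(path):
    """Create `path` (including parent directories) if it does not exist."""
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise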