def make_aux_mixed(data_root, output_dir, upsample_ratio=1.0):
    ntype_kv = dict(zip(*read_kv(os.path.join(output_dir, 'ntype_train.txt'))))
    clean_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'clean_label_kv.txt'))))
    noisy_kv = dict(
        zip(*read_kv(os.path.join(data_root, 'noisy_label_kv.txt'))))
    clean_keys = read_list(os.path.join(data_root, 'clean_train_key_list.txt'))
    noisy_keys = read_list(os.path.join(data_root, 'noisy_train_key_list.txt'))
    # upsampling clean keys to ratio * #noisy_keys
    # np.random.choice expects an integer sample size, so cast the product
    clean_keys = np.random.choice(clean_keys,
                                  int(len(noisy_keys) * upsample_ratio))
    # mix clean and noisy data
    keys = list(clean_keys) + list(noisy_keys)
    np.random.shuffle(keys)
    clean, noisy, ntype = [], [], []
    for k in keys:
        if k in clean_kv:
            clean.append(clean_kv[k])
            noisy.append('-1')
        else:
            clean.append('-1')
            noisy.append(noisy_kv[k])
        if k in ntype_kv:
            ntype.append(ntype_kv[k])
        else:
            ntype.append('-1')
    keys = [k + ' -1' for k in keys]
    write_list(keys, os.path.join(output_dir, 'mixed_train_images.txt'))
    write_list(clean, os.path.join(output_dir, 'mixed_train_label_clean.txt'))
    write_list(noisy, os.path.join(output_dir, 'mixed_train_label_noisy.txt'))
    write_list(ntype, os.path.join(output_dir, 'mixed_train_label_ntype.txt'))
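# read_kv, read_list, and write_list are not defined in these excerpts, and
# their signatures clearly vary between projects (e.g. read_data below passes
# (arch, releasedir, verbose), and the rename_files snippet expects a mapping).
# A minimal line-based sketch of what the helpers used here might look like --
# an assumption for readability, not any project's actual code. Note that some
# snippets call write_list(lines, path) and others write_list(path, lines).
def read_list(file_path):
    # one stripped line per element
    with open(file_path) as f:
        return [line.strip() for line in f]


def read_kv(file_path):
    # each line holds "key value"; return keys and values as two lists so that
    # dict(zip(*read_kv(path))) builds a key -> value mapping
    pairs = [line.split() for line in read_list(file_path)]
    keys, values = zip(*pairs) if pairs else ((), ())
    return list(keys), list(values)


def write_list(lines, file_path):
    # one element per line
    with open(file_path, 'w') as f:
        f.write('\n'.join(str(line) for line in lines) + '\n')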
def get_release_years(avatar_list_path: str, getchu_data_path: str) -> list:
    """
    Statistics of the dataset's release years.
    :param avatar_list_path:
    :param getchu_data_path:
    :return:
    """
    avatar_list = utils.read_list(avatar_list_path)
    getchu_data_list = utils.read_list(getchu_data_path)
    avatar_list = list(map(lambda each: int(each[0]), avatar_list))
    getchu_data_list = list(
        map(
            lambda each: (int(each[0]),
                          int(re.findall(r'(\d+)-\d+-\d+', each[1])[-1])),
            getchu_data_list))
    years = [
        0 for i in range(
            0,
            np.max(np.array(list(map(lambda each: each[0],
                                     getchu_data_list)))) + 1)
    ]
    statistics = [
        0 for i in range(
            0,
            np.max(np.array(list(map(lambda each: each[1],
                                     getchu_data_list)))) + 1)
    ]
    for each in getchu_data_list:
        years[each[0]] = each[1]
    for each in avatar_list:
        statistics[years[each]] += 1
    print(statistics[1990:])
    return statistics
def make_aux_ntype(data_root, output_dir):
    clean_kv = dict(zip(*read_kv(os.path.join(data_root, "clean_label_kv.txt"))))
    noisy_kv = dict(zip(*read_kv(os.path.join(data_root, "noisy_label_kv.txt"))))
    train_keys = set(read_list(os.path.join(data_root, "clean_train_key_list.txt")))
    val_keys = set(read_list(os.path.join(data_root, "clean_val_key_list.txt")))
    test_keys = set(read_list(os.path.join(data_root, "clean_test_key_list.txt")))
    noisy_keys = set(noisy_kv.keys())

    # compute and save matrix C
    keys = (train_keys | val_keys) & noisy_keys
    clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
    noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
    C = compute_matrix_c(clean_labels, noisy_labels)
    save_to_blobproto(C, os.path.join(output_dir, "matrix_c.binaryproto"))

    # make noise type (ntype)
    def _make(keys, token):
        clean_labels = np.asarray([int(clean_kv[k]) for k in keys])
        noisy_labels = np.asarray([int(noisy_kv[k]) for k in keys])
        lines = []
        alpha = 1.0 / (C.shape[0] - 1)
        for key, y, y_tilde in zip(keys, clean_labels, noisy_labels):
            if y == y_tilde:
                lines.append(key + " 0")
            elif alpha >= C[y_tilde][y]:
                lines.append(key + " 1")
            else:
                lines.append(key + " 2")
        np.random.shuffle(lines)
        output_file = os.path.join(output_dir, "ntype_{}.txt".format(token))
        write_list(lines, output_file)

    _make(train_keys & noisy_keys, "train")
    _make(val_keys & noisy_keys, "val")
    _make(test_keys & noisy_keys, "test")
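# compute_matrix_c and save_to_blobproto are not shown. Judging by the indexing
# C[y_tilde][y] and alpha = 1 / (num_classes - 1), C appears to estimate
# P(noisy = y_tilde | clean = y). A minimal sketch under that assumption
# (numpy as np); the real helper may differ, e.g. in smoothing or orientation:
def compute_matrix_c(clean_labels, noisy_labels):
    num_classes = int(max(clean_labels.max(), noisy_labels.max())) + 1
    C = np.zeros((num_classes, num_classes), dtype=np.float64)
    for y, y_tilde in zip(clean_labels, noisy_labels):
        C[y_tilde][y] += 1          # count (noisy, clean) co-occurrences
    col_sums = C.sum(axis=0, keepdims=True)
    col_sums[col_sums == 0] = 1     # avoid division by zero for unseen classes
    return C / col_sums             # column y sums to 1: P(noisy | clean = y)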
def LINK_ENTITIES(parsed_list, predicted, STAGE=True):
    """
    Two entities are considered to have a link if they appear in a range of
    two consecutive sentences.
    :return: a list of tuples
    """
    print("Start LINKS")
    if STAGE:
        people_links = []
        per_link = []
        location_links = []
        events = []
        for idx in tqdm(range(len(parsed_list))):
            per, loc, per_idx, loc_idx = get_ents_from_predicted(
                predicted[idx], parsed_list[idx])
            # CR per sentence
            if len(per_idx) > 1:
                ev = coref_events(parsed_list[idx], per_idx, loc_idx)
                if any(ev):
                    for x in ev:
                        x.append(idx)
                    events.append([x for x in ev])
            # PER + LOC LINKS per two sentences
            if idx < len(parsed_list) - 1:
                per2, loc2, per_idx2, loc_idx2 = get_ents_from_predicted(
                    predicted[idx + 1], parsed_list[idx + 1])
                per += per2
                loc += loc2
                per_idx += per_idx2
                loc_idx += loc_idx2
            per = list(set(per))
            loc = list(set(loc))
            # PEOPLE
            for a, b in combinations(per, 2):
                link_two_person(people_links, a, b)
                per_link.append([a, b, idx])
            # LOCATIONS
            if any(loc) and any(per):
                for l in loc:
                    for p in per:
                        if p != l:
                            location_links.append([l, p, idx])
        # POST PROCESS
        people_links = sorted([x for x in people_links if x[2] > 4],
                              key=lambda x: x[2],
                              reverse=True)
        events = [x[0] for x in events]
        # write_list('people_links', people_links)
        # write_list('location_links', location_links)
        # write_list('events', events)
    else:
        people_links = read_list('people_links')
        location_links = read_list('location_links')
        # events are not persisted, so return an empty list in this branch
        # (otherwise the return below would raise NameError)
        events = []
    return people_links, location_links, events
def get_short_edge_size(avatar_list_path: str, getchu_data_path: str) -> list:
    """
    Statistics of the dataset's shortest edge size.
    :param avatar_list_path:
    :param getchu_data_path:
    :return:
    """
    avatar_list = utils.read_list(avatar_list_path)
    getchu_data_list = utils.read_list(getchu_data_path)
    avatar_list = list(
        map(lambda each: get_short_size(each[2].strip('\n')), avatar_list))
    statistics = [0 for i in range(0, np.max(np.array(avatar_list)) + 1)]
    for each in avatar_list:
        statistics[each] += 1
    print(statistics[42:])
    return statistics
def _make(token):
    # data_root, output_dir and label_kv come from the enclosing scope
    keys = read_list(
        os.path.join(data_root, 'clean_{}_key_list.txt'.format(token)))
    lines = [k + ' ' + label_kv[k] for k in keys]
    np.random.shuffle(lines)
    output_file = os.path.join(output_dir, 'clean_{}.txt'.format(token))
    write_list(lines, output_file)
def read_data(self, arch, releasedir, symvers):
    """ Read both data files """
    self.matchdata, exists = read_list(arch, releasedir, self.verbose)
    self.total = read_total_list(symvers)
    return exists
def normalize_list(people_links, STAGE=True):
    print("Start NORMALIZATION")
    if STAGE:
        normalized = people_links_norm(people_links)
        for a, b in combinations(
                [[i, x] for i, x in enumerate(normalized) if len(x) == 1], 2):
            if a[1][0] in b[1][0] or b[1][0] in a[1][0]:
                # remove over a snapshot instead of mutating the list while
                # iterating it, which would skip elements
                for x in [x for x in normalized if x[0] == a[1][0]]:
                    normalized.remove(x)
                for x in [x for x in normalized if x[0] == b[1][0]]:
                    normalized.remove(x)
                normalized.append([a[1][0], b[1][0]])
                normalized[-1] = list(set(normalized[-1]))
        # write_list('normalized', normalized)
    else:
        normalized = read_list('normalized')
    print("NER and NORMALIZATION finished:", len(normalized),
          "'person' entities found")
    new_list = []
    for idx, link in enumerate(people_links):
        for i, name in [[i, x] for i, x in enumerate(link) if i != 2]:
            a = [x[0] for x in normalized if name in x]
            if any(a):
                people_links[idx][i] = a[0]
        if link[0] != link[1]:
            new_list.append(link)
    people_links = new_list
    return people_links
def make_data(files, noise_types, data_root):
    # noise types training and val
    # list() so the pairs can be shuffled and sliced under Python 3
    merged = list(zip(files, noise_types))
    np.random.shuffle(merged)
    training = ['{} {}'.format(f, t) for f, t in merged[:8000]]
    test = ['{} {}'.format(f, t) for f, t in merged[8000:]]
    write_list(training, osp.join(data_root, 'ntype_train.txt'))
    write_list(test, osp.join(data_root, 'ntype_test.txt'))
    # noise types of mixed training images
    dic = defaultdict(lambda: -1)
    dic.update(dict(zip(files, noise_types)))
    files = read_list(osp.join(data_root, 'mixed_train_images.txt'))
    files = [f.split()[0] for f in files]
    noise_types = [dic[f] for f in files]
    write_list(noise_types, osp.join(data_root, 'mixed_train_label_ntype.txt'))
def __init__(self, patient_info_dir, merge_info_csv='merge_info.csv',
             exclude_list='exclude_patient_sids.txt',
             outcome_csv='new_outcomes.csv'):
    # patient_info_dir: directory containing the merge_info_csv, exclude_list,
    #                   and outcome_csv
    # merge_info_csv: name of the csv containing the list of edfs and
    #                 timestamps for all patients
    # exclude_list: name of the txt file containing sids of patients to be excluded
    # outcome_csv: name of the csv containing patient outcomes
    merge_info_csv = os.path.join(patient_info_dir, merge_info_csv)
    exclude_list = os.path.join(patient_info_dir, exclude_list)
    outcome_csv = os.path.join(patient_info_dir, outcome_csv)
    self.merge_info_df = pd.read_csv(merge_info_csv)
    self.exclude_patients = utils.read_list(exclude_list)
    self.outcomes_df = pd.read_csv(outcome_csv, index_col=0)
def rename_files(
    folder_path: Path = Argument(default='.', exists=True, file_okay=True,
                                 dir_okay=True, readable=True,
                                 resolve_path=True),
    list_path: Path = Argument(default='list.csv', exists=True, file_okay=True,
                               dir_okay=True, readable=True,
                               resolve_path=True),
):
    # renamed from `list`/`dir` to avoid shadowing the builtins
    name_map = read_list(list_path)
    for cp, dirs, files in walk(folder_path):
        for file in files:
            if file in name_map:
                move(path.join(cp, file),
                     path.join(cp, name_map[file].replace('\n', '')))
def get_tileset(tileset_name, index=-1, override_offset=-1):
    offsets = utils.read_table('scripts/res/meta_tileset_load_offsets.tbl')
    base_offset = 0
    if override_offset == -1:
        if index == -1:
            idx_tbl = utils.read_table('scripts/res/meta_tileset_index.tbl')
            hits = [idx for idx in idx_tbl if idx_tbl[idx] == tileset_name]
            if len(hits) != 1:
                # raising a bare string is a TypeError, so wrap it in an exception
                raise ValueError(
                    f"Found more or less than one entry for {tileset_name}, "
                    f"provide an index if it appears more than once")
            index = hits[0]
        base_offset = (int(offsets[index], 16) // 0x10) & 0xFF
    else:
        base_offset = override_offset
    tbl = utils.read_list(f'scripts/res/tilesets/{tileset_name}.lst', base_offset)
    tbl[0] = ' '
    return tbl
def visualize_ner(ydx):
    from preprocess import get_texts
    from config import spacy, FoundationTrilogy, displacy
    from utils import read_list
    from spacy.tokens import Span

    validation_idx = read_list('validation_dataset')
    sentences2 = get_texts(FoundationTrilogy)
    sentences = [sentences2[i] for i in validation_idx]
    nlp2 = spacy.load("en_core_web_sm", disable=['ner'])
    doc = nlp2(sentences[ydx])
    spans = []
    # sents_pred_labels is expected to be defined in the enclosing scope
    for sp in sents_pred_labels[ydx]:
        spans.append(
            Span(doc, int(sp['start_idx']), int(sp['end_idx'] + 1),
                 label=sp['type']))
    doc.ents = spans
    colors = {"PER": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
              "LOC": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    options = {"ents": ["PER", "LOC"], "colors": colors}
    displacy.serve(doc, style="ent", options=options)
def fast_grab():
    global proxy_count
    proxies = []
    sites = read_list('sites.txt')
    for site in sites:
        print(f'[i]Site:{site}')
        try:
            proxies_from_site = parse_proxies(requests.get(site).text)
            if len(proxies_from_site) != 0:
                proxies += proxies_from_site
                proxy_count += len(proxies_from_site)
                print(f'[+]Proxy from site:{str(len(proxies_from_site)).zfill(5)}, Total:{str(proxy_count).zfill(5)}')
            else:
                print(f'[-]No proxy from: {site}')
        except SilentException as e:
            print(f'[-]Dead Site: {site}', e)
    return proxies
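# parse_proxies is not shown above. A minimal regex-based sketch, assuming the
# proxies appear in the fetched page as plain "ip:port" strings; the project's
# real parser may be stricter or site-specific:
import re


def parse_proxies(page_text):
    # match e.g. "203.0.113.7:8080"; no validation of octet ranges is done here
    return re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5}', page_text)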
def __init__(self, **kwargs):
    super(SiameseDataLoader, self).__init__()
    self.input_args = kwargs
    self.image_size = kwargs.get('image_size')
    self.image_pairs_list = read_list(kwargs.get('data_list_path'))
    self.label_dir = kwargs.get('label_dir')
    self.rois_dir = kwargs.get('rois_dir')
    self.rois_siamese_dir = kwargs.get('rois_siamese_dir')
    self.multi_thread = kwargs.get('multi_thread', True)
    self.n_thread = kwargs.get('n_thread', 7)
    self.stop_word = kwargs.get('stop_word', '==STOP--')
    self.batch_size = kwargs.pop('batch_size', 10)
    self.mode = kwargs.pop('mode', 'train')
    self.data_num = len(self.image_pairs_list)
    self.current = 0
    self.worker_proc = None
    self._get_next(True)
    if self.multi_thread:
        self.stop_flag = mp.Value('b', False)
        self.result_queue = mp.Queue(maxsize=self.batch_size * 30)
        self.data_queue = mp.Queue()
def upload_photos_to_album(album_id, photos):
    print('Upload photos to album %s' % album_id)
    if os.path.isfile(photos):
        files = [os.path.basename(photos)]
        output = os.path.dirname(photos)
    else:
        files = os.listdir(photos)
        output = photos
    done_file = os.path.join(output, '%s_done.txt' % album_id)
    finished = read_list(done_file)
    error_count = 0
    for f in files:
        image = os.path.join(output, f)
        _, ext = os.path.splitext(f)
        if not ext or ext.lower() not in ['.jpg', '.png', '.gif']:
            # print('Invalid %s' % image)
            continue
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album_id, image, f)
                finished.append(f)
                write_list(done_file, finished)
                time.sleep(random.randint(1, 3))
            else:
                print('Skip %s' % image)
        except KeyboardInterrupt:
            print("User interrupt, quit.")
            raise
        except Exception as e:
            print("Error:%s On uploading :%s" % (e, image))
            traceback.print_exc()
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)
__license__ = "GPL"

import tensorflow as tf

from utils import read_list
from sklearn.datasets.mldata import fetch_mldata

# Fetch the dataset
dataset = fetch_mldata("USPS")
print("Dataset USPS loaded...")
data = dataset.data
target = dataset.target - 1  # Labels between 0 and 9 to match digits
n_samples = data.shape[0]  # Number of samples in the dataset
n_clusters = 10  # Number of clusters to obtain

# Get the split between training/test set and validation set
test_indices = read_list("split/usps/test")
validation_indices = read_list("split/usps/validation")

# Auto-encoder architecture
input_size = data.shape[1]
hidden_1_size = 500
hidden_2_size = 500
hidden_3_size = 2000
embedding_size = n_clusters
dimensions = [
    hidden_1_size, hidden_2_size, hidden_3_size, embedding_size,  # Encoder layer dimensions
    hidden_3_size, hidden_2_size,
BIN_DIR = os.path.join(DATA_DIR, "bin")
OUTPUT_DIR = os.path.join(".", "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

np.random.seed(2020)

epoch = 1_000
batch_size = 8
train_split = 0.8
num_train = 13_580
num_test = 4_000
num_valid = 1_000

mos_list = utils.read_list(os.path.join(DATA_DIR, "mos_list.txt"))
# note: randint samples with replacement, so train_idx can contain duplicates
train_idx = np.random.randint(0, len(mos_list), int(train_split * len(mos_list)))
mos_list = np.array(mos_list)
train_list = mos_list[train_idx]
valid_list = np.delete(mos_list, train_idx)

train_data = utils.data_generator(
    train_list, BIN_DIR, frame=True, batch_size=batch_size
)
valid_data = utils.data_generator(
    valid_list, BIN_DIR, frame=True, batch_size=batch_size
)

MOSNet = model.CNN()
model = MOSNet.build()
def test(FLAG):
    print("Reading dataset...")
    # load data
    file_list = [
        FLAG.test_dir + file.replace('_sat.jpg', '')
        for file in os.listdir(FLAG.test_dir) if file.endswith('_sat.jpg')
    ]
    file_list.sort()
    Xtest = read_list(file_list, with_mask=False)

    vgg16 = VGG16(classes=7, shape=(256, 256, 3))
    vgg16.build(vgg16_npy_path=FLAG.init_from, mode=FLAG.mode)

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]
        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    with tf.Session() as sess:
        if FLAG.save_dir is not None:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(FLAG.save_dir)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Model restored %s" % ckpt.model_checkpoint_path)
                sess.run(tf.global_variables())
            else:
                print("No model checkpoint in %s" % FLAG.save_dir)
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.global_variables())
        print("Initialized")
        print("Plot saved in %s" % FLAG.plot_dir)
        for i, fname in enumerate(file_list):
            Xplot = sess.run(
                vgg16.pred,
                feed_dict={
                    vgg16.x: Xtest[i:(i + 1), :],
                    # vgg16.y: Ytest[i:(i+1),:],
                    vgg16.is_train: False
                })
            saveimg = skimage.transform.resize(Xplot[0],
                                               output_shape=(512, 512),
                                               order=0,
                                               preserve_range=True,
                                               clip=False)
            saveimg = label2rgb(saveimg)
            imageio.imsave(
                os.path.join(FLAG.plot_dir,
                             os.path.basename(fname) + "_mask.png"), saveimg)
            print(
                os.path.join(FLAG.plot_dir,
                             os.path.basename(fname) + "_mask.png"))
# set dir
DATA_DIR = './data'
BIN_DIR = os.path.join(DATA_DIR, 'bin')
PRE_TRAINED_DIR = './pre_trained'
OUTPUT_DIR = './output'

NUM_TRAIN = 13580
NUM_TEST = 4000
NUM_VALID = 3000

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

mos_list = utils.read_list(os.path.join(DATA_DIR, 'mos_list.txt'))
random.shuffle(mos_list)

train_list = mos_list[0:-(NUM_TEST + NUM_VALID)]
random.shuffle(train_list)
valid_list = mos_list[-(NUM_TEST + NUM_VALID):-NUM_TEST]
test_list = mos_list[-NUM_TEST:]

print('{} for training; {} for valid; {} for testing'.format(
    NUM_TRAIN, NUM_VALID, NUM_TEST))

# init model
MOSNet = model.CNN_BLSTM()
model = MOSNet.build()

# load pre-trained weights
model.load_weights(os.path.join(PRE_TRAINED_DIR, 'cnn_blstm.h5'))

# Load the best model
from preprocess import get_texts
from config import spacy, FoundationTrilogy, displacy
from utils import read_list
from spacy.tokens import Span
from validation import list_of_values

text = "When Hardin denied owning the Journal"
sentences = [(i, x) for i, x in enumerate(get_texts(FoundationTrilogy))
             if text in x][0]
index = sentences[0]
sentence = sentences[1]
predicted = [
    y for idx, y in enumerate(read_list('predicted')) if idx == index
][0]
print(sentence)

nlp = spacy.load("en_core_web_sm", disable=['ner'])
doc = nlp(sentence)
tags = list_of_values(predicted, doc)

dict_list = []
KEYS = ['start_idx', 'end_idx', 'text', 'type']
for elem in tags:
    dict_list.append(dict(zip(KEYS, elem)))

"""spans = []
for sp in tags:
    spans.append(Span(doc, int(sp[0]), int(sp[1]), label=sp[2]))"""

spans = []
for sp in dict_list:
    spans.append(
        Span(doc, int(sp['start_idx']), int(sp['end_idx']), label=sp['type']))
# set dir
DATA_DIR = './data_' + args.data
BIN_DIR = os.path.join(DATA_DIR, args.feats)
OUTPUT_DIR = ('./results_O_alpha/output_' + args.model + "_" +
              str(args.batch_size) + "_" + args.data + "_" + args.feats +
              "_" + str(alpha))
results_file = OUTPUT_DIR + "/results.pkl"

EPOCHS = args.epoch
BATCH_SIZE = args.batch_size

if args.data == "VC":
    NUM_TRAIN = 13580
    NUM_TEST = 4000
    NUM_VALID = 3000
    mos_list = utils.read_list(os.path.join(DATA_DIR, 'mos_list.txt'))
    random.shuffle(mos_list)
    train_list = mos_list[0:-(NUM_TEST + NUM_VALID)]
    random.shuffle(train_list)
    valid_list = mos_list[-(NUM_TEST + NUM_VALID):-NUM_TEST]
    test_list = mos_list[-NUM_TEST:]

if args.data == "LA":
    train_list = utils.read_list(os.path.join(DATA_DIR, 'train_list.txt'))
    valid_list = utils.read_list(os.path.join(DATA_DIR, 'valid_list.txt'))
    test_list = utils.read_list(os.path.join(DATA_DIR, 'test_list.txt'))
    random.shuffle(train_list)
    random.shuffle(valid_list)
    random.shuffle(test_list)
    NUM_TRAIN = len(train_list)
    NUM_TEST = len(valid_list)
    NUM_VALID = len(test_list)
def parse(file_path):
    lines = read_list(file_path)
    lines = map(str.split, lines)
    files, labels = zip(*lines)
    # list() so callers get a reusable sequence rather than a one-shot
    # iterator under Python 3
    labels = list(map(int, labels))
    return files, labels
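# A quick usage sketch for parse, assuming each input line is "<path> <label>"
# and the simple line-based read_list sketched near the top of this file; the
# file name here is made up for the example:
import os
import tempfile

_demo = os.path.join(tempfile.gettempdir(), 'demo_list.txt')
with open(_demo, 'w') as f:
    f.write('images/a.jpg 3\nimages/b.jpg 7\n')
files, labels = parse(_demo)
print(files)   # ('images/a.jpg', 'images/b.jpg')
print(labels)  # [3, 7]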
def NER(sentence_list, STAGE=True, VALIDATE=False):
    """
    :param VALIDATE: if VALIDATE is set, the txt files are not overwritten
    :return: labeled sentence list
    """
    print("Start NER")
    if STAGE:
        main_characters_ = []
        locations_ = []
        unclassified = []
        unclassified_sent = []
        predicted = []
        for sent in sentence_list:
            predicted.append(['O' for x in range(len(sent))])
        for i in tqdm(range(len(sentence_list))):
            doc = sentence_list[i]
            ents = get_ents_from_doc(doc)
            ents = [
                x for x in ents
                if len(x[0]) > 2 and not x[0].islower() and not x[0].isupper()
            ]
            if any(ents):
                for name, num_list in ents:
                    if len(num_list) > 1:
                        num = num_list[1]
                        token = doc[num]
                    else:
                        num = num_list[0]
                        token = doc[num]
                    if ner_person(doc, token, num):
                        main_characters_.append(name)
                        if len(num_list) == 1:
                            predicted[i][num_list[0]] = 'B-PER'
                        else:
                            predicted[i][num_list[0]] = 'B-PER'
                            predicted[i][num_list[1]] = 'I-PER'
                    elif ner_location(doc, token):
                        locations_.append(name)
                        if len(num_list) == 1:
                            predicted[i][num_list[0]] = 'B-LOC'
                        else:
                            predicted[i][num_list[0]] = 'B-LOC'
                            predicted[i][num_list[1]] = 'I-LOC'
                    else:
                        unclassified.append([name, i])
                        unclassified_sent.append([name, i, num_list])
        location_list = list(set(locations_))
        people_list = list(set([x for x in main_characters_ if len(x) > 2]))
        # remove bad retrieved entities (filter over a copy instead of removing
        # from the list while iterating it, which would skip elements)
        people_list = [
            ent for ent in people_list
            if not any([x for x in punctuation_tokens if x in ent])
        ]
        unclassified = [
            [ent, i] for ent, i in unclassified
            if not any([x for x in punctuation_tokens if x in ent])
        ]
        # NER unclassified
        for tup in unclassified:
            if tup[0] in people_list:
                ner_unclassified_per(predicted, unclassified_sent, tup)
            if tup[0] in location_list:
                ner_unclassified_loc(predicted, unclassified_sent, tup)
        # write_list('predicted', predicted)
    else:
        predicted = read_list('predicted')
    return predicted
def run(l2_val, dr, n, batch_size, bn):
    # set dir
    DATA_DIR = './data_' + args.data
    BIN_DIR = os.path.join(DATA_DIR, args.feats)
    OUTPUT_DIR = ('./results_R3/output_' + args.model + "_" + str(batch_size) +
                  "_" + args.data + "_" + args.feats + "_" + str(l2_val) + "_" +
                  str(dr) + "_" + str(n) + "_" + str(bn))
    results_file = OUTPUT_DIR + "/results.pkl"

    EPOCHS = args.epoch
    BATCH_SIZE = batch_size

    if args.data == "VC":
        NUM_TRAIN = 13580
        NUM_TEST = 4000
        NUM_VALID = 3000
        mos_list = utils.read_list(os.path.join(DATA_DIR, 'mos_list.txt'))
        random.shuffle(mos_list)
        train_list = mos_list[0:-(NUM_TEST + NUM_VALID)]
        random.shuffle(train_list)
        valid_list = mos_list[-(NUM_TEST + NUM_VALID):-NUM_TEST]
        test_list = mos_list[-NUM_TEST:]
        train_data_feat, train_data_mos = utils.data_rep(train_list, BIN_DIR)
        valid_data_feat, valid_data_mos = utils.data_rep(valid_list, BIN_DIR)

    if args.data == "LA":
        test_list = utils.read_list(os.path.join(DATA_DIR, 'test_list.txt'))
        train_data_feat = np.load(DATA_DIR + '/' + args.feats + '_X_train.npy')
        train_data_mos = np.load(DATA_DIR + '/' + args.feats + '_y_train.npy')
        valid_data_feat = np.load(DATA_DIR + '/' + args.feats + '_X_valid.npy')
        valid_data_mos = np.load(DATA_DIR + '/' + args.feats + '_y_valid.npy')
        NUM_TRAIN = train_data_feat.shape[0]
        NUM_TEST = valid_data_feat.shape[0]
        NUM_VALID = len(test_list)

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    print('{} for training; {} for valid; {} for testing'.format(
        NUM_TRAIN, NUM_TEST, NUM_VALID))

    # CNN-LDA has 100, and CNN-PCA has 512 ??
    rep_dims = {'DS-image': 4096, 'CNN': 100, 'xvec_0': 512, 'xvec_1': 512,
                'xvec_2': 512, 'xvec_3': 512, 'xvec_4': 512, 'xvec_5': 512}

    # init model
    if args.model == 'CNN':
        dim = rep_dims[args.feats]
        MOSNet = model_rep.CNN(dim, l2_val, dr, n, bn)
    # elif args.model == 'FFN':
    #     dim = rep_dims[args.feats]
    #     MOSNet = model_rep.FFN(dim, dr, n, bn)
    else:
        raise ValueError('please specify model to train with, CNN, FFN')
        sys.exit()

    model = MOSNet.build()

    model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=["mean_absolute_error"],
                  loss="mse")

    CALLBACKS = [
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(OUTPUT_DIR, 'mosnet.h5'),
            save_best_only=True,
            monitor='val_loss',
            verbose=1),
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            min_delta=0,
            patience=5,
            verbose=1)
    ]

    train_data_feat = np.expand_dims(train_data_feat, axis=3)
    valid_data_feat = np.expand_dims(valid_data_feat, axis=3)
    print(train_data_feat.shape)
    print(train_data_mos.shape)

    # start fitting model
    hist = model.fit(x=train_data_feat,
                     y=train_data_mos,
                     epochs=EPOCHS,
                     callbacks=CALLBACKS,
                     shuffle=True,
                     batch_size=BATCH_SIZE,
                     validation_data=(valid_data_feat, valid_data_mos),
                     verbose=1)

    # plot testing result
    model.load_weights(os.path.join(OUTPUT_DIR, 'mosnet.h5'))  # Load the best model
    print('testing...')
    MOS_Predict = np.zeros([len(test_list), ])
    MOS_true = np.zeros([len(test_list), ])
    df = pd.DataFrame(columns=['audio', 'true_mos', 'predict_mos',
                               'system_ID', 'speaker_ID'])

    for i in tqdm(range(len(test_list))):
        if args.data == "VC":
            filepath = test_list[i].split(',')
            filename = filepath[0].split('.')[0]
            sysid = ""
            speakerid = ""
            mos = float(filepath[1])
        elif args.data == "LA":
            filepath = test_list[i].split(',')
            filename = filepath[2].split('.')[0]
            sysid = filepath[1]
            speakerid = filepath[0]
            mos = float(filepath[3])

        _DS = utils.read_rep(os.path.join(BIN_DIR, filename + '.npy'))
        _DS = np.expand_dims(_DS, axis=3)

        Average_score = model.predict(_DS, verbose=0, batch_size=1)
        MOS_Predict[i] = Average_score
        MOS_true[i] = mos
        df = df.append({'audio': filepath[0],
                        'true_mos': MOS_true[i],
                        'predict_mos': MOS_Predict[i],
                        'system_ID': sysid,
                        'speaker_ID': speakerid},
                       ignore_index=True)

    df.to_pickle(results_file)

    plt.style.use('seaborn-deep')
    x = df['true_mos']
    y = df['predict_mos']
    bins = np.linspace(1, 5, 40)
    plt.figure(2)
    plt.hist([x, y], bins, label=['true_mos', 'predict_mos'])
    plt.legend(loc='upper right')
    plt.xlabel('MOS')
    plt.ylabel('number')
    plt.savefig('./' + OUTPUT_DIR + '/MOSNet_distribution.png', dpi=150)

    LCC = np.corrcoef(MOS_true, MOS_Predict)
    print('[UTTERANCE] Linear correlation coefficient= %f' % LCC[0][1])
    SRCC = scipy.stats.spearmanr(MOS_true.T, MOS_Predict.T)
    print('[UTTERANCE] Spearman rank correlation coefficient= %f' % SRCC[0])
    MSE = np.mean((MOS_true - MOS_Predict)**2)
    print('[UTTERANCE] Test error= %f' % MSE)

    # Plotting scatter plot
    M = np.max([np.max(MOS_Predict), 5])
    plt.figure(3)
    plt.scatter(MOS_true, MOS_Predict, s=15, color='b', marker='o',
                edgecolors='b', alpha=.20)
    plt.xlim([0.5, M])
    plt.ylim([0.5, M])
    plt.xlabel('True MOS')
    plt.ylabel('Predicted MOS')
    plt.title('Utterance-Level')
    plt.savefig('./' + OUTPUT_DIR + '/MOSNet_scatter_plot.png', dpi=150)

    if args.data == "VC":
        # load vcc2018_system
        sys_df = pd.read_csv(os.path.join(DATA_DIR, 'vcc2018_system.csv'))
        df['system_ID'] = (df['audio'].str.split('_').str[-1].str.split('.').str[0]
                           + '_' + df['audio'].str.split('_').str[0])
    elif args.data == "LA":
        # load LA 2019 system
        sys_df = pd.read_csv(os.path.join(DATA_DIR, 'LA_mos_system.csv'))

    sys_result_mean = df[['system_ID', 'predict_mos']].groupby(['system_ID']).mean()
    sys_mer_df = pd.merge(sys_result_mean, sys_df, on='system_ID')
    sys_true = sys_mer_df['mean']
    sys_predicted = sys_mer_df['predict_mos']
    print(sys_true)
    print(sys_predicted)
    print(sys_true.shape)
    print(sys_predicted.shape)

    LCC = np.corrcoef(sys_true, sys_predicted)
    print('[SYSTEM] Linear correlation coefficient= %f' % LCC[0][1])
    SRCC = scipy.stats.spearmanr(sys_true.T, sys_predicted.T)
    print('[SYSTEM] Spearman rank correlation coefficient= %f' % SRCC[0])
    MSE = np.mean((sys_true - sys_predicted)**2)
    print('[SYSTEM] Test error= %f' % MSE)

    # Plotting scatter plot
    M = np.max([np.max(sys_predicted), 5])
    # m = np.max([np.min(sys_predicted) - 1, 0.5])
    plt.figure(4)
    plt.scatter(sys_true, sys_predicted, s=25, color='b', marker='o',
                edgecolors='b')
    plt.xlim([1, M])
    plt.ylim([1, M])
    plt.xlabel('True MOS')
    plt.ylabel('Predicted MOS')
    plt.title('System-Level')
    # # add system id
    # for i in range(len(sys_mer_df)):
    #     sys_ID = mer_df['system_ID'][i]
    #     x = mer_df['mean'][i]
    #     y = mer_df['predict_mos'][i]
    #     plt.text(x - 0.05, y + 0.1, sys_ID, fontsize=8)
    plt.savefig('./' + OUTPUT_DIR + '/MOSNet_system_scatter_plot.png', dpi=150)

    if args.data == "LA":
        spk_df = pd.read_csv(os.path.join(DATA_DIR, 'LA_mos_speaker.csv'))
        spk_result_mean = df[['speaker_ID', 'predict_mos']].groupby(['speaker_ID']).mean()
        spk_mer_df = pd.merge(spk_result_mean, spk_df, on='speaker_ID')
        spk_true = spk_mer_df['mean']
        spk_predicted = spk_mer_df['predict_mos']

        LCC = np.corrcoef(spk_true, spk_predicted)
        print('[SPEAKER] Linear correlation coefficient= %f' % LCC[0][1])
        SRCC = scipy.stats.spearmanr(spk_true.T, spk_predicted.T)
        print('[SPEAKER] Spearman rank correlation coefficient= %f' % SRCC[0])
        MSE = np.mean((spk_true - spk_predicted)**2)
        print('[SPEAKER] Test error= %f' % MSE)

        # Plotting scatter plot
        M = np.max([np.max(spk_predicted), 5])
        # m = np.max([np.min(spk_predicted) - 1, 0.5])
        plt.figure(4)
        plt.scatter(spk_true, spk_predicted, s=25, color='b', marker='o',
                    edgecolors='b')
        plt.xlim([1, M])
        plt.ylim([1, M])
        plt.xlabel('True MOS')
        plt.ylabel('Predicted MOS')
        plt.title('Speaker-Level')
        # # add system id
        # for i in range(len(spk_mer_df)):
        #     spk_ID = mer_df['speaker_ID'][i]
        #     x = mer_df['mean'][i]
        #     y = mer_df['predict_mos'][i]
        #     plt.text(x - 0.05, y + 0.1, spk_ID, fontsize=8)
        plt.savefig('./' + OUTPUT_DIR + '/MOSNet_speaker_scatter_plot.png', dpi=150)
], "dev": ["dev-clean", "dev-other"], } num_wordpieces = 5000 nbest = 10 prefix = "librispeech-train-all-unigram-{}".format(num_wordpieces) prefix = os.path.join(args.dst, prefix) textfile = os.path.join(args.dst, "train-all.text") model = prefix + ".model" vocab = prefix + ".vocab" # prepare data sys.stdout.write("preparing data...\n") sys.stdout.flush() train_text = utils.read_list(args.src, filelists["train"]) dev_text = utils.read_list(args.src, filelists["dev"]) with open(textfile, "w") as f: for line in train_text: f.write(line) f.write("\n") word_dict = set() for line in train_text + dev_text: words = line.split() for w in words: word_dict.add(w) word_dict = sorted(word_dict) # train
def train(FLAG):
    print("Reading dataset...")
    # load data
    Xtrain, Ytrain = read_images(TRAIN_DIR), read_masks(TRAIN_DIR, onehot=True)
    Xtest, Ytest = read_images(VAL_DIR), read_masks(VAL_DIR, onehot=True)
    track = [
        "hw3-train-validation/validation/0008",
        "hw3-train-validation/validation/0097",
        "hw3-train-validation/validation/0107"
    ]
    Xtrack, Ytrack = read_list(track)

    vgg16 = VGG16(classes=7, shape=(256, 256, 3))
    vgg16.build(vgg16_npy_path=FLAG.init_from,
                mode=FLAG.mode,
                keep_prob=FLAG.keep_prob)

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
    checkpoint_path = os.path.join(FLAG.save_dir, 'model.ckpt')

    def initialize_uninitialized(sess):
        global_vars = tf.global_variables()
        is_not_initialized = sess.run(
            [tf.is_variable_initialized(var) for var in global_vars])
        not_initialized_vars = [
            v for (v, f) in zip(global_vars, is_not_initialized) if not f
        ]
        if len(not_initialized_vars):
            sess.run(tf.variables_initializer(not_initialized_vars))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # hyper parameters
        batch_size = 32
        epoch = 500
        early_stop_patience = 50
        min_delta = 0.0001
        opt_type = 'adam'

        # recorder
        epoch_counter = 0

        # optimizer
        global_step = tf.Variable(0, trainable=False)

        # Passing global_step to minimize() will increment it at each step.
        # (use == for string comparison, not `is`)
        if opt_type == 'sgd':
            start_learning_rate = FLAG.lr
            half_cycle = 2000
            learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                       global_step,
                                                       half_cycle,
                                                       0.5,
                                                       staircase=True)
            opt = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                             momentum=0.9,
                                             use_nesterov=True)
        else:
            start_learning_rate = FLAG.lr
            half_cycle = 2000
            learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                       global_step,
                                                       half_cycle,
                                                       0.5,
                                                       staircase=True)
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

        obj = vgg16.loss
        train_op = opt.minimize(obj, global_step=global_step)

        # progress bar
        ptrain = IntProgress()
        pval = IntProgress()
        display(ptrain)
        display(pval)
        ptrain.max = int(Xtrain.shape[0] / batch_size)
        pval.max = int(Xtest.shape[0] / batch_size)

        # re-initialize
        initialize_uninitialized(sess)

        # reset due to adding a new task
        patience_counter = 0
        current_best_val_loss = np.float('Inf')

        # optimize when the aggregated obj
        while (patience_counter < early_stop_patience and epoch_counter < epoch):

            # start training
            stime = time.time()
            bar_train = Bar(
                'Training',
                max=int(Xtrain.shape[0] / batch_size),
                suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds')
            bar_val = Bar(
                'Validation',
                max=int(Xtest.shape[0] / batch_size),
                suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds')
            train_loss, train_accu = 0.0, 0.0
            for i in range(int(Xtrain.shape[0] / batch_size)):
                st = i * batch_size
                ed = (i + 1) * batch_size
                loss, accu, _ = sess.run(
                    [obj, vgg16.accuracy, train_op],
                    feed_dict={
                        vgg16.x: Xtrain[st:ed, :],
                        vgg16.y: Ytrain[st:ed, :],
                        vgg16.is_train: True
                    })
                train_loss += loss
                train_accu += accu
                ptrain.value += 1
                ptrain.description = "Training %s/%s" % (ptrain.value, ptrain.max)
            train_loss = train_loss / ptrain.value
            train_accu = train_accu / ptrain.value

            # validation
            val_loss = 0
            val_accu = 0
            for i in range(int(Xtest.shape[0] / batch_size)):
                st = i * batch_size
                ed = (i + 1) * batch_size
                loss, accu = sess.run(
                    [obj, vgg16.accuracy],
                    feed_dict={
                        vgg16.x: Xtest[st:ed, :],
                        vgg16.y: Ytest[st:ed, :],
                        vgg16.is_train: False
                    })
                val_loss += loss
                val_accu += accu
                pval.value += 1
                pval.description = "Testing %s/%s" % (pval.value, pval.max)
            val_loss = val_loss / pval.value
            val_accu = val_accu / pval.value

            # plot
            if epoch_counter % 10 == 0:
                Xplot = sess.run(vgg16.pred,
                                 feed_dict={
                                     vgg16.x: Xtrack[:, :],
                                     vgg16.y: Ytrack[:, :],
                                     vgg16.is_train: False
                                 })
                for i, fname in enumerate(track):
                    saveimg = skimage.transform.resize(Xplot[i],
                                                       output_shape=(512, 512),
                                                       order=0,
                                                       preserve_range=True,
                                                       clip=False)
                    saveimg = label2rgb(saveimg)
                    imageio.imwrite(
                        os.path.join(
                            FLAG.save_dir,
                            os.path.basename(fname) + "_pred_" +
                            str(epoch_counter) + ".png"), saveimg)
                    print(
                        os.path.join(
                            FLAG.save_dir,
                            os.path.basename(fname) + "_pred_" +
                            str(epoch_counter) + ".png"))

            # early stopping check
            if (current_best_val_loss - val_loss) > min_delta:
                current_best_val_loss = val_loss
                patience_counter = 0
                saver.save(sess, checkpoint_path, global_step=epoch_counter)
                print("save in %s" % checkpoint_path)
            else:
                patience_counter += 1

            # shuffle Xtrain and Ytrain in the next epoch
            idx = np.random.permutation(Xtrain.shape[0])
            Xtrain, Ytrain = Xtrain[idx, :, :, :], Ytrain[idx, :]

            # epoch end
            epoch_counter += 1
            ptrain.value = 0
            pval.value = 0
            bar_train.finish()
            bar_val.finish()

            print(
                "Epoch %s (%s), %s sec >> train loss: %.4f, train accu: %.4f, "
                "val loss: %.4f, val accu: %.4f" %
                (epoch_counter, patience_counter,
                 round(time.time() - stime, 2),
                 train_loss, train_accu, val_loss, val_accu))
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    # [stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:
        # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
data = processed_data
target = np.delete(target, id_to_delete, axis=0)

#####

keywords = read_list("constraints/reuters/keywords_freq_auto_5", "str")
keywords = [keywords[i].split(" ") for i in range(len(keywords))]  # Otherwise don't stem

nr_kw_perclass = 3
kw = np.array(list(itertools.chain(*keywords)))

counter = np.zeros((len(data), len(kw)))
for i in range(len(data)):
    for k in range(len(data[i])):
        for j in range(len(kw)):
            if kw[j] == data[i][k]:
                counter[i][j] = counter[i][j] + 1

print(len(data))

window = 50
model_path = "models/reuters_w2v_window" + str(window) + ".model"
if isfile(model_path):
    # Load if the word2vec model exists
    print("Loading an existing word2vec model trained on the dataset...")
    w2v = Word2Vec.load(model_path)
https://plotly.com/python/reference/#scatter-texttemplate
https://plotly.com/python/text-and-annotations/
"""
from utils import read_list
import plotly.graph_objects as go

GALAXY = [[425, 400, "Rossem"], [1100, 700, "Haven"], [770, 490, "Neotrantor"],
          [1300, 600, "Askone"], [500, 500, "Tazenda"], [430, 200, "Arcturus"],
          [170, 610, "Kalgan"], [160, 650, "Terminus"], [210, 690, "Anacreon"],
          [1200, 680, "Synnax"], [900, 650, "Radole"], [700, 500, "Trantor"]]

planet_names = [x[2] for x in GALAXY]
planets_x = [x[0] for x in GALAXY]
planets_y = [x[1] for x in GALAXY]

test = ["Tests<br>" + str(i) for i in range(12)]

location_links = read_list("location_links")
hover = [[] for x in range(len(planet_names))]
for link in location_links:
    person = link[1]
    idx = [i for i, x in enumerate(planet_names) if x == link[0]]
    # non-empty check; any(idx) would wrongly skip matches at index 0
    if idx:
        hover[idx[0]].append(person)

PEOPLE = [
    "<b>Linked PER entities:</b><br>- " + "<br>- ".join(y)
    for y in [list(set(x)) for x in hover]
]

# Create figure
fig = go.Figure()
from doubanapi import ApiClient
from utils import read_list, write_list
from config import API_KEY, API_SECRET, USERNAME, PASSWORD

if __name__ == '__main__':
    print(sys.argv)
    if len(sys.argv) < 3:
        print('Usage: python %s album_id dir' % sys.argv[0])
        exit(1)
    api = ApiClient(key=API_KEY, secret=API_SECRET)
    print(api.login(USERNAME, PASSWORD))
    album = sys.argv[1]
    directory = sys.argv[2]
    files = os.listdir(directory)
    finished = read_list('%s.txt' % album)
    error_count = 0
    for f in files:
        image = os.path.join(directory, f)
        try:
            if f not in finished:
                print('Uploading %s' % image)
                api.photo_upload(album, image, f)
                finished.append(image)
                time.sleep(2)
        except Exception as e:
            print("error:%s on uploading :%s" % (e, image))
            error_count += 1
            if error_count > 5:
                break
            time.sleep(error_count * 10)