def plot_budget():
    """Scatter-plot every purchase (time vs. price), one random color per category.

    Reads the combined purchase DataFrame via ``get_data_frames_combined``,
    assigns each spending category a random color, and shows a matplotlib
    scatter plot with a legend. Relies on module-level ``plot`` (matplotlib)
    and ``RandomColor``.
    """
    data_frame = get_data_frames_combined()
    print(data_frame.head(20))  # quick sanity peek at the loaded data
    categories = data_frame[3].unique()  # column 3 holds the category label
    color_generator = RandomColor()
    plot.subplot(121)
    for category in categories:
        # Rows belonging to this category only.
        fdf = data_frame[data_frame[3] == category]
        # Column 0 = timestamp (ms since epoch), column 2 = price (EUR).
        plot.scatter(fdf[0],
                     fdf[2],
                     color=color_generator.generate()[0],
                     marker='.',
                     label=category)
    plot.xlabel('Ajankohta ms alkaen 1.1.1970 (UNIX timestamp)')
    plot.ylabel('Hinta (EUR)')
    plot.legend(bbox_to_anchor=(1, 1), loc='upper left', ncol=1)
    plot.show()
def update_random_colors(results):
    """Attach a random hex color to each dict in *results*.

    results should be dict items, e.g. ``[{"item": 1}, ...]``. Each dict
    gains a ``"color"`` key; the (mutated) list is returned for chaining.
    """
    generator = RandomColor()
    for entry in results:
        entry.update({"color": generator.generate()[0]})
    return results
def plot_data(data):
    """Down-sample *data* per label, embed with t-SNE, and scatter-plot it.

    data: 2-D array whose last column is the class label and whose remaining
    columns are features (assumed from the slicing below -- TODO confirm
    with callers).
    """
    num_sample = 5000  # overall target number of points across all labels
    label = data[:, -1]
    feature = data[:, :-1]
    # Group row indices by label value.
    assignment = {}
    for i in range(len(feature)):
        if label[i] not in assignment:
            assignment[label[i]] = []
        assignment[label[i]].append(i)
    # down sample: keep at most num_sample / num_labels rows per label.
    old_assignment = assignment
    assignment = {}
    indicies = []
    for label in old_assignment:  # NOTE: rebinds the `label` array above
        last_length = len(indicies)
        indicies += np.random.choice(
            old_assignment[label],
            size=min(int(num_sample / len(old_assignment)),
                     len(old_assignment[label])),
            replace=False).tolist()
        # Positions of this label's rows inside the down-sampled feature set.
        assignment[label] = np.arange(last_length, len(indicies))
    feature = feature[indicies]
    print(feature.shape)
    print(len(indicies))
    print(len(np.unique(indicies)))  # should equal len(indicies): no repeats
    tsne = TSNE()
    x = tsne.fit_transform(feature)
    fig, ax = plt.subplots()
    # ax.plot(x[:, 0], x[:, 1], '*')
    r = RandomColor()
    colors = r.generate(count=len(assignment))
    for i, label in enumerate(assignment):
        ax.plot(x[assignment[label]][:, 0],
                x[assignment[label]][:, 1],
                '*',
                color=colors[i],
                label=label)
    plt.legend()
    plt.show()
def Domains(self):
    """Fetch NCBI 'Region' (domain) annotations for every input protein.

    Reads protein IDs from ``Input/ProteinInput`` (FASTA), fetches each
    GenBank record from Entrez (throttled by ``self.timer`` seconds between
    requests), and collects the domain annotations.

    Returns
    -------
    tuple
        ``(Complete_Domains, domains_dict_colors, Domains)`` where
        Complete_Domains is a list of one-entry dicts mapping accession ->
        [[domain_name, location], ...], domains_dict_colors maps each
        distinct domain name to a random hex color, and Domains is the list
        of distinct domain names.
    """
    seq_records = SeqIO.parse(os.path.join('Input', 'ProteinInput'), 'fasta')
    input_protein_list = []
    for record in seq_records:
        input_protein_list.append(record.id)
    Complete_Domains = []
    domainNameList = []
    for item in input_protein_list:
        sleep(self.timer)  # rate-limit requests to NCBI
        e_fetch = Entrez.efetch(db='protein',
                                id="%s" % item,
                                retmax=1000,
                                rettype='gb',
                                retmode='fasta')
        for seq_record in SeqIO.parse(e_fetch, 'gb'):
            domain_list = []
            accession_number = seq_record.id
            for i in range(len(seq_record.features)):
                if seq_record.features[i].type == 'Region':
                    # Location prints like "[start:end](strand)"; keep only
                    # the text between the square brackets.
                    domain_location = str(
                        seq_record.features[i].location).split(
                            '[')[1].split(']')[0]
                    domain_name = str(
                        seq_record.features[i].qualifiers['region_name']
                        [0])
                    domainNameList.append(domain_name)
                    domain_list.append([domain_name, domain_location])
            Complete_Domains.append(dict([(accession_number, domain_list)
                                          ]))
    rand_color = RandomColor()
    domains_dict_colors = {
        domain: rand_color.generate()[0]
        for domain in set(domainNameList)
    }
    Domains = [domain for domain in set(domainNameList)]
    return Complete_Domains, domains_dict_colors, Domains
def colored_pb_example(example):
    """Wrap each argument of a PropBank example in its own colored span.

    Mutates *example* in place: every arg's ``arg_text`` is wrapped in a
    colored ``<span>``, and each occurrence of the original text inside
    ``example_text`` is replaced by the same span. The palette is seeded by
    ``example_name`` so a given example always colors the same way.
    """
    originals = [arg['arg_text'] for arg in example['args']]
    palette = RandomColor(seed=example['example_name']).generate(
        count=len(example['args']), luminosity='light')
    spans = {
        text: f'<span style=background-color:{color}>{text}</span>'
        for text, color in zip(originals, palette)
    }
    for arg in example['args']:
        arg['arg_text'] = spans[arg['arg_text']]
    for text in originals:
        example['example_text'] = example['example_text'].replace(
            text, spans[text])
    return example
def formatted_def(frame, markup):
    """Colorize FrameNet definition *markup* for display.

    Assigns every frame element (FE) a deterministic random background color
    (seeded by the frame name), highlights ``<fen>``/``<fex>`` tags with that
    color, and bolds/uppercases ``<t>`` targets. Returns the rewritten
    markup string.
    """
    frame_elements = frame['elements']
    colors = RandomColor(seed=frame['name']).generate(
        count=len(frame_elements), luminosity='light')
    # Color lookup keyed by full FE name and, as a fallback, abbreviation.
    color_dict = {f['fe_name']: c for f, c in zip(frame_elements, colors)}
    color_dict_abbrev = {
        f['abbrev']: c
        for f, c in zip(frame_elements, colors)
    }
    pattern_fen = re.compile('<fen[^>]*>[^<]+</fen>')
    for tagset in set(re.findall(pattern_fen, markup)):
        fen = tagset[5:-6]  # text between '<fen>' and '</fen>'
        try:
            fen_color = color_dict[fen]
            markup = markup.replace(
                '<fen>' + fen,
                '<fen style="background-color:' + fen_color + ';">' + fen)
        except KeyError:
            try:
                fen_color = color_dict_abbrev[fen]
                markup = markup.replace(
                    '<fen>' + fen,
                    '<fen style="background-color:' + fen_color + ';">' + fen)
            except KeyError:
                # NOTE(review): `break` abandons ALL remaining <fen> tags on
                # the first unknown name -- `continue` may be intended.
                break
    pattern_t = re.compile('<t[^>]*>[^<]+</t>')
    for tagset in set(re.findall(pattern_t, markup)):
        # Each pass rewrites every remaining plain '<t>'; the loop over the
        # match set only bounds the number of passes.
        markup = markup.replace(
            '<t>', '<t style="font-weight:bold; text-transform:uppercase;">')
    pattern_ex = re.compile('<fex[^>]*>[^<]+</fex>')
    ex_tags = re.findall(pattern_ex, markup)
    # NOTE(review): '.+' is greedy; with several name="..." attributes in one
    # match this could over-capture -- assumed one tag per match, confirm.
    name_pattern = re.compile('<fex (name=".+")>.*</fex>')
    for ex_tag in ex_tags:
        fex_name_block = re.findall(name_pattern, ex_tag)[0]
        fex_name = fex_name_block.split('=')[1].strip('"')
        try:
            fex_color = color_dict[fex_name]
            markup = markup.replace(
                fex_name_block,
                'style="background-color:' + fex_color + ';"')
        except KeyError:
            try:
                fex_color = color_dict_abbrev[fex_name]
                markup = markup.replace(
                    fex_name_block,
                    'style="background-color:' + fex_color + ';"')
            except KeyError:
                break
    return markup
def getRandomColour() -> int:
    """Generate a random colour as a hexadecimal integer.

    Notes
    -------
    The ``randomcolor`` library produces a hex string such as ``"#a1b2c3"``;
    the leading ``#`` is stripped and the remainder parsed in base 16.

    Returns
    -------
    Int
        A random colour represented as a hexadecimal integer.
    """
    hex_string = RandomColor().generate()[0]
    return int(hex_string.lstrip('#'), 16)
def create_colors(genomes_dict, og_id):
    """Assign a random hex color to every orthologous group (OG).

    The OG id is the last field of each gene record in *genomes_dict*.
    The target *og_id* is forced to red, and genes without an OG ("")
    to white.
    """
    collected = [gene[-1] for genes in genomes_dict.values() for gene in genes]
    print(collected)
    colors = {og: RandomColor().generate()[0] for og in set(collected)}
    # set colors for non_OG genes and target OG_ID
    colors[og_id] = "red"
    if "" in colors:
        colors[""] = "white"
    return colors
def formatted_def(frame, markup, popover=False):
    """Colorize FrameNet definition *markup*, optionally adding FE popovers.

    Each frame element (FE) gets a deterministic random background color
    (seeded by the frame name); ``<fen>``/``<fex>`` tags are highlighted and
    ``<t>`` targets bolded/uppercased. When *popover* is true, every
    ``<fen>`` also gets a Bootstrap-style hover popover holding that
    element's own (recursively formatted) definition. Returns the rewritten
    markup string.
    """
    frame_elements = frame['elements']
    colors = RandomColor(seed=frame['name']).generate(
        count=len(frame_elements), luminosity='light')
    # Color lookup keyed by full FE name and, as a fallback, abbreviation.
    color_dict = {f['fe_name']: c for f, c in zip(frame_elements, colors)}
    color_dict_abbrev = {
        f['abbrev']: c
        for f, c in zip(frame_elements, colors)
    }
    if popover == True:
        # FE name -> formatted definition markup, used as popover content.
        element_dict = {
            element['fe_name']: formatted_def(frame, element['def_markup'])
            for element in frame['elements']
        }
    else:
        element_dict = None
    pattern_fen = re.compile('<fen[^>]*>[^<]+</fen>')
    for tagset in set(re.findall(pattern_fen, markup)):
        fen = tagset[5:-6]  # text between '<fen>' and '</fen>'
        try:
            fen_color = color_dict[fen]
            if element_dict:
                fen_def = element_dict.get(fen, 'No Entry Found')
                popover_id = 'fen_popover_' + fen
                popover_div_content = '<div id=' + popover_id + ' style="display:none;"><div class="popover-body">' + fen_def + '</div></div>'
                markup = markup.replace(
                    '<fen>' + fen, popover_div_content +
                    '<fen data-trigger="hover focus" data-toggle="popover" data-popover-content="#'
                    + popover_id + '" style="background-color:' + fen_color +
                    ';">' + fen)
            else:
                markup = markup.replace(
                    '<fen>' + fen,
                    '<fen style="background-color:' + fen_color + ';">' + fen)
        except KeyError:
            try:
                fen_color = color_dict_abbrev[fen]
                if element_dict:
                    # NOTE(review): `fen_def` is never (re)assigned on this
                    # abbreviation path -- it reuses whatever an earlier
                    # iteration left behind and raises NameError on the
                    # first pass. Probably needs an element_dict.get(...)
                    # lookup here too.
                    popover_id = 'fen_popover_' + fen
                    popover_div_content = '<div id=' + popover_id + ' style="display:none;"><div class="popover-body">' + fen_def + '</div></div>'
                    markup = markup.replace(
                        '<fen>' + fen, popover_div_content +
                        '<fen data-trigger="hover focus" data-toggle="popover" data-popover-content="#'
                        + popover_id + '" style="background-color:' +
                        fen_color + ';">' + fen)
                else:
                    markup = markup.replace(
                        '<fen>' + fen, '<fen style="background-color:' +
                        fen_color + ';">' + fen)
            except KeyError:
                # NOTE(review): stops processing ALL remaining <fen> tags on
                # the first unknown name -- `continue` may be intended.
                break
    pattern_t = re.compile('<t[^>]*>[^<]+</t>')
    for tagset in set(re.findall(pattern_t, markup)):
        # Each pass rewrites every remaining plain '<t>'; the loop over the
        # match set only bounds the number of passes.
        markup = markup.replace(
            '<t>', '<t style="font-weight:bold; text-transform:uppercase;">')
    pattern_ex = re.compile('<fex[^>]*>[^<]+</fex>')
    ex_tags = re.findall(pattern_ex, markup)
    # NOTE(review): '.+' is greedy; assumed one name="..." per match.
    name_pattern = re.compile('<fex (name=".+")>.*</fex>')
    for ex_tag in ex_tags:
        try:
            fex_name_block = re.findall(name_pattern, ex_tag)[0]
            fex_name = fex_name_block.split('=')[1].strip('"')
            try:
                fex_color = color_dict[fex_name]
                markup = markup.replace(
                    fex_name_block,
                    'style="background-color:' + fex_color + ';"')
            except KeyError:
                try:
                    fex_color = color_dict_abbrev[fex_name]
                    markup = markup.replace(
                        fex_name_block,
                        'style="background-color:' + fex_color + ';"')
                except KeyError:
                    break
        except IndexError:
            # <fex> without a name= attribute: leave it untouched.
            pass
    return markup
def get_palette(df):
    """Map each route_id in *df* to a random hex color.

    Used so the static map and the animation draw every route in the same
    (randomly chosen) color.
    """
    palette = {}
    for route in df['route_id'].unique():
        palette[route] = RandomColor().generate()[0]
    return palette
ap.add_argument('-l', '--luminosity', type=str, default=None, help=''' Controls the luminosity of the generated color. You can specify a string containing bright, light, or dark.''') # Seed # FIXME: doesnt work #ap.add_argument('-s','--seed', type = int, default=None, help=''' #An integer which when passed will cause randomColor to return #the same color each time.''') # Format ap.add_argument('-f', '--format', type=str, default=None, help=''' A string which specifies the format of the generated color. Possible values are rgb, rgbArray, hsl, hslArray and hex (default).''') # Parse input arguments. args = vars(ap.parse_args()) args = rmNone(args) # Generate n random colors. rand_color = RandomColor() print(rand_color.generate(**args))
import re
import cv2
import time
import threading
import numpy as np
from randomcolor import RandomColor
from skimage.measure import find_contours

# Shared generator and digit-extraction regex used by random_colors().
randomcolor = RandomColor()
color_regex = re.compile(r'(\d+)')


def random_colors(count):
    """Return *count* random colors as [r, g, b] integer triples."""
    # generate(format_='rgb') yields "rgb(r, g, b)" strings; pull the digits.
    return list(
        map(lambda x: list(map(int, color_regex.findall(x))),
            randomcolor.generate(count=count, format_='rgb')))


def apply_mask(image, mask, color=None, alpha=0.5):
    """Alpha-blend *color* into *image* wherever *mask* is truthy.

    Mutates and returns *image*. A random color is picked when none is
    given. Assumes a 3-channel (H x W x 3) image -- TODO confirm.
    """
    if not color:
        color = random_colors(1)[0]
    for i in range(3):
        image[:, :, i] = np.where(mask,
                                  image[:, :, i] * (1 - alpha) +
                                  alpha * color[i], image[:, :, i])
    return image


# NOTE(review): this definition is truncated in the current chunk.
def mask_image(image, boxes,
def main(variant):
    """Entry point: run one experiment or utility selected by *variant*.

    Known variants: HUMOR, HUMOR2, TASK2INFER, TESTINFER, NAM, TUNING,
    TUNINGSERVER, PLOT, MultiCNN, CNN, KBLSTM, NNLM, LINEAR, VOCAB,
    WORD2VEC, PCA, MEAN, COEF, ALBERT, MEDIAN; anything else is a no-op.
    Relies on many module-level helpers defined elsewhere in this project
    (load_data, feature classes, model factories, tf, np, json, go, ...).
    """
    HYPERBAND_MAX_EPOCHS = 50
    EXECUTION_PER_TRIAL = 2
    SEED = 13377331
    # ,'../data/FunLines/task-1/preproc/2_concat_train.bin'
    train_path, dev_path, test_path = [
        '../data/task-1/preproc/2_concat_train.bin'
    ], ['../data/task-1/preproc/2_concat_dev.bin'
        ], ['../data/task-1/preproc/2_concat_test.bin']
    if variant == 'HUMOR':
        # Train/evaluate the HUMOR model 10 times; report mean/STD RMSE.
        params = json.load(open("./lib/models/tuning/model.json", 'r'))
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        test_data.AddFeatures(features)
        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        # First 4 columns: hand-crafted features; the rest: NELL entities.
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:]
        # text = np.load('../data/task-1/train_replaced.npy', allow_pickle=True)
        # text = train_data.GetReplaced()
        # NOTE(review): both assignments above are commented out, so `text`
        # is undefined here -- this branch raises NameError as written.
        ins["ReplacedInput"] = text
        # text = np.load('../data/task-1/train_edit.npy', allow_pickle=True)
        # text = train_data.GetEdits()
        ins["ReplacementInput"] = text
        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:]
        # text = np.load('../data/task-1/dev_replaced.npy', allow_pickle=True)
        # text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text
        # text = np.load('../data/task-1/dev_edit.npy', allow_pickle=True)
        # text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text
        # Test data
        test_features, test_y = test_data.GetFeatureVectors(
        ), test_data.GetGrades()
        testIns = {"FeatureInput": test_features[:, :4]}
        i = 4
        testIns["EntityInput"] = test_features[:, i:]
        # text = np.load('../data/task-1/test_replaced.npy', allow_pickle=True)
        # text = test_data.GetReplaced()
        testIns["ReplacedInput"] = text
        # text = np.load('../data/task-1/test_edit.npy', allow_pickle=True)
        # text = test_data.GetEdits()
        testIns["ReplacementInput"] = text
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_HUMOR2_model(4, 25, 128, params["hyperparameters"])
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(
                mean_squared_error(test_y, round_numbers(preds),
                                   squared=False))
            print(score[i])
            # Free graph/session state between repeated runs.
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
    elif variant == 'HUMOR2':
        # Same protocol with the ALBERT-augmented feature layout.
        params = json.load(open("./lib/models/tuning/model2.json", 'r'))
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature, AlbertTokenizer
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        test_data.AddFeatures(features)
        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        # Layout: 4 features | 25 entity cols | 3 x 128 ALBERT input blocks.
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 25]
        i += 25
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]
        text = train_data.GetReplaced()
        ins["ReplacedInput"] = text
        text = train_data.GetEdits()
        ins["ReplacementInput"] = text
        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 25]
        i += 25
        devIns["input_word_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["segment_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["input_mask"] = dev_features[:, i:i + 128]
        text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text
        text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text
        # Test data
        test_features, test_y = test_data.GetFeatureVectors(
        ), test_data.GetGrades()
        testIns = {"FeatureInput": test_features[:, :4]}
        i = 4
        testIns["EntityInput"] = test_features[:, i:i + 25]
        i += 25
        testIns["input_word_ids"] = test_features[:, i:i + 128]
        i += 128
        testIns["segment_ids"] = test_features[:, i:i + 128]
        i += 128
        testIns["input_mask"] = test_features[:, i:i + 128]
        text = test_data.GetReplaced()
        testIns["ReplacedInput"] = text
        text = test_data.GetEdits()
        testIns["ReplacementInput"] = text
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_HUMOR2_model(4, 25, 128, params["hyperparameters"])
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=25,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
    elif variant == 'TASK2INFER':
        # Run task-2 inference with a pre-trained checkpoint.
        model = './headline_regression/20200308-194029-BEST/weights/final.hdf5'
        infer = Task2Inference(model, '../data/task-2/preproc/2_concat_test.bin')
        infer.predict('../data/task-2/predictions/task-2-output.csv')
    elif variant == 'TESTINFER':
        # Score an existing prediction CSV (id,pred with a header row).
        preds = 'task-1-output.context.csv'
        test = load_data(test_path)
        y = test.GetGrades()
        with open(preds, 'r') as f:
            i = 0
            pred_list = []
            for line in f:
                if i == 0:
                    i = 1  # skip the header line
                else:
                    pred_list.append(float(line.strip().split(',')[1]))
        rmse = mean_squared_error(y, np.array(pred_list), squared=False)
        print(rmse)
    elif variant == 'NAM':
        # Train/test the NELL knowledge-base NAM model.
        model = create_NAM_model(1, 181544, 832)
        data_path = '../data/NELL/NELLRDF.xml'
        ent_vocab = '../data/NELL/NELLWordNetVocab.txt'
        rel_vocab = '../data/NELL/NELLRelVocab.txt'
        trainer = NAMTraining(model, data_path, ent_vocab, rel_vocab)
        trainer.train(30, 2048)
        trainer.test()
    elif variant == 'TUNING':
        # Hyperband hyperparameter search for the humor model.
        model = HumorTuner(4, 20)
        tuner = Hyperband(model,
                          max_epochs=HYPERBAND_MAX_EPOCHS,
                          objective=kerastuner.Objective(
                              "val_root_mean_squared_error", direction="min"),
                          seed=SEED,
                          executions_per_trial=EXECUTION_PER_TRIAL,
                          hyperband_iterations=2,
                          directory=f'tuning_hyperband',
                          project_name='ContextHumor')
        tuner.search_space_summary()
        ## Loading the data
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 20]
        ins["ReplacedInput"] = train_data.GetReplaced()
        ins["ReplacementInput"] = train_data.GetEdits()
        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 20]
        devIns["ReplacedInput"] = dev_data.GetReplaced()
        devIns["ReplacementInput"] = dev_data.GetEdits()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0005,
            patience=2,
            mode='min',
            restore_best_weights=True)
        tuner.oracle.hyperband_iterations = 2
        tuner.search(ins,
                     train_y,
                     epochs=HYPERBAND_MAX_EPOCHS,
                     batch_size=64,
                     validation_data=(devIns, dev_y),
                     callbacks=[early])
        tuner.results_summary()
    elif variant == 'TUNINGSERVER':
        # Second-stage Hyperband search (server config, ALBERT inputs).
        params = json.load(open("./lib/models/tuning/model.json", 'r'))
        model = HumorTunerServer(4, 20, 128, params["hyperparameters"])
        tuner = Hyperband(model,
                          max_epochs=HYPERBAND_MAX_EPOCHS,
                          objective=kerastuner.Objective(
                              "val_root_mean_squared_error", direction="min"),
                          seed=SEED,
                          executions_per_trial=EXECUTION_PER_TRIAL,
                          hyperband_iterations=1,
                          directory=f'tuning_hyperband',
                          project_name='ContextHumor')
        tuner.search_space_summary()
        ## Loading the data
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature,
            NellKbFeature, AlbertTokenizer
        ]
        train_data.AddFeatures(features)
        dev_data.AddFeatures(features)
        features, train_y = train_data.GetFeatureVectors(
        ), train_data.GetGrades()
        ins = {"FeatureInput": features[:, :4]}
        i = 4
        ins["EntityInput"] = features[:, i:i + 20]
        i += 20
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]
        text = train_data.GetReplaced()
        ins["ReplacedInput"] = text
        text = train_data.GetEdits()
        ins["ReplacementInput"] = text
        # Dev data
        dev_features, dev_y = dev_data.GetFeatureVectors(), dev_data.GetGrades(
        )
        devIns = {"FeatureInput": dev_features[:, :4]}
        i = 4
        devIns["EntityInput"] = dev_features[:, i:i + 20]
        i += 20
        devIns["input_word_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["segment_ids"] = dev_features[:, i:i + 128]
        i += 128
        devIns["input_mask"] = dev_features[:, i:i + 128]
        text = dev_data.GetReplaced()
        devIns["ReplacedInput"] = text
        text = dev_data.GetEdits()
        devIns["ReplacementInput"] = text
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0005,
            patience=2,
            mode='min',
            restore_best_weights=True)
        tuner.search(ins,
                     train_y,
                     epochs=HYPERBAND_MAX_EPOCHS,
                     batch_size=64,
                     validation_data=(devIns, dev_y),
                     callbacks=[early])
        tuner.results_summary()
    elif variant == 'PLOT':
        # Parallel-coordinates plot of the top-10 tuning trials.
        axes = [
            "feature_units1", "feature_units2", "entity_units1",
            "entity_units2", "sentence_units1", "sentence_units2",
            "sentence_units3"
        ]
        models = json.load(open("./lib/models/tuning/result_summary.json",
                                'r'))
        params = defaultdict(list)
        for model in models["top_10"]:
            t_id = model["TrialID"]
            model_param = json.load(
                open(f"./tuning_hyperband/HumorHumor/trial_{t_id}/trial.json",
                     "r"))
            for a in axes:
                params[a].append(model_param["hyperparameters"]["values"][a])
            params["score"].append(model["Score"])
        # NOTE(review): 'Word Layer 2' appears twice below and axes[5] is
        # labelled 'Word Layer 1' -- labels look swapped; confirm.
        fig = go.Figure(
            data=go.Parcoords(line_color='green',
                              dimensions=list([
                                  dict(range=[8, 128],
                                       label='Feature Layer 1',
                                       values=params[axes[0]]),
                                  dict(range=[8, 128],
                                       label='Feature Layer 2',
                                       values=params[axes[1]]),
                                  dict(range=[8, 128],
                                       label='Knowledge Layer 1',
                                       values=params[axes[2]]),
                                  dict(range=[8, 128],
                                       label='Knowledge Layer 2',
                                       values=params[axes[3]]),
                                  dict(range=[32, 512],
                                       label='Word Layer 2',
                                       values=params[axes[4]]),
                                  dict(range=[32, 512],
                                       label='Word Layer 1',
                                       values=params[axes[5]]),
                                  dict(range=[8, 128],
                                       label='Word Layer 2',
                                       values=params[axes[6]]),
                                  dict(range=[0, 1],
                                       label='Root Mean Square Error',
                                       values=params["score"]),
                              ])))
        fig.show()
    elif variant == 'MultiCNN':
        # 10-run evaluation of the multi-channel CNN baseline.
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)
        max_length = longest(train_data.GetTokenizedWEdit())
        ins = {
            "TextIn":
            convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                             max_length)
        }
        train_y = train_data.GetGrades()
        devIns = {
            "TextIn":
            convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                             max_length)
        }
        dev_y = dev_data.GetGrades()
        testIns = {
            "TextIn":
            convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                             max_length)
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        # NOTE(review): lr_schedule is built but never passed to fit().
        lr_schedule = create_learning_rate_scheduler(max_learn_rate=1e-1,
                                                     end_learn_rate=1e-6,
                                                     warmup_epoch_count=15,
                                                     total_epoch_count=40)
        score = []
        for i in range(10):
            model = create_MultiCNN_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")
        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model
    elif variant == 'CNN':
        # 10-run evaluation of the single-channel CNN baseline.
        # model = create_CNN_model()
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)
        max_length = longest(train_data.GetTokenizedWEdit())
        ins = {
            "TextIn":
            convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                             max_length)
        }
        train_y = train_data.GetGrades()
        devIns = {
            "TextIn":
            convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                             max_length)
        }
        dev_y = dev_data.GetGrades()
        testIns = {
            "TextIn":
            convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                             max_length)
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_CNN_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")
        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model
    elif variant == 'KBLSTM':
        # 10-run evaluation of the knowledge-base LSTM model.
        train_data = load_data(train_path)
        train_data.AddFeatures([NellKbFeature])
        dev_data = load_data(dev_path)
        dev_data.AddFeatures([NellKbFeature])
        test_data = load_data(test_path)
        test_data.AddFeatures([NellKbFeature])
        with codecs.open('../data/vocab/train_vocab.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)
        max_length = longest(train_data.GetTokenizedWEdit())
        train = convert_to_index(vocab_dict, train_data.GetTokenizedWEdit(),
                                 max_length)
        ins = {"TextIn": train, "EntityInput": train_data.GetFeatureVectors()}
        train_y = train_data.GetGrades()
        dev = convert_to_index(vocab_dict, dev_data.GetTokenizedWEdit(),
                               max_length)
        devIns = {"TextIn": dev, "EntityInput": dev_data.GetFeatureVectors()}
        dev_y = dev_data.GetGrades()
        test = convert_to_index(vocab_dict, test_data.GetTokenizedWEdit(),
                                max_length)
        testIns = {
            "TextIn": test,
            "EntityInput": test_data.GetFeatureVectors()
        }
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_KBLSTM_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")
        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # NOTE(review): `model` was already deleted inside the loop above, so
        # this statement raises NameError if reached -- likely leftover.
        del model
    elif variant == 'NNLM':
        # 10-run evaluation of the sentence-embedding baseline.
        # NOTE(review): despite the variant name this calls
        # create_BERT_model() -- confirm which factory is intended.
        train_data = load_data(train_path)
        dev_data = load_data(dev_path)
        test_data = load_data(test_path)
        ins = {"sentence_in": train_data.GetEditSentences()}
        devIns = {"sentence_in": dev_data.GetEditSentences()}
        testIns = {"sentence_in": test_data.GetEditSentences()}
        train_y = train_data.GetGrades()
        dev_y = dev_data.GetGrades()
        test_y = test_data.GetGrades()
        early = tf.keras.callbacks.EarlyStopping(
            monitor='val_root_mean_squared_error',
            min_delta=0.0001,
            patience=5,
            mode='min',
            restore_best_weights=True)
        score = []
        for i in range(10):
            model = create_BERT_model()
            model.fit(x=ins,
                      y=train_y,
                      validation_data=(devIns, dev_y),
                      batch_size=16,
                      epochs=40,
                      shuffle=True,
                      callbacks=[early])
            preds = model.predict(testIns)
            score.append(mean_squared_error(test_y, preds, squared=False))
            del model
            # tf.reset_default_graph()
            tf.keras.backend.clear_session()
            gc.collect()
        score = np.array(score)
        print(f'{variant}: Mean: {score.mean()}, STD: {score.std()}')
        # preds = model.predict(devIns)
        # ids = dev_data.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")
        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
        # bins = np.linspace(0, 3, 50)
        # plt.hist(preds, bins=bins, alpha=0.5, label="preds")
        # plt.hist(dev_y, bins=bins, alpha=0.5, label="true")
        # plt.legend(loc='upper right')
        # plt.show()
        # del model
    elif variant == 'LINEAR':
        # Linear-regression baseline on the hand-crafted features.
        train = load_data(train_path)
        dev = load_data(dev_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature
        ]
        train.AddFeatures(features)
        dev.AddFeatures(features)
        X, y = train.GetFeatureVectors(), train.GetGrades()
        X_dev, dev_y = dev.GetFeatureVectors(), dev.GetGrades()
        reg = LinearRegression(n_jobs=-1).fit(X, y)
        preds = reg.predict(X_dev)
        # NOTE(review): `test_y` is undefined in this branch (preds come
        # from dev data) -- raises NameError; `dev_y` looks intended. The
        # computed rmse is also never printed or returned.
        rmse = mean_squared_error(test_y, preds, squared=False)
        # ids = dev.GetIDs()
        # out = np.stack((ids, preds.flatten()), axis=-1)
        # Save the predictions to file
        # np.savetxt(f'../plots/{variant}.csv', out, header='id,pred', fmt="%d,%1.8f")
        # print(f'Mean of preds: {preds.mean()}, STD of preds: {preds.std()}, Mean of true: {dev_y.mean()}, STD of true: {dev_y.std()}')
    elif variant == 'VOCAB':
        # Build an embedding matrix for the vocab from trained Word2Vec.
        embed_path = '../data/embeddings/numpy/headline.npy'
        print("Loading embeddings and vocab...")
        model = Word2Vec.load('../data/embeddings/headlineEmbeds.bin')
        print("Loaded embeddings...")
        with codecs.open('../data/vocab/train_vocab.funlines.json',
                         encoding='utf-8') as fp:
            vocab_dict = json.load(fp)
        print("Loaded vocab...")
        embed_matrix = np.zeros((len(vocab_dict), 300))
        i = 0  # counts vocab words missing from the embedding model
        for k, v in vocab_dict.items():
            try:
                embed_matrix[v] = model.wv.get_vector(k)
            except KeyError:
                # print(f'{k} does not exist in FastText embeddings')
                i += 1
        print(len(vocab_dict), i)
        print("Created the embedding matrix...")
        np.save(embed_path, embed_matrix)
        print("Saved the new embeddings...")
    elif variant == 'WORD2VEC':
        # Train Word2Vec on sarcasm headlines + task-1 edited headlines.
        print("Loading data...")
        headline_paths = [
            '../data/extra_data_sarcasm/Sarcasm_Headlines_Dataset_v2.json'
        ]
        headlines = []
        for headline_path in headline_paths:
            with open(headline_path, 'r') as fp:
                for line in fp:
                    d = json.loads(line)
                    headlines.append(text_to_word_sequence(d["headline"]))
        train_data = load_data(train_path)
        print("Train model...")
        print(len(headlines))
        headlines.extend(train_data.GetTokenizedWEdit())
        print(len(headlines))
        model = Word2Vec(headlines,
                         size=300,
                         window=14,
                         workers=4,
                         min_count=1)
        vocab = list(model.wv.vocab)
        print(len(vocab))
        print("Saving model...")
        model.save('../data/embeddings/headlineEmbeds.bin')
    elif variant == 'PCA':
        # 3-D PCA scatter of the top-100 NELL entity embeddings.
        model = PCA(n_components=3)
        entities = np.load('../data/NELL/embeddings/entity.npy')
        labels = load_vocab('../data/NELL/NELLWordNetVocab_proc.txt')
        top_100 = {}
        with open('../data/NELL/top_100_nell.txt', 'r') as f:
            for line in f:
                label = line.strip()
                top_100[label] = entities[labels[label]]
        # print(entities[:4])
        # print(labels[:4])
        pca_ent = model.fit_transform(list(top_100.values()))
        # create_dendrogram(list(top_100.values()), list(top_100.keys()), 'ward')
        # print(pca_ent.shape)
        # print(pca_ent[:10])
        rand_color = RandomColor()
        fig = go.Figure(data=[
            go.Scatter3d(x=pca_ent[:, 0],
                         y=pca_ent[:, 1],
                         z=pca_ent[:, 2],
                         mode='markers',
                         text=list(top_100.keys()),
                         marker=dict(size=12,
                                     color=rand_color.generate(count=100),
                                     colorscale='Viridis',
                                     opacity=0.8))
        ])
        plotly.offline.plot(fig, filename="NELLPCA.html")
    elif variant == 'MEAN':
        # Report mean/STD of each saved prediction CSV (skipping headers).
        files = [
            'CNN.csv', 'context.csv', 'KBLSTM.csv', 'LINEAR.csv',
            'MultiCNN.csv', 'NNLM.csv', 'simple.csv'
        ]
        for f in files:
            with open(f'../plots/{f}', 'r') as fp:
                i = 0
                vals = []
                for line in fp:
                    if i == 0:
                        i += 1  # skip the header line
                        continue
                    vals.append(float(line.strip().split(',')[1]))
                vals = np.array(vals)
                mean, std = vals.mean(), vals.std()
                print(f'{f.split(".")[0]}: Mean: {mean}, STD: {std}')
    elif variant == 'COEF':
        # Correlation matrix of features + grade, saved to coef.csv.
        train = load_data(train_path)
        features = [
            PhoneticFeature, PositionFeature, DistanceFeature, SentLenFeature
        ]
        train.AddFeatures(features)
        X, y = train.GetFeatureVectors(), train.GetGrades()
        y = np.reshape(y, (-1, 1))
        print(y.shape)
        z = np.concatenate((X, y), axis=-1).T
        coef = np.corrcoef(z).round(decimals=4)
        np.savetxt("coef.csv", coef, delimiter=',')
    elif variant == 'ALBERT':
        # Extract per-example embeddings of the edited word position.
        model = create_BERT_model()
        train = load_data(train_path)
        dev = load_data(dev_path)
        test = load_data(test_path)
        features = [AlbertTokenizer]
        train.AddFeatures(features)
        dev.AddFeatures(features)
        test.AddFeatures(features)
        features, indexes = dev.GetFeatureVectors(), dev.GetIndexes()
        ins = {}
        i = 0
        ins["input_word_ids"] = features[:, i:i + 128]
        i += 128
        ins["segment_ids"] = features[:, i:i + 128]
        i += 128
        ins["input_mask"] = features[:, i:i + 128]
        preds = model.predict(ins)
        words_train = []
        for i, pred in enumerate(preds):
            # Keep only the embedding at the edited-word index.
            words_train.append(pred[indexes[i]])
        words_train = np.array(words_train)
        print(words_train.shape)
        np.save("./dev_edit.npy", words_train)
    elif variant == 'MEDIAN':
        # Constant-prediction baseline.
        # NOTE(review): np.mean is used but printed as "Median" -- one of
        # the two looks wrong; confirm which baseline is intended.
        train_data = load_data(train_path)
        test_data = load_data(test_path)
        train_y = train_data.GetGrades()
        test_y = test_data.GetGrades()
        pred = np.mean(train_y)
        print("Median", pred)
        pred_y = np.array([pred] * len(test_y))
        rmse = mean_squared_error(test_y, pred_y, squared=False)
        print("RMSE", rmse)