def sensegram(context, word_list, context_window_size):
    # Score every word of the context with the SenseGram disambiguator and collect
    # one row of 10 sense scores per word. Assumes sv (sense vectors), wv (word
    # vectors), np, pd, WSD and filter_scores are available in the surrounding module.
    sentence_score_array = np.array([])
    for word in word_list:
        wsd_model = WSD(sv, wv, window=context_window_size, lang='en', ignore_case=True)
        word_scores_list = wsd_model.disambiguate(context, word)[1]
        word_scores_list = sorted(word_scores_list, reverse=True)
        word_scores_arr = np.asarray(word_scores_list)
        word_scores_arr = filter_scores(word_scores_arr)
        sentence_score_array = np.concatenate((sentence_score_array, word_scores_arr), axis=0)
    sentence_score_array = np.reshape(sentence_score_array, (len(word_list), 10))
    sentence_df = pd.DataFrame()
    # note: DataFrame.append was removed in pandas 2.0; pd.concat is the modern equivalent
    sentence_df = sentence_df.append({'score': sentence_score_array}, ignore_index=True)
    return sentence_df
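# Minimal usage sketch for sensegram() above. It assumes sv and wv were already loaded
# at module level (e.g. with sensegram.SenseGram.load_word2vec_format and
# KeyedVectors.load_word2vec_format, as in the loader snippet further down); the example
# sentence is illustrative only.
example_context = "the bank approved the loan"
example_words = example_context.split(' ')
scores_df = sensegram(example_context, example_words, context_window_size=10)
print(scores_df['score'][0].shape)  # (len(example_words), 10) matrix of sense scores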
def get_wsd(word, context):
    context_words_max = 5     # try 1, 2, 5, 10, 15 or 20; it may improve the results
    context_window_size = 10  # this parameter can also be changed during experiments
    ignore_case = True
    method = 'sim'
    lang = "pt"  # used to filter out stopwords
    # Disambiguate a word in a context
    wsd_model = WSD(sv, wv, window=context_window_size, lang=lang,
                    max_context_words=context_words_max, ignore_case=ignore_case,
                    method=method, verbose=False)
    return wsd_model.disambiguate(context, word)
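# Usage sketch for get_wsd(); it assumes sv and wv hold Portuguese sense/word vectors
# loaded beforehand. The sentence is illustrative only ("o banco aprovou o empréstimo" -
# "the bank approved the loan").
result = get_wsd("banco", "o banco aprovou o empréstimo do cliente")
print(result)  # in the other snippets the second element of this result is the score list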
def main():
    params = {}
    params["home"] = (env["HOME"], )
    params["corpus-path"] = ("{}/Desktop/corpus".format(env["HOME"]), )
    params["agent"] = ("sfef", "sfer", "sfed", "sref", "srer", "sred", "sdef", "sder", "sded")
    params["recognition"] = ("on", )
    params["trial"] = list(range(0, 1))
    params["max-sentences"] = (5000, )
    # the assignments below override the full agent set above for this run
    params["agent"] = ("sfer", "sfed")
    params["max-sentences"] = (5000, )
    commands = (
        "source {home}/research/code/wsd/agent/{agent}.soar",
        "smem --set recognition {recognition}",
        "epmem --set recognition {recognition}",
        "w 0",
    )
    reporters = (
        soar_exp.branch_name,
        (lambda param_map, domain, agent: ("sentence-num", domain.cur_file_index)),
        (lambda param_map, domain, agent: ("path", "/".join(domain.path))),
        (lambda param_map, domain, agent: ("word", domain.path[-1])),
        (lambda param_map, domain, agent: ("response", domain.last_response)),
        (lambda param_map, domain, agent: ("answer", domain.last_answer)),
        (lambda param_map, domain, agent: ("source", domain.last_source)),
        (lambda param_map, domain, agent: ("epmem_recog", domain.last_epmem_recog)),
        (lambda param_map, domain, agent: ("smem_recog", domain.last_smem_recog)),
    )
    # run Soar
    kernel = soar_exp.create_kernel()
    for param_map in soar_exp.param_permutations(params):
        agent = soar_exp.create_agent(
            kernel, "agent-" + "-".join(str(val) for val in param_map.values()))
        wsd = WSD(kernel, agent, param_map)
        soar_exp.register_output_change_callback(
            kernel, agent, soar_exp.print_report_row,
            soar_exp.report_data_wrapper(
                param_map, wsd, reporters,
                (lambda param_map, domain, agent: domain.responded)))
        soar_exp.run_parameterized_commands(agent, param_map, commands)
        agent.ExecuteCommandLine("run")
        #soar_exp.cli(agent)
        kernel.DestroyAgent(agent)
    kernel.Shutdown()
    del kernel
def run(test_file, sense, context, output, wsd_method="sim", filter_ctx=2,
        lowercase=False, ignore_case=False):
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=False)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=False)
    wsd_model = WSD(vs, vc, method=wsd_method, filter_ctx=filter_ctx, ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file, encoding="utf-8", delimiter="\t",
                      dtype={'predict_related': object,
                             'gold_sense_ids': object,
                             'predict_sense_ids': object})
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")
    pb = pbar.Pbar(rows_count, 100)

    uncovered_words = []  # target words for which the sense model has zero senses

    print("Start prediction over " + test_file)
    pb.start()
    for i, row in reader.iterrows():
        # Form of prediction: (sense, sense_scores)
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            reader.set_value(i, 'predict_sense_ids', sense.split("#")[1])
            #neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            #neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            #reader.set_value(i, 'predict_related', ",".join(neighbours))
        else:
            uncovered_words.append(row.target)
            continue
        pb.update(i)
    pb.finish()

    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8", index=False, quoting=QUOTE_NONE)
    print("Saved predictions to " + output)
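# Hypothetical invocation of run() above; every path is a placeholder. The test file is a
# tab-separated table with at least context, target, target_position, gold_sense_ids and
# predict_sense_ids columns, as read by the function.
run("data/wsd_test.tsv",
    sense="model/sense_vectors.vec",
    context="model/word_vectors.vec",
    output="data/wsd_test.predictions.tsv",
    wsd_method="sim", filter_ctx=2, lowercase=True)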
def main():
    params = {}
    params["home"] = (env["HOME"], )
    params["corpus-path"] = ("{}/Desktop/corpus".format(env["HOME"]), )
    params["agent"] = ("sded", )
    params["recognition"] = ("on", "off")
    params["recognition"] = ("on", )  # overrides the line above for this run
    params["trial"] = list(range(0, 10))
    params["max-sentences"] = (100, )
    commands = (
        "source {home}/research/code/wsd/agent/{agent}.soar",
        "smem --set recognition {recognition}",
        "epmem --set recognition {recognition}",
        "w 0",
    )
    reporters = (
        soar_exp.branch_name,
        soar_exp.avg_decision_time,
        soar_exp.max_decision_time,
    )
    # run Soar
    kernel = soar_exp.create_kernel()
    for param_map in soar_exp.param_permutations(params):
        agent = soar_exp.create_agent(
            kernel, "agent-" + "-".join(str(val) for val in param_map.values()))
        wsd = WSD(kernel, agent, param_map)
        soar_exp.register_destruction_callback(
            kernel, agent, soar_exp.print_report_row,
            soar_exp.report_data_wrapper(param_map, wsd, reporters))
        soar_exp.run_parameterized_commands(agent, param_map, commands)
        agent.ExecuteCommandLine("run")
        #soar_exp.cli(agent)
        kernel.DestroyAgent(agent)
        del agent
    kernel.Shutdown()
    del kernel
def filter_scores(array):
    # Wrapper assumed from context (filter_scores is called in the snippets above):
    # pads a score vector with -1 until it has exactly 10 entries.
    while array.size < 10:  # too few scores, pad with -1
        array = np.append(array, np.array([-1]))
    return array


ignore_case = True
lang = "en"  # used to filter out stopwords

# Assumes orig_data_df (the labelled corpus), sensegram_df (an output DataFrame),
# sv, wv and context_window_size are defined earlier in the script.
for i in range(2510):
    sentence = orig_data_df['text'][i]
    detection = orig_data_df['detection'][i]
    word_list = sentence.split(' ')
    sentence_score_array = np.array([])
    for word in word_list:
        wsd_model = WSD(sv, wv, window=context_window_size, lang=lang, ignore_case=ignore_case)
        word_scores_list = wsd_model.disambiguate(sentence, word)[1]
        #print(word_scores_list)
        word_scores_list = sorted(word_scores_list, reverse=True)
        word_scores_arr = np.asarray(word_scores_list)
        word_scores_arr = filter_scores(word_scores_arr)
        sentence_score_array = np.concatenate(
            (sentence_score_array, word_scores_arr), axis=0)
    sentence_score_array = np.reshape(sentence_score_array, (len(word_list), 10))
    sensegram_df = sensegram_df.append(
        {
            'score': sentence_score_array,
            'detection': detection
        }, ignore_index=True)
# Preprocess the input context, encode it with the torchtext vocabulary and score it
# with the RNN classifier. Assumes context, tokenizer, TEXT, sv, wv, context_window_size,
# evaluate_task1 and the sensegram() helper are defined earlier in the script.
ans = context.replace(',', '')
ans = ans.replace('.', '')
if len(ans.split(sep=" ")) < 4:
    print('sentence too short')
    ans = ''
ans = tokenizer(ans)
ans = np.array(ans)
sentence = ans
inputs = []
for i in range(len(ans)):
    inputs.append(TEXT.vocab.stoi[str(ans[i])])
length = len(ans)
inputs = np.array(inputs)
inputs = np.reshape(inputs, (length, 1))
wsd_model = WSD(sv, wv, window=context_window_size, lang='en', ignore_case=True)
word_list = context.split(' ')
sentence_df = sensegram(context, word_list, context_window_size)
list = ['rnn', 'sensegram_rnn']  # note: shadows the built-in list
j = 0
prob = 0.0

# SUBTASK 1 with only RNN
model4 = torch.load('model_rnn.pt')
predictions4, label4 = evaluate_task1(model4, torch.from_numpy(inputs), length)
predictions4 = predictions4.detach().numpy()
print('RNN model thinks the sentence is {}, with {}% confidence'.format(
    label4, str(100 * round(float(predictions4), 3))))
from wsd import WSD
import os

BABELNET_TOKEN = os.environ["BABELNET_TOKEN"]

WSD("wsd.json", BABELNET_TOKEN, "stop_words", "EXTENDED")
def __init__(self, config, work_queue, output_queue):
    # Initialize Disambiguator thread
    Thread.__init__(self)
    self.daemon = True

    # Setting up variables
    self._config = config
    self._work_queue = work_queue
    self._output_queue = output_queue
    self._workers = []

    # Shouldn't have jobs yet
    self.has_work = False

    ## Set up all the various models
    # spaCy
    log.warn('Loading spaCy model: \'en\'')
    start_time = time.time()
    self._spacy_model = spacy.load('en')
    seconds = time.time() - start_time
    log.warn('Completed in {:5.2f} seconds'.format(seconds))

    # SenseGram
    log.info('Initializing SenseGram')

    # Figure out all the config file information
    sense_vectors_fpath = self._config['MODEL']['SENSE_VECTOR_PATH']
    word_vectors_fpath = self._config['MODEL']['WORD_VECTOR_PATH']
    context_words_max = int(self._config['MODEL']['CONTEXT_WORDS_MAX'])
    context_window_size = int(self._config['MODEL']['CONTEXT_WINDOW_SIZE'])
    ignore_case = self._config['MODEL'].getboolean('IGNORE_CASE')
    lang = self._config['MODEL']['LANG']
    # Really this should just be False unless you're debugging
    verbose = self._config['MODEL'].getboolean('VERBOSE')

    ## Load model
    # Sense Vectors
    log.warn('Loading sense vectors...')
    start_time = time.time()
    sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath, binary=False)
    seconds = time.time() - start_time
    log.warn('Completed in {:5.2f} seconds'.format(seconds))

    # Word Vectors
    log.warn('Loading word vectors...')
    start_time = time.time()
    wv = KeyedVectors.load_word2vec_format(word_vectors_fpath, binary=False, unicode_errors="ignore")
    seconds = time.time() - start_time
    log.warn('Completed in {:5.2f} seconds'.format(seconds))

    self._sense_vectors = sv
    self.wsd_model = WSD(sv, wv, window=context_window_size, lang=lang,
                         max_context_words=context_words_max, ignore_case=ignore_case,
                         verbose=verbose)
    log.info('Model initialized successfully')
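# Sketch of the [MODEL] section consumed by the __init__ above; the keys come from the
# code, the values and file paths are placeholders. A config like this would typically
# be built with configparser and passed in as `config`.
import configparser

example_config = configparser.ConfigParser()
example_config.read_string("""
[MODEL]
SENSE_VECTOR_PATH = model/sense_vectors.vec
WORD_VECTOR_PATH = model/word_vectors.vec
CONTEXT_WORDS_MAX = 5
CONTEXT_WINDOW_SIZE = 10
IGNORE_CASE = yes
LANG = en
VERBOSE = no
""")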