Example #1
def sensegram(context, word_list, context_window_size):
    # np, pd, WSD, sv, wv and filter_scores are assumed to be imported/defined
    # elsewhere in the script (see the other examples below).
    # The WSD model only depends on the window size, so build it once instead
    # of once per word.
    wsd_model = WSD(sv, wv, window=context_window_size, lang='en', ignore_case=True)
    sentence_score_array = np.array([])
    for word in word_list:
        word_scores_list = wsd_model.disambiguate(context, word)[1]
        word_scores_list = sorted(word_scores_list, reverse=True)
        word_scores_arr = np.asarray(word_scores_list)
        word_scores_arr = filter_scores(word_scores_arr)  # pad to exactly 10 scores
        sentence_score_array = np.concatenate((sentence_score_array, word_scores_arr), axis=0)
    sentence_score_array = np.reshape(sentence_score_array, (len(word_list), 10))
    sentence_df = pd.DataFrame()
    sentence_df = sentence_df.append({'score': sentence_score_array}, ignore_index=True)
    return sentence_df
def get_wsd(word, context):
    context_words_max = 5  # changing this parameter to 1, 2, 5, 10, 15 or 20 may improve the results
    context_window_size = 10  # this parameter can also be changed during experiments
    ignore_case = True
    method = 'sim'
    lang = "pt"  # used to filter out stopwords

    # Disambiguate a word in a context
    wsd_model = WSD(sv,
                    wv,
                    window=context_window_size,
                    lang=lang,
                    max_context_words=context_words_max,
                    ignore_case=ignore_case,
                    method=method,
                    verbose=False)
    return wsd_model.disambiguate(context, word)
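
A minimal usage sketch for the helpers above, assuming the sensegram package and gensim as in Example #9; the model paths, the target word and the context sentence are placeholders, not part of the original snippet:

import sensegram
from gensim.models import KeyedVectors

# Load sense and word vectors (placeholder paths); get_wsd() reads sv/wv as globals.
sv = sensegram.SenseGram.load_word2vec_format("model/senses.w2v", binary=False)
wv = KeyedVectors.load_word2vec_format("model/words.w2v", binary=False,
                                       unicode_errors="ignore")

# disambiguate() returns a tuple whose second element holds the per-sense scores,
# which is why Example #1 indexes the result with [1].
prediction = get_wsd("banco", "sentei no banco da praça para descansar")  # lang="pt" above
print(prediction)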
Example #3
def main():
    # `env` is assumed to be os.environ; `soar_exp` and `WSD` are assumed to be
    # imported at module level.
    params = {}
    params["home"] = (env["HOME"], )
    params["corpus-path"] = ("{}/Desktop/corpus".format(env["HOME"]), )
    params["agent"] = ("sfef", "sfer", "sfed", "sref", "srer", "sred", "sdef",
                       "sder", "sded")
    params["recognition"] = ("on", )
    params["trial"] = list(range(0, 1))
    params["max-sentences"] = (5000, )

    params["agent"] = ("sfer", "sfed")
    params["max-sentences"] = (5000, )

    commands = (
        "source {home}/research/code/wsd/agent/{agent}.soar",
        "smem --set recognition {recognition}",
        "epmem --set recognition {recognition}",
        "w 0",
    )
    reporters = (
        soar_exp.branch_name,
        (lambda param_map, domain, agent:
         ("sentence-num", domain.cur_file_index)),
        (lambda param_map, domain, agent: ("path", "/".join(domain.path))),
        (lambda param_map, domain, agent: ("word", domain.path[-1])),
        (lambda param_map, domain, agent: ("response", domain.last_response)),
        (lambda param_map, domain, agent: ("answer", domain.last_answer)),
        (lambda param_map, domain, agent: ("source", domain.last_source)),
        (lambda param_map, domain, agent:
         ("epmem_recog", domain.last_epmem_recog)),
        (lambda param_map, domain, agent:
         ("smem_recog", domain.last_smem_recog)),
    )
    # run Soar
    kernel = soar_exp.create_kernel()
    for param_map in soar_exp.param_permutations(params):
        agent = soar_exp.create_agent(
            kernel,
            "agent-" + "-".join(str(val) for val in param_map.values()))
        wsd = WSD(kernel, agent, param_map)
        soar_exp.register_output_change_callback(
            kernel, agent, soar_exp.print_report_row,
            soar_exp.report_data_wrapper(
                param_map, wsd, reporters,
                (lambda param_map, domain, agent: domain.responded)))
        soar_exp.run_parameterized_commands(agent, param_map, commands)
        agent.ExecuteCommandLine("run")
        #soar_exp.cli(agent)
        kernel.DestroyAgent(agent)
    kernel.Shutdown()
    del kernel
Example #4
def run(test_file, sense, context, output, wsd_method="sim", filter_ctx=2, lowercase=False, ignore_case=False):
    # read_csv (pandas), QUOTE_NONE (csv), SenseGram, word2vec, WSD and pbar are
    # assumed to be imported at module level.
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=False)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=False)
    wsd_model = WSD(vs, vc, method=wsd_method, filter_ctx=filter_ctx, ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file, encoding="utf-8", delimiter="\t", dtype={'predict_related': object, 'gold_sense_ids':object, 'predict_sense_ids':object})
    rows_count = reader.shape[0]
    print((str(rows_count) + " test instances"))
    pb = pbar.Pbar(rows_count, 100)
    

    uncovered_words = [] # target words for which sense model has zero senses

    print(("Start prediction over " + test_file))
    pb.start()
    for i, row in reader.iterrows():
        # Form of prediction: (sense, sense_scores)
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        
        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            reader.set_value(i, 'predict_sense_ids', sense.split("#")[1])
            #neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            #neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            #reader.set_value(i, 'predict_related', ",".join(neighbours))
        else:
            uncovered_words.append(row.target)
            continue
            
        pb.update(i)
    pb.finish()
    
    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8", index=False, quoting=QUOTE_NONE)
    print(("Saved predictions to " + output))
Example #5
def main():
    params = {}
    params["home"] = (env["HOME"], )
    params["corpus-path"] = ("{}/Desktop/corpus".format(env["HOME"]), )
    params["agent"] = ("sded", )
    params["recognition"] = ("on", "off")
    params["recognition"] = ("on", )
    params["trial"] = list(range(0, 10))
    params["max-sentences"] = (100, )
    commands = (
        "source {home}/research/code/wsd/agent/{agent}.soar",
        "smem --set recognition {recognition}",
        "epmem --set recognition {recognition}",
        "w 0",
    )
    reporters = (
        soar_exp.branch_name,
        soar_exp.avg_decision_time,
        soar_exp.max_decision_time,
    )
    # run Soar
    kernel = soar_exp.create_kernel()
    for param_map in soar_exp.param_permutations(params):
        agent = soar_exp.create_agent(
            kernel,
            "agent-" + "-".join(str(val) for val in param_map.values()))
        wsd = WSD(kernel, agent, param_map)
        soar_exp.register_destruction_callback(
            kernel, agent, soar_exp.print_report_row,
            soar_exp.report_data_wrapper(param_map, wsd, reporters))
        soar_exp.run_parameterized_commands(agent, param_map, commands)
        agent.ExecuteCommandLine("run")
        #soar_exp.cli(agent)
        kernel.DestroyAgent(agent)
        del agent
    kernel.Shutdown()
    del kernel
Example #6
def filter_scores(array):  # signature assumed; the source snippet starts mid-function
    # Pads the score vector with -1 until it holds exactly 10 entries.
    while array.size < 10:  # too few scores, pad with -1
        array = np.append(array, np.array([-1]))
    return array


ignore_case = True
lang = "en"  # used to filter out stopwords
# `orig_data_df`, `sensegram_df`, `context_window_size`, `sv` and `wv` are
# assumed to be defined earlier in the script.
# The WSD model does not depend on the sentence or word, so build it once.
wsd_model = WSD(sv,
                wv,
                window=context_window_size,
                lang=lang,
                ignore_case=ignore_case)
for i in range(2510):  # iterate over the sentences in orig_data_df
    sentence = orig_data_df['text'][i]
    detection = orig_data_df['detection'][i]
    word_list = sentence.split(' ')
    sentence_score_array = np.array([])
    for word in word_list:
        word_scores_list = wsd_model.disambiguate(sentence, word)[1]
        #print(word_scores_list)
        word_scores_list = sorted(word_scores_list, reverse=True)
        word_scores_arr = np.asarray(word_scores_list)
        word_scores_arr = filter_scores(word_scores_arr)
        sentence_score_array = np.concatenate(
            (sentence_score_array, word_scores_arr), axis=0)
    sentence_score_array = np.reshape(sentence_score_array,
                                      (len(word_list), 10))
    sensegram_df = sensegram_df.append(
        {
            'score': sentence_score_array,
            'detection': detection
        }, ignore_index=True)  # closing completed to match the append pattern in Example #1
Example #7
        ans = context.replace(',', '')
        ans = ans.replace('.', '')
        if len(ans.split(sep=" ")) < 4:
            print('sentence too short')
            ans = ''
    ans = tokenizer(ans)
    ans = np.array(ans)
    sentence = ans
    inputs = []
    for i in range(len(ans)):
        inputs.append(TEXT.vocab.stoi[str(ans[i])])
    length = len(ans)
    inputs = np.array(inputs)
    inputs = np.reshape(inputs,(length,1))

    wsd_model = WSD(sv, wv, window=context_window_size, lang='en', ignore_case=True)
    word_list = context.split(' ')

    sentence_df = sensegram(context, word_list, context_window_size)


    list = ['rnn', 'sensegram_rnn']  # note: this name shadows the built-in `list`
    j = 0
    prob = 0.0

    # SUBTASK 1 with only RNN
    model4 = torch.load('model_rnn.pt')
    predictions4, label4 = evaluate_task1(model4, torch.from_numpy(inputs), length)
    predictions4 = predictions4.detach().numpy()
    print('RNN model thinks the sentence is {}, with {}% confidence'.format(label4, str(100 * round(float(predictions4), 3))))
Example #8
from wsd import WSD
import os
BABELNET_TOKEN = os.environ["BABELNET_TOKEN"]

WSD("wsd.json", BABELNET_TOKEN, "stop_words", "EXTENDED")
Example #9
    def __init__(self, config, work_queue, output_queue):
        # Initialize the Disambiguator thread.
        # (Thread, time, spacy, sensegram, KeyedVectors, log and WSD are assumed
        # to be imported at module level.)
        Thread.__init__(self)
        self.daemon = True

        # Setting up variables
        self._config = config
        self._work_queue = work_queue
        self._output_queue = output_queue
        self._workers = []

        # Shouldn't have jobs yet
        self.has_work = False

        ## Set up all the various models
        # spaCy
        log.warn('Loading spaCy model: \'en\'')
        start_time = time.time()
        self._spacy_model = spacy.load('en')
        seconds = time.time() - start_time
        log.warn('Completed in {:5.2f} seconds'.format(seconds))

        # SenseGram
        log.info('Initializing SenseGram')

        # Figure out all the config file information
        sense_vectors_fpath = self._config['MODEL']['SENSE_VECTOR_PATH']
        word_vectors_fpath = self._config['MODEL']['WORD_VECTOR_PATH']
        context_words_max = int(self._config['MODEL']['CONTEXT_WORDS_MAX'])
        context_window_size = int(self._config['MODEL']['CONTEXT_WINDOW_SIZE'])
        ignore_case = self._config['MODEL'].getboolean('IGNORE_CASE')
        lang = self._config['MODEL']['LANG']

        # Really this should just be false unless you're debugging
        verbose = self._config['MODEL'].getboolean('VERBOSE')

        ## Load model
        # Sense Vectors
        log.warn('Loading sense vectors...')
        start_time = time.time()
        sv = sensegram.SenseGram.load_word2vec_format(sense_vectors_fpath,
                                                      binary=False)
        seconds = time.time() - start_time
        log.warn('Completed in {:5.2f} seconds'.format(seconds))

        # Word Vectors
        log.warn('Loading word vectors...')
        start_time = time.time()
        wv = KeyedVectors.load_word2vec_format(word_vectors_fpath,
                                               binary=False,
                                               unicode_errors="ignore")
        seconds = time.time() - start_time
        log.warn('Completed in {:5.2f} seconds'.format(seconds))

        self._sense_vectors = sv
        self.wsd_model = WSD(sv,
                             wv,
                             window=context_window_size,
                             lang=lang,
                             max_context_words=context_words_max,
                             ignore_case=ignore_case,
                             verbose=verbose)

        log.info('Model initialized successfully')
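
The constructor above pulls all model settings from a [MODEL] config section. A sketch of a matching configuration built with configparser; only the key names are taken from the code, the values are placeholders:

import configparser

config = configparser.ConfigParser()
config["MODEL"] = {
    "SENSE_VECTOR_PATH": "model/senses.w2v",   # placeholder path
    "WORD_VECTOR_PATH": "model/words.w2v",     # placeholder path
    "CONTEXT_WORDS_MAX": "5",
    "CONTEXT_WINDOW_SIZE": "10",
    "IGNORE_CASE": "yes",                      # read with getboolean()
    "LANG": "en",
    "VERBOSE": "no",                           # read with getboolean()
}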