def get_greatest_synsets_similarity_between(synsets_wn, nouns):
  synset_wn_max = None
  p_max = 0

  if len(nouns) != 0:
    for synset_wn in synsets_wn:
      p_noun = 0
      for noun in nouns:
        synsets_of_noun = WordnetHandler.get_synsets_for_word(noun, 'n')

        if len(synsets_of_noun) > 0:
          p_each_noun = 0
          for synset_of_noun in synsets_of_noun:
            p = WordnetHandler.cal_similarity(synset_wn, synset_of_noun)
            if p is not None:  # cal_similarity can return None for unrelated synsets
              p_each_noun += p
          p_each_noun = p_each_noun/len(synsets_of_noun)
          p_noun += p_each_noun

      p = p_noun/len(nouns)
      if p > p_max:
        p_max = p  # remember the best score, not just the last synset above 0
        synset_wn_max = synset_wn
  else:
    print "no nouns"

  return synset_wn_max
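The same averaging idea as a minimal self-contained sketch, assuming WordnetHandler.cal_similarity wraps WordNet path similarity (needs the NLTK wordnet corpus):

from nltk.corpus import wordnet as wn

def avg_similarity_to_noun(synset, noun):
    # average path similarity between synset and every noun sense of `noun`
    noun_synsets = wn.synsets(noun, pos='n')
    if not noun_synsets:
        return 0.0
    scores = [synset.path_similarity(s) or 0.0 for s in noun_synsets]
    return sum(scores) / len(noun_synsets)

print(avg_similarity_to_noun(wn.synsets('bank', pos='n')[0], 'river'))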
def get_gloss_synset_for(synset):
  key = synset.name()
  if key not in __dict_gloss_for_synset__:
    synsets_gloss = ""
    definition = synset.definition()
    synsets_gloss += definition + ". "

    # get hypernyms
    for hypernym in synset.hypernyms():
      synsets_gloss += WordnetHandler.get_lemma_synset(hypernym) + ". "

    # get hyponyms
    for hyponym in synset.hyponyms():
      synsets_gloss += WordnetHandler.get_lemma_synset(hyponym) + ". "

    # get meronyms
    for meronym in synset.part_meronyms():
      synsets_gloss += WordnetHandler.get_lemma_synset(meronym) + ". "

    # get holonyms
    for holonym in synset.member_holonyms():
      synsets_gloss += WordnetHandler.get_lemma_synset(holonym) + ". "

    # get examples
    for example in synset.examples():
      synsets_gloss += example + ". "

    __dict_gloss_for_synset__[key] = synsets_gloss

  return __dict_gloss_for_synset__[key]
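A standalone sketch of the same extended-gloss construction, assuming get_lemma_synset returns a synset's lemma names as text (NLTK wordnet corpus required):

from nltk.corpus import wordnet as wn

def extended_gloss(synset):
    # definition, then lemmas of related synsets, then usage examples
    parts = [synset.definition()]
    related = (synset.hypernyms() + synset.hyponyms()
               + synset.part_meronyms() + synset.member_holonyms())
    for rel in related:
        parts.append(' '.join(lemma.name() for lemma in rel.lemmas()))
    parts.extend(synset.examples())
    return '. '.join(parts)

print(extended_gloss(wn.synset('bank.n.01')))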
Example #3
def synsets_for_word(word_pos):
    (word, pos) = word_pos
    synsets = []
    if pos_is_noun(pos):
        synsets = WordnetHandler.get_synsets_for_word(word, "n")
    elif pos_is_verb(pos):
        synsets = WordnetHandler.get_synsets_for_word(word, "v")
    return synsets
def sim_ox_wn_value_main_synsets(word):
  dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_value_for(word)
  synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
  dict_vectors_ox = OxParseDefinition.get_vectors_value_for_word(word, synsets_wn)

  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
  (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)

  m2d_sim_defi_temp =  sim_ox_wn_defi_WDS_via_main_syns(word)
  DebugHandler.print_2d_matrix(m2d_sim_defi_temp)

  m2d_sim_defi = [[0 for x in range(len(vectors_wn))] for x in range(len(vectors_ox))]
  for i in range(len(vectors_wn)):
    for j in range(len(vectors_ox)):
      m2d_sim_defi[j][i] = m2d_sim_defi_temp[i][j]

  m2d_sim = [[0 for x in range(len(vectors_ox))] for x in range(len(vectors_wn))]
  for i in range(len(vectors_wn)):
    vector_wn = vectors_wn[i]
    print vector_wn
    for j in range(len(vectors_ox)):
      vector_ox = vectors_ox[j]
      # scipy's cosine() is a distance (1 - cosine similarity), not a similarity
      cosine = spatial.distance.cosine(m2d_sim_defi[j], vector_wn)
      m2d_sim[i][j] = cosine

  print "\n"
  for j in range(len(vectors_ox)):
    vector_ox = vectors_ox[j]
    print vector_ox
  return m2d_sim
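Note that scipy's spatial.distance.cosine returns a distance, so identical vectors score 0 and the values stored in m2d_sim shrink as vectors get closer. A quick check:

from scipy import spatial

a = [1.0, 2.0, 3.0]
b = [1.0, 2.0, 3.0]
print(spatial.distance.cosine(a, b))      # 0.0 -> identical vectors
print(1 - spatial.distance.cosine(a, b))  # 1.0 -> cosine similarity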
def sim_ox_wn_defi_WDS_via_main_syns(word):
  dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
  dict_vectors_ox = OxParseDefinition.get_dict_vectors_synsets_for_word(word, synsets_wn)

  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
  (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)

  m2d_sim = sim_wn_ox_vector(vectors_ox, vectors_wn)

  cal_sim_ngrams(word)

#  # write to file
#  for i in range(len(keys_wn)):
#    m2d_sim[i].insert(0,keys_wn[i]);
#  row_dict = [];
#  row_dict.append(word);
#  for i in range(len(keys_ox)):
#    row_dict.append(keys_ox[i].encode('utf8'));
#  filename = 'Results/vector_definition/' + word + '.csv'
#  FileProcess.append_to_excel_file(filename, row_dict, m2d_sim)

  return m2d_sim
Example #6
def parse_ox_wn_defi_to_input(word):
  defis_wn = WordnetHandler.get_definitions_for_word(word)
  defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

  for defi_wn in defis_wn:
    for defi_ox in defis_ox:
      value = defi_wn + "\t" + defi_ox
      FileProcess.append_value_to_file(value, __filename_input_sen__)
Example #7
def sim_2_word(word_1, word_2):
    synsets_1 = synsets_for_word(word_1)
    synsets_2 = synsets_for_word(word_2)
    p_max = 0
    if __WSD_type__ == 1:
        for synset_1 in synsets_1:
            for synset_2 in synsets_2:
                p = WordnetHandler.cal_similarity(synset_1, synset_2)

                if p > p_max:
                    p_max = p

    if __WSD_type__ == 0:
        if len(synsets_1) == 0 or len(synsets_2) == 0:
            return 0
        p_max = WordnetHandler.cal_similarity(synsets_1[0], synsets_2[0])

    return p_max
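The __WSD_type__ == 1 branch is the usual max-over-all-sense-pairs score. A self-contained equivalent with NLTK, using path_similarity in place of cal_similarity:

from nltk.corpus import wordnet as wn

def sim_2_word_max(word_1, word_2):
    # best path similarity over every pair of noun senses
    best = 0.0
    for s1 in wn.synsets(word_1, pos='n'):
        for s2 in wn.synsets(word_2, pos='n'):
            p = s1.path_similarity(s2)
            if p is not None and p > best:
                best = p
    return best

print(sim_2_word_max('car', 'automobile'))  # 1.0: the words share a sense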
def get_gloss_for_jacc(word):
  vectors = OrderedDict()
  synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  for synset in synsets:
    vector = get_gloss_synset_for(synset)
    key = synset.definition()
    vectors[key] = vector

  return vectors
def get_dict_vectors_value_for(word):
  vectors = OrderedDict()
  synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  for synset in synsets:
    vector = get_value_synset_for(synset, synsets)
    key = synset.definition()
    vectors[key] = vector

  return vectors
Example #10
def sim_for_synset_and_synsetvector(a_synset, vector):
  p_max = 0
  for (synset,weight) in vector:
    p = WordnetHandler.cal_similarity(a_synset, synset)
    if p is not None:
#      p = p*weight
      if p > p_max:
        p_max = p

  return p_max
def get_dict_vectors_words_for_word(word):
  vectors = OrderedDict()
  synsets = WordnetHandler.get_synsets_for_word(word, 'n')
  for synset in synsets:
    definition = synset.definition()
    vector = PreprocessDefinition.preprocess_sentence(definition)
    key = definition
    vectors[key] = vector

  return vectors
Example #12
def sim_ox_wn_defi_WDS_via_1_main_syn(word):
  dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
  dict_vectors_ox = OxParseDefinition.get_dict_vectors_synsets_for_word(word, synsets_wn)

  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
  (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)

  m2d_sim = sim_wn_ox_vector(vectors_ox, vectors_wn)

  return m2d_sim
Example #13
def sim_ox_wn_defi_WDS_via_align_all(word):
  words_wn = WordnetParseDefinition.get_dict_vectors_words_for_word(word)
  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(words_wn)
  words_ox = OxParseDefinition.get_dict_vectors_word_for_word(word)
  (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(words_ox)

  m2d_sim = [[0 for x in range(len(keys_ox))] for x in range(len(keys_wn))]

  for i in range(len(keys_wn)):
    vector_wn = vectors_wn[i]
    words_ox = []
    for j in range(len(keys_ox)):
      words_ox += vectors_ox[j]
    synsets_wn = WordnetHandler.get_nearest_synsets_words_words_order(vector_wn, words_ox)

    for j in range(len(keys_ox)):
      synsets_ox = WordnetHandler.get_nearest_synsets_words_synsets_order(vectors_ox[j], synsets_wn)
      m2d_sim[i][j] = sim_2_vector(synsets_wn, synsets_ox)

  cal_sim_ngrams(word)

  return m2d_sim
Example #14
def sim_ox_wn_defi_WDS_via_align(word):
  words_wn = WordnetParseDefinition.get_dict_vectors_words_for_word(word)
  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(words_wn)
  words_ox = OxParseDefinition.get_dict_vectors_word_for_word(word)
  (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(words_ox)

  synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')

  m2d_sim = [[0 for x in range(len(keys_ox))] for x in range(len(keys_wn))]

  for i in range(len(keys_wn)):
    vector_wn = vectors_wn[i]
    synset_wn = synsets_wn[i]
    for j in range(len(keys_ox)):
      vector_ox = vectors_ox[j]
      m2d_sim[i][j] = WordnetHandler.sim_for_words_words_no_order(vector_ox, vector_wn, synset_wn)
#      (vector_1, vector_2) = WordnetHandler.get_nearest_synsets_words_words_noorder(vector_ox, vector_wn)
#      m2d_sim[i][j] = sim_2_vector(vector_1, vector_2)

  cal_sim_ngrams(word)

  return m2d_sim
def get_greatest_synset_similarity_between(synset_1, noun_2):
  synset_max = None

  (word, pos) = noun_2
  synsets_of_noun_1 = WordnetHandler.get_synsets_for_word(word, 'n')
  synsets_of_noun_2 = WordnetHandler.get_synsets_for_word(word, 'v')
  synsets_of_noun = synsets_of_noun_1 + synsets_of_noun_2

  # total_count only feeds the frequency-weighted variant kept below
  total_count = 0.1 + len(synsets_of_noun)*__SMOOTH_WEIGHT__
  for synset_of_noun in synsets_of_noun:
    total_count += WordnetHandler.get_freq_count_of_synset(synset_of_noun)

  # currently returns the first (most frequent) sense; synset_1 is unused
  if len(synsets_of_noun) > 0:
    synset_max = synsets_of_noun[0]
#    # frequency-weighted alternative:
#    p_max = -1.0
#    for synset_of_noun in synsets_of_noun:
#      p = WordnetHandler.cal_similarity(synset_1, synset_of_noun)
#      if p is not None:
#        synset_freq_count = __SMOOTH_WEIGHT__
#        synset_freq_count += WordnetHandler.get_freq_count_of_synset(synset_of_noun)
#        p = p*(synset_freq_count/total_count)
#      if p > p_max:
#        p_max = p
#        synset_max = synset_of_noun

  return synset_max
Example #16
def get_m2d_sim_for_word_from_svm_result(word):
  defis_wn = WordnetHandler.get_definitions_for_word(word)
  defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

  if len(defis_wn) == 0 or len(defis_ox) == 0:
    return None

  m2d_sim = [[0 for x in range(len(defis_ox))] for x in range(len(defis_wn))]

  for i_wn in range(len(defis_wn)):
    defi_wn = str(defis_wn[i_wn])
    for i_ox in range(len(defis_ox)):
      defi_ox = str(defis_ox[i_ox])
      m2d_sim[i_wn][i_ox] = ReadSVMResult.get_sim_for_sens(defi_wn, defi_ox)

  return m2d_sim
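Every matrix in these snippets is built with the nested comprehension [[0 for x in range(cols)] for x in range(rows)], which creates independent row lists; [[0]*cols]*rows would alias a single row. A quick check of the row/column order:

rows, cols = 2, 3
m2d = [[0 for x in range(cols)] for x in range(rows)]
m2d[0][1] = 7
print(m2d)  # [[0, 7, 0], [0, 0, 0]] -> indexed as m2d[row][col]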
Example #17
def wordnet_based_synset(syn_wn, sen_ox):
    sim = 0.0001
    words_ox = split_words(sen_ox)
    count = 0
    for word in words_ox:
        p_max = 0
        synsets_1 = synsets_for_word(word)
        for synset in synsets_1:
            p = WordnetHandler.cal_similarity(synset, syn_wn)
            if p > p_max:
                p_max = p

        if p_max != 0:
            count += 1
            sim += p_max

    sim /= count + 0.001
    return sim
Example #18
def create_input_sens_test(dict_ox):

  flag_can_go = False
  for word in dict_ox:

    if word == "blockage":
      flag_can_go = True

    if flag_can_go == False:
      continue

    if len(dict_ox[word]) == 0:
      continue

    defis_wn = WordnetHandler.get_definitions_for_word(word)
    defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

    if len(defis_ox) == 1 and len(defis_wn) == 1:
      continue

    if len(defis_ox) == 1 and len(defis_wn) > 1:
      all_defi_wn = ""
      for defi_wn in defis_wn:
        all_defi_wn += defi_wn + "\t"

      if all_defi_wn != "":
        all_defi_wn = all_defi_wn[:-1]
      for defi_wn in defis_wn:
        for defi_ox in defis_ox:
          value = defi_wn + "\t" + defi_ox + "\t" + all_defi_wn
          FileProcess.append_value_to_file(value, __filename_input_sen_test__)
    else:
      for defi_wn in defis_wn:
        all_defi_ox = ""
        for defi_ox in defis_ox:
          all_defi_ox += defi_ox + "\t"

        if all_defi_ox != "":
          all_defi_ox = all_defi_ox[:-1]

        for defi_ox in defis_ox:
          value = defi_wn + "\t" + defi_ox + "\t" + all_defi_ox
          FileProcess.append_value_to_file(value, __filename_input_sen_test__)
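The append-a-tab-then-trim pattern used above is equivalent to joining with "\t"; a small demonstration:

defis = ["sense one", "sense two", "sense three"]
all_defi = ""
for d in defis:
    all_defi += d + "\t"
if all_defi != "":
    all_defi = all_defi[:-1]
assert all_defi == "\t".join(defis)
print(all_defi)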
Example #19
def sim_ox_wn_defi_WDS_via_curr_main_syn(word):
  dict_vectors_wn = WordnetParseDefinition.get_dict_vectors_synsets_for_word(word)
  (keys_wn, vectors_wn) = Util.get_keys_values_of_dict(dict_vectors_wn)
  synsets_wn = WordnetHandler.get_synsets_for_word(word, 'n')

  definitions = OxfordParser.get_definitions_of_word(word)

  m2d_sim = [[0 for x in range(len(definitions))] for x in range(len(vectors_wn))]

  for i in range(len(vectors_wn)):
    vector_wn = vectors_wn[i]

    dict_vectors_ox = OxParseDefinition.get_dict_vectors_synsets_for_word(word, [synsets_wn[i]])
    (keys_ox, vectors_ox) = Util.get_keys_values_of_dict(dict_vectors_ox)

    for j in range(len(vectors_ox)):
      vector_ox = vectors_ox[j]
      m2d_sim[i][j] = sim_2_vector(vector_ox, vector_wn)

  cal_sim_ngrams(word)

  return m2d_sim
def get_value_synset_for(cur_synset, synsets):
  synsets_value = []
  definition = cur_synset.definition()
  nouns = PreprocessDefinition.preprocess_sentence(definition)
#  nouns = list(set(nouns))
  for synset in synsets:
    count = 0
    p = 0
    for noun in nouns:
      synset_max = get_greatest_synset_similarity_between(synset, noun)
      if synset_max is not None:
        count += 1
        sim = WordnetHandler.cal_similarity(synset, synset_max)
        if sim is not None:
          p += sim

    if count != 0:
      p = p/count

    synsets_value.append(p)

  return synsets_value
Example #21
def create_input_for_test_svm():
  dict_ox =  OxfordParser.get_dict_nouns()
  flag_can_go = False
  for word in dict_ox:

#    if word == "brook":
#      flag_can_go = True
#
#    if flag_can_go == False:
#      continue

    if len(dict_ox[word]) == 0:
      continue

    syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    syns_ox = dict_ox[word]

    if len(syns_ox) == 1 and len(syns_wn) == 1:
      continue

    write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_test__)
    cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_test_feature_values__)
Example #22
def get_definition_value_with_synsetwn(definition, synsets_wn):
  synsets_value = []
#  nouns = PreprocessDefinition.preprocess_sentence_to_nouns(definition)
  nouns = PreprocessDefinition.preprocess_sentence(definition)
#  nouns = list(set(nouns))
  for synset in synsets_wn:
    count = 0
    p = 0
    for noun in nouns:
      # note: the first argument is unused by the current implementation above
      synset_max = get_greatest_synset_similarity_between([synset], noun)
      if synset_max is not None:
        count += 1
        sim = WordnetHandler.cal_similarity(synset, synset_max)
        if sim is not None:
          p += sim

    if count != 0:
      p = p/count

    synsets_value.append(p)

  return synsets_value
def similarity_nbest_withword_average(WORD, dict_words):

  wn_words = WordnetHandler.get_synsets_for_word(WORD, 'n');

  matrix_similarity = similarity_by_synsets_synsets_nbest_withword_dict_wn(WORD, dict_words,wn_words)

  if matrix_similarity is None:
    return [], []

  matrix_similarity_reverse = similarity_by_synsets_synsets_nbest_withword_wn_dict(WORD , wn_words, dict_words)

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
      matrix_similarity[iWnWord][iDictWord] /= 2;

  matrix_similarity_jaccard = similarity_by_jaccard(WORD, dict_words, wn_words)

  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*(1-PARAMETERS.JACCARD_WEIGHT) + PARAMETERS.JACCARD_WEIGHT*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);

  return matrix_similarity, wn_words
Example #24
def pair_0_1_reducing_m2d_sim(matrix_similarity, num_rows, num_cols, word):

  if num_rows == 1 and num_cols == 1 and matrix_similarity[0][0] > Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_1_MIN:
      matrix_similarity[0][0] = 1;

  if num_rows > 1 and num_cols == 1:
    col = []
    for iWnWord in range(num_rows):
      col.append(matrix_similarity[iWnWord][0])
    order = heapq.nlargest(2, range(num_rows), col.__getitem__);
    if matrix_similarity[order[0]][0] >= Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_COL_RANGE_FIRST*matrix_similarity[order[1]][0] or \
            matrix_similarity[order[0]][0] > Parameters.PARAMETERS_CHOICE_0_1.CHOICE_1_COL_MIN_FIRST:
      matrix_similarity[order[0]][0] = 1;

  if num_rows >= 1 and num_cols > 1:
    synsets_wn = WordnetHandler.get_synsets_for_word(word,'n')
    status_synsets = create_status_array(synsets_wn)
    updated = reducing_m2d_sim(matrix_similarity, status_synsets)
    while updated == 1:
      m2d = sim_ox_wn_defi_WDS_via_main_syns_for_reduce(synsets_wn, status_synsets, word)
      updated = reducing_m2d_sim(m2d, status_synsets)
      match_matrix_sim_with_temp_matrix(matrix_similarity, m2d)

  return matrix_similarity
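heapq.nlargest with col.__getitem__ as the key returns the indices of the largest column entries, which is what the first-versus-runner-up ratio test above relies on:

import heapq

col = [0.2, 0.9, 0.5]
order = heapq.nlargest(2, range(len(col)), key=col.__getitem__)
print(order)  # [1, 2]: index of the best entry, then the runner-up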
Example #25
def create_input_for_train_svm():
  dict_ox =  OxfordParser.get_dict_nouns()
  dict_gold = CompareWithGold.goldData

  for word in dict_ox:

    if len(dict_ox[word]) == 0 or word not in dict_gold:
      continue

    if word == "brook":
      return

#    if word != "bank":
#      continue

    syns_wn = WordnetHandler.get_synsets_for_word(word, 'n')
    syns_ox = dict_ox[word]

    if len(syns_ox) == 1 and len(syns_wn) == 1:
      continue

    write_label_for_svm(syns_wn, syns_ox, dict_gold[word])
    write_sens_for_reading(syns_wn, syns_ox, __filename_input_sen_train__)
    cal_features_and_write_to_file_for(syns_wn, syns_ox, __filename_input_train_feature_values__)
def get_synsets_for_word_in_wn(word_origin, wn_synsets_for_word_origin):

  # arr synsets for arr words
  # each word has an array of synsets
  wn_synsets_for_words = [];

  # add p
  p_synsets_for_words = [];

  for iWord in range(len(wn_synsets_for_word_origin)):

    # print "- - - - - - - - - - - - - - - - - - - - - - - - - - -";
    # print iWord;
    wn_synsets_for_words.append([]);

    # add p
    p_synsets_for_words.append([]);

    # get a bank in wn_words
    wordDict = wn_synsets_for_word_origin[iWord];

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get synsets of bank
    synset_of_word = wn.synset(wordDict.name());
    wn_synsets_for_words[iWord].append(synset_of_word);

    # add p
    p_synsets_for_words[iWord].append(1.5);

    # print synset_of_word
    # print "---"

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hypernyms

    if PARAMETERS.DICT_WN_FEATURE_RELATION_hypernyms == 1:
      # print "hypernyms"
      for hypernym in wn.synset(wordDict.name()).hypernyms():
        # print hypernym
        wn_synsets_for_words[iWord].append(hypernym);

        # add p
        p_synsets_for_words[iWord].append(1.2);


    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get meronyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_part_meronyms == 1:
      # print "meronyms"
      for meronym in wn.synset(wordDict.name()).part_meronyms():
        # print meronym
        wn_synsets_for_words[iWord].append(meronym);

        # add p
        p_synsets_for_words[iWord].append(1.2);

    # # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # # get holonyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_member_holonyms == 1:
      # print "holonyms"
      for holonym in wn.synset(wordDict.name()).member_holonyms():
        # print holonym
        wn_synsets_for_words[iWord].append(holonym);

        # add p
        p_synsets_for_words[iWord].append(1.2);

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # get hyponyms
    if PARAMETERS.DICT_WN_FEATURE_RELATION_hyponyms == 1:
      # print "hyponyms"
      for hyponym in wn.synset(wordDict.name()).hyponyms():
        # print hyponym
        wn_synsets_for_words[iWord].append(hyponym);

        # add p
        p_synsets_for_words[iWord].append(1.2);

    # # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # # get description

    if PARAMETERS.DICT_WN_FEATURE_RELATION_definition == 1:

      # print "\ndefinition ------";

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wordDict.name()).definition()));
      # print tagged_sent

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if PARAMETERS.POS_FEATURE_n == 1:
        nouns = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS'  or pos == 'JJ')];

        for noun in nouns:

          noun = wordnet_lemmatizer.lemmatize(noun, pos='n');

          if noun is None:
            continue

          if noun != word_origin and noun != "sth":
            synsetsDictNoun = WordnetHandler.get_synsets_for_word(noun, "n");

            if len(synsetsDictNoun) > 0:
              synsetMax = synsetsDictNoun[0];
              p_max = 0;

              for synsetNoun in synsetsDictNoun:
                p = synsetNoun.path_similarity(synset_of_word);
                if p > p_max:
                  p_max = p;
                  synsetMax = synsetNoun

              # print synsetMax
              if synsetMax not in wn_synsets_for_words[iWord]:
                wn_synsets_for_words[iWord].append(synsetMax);

                # add p
                p_synsets_for_words[iWord].append(1.);

            # if synsetsDictNoun[0] not in wn_words_synset[iWord]:
            #   # wn_words_synset[iWord].append(synsetsDictNoun[0]);
      # - - - - - - - - - - - - - - - - - - - - - - - - - - -

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      if PARAMETERS.POS_FEATURE_v == 1:
        verbs = [word for word,pos in tagged_sent if (pos == 'VB' or pos == 'VBD' or pos == 'VBN')];

        for verb in verbs:

          verb = wordnet_lemmatizer.lemmatize(verb, pos='v');

          if verb is None:
            continue

          if verb != "bank":
            synsetsDictVerb = WordnetHandler.get_synsets_for_word(verb, "v");


            if len(synsetsDictVerb) > 0:
              synsetMax = synsetsDictVerb[0];
              p_max = 0;

              for synsetVerb in synsetsDictVerb:
                p = synsetVerb.path_similarity(synset_of_word);
                if p > p_max:
                  p_max = p;
                  synsetMax = synsetVerb
              #
              # print synsetMax
              if synsetMax not in wn_synsets_for_words[iWord]:
                wn_synsets_for_words[iWord].append(synsetMax);

                # add p
                p_synsets_for_words[iWord].append(1.);

            # if synsetsDictNoun[0] not in wn_words_synset[iWord]:
            #   wn_words_synset[iWord].append(synsetsDictNoun[0]);

    # print wn_synsets_for_words[iWord]

  ########################################
  return wn_synsets_for_words,p_synsets_for_words;
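The definition-expansion step above reduces to: tokenize the gloss, POS-tag it, keep the nouns, lemmatize them. A standalone sketch with plain NLTK in place of POSWrapper (assuming the NLTK tagger and wordnet data are installed):

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
definition = wn.synset('bank.n.01').definition()
tagged = nltk.pos_tag(nltk.wordpunct_tokenize(definition))
nouns = [lemmatizer.lemmatize(w, pos='n') for w, pos in tagged
         if pos in ('NN', 'NNS')]
print(nouns)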
Example #27
def cal_feature_values_for(syn_wn, syn_ox):
  feature_values = []

  defi_wn = WordnetHandler.get_defi_for_syn(syn_wn)
  defi_ox = OxfordParser.get_defi_for_syn(syn_ox)

  gloss_wn = WordnetHandler.get_gloss_for_syn(syn_wn)
  gloss_ox = OxfordParser.get_gloss_for_syn(syn_ox)

  lemma_wn = WordnetHandler.get_lemma_for_synset(syn_wn)
  sd_ox = OxfordParser.get_short_defi_for_syn(syn_ox)

  ex_wn = WordnetHandler.get_ex_for_syn(syn_wn)
  ex_ox = OxfordParser.get_ex_for_syn(syn_ox)

  cl_ox =  OxfordParser.get_collocation_for_syn(syn_ox)
  hyper_wn = WordnetHandler.get_hyper_defi_for_synset(syn_wn)
  mero_wn = WordnetHandler.get_mero_defi_for_synset(syn_wn)

  # # # # # # # # # # # # # # # # #
  # Literal
  literal_leven_value = 1-Literal.levenshtein(defi_wn, defi_ox)
  feature_values.append(literal_leven_value)

  literal_jacc_value = 1.00001-Literal.jaccard(defi_wn, defi_ox)
  feature_values.append(literal_jacc_value)
#  feature_values.append(literal_jacc_value+literal_leven_value)

  # # # # # # # # # #

  literal_leven_value = 1-Literal.levenshtein(gloss_wn, gloss_ox)
  feature_values.append(literal_leven_value)

  literal_jacc_value = 1.00001-Literal.jaccard(gloss_wn, gloss_ox)
  feature_values.append(literal_jacc_value)
#  feature_values.append(literal_jacc_value+literal_leven_value)

  # # # # # # # # # #

  literal_leven_ngram = literal_leven_value
  literal_jacc_ngram = literal_jacc_value

  ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 2)
  literal_jacc_ngram += ngrams_value
  literal_leven_ngram += ngrams_value

  ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 3)
  literal_jacc_ngram += ngrams_value
  literal_leven_ngram += ngrams_value

  ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 4)
  literal_jacc_ngram += ngrams_value
  literal_leven_ngram += ngrams_value

  ngrams_value = Ngrams.ngrams_word_for(gloss_wn, gloss_ox, 5)
  literal_jacc_ngram += ngrams_value
  literal_leven_ngram += ngrams_value

  feature_values.append(literal_jacc_ngram)
#  feature_values.append(literal_leven_ngram)

  # # # # # # # # # #

#  gloss_split_wn = Literal.split_and_stem(gloss_wn)
#  gloss_split_ox = Literal.split_and_stem(gloss_ox)
#  literal_jaro_winkler = Jelly.jaro_winkler(gloss_wn, gloss_ox)
#  feature_values.append(literal_jaro_winkler + literal_jacc_value)

  # # # # # # # # # #

#  literal_jacc_value = 1.00001-Literal.jaccard(ex_wn, ex_ox)
#  feature_values.append(literal_jacc_value)

  # # # # # # # # # # # # # # # # #
  # ShallowSyntactic

#  shallow_jaccard_POS = 0
#  shallow_jaccard_POS += 1.0001 - ShallowSyntactic.jaccard_POS(gloss_wn, gloss_ox)
#  shallow_jaccard_POS += 1.0001 - ShallowSyntactic.jaccard_POS_ngrams(gloss_wn, gloss_ox, 2)
#  shallow_jaccard_POS += 1.0001 - ShallowSyntactic.jaccard_POS_ngrams(gloss_wn, gloss_ox, 3)
#  shallow_jaccard_POS += 1.0001 - ShallowSyntactic.jaccard_POS_ngrams(gloss_wn, gloss_ox, 4)
#  feature_values.append(shallow_jaccard_POS)

  # # # # # # # # # # # # # # # # #
  # wordnet-based, WSD

  wn_value = WordnetBased.wordnet_based(defi_wn, defi_ox, 0)
  feature_values.append(wn_value)

#  wn_value = WordnetBased.wordnet_based(hyper_wn, defi_ox, 0)
#  feature_values.append(wn_value)

#  hypo_value = 0
#  if len(syn_wn.hyponyms()) > 0:
#    for hypo in syn_wn.hyponyms():
#      hypo_value += WordnetBased.wordnet_based_synset(hypo, defi_ox)
#    hypo_value /= len(syn_wn.hyponyms())
#  feature_values.append(hypo_value)

#  hyper_value = 0
#  if len(syn_wn.hypernyms()) > 0:
#    for hyper in syn_wn.hypernyms():
#      hyper_value += WordnetBased.wordnet_based_synset(hyper, defi_ox)
#    hyper_value /= len(syn_wn.hypernyms())
#  feature_values.append(hyper_value)
#
#  wn_value = WordnetBased.wordnet_based(ex_wn, ex_ox,0)
#  feature_values.append(wn_value)
#
#  wn_value_1 = WordnetBased.wordnet_based(defi_wn, defi_ox, 1)
#  feature_values.append(wn_value + wn_value_1)
#
#  wn_value = WordnetBased.wordnet_based(gloss_wn, gloss_ox, 0)
#  feature_values.append(wn_value)
#
#  wn_value_1 = WordnetBased.wordnet_based(gloss_wn, gloss_ox, 1)
#  feature_values.append(wn_value + wn_value_1)

  # # # # # # # # # # # # # # # # #
  # lsa
#  lsa_tfidf = LSA.sim_tfidf(defi_wn, defi_ox)
#  feature_values.append(lsa_tfidf)
##
#  lsa_tfidf = LSA.sim_tfidf(hyper_wn, defi_ox)
#  feature_values.append(lsa_tfidf)
#
#  lsa_tfidf = LSA.sim_tfidf(gloss_wn, gloss_ox)
#  feature_values.append(lsa_tfidf)

#  lsa_tfidf = LSA.sim_tfidf(lemma_wn, sd_ox)
#  feature_values.append(lsa_tfidf)
#
#  lsa_tfidf = LSA.sim_tfidf(ex_wn, ex_ox)
#  feature_values.append(lsa_tfidf)

  return feature_values
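The literal features above invert distances so that larger means more similar. A minimal word-level Jaccard distance of the kind Literal.jaccard is assumed to compute:

def jaccard_distance(sen_1, sen_2):
    # 1 - |intersection| / |union| over the word sets of two sentences
    set_1, set_2 = set(sen_1.split()), set(sen_2.split())
    if not (set_1 | set_2):
        return 1.0
    return 1.0 - len(set_1 & set_2) / float(len(set_1 | set_2))

d = jaccard_distance("a long narrow excavation", "a long narrow hole")
print(1 - d)  # 0.6: three shared words out of five distinct words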
def get_nbest_synsets_for_word_in_oxford(dict_words,word_concept):

  dict_words_nouns = [];
  dict_words_verbs = [];
  dict_synsets_for_words = [];

  wn_words = WordnetHandler.get_synsets_for_word(word_concept, 'n');

  # add p
  p_synsets_for_words = [];

  for iWord in range(len(dict_words)):

    # print iWord;

    dict_words_nouns.append([]);
    dict_words_verbs.append([]);
    dict_synsets_for_words.append([]);

    # add p
    p_synsets_for_words.append([]);

    wordDict = dict_words[str(iWord)];

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # sd

    if not wordDict.has_key('tv'):
      continue

    if not wordDict.has_key('d'):
      continue

    nouns = [];
    if wordDict.has_key("sd") and PARAMETERS.DICT_OX_FEATURE_RELATION_sd == 1:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["sd"]));
      nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' and word != 'etc'))];

      if len(nouns) == 0:
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
        # print tagged_sent
        nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' and word != 'etc'))];

    elif wordDict.has_key("d") and wordDict["d"] is not None:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
      # print tagged_sent
      nouns = [word for word,pos in tagged_sent if ((pos == 'NN' or pos == 'NNS') and (word != 'sth' and word != 'etc'))];
    else:
      continue

    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n');
      if noun is None:
        continue

      if noun != "sth" and noun != 'etc' and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun);

    if len(dict_words_nouns[iWord]) == 0:
      continue

    # print dict_words_nouns[iWord]
    synsetsSD = [];

    for word in dict_words_nouns[iWord]:
      synsets = WordnetHandler.get_synsets_for_word(word, 'n');
      for synset in synsets:
        synsetsSD.append(synset)

    if len(synsetsSD) == 0:
      continue

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # d

    if PARAMETERS.DICT_OX_FEATURE_RELATION_d == 1:
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
      nouns = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS')];

    if PARAMETERS.DICT_OX_FEATURE_RELATION_xh == 1:
      if wordDict.has_key('xh0') and wordDict['xh0'] is not None and wordDict['xh0'] != 'nn':
        nouns.append(wordDict['xh0']);
      if wordDict.has_key('xh1') and wordDict['xh1'] is not None:
        nouns.append(wordDict['xh1']);
      if wordDict.has_key('xh2') and wordDict['xh2'] is not None:
        nouns.append(wordDict['xh2']);

    # print  tagged_sent

    for noun in nouns:
      noun = wordnet_lemmatizer.lemmatize(noun, pos='n');
      if noun is None:
        continue

      if noun.encode('utf8') != word_concept and noun != "sth" and noun not in dict_words_nouns[iWord]:
        dict_words_nouns[iWord].append(noun);

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # print wordDict["tv"]
    # print dict_words_nouns[iWord]
    #
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    # synsets

    iSDMax = 0;
    pSD_max = 0;

    for iSyn in range(len(synsetsSD)):
      synsetSD = synsetsSD[iSyn];
      pSD = 0;

      arr_p = [];

      for synset in wn_words:
        # p_noun_max = 0;
        p = synsetSD.path_similarity(synset);
        # print "-----------------------"
        # if p > p_noun_max:
        p_noun_max = p;

        arr_p.append(p_noun_max);

      arr_p = sorted(arr_p, reverse=True);

      # keep only the largest similarity (only i == 0 passes the test)
      for i in xrange(0, len(arr_p)-1):
        if i <= 0:
          pSD += arr_p[i];

      # print "\n"

      if pSD > pSD_max:
        pSD_max = pSD;
        iSDMax = iSyn;

    # print "\n"

    synsetRoot = synsetsSD[iSDMax];
    # print "synsetroot"
    # print synsetRoot

    for noun in dict_words_nouns[iWord]:
      synsets_noun = WordnetHandler.get_synsets_for_word(noun, 'n');
      if len(synsets_noun) <= 0:
        continue;

      p_noun_max = 0;
      synMax = synsets_noun[0];

      for synset_noun in synsets_noun:
        # dict_synsets_nouns[iWord].append(synMax);
        for synset in wn_words:
          p = synset.path_similarity(synset_noun);
        # p = synsetRoot.path_similarity(synset_noun);
          if p > p_noun_max:
            p_noun_max = p;
            synMax = synset_noun;

      if synMax not in dict_synsets_for_words[iWord]:
        dict_synsets_for_words[iWord].append(synMax);

    if PARAMETERS.POS_FEATURE_v:

      # continue
      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wordDict["d"]));
      verbs = [word for word,pos in tagged_sent if (pos == 'VB' or pos == 'VBN' or pos == 'VBD')];

      # print "VVVVV"
      # print verbs
      for verb in verbs:
        verb = wordnet_lemmatizer.lemmatize(verb, pos='v');
        if verb is None:
          continue

        if verb.encode('utf8') != word_concept and verb != "sth" and verb not in dict_words_verbs[iWord]:
          # print noun;
          dict_words_verbs[iWord].append(verb);

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # print dict_words_verbs[iWord]

      # - - - - - - - - - - - - - - - - - - - - - - - - - - -
      #
      # synsets

      iSDMax = 0;
      pSD_max = 0;

      for iSyn in range(len(synsetsSD)):
        synsetSD = synsetsSD[iSyn];
        pSD = 0;

        arr_p = [];

        for synset in wn_words:
          # p_noun_max = 0;
          p = synsetSD.path_similarity(synset);
            # arr_p.append(p);
          # print "-----------------------"
          # print synsetSD
          # print synset
          # print p
          # if p > p_noun_max:
          p_verb_max = p;

          arr_p.append(p_verb_max);

        arr_p = sorted(arr_p, reverse=True);

        # keep only the two largest similarities
        for i in xrange(0, len(arr_p)-1):
          if i <= 1:
            pSD += arr_p[i];

        # print "\n"

        if pSD > pSD_max:
          # print pSD
          # print pSD_max
          pSD_max = pSD;
          # print iSyn
          # print iSDMax
          iSDMax = iSyn;

      # print "\n"

      synsetRoot = synsetsSD[iSDMax];
      # print "synsetroot"
      # print synsetRoot

      for verb in dict_words_verbs[iWord]:
        synsets_verb = WordnetHandler.get_synsets_for_word(verb, 'v');
        if len(synsets_verb) <= 0:
          continue;

        p_verb_max = 0;
        synMax = synsets_verb[0];

        for synset_verb in synsets_verb:
          # p = synsetRoot.path_similarity(synset_verb);
          for synset in wn_words:
            p = synset.path_similarity(synset_verb);

            if p > p_verb_max:
              p_verb_max = p;
              synMax = synset_verb;

        if synMax not in dict_synsets_for_words[iWord]:
          dict_synsets_for_words[iWord].append(synMax);
        # if synsets_noun[0] not in dict_synsets_nouns[iWord]:
          # dict_synsets_nouns[iWord].append(synsets_noun[0]);

    # print "dict_synsets_nouns"
    # print dict_synsets_for_words[iWord]

  ########################################
  return dict_synsets_for_words;
Example #29
def sim_ox_wn_via_svm():
  total_tp = 0.00001
  total_tn = 0.00001
  total_fn = 0.00001
  total_fp = 0.00001
  total_pair = 0

  dict_ox = OxfordParser.get_dict_nouns()
  flag_can_go = False
  for word in dict_ox:

#    if word == "brook":
#      flag_can_go = True
#
#    if flag_can_go == False:
#      continue

    word_syns_ox = dict_ox[word]
    wn_synsets = WordnetHandler.get_synsets_for_word(word, "n")

    m2d_sim = [[0 for x in range(len(word_syns_ox))] for x in range(len(wn_synsets))]

    if len(word_syns_ox) == 1 and len(wn_synsets) == 1:
      m2d_sim[0][0] = 1
    else:
      m2d_sim = get_m2d_sim_for_word_from_svm_result(word)

    if m2d_sim is None:
      continue

#    DebugHandler.print_2d_matrix(m2d_sim)

    m2d_sim = choose_pair_0_1(m2d_sim, len(m2d_sim), len(m2d_sim[0]))
#    DebugHandler.print_2d_matrix(m2d_sim)

    pair = count_pair(m2d_sim)
    total_pair += pair

    (tp, tn, fn, fp) = CompareWithGold.compareGoldWithResult_without_cal_result(m2d_sim,word)
    if tp != -1:
      total_tp += tp
      total_tn += tn
      total_fn += fn
      total_fp += fp

  precision = total_tp / (total_tp + total_fp)
  recall = total_tp / (total_tp + total_fn)
  accuracy = (total_tp + total_tn) / (total_tp + total_tn + total_fp + total_fn)

  f_score = 0
  if precision != 0 or recall != 0:
    f_score = 2*(precision*recall)/(precision + recall)
  print "total:"
  print total_pair
  print total_tp
  print total_tn
  print total_fn
  print total_fp

  print precision
  print recall
  print f_score
  print accuracy

  Parameters.append_result_to_file( precision, recall, f_score, accuracy)
  current_params = Parameters.get_current_params()
  current_params = copy.deepcopy(current_params)
  return f_score, current_params
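The evaluation bookkeeping in one place; the 0.00001 seeds above exist only to guard against division by zero when a count is empty. A minimal sketch:

def prf(tp, tn, fn, fp):
    # precision, recall, F-score and accuracy from raw confusion counts
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return precision, recall, f_score, accuracy

print(prf(8.0, 5.0, 2.0, 1.0))  # approx. (0.889, 0.8, 0.842, 0.8125)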