コード例 #1
def getInternalSetWithReducing(arr_words, target_word, model):
    """Get IntS (internal, kernel) words for arr_words, reducing the set if needed.

    If |IntS(arr_words)| == 0, the word least similar to target_word is
    removed from the working list and IntS is computed again. This repeats
    until a non-empty IntS is found or fewer than 3 words remain.
    target_word itself is never removed from the working list.

    Parameters
    ----------
    arr_words : array of Strings
        Array of source words, for example a synonym set or a sentence's words.
    target_word : String
        Word expected to be among arr_words; it is kept during the reduction.
        Its vector is looked up with model[target_word] once a reduction step
        is needed, so it must be known to the model in that case.
    model : object
        Word2Vec model.

    Returns
    -------
    array of Strings
        Internal subset for the source list of words (arr_words) or for a
        reduced subset of arr_words.
        Empty array if there are no such words.

    Notes
    -----
    After every reduction step the remaining words are printed as a
    comma-separated line.
    """
    # after filter, now there are only words with vectors
    # NOTE(review): if filterVocabWords returns the caller's list itself (not
    # a copy), the remove() below mutates the caller's list - confirm.
    arr_words = filter_vocab_words.filterVocabWords(arr_words, model.vocab)

    # it is possible to calculate IntS only when there are >= 3 words
    while len(arr_words) >= 3:
        int_s = getInternalSet(arr_words, model)
        if int_s:
            return int_s

        # IntS is empty: find the word within arr_words that is the most
        # distant from the target word
        target_vector = model[target_word]
        word_remote = ""
        sim_min = 1.0
        for word in arr_words:
            if word == target_word:
                continue  # never delete the target word itself

            sim = 1 - spatial.distance.cosine(target_vector, model[word])
            if sim < sim_min:
                sim_min = sim
                word_remote = word

        if not word_remote:
            # no candidate has similarity below 1.0, nothing can be removed
            return []

        # shrink the set; without this step the loop would never progress
        arr_words.remove(word_remote)
        # single-argument print(...) behaves the same in Python 2 and 3
        print(string_util.joinUtf8(",", arr_words))

    return []
コード例 #2
def getInternalSet(arr_words, model):
    """Get IntS (internal, kernel) words for the words in arr_words.

    Every word w is taken out of the vocabulary-filtered list in turn. The
    remaining words are split into two non-empty groups S1 and S2, where S1
    is a contiguous slice of the remaining list and S2 is all other words.
    With sim(A, B) = model.n_similarity(A, B), the word w belongs to IntS
    iff for every such split
        (1) sim(S1 + w, S2) >= sim(S1, S2), and
        (2) sim(S1, S2 + w) >= sim(S1, S2),
    i.e. adding w to either group never makes the two groups less similar.

    Parameters
    ----------
    arr_words : array of Strings
        Array of source words, for example a synonym set or a sentence's words.
    model : object
        Word2Vec model (its `vocab` and `n_similarity` are used).

    Returns
    -------
    array of Strings
        Internal subset for the source list of words.
        Empty array if there are no such words or if fewer than 3 words
        remain after filtering.
    """
    DEBUG_PRINT = False  # set to True to trace every split and its similarities

    # after filter, now there are only words with vectors
    arr_words = filter_vocab_words.filterVocabWords(arr_words, model.vocab)

    # it is possible to calculate IntS only when there are >= 3 words
    if len(arr_words) < 3:
        return []

    result_int_s = []

    # let's take all splits for every 'out' element
    for i, test_word in enumerate(arr_words):
        # the remaining words once the element under consideration is extracted
        gr = arr_words[:i] + arr_words[i + 1:]

        sim12_greater_sim0_always = True
        for j in range(len(gr)):
            # k stops at len(gr) - 2 so that gr2 is never empty
            for k in range(j, len(gr) - 1):
                gr1 = gr[j:k + 1]
                gr2 = gr[:j] + gr[k + 1:]
                if DEBUG_PRINT:
                    print(u"{} | gr1={} | gr2={}".format(
                        test_word, string_util.joinUtf8(",", gr1),
                        string_util.joinUtf8(",", gr2)))

                # the test word must really be added to each side, otherwise
                # sim1 and sim2 are trivially equal to sim0
                gr1_and_test_word = gr1 + [test_word]
                gr2_and_test_word = gr2 + [test_word]

                sim0 = model.n_similarity(gr1, gr2)
                sim1 = model.n_similarity(gr1_and_test_word, gr2)
                sim2 = model.n_similarity(gr1, gr2_and_test_word)
                if DEBUG_PRINT:
                    print("sim0 = {:5.3f}".format(sim0))
                    print("sim1 = {:5.3f}".format(sim1))
                    print("sim2 = {:5.3f}".format(sim2))

                # one split where the word pushes the groups apart is enough
                # to exclude it from IntS
                if sim0 > sim1 or sim0 > sim2:
                    sim12_greater_sim0_always = False

            if DEBUG_PRINT:
                print("---")

        if sim12_greater_sim0_always:
            result_int_s.append(test_word)

        if DEBUG_PRINT:
            print("+++++++")

    return result_int_s
コード例 #3
#print "distance('снискивать', 'стяжать') = {}".format(sim1)

#         absent in model                             presented
#source_words = [u'убаюкивать', u'укачивать', u'усыплять', u'бронь']
#source_words = [u'доносить', u'осведомлять', u'докладывать', u'объявлять', u'заявлять', u'предупреждать', u'извещать', u'информировать', u'сообщать', u'уведомлять', u'оповещать']

# 2/6 = |IntS|/|S|, [[сосредоточиваться]],  IntS(сосредоточиваться сосредотачиваться)  OutS(собираться отвлекаться фокусироваться концентрироваться) 
# source_words = [u'сосредоточиваться', u'сосредотачиваться', u'собираться', u'отвлекаться', u'фокусироваться', u'концентрироваться']
# Sample word list: two extra words followed by the six-word synonym set
# described in the comments above.
source_words = [
    u'лить',
    u'кутить',
    u'сосредоточиваться',
    u'сосредотачиваться',
    u'собираться',
    u'отвлекаться',
    u'фокусироваться',
    u'концентрироваться',
]

# 0/6 = |IntS|/|S|, [[абсолют]],  OutS(абсолют логос первооснова творец совершенство идеал) 

# after filter, now there are only words with vectors
words = filter_vocab_words.filterVocabWords(source_words, model.vocab)

# Repeatedly ask the model for the odd-one-out word, report it and drop it,
# until the list is exhausted.
while len(words) > 0:
    out_word = model.doesnt_match(words)
    print(u"    - '{}'".format(out_word))
    words.remove(out_word)


#sim2 = model.n_similarity([u'убаюкивать', u'укачивать', u'усыплять', u'бронь'], [u'пробуждать', u'усыплять', u'бронь'])
#print "With    word 'бронь' in both lists sim={}".format( sim2 )
コード例 #4
def getInternalSet(arr_words, model):
    """Get IntS (internal, kernel) words for the words in arr_words.

    Every word w is taken out of the vocabulary-filtered list in turn. The
    remaining words are split into two non-empty groups S1 and S2, where S1
    is a contiguous slice of the remaining list and S2 is all other words.
    With sim(A, B) = model.n_similarity(A, B), the word w belongs to IntS
    iff for every such split
        (1) sim(S1 + w, S2) >= sim(S1, S2), and
        (2) sim(S1, S2 + w) >= sim(S1, S2),
    i.e. adding w to either group never makes the two groups less similar.

    Parameters
    ----------
    arr_words : array of Strings
        Array of source words, for example a synonym set or a sentence's words.
    model : object
        Word2Vec model (its `vocab` and `n_similarity` are used).

    Returns
    -------
    array of Strings
        Internal subset for the source list of words.
        Empty array if there are no such words or if fewer than 3 words
        remain after filtering.
    """
    DEBUG_PRINT = False  # set to True to trace every split and its similarities

    # after filter, now there are only words with vectors
    arr_words = filter_vocab_words.filterVocabWords(arr_words, model.vocab)

    # it is possible to calculate IntS only when there are >= 3 words
    if len(arr_words) < 3:
        return []

    result_int_s = []

    # let's take all splits for every 'out' element; enumerate supplies the
    # index that the former while-loop used without initialising it
    for i, test_word in enumerate(arr_words):
        # the remaining words once the element under consideration is extracted
        gr = arr_words[:i] + arr_words[i + 1:]

        sim12_greater_sim0_always = True
        for j in range(len(gr)):
            # k stops at len(gr) - 2 so that gr2 is never empty
            for k in range(j, len(gr) - 1):
                gr1 = gr[j:k + 1]
                gr2 = gr[:j] + gr[k + 1:]
                if DEBUG_PRINT:
                    print(u"{} | gr1={} | gr2={}".format(
                        test_word, string_util.joinUtf8(",", gr1),
                        string_util.joinUtf8(",", gr2)))

                gr1_and_test_word = gr1 + [test_word]
                gr2_and_test_word = gr2 + [test_word]

                sim0 = model.n_similarity(gr1, gr2)
                sim1 = model.n_similarity(gr1_and_test_word, gr2)
                sim2 = model.n_similarity(gr1, gr2_and_test_word)
                if DEBUG_PRINT:
                    print("sim0 = {:5.3f}".format(sim0))
                    print("sim1 = {:5.3f}".format(sim1))
                    print("sim2 = {:5.3f}".format(sim2))

                # one split where the word pushes the groups apart is enough
                # to exclude it from IntS
                if sim0 > sim1 or sim0 > sim2:
                    sim12_greater_sim0_always = False

            if DEBUG_PRINT:
                print("---")

        if sim12_greater_sim0_always:
            result_int_s.append(test_word)

        if DEBUG_PRINT:
            print("+++++++")

    return result_int_s
コード例 #5
def getInternalSetWithReducing(arr_words, target_word, model):
    """Get IntS (internal, kernel) words for arr_words, reducing the set if needed.

    If |IntS(arr_words)| == 0, the word least similar to target_word is
    removed from the working list and IntS is computed again. This repeats
    until a non-empty IntS is found or fewer than 3 words remain.
    target_word itself is never removed from the working list.

    Parameters
    ----------
    arr_words : array of Strings
        Array of source words, for example a synonym set or a sentence's words.
    target_word : String
        Word expected to be among arr_words; it is kept during the reduction.
        Its vector is looked up with model[target_word] once a reduction step
        is needed, so it must be known to the model in that case.
    model : object
        Word2Vec model.

    Returns
    -------
    array of Strings
        Internal subset for the source list of words (arr_words) or for a
        reduced subset of arr_words.
        Empty array if there are no such words.

    Notes
    -----
    After every reduction step the remaining words are printed as a
    comma-separated line.
    """
    # after filter, now there are only words with vectors
    # NOTE(review): if filterVocabWords returns the caller's list itself (not
    # a copy), the remove() below mutates the caller's list - confirm.
    arr_words = filter_vocab_words.filterVocabWords(arr_words, model.vocab)

    # it is possible to calculate IntS only when there are >= 3 words
    while len(arr_words) >= 3:
        int_s = getInternalSet(arr_words, model)
        if int_s:
            return int_s

        # IntS is empty: find the word within arr_words that is the most
        # distant from the target word
        target_vector = model[target_word]
        word_remote = ""
        sim_min = 1.0
        for word in arr_words:
            if word == target_word:
                continue  # never delete the target word itself

            sim = 1 - spatial.distance.cosine(target_vector, model[word])
            if sim < sim_min:
                sim_min = sim
                word_remote = word

        if not word_remote:
            # no candidate has similarity below 1.0, nothing can be removed
            return []

        arr_words.remove(word_remote)
        # single-argument print(...) behaves the same in Python 2 and 3
        print(string_util.joinUtf8(",", arr_words))

    return []
コード例 #6
#         absent in model                             presented
#source_words = [u'убаюкивать', u'укачивать', u'усыплять', u'бронь']
#source_words = [u'доносить', u'осведомлять', u'докладывать', u'объявлять', u'заявлять', u'предупреждать', u'извещать', u'информировать', u'сообщать', u'уведомлять', u'оповещать']

# 2/6 = |IntS|/|S|, [[сосредоточиваться]],  IntS(сосредоточиваться сосредотачиваться)  OutS(собираться отвлекаться фокусироваться концентрироваться)
# source_words = [u'сосредоточиваться', u'сосредотачиваться', u'собираться', u'отвлекаться', u'фокусироваться', u'концентрироваться']
# Sample word list: two extra words followed by the six-word synonym set
# described in the comments above.
source_words = [
    u'лить', u'кутить', u'сосредоточиваться', u'сосредотачиваться',
    u'собираться', u'отвлекаться', u'фокусироваться', u'концентрироваться'
]

# 0/6 = |IntS|/|S|, [[абсолют]],  OutS(абсолют логос первооснова творец совершенство идеал)

# after filter, now there are only words with vectors
words = filter_vocab_words.filterVocabWords(source_words, model.vocab)
#print string_util.joinUtf8( ",", words )

# Repeatedly report the word the model considers the odd one out and drop it.
while words:
    #print string_util.joinUtf8( ",", words )
    out_word = model.doesnt_match(words)
    print(u"    - '{}'".format(out_word))
    # the list must shrink on every pass, otherwise the loop never ends
    words.remove(out_word)


#sim2 = model.n_similarity([u'убаюкивать', u'укачивать', u'усыплять', u'бронь'], [u'пробуждать', u'усыплять', u'бронь'])
#print "With    word 'бронь' in both lists sim={}".format( sim2 )

# Similarity between the two word groups when 'бронь' is in the first list only.
sim3 = model.n_similarity([u'убаюкивать', u'укачивать', u'усыплять', u'бронь'],
                          [u'пробуждать', u'усыплять'])