def main():
    args = parser.parse_args()
    # e.g. dialect = ['pa', 'sy']
    dialect = [args.dialect_one, args.dialect_two]

    folder = args.corpus_folder + '/'
    # e.g. clean_data/comparable/msa/, clean_data/comparable/egypt/
    corpus_files = [folder + dialect[0] + '.txt', folder + dialect[1] + '.txt']

    dictionary, corpus = models.build_comparable_ldamodel_training(folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)

    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']
    hellinger_summation = 0
    jaccard_summation = 0
    pair_count = 0
    for file in os.listdir(folders[0]):
        try:
            if os.path.splitext(file)[1] != '.txt':
                continue
            first_filepath = os.path.join(folders[0], file)
            second_filepath = os.path.join(folders[1], file)

            with open(first_filepath, encoding='utf-8') as f:
                first_dialect = f.read().split()
            with open(second_filepath, encoding='utf-8') as f:
                second_dialect = f.read().split()

            # convert both documents to bag-of-words format
            bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
            bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)

            # LDA topic distributions for the two documents
            lda_bow_first_dialect = lda_model[bow_first_dialect]
            lda_bow_second_dialect = lda_model[bow_second_dialect]

            hellinger_distance = hellinger(lda_bow_first_dialect, lda_bow_second_dialect)
            jaccard_distance = jaccard(bow_first_dialect, bow_second_dialect)
            print('Hellinger distance between 1 and 2')
            print(hellinger_distance)
            print('Jaccard distance')
            print(jaccard_distance)
            hellinger_summation += hellinger_distance
            jaccard_summation += jaccard_distance
            pair_count += 1
        except OSError:
            # skip file pairs that cannot be read
            continue

    # average over the processed pairs (the original divided by a
    # hard-coded 10197, presumably the total number of file pairs)
    print('total hellinger = ', hellinger_summation / pair_count)
    print('Total JC = ', jaccard_summation / pair_count)
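The function reads its arguments from a module-level `parser` (and `os`, `models`, `hellinger`, `jaccard` imports) that the snippet does not show; a minimal sketch of the assumed setup, with argument names inferred from the `args.*` attributes used above:

import argparse

# assumed setup, not part of the original snippet
parser = argparse.ArgumentParser(
    description='LDA topic distance between two dialect corpora')
parser.add_argument('dialect_one')
parser.add_argument('dialect_two')
parser.add_argument('corpus_folder')

if __name__ == '__main__':
    main()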
def comparable_corpus_distance(folder, dialect):
    dictionary, corpus = models.build_comparable_ldamodel_training(
        folder, dialect)
    lda_model = models.build_ldamodel(corpus, dictionary)
    folders = [folder + dialect[0] + '/', folder + dialect[1] + '/']

    hellinger_summation = 0
    jaccard_summation = 0
    pair_count = 0
    for file in os.listdir(folders[0]):
        try:
            if os.path.splitext(file)[1] != '.txt':
                continue
            first_filepath = os.path.join(folders[0], file)
            second_filepath = os.path.join(folders[1], file)

            with open(first_filepath, encoding='utf-8') as f:
                first_dialect = f.read().split()
            with open(second_filepath, encoding='utf-8') as f:
                second_dialect = f.read().split()

            # convert both documents to bag-of-words format
            bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
            bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)

            # LDA topic distributions for the two documents
            lda_bow_first_dialect = lda_model[bow_first_dialect]
            lda_bow_second_dialect = lda_model[bow_second_dialect]

            hellinger_distance = hellinger(lda_bow_first_dialect,
                                           lda_bow_second_dialect)
            jaccard_distance = jaccard(bow_first_dialect, bow_second_dialect)
            print('Hellinger distance between 1 and 2')
            print(hellinger_distance)
            print('Jaccard distance')
            print(jaccard_distance)
            hellinger_summation += hellinger_distance
            jaccard_summation += jaccard_distance
            pair_count += 1
        except OSError:
            # skip file pairs that cannot be read
            continue

    # average over the processed pairs (the original divided by a
    # hard-coded 10197, presumably the total number of file pairs)
    print('total hellinger = ', hellinger_summation / pair_count)
    print('Total JC = ', jaccard_summation / pair_count)
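main() and comparable_corpus_distance() repeat the same per-file loop; one possible refactor (a sketch, not part of the original code) extracts the pair distances into a helper that both callers could share:

def file_pair_distances(lda_model, first_filepath, second_filepath):
    # hypothetical helper: Hellinger and Jaccard distances for one file pair
    with open(first_filepath, encoding='utf-8') as f:
        first_dialect = f.read().split()
    with open(second_filepath, encoding='utf-8') as f:
        second_dialect = f.read().split()
    bow_first = lda_model.id2word.doc2bow(first_dialect)
    bow_second = lda_model.id2word.doc2bow(second_dialect)
    return (hellinger(lda_model[bow_first], lda_model[bow_second]),
            jaccard(bow_first, bow_second))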
    def test_distributions(self):
        # checking bag of words as inputs
        vec_1 = [(2, 1), (3, 4), (4, 1), (5, 1), (1, 1), (7, 2)]
        vec_2 = [(1, 1), (3, 8), (4, 1)]
        result = matutils.jaccard(vec_2, vec_1)
        expected = 1 - 0.3
        self.assertAlmostEqual(expected, result)

        # checking ndarray, csr_matrix as inputs
        vec_1 = np.array([[1, 3], [0, 4], [2, 3]])
        vec_2 = csr_matrix([[1, 4], [0, 2], [2, 2]])
        result = matutils.jaccard(vec_1, vec_2)
        expected = 1 - 0.388888888889
        self.assertAlmostEqual(expected, result)

        # checking ndarray, list as inputs
        vec_1 = np.array([6, 1, 2, 3])
        vec_2 = [4, 3, 2, 5]
        result = matutils.jaccard(vec_1, vec_2)
        expected = 1 - 0.333333333333
        self.assertAlmostEqual(expected, result)
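The expected values can be checked by hand (a brief note, reverse-engineered from the assertions above). For bag-of-words inputs, jaccard sums the minimum counts of the shared ids and divides by the total count mass of both vectors: in the first case the common ids 1, 3 and 4 contribute min(1,1) + min(4,8) + min(1,1) = 6 against a union of 10 + 10 = 20 counts, giving 1 - 6/20 = 1 - 0.3. The 2D ndarray/csr_matrix pair is read the same way as (id, weight) rows: intersection 7 over union 18, i.e. 1 - 0.388888888889. Flat dense inputs are instead treated as sets of elements: |{6,1,2,3} ∩ {4,3,2,5}| = 2 over 6 distinct values gives 1 - 1/3.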
def distance_metrics_Jaccard(text_standart, textsList):
    # Collect the lemma lists of all texts
    data_lemmatized_list = []
    data_lemmatized_list.append(text_standart.lemma_text)
    for text in textsList:
        data_lemmatized_list.append(text.lemma_text)

    # LDA model for finding the topics of the texts
    models = Models()
    model_LDA = models.text_LDA(data_lemmatized_list)

    # Build the bags of words
    bow_text_standart = model_LDA.id2word.doc2bow(text_standart.lemma_text)
    for text in textsList:
        bow_text = model_LDA.id2word.doc2bow(text.lemma_text)
        # print("jaccard [0 - подобны; 1 - не подобны]")
        # print(jaccard(bow_text_standart, bow_text))
        text.jaccard_coeff = round(jaccard(bow_text_standart, bow_text), 2)
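Note that distance_metrics_Jaccard returns nothing; it writes the score onto each Text object. A hypothetical call site:

# hypothetical usage with Text objects exposing .lemma_text
distance_metrics_Jaccard(standard_text, other_texts)
for text in other_texts:
    print(text.jaccard_coeff)  # 0 - similar; 1 - dissimilar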
    def Jaccard_similiarity(self,
                            corpus,
                            corpus_model_user_description,
                            num_best=5):
        """For each user query, compute the Jaccard distance with respect to each hotel."""
        length = len(corpus_model_user_description)
        queryXhotel = np.zeros((length, len(corpus)))

        for i in range(length):
            for j in range(len(corpus)):
                queryXhotel[i][j] = jaccard(corpus_model_user_description[i],
                                            corpus[j])

        #np.save('jaccard_similiarity', queryXhotel)
        accuracy_array = self.make_accuracy_array(queryXhotel,
                                                  num_best,
                                                  bol=False)

        return accuracy_array
    def process(self, udpipe: str, reference: str, other: str):
        # init
        reference_text = Text(reference, udpipe)
        other_text = Text(other, udpipe)

        # Collect the lemma lists of both texts
        data_lemmatized_list = [
            reference_text.lemma_text, other_text.lemma_text
        ]

        # LDA model for finding the topics of the texts
        models = Models()
        model_LDA = models.text_LDA(data_lemmatized_list)

        # Build the bags of words
        bow_reference = model_LDA.id2word.doc2bow(reference_text.lemma_text)
        bow_other = model_LDA.id2word.doc2bow(other_text.lemma_text)
        # print("jaccard [0 - подобны; 1 - не подобны]")
        # print(jaccard(bow_text_standart, bow_text))
        other_text.jaccard_coeff = round(jaccard(bow_reference, bow_other), 2)

        return other_text.jaccard_coeff
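A design note: process() trains a fresh LDA model for every reference/other pair, whereas distance_metrics_Jaccard above fits a single model over all lemma lists and then scores each text against the standard; the latter is the cheaper design when one reference is compared against many texts.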
def corpus_distance(folder, dialect, corpus_files):
    dictionary, corpus = models.build_ldamodel_training(folder, dialect)

    # dictionary, corpus = premodel.upload_data(dialect)

    # print('here', len(corpus))
    lda_model = models.build_ldamodel(corpus, dictionary)

    # now we load the two dialect corpora to measure the distance between them
    with open(corpus_files[0], encoding='utf-8') as f:
        first_dialect = f.read().split()

    with open(corpus_files[1], encoding='utf-8') as f:
        second_dialect = f.read().split()
    # now let's make these into a bag of words format

    bow_first_dialect = lda_model.id2word.doc2bow(first_dialect)
    bow_second_dialect = lda_model.id2word.doc2bow(second_dialect)

    # we can now get the LDA topic distributions for these
    lda_bow_first_dialect = lda_model[bow_first_dialect]
    lda_bow_second_dialect = lda_model[bow_second_dialect]

    print('Hellinger distance between 1 and 2')
    print(hellinger(lda_bow_first_dialect, lda_bow_second_dialect))

    print('Jaccard distance')
    print(jaccard(bow_first_dialect, bow_second_dialect))

    print('kullback_leibler from 1 to 2')
    # print(kullback_leibler(lda_bow_first_dialect, lda_bow_second_dialect))

    print('kullback_leibler from 2 to 1')
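The kullback_leibler calls above are left commented out, very likely because KL divergence blows up whenever the second distribution assigns (near-)zero mass to a topic the first one uses, and gensim's LDA drops low-probability topics from its sparse output by default. One workaround (a sketch against the same objects as above, assuming lda_model is a gensim LdaModel) is to request the full topic distributions before comparing:

from gensim.matutils import kullback_leibler

# keep even near-zero topics so the KL divergence stays finite
full_first = lda_model.get_document_topics(bow_first_dialect, minimum_probability=0)
full_second = lda_model.get_document_topics(bow_second_dialect, minimum_probability=0)
print(kullback_leibler(full_first, full_second, num_features=lda_model.num_topics))  # KL from 1 to 2
print(kullback_leibler(full_second, full_first, num_features=lda_model.num_topics))  # KL from 2 to 1: not symmetric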
tfidf_bow_water = tfidf[bow_water]
tfidf_bow_finance = tfidf[bow_finance]
tfidf_bow_bank = tfidf[bow_bank]

from gensim.matutils import kullback_leibler, jaccard, hellinger

hellinger(lda_bow_water, lda_bow_finance)
hellinger(lda_bow_finance, lda_bow_bank)
hellinger(lda_bow_bank, lda_bow_water)

hellinger(lda_bow_finance, lda_bow_water)
kullback_leibler(lda_bow_water, lda_bow_bank)
kullback_leibler(lda_bow_bank, lda_bow_water)


jaccard(bow_water, bow_bank)
jaccard(doc_water, doc_bank)
jaccard(['word'], ['word'])

def make_topics_bow(topic):
    # takes the string returned by model.show_topics()
    # split on '+' to separate the word/probability pairs
    topic = topic.split('+')
    # list to store the topic bow
    topic_bow = []
    for word in topic:
        # split probability and word
        prob, word = word.split('*')
        # get rid of spaces
        word = word.replace(" ", "")
        # convert to word_type, i.e. the dictionary id of the word
        # (completion sketch: the snippet was cut off here; assumes `model`
        # is the trained LdaModel)
        word = model.id2word.doc2bow(word.split())[0][0]
        topic_bow.append((word, float(prob)))
    return topic_bow
from gensim.matutils import jaccard
import pickle
from nltk.stem.porter import PorterStemmer
from gensim import corpora

stemmer = PorterStemmer()

document = []

with open("../data/cosine.p", "rb") as f:
    topic_dict, cosine = pickle.load(f)

for key, value in topic_dict.items():
    text_tokens = [stemmer.stem(item) for item in key.split()]
    text_key = ' '.join(text_tokens)
    text = text_key + " " + value
    document.append(text)

texts = [doc.split() for doc in document]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# pairwise Jaccard distances (as percentages) over all document pairs
n = len(corpus)  # 1680 documents in the original data
jaccard_matrix = [round(jaccard(corpus[i], corpus[j]) * 100.0, 5)
                  for i in range(n) for j in range(n)]

with open("../data/jaccard.p", "wb") as f:
    pickle.dump(jaccard_matrix, f)
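Since the Jaccard distance is symmetric and zero for i == j, the flat n x n comprehension above does roughly twice the necessary work; a sketch that computes only the upper triangle and mirrors it:

import numpy as np

n = len(corpus)
jaccard_array = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = round(jaccard(corpus[i], corpus[j]) * 100.0, 5)
        jaccard_array[i, j] = jaccard_array[j, i] = d  # jaccard(a, b) == jaccard(b, a)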
# assumes re, string, sentences, pStemmer, STOPWORDS and the similarities
# list are defined earlier in the (truncated) script
threshold = 0.0
regex = re.compile('[%s]' % re.escape(string.punctuation))
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        try:
            sen1 = [
                pStemmer.stem(word)
                for word in regex.sub('', sentences[i].lower()).split(" ")
                if word not in STOPWORDS
            ]
            sen2 = [
                pStemmer.stem(word)
                for word in regex.sub('', sentences[j].lower()).split(" ")
                if word not in STOPWORDS
            ]
        except TypeError:
            continue
        except UnicodeError:
            # Python 2 fallback: decode byte strings before stemming
            sen1 = [
                pStemmer.stem(word.decode("utf-8"))
                for word in regex.sub('', sentences[i].lower()).split(" ")
                if word not in STOPWORDS
            ]
            sen2 = [
                pStemmer.stem(word.decode("utf-8"))
                for word in regex.sub('', sentences[j].lower()).split(" ")
                if word not in STOPWORDS
            ]
        # jaccard() returns a distance, so 1 - distance is a similarity;
        # computed outside the try block so it is always defined
        simScore = 1 - jaccard(sen1, sen2)
        if simScore > threshold:
            similarities.append((simScore, sentences[i], sentences[j]))
Beispiel #12
0
        filecontent = filecontent + word + ' '
        documents.append(filecontent)
    stoplist = set(stopwords.words('english'))
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    basetext = []
    for sublist in texts:
        for item in sublist:
            basetext.append(item)
    bow_1 = lda.id2word.doc2bow(basetext)
    lda_1 = lda[bow_1]
    print("******************", filename)
    print("hellinger", hellinger(lda_1, lda_2))
    print("kullback_leibler", kullback_leibler(lda_1, lda_2))
    # note: this applies jaccard to topic distributions, treating them
    # as bags of words rather than documents
    print("jaccard", jaccard(lda_1, lda_2))
    file.close()
    #dictionary = corpora.Dictionary(texts)
    #corpus = [dictionary.doc2bow(text) for text in texts]
    #lda1 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, update_every=1, chunksize=10000, passes=5)
#print(lda1)
#print(texts)
"""
basetext=[]
for list in texts:
    for item in list:
        basetext.append(item)
#print(len(basetext))
#print(basetext)

bow_1 = lda.id2word.doc2bow(basetext)
    max_p = max(probabilities)
    topic = topics[probabilities.index(max_p)]
    return topic

colors = ["skyblue", "pink", "red", "green", "yellow", "cyan", "purple", "magenta", "orange", "blue"]
def get_node_color(i):
    return colors[get_most_likely_topic(texts[i])]

G = nx.Graph()
for i, _ in enumerate(texts):
    G.add_node(i)

for (i1, i2) in itertools.combinations(range(len(texts)), 2):
    bow1, bow2 = texts[i1], texts[i2]
    distance = jaccard(bow1, bow2)
    if distance > 0.001:
        G.add_edge(i1, i2, weight=1 / distance)

pos = nx.spring_layout(G)

threshold = 1.04
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > threshold]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= threshold]

node_colors = [get_node_color(i) for (i, _) in enumerate(texts)]
nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors)
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed')
nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')
plt.show()
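Because edges are weighted by the inverse Jaccard distance, more similar document pairs get heavier edges; the weight threshold of 1.04 corresponds to a Jaccard distance of roughly 0.96 (1/1.04), so solid edges mark pairs with at least some token overlap while near-disjoint pairs are drawn dashed.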
# Leibler_divergence>`_ and `Hellinger
# <https://en.wikipedia.org/wiki/Hellinger_distance>`_ to figure out what suits
# your needs.
# 

###############################################################################
# Jaccard
# -------
# 
# Let us now look at the `Jaccard Distance
# <https://en.wikipedia.org/wiki/Jaccard_index>`_ metric for similarity between
# bags of words (i.e., documents).
# 
from gensim.matutils import jaccard

print(jaccard(bow_water, bow_bank))
print(jaccard(doc_water, doc_bank))
print(jaccard(['word'], ['word']))

###############################################################################
# The three examples above feature two different input methods.
#
# In the first case, we present jaccard with document vectors already in
# bag-of-words format. The distance is defined as 1 minus the size of the
# intersection divided by the size of the union of the two vectors.
#
# We can see (on manual inspection as well) that the distance is likely to be
# high - and it is.
#
# The last two examples illustrate the ability of jaccard to accept even lists
# (i.e., documents) as inputs.
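
###############################################################################
# As a quick sanity check, we can reproduce the bag-of-words case by hand
# (a sketch with made-up bags of words, not part of the original tutorial):
# sum the minimum counts over the shared ids (the intersection) and divide
# by the total count mass of both vectors (the union).
#

bow_a = [(0, 2), (1, 1), (2, 3)]  # hypothetical documents
bow_b = [(1, 2), (2, 1), (3, 4)]

union = sum(w for _, w in bow_a) + sum(w for _, w in bow_b)  # 6 + 7 = 13
counts_a = dict(bow_a)
intersection = sum(min(w, counts_a.get(i, 0)) for i, w in bow_b)  # 1 + 1 = 2

print(1 - intersection / union)  # 0.8461...
print(jaccard(bow_a, bow_b))     # should match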