Esempio n. 1
0
def demo(scorer=None, compare_scorer=None):
    """Find bigram collocations in the files of the WebText corpus.

    For every WebText file this prints the file id, the 15 best bigrams
    according to ``scorer``, and the Spearman correlation between the
    ranking produced by ``scorer`` and the one produced by
    ``compare_scorer``.

    :param scorer: bigram association measure used to rank collocations;
        defaults to ``BigramAssocMeasures.likelihood_ratio``.
    :param compare_scorer: measure whose ranking is correlated against
        ``scorer``'s; defaults to ``BigramAssocMeasures.raw_freq``.
    """
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    # The filter runs once per word of every candidate bigram, so use a set
    # for O(1) membership instead of scanning the stopword list each time.
    ignored_words = set(stopwords.words('english'))

    def word_filter(w):
        # True means "drop": very short tokens and English stopwords.
        return len(w) < 3 or w.lower() in ignored_words

    for fileid in webtext.fileids():
        words = [word.lower() for word in webtext.words(fileid)]

        cf = BigramCollocationFinder.from_words(words)
        # Ignore bigrams seen fewer than 3 times, then drop filtered words.
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(fileid)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' %
              (compare_scorer.__name__,
               spearman_correlation(
                   ranks_from_scores(cf.score_ngrams(scorer)),
                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
Esempio n. 2
0
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus.

    Each WebText file is lower-cased and fed to a bigram collocation finder.
    Bigrams occurring fewer than 3 times, or containing a stopword or a token
    shorter than 3 characters, are discarded.  The function then prints the
    file id, the top 15 bigrams under ``scorer`` and the Spearman correlation
    between the ``scorer`` and ``compare_scorer`` rankings.

    :param scorer: association measure used for ranking; ``None`` selects
        ``BigramAssocMeasures.likelihood_ratio``.
    :param compare_scorer: association measure to compare against; ``None``
        selects ``BigramAssocMeasures.raw_freq``.
    """
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    # Built once as a frozenset: the word filter below is evaluated for every
    # word of every bigram, and a list would be scanned linearly each time.
    ignored_words = frozenset(stopwords.words('english'))

    def word_filter(w):
        # Returning True removes the word's bigrams from the finder.
        return len(w) < 3 or w.lower() in ignored_words

    for fileid in webtext.fileids():
        words = [word.lower()
                 for word in webtext.words(fileid)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        scorer_ranks = ranks_from_scores(cf.score_ngrams(scorer))
        compare_ranks = ranks_from_scores(cf.score_ngrams(compare_scorer))

        print(fileid)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__,
                                               spearman_correlation(
                                                   scorer_ranks,
                                                   compare_ranks)))
Esempio n. 3
0
    def test(self, inFile, embFile="emb_art_10.npy"):
        """Evaluate the embeddings against a word-similarity CSV file.

        Reads rows of ``(word_a, word_b, score)`` from ``inFile``, keeps the
        pairs whose words are both present in ``self.data.word2id``, computes
        the cosine similarity of the two words' embeddings and correlates
        those similarities with the scores from the file.

        Side effects: (re)sets ``self.cos_dict`` (keyed by word pair),
        ``self.cos_dict_id`` (keyed by id pair) and ``self.embeddings``, and
        prints two correlation values.

        :param inFile: path of a CSV file readable by ``pandas.read_csv``;
            every row must unpack into exactly three values.
        :param embFile: ``.npy`` file the embeddings are loaded from when
            ``self.skip_gram_model.v_embeddings`` is falsy.
        :return: element 0 of the ``spearmanr`` result (presumably the
            correlation coefficient of ``scipy.stats.spearmanr`` -- confirm
            against the file's imports).
        """
        self.cos_dict = dict()
        self.cos_dict_id = dict()

        # 1. Import wordsim353 and keep only the pairs known to the vocabulary.
        #    (Local is not called ``csv`` so it cannot shadow the stdlib module.)
        rows = np.array(pd.read_csv(inFile))

        idsim = dict()
        for (word_a, word_b, num) in rows:
            if word_a in self.data.word2id and word_b in self.data.word2id:
                idsim[(self.data.word2id[word_a],
                       self.data.word2id[word_b])] = num

        # 2. Load embeddings: from disk if the model has none, else from the model.
        # NOTE(review): this relies on the truthiness of ``v_embeddings``;
        # if "missing" is represented by None, ``is None`` would be clearer.
        if not self.skip_gram_model.v_embeddings:
            self.embeddings = np.load(embFile, allow_pickle=True)
        else:
            self.embeddings = self.skip_gram_model.v_embeddings.weight.cpu(
            ).data.numpy()

        # 3. Compute cosine similarities and collect the two score lists in
        #    the same (dict iteration) order for the correlation below.
        human_scores = []
        model_scores = []
        for (id_a, id_b), value in idsim.items():
            embeddings_a = self.embeddings[id_a].reshape(1, -1)
            embeddings_b = self.embeddings[id_b].reshape(1, -1)

            # ``np.asscalar`` was removed from NumPy; ``.item()`` is its
            # documented replacement and returns the same Python scalar.
            similarity = cosine_similarity(embeddings_a,
                                           embeddings_b)[0].item()

            self.cos_dict[(self.data.id2word[id_a],
                           self.data.id2word[id_b])] = similarity
            self.cos_dict_id[id_a, id_b] = similarity

            human_scores.append(value)
            model_scores.append(similarity)

        # NOTE(review): if ``spearman_correlation`` is NLTK's, it expects
        # rank mappings, whereas raw scores are passed here -- the printed
        # value may not be a true Spearman coefficient. The returned value
        # below does not depend on it.
        print("Spearman Coefficient:",
              spearman_correlation(self.cos_dict_id, idsim))
        spear = spearmanr(human_scores, model_scores)

        print(spear)

        return spear[0]
Esempio n. 4
0
    def wordsim353_spearman(self, input_filename):
        """Correlate model scores with the human scores of a word-pair CSV.

        The first row of ``input_filename`` is skipped; every following row
        supplies a target word, a context word and a numeric human score in
        its first three columns.  Both score lists are converted to ranks
        with ``ss.rankdata`` and the rankings, keyed by row position, are
        passed to ``spearman.spearman_correlation``.

        :param input_filename: path of the comma-separated input file.
        :return: the value returned by ``spearman.spearman_correlation``.
        :raises Exception: if a target or context word is missing from
            ``self.data.word2id``.
        """
        target_word, context_word, human_scores = [], [], []
        with open(input_filename) as csv_file:
            rows = csv.reader(csv_file, delimiter=',')
            next(rows, None)  # discard the first row
            for row in rows:
                target_word.append(row[0])
                context_word.append(row[1])
                human_scores.append(float(row[2]))

        # Validate every pair before scoring anything.
        for target, context in zip(target_word, context_word):
            if target not in self.data.word2id:
                raise Exception('Target word not in model vocab: ', target)
            if context not in self.data.word2id:
                raise Exception('Context word not in model vocab: ', context)

        human_rankings = ss.rankdata(human_scores)

        machine_scores = [
            self.calculate_probability(target, context)
            for target, context in zip(target_word, context_word)
        ]
        machine_rankings = ss.rankdata(machine_scores)

        # Key each rank by its row position so the two rankings line up.
        human_scores_dict = dict(enumerate(human_rankings))
        machine_scores_dict = dict(enumerate(machine_rankings))

        return spearman.spearman_correlation(human_scores_dict,
                                             machine_scores_dict)
from nltk.collocations import BigramAssocMeasures
from nltk import FreqDist
from nltk import bigrams
from nltk.metrics import spearman

# Script: POS-tag a two-column corpus, score its bigrams with PMI and
# Student's t, write the top 10 infinitive-initial bigrams of each measure to
# disk and print the Spearman correlation between the two top-10 rankings.
analyzer = MorphAnalyzer()
corpus = pd.read_csv("court-V-N.csv", header=None)
measures = BigramAssocMeasures()


def tagger(x):
    """Pair a token with the POS tag of its first morphological parse."""
    return (x, analyzer.parse(x.lower().strip())[0].tag.POS)


# Tag every cell, then drop column 0.
tagged_corpus = corpus.applymap(tagger).drop(0, axis=1)
with open("gold_standard.txt", "r") as gold_file:
    standard = [tuple(line.split()) for line in gold_file]

# Flatten once; both frequency distributions are built from the same tokens.
tagged_tokens = tagged_corpus.values.flatten()
wfd = FreqDist(tagged_tokens)
bfd = FreqDist(bigrams(tagged_tokens))
finder_1 = BigramCollocationFinder(wfd, bfd)


def infinitive_bigrams(scored):
    """Return the word tuples of scored n-grams whose first token is INFN.

    ``scored`` is a sequence of ``(ngram, score)`` pairs in which each ngram
    element is a ``(word, pos)`` pair as produced by ``tagger``.  The input
    order is preserved.  (Named so it does not shadow the builtin ``filter``.)
    """
    return [
        tuple(word for word, _pos in ngram)
        for ngram, _score in scored
        if ngram[0][1] == "INFN"
    ]


scored_pmi = infinitive_bigrams(finder_1.score_ngrams(measures.pmi))
scored_student = infinitive_bigrams(finder_1.score_ngrams(measures.student_t))
pmi_top = scored_pmi[:10]
student_top = scored_student[:10]

for name, top in [("pmi_top10.txt", pmi_top), ("student_top10.txt", student_top)]:
    with open(name, "w") as out_file:
        out_file.writelines(" ".join(pair) + "\n" for pair in top)

# spearman_correlation expects (item, rank) pairs or rank dicts, not bare item
# lists: passing the word tuples directly made dict() map word -> word and the
# rank subtraction fail on strings.  ranks_from_sequence assigns each bigram
# its position as rank; only bigrams present in both lists are compared.
print(spearman.spearman_correlation(
    spearman.ranks_from_sequence(pmi_top),
    spearman.ranks_from_sequence(student_top)))
print("Done")