Esempio n. 1
0
# Change the log config file to relative path
from dnnwsd.utils import setup_logging
setup_logging.CONFIG_FILE = u"../config/logging.yaml"

from dnnwsd.corpus import sensem, unannotated
from dnnwsd.processor import vecprocessor

# Input/output locations for this script. All of them are relative paths, so
# the script presumably has to be launched from the directory they are
# relative to -- TODO confirm the expected working directory.
annotated_corpus_directory = "../resources/sensem/"
unannotated_corpus_directory = "../../wikicorpus/es/wikicorpus_lemmas_sample_7k/"
# NOTE(review): the next two names are not used in the visible part of the
# script; they are presumably consumed further down.
pos_tags_file = "../resources/semisupervised_features/es/pos_tags"
corpus_datasets_dir = "../resources/corpus_datasets/es/7k/vecpos"

# Iterator over the annotated corpora; the loop below enumerates it and reads
# `has_multiple_senses()` and `lemma` from every item it yields.
annotated_corpus_directory_iterator = sensem.SenSemCorpusDirectoryIterator(
    annotated_corpus_directory)
# Collection of unannotated corpora; the loop below subscripts it to pick the
# unannotated corpus that goes with the current annotated one.
unannotated_corpus_directory_iterator = unannotated.UnannotatedCorpusDirectoryIterator(
    unannotated_corpus_directory, corpus_name='sensem')

# Location of the pre-trained word vectors. The file is read in binary
# word2vec format (see `binary=True` below).
word_vectors_path = "../resources/wordvectors/SBW-vectors-300-min5.bin.gz"

# `gensim` is used below but is not imported anywhere in the visible part of
# this script; import it here so loading the vectors does not fail with a
# NameError. A repeated import is harmless if it is already imported above.
import gensim

# NOTE(review): newer gensim releases expose this loader as
# `gensim.models.KeyedVectors.load_word2vec_format` -- confirm which gensim
# version the project pins before switching the call.
word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
    word_vectors_path, binary=True)

for corpus_index, annotated_corpus in enumerate(
        annotated_corpus_directory_iterator):
    if not annotated_corpus.has_multiple_senses(
    ) or annotated_corpus.lemma == u'estar':
        print u"Skipping preprocess for corpus of lemma {}".format(
            annotated_corpus.lemma)
        continue

    unannotated_corpus = unannotated_corpus_directory_iterator[
Esempio n. 2
0
import numpy as np

# Change the log config file to relative path
from dnnwsd.utils import setup_logging
setup_logging.CONFIG_FILE = u"../config/logging.yaml"

from dnnwsd.corpus import semeval, unannotated
from dnnwsd.processor import bowprocessor

# Input/output locations for this script. All of them are relative paths, so
# the script presumably has to be launched from the directory they are
# relative to -- TODO confirm the expected working directory.
annotated_corpus_directory = "../resources/semeval/lexelts"
unannotated_corpus_directory = "../../wikicorpus/en/wikicorpus_lemmas_sample_7k/"
# NOTE(review): not used in the visible part of the script; presumably
# consumed further down.
corpus_datasets_dir = "../resources/corpus_datasets/en/7k/bow"

# Iterator over the annotated corpora; the loop below enumerates it and reads
# `has_multiple_senses()` and `lemma` from every item it yields.
annotated_corpus_directory_iterator = semeval.SemevalCorpusDirectoryIterator(
    annotated_corpus_directory)
# Collection of unannotated corpora; the loop below subscripts it with the
# lemma of the current annotated corpus.
unannotated_corpus_directory_iterator = unannotated.UnannotatedCorpusDirectoryIterator(
    unannotated_corpus_directory)

# Directory under which the loop below builds one "<index>.p" path per corpus.
semisupervised_features_directory = "../resources/semisupervised_features/en/"

# `path` is used inside the loop but is never imported in the visible part of
# this script; bring it into scope here so building the features path does not
# raise a NameError. A repeated import is harmless if it already exists above.
from os import path

# Walk every annotated corpus. `corpus_index` is the position in the full
# iteration, so corpora skipped below still consume an index.
for corpus_index, annotated_corpus in enumerate(
        annotated_corpus_directory_iterator):
    # Skip corpora for which `has_multiple_senses()` is false.
    if not annotated_corpus.has_multiple_senses():
        # Parenthesized single-argument print: same output under Python 2,
        # and the statement also parses under Python 3.
        print(u"Skipping preprocess for corpus of lemma {}".format(
            annotated_corpus.lemma))
        continue

    # Unannotated corpus looked up by the lemma of the annotated one.
    unannotated_corpus = unannotated_corpus_directory_iterator[
        annotated_corpus.lemma]

    # Per-corpus file path: zero-padded, three-digit index plus a ".p" suffix.
    semisupervised_features_path = path.join(
        semisupervised_features_directory, "{:03d}.p".format(corpus_index))
Esempio n. 3
0
# -*- coding: utf-8 -*-

import cPickle as pickle
import os
from dnnwsd.corpus import semeval, unannotated
from collections import defaultdict

# Iterator over the annotated corpora; the loop below walks it and reads
# `has_multiple_senses()` and `lemma` from every item it yields.
corpus_iter = semeval.SemevalCorpusDirectoryIterator(
    "resources/semeval/lexelts")
# Collection of unannotated corpora; the loop below subscripts it with the
# lemma of the current annotated corpus.
unannotated_corpus_iter = unannotated.UnannotatedCorpusDirectoryIterator(
    "../wikicorpus/en/wikicorpus_lemmas")

# Distinct `word.tag` values collected by the loop below. It is created once,
# outside the loop, so it accumulates across all lemmas.
tags = set()

for annotated_corpus in corpus_iter:
    if not annotated_corpus.has_multiple_senses():
        print u"Skipping experiments pipeline for lemma {}.".format(
            annotated_corpus.lemma)
        print u"The corpus doesn't have enough senses"
        continue

    tokens = defaultdict(int)
    unannotated_corpus = unannotated_corpus_iter[annotated_corpus.lemma]

    for sentence in annotated_corpus:
        for word in sentence.predicate_window(5):
            tags.add(word.tag)
            tokens[word.token] += 1

    for sentence in unannotated_corpus:
        for word in sentence.predicate_window(5):