# Point the miner at its database file, then run its build step.
data.set_database_filename("data/wikidump.db")
data.build()
# Report the time elapsed since `t`; `t` is presumably a time.time() stamp
# taken at the start of this step (set before this excerpt -- confirm).
print("Done,", time.time() - t, "s")

# Load the ambiguous words: one entry per line of the text file, with
# trailing whitespace stripped and blank lines ignored.
t = time.time()
print("============== Building list of ambiguous words ===================")
filename_ambiguouswords = "data/ambiguous_words.txt"
with open(filename_ambiguouswords, 'r') as f:
    # Iterating the file yields its lines; rstrip drops the newline and any
    # other trailing whitespace from each one.
    ambiguous_words = set(map(str.rstrip, f))
# Blank (or whitespace-only) lines collapse to "", which is not a word.
ambiguous_words.discard("")

nb_ambiguous_words = len(ambiguous_words)
print("Done,", time.time() - t, "s")

# Build one pickled corpus per ambiguous word under data/corpora/.
# Words whose dump file already exists are skipped, so an interrupted run
# can be restarted without redoing finished words.
t = time.time()
print("======================== Build corpora ============================")
# Count from 1 so the progress display ends at (N/N) rather than (N-1/N).
for n, w in enumerate(ambiguous_words, 1):
    t2 = time.time()
    print("%s (%d/%d)" % (w, n, nb_ambiguous_words))
    filename = "data/corpora/" + w + ".dump"
    if os.path.isfile(filename):
        print("Already done.")
        continue
    corpus = data.get_corpus(w)
    # Write to a temporary file and rename it into place: if the run is
    # interrupted in the middle of the dump, no truncated "<word>.dump" is
    # left behind to be mistaken for a finished corpus by the isfile()
    # check above on the next run.
    tmp_filename = filename + ".tmp"
    with open(tmp_filename, 'wb') as f:
        pickle.dump(corpus, f)
    os.replace(tmp_filename, filename)
    print("ok (%f s)" % (time.time() - t2))
print("Done,", time.time() - t, "s")

Exemple #2
0
from ambiruptor.library.learners.models import DecisionTreeClassifier
from ambiruptor.library.learners.models import RandomForestClassifier

from ambiruptor.library.miners.wiki_miners import DataMining


if __name__ == '__main__':

    # Data Mining
    print("************************** Data mining ***************************")
    t = time.time()
    data = DataMining()
    data.set_wikidump_filename("data/wikidump.xml")
    data.set_database_filename("data/wikidump.db")
    data.build()
    corpus = data.get_corpus("Bar_(disambiguation)")
    print("Size of the corpus:", len(corpus), "articles")
    print("Done,", time.time() - t, "s")

    # Building features
    print("********************** Building/Loading features *************************")
    t = time.time()
    feature_extractor = fe.AmbiguousExtraction()
    if os.path.isfile("data/feature_extractors/test.dump"):
        print("Loading feature extractor...")
        feature_extractor.load("data/feature_extractors/test.dump")
        corpus_extractor = fe.CorpusExtraction()
        for f in feature_extractor.features:
            corpus_extractor.add_feature(f)
    else:
        print("Building feature extractor...")