import time, pickle, os.path
from ambiruptor.library.miners.wiki_miners import DataMining

# Stage 1: configure a DataMining instance with the wiki dump path and the
# database path, run its build step, and report the wall-clock time taken.
t = time.time()
print("======================== Build database ===========================")
data = DataMining()
data.set_wikidump_filename("data/wikidump.xml")
# NOTE(review): presumably the database file is what build() produces from
# the dump -- confirm against DataMining.
data.set_database_filename("data/wikidump.db")
data.build()
elapsed = time.time() - t  # seconds since the stage started
print("Done,", elapsed, "s")

# Stage 2: load the set of ambiguous words (one word per line) from a text
# file and report the wall-clock time taken.
t = time.time()
print("============== Building list of ambiguous words ===================")
filename_ambiguouswords = "data/ambiguous_words.txt"
# Decode explicitly instead of relying on the locale's default encoding, so
# non-ASCII words are read identically on every platform.
# NOTE(review): assumes the word list is UTF-8 encoded -- confirm.
with open(filename_ambiguouswords, 'r', encoding="utf-8") as f:
    # Iterate the file directly (no intermediate list); rstrip() drops the
    # newline and any other trailing whitespace; the set removes duplicates.
    ambiguous_words = {line.rstrip() for line in f}
# Blank or whitespace-only lines collapse to "", which is not a word.
# discard() is a no-op when "" is absent.
ambiguous_words.discard("")

nb_ambiguous_words = len(ambiguous_words)
print("Done,", time.time() - t, "s")

# Stage 3: walk every ambiguous word, print progress, and skip the words
# whose corpus dump file already exists on disk.
t = time.time()
print("======================== Build corpora ============================")
for n,w in enumerate(ambiguous_words):
    # Per-word start time; not read in the visible part of the loop.
    t2 = time.time()
    # enumerate() is 0-based: show n + 1 so progress runs 1/N .. N/N instead
    # of 0/N .. (N-1)/N. `n` itself is left unchanged.
    print("%s (%d/%d)" % (w, n + 1, nb_ambiguous_words))
    filename = "data/corpora/" + w + ".dump"
    if os.path.isfile(filename):
        # A dump for this word is already present: nothing more to do for it.
        print("Already done.")
        continue
Exemple #2
0
from ambiruptor.library.learners.models import LinearSVMClassifier
from ambiruptor.library.learners.models import RbfSVMClassifier
from ambiruptor.library.learners.models import NaiveBayesClassifier
from ambiruptor.library.learners.models import DecisionTreeClassifier
from ambiruptor.library.learners.models import RandomForestClassifier

from ambiruptor.library.miners.wiki_miners import DataMining


if __name__ == '__main__':

    # Data Mining
    print("************************** Data mining ***************************")
    t = time.time()
    data = DataMining()
    data.set_wikidump_filename("data/wikidump.xml")
    data.set_database_filename("data/wikidump.db")
    data.build()
    corpus = data.get_corpus("Bar_(disambiguation)")
    print("Size of the corpus:", len(corpus), "articles")
    print("Done,", time.time() - t, "s")

    # Building features
    print("********************** Building/Loading features *************************")
    t = time.time()
    feature_extractor = fe.AmbiguousExtraction()
    if os.path.isfile("data/feature_extractors/test.dump"):
        print("Loading feature extractor...")
        feature_extractor.load("data/feature_extractors/test.dump")
        corpus_extractor = fe.CorpusExtraction()