Esempio n. 1
0
def create_and_save_model(process_pool, useSavedTill=USESAVED.none):
    """
    Run the whole LDA pipeline and persist its artifacts.

    Stages, in order: load the article DB, obtain the word corpus, the
    dictionary and the bag-of-words corpus, shut down the process pool,
    obtain the LDA model, build the articleIDToLDA dictionary, save
    everything, and finally wait for both corpus dumpers.

    process_pool: handed to the word-corpus and BoW-corpus stages; when
        truthy it is closed and terminated before the LDA model stage.
    useSavedTill: forwarded unchanged to every stage that accepts it.

    Original author's note: training takes a long time (~1 week) and is
    meant for a compute node with ~250 GB RAM and a fast processor, for a
    wikipedia corpus of 410k documents. The same note flags these time
    and storage estimates as not correct yet.
    """
    article_store = ArticleDB.load(path=articleDBPath)

    tokens, tokens_dumper = getWordCorpus(article_store, process_pool, useSavedTill)
    vocab = getDictionary(tokens, useSavedTill)
    bags, bags_dumper = getBoWCorpus(tokens, vocab, process_pool, useSavedTill)

    # No later call in this function receives the pool, so it is shut
    # down here, before the LDA model stage.
    if process_pool:
        common_logger.info("terminating process pool")
        process_pool.close()
        process_pool.terminate()

    topic_model = getLdaModel(bags, vocab, useSavedTill)
    id_to_lda = createArticleIdToLdaDict(tokens, vocab, topic_model)

    # Positional payload for _saveAll; the order must match its signature.
    save_args = (
        topic_model,
        vocab,
        id_to_lda,
        articleDBPath,
        stem_words,
        numPasses,
        removeNumbers,
    )
    _saveAll(*save_args, path=file_lda_model_all)

    # Wait for the dumpers only after the model artifacts have been saved.
    pending_dumpers = (
        (tokens_dumper, "word corpus"),
        (bags_dumper, "bow corpus"),
    )
    for dumper, label in pending_dumpers:
        waitForDumper(dumper, label)

    common_logger.info("Done creating articleIDToLDA dictionary, exiting")
Esempio n. 2
0
from DataCreators import ArticleDB, AcronymDB
from Logger import common_logger
from TextExtractors.Extract_PdfMiner import Extract_PdfMiner
from AcronymDisambiguator import AcronymDisambiguator
import string_constants
from AcronymExpanders import AcronymExpanderEnum
from sklearn.externals import joblib
from string_constants import file_vectorizer


# Module-level setup: everything below runs once, at import time.
# NOTE(review): Flask and AcronymExtractor_v1 are not among the imports
# visible in this excerpt -- presumably imported elsewhere; verify.
common_logger.info("Starting server")
app = Flask(__name__)


# Both databases are loaded with their default arguments and shared with
# the disambiguator built below.
common_logger.info("Initializing AcronymDisambiguator")
articleDB = ArticleDB.load()
acronymDB = AcronymDB.load()
disambiguator = AcronymDisambiguator(
    text_extractor=Extract_PdfMiner(),
    acronym_extractor=AcronymExtractor_v1(),
    expanders=[
        AcronymExpanderEnum.fromText_v2,
        AcronymExpanderEnum.Tfidf_multiclass,
    ],
    articleDB=articleDB,
    acronymDB=acronymDB,
    # The vectorizer is deserialized from the path in string_constants.
    vectorizer=joblib.load(file_vectorizer),
)


# This route shows a form that performs an AJAX request.
# jQuery is loaded to execute the request and to update the
# value of the operation.