def create_and_save_model(process_pool, useSavedTill=USESAVED.none):
    """
    Run the whole LDA pipeline and persist its outputs.

    Stages, in order:
      1. load the article DB from ``articleDBPath``;
      2. build the word corpus, the dictionary and the bag-of-words corpus;
      3. close and terminate ``process_pool`` (only if one was given);
      4. build the LDA model and the article-id -> LDA dictionary;
      5. save everything through ``_saveAll`` to ``file_lda_model_all``;
      6. wait for the word-corpus and bow-corpus dumpers to finish.

    Training is slow: the original estimate is ~1 week on a compute node
    with ~250 GB RAM and a fast processor for a wikipedia corpus of 410k
    documents. Those time and storage estimates are not correct yet.

    :param process_pool: pool forwarded to getWordCorpus and getBoWCorpus;
        a falsy value skips the close/terminate step.
    :param useSavedTill: forwarded unchanged to getWordCorpus,
        getDictionary, getBoWCorpus and getLdaModel -- presumably selects
        how many previously saved stages are reused (TODO confirm).
    :return: None
    """
    article_db = ArticleDB.load(path=articleDBPath)

    words, words_dumper = getWordCorpus(article_db, process_pool, useSavedTill)
    vocab = getDictionary(words, useSavedTill)
    bow, bow_dumper = getBoWCorpus(words, vocab, process_pool, useSavedTill)

    # The pool is not passed to any later stage, so release it before the
    # LDA model is built.
    if process_pool:
        common_logger.info("terminating process pool")
        process_pool.close()
        process_pool.terminate()

    model = getLdaModel(bow, vocab, useSavedTill)
    article_id_to_lda = createArticleIdToLdaDict(words, vocab, model)

    _saveAll(
        model,
        vocab,
        article_id_to_lda,
        articleDBPath,
        stem_words,
        numPasses,
        removeNumbers,
        path=file_lda_model_all,
    )

    # Wait for both corpus dumpers before reporting completion.
    for dumper, label in ((words_dumper, "word corpus"), (bow_dumper, "bow corpus")):
        waitForDumper(dumper, label)

    common_logger.info("Done creating articleIDToLDA dictionary, exiting")
# Server bootstrap: creates the Flask app and a module-level
# AcronymDisambiguator when this module is imported.
#
# NOTE(review): Flask and AcronymExtractor_v1 are used below but are not
# imported in this section; presumably they are imported earlier in the
# file -- confirm.
from DataCreators import ArticleDB, AcronymDB
from Logger import common_logger
from TextExtractors.Extract_PdfMiner import Extract_PdfMiner
from AcronymDisambiguator import AcronymDisambiguator
import string_constants
from AcronymExpanders import AcronymExpanderEnum

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23. Try the vendored copy first so older installs behave exactly as
# before, and fall back to the standalone joblib package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

from string_constants import file_vectorizer

common_logger.info("Starting server")
app = Flask(__name__)

common_logger.info("Initializing AcronymDisambiguator")
articleDB = ArticleDB.load()
acronymDB = AcronymDB.load()
disambiguator = AcronymDisambiguator(
    text_extractor=Extract_PdfMiner(),
    acronym_extractor=AcronymExtractor_v1(),
    expanders=[
        AcronymExpanderEnum.fromText_v2,
        AcronymExpanderEnum.Tfidf_multiclass,
    ],
    articleDB=articleDB,
    acronymDB=acronymDB,
    # joblib.load unpickles the file, so file_vectorizer must point at a
    # trusted artifact.
    vectorizer=joblib.load(file_vectorizer),
)

# This route will show a form to perform an AJAX request
# jQuery is loaded to execute the request and update the
# value of the operation