def waitForDumper(dumper, name):
    if dumper is not None:
        if dumper.is_alive():
            common_logger.info("Waiting for " + name + " dumper to finish saving to disk")
            dumper.join()
        else:
            common_logger.info(name + " dumper has already finished saving to disk, not waiting")
def parallelGetWordCorpus(articleDB, process_pool):
    articles = articleDB.items()
    results = process_pool.map(parallelGetCleanedWords, articles, chunksize=chunkSize_getCleanedWords)
    common_logger.info("Back from multiprocessing, making dict now")
    word_corpus = dict(results)
    return word_corpus
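# For reference: the worker mapped over above must take one (article_id, article_text)
# pair and return an (article_id, cleaned_words) pair, since dict(results) is built
# from its outputs. A minimal sketch of such a worker is below; both the body and the
# getCleanedWords helper it calls are assumptions here, not this repository's actual code.
#
# def parallelGetCleanedWords(article):
#     article_id, article_text = article     # one item of articleDB.items()
#     words = getCleanedWords(article_text)   # hypothetical cleaning/tokenizing helper
#     return article_id, words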
def load(path=file_lda_model_all):
    """
    Returns: SavedLDAModel object
    """
    # lda_model = LdaModel.load(file_lda_model)
    # gensim_dictionary = Dictionary.load(file_lda_gensim_dictionary)
    # article_id_to_LDA_dictionary = pickle.load(
    #     open(file_lda_articleIDToLDA, "rb"))
    # return lda_model, gensim_dictionary, article_id_to_LDA_dictionary
    common_logger.info("Loading LDA model from " + path)
    return pickle.load(open(path, "rb"))
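# Minimal usage sketch for load(): it unpickles the SavedLDAModel bundle written by
# _saveAll below. The attribute names on the returned object are assumptions; only the
# constructor argument order is visible in this file.
#
# saved_model = load()                           # defaults to file_lda_model_all
# lda_model = saved_model.ldaModel               # assumed attribute name
# dictionary = saved_model.dictionary            # assumed attribute name
# article_lda = saved_model.articleIDToLDADict   # assumed attribute name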
def createArticleIdToLdaDict(word_corpus, dictionary, lda_model):
    common_logger.info("Creating article_id -> lda_vector dictionary")
    article_lda = {}
    index = 0
    for article_id in word_corpus.keys():
        bow = dictionary.doc2bow(word_corpus[article_id])
        lda_vec = lda_model[bow]
        article_lda[article_id] = lda_vec
        index += 1
        if index % 1000 == 0:
            common_logger.debug("done with %d articles", index)
    common_logger.info("saving article_id -> lda_vector dictionary")
    pickle.dump(article_lda, open(file_lda_articleIDToLDA, "wb"), protocol=2)
    return article_lda
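# Tiny self-contained illustration of the per-article conversion done above, using the
# gensim API directly. The tokens and the two-topic model are toy data, not project data.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_tokens = ["lda", "topic", "model", "lda"]
toy_dictionary = Dictionary([toy_tokens])
toy_bow = toy_dictionary.doc2bow(toy_tokens)    # sparse counts: [(token_id, count), ...]
toy_lda = LdaModel([toy_bow], num_topics=2, id2word=toy_dictionary)
toy_vec = toy_lda[toy_bow]                      # sparse topic vector: [(topic_id, probability), ...]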
def _saveAll(
    ldaModel,
    dictionary,
    articleIDToLDADict,
    articleDBused,
    stem_words,
    numPasses,
    removeNumbers,
    path=file_lda_model_all,
):
    common_logger.info("Saving LDA model object with all data")
    model_all = SavedLDAModel(
        ldaModel, dictionary, articleIDToLDADict, articleDBused, stem_words, numPasses, removeNumbers
    )
    pickle.dump(model_all, open(path, "wb"), protocol=-1)
def getBoWCorpus(word_corpus, dictionary, process_pool, useSavedTill):
    if useSavedTill >= USESAVED.bow_corpus:
        common_logger.info("loading bow_corpus from out_file")
        bow_corpus = pickle.load(open(file_lda_bow_corpus, "rb"))
        return bow_corpus, None
    else:
        common_logger.info("Creating BoW representations from articles")
        bow_corpus = (
            parallelGetBoWCorpus(dictionary, word_corpus.values(), process_pool)
            if process_pool is not None
            else serialGetBoWCorpus(dictionary, word_corpus.values())
        )
        out_file = open(file_lda_bow_corpus, "wb")
        bow_corpus_dumper = Thread(target=pickle.dump, args=(bow_corpus, out_file), kwargs={"protocol": 2})
        bow_corpus_dumper.start()
        return bow_corpus, bow_corpus_dumper
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        # assumption: the number of topics grows roughly with the log of the corpus size
        num_topics = int(math.log(len(bow_corpus)) + 1)
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
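# Worked example of the num_topics heuristic above, using the ~410k-document Wikipedia
# corpus size mentioned in create_and_save_model's docstring:
import math
assert int(math.log(410000) + 1) == 13   # ln(410000) ~= 12.92, so the model gets 13 topics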
def createFromScrapedArticles():
    common_logger.info("Creating ArticleDB")
    csv.field_size_limit(sys.maxint)
    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # open as csv file with headers
        article_csv = csv.DictReader(open(article_file, "rb"), delimiter=",")
        for row in article_csv:
            article_id = toUnicode(row["article_id"])
            articleDB[article_id] = toUnicode(row["article_text"])
            loaded_articles += 1
            if loaded_articles % 10000 == 0:
                common_logger.debug("loaded %d articles", loaded_articles)
    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")
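# Illustrative shape of the scraped-article CSVs consumed above (values invented); only
# the "article_id" and "article_text" header names are taken from the code:
#
#   article_id,article_text
#   12345,"Latent Dirichlet allocation (LDA) is a generative statistical model ..."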
def create_and_save_model(process_pool, useSavedTill=USESAVED.none):
    """
    Trains the LDA model end to end. For the Wikipedia corpus of 410k documents this
    takes a long time (~1 week); run it on a compute node with ~250 GB RAM and a fast
    processor. Note: these time and memory estimates have not been verified yet.
    """
    articleDB = ArticleDB.load(path=articleDBPath)
    word_corpus, word_corpus_dumper = getWordCorpus(articleDB, process_pool, useSavedTill)
    dictionary = getDictionary(word_corpus, useSavedTill)
    bow_corpus, bow_corpus_dumper = getBoWCorpus(word_corpus, dictionary, process_pool, useSavedTill)

    if process_pool:
        common_logger.info("terminating process pool")
        process_pool.close()
        process_pool.terminate()

    lda_model = getLdaModel(bow_corpus, dictionary, useSavedTill)
    articleIDToLDADict = createArticleIdToLdaDict(word_corpus, dictionary, lda_model)
    _saveAll(
        lda_model,
        dictionary,
        articleIDToLDADict,
        articleDBPath,
        stem_words,
        numPasses,
        removeNumbers,
        path=file_lda_model_all,
    )

    waitForDumper(word_corpus_dumper, "word corpus")
    waitForDumper(bow_corpus_dumper, "bow corpus")
    common_logger.info("Done creating articleIDToLDA dictionary, exiting")
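# The useSavedTill checkpoints compared against above (USESAVED.word_corpus, .dictionary,
# .bow_corpus, .lda_model) must be ordered in pipeline order for the ">=" checks to work.
# A plausible sketch of that enum is below; the actual definition lives elsewhere in this
# repository and may differ.
#
# class USESAVED:
#     none = 0
#     word_corpus = 1
#     dictionary = 2
#     bow_corpus = 3
#     lda_model = 4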
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
def createFromScrapedDefinitions():
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)
    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers
        acronym_csv = csv.DictReader(open(definition_file, "rb"), delimiter=",")
        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])
            if acronym not in acronymDB:
                acronymDB[acronym] = []
            acronymDB[acronym].append(
                [acronym_expansion.strip().lower().replace('-', ' '), article_id])
            # row["article_title"] was part of the old format
            loaded_acronyms += 1
            if loaded_acronyms % 10000 == 0:
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    # num_acronyms = len(acronymDB)
    for acronym, values_for_this_acronym in acronymDB.items():
        values_for_this_acronym = sorted(values_for_this_acronym, key=lambda x: x[0])
        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        # article_title was part of the old format in the loop below
        for index, [acronym_expansion, article_id] in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion, expansion_of_last_acronym):
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)

        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
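# Illustrative shape of one acronymDB entry after the pass above (values invented):
# each row ends up as [canonical_expansion, article_id, def_count], stored in a numpy
# array of strings, where def_count groups rows whose expansions start the same way.
#
# acronymDB["cnn"] == numpy.array([
#     ["cable news network",           "1001", "0"],
#     ["convolutional neural network", "2002", "1"],
#     ["convolutional neural network", "2003", "1"],
# ])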
def getWordCorpus(articleDB, process_pool, useSavedTill):
    if useSavedTill >= USESAVED.word_corpus:
        common_logger.info("Loading word_corpus from out_file")
        word_corpus = pickle.load(open(file_lda_word_corpus, "rb"))
        return word_corpus, None
    else:
        common_logger.info("Getting word_corpus from articles")
        word_corpus = (
            parallelGetWordCorpus(articleDB, process_pool)
            if process_pool is not None
            else serialGetWordCorpus(articleDB)
        )
        common_logger.info("Saving word_corpus asynchronously, in case the script ahead fails")
        out_file = open(file_lda_word_corpus, "wb")
        word_corpus_dumper = Thread(target=pickle.dump, args=(word_corpus, out_file), kwargs={"protocol": 2})
        word_corpus_dumper.start()
        return word_corpus, word_corpus_dumper
from flask import Flask, render_template, request, redirect, url_for
from flask.helpers import send_from_directory

from AcronymExtractors.AcronymExtractor_v1 import AcronymExtractor_v1
from DataCreators import ArticleDB, AcronymDB
from Logger import common_logger
from TextExtractors.Extract_PdfMiner import Extract_PdfMiner
from AcronymDisambiguator import AcronymDisambiguator
import string_constants
from AcronymExpanders import AcronymExpanderEnum
from sklearn.externals import joblib
from string_constants import file_vectorizer

common_logger.info("Starting server")
app = Flask(__name__)

common_logger.info("Initializing AcronymDisambiguator")
articleDB = ArticleDB.load()
acronymDB = AcronymDB.load()
disambiguator = AcronymDisambiguator(
    text_extractor=Extract_PdfMiner(),
    acronym_extractor=AcronymExtractor_v1(),
    expanders=[AcronymExpanderEnum.fromText_v2, AcronymExpanderEnum.Tfidf_multiclass],
    articleDB=articleDB,
    acronymDB=acronymDB,
    vectorizer=joblib.load(file_vectorizer),
)
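# Compatibility note: sklearn.externals.joblib (used above) was removed in scikit-learn
# 0.23. On newer environments the equivalent would be the standalone package, e.g.:
#
# import joblib
# vectorizer = joblib.load(file_vectorizer)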
def output_file(filename):
    common_logger.info(string_constants.folder_output + filename)
    return send_from_directory(string_constants.folder_output, filename)
        self.shuffledArticleDBPath = file_articledb_shuffled
        self.acronymDBPath = file_acronymdb
        self.expandersToUse = [AcronymExpanderEnum.LDA_multiclass]
        self.ldaModelAll = LDAModel.load(path=folder_lda + "lda_model_noStem_noNums_3Pass.pickle")
        self.vectorizer = joblib.load(file_vectorizer)
        self.acronymExtractor = AcronymExtractor_v2_small()
        self.textExtractor = Extract_PdfMiner()


def _proxyFunction(benchmarker, testArticles):
    return benchmarker.getScoresAndReport(testArticles)


if __name__ == "__main__":
    common_logger.info("Starting Benchmarking")
    benchmarker = Benchmarker_wiki()

    common_logger.info("making partitions")
    partitions = benchmarker.getPartitions()
    gc.collect()

    pool = Pool(processes=benchmarker.numProcesses, maxtasksperchild=1)
    common_logger.info("delegating work to pools")
    partialFunc = functools.partial(_proxyFunction, benchmarker)
    results = pool.map(partialFunc, partitions, chunksize=1)

    benchmarker.plotStats(benchmarker.extractScores(results))
    benchmarker.saveAndPrintReport(benchmarker.extractReports(results))
def logConfig():
    common_logger.info("Logging config of script")
    common_logger.info("numProcesses = %d" % numProcesses)
    common_logger.info("articleDBPath = %s" % articleDBPath)
    common_logger.info("goParallel = %s" % goParallel)
    common_logger.info("useSavedTill = %d" % useSavedTill)
    common_logger.info("chunkSize_getCleanedWords = %d" % chunkSize_getCleanedWords)
    common_logger.info("chunkSize_doc2BoW = %d" % chunkSize_doc2BoW)
    common_logger.info("stem_words = %s" % stem_words)
    common_logger.info("removeNumbers = %s" % removeNumbers)
    common_logger.info("numPasses = %d" % numPasses)
common_logger.info("numProcesses = %d" % numProcesses) common_logger.info("articleDBPath = %s" % articleDBPath) common_logger.info("goParallel = %s" % goParallel) common_logger.info("useSavedTill = %d" % useSavedTill) common_logger.info("chunkSize_getCleanedWords = %d" % chunkSize_getCleanedWords) common_logger.info("chunkSize_doc2BoW = %d" % chunkSize_doc2BoW) common_logger.info("stem_words = %s" % stem_words) common_logger.info("removeNumbers = %s" % removeNumbers) common_logger.info("numPasses = %d" % numPasses) # global config for making LDA model numProcesses = 3 articleDBPath = file_articledb goParallel = True useSavedTill = USESAVED.none chunkSize_getCleanedWords = 1000 chunkSize_doc2BoW = 1000 stem_words = False removeNumbers = True numPasses = 2 if __name__ == "__main__": common_logger.info("LDA Model script started") logConfig() if goParallel: process_pool = Pool(numProcesses) create_and_save_model(process_pool, useSavedTill=useSavedTill) else: create_and_save_model(None, useSavedTill=useSavedTill)
def main():
    # log readiness before app.run(), which blocks until the server shuts down
    common_logger.info("Server is ready")
    app.run(debug=False, host='0.0.0.0', port=80)