Example #1
def waitForDumper(dumper, name):
    if dumper is not None:
        if dumper.is_alive():
            common_logger.info("Waiting for " + name + " dumper to finish saving to disk")
            dumper.join()
        else:
            common_logger.info(name + " dumper has already finished saving to disk, not waiting")
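A hedged usage sketch for context: the dumpers passed in here are the asynchronous pickle-writer threads started in Examples #6 and #12. The file path and data below are illustrative.

import pickle
from threading import Thread

out_file = open("word_corpus.pickle", "wb")  # illustrative path
dumper = Thread(target=pickle.dump, args=({"id1": ["some", "words"]}, out_file), kwargs={"protocol": 2})
dumper.start()
# ... other work proceeds while the dump runs ...
waitForDumper(dumper, "word corpus")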
Example #2
def parallelGetWordCorpus(articleDB, process_pool):
    articles = articleDB.items()
    results = process_pool.map(parallelGetCleanedWords, articles, chunksize=chunkSize_getCleanedWords)

    common_logger.info("Back from multiprocessing, making dict now")
    word_corpus = dict(results)

    return word_corpus
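parallelGetCleanedWords itself does not appear in this listing. Because it is mapped over articleDB.items() and the results are fed to dict(), it must take one (article_id, article_text) pair and return an (article_id, cleaned_words) pair. A minimal sketch under that assumption, with placeholder cleaning logic:

def parallelGetCleanedWords(article):
    # hypothetical worker; the real cleaning (stemming, number removal,
    # tokenization) is configured elsewhere in this project
    article_id, article_text = article
    return article_id, article_text.lower().split()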
Example #3
def load(path=file_lda_model_all):
    """
    Returns: SavedLDAModel object
    """
    # lda_model = LdaModel.load(file_lda_model)
    # gensim_dictionary = Dictionary.load(file_lda_gensim_dictionary)
    # article_id_to_LDA_dictionary = pickle.load(
    #    open(file_lda_articleIDToLDA, "rb"))
    # return lda_model, gensim_dictionary, article_id_to_LDA_dictionary
    common_logger.info("Loading LDA model from " + path)
    return pickle.load(open(path, "rb"))
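A usage sketch, assuming SavedLDAModel simply stores the constructor arguments shown in _saveAll (Example #5) under like-named attributes (the attribute name below is hypothetical):

saved_model = load()
lda_vec = saved_model.articleIDToLDADict["some_article_id"]  # hypothetical attribute name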
Example #4
def createArticleIdToLdaDict(word_corpus, dictionary, lda_model):
    common_logger.info("Creating article_id -> lda_vector dictionary")
    article_lda = {}
    index = 0
    for article_id in word_corpus.keys():
        bow = dictionary.doc2bow(word_corpus[article_id])
        lda_vec = lda_model[bow]
        article_lda[article_id] = lda_vec
        index += 1
        if index % 1000 == 0:
            common_logger.debug("done with %d articles", index)
    common_logger.info("saving article_id -> lda_vector dictionary")
    pickle.dump(article_lda, open(file_lda_articleIDToLDA, "wb"), protocol=2)
    return article_lda
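For reference, gensim's lda_model[bow] returns a sparse topic vector as a list of (topic_id, probability) pairs, so the saved dictionary holds entries of this shape (values illustrative):

# article_lda["some_article_id"]  ->  [(3, 0.62), (7, 0.35)]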
Example #5
def _saveAll(
    ldaModel,
    dictionary,
    articleIDToLDADict,
    articleDBused,
    stem_words,
    numPasses,
    removeNumbers,
    path=file_lda_model_all,
):
    common_logger.info("Saving LDA model object with all data")
    model_all = SavedLDAModel(
        ldaModel, dictionary, articleIDToLDADict, articleDBused, stem_words, numPasses, removeNumbers
    )
    pickle.dump(model_all, open(path, "wb"), protocol=-1)
Example #6
def getBoWCorpus(word_corpus, dictionary, process_pool, useSavedTill):
    if useSavedTill >= USESAVED.bow_corpus:
        common_logger.info("loading bow_corpus from out_file")
        bow_corpus = pickle.load(open(file_lda_bow_corpus, "rb"))
        return bow_corpus, None
    else:
        common_logger.info("Creating BoW representations from articles")
        bow_corpus = (
            parallelGetBoWCorpus(dictionary, word_corpus.values(), process_pool)
            if process_pool is not None
            else serialGetBoWCorpus(dictionary, word_corpus.values())
        )

        out_file = open(file_lda_bow_corpus, "wb")
        bow_corpus_dumper = Thread(target=pickle.dump, args=(bow_corpus, out_file), kwargs={"protocol": 2})
        bow_corpus_dumper.start()
        return bow_corpus, bow_corpus_dumper
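serialGetBoWCorpus is not shown in this listing; a plausible single-process fallback that matches the parallel variant's output, one bag-of-words vector per article, would be:

def serialGetBoWCorpus(dictionary, article_words):
    # hypothetical serial fallback using gensim's doc2bow
    return [dictionary.doc2bow(words) for words in article_words]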
Example #7
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # heuristic: topic count grows with the log of corpus size
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
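To make the num_topics heuristic concrete: math.log is the natural logarithm, so for the ~410k-document Wikipedia corpus mentioned in Example #9 it yields:

import math
int(math.log(410000) + 1)  # int(12.92 + 1) == 13 topics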
Example #8
def createFromScrapedArticles():
    common_logger.info("Creating ArticleDB")
    csv.field_size_limit(sys.maxint)

    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # open as csv file with headers
        article_csv = csv.DictReader(open(article_file, "rb"), delimiter=",")

        for row in article_csv:
            article_id = toUnicode(row["article_id"])
            articleDB[article_id] = toUnicode(row["article_text"])
            loaded_articles += 1
            if loaded_articles % 10000 == 0:
                common_logger.debug("loaded %d articles", loaded_articles)

    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")
Example #9
def create_and_save_model(process_pool, useSavedTill=USESAVED.none):
    """
    This takes a long time to train (~1 week), 
    run on a compute node with ~250 GB RAM and fast processor
    for wikipedia corpus of 410k documents

    Above time and storage estimates are not correct yet.
    """

    articleDB = ArticleDB.load(path=articleDBPath)

    word_corpus, word_corpus_dumper = getWordCorpus(articleDB, process_pool, useSavedTill)

    dictionary = getDictionary(word_corpus, useSavedTill)

    bow_corpus, bow_corpus_dumper = getBoWCorpus(word_corpus, dictionary, process_pool, useSavedTill)

    if process_pool:
        # the pool is no longer needed once the BoW corpus exists; shut it
        # down to free resources before the memory-hungry LDA training step
        common_logger.info("terminating process pool")
        process_pool.close()
        process_pool.terminate()

    lda_model = getLdaModel(bow_corpus, dictionary, useSavedTill)

    articleIDToLDADict = createArticleIdToLdaDict(word_corpus, dictionary, lda_model)

    _saveAll(
        lda_model,
        dictionary,
        articleIDToLDADict,
        articleDBPath,
        stem_words,
        numPasses,
        removeNumbers,
        path=file_lda_model_all,
    )

    waitForDumper(word_corpus_dumper, "word corpus")

    waitForDumper(bow_corpus_dumper, "bow corpus")

    common_logger.info("Done creating articleIDToLDA dictionary, exiting")
Example #10
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
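A short illustration of the gensim Dictionary API used here: it assigns an integer id to every token seen in the corpus, and doc2bow (used in Example #4) converts a token list into sparse (token_id, count) pairs.

from gensim.corpora import Dictionary

dictionary = Dictionary([["human", "computer"], ["computer", "survey"]])
print(dictionary.doc2bow(["computer", "human", "computer"]))
# e.g. [(0, 2), (1, 1)] -- exact ids depend on assignment order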
Example #11
def createFromScrapedDefinitions():
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # open as csv file with headers
        acronym_csv = csv.DictReader(
            open(definition_file, "rb"), delimiter=",")

        for row in acronym_csv:
            acronym = toUnicode(row["acronym"])
            acronym_expansion = toUnicode(row["acronym_expansion"])
            article_id = toUnicode(row["article_id"])
            if acronym not in acronymDB:
                acronymDB[acronym] = []
            acronymDB[acronym].append(
                [acronym_expansion.strip().lower().replace('-', ' '), article_id])
            # , row["article_title"]]) # title was part of the old format
            loaded_acronyms += 1
            if loaded_acronyms % 10000 == 0:
                common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    defs_per_acronym = [0] * 1000
    insts_per_def = [0] * 1000
    #num_acronyms = len(acronymDB)
    for acronym, values_for_this_acronym in acronymDB.items():
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        inst_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        # article_title was also unpacked here in the old format
        for index, [acronym_expansion, article_id] in enumerate(values_for_this_acronym):
            if AcronymExpansion.startsSameWay(acronym_expansion, expansion_of_last_acronym):
                inst_count += 1
                values_for_this_acronym[index].append(def_count)
                values_for_this_acronym[index][0] = expansion_of_last_acronym
            else:
                insts_per_def[min(inst_count, len(insts_per_def) - 1)] += 1
                inst_count = 0
                def_count += 1
                expansion_of_last_acronym = acronym_expansion
                values_for_this_acronym[index].append(def_count)
        defs_per_acronym[min(def_count, len(defs_per_acronym) - 1)] += 1
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
Example #12
def getWordCorpus(articleDB, process_pool, useSavedTill):
    if useSavedTill >= USESAVED.word_corpus:
        common_logger.info("Loading word_corpus from out_file")
        word_corpus = pickle.load(open(file_lda_word_corpus, "rb"))
        return word_corpus, None
    else:
        common_logger.info("Getting word_corpus from articles")
        word_corpus = (
            parallelGetWordCorpus(articleDB, process_pool)
            if process_pool is not None
            else serialGetWordCorpus(articleDB)
        )

        common_logger.info("Saving word_corpus asynchronously, in case the script ahead fails")
        out_file = open(file_lda_word_corpus, "wb")
        word_corpus_dumper = Thread(target=pickle.dump, args=(word_corpus, out_file), kwargs={"protocol": 2})
        word_corpus_dumper.start()

        return word_corpus, word_corpus_dumper
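serialGetWordCorpus is likewise absent from this listing; under the same worker contract sketched after Example #2, a plausible serial fallback is:

def serialGetWordCorpus(articleDB):
    # hypothetical serial fallback with the same output shape as the parallel path
    return {article_id: parallelGetCleanedWords((article_id, text))[1]
            for article_id, text in articleDB.items()}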
Example #13
from flask import Flask, render_template, request, redirect, url_for
from flask.helpers import send_from_directory

from AcronymExtractors.AcronymExtractor_v1 import AcronymExtractor_v1
from DataCreators import ArticleDB, AcronymDB
from Logger import common_logger
from TextExtractors.Extract_PdfMiner import Extract_PdfMiner
from AcronymDisambiguator import AcronymDisambiguator
import string_constants
from AcronymExpanders import AcronymExpanderEnum
from sklearn.externals import joblib
from string_constants import file_vectorizer


common_logger.info("Starting server")
app = Flask(__name__)


common_logger.info("Initializing AcronymDisambiguator")
articleDB = ArticleDB.load()
acronymDB = AcronymDB.load()
disambiguator = AcronymDisambiguator(text_extractor=Extract_PdfMiner(),
                                     acronym_extractor=AcronymExtractor_v1(),
                                     expanders=[AcronymExpanderEnum.fromText_v2,
                                                AcronymExpanderEnum.Tfidf_multiclass],
                                     articleDB=articleDB,
                                     acronymDB=acronymDB,
                                     vectorizer=joblib.load(file_vectorizer))

Example #14
def output_file(filename):
    common_logger.info(string_constants.folder_output + filename)

    return send_from_directory(string_constants.folder_output, filename)
Example #15

# Only the tail of Benchmarker_wiki's initializer survives in this snippet;
# the enclosing class and method are inferred from their use in __main__ below.
class Benchmarker_wiki:
    def __init__(self):
        self.shuffledArticleDBPath = file_articledb_shuffled
        self.acronymDBPath = file_acronymdb

        self.expandersToUse = [AcronymExpanderEnum.LDA_multiclass]
        self.ldaModelAll = LDAModel.load(path=folder_lda + "lda_model_noStem_noNums_3Pass.pickle")
        self.vectorizer = joblib.load(file_vectorizer)

        self.acronymExtractor = AcronymExtractor_v2_small()
        self.textExtractor = Extract_PdfMiner()

def _proxyFunction(benchmarker, testArticles):
    return benchmarker.getScoresAndReport(testArticles)

if __name__ == "__main__":
    common_logger.info("Starting Benchmarking")

    benchmarker = Benchmarker_wiki()

    common_logger.info("making partitions")
    partitions = benchmarker.getPartitions()
    gc.collect()

    pool = Pool(processes=benchmarker.numProcesses, maxtasksperchild=1)
    common_logger.info("delegating work to pools")

    partialFunc = functools.partial(_proxyFunction, benchmarker)

    results = pool.map(partialFunc, partitions, chunksize=1)
    benchmarker.plotStats(benchmarker.extractScores(results))
    benchmarker.saveAndPrintReport(benchmarker.extractReports(results))
Example #16
def logConfig():
    common_logger.info("Logging config of script")
    common_logger.info("numProcesses = %d" % numProcesses)
    common_logger.info("articleDBPath = %s" % articleDBPath)
    common_logger.info("goParallel = %s" % goParallel)
    common_logger.info("useSavedTill = %d" % useSavedTill)
    common_logger.info("chunkSize_getCleanedWords = %d" % chunkSize_getCleanedWords)
    common_logger.info("chunkSize_doc2BoW = %d" % chunkSize_doc2BoW)
    common_logger.info("stem_words = %s" % stem_words)
    common_logger.info("removeNumbers = %s" % removeNumbers)
    common_logger.info("numPasses = %d" % numPasses)
Example #17
    common_logger.info("numProcesses = %d" % numProcesses)
    common_logger.info("articleDBPath = %s" % articleDBPath)
    common_logger.info("goParallel = %s" % goParallel)
    common_logger.info("useSavedTill = %d" % useSavedTill)
    common_logger.info("chunkSize_getCleanedWords = %d" % chunkSize_getCleanedWords)
    common_logger.info("chunkSize_doc2BoW = %d" % chunkSize_doc2BoW)
    common_logger.info("stem_words = %s" % stem_words)
    common_logger.info("removeNumbers = %s" % removeNumbers)
    common_logger.info("numPasses = %d" % numPasses)


# global config for making LDA model
numProcesses = 3
articleDBPath = file_articledb
goParallel = True
useSavedTill = USESAVED.none
chunkSize_getCleanedWords = 1000
chunkSize_doc2BoW = 1000
stem_words = False
removeNumbers = True
numPasses = 2

if __name__ == "__main__":
    common_logger.info("LDA Model script started")
    logConfig()
    if goParallel:
        process_pool = Pool(numProcesses)
        create_and_save_model(process_pool, useSavedTill=useSavedTill)
    else:
        create_and_save_model(None, useSavedTill=useSavedTill)
Example #18
def main():
    # app.run() blocks until the server stops, so log readiness before starting
    common_logger.info("Server is ready")
    app.run(debug=False, host='0.0.0.0', port=80)