Beispiel #1
0
def main(args):
    """Print the LDA model output for the first songs of a lyrics source.

    Loads a saved gensim dictionary and LDA model, converts each song to a
    bag-of-words using the dictionary (without extending it), and prints the
    model's result for that song followed by the song's lyrics.  Songs come
    either from elastic search (``--elastic-search``) or from the lyrics file
    given as the positional argument.

    Args:
        args: Command-line arguments, without the program name.

    Returns:
        0 on completion.  Invalid or incomplete arguments end the program
        through argparse (``SystemExit`` with status 2).
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        default=0,
        action="count",
        help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-d",
                        "--dictionary",
                        dest="dictionary",
                        required=True,
                        help="Dictionary file")
    parser.add_argument("-m",
                        "--model",
                        dest="model",
                        required=True,
                        help="Model file")
    # NOTE: --show-examples is accepted but not used anywhere in this function.
    parser.add_argument("-s",
                        "--show-examples",
                        dest="showExamples",
                        type=int,
                        default=None,
                        help="Show n examples that rank high in each topic")
    parser.add_argument(
        "-e",
        "--english-only",
        dest="filterEnglishOnly",
        default=False,
        action="store_true",
        help="Filter corpus and only let through english lyrics")
    parser.add_argument("--elastic-search",
                        dest="elasticSearch",
                        default=False,
                        action="store_true",
                        help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?", help="File to load lyrics from")

    parameters = parser.parse_args(args)

    # The positional is optional only because elastic search can replace it.
    # Reject the "neither given" case here, with a usage message, instead of
    # crashing on expanduser(None) after the dictionary and model were loaded.
    if not parameters.elasticSearch and parameters.lyrics is None:
        parser.error(
            "a lyrics file is required unless --elastic-search is given")

    # Imported only once the arguments are known to be valid, so usage errors
    # are reported before the logging helper is needed.
    from simplelogsetter import SimpleLogSetter

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    logging.info("Loading dictionary from %s", dictionaryPath)
    dictionary = corpora.Dictionary.load(dictionaryPath)

    modelPath = os.path.expanduser(parameters.model)
    logging.info("Loading model from %s", modelPath)
    model = models.ldamodel.LdaModel.load(modelPath)

    if parameters.elasticSearch:
        lyricsGenerator = getLyricsFromElasticSearch()
    else:
        lyricsGenerator = getLyricsFromJson(
            os.path.expanduser(parameters.lyrics))

    for index, songLyrics in enumerate(lyricsGenerator):
        # allow_update=False: the loaded dictionary must not grow here.
        frequencied = dictionary.doc2bow(cleanLyrics(songLyrics),
                                         allow_update=False)
        if not parameters.filterEnglishOnly or isItEnglish(
                frequencied, dictionary):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(model[frequencied])
            print(songLyrics)

        # Stop after the song at index 31, i.e. at most 32 songs are examined
        # (whether or not they passed the english filter).
        if index > 30:
            break

    return 0
Beispiel #2
0
def main(args):
    """Build or load a lyrics corpus, train an LDA model on it and save it.

    The dictionary is loaded from its file when that file exists; otherwise a
    new one is created and filled while the corpus is built.  The corpus is
    loaded from its file when the dictionary is non-empty and the corpus file
    exists; otherwise it is built from the lyrics source (elastic search or
    the lyrics file), serialized to the corpus path, and a newly created
    dictionary is saved.  Optional english-only and stop-word filters are
    applied before the model is trained and saved to the model path.

    Args:
        args: Command-line arguments, without the program name.

    Returns:
        0 on completion.  Invalid or incomplete arguments end the program
        through argparse (``SystemExit`` with status 2).
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(description=__doc__)
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        default=0,
        action="count",
        help="Verbosity.  Invoke many times for higher verbosity")
    parser.add_argument("-c",
                        "--corpus",
                        dest="corpus",
                        required=True,
                        help="Corpus file to save to/load from")
    parser.add_argument("-d",
                        "--dictionary",
                        dest="dictionary",
                        required=True,
                        help="Dictionary file to save to/load from")
    parser.add_argument("-m",
                        "--model",
                        dest="model",
                        required=True,
                        help="Model file to save to/load from")
    parser.add_argument("-t",
                        "--topics",
                        dest="topics",
                        required=True,
                        type=int,
                        help="Number of topics")
    parser.add_argument(
        "-p",
        "--passes",
        dest="passes",
        type=int,
        default=1,
        help="How many passes of the data (default: %(default)s)")
    parser.add_argument(
        "-e",
        "--english-only",
        dest="filterEnglishOnly",
        default=False,
        action="store_true",
        help="Filter corpus and only let through english lyrics")
    parser.add_argument("--save-filtered",
                        dest="filteredCorpus",
                        default=None,
                        help="Save the filtered corpus to this filename")
    parser.add_argument(
        "-s",
        "--filter-stop",
        dest="filterStopWords",
        default=False,
        action="store_true",
        help=
        "Filter stop words before modelling (only implemented for english at the moment)"
    )
    parser.add_argument("--elastic-search",
                        dest="elasticSearch",
                        default=False,
                        action="store_true",
                        help="Get data from elastic search instead of a file")
    parser.add_argument("lyrics", nargs="?", help="File to load lyrics from")

    parameters = parser.parse_args(args)

    dictionaryPath = os.path.expanduser(parameters.dictionary)
    corpusPath = os.path.expanduser(parameters.corpus)

    # The lyrics positional is optional, but a source is needed whenever the
    # corpus cannot be loaded from disk.  When the cached files are missing
    # that is already certain, so fail now with a usage message rather than
    # crashing later on expanduser(None).
    haveLyricsSource = (parameters.elasticSearch
                        or parameters.lyrics is not None)
    missingSourceMessage = ("a lyrics file or --elastic-search is required "
                            "when the corpus has to be built")
    haveCachedFiles = (os.path.exists(dictionaryPath)
                       and os.path.exists(corpusPath))
    if not haveLyricsSource and not haveCachedFiles:
        parser.error(missingSourceMessage)

    # Imported only once the arguments are known to be usable, so usage
    # errors are reported before the logging helper is needed.
    from simplelogsetter import SimpleLogSetter

    logger = SimpleLogSetter(verbosity=parameters.verbosity)
    logger.startLogging()

    if os.path.exists(dictionaryPath):
        logging.info("Loading dictionary from %s", dictionaryPath)
        dictionary = corpora.Dictionary.load(dictionaryPath)
        # A dictionary loaded from disk is not extended while building.
        allowUpdate = False
    else:
        dictionary = corpora.Dictionary()
        allowUpdate = True

    if len(dictionary) > 0 and os.path.exists(corpusPath):
        logging.info("Loading corpus from %s", corpusPath)
        allLyrics = corpora.MmCorpus(corpusPath)
    else:
        if parameters.elasticSearch:
            lyricsGenerator = getLyricsFromElasticSearch()
        elif parameters.lyrics is not None:
            lyricsGenerator = getLyricsFromJson(
                os.path.expanduser(parameters.lyrics))
        else:
            # Still reachable after the early check, e.g. when both cached
            # files exist but the loaded dictionary is empty.
            parser.error(missingSourceMessage)

        allLyrics = [
            dictionary.doc2bow(cleanLyrics(songLyrics),
                               allow_update=allowUpdate)
            for songLyrics in lyricsGenerator
        ]

        corpora.MmCorpus.serialize(corpusPath, allLyrics)

        if allowUpdate:
            logging.info("Saving dictionary to %s", dictionaryPath)
            dictionary.save(dictionaryPath)

    if parameters.filterEnglishOnly:
        logging.info("Filtering for english only")
        allLyrics = englishOnly(allLyrics, dictionary)

        if parameters.filteredCorpus:
            # NOTE(review): if englishOnly returns a one-shot iterator, this
            # serialization would exhaust it before training -- confirm it
            # returns a re-iterable corpus.
            corpora.MmCorpus.serialize(
                os.path.expanduser(parameters.filteredCorpus), allLyrics)

    if parameters.filterStopWords:
        logging.info("Filtering out stop words")
        allLyrics = filterFrequentWords(allLyrics, dictionary)

    model = models.ldamodel.LdaModel(corpus=allLyrics,
                                     id2word=dictionary,
                                     num_topics=parameters.topics,
                                     passes=parameters.passes)
    model.save(os.path.expanduser(parameters.model))

    return 0