Example #1
def hdp_celery_task(collection_data, options, user):
    """
    Async gensim HDP task
    :param collection_data:
    :param options:
    :param user:
    :return:
    """
    user = User.objects.get(pk=user)

    # get tokens from collection and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(
            item['id'], item['filter'],
            wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    # split each bag of words into chunks according to the selected strategy
    chunked_words_bags = []
    if options['chunking'] == "count":
        # chunk by fixed token count
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(
                bag, options['chunk_size'])
    elif options['chunking'] == 'breakword':
        # chunk wherever the break word appears
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(
                bag, options['breakword'])
    else:
        # no chunking: keep each document as a single bag
        for bag in filtered_docs:
            chunked_words_bags.append(bag)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    handler.train_hdp_model(options)

    topics = handler.hdp_model.show_topics(topics=-1,
                                           log=False,
                                           formatted=False)

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(
        topics, user, collection_data, 'hdp', options)
    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)
    return topics
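
The chunking helpers called above (chunk_bag_of_word_collection_by_chunk_size and chunk_bag_of_word_collection_by_char_string) are not shown in these examples. A minimal sketch of what they might do, assuming each takes a flat list of tokens and returns a list of token lists (the bodies below are assumptions, not the project's actual code):

def chunk_bag_of_word_collection_by_chunk_size(bag, chunk_size):
    """Hypothetical sketch: split a token list into fixed-size chunks."""
    chunk_size = int(chunk_size)
    return [bag[i:i + chunk_size] for i in range(0, len(bag), chunk_size)]


def chunk_bag_of_word_collection_by_char_string(bag, breakword):
    """Hypothetical sketch: start a new chunk each time the break word appears."""
    chunks, current = [], []
    for token in bag:
        if token == breakword:
            if current:
                chunks.append(current)
                current = []
        else:
            current.append(token)
    if current:
        chunks.append(current)
    return chunks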
Example #2
def topic_modeling_celery_task(collection_data, options, user, *args, **kwargs):
    """
    Async tosk to do gensim based topic modeling.
    :param collection_data:
    :param options:
    :param user:
    :param args:
    :param kwargs:
    :return:
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and parse with filters
    filtered_docs = []
    wordnet_status = options['wordNetSense']
    for item in collection_data:
        # override the collection's filter wordnet status; this should probably live somewhere else in the future.
        tokens = CollectionParser(item['id'], item['filter'], wordnet_status=wordnet_status).get_bow()
        filtered_docs.append(tokens)

    # split each bag of words into chunks according to the selected strategy
    chunked_words_bags = []
    if options['chunking'] == "count":
        # chunk by fixed token count
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])
    elif options['chunking'] == 'breakword':
        # chunk wherever the break word appears
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])
    else:
        # no chunking: keep each document as a single bag
        for bag in filtered_docs:
            chunked_words_bags.append(bag)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    # pull update_every out of options (default to 2 when it is not supplied)
    update_every = options.pop('update_every', None) or 2
    handler.train_lda_model(options['numTopics'], update_every, options['numPasses'], options)
    topics = handler.lda_model.top_topics(handler.corpus, num_words=options['top_n'])
    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'lda', options)
    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)
    return topics
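
LdaHandler is not defined in these snippets. Judging from how it is used (create_dictionary, create_corpus, train_lda_model, train_hdp_model), a minimal sketch along these lines would satisfy the interface; the gensim calls are standard, but the class layout and parameter handling here are assumptions:

import gensim


class LdaHandler(object):
    """Hypothetical sketch of the handler these tasks rely on."""

    def __init__(self, docs):
        # docs: list of token lists (the chunked bags of words)
        self.docs = docs
        self.dictionary = None
        self.corpus = None
        self.lda_model = None
        self.hdp_model = None

    def create_dictionary(self):
        # map tokens to integer ids
        self.dictionary = gensim.corpora.Dictionary(self.docs)

    def create_corpus(self):
        # convert each document to a (token_id, count) bag-of-words vector
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]

    def train_lda_model(self, num_topics, update_every, passes, options):
        self.lda_model = gensim.models.LdaModel(
            corpus=self.corpus,
            id2word=self.dictionary,
            num_topics=int(num_topics),
            update_every=update_every,
            passes=int(passes))

    def train_hdp_model(self, options):
        self.hdp_model = gensim.models.HdpModel(
            corpus=self.corpus, id2word=self.dictionary)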
Example #3
def lsi_celery_task(collection_data, options, user):
    """
    Async task to perform lsa
    :param collection_data:
    :param options:
    :param user:
    :return:
    """
    user = User.objects.get(pk=user)

    # get tokens from collections and filter them
    filtered_docs = []
    wordnet_status = options['wordNetSense']
    for item in collection_data:
        tokens = CollectionParser(item['id'],
                                  item['filter'],
                                  wordnet_status=wordnet_status).get_bow()
        filtered_docs.append(tokens)

    # split each bag of words into chunks according to the selected strategy
    chunked_words_bags = []
    if options['chunking'] == "count":
        # chunk by fixed token count
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(
                bag, options['chunk_size'])
    elif options['chunking'] == 'breakword':
        # chunk wherever the break word appears
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(
                bag, options['breakword'])
    else:
        # no chunking: keep each document as a single bag
        for bag in filtered_docs:
            chunked_words_bags.append(bag)

    stringed_docs = []
    for doc in chunked_words_bags:
        stringed_docs.append(" ".join([x.lower() for x in doc]))

    try:
        # set up and execute gensim modeling
        handler = LdaHandler(chunked_words_bags)
        handler.create_dictionary()
        handler.create_corpus()

        # scan the dictionary to confirm the search query is in the vocabulary
        one_d_vec = None
        working = False
        search_query = options['search_query']
        for key, val in handler.dictionary.items():
            if val == search_query:
                working = True
                one_d_vec = val
                break

        num_topics = 200
        lsi_model = gensim.models.lsimodel.LsiModel(corpus=handler.corpus,
                                                    num_topics=1,
                                                    id2word=handler.dictionary,
                                                    chunksize=20000,
                                                    decay=1.0,
                                                    distributed=False,
                                                    onepass=True,
                                                    power_iters=2,
                                                    extra_samples=100)

        one_d_vec = options['search_query']
        sim_table = all_in_one_similiarity_matrix([one_d_vec],
                                                  handler.dictionary,
                                                  lsi_model, num_topics, [],
                                                  True)
        print("Showing similarity table")
        print(sim_table)

        # persist the similarity results for this query
        LsiResult(user=user,
                  results=json.dumps(sim_table),
                  query_term=options['search_query']).save()
        result = LsiResult.objects.last()
        collections = [
            CorpusItemCollection.objects.get(pk=c.get('id'))
            for c in collection_data
        ]
        for collection in collections:
            result.collections.add(collection)
        result.save()
    except Exception as e:
        print(e)
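
all_in_one_similiarity_matrix is another project helper that is not shown. For the similarity lookup it appears to perform, a plain-gensim sketch would look roughly like the following; the real helper's signature and return value are unknown, so this only illustrates folding a query into an LSI space and scoring it against the corpus:

from gensim import similarities


def query_lsi_similarity(query_terms, dictionary, lsi_model, corpus):
    """Hypothetical sketch: rank corpus documents by LSI similarity to a query."""
    # fold the query terms into the same LSI space as the corpus
    query_bow = dictionary.doc2bow([t.lower() for t in query_terms])
    query_lsi = lsi_model[query_bow]

    # index the LSI-projected corpus and score the query against it
    index = similarities.MatrixSimilarity(lsi_model[corpus])
    sims = index[query_lsi]
    return sorted(enumerate(sims), key=lambda pair: -pair[1])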
Example #4
def lsi_celery_task(collection_data, options, user):
    """
    Async task to perform lsa
    :param collection_data:
    :param options:
    :param user:
    :return:
    """
    user = User.objects.get(pk=user)

    # get tokens from collections and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'], options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    # split each bag of words into chunks according to the selected strategy
    chunked_words_bags = []
    if options['chunking'] == "count":
        # chunk by fixed token count
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])
    elif options['chunking'] == 'breakword':
        # chunk wherever the break word appears
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])
    else:
        # no chunking: keep each document as a single bag
        for bag in filtered_docs:
            chunked_words_bags.append(bag)

    stringed_docs = []
    for doc in chunked_words_bags:
        stringed_docs.append(" ".join([x.lower() for x in doc]))

    # set up and execute gensim modeling
    try:
        transformer = TfidfVectorizer()
        tfidf = transformer.fit_transform(stringed_docs)
        num_components = 2
        if len(stringed_docs) < 2:
            num_components = 1
        svd = TruncatedSVD(n_components=num_components)
        lsa = svd.fit_transform(tfidf.T)
        terms = kClosestTerms(15, options['search_query'], transformer, lsa)
    except Exception as e:
        print(e)
        terms = ["No results found for search"]
    LsiResult(
        user=user,
        results=json.dumps(terms),
        query_term=options['search_query']
    ).save()
    result = LsiResult.objects.last()
    collections = [CorpusItemCollection.objects.get(pk=c.get('id')) for c in collection_data]
    for collection in collections:
        result.collections.add(collection)
    result.save()
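
kClosestTerms is also not shown. Assuming it returns the k vocabulary terms whose LSA vectors are closest to the query term's vector, a sketch built from scikit-learn primitives might look like this (a hypothetical stand-in, not the project's implementation):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def k_closest_terms(k, query, vectorizer, term_vectors):
    """Hypothetical sketch: find the k terms nearest the query in the LSA term space."""
    # vocabulary_ maps each term to its row index in term_vectors (terms x components)
    vocab = vectorizer.vocabulary_
    query = query.lower()
    if query not in vocab:
        return ["No results found for search"]

    query_vec = term_vectors[vocab[query]].reshape(1, -1)
    scores = cosine_similarity(query_vec, term_vectors)[0]
    inverse_vocab = {idx: term for term, idx in vocab.items()}
    ranked = np.argsort(scores)[::-1]
    # skip the query term itself, then keep the k best matches
    return [inverse_vocab[i] for i in ranked if inverse_vocab[i] != query][:k]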