Example #1
def topic_modeling_celery_task(collection_data, options, user, *args, **kwargs):
    """
    Async tosk to do gensim based topic modeling.
    :param collection_data:
    :param options:
    :param user:
    :param args:
    :param kwargs:
    :return:
    """
    # get user from user id
    user = User.objects.get(pk=user)

    # get tokens from collection and parse with filters
    filtered_docs = []
    wordnet_status = options['wordNetSense']
    for item in collection_data:
        # override the collection filter's wordnet status. This should probably live somewhere else in the future.
        tokens = CollectionParser(item['id'], item['filter'], wordnet_status=wordnet_status).get_bow()
        filtered_docs.append(tokens)

    # handle chunk by count case
    if options['chunking'] == "count":
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakchar string
    elif options['chunking'] == 'breakword':
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking (also the fallback, so chunked_words_bags is always defined)
    else:
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    update_every = options.pop('update_every', None) or 2
    handler.train_lda_model(options['numTopics'], update_every, options['numPasses'], options)
    # rank the trained topics by coherence, keeping the top_n words for each
    topics = handler.lda_model.top_topics(handler.corpus, num_words=options['top_n'])
    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'lda', options)
    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)
    return topics
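The LdaHandler class above is project-specific, but the gensim workflow it wraps is the standard one: build a Dictionary from the token bags, convert each bag into a bag-of-words corpus, train an LdaModel, and rank the resulting topics by coherence. The sketch below shows those underlying calls under that assumption; the train_lda helper and toy data are illustrative, not part of the project.

from gensim import corpora, models

def train_lda(chunked_words_bags, num_topics=5, passes=1, update_every=2):
    # word <-> id mapping built from every token bag
    dictionary = corpora.Dictionary(chunked_words_bags)
    # each document becomes a list of (token_id, count) pairs
    corpus = [dictionary.doc2bow(bag) for bag in chunked_words_bags]
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics,
                          passes=passes, update_every=update_every)
    # rank topics by coherence, most coherent first
    return lda.top_topics(corpus)

bags = [["topic", "model", "corpus"], ["corpus", "dictionary", "gensim"],
        ["gensim", "topic", "coherence"]]
topics = train_lda(bags, num_topics=2)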
Example #2
def hdp_celery_task(collection_data, options, user):
    """
    Async gensim HDP task
    :param collection_data:
    :param options:
    :param user:
    :return:
    """
    user = User.objects.get(pk=user)

    # get tokens from collection and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'], wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    # handle chunk by count case
    if options['chunking'] == "count":
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakchar string
    elif options['chunking'] == 'breakword':
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking (also the fallback, so chunked_words_bags is always defined)
    else:
        chunked_words_bags = list(filtered_docs)

    # set up and execute gensim modeling
    handler = LdaHandler(chunked_words_bags)
    handler.create_dictionary()
    handler.create_corpus()
    handler.train_hdp_model(options)

    topics = handler.hdp_model.show_topics(topics=-1, log=False, formatted=False)

    # create output models
    topic_group = build_and_save_topic_tuples_and_topic_groups(topics, user, collection_data, 'hdp', options)
    # relate collections to topic group
    add_collections_to_topic_group(topic_group, collection_data)

    # email upon completion
    try:
        send_document_done_email(user)
    except Exception as e:
        print(e)
    return topics
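The HDP path mirrors the LDA one except for training: gensim's HdpModel infers the number of topics from the data rather than taking numTopics as a parameter. Below is a minimal sketch of what train_hdp_model presumably reduces to; the train_hdp helper and sample data are illustrative, not the project's code.

from gensim import corpora
from gensim.models import HdpModel

def train_hdp(chunked_words_bags):
    dictionary = corpora.Dictionary(chunked_words_bags)
    corpus = [dictionary.doc2bow(bag) for bag in chunked_words_bags]
    # HDP chooses the number of topics itself
    hdp = HdpModel(corpus, id2word=dictionary)
    # formatted=False returns word/weight pairs rather than preformatted strings
    return hdp.show_topics(formatted=False)

bags = [["topic", "model", "corpus"], ["corpus", "dictionary", "gensim"],
        ["gensim", "topic", "coherence"]]
topics = train_hdp(bags)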
Example #3
def lsi_celery_task(collection_data, options, user):
    """
    Async task to perform lsa
    :param collection_data:
    :param options:
    :param user:
    :return:
    """
    user = User.objects.get(pk=user)

    # get tokens from collections and filter them
    filtered_docs = []
    for item in collection_data:
        tokens = CollectionParser(item['id'], item['filter'], wordnet_status=options['wordNetSense']).get_bow()
        filtered_docs.append(tokens)

    # handle chunk by count case
    if options['chunking'] == "count":
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_chunk_size(bag, options['chunk_size'])

    # handle chunk by breakchar string
    elif options['chunking'] == 'breakword':
        chunked_words_bags = []
        for bag in filtered_docs:
            chunked_words_bags += chunk_bag_of_word_collection_by_char_string(bag, options['breakword'])

    # handle no chunking (also the fallback, so chunked_words_bags is always defined)
    else:
        chunked_words_bags = list(filtered_docs)

    # scikit-learn's TfidfVectorizer works on raw text, so rejoin each token bag into a lowercased string
    stringed_docs = []
    for doc in chunked_words_bags:
        stringed_docs.append(" ".join([x.lower() for x in doc]))

    # set up and execute gensim modeling
    try:
        transformer = TfidfVectorizer()
        tfidf = transformer.fit_transform(stringed_docs)
        # TruncatedSVD needs n_components strictly less than the number of documents
        num_components = 2
        if len(stringed_docs) <= 2:
            num_components = 1
        svd = TruncatedSVD(n_components=num_components)
        lsa = svd.fit_transform(tfidf.T)
        terms = kClosestTerms(15, options['search_query'], transformer, lsa)
    except Exception as e:
        print(e)
        terms = ["No results found for search"]
    result = LsiResult(
        user=user,
        results=json.dumps(terms),
        query_term=options['search_query']
    )
    result.save()
    collections = [CorpusItemCollection.objects.get(pk=c.get('id')) for c in collection_data]
    for collection in collections:
        result.collections.add(collection)
    result.save()
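kClosestTerms is another project helper, but the branch above is standard term-space LSA: TruncatedSVD is applied to the transposed TF-IDF matrix, so every vocabulary term gets a coordinate in the latent space, and the terms nearest the query term (here by cosine similarity) are returned. The reimplementation below is a rough, hypothetical sketch for illustration, not the project's actual helper.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def k_closest_terms(k, query, transformer, lsa):
    # rows of lsa line up with the vectorizer's column order
    vocab = sorted(transformer.vocabulary_, key=transformer.vocabulary_.get)
    if query not in transformer.vocabulary_:
        return []
    q = lsa[transformer.vocabulary_[query]]
    # cosine similarity between the query term and every term in latent space
    sims = lsa.dot(q) / (np.linalg.norm(lsa, axis=1) * np.linalg.norm(q) + 1e-12)
    ranked = np.argsort(-sims)
    return [vocab[i] for i in ranked if vocab[i] != query][:k]

docs = ["latent semantic analysis of documents",
        "semantic structure of document collections",
        "topic modeling and semantic search"]
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(docs)
lsa = TruncatedSVD(n_components=2).fit_transform(tfidf.T)  # one row per vocabulary term
print(k_closest_terms(5, "semantic", transformer, lsa))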