Example #1
def match_topics_lda(x):
    """Assign each tweet in the group to its best-matching LDA topic and
    count topic/sentiment pairs. Relies on the globals topics_words_lda
    (ranked word lists, one per topic), sentiments (index -> label), and
    the repo's utils.get_ngrams helper."""
    import operator
    topics = topics_words_lda
    tweets_topics = {}

    for t in range(len(x.tokenized)):
        # Match against the tweet's unigrams plus its bigrams.
        tweet = x.tokenized.iloc[t] + utils.get_ngrams(2, x.tokenized.iloc[t])
        num_words = {}
        for topic in topics:
            # The topic's first (most relevant) word serves as its key.
            num_words.setdefault(topic[0], 0)
            for j, w in enumerate(topic):
                if w in tweet:
                    # Weight by the inverse of the word's rank in the topic
                    # list so that less relevant words contribute less.
                    num_words[topic[0]] += 1 / (j + 1)
        if all(value == 0 for value in num_words.values()):
            current_topic = "uncategorized"
        else:
            current_topic = max(num_words.items(), key=operator.itemgetter(1))[0]
        key = current_topic + '_' + sentiments[x.sentiment.iloc[t]]
        tweets_topics[key] = tweets_topics.get(key, 0) + 1
    return tweets_topics
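As a quick sanity check, here is a hedged usage sketch. The topic lists, sentiment labels, and DataFrame contents below are made-up stand-ins for the real pipeline's data, and `utils.get_ngrams` is assumed to be importable from the same repo as the function above.

import pandas as pd

# Hypothetical stand-ins for the globals the function expects.
topics_words_lda = [["weather", "rain", "sun"],   # topic keyed "weather"
                    ["sports", "match", "goal"]]  # topic keyed "sports"
sentiments = {0: "negative", 1: "positive"}

group = pd.DataFrame({
    "tokenized": [["rain", "all", "day"], ["great", "match", "tonight"]],
    "sentiment": [0, 1],
})

# With match_topics_lda and utils.get_ngrams importable, this would print
# {'weather_negative': 1, 'sports_positive': 1}: each tweet scores highest
# on the topic containing one of its words.
print(match_topics_lda(group))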
Example #2
        elif params.combination_type == "ngram-word-lstm":
            # The vocabularies were stored as the checkpoint's last element.
            words_3grams, words_words = saved_params.pop(-1)

        if params.combination_type == "ngram-word":
            model = mixed_models(saved_params[0], saved_params[1], params)
        elif params.combination_type == "ngram-word-lstm":
            model = mixed_models(saved_params[0],
                                 saved_params[1],
                                 params,
                                 We_initial_lstm=saved_params[2])

        lasagne.layers.set_all_param_values(model.final_layer, saved_params)
    else:
        # No saved checkpoint: build vocabularies and embeddings from scratch.
        words_3grams, We_3gram = utils.get_ngrams(data, params)
        if params.random_embs:
            words_words, We_word = utils.get_words(data, params)
        else:
            words_words, We_word = utils.get_wordmap(args.wordfile)

        # Give the LSTM its own copy of the word embeddings so that updates
        # to one table do not alias the other.
        We_lstm = copy.deepcopy(We_word)

        if params.combination_type == "ngram-word":
            model = mixed_models(We_3gram, We_word, params)
        elif params.combination_type == "ngram-lstm":
            model = mixed_models(We_3gram,
                                 None,
                                 params,
                                 We_initial_lstm=We_lstm)
        elif params.combination_type == "word-lstm":
            # Inferred from the pattern of the branches above; the source
            # snippet is cut off at this point:
            model = mixed_models(None, We_word, params, We_initial_lstm=We_lstm)

# (the snippet resumes mid-call below; the argument's name is missing)
                    type=float,
                    help="rate of scrambling")
parser.add_argument("--sp-model", help="SP model to load for evaluation")

args = parser.parse_args()

data = get_data(args)

if args.load_file is not None:
    model, epoch = load_model(data, args)
    print("Loaded model at epoch {0} and resuming training.".format(epoch))
    model.train_epochs(start_epoch=epoch)
else:
    if args.ngrams:
        vocab, vocab_fr = utils.get_ngrams(data,
                                           args.share_vocab,
                                           n=args.ngrams)
    else:
        vocab, vocab_fr = utils.get_words(data, args.share_vocab)

    if args.model == "avg":
        model = Averaging(data, args, vocab, vocab_fr)
    elif args.model == "lstm":
        model = LSTM(data, args, vocab, vocab_fr)

    print(" ".join(sys.argv))
    print("Num examples:", len(data))
    print("Num words:", len(vocab))
    if vocab_fr is not None:
        print("Num fr words:", len(vocab_fr))