'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Fit a TF-IDF vectorizer on `corpus` and print, in order, the learned feature
# names, the shape of the resulting matrix and the matrix itself.
# NOTE(review): `get_feature_names()` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out()` — confirm the installed version before running.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)

print(X)

import pymprog as mp
# Small linear program built with pymprog (imported as `mp`): maximise
# 15*x + 10*y subject to the three upper bounds written below.
mp.begin('bike production')
x, y = mp.var('x, y')  # decision variables
mp.maximize(15 * x + 10 * y, 'profit')
# The bare comparisons are intentional: this modelling style relies on pymprog
# recording each comparison as a constraint of the current model
# (TODO confirm for the installed pymprog version).
x <= 3  # mountain bike limit
y <= 4  # racer production limit
x + y <= 5  # metal finishing limit
mp.solve()
# NOTE(review): no mp.end() follows in the visible code, so this model is
# presumably left open until another begin() replaces it.

print("#####################")

import spacy
import spacy_kenlm

# Module-level spaCy pipeline; `summerize` below reads it through `nlp.pipe`.
nlp = spacy.load('en_core_web_sm')

# KenLM wrapper built from an ARPA file given as a bare file name, so it is
# resolved against the current working directory.
kenlm_model = spacy_kenlm.spaCyKenLM(
    'coca_fulltext.clean.lm.arpa')  # language-model file (not the package's test.arpa)
Esempio n. 2
0
def ppSolver(expectedReturn, numberClients, numberChannels, numberProducts,
             cost, budget, channelCap, minOfferProduct, maxOfferProduct,
             rurdleRate):
    """Solve the offer-allocation problem as a binary program with pymprog.

    One boolean variable is created per (client, channel, product) triple and
    the summed expected return of the selected triples is maximised subject
    to: a cap per channel, a maximum number of offers per product, a total
    budget, at most one offer per client, and a hurdle-rate condition (total
    return >= (1 + rurdleRate) * total cost).

    The rounded objective value and the elapsed wall-clock time are printed
    and forwarded to ``appendCsv``.

    Args:
        expectedReturn: nested sequence read as ``expectedReturn[i][j][k]``
            for client ``i``, channel ``j`` and product ``k``.
        numberClients: number of clients (used with ``range``).
        numberChannels: number of channels (used with ``range``).
        numberProducts: number of products (used with ``range``).
        cost: sequence read as ``cost[j]``, the cost of using channel ``j``.
        budget: upper bound on the summed cost of all selected offers.
        channelCap: sequence read as ``channelCap[j]``, the maximum number of
            offers on channel ``j``.
        minOfferProduct: currently unused; the constraint that would read it
            is disabled (see the commented block in the body).
        maxOfferProduct: sequence read as ``maxOfferProduct[k]``, the maximum
            number of offers of product ``k``.
        rurdleRate: hurdle rate (parameter name kept as spelled for callers).

    Returns:
        None.
    """
    startTime = timeit.default_timer()
    rNumberClients = range(numberClients)
    rNumberChannels = range(numberChannels)
    rNumberProducts = range(numberProducts)

    t = pp.iprod(rNumberClients, rNumberChannels, rNumberProducts)
    pp.begin('basic')  # begin modelling
    try:
        pp.verbose(False)  # verbose output switched off

        # One boolean decision variable per index in t, addressed as x[i, j, k].
        x = pp.var('choice', t, bool)

        pp.maximize(
            sum(x[i, j, k] * expectedReturn[i][j][k]
                for i in rNumberClients
                for j in rNumberChannels
                for k in rNumberProducts))

        # channelLimitConstraint: written as a bare comparison, which relies
        # on pymprog registering comparisons as constraints of the open model.
        for j in rNumberChannels:
            (sum(x[i, j, k] for i in rNumberClients for k in rNumberProducts)
             <= channelCap[j])

        # maxOfferProductConstraint (same bare-comparison form):
        for k in rNumberProducts:
            (sum(x[i, j, k] for i in rNumberClients for j in rNumberChannels)
             <= maxOfferProduct[k])

        # minOfferProductConstraint: disabled, which is why minOfferProduct
        # is not read anywhere in this function.
        # for k in rNumberProducts:
        #     sum(x[i,j,k] for i in rNumberClients for j in rNumberChannels)\
        #     >=minOfferProduct[k]

        # budgetConstraint:
        pp.st(
            sum(x[i, j, k] * cost[j]
                for i in rNumberClients
                for j in rNumberChannels
                for k in rNumberProducts) <= budget,
            "Budget Constr.")

        # clientLimitConstraint: at most one (channel, product) per client.
        for i in rNumberClients:
            pp.st(
                sum(x[i, j, k]
                    for j in rNumberChannels
                    for k in rNumberProducts) <= 1,
                "Client " + str(i) + " limit")

        # rurdleRateConstraint: total return must cover cost times (1 + rate).
        pp.st(
            sum(x[i, j, k] * expectedReturn[i][j][k]
                for i in rNumberClients
                for j in rNumberChannels
                for k in rNumberProducts)
            >= (1 + rurdleRate) * sum(x[i, j, k] * cost[j]
                                      for i in rNumberClients
                                      for j in rNumberChannels
                                      for k in rNumberProducts),
            "Rurdle Rate Constr")

        pp.solve()  # solve the model

        #    pp.sensitivity() # sensitivity report
        elapsed = timeit.default_timer() - startTime
        # Read and round the objective once; it is both printed and logged.
        objective = round(pp.vobj(), 2)
        print("Objetivo encontrado: ", objective, " em ",
              round(elapsed, 3), " segundos")

        print("\n\n\n")
        appendCsv(numberClients, "Solver method", elapsed, True, objective)
    finally:
        # Always release the model, even when building, solving or reporting
        # raised, so a failed run does not leave it open.
        pp.end()
Esempio n. 3
0
import numpy as np
import pymprog as pp
from variables import *

# Top-level version of the offer-allocation model. `numberClients`,
# `numberChannels`, `numberProducts`, `expectedReturn`, `channelCap` and
# `maxOfferProduct` are not defined here; presumably they come from
# `from variables import *` — verify against that module.
rNumberClients = range(numberClients)
rNumberChannels = range(numberChannels)
rNumberProducts = range(numberProducts)

# Index set built from the three ranges; passed to pp.var below.
t = pp.iprod(rNumberClients,rNumberChannels,rNumberProducts)
pp.begin('basic') # begin modelling
pp.verbose(True)  # be verbose

x = pp.var('choice', 
        t, bool) # boolean variables indexed by t, addressed as x[i,j,k]
        
# Objective: summed expected return over every (i, j, k) index.
pp.maximize(sum(x[i,j,k]*expectedReturn[i][j][k] for i in rNumberClients\
             for j in rNumberChannels for k in rNumberProducts))

  
#channelLimitConstraint:
# Bare comparison: relies on pymprog registering it as a constraint
# (TODO confirm for the installed pymprog version).
for j in rNumberChannels:
    sum(x[i,j,k] for i in rNumberClients for k in rNumberProducts)\
    <=channelCap[j]
    
#maxOfferProductConstraint:    
for k in rNumberProducts:
    sum(x[i,j,k] for i in rNumberClients for j in rNumberChannels)\
    <=maxOfferProduct[k]

#minOfferProductConstraint:  

#for k in rNumberProducts:
Esempio n. 4
0
def summerize(tweets_df):
    """Build candidate word paths from tweets and return one as the summary.

    Steps, in order:

    1. Call ``tf_idf.compute_tf_idf(tweets_df)`` and load ``term_matrix.npy``,
       ``vocab_to_idx.npy`` and ``content_vocab.npy`` from the working
       directory (presumably written by that call; verify in ``tf_idf``).
    2. Parse ``tweets_df['tweet_texts']`` with the module-level ``nlp``
       pipeline, keeping only documents with more than one token.
    3. Build lemma bigrams per tweet, then search for paths from every
       tweet's first bigram to every tweet's last bigram and convert each
       path with ``make_list``.
    4. Solve a binary program with pymprog: ``x[i]`` selects word path ``i``,
       ``y[j]`` selects content word ``j``.
    5. Among the selected paths, return the one whose ``informativeness``
       score is smallest, joined with single spaces.

    Progress markers and intermediate sizes are printed along the way.

    Args:
        tweets_df: table with a ``'tweet_texts'`` column supporting
            ``.astype`` (presumably a pandas DataFrame; verify at call site).

    Returns:
        The selected sentence as a string, or ``None`` when no tweet has more
        than one token, when no candidate path is found, or when the solver
        selects no path.
    """
    print(len(tweets_df))

    tf_idf.compute_tf_idf(tweets_df)
    term_matrix = np.load('term_matrix.npy')
    # NOTE(review): allow_pickle=True unpickles the file; only safe while the
    # .npy files are produced locally by trusted code.
    vocab_to_idx = np.load('vocab_to_idx.npy', allow_pickle=True).item()
    content_vocab = list(np.load('content_vocab.npy'))

    print("1 ##################")

    # NOTE(review): `n_threads` is a spaCy 2.x keyword (later versions dropped
    # it) — confirm the pinned spaCy version before upgrading.
    spacy_tweets = [
        doc
        for doc in nlp.pipe(tweets_df['tweet_texts'].astype('unicode'),
                            n_threads=-1)
        if len(doc) > 1
    ]
    print(len(spacy_tweets))
    if not spacy_tweets:
        # Nothing to summarise; previously this surfaced as an IndexError
        # from the debug print below.
        return None
    print(spacy_tweets[0])

    print("2 ##################")

    bigrams_per_tweet = [
        list(bigrams([token.lemma_ for token in tweet]))
        for tweet in spacy_tweets
    ]
    # Every kept tweet has at least two tokens, hence at least one bigram.
    starting_nodes = [tweet_bigrams[0] for tweet_bigrams in bigrams_per_tweet]
    end_nodes = [tweet_bigrams[-1] for tweet_bigrams in bigrams_per_tweet]
    all_bigrams = list(set([
        node for tweet_bigrams in bigrams_per_tweet for node in tweet_bigrams
    ]))
    print("all_bigrams len=", len(all_bigrams))
    print(all_bigrams[0])

    print("3 ##################")

    # One graph per start node, searched against every end node.
    bigram_paths = []
    for single_start_node in tqdm(starting_nodes):
        bigram_graph = make_bigram_graph(all_bigrams, single_start_node)
        for single_end_node in end_nodes:
            bigram_paths.extend(
                breadth_first_search(bigram_graph, single_start_node,
                                     single_end_node))
    print("bigram_paths len=", len(bigram_paths))

    word_paths = [make_list(path) for path in tqdm(bigram_paths)]
    if not word_paths:
        # No candidate path: there is nothing for the solver to choose from.
        return None
    print(word_paths[0])

    print("4 ##################")

    mp.begin('COWABS')
    try:
        # x[i]: whether word path i is selected.
        # (str(...) wrappers kept from the original; presumably a Python 2/3
        # compatibility measure.)
        x = mp.var(str('x'), len(word_paths), bool)
        # y[j]: whether content word j is chosen.
        y = mp.var(str('y'), len(content_vocab), bool)

        mp.maximize(
            sum([
                linguistic_quality(word_paths[i]) *
                informativeness(word_paths[i], term_matrix, vocab_to_idx) *
                x[i] for i in range(len(x))
            ]) + sum(y))

        # A cap on total summary length is currently disabled:
        # sum([x[i] * len(word_paths[i]) for i in range(len(x))]) <= 150

        # A content word may be chosen only if some selected path contains it.
        # Bare comparisons rely on pymprog registering them as constraints.
        for j in range(len(y)):
            sum([
                x[i]
                for i in paths_with_content_words(j, word_paths,
                                                  content_vocab)
            ]) >= y[j]

        # Selecting a path requires choosing all of its content words.
        for i in range(len(x)):
            path_content = content_words(i, word_paths, content_vocab)
            sum(y[j] for j in path_content) >= len(path_content) * x[i]

        mp.solve()
        result_x = [value.primal for value in x]
    finally:
        # Release the model even if building or solving raised.
        mp.end()

    chosen_paths = np.nonzero(result_x)[0]
    print("*** Total = ", len(chosen_paths))

    # NOTE(review): this keeps the selected path with the *lowest*
    # informativeness score; confirm that a minimum (not a maximum) is
    # intended.
    min_cosine_sim = 999
    final_sentence = None
    for i in chosen_paths:
        sentence = str(" ").join(word_paths[i])
        print('--------------')
        print(sentence)
        cosine_sim = informativeness(word_paths[i], term_matrix, vocab_to_idx)
        print(cosine_sim)
        if min_cosine_sim > cosine_sim:
            min_cosine_sim = cosine_sim
            final_sentence = sentence

    return final_sentence