    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())  # get_feature_names_out() in scikit-learn >= 1.0
print(X.shape)
print(X)

# pymprog bike-production example
import pymprog as mp

mp.begin('bike production')
x, y = mp.var('x, y')  # variables
mp.maximize(15 * x + 10 * y, 'profit')
x <= 3       # mountain bike limit
y <= 4       # racer production limit
x + y <= 5   # metal finishing limit
mp.solve()

print("#####################")

# spaCy pipeline plus a KenLM language model via spacy_kenlm
import spacy
import spacy_kenlm

nlp = spacy.load('en_core_web_sm')
kenlm_model = spacy_kenlm.spaCyKenLM('coca_fulltext.clean.lm.arpa')  # default model from test.arpa
import timeit

import pymprog as pp


def ppSolver(expectedReturn, numberClients, numberChannels, numberProducts,
             cost, budget, channelCap, minOfferProduct, maxOfferProduct,
             rurdleRate):
    startTime = timeit.default_timer()
    rNumberClients = range(numberClients)
    rNumberChannels = range(numberChannels)
    rNumberProducts = range(numberProducts)
    t = pp.iprod(rNumberClients, rNumberChannels, rNumberProducts)

    pp.begin('basic')    # begin modelling
    pp.verbose(False)    # keep solver output quiet
    x = pp.var('choice', t, bool)
    pp.maximize(sum(x[i, j, k] * expectedReturn[i][j][k]
                    for i in rNumberClients
                    for j in rNumberChannels
                    for k in rNumberProducts))

    # channelLimitConstraint:
    for j in rNumberChannels:
        sum(x[i, j, k] for i in rNumberClients for k in rNumberProducts) \
            <= channelCap[j]

    # maxOfferProductConstraint:
    for k in rNumberProducts:
        sum(x[i, j, k] for i in rNumberClients for j in rNumberChannels) \
            <= maxOfferProduct[k]

    # minOfferProductConstraint:
    # for k in rNumberProducts:
    #     sum(x[i, j, k] for i in rNumberClients for j in rNumberChannels) \
    #         >= minOfferProduct[k]

    # budgetConstraint:
    pp.st(sum(x[i, j, k] * cost[j] for i in rNumberClients
              for j in rNumberChannels for k in rNumberProducts) <= budget,
          "Budget Constr.")

    # clientLimitConstraint:
    for i in rNumberClients:
        pp.st(sum(x[i, j, k] for j in rNumberChannels for k in rNumberProducts)
              <= 1, "Client " + str(i) + " limit")

    # rurdleRateConstraint (hurdle rate on return over cost):
    pp.st(sum(x[i, j, k] * expectedReturn[i][j][k] for i in rNumberClients
              for j in rNumberChannels for k in rNumberProducts)
          >= (1 + rurdleRate) *
          sum(x[i, j, k] * cost[j] for i in rNumberClients
              for j in rNumberChannels for k in rNumberProducts),
          "Rurdle Rate Constr")

    pp.solve()            # solve the model
    # pp.sensitivity()    # sensitivity report
    endTime = timeit.default_timer() - startTime
    print("Objective found: ", round(pp.vobj(), 2),
          " in ", round(endTime, 3), " seconds")
    print("\n\n\n")
    appendCsv(numberClients, "Solver method", endTime, True, round(pp.vobj(), 2))
    pp.end()  # good habit: do away with the model
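# Hypothetical usage sketch (not part of the original project): the parameter
# shapes below are assumptions read off how ppSolver indexes its arguments --
# expectedReturn[i][j][k] per (client, channel, product), cost and channelCap
# per channel, min/maxOfferProduct per product. appendCsv() is a project helper
# not shown here; stub it out if you run this in isolation.
import numpy as np

numberClients, numberChannels, numberProducts = 4, 2, 3
expectedReturn = np.random.uniform(
    50, 200, size=(numberClients, numberChannels, numberProducts))
cost = [10.0, 25.0]            # contact cost per channel
channelCap = [3, 2]            # max offers per channel
minOfferProduct = [0, 0, 0]    # unused while the min-offer constraint is commented out
maxOfferProduct = [2, 2, 2]    # max offers per product
budget = 60.0
rurdleRate = 0.1               # required margin of return over cost

ppSolver(expectedReturn, numberClients, numberChannels, numberProducts,
         cost, budget, channelCap, minOfferProduct, maxOfferProduct, rurdleRate)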
import numpy as np
import pymprog as pp
from variables import *

rNumberClients = range(numberClients)
rNumberChannels = range(numberChannels)
rNumberProducts = range(numberProducts)
t = pp.iprod(rNumberClients, rNumberChannels, rNumberProducts)

pp.begin('basic')   # begin modelling
pp.verbose(True)    # be verbose
x = pp.var('choice', t, bool)  # one binary variable per (client, channel, product) tuple
pp.maximize(sum(x[i, j, k] * expectedReturn[i][j][k]
                for i in rNumberClients
                for j in rNumberChannels
                for k in rNumberProducts))

# channelLimitConstraint:
for j in rNumberChannels:
    sum(x[i, j, k] for i in rNumberClients for k in rNumberProducts) \
        <= channelCap[j]

# maxOfferProductConstraint:
for k in rNumberProducts:
    sum(x[i, j, k] for i in rNumberClients for j in rNumberChannels) \
        <= maxOfferProduct[k]

# minOfferProductConstraint:
# for k in rNumberProducts:
def summerize(tweets_df):
    print(len(tweets_df))
    # print(tweets_df['tweet_texts'][1])
    tf_idf.compute_tf_idf(tweets_df)
    term_matrix = np.load('term_matrix.npy')
    vocab_to_idx = np.load('vocab_to_idx.npy', allow_pickle=True).item()
    content_vocab = list(np.load('content_vocab.npy'))
    # tfidf_dict = np.load('tfidf_dict.npy', allow_pickle=True).item()
    print("1 ##################")

    spacy_tweets = []
    for doc in nlp.pipe(tweets_df['tweet_texts'].astype('unicode'), n_threads=-1):
        spacy_tweets.append(doc)
    spacy_tweets = [tweet for tweet in spacy_tweets if len(tweet) > 1]
    # spacy_tweets = np.random.choice(spacy_tweets, 10, replace=False)
    # spacy_tweets = spacy_tweets[:20]
    print(len(spacy_tweets))
    print(spacy_tweets[0])
    print("2 ##################")

    all_bigrams = [
        list(bigrams([token.lemma_ for token in tweet]))
        for tweet in spacy_tweets
    ]
    starting_nodes = [single_bigram[0] for single_bigram in all_bigrams]
    end_nodes = [single_bigram[-1] for single_bigram in all_bigrams]
    all_bigrams = [
        node for single_bigram in all_bigrams for node in single_bigram
    ]
    all_bigrams = list(set(all_bigrams))
    print("all_bigrams len=", len(all_bigrams))
    print(all_bigrams[0])
    print("3 ##################")

    # bigram_graph = make_bigram_graph(all_bigrams, starting_nodes[1])
    # print(len(bigram_graph))
    # print(bigram_graph)
    # path = breadth_first_search(bigram_graph, starting_nodes[1], end_nodes[2])
    # print(path)
    bigram_paths = []
    for single_start_node in tqdm(starting_nodes):
        bigram_graph = make_bigram_graph(all_bigrams, single_start_node)
        for single_end_node in end_nodes:
            possible_paths = breadth_first_search(bigram_graph,
                                                  single_start_node,
                                                  single_end_node)
            for path in possible_paths:
                bigram_paths.append(path)
    print("bigram_paths len=", len(bigram_paths))
    # print(bigram_paths[10])
    # for tweet in spacy_tweets:
    #     bigram_paths.append(list(bigrams([token.lemma_ for token in tweet])))

    word_paths = []
    for path in tqdm(bigram_paths):
        word_paths.append(make_list(path))
    print(word_paths[0])
    print("4 ##################")

    mp.begin('COWABS')
    # First variable, x: whether or not a word path is selected.
    x = mp.var('x', len(word_paths), bool)
    # Second variable, y: whether or not a content word is chosen.
    y = mp.var('y', len(content_vocab), bool)
    mp.maximize(
        sum([
            linguistic_quality(word_paths[i]) *
            informativeness(word_paths[i], term_matrix, vocab_to_idx) * x[i]
            for i in range(len(x))
        ]) + sum(y))  # output hidden since this is a very long sum
    # sum([x[i] * len(word_paths[i]) for i in range(len(x))]) <= 150
    for j in range(len(y)):
        sum([
            x[i]
            for i in paths_with_content_words(j, word_paths, content_vocab)
        ]) >= y[j]
    for i in range(len(x)):
        sum(y[j] for j in content_words(i, word_paths, content_vocab)) >= \
            len(content_words(i, word_paths, content_vocab)) * x[i]
    mp.solve()
    result_x = [value.primal for value in x]
    result_y = [value.primal for value in y]
    mp.end()

    chosen_paths = np.nonzero(result_x)
    chosen_words = np.nonzero(result_y)
    print("*** Total = ", len(chosen_paths[0]))
    min_cosine_sim = 999
    final_sentence = None
    for i in chosen_paths[0]:
        print('--------------')
        print(" ".join([token for token in word_paths[i]]))
        cosine_sim = informativeness(word_paths[i], term_matrix, vocab_to_idx)
        print(cosine_sim)
        if min_cosine_sim > cosine_sim:
            min_cosine_sim = cosine_sim
            final_sentence = " ".join([token for token in word_paths[i]])
    # print("####### Summary ###########")
    # print(final_sentence)
    return final_sentence