# Load the dataset that was obtained from the API and pickled beforehand.
# NOTE(review): pickle.load can execute arbitrary code on untrusted data --
# acceptable only because this file was produced locally by our own pipeline.
with open('dataset.pkl', 'rb') as handle:
    dataset = pickle.load(handle)

# Extract all questions from the dataset, tokenized into sentences.
sentences = []
for record in dataset:
    sentences += nltk.sent_tokenize(record['question1'])
    sentences += nltk.sent_tokenize(record['question2'])

# Tokenize into words (lower-cased), count frequencies, keep the top 10000.
word_dist = nltk.FreqDist()
for sentence in sentences:
    word_dist.update(word.lower() for word in nltk.word_tokenize(sentence))
word_dist = word_dist.most_common(10000)

# Obtain the GloVe vectors for the 10000 words from the web service,
# requesting them in batches of 100 words per call (100 batches total).
# BUG FIX: the original loop was `range(10, 100)`, whose slices start at
# [1000:1100] -- it silently skipped the 1000 MOST FREQUENT words.
# Starting at 0 covers all 100 batches of 100 words.
embeddings_list = []
for batch in range(100):
    batch_words = [pair[0] for pair in word_dist[batch * 100:batch * 100 + 100]]
    embeddings_list += client.w2v(batch_words)

# Build {word: glove_vector} and pickle it for later reuse.
embeddings_index = {}
for entry in embeddings_list:
    embeddings_index[entry['word']] = entry['vec']
with open('embeddings_index1.pkl', 'wb') as handle:
    pickle.dump(embeddings_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Each record exposes fields such as "summary" and "rating" that can be
# used to find the sentiment.  Preview the first 100 items.
for item in val[:100]:
    print("Summary ==> ", item["summary"], "\t\tRating ==> ", item["rating"])

# A summary may be one or more sentences of text.  We need to break these
# into words, then convert each word to its vector form.  The web service
# provides a function that accepts a list of words and returns the
# corresponding vectors.  Below we take the first item returned by the
# previous call and convert its summary into a sequence of vectors.
text = val[0]["summary"]
print("The input text is: ", text)

# Split the text into sentences (it may contain more than one) using
# NLTK's sent_tokenize, then flatten the sentences into a single word list.
sentences = sent_tokenize(text)
all_words = []
for sentence in sentences:
    all_words.extend(word_tokenize(sentence))

# all_words now contains every word of the text as one flat list;
# fetch the corresponding vectors from the web service.
vals = client.w2v(all_words)
# BUG FIX: the original wrote `for val in vals`, clobbering the dataset
# variable `val` used above -- any later code reading `val` would get a
# word/vector dict instead of the dataset.  Use a distinct loop name.
for word_vec in vals:
    print(word_vec["word"], word_vec["vec"])

# now you can continue further by vectoring the class label and creating
# the required dataset
# your code ......